2021云+数:51job — a Selenium + BeautifulSoup example that scrapes data-analysis job listings from 51job.com

来自CloudWiki (from CloudWiki)

import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Launch Chrome through the local chromedriver binary; a raw string keeps the
# Windows backslashes from being interpreted as escape sequences.
driver = webdriver.Chrome(r"C:\Program Files\Google\Chrome\Application\chromedriver.exe")

# One dict per scraped job posting, accumulated across result pages.
list_data = []


def search():
    """Open the 51job listing page and search for '数据分析' (data analysis)."""
    # Pre-filtered job-list URL (area/category filters are baked into the path).
    driver.get('https://search.51job.com/list/120200%252C010000,000000,0000,00,9,99,%2B,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')
    elem = driver.find_element_by_xpath('//*[@id = "keywordInput"]')  # keyword input box
    elem.send_keys('数据分析', Keys.ENTER)  # type the query, then simulate Enter
    time.sleep(5)  # crude wait for the results page to render; TODO use explicit waits

def find_result():
    """Parse the currently loaded results page and append one record per job
    posting to the module-level ``list_data``.

    Each record is a dict with keys: company, job, area, salary, pub_time.
    Silently returns if the results container is not present (page layout
    changed or results not yet rendered).
    """
    global list_data
    soup = BeautifulSoup(driver.page_source, "html.parser")

    container = soup.find("div", {'class': 'j_joblist'})  # results container
    if container is None:
        # Guard: without this, find_all on None raises AttributeError.
        return
    postings = container.find_all("div", {'class': 'e'})  # one div per posting

    for job in postings:
        record = {}  # renamed from `dict` to avoid shadowing the builtin

        # Company name
        company_name = job.find('a', class_='cname')
        record['company'] = company_name.get_text().strip()

        # Job title
        job_name = job.find('span', class_='jname')
        record['job'] = job_name.get_text().strip()

        # Work location — hard-coded placeholder, NOT parsed from the page.
        record['area'] = "济南|北京"

        # Salary
        salary = job.find('span', class_='sal')
        record['salary'] = salary.get_text().strip()

        # Publication date
        pub_time = job.find('span', class_='time')
        record['pub_time'] = pub_time.get_text().strip()

        list_data.append(record)

def save_info():
    """Build a DataFrame from the scraped records and print it.

    NOTE(review): despite the name, nothing is written to disk — add e.g.
    ``df.to_csv(...)`` if persistence is required.
    """
    df = pd.DataFrame(list_data)
    print(df)

def next_page(page_number):
    """Click the pagination link whose visible text equals ``page_number``.

    :param page_number: page label as a string, e.g. '2'.
    """
    time.sleep(5)  # let the current page settle before locating the link
    links = driver.find_elements_by_link_text(page_number)  # pagination buttons
    if not links:
        # Guard: the original indexed links[0] unconditionally and raised
        # IndexError whenever the link was absent (e.g. past the last page).
        print("pagination link '%s' not found" % page_number)
        return
    links[0].click()

def main():
   """Drive the scrape: search, parse page 1, page through, then report."""

   search()
   find_result()
   
   
   # Scrape pages 2-3 (range(2, 4)).  NOTE(review): the original comment
   # claimed pages 2-5; widen the range if more pages are actually wanted.
   for i in range(2,4):
       num = str(i)
       next_page(num)
       find_result()
   # Print the collected results as a DataFrame
   save_info() 
   

if __name__ == '__main__':

   main()