2021云+数:51job
来自CloudWiki
# Import packages
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Start Chrome through the chromedriver binary at this fixed Windows path.
# The raw-string prefix keeps the backslashes literal; without it, "\P", "\G",
# "\C" and "\A" are invalid escape sequences that Python warns about.
driver = webdriver.Chrome(r"C:\Program Files\Google\Chrome\Application/chromedriver.exe")
# Module-level accumulator: one dict per scraped job posting.
list_data = []


def search():
    """Open the 51job listing page and search for the keyword '数据分析'.

    Loads the listing URL in the module-level ``driver``, types the keyword
    into the element whose id is ``keywordInput``, presses ENTER, and then
    sleeps for five seconds (presumably to let the result page load before
    it is parsed).
    """
    # The query string used to read "cotype=99°reefrom=99": the "&deg" of
    # "&degreefrom" had been decoded as the HTML entity for the degree sign,
    # which dropped that parameter from the request.
    driver.get(
        'https://search.51job.com/list/120200%252C010000,000000,0000,00,9,99,%2B,2,1.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
        '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    # Locate the keyword input box.
    elem = driver.find_element_by_xpath('//*[@id = "keywordInput"]')
    # Type the keyword and press ENTER to submit the search.
    elem.send_keys('数据分析', Keys.ENTER)
    time.sleep(5)
def _text_or_empty(tag):
    """Return the stripped text of a parsed tag, or '' when the tag is None."""
    return tag.get_text().strip() if tag is not None else ''


def find_result():
    """Parse the page currently shown by ``driver`` and collect its job rows.

    Looks for the ``div`` with class ``j_joblist``, then for every ``div``
    with class ``e`` inside it appends a dict with the keys ``company``,
    ``job``, ``area``, ``salary`` and ``pub_time`` to the module-level
    ``list_data``.

    A field whose tag is not found is stored as an empty string instead of
    raising ``AttributeError``. If the list container itself is missing, a
    notice is printed and nothing is appended.
    """
    soup = BeautifulSoup(driver.page_source, "html.parser")
    container = soup.find("div", {'class': 'j_joblist'})
    if container is None:
        # find() returns None when nothing matches; report it rather than
        # failing on the attribute access below.
        print("find_result: no 'j_joblist' container on the current page")
        return

    for job in container.find_all("div", {'class': 'e'}):
        # list_data is only mutated, never rebound, so no `global` is needed.
        list_data.append({
            # Company name
            'company': _text_or_empty(job.find('a', class_='cname')),
            # Job title
            'job': _text_or_empty(job.find('span', class_='jname')),
            # Work location is hard-coded, not scraped from the row
            # (presumably it mirrors the city codes in the search URL).
            'area': "济南|北京",
            # Salary
            'salary': _text_or_empty(job.find('span', class_='sal')),
            # Publication time
            'pub_time': _text_or_empty(job.find('span', class_='time')),
        })
def save_info():
    """Print the collected job records as a pandas DataFrame.

    Prints the literal line ``hello`` first, then a DataFrame built from the
    module-level ``list_data``. Nothing is written to disk.
    """
    print("hello")
    print(pd.DataFrame(list_data))
def next_page(page_number):
    """Click the pagination link whose visible text equals ``page_number``.

    Sleeps five seconds, looks up every link with that exact text, prints
    each matching element, and clicks the first one.

    Args:
        page_number: Link text to search for (the caller passes a string
            such as ``"2"``).

    Raises:
        IndexError: If no link with that text is present on the page.
    """
    time.sleep(5)
    # Candidate pagination links with the requested label.
    matches = driver.find_elements_by_link_text(page_number)
    for link in matches:
        print(link)
    # NOTE: an explicit wait for the link to become clickable was once
    # sketched here as an alternative to the fixed sleep above.
    matches[0].click()
def main():
    """Run the whole scrape and release the browser afterwards.

    Submits the search, parses the first result page, then visits and parses
    result pages 2 and 3, and finally prints the collected records. The
    browser session is shut down in a ``finally`` clause so that the Chrome
    and chromedriver processes do not linger, even when a step raises.
    """
    try:
        search()
        find_result()
        # range(2, 4) yields 2 and 3, so only pages 2 and 3 are visited.
        for page in range(2, 4):
            next_page(str(page))
            find_result()
        # Output the results.
        save_info()
    finally:
        driver.quit()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()