查看“Python爬虫案例:使用Selenium爬取中国制造网”的源代码
←
Python爬虫案例:使用Selenium爬取中国制造网
跳转至:
导航
,
搜索
因为以下原因,您没有权限编辑本页:
您所请求的操作仅限于该用户组的用户使用:
用户
您可以查看与复制此页面的源代码。
==实训介绍==
中国制造网的所有一级目录 在这里都能找到:https://www.made-in-china.com/prod/catlist/
但是二级目录 这个页面上显示的不全,需要到这个页面上的"More"链接去访问。

因此 爬取分为两步,先获取各一级目录的链接,再在各一级目录的网页上爬取二级、三级 目录。
==实训步骤==
===爬取一级目录链接===
 <nowiki>
import requests
from bs4 import BeautifulSoup
import time

# Fetch the page that lists every first-level category.
r = requests.get("https://www.made-in-china.com/prod/catlist/")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
print(soup)

# Each first-level category title is an <a class="title-anchor"> element.
a_list = soup.find_all('a', class_='title-anchor')
for count, anchor in enumerate(a_list, start=1):
    title = anchor.get_text().strip()
    print(f"{count}:{title}")
    # Turn the display title into the URL slug: " & ", ", " and plain spaces
    # all become "-".
    slug = title.replace(" & ", "-").replace(", ", "-").replace(" ", "-")
    url = ("https://www.made-in-china.com/multi-search"
           "/catlist/listsubcat/" + slug + "-Catalog/2.html#categories")
    print(url)
    # Request the category page only to report whether the built URL resolves.
    response = requests.get(url)
    print(f"{count}: 状态码:", response.status_code)
    time.sleep(2)  # pause between requests
</nowiki>

可以把这里的url 保存在csv文件中,方便下一步爬取。

[[文件:bd20-6-13.png|600px]]
===爬取二级、三级目录===
这一步读取的madeInChina_title.csv文件就是上一步保存的结果,这一步是到各一级目录上爬取二级、三级目录。
 <nowiki>from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import requests
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial setup: one shared Chrome driver plus a 10-second explicit wait.
# NOTE(review): passing the chromedriver path positionally is the Selenium 3
# calling convention; recent Selenium 4 releases appear to require
# webdriver.Chrome(service=Service(path)) instead -- confirm against the
# installed Selenium version.
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)
# Name of the CSV file that receives the second/third-level category rows.
sub_cat_file = 'made_in_china_sub_cat.csv'


def get_sub_cat(num, header1):
    """Append the sub-categories of the page currently loaded in ``driver``.

    Parses ``driver.page_source`` and appends one CSV row per third-level
    category to ``sub_cat_file``. A second-level category without any
    third-level entries produces a single row in which the second-level name
    is repeated in the third-level column.

    Row layout: num, header1, count, header2, header3, url.

    Args:
        num: Sequence number of the first-level category.
        header1: Name of the first-level category.
    """
    soup = BeautifulSoup(driver.page_source, "html.parser")
    li_list = soup.find_all('li', class_='items-line-list')
    print(len(li_list))

    rows = []
    count = 1
    # Current second-level category; an <li> without its own <h2> reuses the
    # values of the most recent one. Both start empty so that such an <li>
    # appearing first cannot hit an unbound name.
    header2 = ""
    header2_url = ""
    for item in li_list:
        h2 = item.find('h2', class_='sub-title')
        if h2 is not None:
            header2 = h2.a.get_text().strip()
            # The href values are prefixed with the scheme here, i.e. they are
            # assumed to be protocol-relative ("//...") -- TODO confirm.
            header2_url = "https:" + h2.a['href']
            print(str(num) + "," + header1 + str(count) + ",二级目录:" + header2)
            print(str(num) + "," + header1 + str(count) + ",二级目录URL:" + header2_url)

        h3_list = item.find_all('h3')
        if not h3_list:
            # Second-level category only, no third level.
            rows.append([num, header1, count, header2, header2, header2_url])
        else:
            for h3 in h3_list:
                header3 = h3.a.get_text().strip()
                header3_url = "https:" + h3.a['href']
                row = [num, header1, count, header2, header3, header3_url]
                print(",".join(str(field) for field in row))
                rows.append(row)

        # The counter advances only after an <li> that introduced a new
        # second-level category.
        if h2 is not None:
            count += 1

    # csv.writer quotes fields that contain commas, so category names with
    # commas in them no longer break the column layout.
    with open(sub_cat_file, "a", encoding="utf-8", newline="") as fw:
        csv.writer(fw).writerows(rows)


def browse(url, max_retries=3):
    """Open ``url`` in the shared driver and scroll through the page.

    Args:
        url: Address to load.
        max_retries: Maximum number of attempts when the load raises
            ``TimeoutException`` (at least one attempt is always made).

    Returns:
        The string "ok" once the page has been loaded and scrolled.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts_left = max(1, max_retries)
    while True:
        try:
            driver.get(url)
            scroll()
            # Short pause after scrolling. Lengthen it (e.g. to 60 seconds) if
            # the site shows a CAPTCHA that has to be solved by hand.
            time.sleep(2)
        except TimeoutException:
            attempts_left -= 1
            if attempts_left == 0:
                raise
        else:
            print("访问成功!")
            return "ok"


def scroll():
    """Scroll the current page from top to bottom in ten equal steps."""
    for step in range(1, 11):
        driver.execute_script(
            "window.scrollTo(0,document.body.scrollHeight*" + str(step) + "/10)")
        time.sleep(0.25)


def read_total_category_file():
    """Read the first-level category list from ``madeInChina_title.csv``.

    Each line of the file holds three fields: num, header1, header1_url.

    Returns:
        A list of dicts with the keys 'num', 'header1' and 'header1_url'.
    """
    cat_list = []
    with open('madeInChina_title.csv', "rt", newline="") as fp:
        for data in csv.reader(fp):
            if len(data) < 3:
                continue  # blank or incomplete line
            cat_list.append({
                'num': data[0],
                'header1': data[1],
                # Drop stray whitespace so the URL is clean for driver.get().
                'header1_url': data[2].strip(),
            })
    return cat_list


def main():
    """Visit every first-level category page and save its sub-categories."""
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    start = time.perf_counter()
    try:
        # Each entry holds: num, header1, header1_url.
        for d in read_total_category_file():
            browse(d['header1_url'])
            get_sub_cat(d['num'], d['header1'])
    finally:
        # Always shut the browser down, even if a page failed.
        driver.quit()
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)


if __name__ == '__main__':
    main()
</nowiki>

[[文件:bd20-6-14.png|600px]]
返回至
Python爬虫案例:使用Selenium爬取中国制造网
。
导航菜单
个人工具
登录
命名空间
页面
讨论
变种
视图
阅读
查看源代码
查看历史
更多
搜索
导航
首页
最近更改
随机页面
帮助
工具
链入页面
相关更改
特殊页面
页面信息