Python Web Scraping Case Study: Scraping Made-in-China.com with Selenium

Introduction

All of Made-in-China.com's first-level categories can be found on a single page: https://www.made-in-china.com/prod/catlist/

That page, however, shows only part of each category's second-level listing; the full listing sits behind each category's "More" link.

The scrape is therefore split into two steps: first collect the link for each first-level category, then visit each first-level category page and scrape its second- and third-level categories.
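Each "More" link follows a fixed pattern that the script below reconstructs by hyphenating the category title. A minimal sketch of that rule (the category name here is illustrative, taken from a URL that appears later in this exercise):

title = "Industrial Equipment & Components"  # illustrative category name
slug = title.replace(" & ", "-").replace(", ", "-").replace(" ", "-")
url = ("https://www.made-in-china.com/multi-search"
       "/catlist/listsubcat/" + slug + "-Catalog/2.html#categories")
print(url)
# https://www.made-in-china.com/multi-search/catlist/listsubcat/Industrial-Equipment-Components-Catalog/2.html#categories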

Steps

Step 1: Scrape the First-Level Category Links

import requests
from bs4 import BeautifulSoup
import time

r = requests.get("https://www.made-in-china.com/prod/catlist/")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

a_list = soup.find_all('a', class_='title-anchor')  # first-level category links
count = 1
for i in a_list:
    title = i.get_text().strip()
    print(str(count) + ":" + title)
    # Turn the category name into the hyphenated slug used in its "More" URL
    title = title.replace(" & ", "-").replace(", ", "-").replace(" ", "-")

    url = "https://www.made-in-china.com/multi-search" \
          + "/catlist/listsubcat/" + title + "-Catalog/2.html#categories"
    print(url)
    response = requests.get(url)
    print(str(count) + ": status code:", response.status_code)
    time.sleep(2)  # pause between requests to avoid hammering the site
    count += 1

You can save these URLs to a CSV file to make the next step's crawl easier.
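A minimal sketch of how that save might look, assuming the count, title, and url variables from the loop above and the madeInChina_title.csv layout (num, header1, header1_url) that Step 2 reads back:

import csv

# Hypothetical helper: append one "num,header1,header1_url" row per category.
def save_category(num, header1, header1_url, filename="madeInChina_title.csv"):
    with open(filename, "a", encoding="utf-8", newline="") as fw:
        csv.writer(fw).writerow([num, header1, header1_url])

# inside the loop above: save_category(count, title, url)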

[Image: Bd20-6-13.png]

Step 2: Scrape the Second- and Third-Level Categories

The madeInChina_title.csv file read in this step is the output saved by the previous step. This step visits each first-level category page and scrapes its second- and third-level categories.
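Each line of madeInChina_title.csv is expected to hold num,header1,header1_url, for example (an illustrative row based on a category URL used in this exercise):

1,Industrial-Equipment-Components,https://www.made-in-china.com/multi-search/catlist/listsubcat/Industrial-Equipment-Components-Catalog/2.html#categories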


from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import time
from bs4 import BeautifulSoup

# Initial setup: path to chromedriver (adjust for your own machine)
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
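# If chromedriver is already on your PATH, Selenium also accepts
# webdriver.Chrome() with no path argument; the explicit path above is
# only needed when the driver sits outside the PATH.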
wait = WebDriverWait(driver, 10)
# Name of the output CSV file
sub_cat_file = 'made_in_china_sub_cat.csv'

def get_sub_cat(num, header1):  # num, header1: index and name of the first-level category
    # Example target URL:
    # https://www.made-in-china.com/multi-search/catlist/listsubcat/Industrial-Equipment-Components-Catalog/2.html#categories
    soup = BeautifulSoup(driver.page_source, "html.parser")

    li_list = soup.find_all('li', class_='items-line-list')
    total = ""
    count = 1
    print(len(li_list))  # number of second-level blocks found
    header2 = ""
    header2_url = ""
    for i in li_list:
        h2 = i.find('h2', class_='sub-title')
        if h2 is not None:
            header2 = h2.a.get_text().strip()  # current second-level category
            header2_url = "https:" + h2.a['href']
            print(str(num)+","+header1+","+str(count)+",second-level category:"+header2)
            print(str(num)+","+header1+","+str(count)+",second-level URL:"+header2_url)

        h3_list = i.find_all('h3')
        if len(h3_list) == 0:  # second-level category with no third-level entries
            # repeat the second-level name and URL in the third-level columns
            total += str(num)+","+header1+","+str(count)+","+header2+","+header2+","+header2_url+"\n"
        else:
            # second-level category with third-level entries
            for j in h3_list:
                header3 = j.a.get_text().strip()
                header3_url = "https:" + j.a['href']
                print(str(num)+","+header1+","+str(count)+","+header2+","+header3+","+header3_url)
                total += str(num)+","+header1+","+str(count)+","+header2+","+header3+","+header3_url+"\n"

        if h2 is not None:
            count += 1  # advance the second-level counter

    fw = open(sub_cat_file, "a", encoding="utf-8")
    fw.write(total)
    fw.close()




# Visit a URL with the browser
def browse(url):
    try:
        driver.get(url)
        scroll()
        time.sleep(2)  # brief pause; lengthen this if the site shows a captcha that must be solved by hand
        print("Page loaded!")
        return "ok"
    except TimeoutException:
        return browse(url)  # retry on timeout

def scroll():  # scroll down the page in ten steps so lazy-loaded content renders
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")
        time.sleep(0.25)


def read_total_category_file():
    cat_list = []  # list of category records
    fp = open('madeInChina_title.csv', "rt", encoding="utf-8")
    # each line of the CSV holds the fields: num, header1, header1_url
    for line in fp:  # a file object iterates line by line
        d = {}
        data = line.strip().split(',')  # strip the newline so the URL field stays clean
        d['num'] = data[0]
        d['header1'] = data[1]
        d['header1_url'] = data[2]
        cat_list.append(d)  # collect each parsed row
    fp.close()
    return cat_list

  
def main():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    cat_list = read_total_category_file()

    for d in cat_list:
        # each record holds: num, header1, header1_url
        browse(d['header1_url'])
        get_sub_cat(d['num'], d['header1'])  # scrape this category's sub-categories
        #break  # uncomment to test with just the first category

    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)


if __name__ == '__main__':
    main()
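Each row of the resulting made_in_china_sub_cat.csv holds six comma-separated fields: num, header1, count, header2, header3, url; for second-level categories with no third level, the second-level name and URL are repeated in the last two columns. A quick sketch for spot-checking the output (the five-row preview is an arbitrary choice):

import csv

# Print the first few scraped rows to verify the file's structure.
with open("made_in_china_sub_cat.csv", encoding="utf-8") as f:
    for row in list(csv.reader(f))[:5]:
        print(row)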


[Image: Bd20-6-14.png]