Python Web Scraping Example: Scraping Made-in-China.com with Selenium


Introduction

All of Made-in-China.com's first-level categories can be found on this page: https://www.made-in-china.com/prod/catlist/

However, the second-level categories shown on that page are incomplete; to see them all you have to follow each category's "More" link.

The scraping is therefore done in two steps: first collect the link for each first-level category, then visit each first-level category page and scrape its second- and third-level categories.

Steps

Scraping the first-level category links

import requests
from bs4 import BeautifulSoup
import time

r = requests.get("https://www.made-in-china.com/prod/catlist/")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
# print(soup)  # uncomment to inspect the full page
# soup = soup.prettify()

a_list = soup.find_all('a', class_='title-anchor')  # first-level category links
title = ""
count = 1
for i in a_list:
    title = i.get_text().strip()
    print(str(count) + ":" + title)
    # turn the category name into the slug used in the sub-category URL
    title = title.replace(" & ", "-").replace(", ", "-").replace(" ", "-")

    url = "https://www.made-in-china.com/multi-search" \
          + "/catlist/listsubcat/" + title + "-Catalog/2.html#categories"
    print(url)
    response = requests.get(url)
    print(str(count) + ": status code:", response.status_code)
    time.sleep(2)
    count += 1

These URLs can be saved to a CSV file to make the next step easier, as shown in the sketch below.
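
For example, the loop above can be adapted to write one row per first-level category. This sketch assumes the file name madeInChina_title.csv and the field order num,header1,header1_url that the reader in the next step expects; the slugified title is stored so that commas inside category names do not break the comma-separated format.

import requests
from bs4 import BeautifulSoup

# Sketch: save the first-level categories to madeInChina_title.csv
# (assumed file name; fields num,header1,header1_url match the reader in the next step)
r = requests.get("https://www.made-in-china.com/prod/catlist/")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

count = 1
with open("madeInChina_title.csv", "w", encoding="utf-8") as fw:
    for a in soup.find_all('a', class_='title-anchor'):
        title = a.get_text().strip()
        slug = title.replace(" & ", "-").replace(", ", "-").replace(" ", "-")
        url = "https://www.made-in-china.com/multi-search" \
              + "/catlist/listsubcat/" + slug + "-Catalog/2.html#categories"
        fw.write(str(count) + "," + slug + "," + url + "\n")  # one row: num,header1,header1_url
        count += 1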

[Figure: Bd20-6-13.png]

Scraping the second- and third-level categories

The madeInChina_title.csv file read in this step is the result saved in the previous step; this step visits each first-level category page and scrapes its second- and third-level categories.
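
Each line of madeInChina_title.csv is therefore expected to look roughly like this (illustrative values only):

1,Agriculture-Food,https://www.made-in-china.com/multi-search/catlist/listsubcat/Agriculture-Food-Catalog/2.html#categories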


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import requests
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial setup
# (passing the driver path positionally works with Selenium 3; Selenium 4+ expects a Service object instead)
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)
# output file name
sub_cat_file = 'made_in_china_sub_cat.csv'

def get_sub_cat(num, header1):  # num and header1: index and name of the first-level category
    # url = "https://www.made-in-china.com/multi-search/catlist/listsubcat/Industrial-Equipment-Components-Catalog/2.html#categories"
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # header1 = soup.find('h1', class_='title').get_text().strip()
    li_list = soup.find_all('li', class_='items-line-list')
    total = ""
    count = 1
    print(len(li_list))
    header2 = ""
    header2_url = ""
    for i in li_list:
        h2 = i.find('h2', class_='sub-title')
        if h2 is not None:
            header2 = h2.a.get_text().strip()  # current second-level category
            header2_url = "https:" + h2.a['href']
            print(str(num) + "," + header1 + "," + str(count) + ",second-level category:" + header2)
            print(str(num) + "," + header1 + "," + str(count) + ",second-level URL:" + header2_url)

        h3_list = i.find_all('h3')
        if len(h3_list) == 0:  # second-level category with no third-level categories
            total += str(num) + "," + header1 + "," + str(count) + "," + header2 + "," + header2 + "," + header2_url + "\n"
        else:
            # second-level category with third-level categories below it
            for j in h3_list:
                header3 = j.a.get_text().strip()
                header3_url = "https:" + j.a['href']
                print(str(num) + "," + header1 + "," + str(count) + "," + header2 + "," + header3 + "," + header3_url)
                total += str(num) + "," + header1 + "," + str(count) + "," + header2 + "," + header3 + "," + header3_url + "\n"

        if h2 is not None:
            count += 1  # move on to the next second-level category

    fw = open(sub_cat_file, "a", encoding="utf-8")
    fw.write(total)
    fw.close()




# Open a URL in the browser and scroll through the page
def browse(url):
    try:
        driver.get(url)
        scroll()
        time.sleep(2)  # the site may show a CAPTCHA; increase this pause to leave time to fill it in manually
        print("Page loaded.")
        # total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-pager > div > div > div > div.total')))
        # get_products()
        # return total.text
        return "ok"
    except TimeoutException:
        return browse(url)  # retry on timeout

def scroll():  # scroll down the page in ten steps so lazy-loaded content appears
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")
        time.sleep(0.25)


def read_total_category_file():
    cat_list = []  # list of first-level category records
    fp = open('madeInChina_title.csv', "rt")
    # each line of the CSV has the fields num,header1,header1_url
    for line in fp:  # a file object can be iterated line by line
        d = {}
        data = line.split(',')
        d['num'] = data[0]
        d['header1'] = data[1]
        d['header1_url'] = data[2].strip()  # strip the trailing newline

        cat_list.append(d)  # append each parsed line to the list
    fp.close()
    return cat_list

  
def main():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    cat_file = read_total_category_file()
    # print(cat_file)

    for d in cat_file:
        # fields stored in d: num, header1, header1_url
        browse(d['header1_url'])
        get_sub_cat(d['num'], d['header1'])  # scrape the second- and third-level categories
        # break

    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)


if __name__ == '__main__':
    main()


[Figure: Bd20-6-14.png]
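
Each row that get_sub_cat() appends to made_in_china_sub_cat.csv has the form num,header1,count,header2,header3,header3_url. A quick sanity check of the saved file might look like this (a sketch, run after the script above has finished):

# Sketch: print the first few rows of the sub-category file to verify its format
with open('made_in_china_sub_cat.csv', encoding='utf-8') as fp:
    for n, line in enumerate(fp):
        print(line.strip())
        if n >= 4:
            break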

Scraping product listings from the second- and third-level categories

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial setup
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)

# timestamp and output file name
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
start_class3 = 'Pneumatic Pipe'  # third-level category to start from (default is the first one); change as needed
start_page = 1
end_page = start_page + 20  # usually left unchanged
file_name = 'made in china_' + now_time + "_" + start_class3 + "_" \
            + str(start_page) + "_" + str(end_page) + '.csv'
begin = 0  # flag: has scraping started yet?

soup = ""



# Open a URL in the browser and scroll through the page
def browse(url):
    try:
        driver.get(url)
        for i in range(1, 11):
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")
            time.sleep(0.25)

        # time.sleep(60)  # if the site shows a CAPTCHA, pause here to leave time to fill it in manually
        print("Page loaded.")
        global soup
        soup = BeautifulSoup(driver.page_source, "html.parser")

        return 0
    except Exception as ex:
        print("Exception occurred: %s" % ex)
        return 1


# Extract the product information from the current page
def get_products(page_number, class1, class2, class3):
    # wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-itemlist .items .item')))

    div_list = soup.find_all('div', class_='list-node')
    print(len(div_list))
    total = ""

    count = 1
    for i in div_list:
        try:
            item_info = ""
            item_info += "made in china," + str(page_number) + ","  # record which page this came from

            item_info += class1 + "," + class2 + "," + class3 + ","  # record its level 1/2/3 categories

            # product image
            img = i.find('img')
            print(str(count) + ":" + img['src'] + ",")
            item_info += img['src'] + ","  # image URL

            title = i.find('h2', class_="product-name").a
            product_name = title.get_text().strip()
            product_name = product_name.replace(",", "_")
            product_url = title['href']
            # print(str(count) + ":" + product_name + "," + product_url + ",")
            item_info += product_name + "," + product_url + ","  # name and link

            # price information
            detail = i.find('div', class_='product-property') \
                .find('span')
            detail = detail.get_text().strip()
            detail = detail.replace(",", "_")
            p_property = detail.split("/")
            product_price = p_property[0]
            product_unit = p_property[1]
            # print(str(count) + ":" + product_price + "," + product_unit)
            item_info += product_price + "," + product_unit + ","  # price and unit

            # supplier information
            supplier = i.find('a', class_="compnay-name")
            # supplier = i.find('div', class_='pro-extra').span
            s_name = supplier.get_text().strip()
            s_name = s_name.replace(",", "_")
            s_href = 'https:' + supplier['href']
            s_href = s_href.replace(",", "_")
            s_nation = "China"
            # print(str(count) + ":" + s_name + "," + s_href + "," + s_nation + ",")
            item_info += s_name + "," + s_href + "," + s_nation + ","

            print(str(count) + ":" + item_info)

            # one line per product
            total += item_info + "\n"

        except Exception as ex:
            print("Exception occurred: %s" % ex)

        count = count + 1

    return total  # all product rows collected from this page



def read_category_file(start_class3):
    cat_list = []  # list of category records
    fp = open('made_in_china_sub_cat.csv', "rt")  # file produced in the previous step
    # each line has the fields num,class1,count,class2,class3,class3_url
    for line in fp:  # iterate over the file line by line
        d = {}
        data = line.split(',')
        d['class1'] = data[1]
        d['class2'] = data[3]
        d['class3'] = data[4]
        d['url'] = data[5].strip()  # strip the trailing newline

        global begin
        if d['class3'] == start_class3:  # start scraping once start_class3 is reached
            begin = 1
        elif begin == 0:
            continue
        cat_list.append(d)  # append each parsed line to the list
    fp.close()
    return cat_list


def get_category_info(class1, class2, class3, url):
    # scrape pages start_page..end_page-1 of one third-level category
    info = ""
    this_url = url
    for i in range(start_page, end_page):  # e.g. (21,41); (41,61)
        status = browse(this_url)
        if status == 0:  # only parse the page if it loaded successfully
            info += get_products(i, class1, class2, class3)
        this_url = next_page(i + 1)
        if this_url is None:
            break

    # write this category's products to the output file
    fw = open(file_name, "a", encoding="utf-8")
    fw.write(info)
    fw.close()

def next_page(num):
    # build the URL of page <num> from the "next page" link on the current page
    try:
        url = soup.find('a', class_="next")
        urls = url['href'].split(";")
        nu = urls[0].rsplit("-", 1)
        next_url = "https:" + nu[0] + "-" + str(num) + ".html"
    except Exception as ex:
        print("Exception occurred: %s" % ex)
        next_url = None

    return next_url

def main():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    cat_file = read_category_file(start_class3)  # read categories starting from start_class3
    for d in cat_file:
        # scrape each category's products; class1, class2, class3 are the level 1/2/3 names
        get_category_info(d['class1'], d['class2'], d['class3'], d['url'])
        # break
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)


if __name__ == '__main__':
    main()
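
The rows written to file_name follow the order built up in get_products(): source, page number, class1, class2, class3, image URL, product name, product URL, price, unit, supplier name, supplier URL, supplier country. A minimal sketch for reading the result back (the path argument is whatever file_name the run produced):

import csv

# Sketch: read the scraped product file and print name, price and unit
# (column indices follow the order written by get_products above)
def print_products(path):
    with open(path, encoding='utf-8') as fp:
        for row in csv.reader(fp):
            if len(row) < 10:
                continue  # skip malformed lines
            print(row[6], row[8], row[9])  # product name, price, unit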