Python Crawler Case Study: Scraping DHgate with Selenium


Lab Steps

Find the DHgate URL

https://www.dhgate.com/all-categories/index.html?dspm=pcen.hp.cateall.1.mrTKDf7VqZU0tX5wE4CI&resource_id=#pu1806-all
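
Before writing any scraping code, it is worth confirming that this page is reachable from your environment. The sketch below does that with requests (the same library used in the next step); the User-Agent header and timeout are assumptions added here, since some sites serve different markup to clients that send no User-Agent.

import requests

url = ("https://www.dhgate.com/all-categories/index.html"
       "?dspm=pcen.hp.cateall.1.mrTKDf7VqZU0tX5wE4CI&resource_id=#pu1806-all")
# The User-Agent header and timeout are assumptions, not part of the original tutorial.
resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
print(resp.status_code)   # expect 200 when the page is reachable
print(len(resp.text))     # rough size of the returned HTML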

Scrape DHgate's first- and second-level categories

Using the code below, save DHgate's first-level and second-level categories to dh_category.csv and dh_sub_category.csv respectively.

import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.dhgate.com/all-categories/index.html?dspm=pcen.hp.cateall.1.mrTKDf7VqZU0tX5wE4CI&resource_id=#pu1806-all")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
#soup = soup.prettify()
#print(soup)
file_name = "dh_category.csv"#储存一级目录
file_name2 = "dh_sub_category.csv"#储存二级目录

category =""#暂存一级目录
sub_category=""#暂存二级目录

d_list = soup.find('div',class_ = 'first').find_all('dl')  # find all dl blocks in the category container
count =1
count2 =1
for d in d_list:
    # extract the first-level category
    class1 = d.dt
    class1_title = class1.a.get_text().strip().replace(",","_")
    class1_url = 'https://www.dhgate.com' +class1.a['href']
    print(class1_title)
    print(class1_url)
    category +=str(count)+','+class1_title+','+ class1_url + '\n'

    # extract the second-level categories under this first-level category
    c2_list = d.find_all('dd')
    for i in c2_list:
        class2 = i
        class2_title = class2.a.get_text().strip().replace(",","_")
        class2_url = 'https://www.dhgate.com' +class2.a['href']
        print(class2_title)
        print(class2_url)
        sub_category +=str(count2)+','+class1_title+','+class2_title+','+ class2_url + '\n'
        count2 +=1
        
    count +=1
    

print("一级目录总数:"+str(count))
print("二级目录总数:"+str(count2))
    
fw = open(file_name,"w",encoding="utf-8")
fw.write(category)
fw.close()

fw = open(file_name2,"w",encoding="utf-8")
fw.write(sub_category)
fw.close()   
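
To spot-check the result, the generated dh_sub_category.csv can be read back with Python's csv module; each row holds four comma-separated fields (row number, first-level category, second-level category, URL) and there is no header row. A minimal sketch:

import csv

with open("dh_sub_category.csv", encoding="utf-8") as f:
    rows = list(csv.reader(f))

print("second-level categories:", len(rows))
for line_num, class1, class2, url in rows[:5]:   # preview the first few rows
    print(line_num, class1, class2, url)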

Scrape DHgate product listings

Using the dh_sub_category.csv file scraped above, crawl the products in each category one by one:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial parameter settings
driver = webdriver.Chrome()
#wait = WebDriverWait(driver, 6)

# Timestamp and output file names
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
start_class2 = 'Apparel from Turkey'  # second-level category to start crawling from; change as needed
start_line = 1  # line of the category file to start from (serves the same purpose as start_class2)
start_page = 1
end_page = start_page + 30  # usually left unchanged
file_name = 'DG_gate_' + now_time + "_" + start_class2 + "_" \
            + str(start_page) + "_" + str(end_page) + '.csv'


err_log_file ='DG_gate_' + now_time + "_" + start_class2 + "_" \
            + str(start_page) + "_" + str(end_page) +'_err_log.csv'

soup = ""


# Visit the given URL in the browser and refresh the global soup object
def browse(url):
    try:
        driver.get(url)
        # scroll down in ten steps so lazily-loaded content (e.g. product images) gets rendered
        for i in range(1, 11):
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")
            time.sleep(0.25)

        # time.sleep(60)  # the site may show a CAPTCHA; leave 60 seconds to fill it in manually
        print("Page visited successfully!")
        global soup
        soup = BeautifulSoup(driver.page_source, "html.parser")

        return 0
    except Exception as ex:
        print("出现如下异常%s" % ex)
        return 1

    # time.sleep(2)


# Extract product information from the current page
def get_products(page_number, class1, class2, line_num):
    # wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'#mainsrp-itemlist .items .item')))

    div_list = soup.find_all('div', class_='gitem')
    print(len(div_list))
    total = ""
    error = ""

    count = 1  # position of the item on the current page
    for div in div_list:
        try:
            item_info = ""
            item_info += "DG_gate_," + str(page_number) + ","  # 记录是第几页的信息

            item_info += class1 + "," + class2 + ","   # record its first- and second-level categories

            # product image
            img = div.find('img')  # the product's image tag
            print(str(count) + ":" + img['src'] + ",")
            item_info += img['src'] + ","  # append the image URL

            
            # title
            title = div.find('h3', 'pro-title').a
            product_name =title.get_text().strip()
            product_name = product_name.replace(",", "_")
            product_url = title['href']
            item_info += product_name + "," + product_url + ","  # append the product name and link
            
            # price
            price_info = div.find('li', 'price').text
            price_info = price_info.replace(",", "_")
            # print((str(count)+":"+title))            
            item_info += price_info+","
            
            # seller information
            supplier = div.find('div', 'seller-title').a
            s_name = supplier.get_text().strip()
            s_name = s_name.replace(",", "_")
            s_href = supplier['href']
            s_href = s_href.replace(",", "_")
            
           
            #print(str(count)+":"+s_name+","+s_href+","+s_nation+",")
            item_info += s_name + "," + s_href 
            
            # finish this record with its category line number and a newline
            total += item_info +","+str(line_num)+"\n"

        except Exception as ex:
            print("出现如下异常%s" % ex)
            error += class2 +","+str(page_number)+ ","+str(count)+","+str(ex) + "\n"


        count = count + 1
    # print(title)

    return total, error  # all product info and errors collected from this page


# Read dh_sub_category.csv into a list of dicts; rows before the global start_line are skipped
def read_category_file(start_class2):
    cat_list = []  # list of category records read from the CSV
    fp = open('dh_sub_category.csv', "rt")  # open the category CSV file

    count= 0
    global start_line
    for line in fp:  # the file object can be iterated line by line
        count += 1
        if count < start_line:  # skip rows before the configured start line
            continue
        d = {}
        data = line.split(',')
        d['line_num'] = data[0]
        d['class1'] = data[1]
        d['class2'] = data[2]
        d['url'] = data[3].strip()  # strip the trailing newline from the URL

        cat_list.append(d)  # add each parsed row to the list
        
    fp.close()
    return cat_list


def get_category_info(line_num,class1, class2, url):
    # url ="http://www.alibaba.com/Animal-Products_pid100003006"

    # total = int(re.compile('(\d+)').search(total).group(1))
    # to crawl all the data, use total + 1
    info =""
    err =""
    this_url=url
    for i in range(start_page,end_page):#(21,41);(41,61)
        #next_page(i)
        status =browse(this_url)              
        if status == 0:  # only scrape the page if it opened successfully
            message,error = get_products(i,class1,class2,line_num)
            info += message
            err  += error
        else:
            err +=  class2 +","+str(i)+ ",0,"+"Unable to browse page, "+this_url+"\n"

        this_url = next_page(i + 1)
        if this_url is None:
            break
        
        
        
    # append this category's product info to the output file
    fw = open(file_name,"a",encoding="utf-8")
    fw.write(info)
    fw.close()

    # append this category's error log to the error file
    fw2 = open(err_log_file,"a",encoding="utf-8")
    fw2.write(err)
    fw2.close()


# Build the URL of page `num` from the "next" link on the current page
def next_page(num):
    try:
        url = soup.find('a', class_="next")
        # print(url)
        # print(url['href'] is None)
        # print(url['href'])
        urls = url['href'].split(";")
        # print(urls[0])
        nu = urls[0].rsplit("-", 1)  # split off the trailing page number from the current "next" URL
        next_url = nu[0] + "-" + str(num) + ".html"
    except Exception as ex:
        print("Exception occurred: %s" % ex)
        next_url = None



    return next_url


def main():

    cat_file = read_category_file(start_class2)  # read the category file, starting from start_line
    for d in cat_file:
        # crawl each category's products; class1 and class2 are the first- and second-level categories
        get_category_info(d['line_num'], d['class1'], d['class2'], d['url'])
        #break  # uncomment to test with a single category
    


if __name__ == '__main__':
    main()
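
Once the crawler has run, the timestamped CSV named by file_name can be inspected, for example with pandas (an assumption here; it is not used by the crawler itself). The column names below are descriptive labels chosen to match the field order written by get_products; they are not defined in the original script.

import glob
import pandas as pd

# Field order as written by get_products(); the labels are added here for readability.
cols = ["source", "page", "class1", "class2", "image_url",
        "product_name", "product_url", "price",
        "seller_name", "seller_url", "line_num"]

# Pick the most recent result file, skipping the error-log files (assumes at least one run has finished).
files = sorted(f for f in glob.glob("DG_gate_*.csv") if not f.endswith("_err_log.csv"))
df = pd.read_csv(files[-1], names=cols, header=None)
print(df.shape)
print(df[["class2", "product_name", "price"]].head())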