Python Crawler Case Study: Scraping DHgate with Selenium
Lab Steps
Find the DHgate website URL
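Before writing any scraping code, it helps to confirm the site answers plain HTTP requests. Below is a minimal sketch; the homepage URL is the one the later scripts build their links on, while the User-Agent header is an assumption added here, since some storefronts reject requests without one.

import requests

# Minimal reachability check for the DHgate homepage.
# The User-Agent string is an assumption, not part of the tutorial's code.
url = "https://www.dhgate.com/"
headers = {"User-Agent": "Mozilla/5.0"}

resp = requests.get(url, headers=headers, timeout=10)
print(resp.status_code)                  # 200 means the page is reachable
print(resp.headers.get("Content-Type"))  # should be an HTML content type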
Scrape DHgate's first- and second-level categories
Using the code below, save DHgate's first-level and second-level categories to dh_category.csv and dh_sub_category.csv respectively:
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.dhgate.com/all-categories/index.html?dspm=pcen.hp.cateall.1.mrTKDf7VqZU0tX5wE4CI&resource_id=#pu1806-all")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

file_name = "dh_category.csv"       # output file for first-level categories
file_name2 = "dh_sub_category.csv"  # output file for second-level categories
category = ""      # buffer for first-level category rows
sub_category = ""  # buffer for second-level category rows

d_list = soup.find('div', class_='first').find_all('dl')  # every <dl> block holds one first-level category
count = 1
count2 = 1
for d in d_list:
    # extract the first-level category
    class1 = d.dt
    class1_title = class1.a.get_text().strip().replace(",", "_")
    class1_url = 'https://www.dhgate.com' + class1.a['href']
    print(class1_title)
    print(class1_url)
    category += str(count) + ',' + class1_title + ',' + class1_url + '\n'

    # extract its second-level categories
    c2_list = d.find_all('dd')
    for i in c2_list:
        class2 = i
        class2_title = class2.a.get_text().strip().replace(",", "_")
        class2_url = 'https://www.dhgate.com' + class2.a['href']
        print(class2_title)
        print(class2_url)
        sub_category += str(count2) + ',' + class1_title + ',' + class2_title + ',' + class2_url + '\n'
        count2 += 1
    count += 1

# the counters start at 1 and are incremented after each item, so subtract 1 for the totals
print("Total first-level categories: " + str(count - 1))
print("Total second-level categories: " + str(count2 - 1))

fw = open(file_name, "w", encoding="utf-8")
fw.write(category)
fw.close()

fw = open(file_name2, "w", encoding="utf-8")
fw.write(sub_category)
fw.close()
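The script above builds the CSV rows by string concatenation and replaces commas in titles with underscores so they do not break the column layout. As an alternative, Python's standard csv module handles quoting automatically; the sketch below assumes the same four columns that dh_sub_category.csv uses (row number, first-level category, second-level category, URL), and the sample row and demo filename are made up for illustration only.

import csv

# Sketch: write and read a category file with the csv module instead of
# manual string concatenation. The sample row is a made-up placeholder,
# and the demo filename avoids overwriting the real dh_sub_category.csv.
rows = [
    [1, "Sample Level-1 Category", "Sample Level-2 Category", "https://www.dhgate.com/"],
]

with open("dh_sub_category_demo.csv", "w", encoding="utf-8", newline="") as f:
    csv.writer(f).writerows(rows)   # commas and quoting are handled for us

# Read it back in the same shape the product crawler expects.
with open("dh_sub_category_demo.csv", encoding="utf-8", newline="") as f:
    for line_num, class1, class2, url in csv.reader(f):
        print(line_num, class1, class2, url)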
Scrape DHgate product listings
Using the dh_sub_category.csv file scraped above, crawl the products of each category one by one:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial parameters
driver = webdriver.Chrome()
# wait = WebDriverWait(driver, 6)

# Timestamp and output file names
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
start_class2 = 'Apparel from Turkey'  # second-level category to start from; by default crawling starts at the first category, change as needed
start_line = 1                        # line of the category file to start from (equivalent to start_class2)
start_page = 1
end_page = start_page + 30            # usually left unchanged
file_name = 'DG_gate_' + now_time + "_" + start_class2 + "_" \
            + str(start_page) + "_" + str(end_page) + '.csv'
err_log_file = 'DG_gate_' + now_time + "_" + start_class2 + "_" \
               + str(start_page) + "_" + str(end_page) + '_err_log.csv'
soup = ""


# Open a URL in the browser and scroll down so lazy-loaded content appears
def browse(url):
    try:
        driver.get(url)
        for i in range(1, 11):
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")
            time.sleep(0.25)
        # time.sleep(60)  # if the site shows a CAPTCHA, leave 60 seconds to solve it by hand
        print("Page loaded successfully!")
        global soup
        soup = BeautifulSoup(driver.page_source, "html.parser")
        return 0
    except Exception as ex:
        print("Exception occurred: %s" % ex)
        return 1


# Extract the product information from the current page
def get_products(page_number, class1, class2, line_num):
    div_list = soup.find_all('div', class_='gitem')
    print(len(div_list))
    total = ""   # collects every product row on this page
    error = ""
    count = 1    # position of the item on this page
    for div in div_list:
        try:
            item_info = ""
            item_info += "DG_gate_," + str(page_number) + ","  # record the page number
            item_info += class1 + "," + class2 + ","           # record the first- and second-level categories

            # product image
            img = div.find('img')
            print(str(count) + ":" + img['src'] + ",")
            item_info += img['src'] + ","                      # add the image URL

            # title
            title = div.find('h3', 'pro-title').a
            product_name = title.get_text().strip()
            product_name = product_name.replace(",", "_")
            product_url = title['href']
            item_info += product_name + "," + product_url + ","  # add the name and link

            # price
            price_info = div.find('li', 'price').text
            price_info = price_info.replace(",", "_")
            item_info += price_info + ","

            # seller information
            supplier = div.find('div', 'seller-title').a
            s_name = supplier.get_text().strip()
            s_name = s_name.replace(",", "_")
            s_href = supplier['href']
            s_href = s_href.replace(",", "_")
            item_info += s_name + "," + s_href

            # finish this row
            total += item_info + "," + str(line_num) + "\n"
        except Exception as ex:
            print("Exception occurred: %s" % ex)
            error += class2 + "," + str(page_number) + "," + str(count) + "," + str(ex) + "\n"
        count = count + 1
    return total, error


# Read the category file produced in the previous step
def read_category_file(start_class2):
    cat_list = []   # list of category records
    fp = open('dh_sub_category.csv', "rt")
    count = 0
    global start_line
    for line in fp:             # the file object is iterable line by line
        count += 1
        if count < start_line:  # skip lines before the configured starting line
            continue
        d = {}
        data = line.split(',')
        d['line_num'] = data[0]
        d['class1'] = data[1]
        d['class2'] = data[2]
        d['url'] = data[3].strip()  # strip the trailing newline from the URL
        cat_list.append(d)          # append each parsed line to the list
    fp.close()
    return cat_list


# Crawl every page of one second-level category
def get_category_info(line_num, class1, class2, url):
    info = ""
    err = ""
    this_url = url
    for i in range(start_page, end_page):  # e.g. (21, 41) or (41, 61) for later batches
        status = browse(this_url)
        if status == 0:   # only parse the page if it opened successfully
            message, error = get_products(i, class1, class2, line_num)
            info += message
            err += error
        else:
            err += class2 + "," + str(i) + ",0," + "Unable to browse page, " + this_url + "\n"
        this_url = next_page(i + 1)
        if this_url is None:
            break

    # write this category's product rows to the output file
    fw = open(file_name, "a", encoding="utf-8")
    fw.write(info)
    fw.close()

    # write this category's errors to the error log
    fw2 = open(err_log_file, "a", encoding="utf-8")
    fw2.write(err)
    fw2.close()


# Build the URL of page `num` from the "next" link of the current page
def next_page(num):
    try:
        url = soup.find('a', class_="next")
        urls = url['href'].split(";")
        nu = urls[0].rsplit("-", 1)
        next_url = nu[0] + "-" + str(num) + ".html"
    except Exception as ex:
        print("Exception occurred: %s" % ex)
        next_url = None
    return next_url


def main():
    cat_file = read_category_file(start_class2)  # read categories starting from start_line
    for d in cat_file:  # crawl every category; class1 and class2 are its first- and second-level categories
        get_category_info(d['line_num'], d['class1'], d['class2'], d['url'])
        # break  # uncomment to crawl only the first category

if __name__ == '__main__':
    main()