Python Crawler Case Study: Crawling Alibaba International Station (alibaba.com) with Selenium
Lab Steps
1. Use Selenium to simulate visiting the site
2. Scrape the product information on the page
3. Turn to the next page and scrape its products, repeating until every page has been crawled (see the sketch below)
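The three steps map onto a very small Selenium plus BeautifulSoup skeleton. The sketch below reuses the chromedriver path and the category URL assumed throughout this article; the full script further down adds waits, scrolling and CSV output, so this is only a structural outline:

from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
driver.get("http://www.alibaba.com/Animal-Products_pid100003006")            # step 1: open the listing page
soup = BeautifulSoup(driver.page_source, "html.parser")                      # step 2: parse the product cards
print(len(soup.find_all('div', class_='organic-gallery-offer-outter')))
driver.get("http://www.alibaba.com/Animal-Products_pid100003006?page=2")     # step 3: next page via the URL
driver.quit()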
Lab Procedure
Importing packages and setting initial parameters
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial parameter settings
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)

# Timestamp used in the output file name
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
file_name = 'alibaba' + now_time + '.csv'
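The call above passes the chromedriver path directly, which is the Selenium 3 style. If you run a newer Selenium (4.x), the path goes through a Service object instead; a minimal sketch, assuming Selenium 4 and the same chromedriver path:

# Sketch only: assumes Selenium 4.x is installed; the path is the one used above.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
options = webdriver.ChromeOptions()
# options.add_argument("--headless")   # optional: run without opening a browser window
driver = webdriver.Chrome(service=service, options=options)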
Simulating a visit to the site from its URL
# Visit the site at the given url with the simulated browser
def browse(url):
    try:
        driver.get(url)
        scroll()
        #time.sleep(60)  # the site sometimes shows a CAPTCHA; leave 60 seconds to fill it in by hand
        print("Visited successfully!")
        #total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        #get_products()
        #return total.text
        return "ok"
    except TimeoutException:
        return browse(url)

def scroll():  # scroll the page down step by step to trigger lazy loading
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")
        #time.sleep(2)
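To test this step on its own, browse() can be called directly with the category URL that main() uses later; a quick check, assuming the driver created above:

if __name__ == '__main__':
    browse("http://www.alibaba.com/Animal-Products_pid100003006")
    print(driver.title)   # print the page title to confirm the listing actually loaded
    driver.quit()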
Scraping product information
# Extract the product information on the current page
def get_products(page_number):
    #wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    div_list = soup.find_all('div', class_='organic-gallery-offer-outter')
    print(len(div_list))
    total = ""
    count = 1
    for i in div_list:
        try:
            total += str(page_number) + ","   # record which results page this product came from

            # product image
            img = i.find('img')
            #print(str(count) + ":" + img['src'])
            total += img['src'] + ","

            # product title (commas replaced so they do not break the CSV columns)
            title = i.find('h4').get_text().strip()
            title = title.replace(",", "_")
            #print(str(count) + ":" + title)
            total += title + ","

            # price information
            detail = i.find('div', class_='organic-gallery-offer-section__price') \
                      .find_all('span')
            for d in detail:
                di = d.get_text().strip()
                di = di.replace(",", "_")
                #print(str(count) + ":" + di)
                total += di + ","

            # supplier information
            supplier = i.find('div', class_='organic-gallery-offer__seller-section').a
            s_name = supplier['title'];  s_name = s_name.replace(",", "_")
            s_href = supplier['href'];  s_href = s_href.replace(",", "_")
            #print(s_name + "," + s_href + ",")
            total += s_name + "," + s_href + ","

            # supplier country and years in business
            country = i.find('span', class_='seller-tag__country')
            if country is not None:
                country = country['title']
            else:
                country = "NoneType"
            country = country.replace(",", "_")
            year = i.find('span', class_='seller-tag__year').get_text().strip()
            year = year.replace(",", "_")
            print(country + "," + year + ",")
            total += (country + "," + year + ",")

            # end of this product's row
            total += "\n"
        except Exception as ex:
            print("Caught exception: %s" % ex)
            total += "\n"  # keep the row terminated even when a field is missing
        count = count + 1
        #print(title)
    fw = open(file_name, "a", encoding="utf-8")
    fw.write(total)
    fw.close()
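Commas inside titles and supplier names are replaced with '_' above because each row is assembled by hand as one comma-separated string. Since csv is already imported, an alternative is to collect every product as a list of fields and let csv.writer quote the commas; a minimal sketch (write_rows and the field order are illustrative, not part of the original script):

def write_rows(rows):
    # rows: a list of field lists, e.g. [page, img_src, title, prices..., s_name, s_href, country, year]
    with open(file_name, "a", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)  # fields containing commas are quoted automatically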
Jumping to the next page
# Jump to the next results page by filling in the page-jump box and clicking Go
def next_page(page_number):
    try:
        input = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div/div[2]/input')))
        submit = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div/div[2]/a')))
        #input = wait.until(EC.presence_of_element_located(
        #    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        #submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        #wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        print("Now crawling page " + str(page_number) + " ...")
        scroll()  # scroll the newly loaded page
        get_products(page_number)
    except TimeoutException:
        next_page(page_number)
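The XPath locators above target Alibaba's page-jump box and break easily when the page layout changes. The main() function below actually avoids them by putting the page number directly into the URL; the equivalent loop looks like this:

# URL-based pagination, as used in main(): no clicking needed
url = "http://www.alibaba.com/Animal-Products_pid100003006"
for i in range(2, 6):
    browse(url + "?page=" + str(i))
    get_products(i)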
The main() function and how it is called
def main():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    url = "http://www.alibaba.com/Animal-Products_pid100003006"
    browse(url)
    get_products(1)
    #total = int(re.compile('(\d+)').search(total).group(1))
    # to crawl every page, use total + 1 as the upper bound of the range
    for i in range(2, 6):
        #next_page(i)
        browse(url + "?page=" + str(i))
        get_products(i)
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)

if __name__ == '__main__':
    main()
Complete code
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial parameter settings
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)

# Timestamp used in the output file name
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
file_name = 'alibaba' + now_time + '.csv'

# Visit the site at the given url with the simulated browser
def browse(url):
    try:
        driver.get(url)
        scroll()
        #time.sleep(60)  # the site sometimes shows a CAPTCHA; leave 60 seconds to fill it in by hand
        print("Visited successfully!")
        #total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        #get_products()
        #return total.text
        return "ok"
    except TimeoutException:
        return browse(url)

def scroll():  # scroll the page down step by step to trigger lazy loading
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")
        #time.sleep(2)

# Extract the product information on the current page
def get_products(page_number):
    #wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    div_list = soup.find_all('div', class_='organic-gallery-offer-outter')
    print(len(div_list))
    total = ""
    count = 1
    for i in div_list:
        try:
            total += str(page_number) + ","   # record which results page this product came from

            # product image
            img = i.find('img')
            #print(str(count) + ":" + img['src'])
            total += img['src'] + ","

            # product title (commas replaced so they do not break the CSV columns)
            title = i.find('h4').get_text().strip()
            title = title.replace(",", "_")
            #print(str(count) + ":" + title)
            total += title + ","

            # price information
            detail = i.find('div', class_='organic-gallery-offer-section__price') \
                      .find_all('span')
            for d in detail:
                di = d.get_text().strip()
                di = di.replace(",", "_")
                #print(str(count) + ":" + di)
                total += di + ","

            # supplier information
            supplier = i.find('div', class_='organic-gallery-offer__seller-section').a
            s_name = supplier['title'];  s_name = s_name.replace(",", "_")
            s_href = supplier['href'];  s_href = s_href.replace(",", "_")
            #print(s_name + "," + s_href + ",")
            total += s_name + "," + s_href + ","

            # supplier country and years in business
            country = i.find('span', class_='seller-tag__country')
            if country is not None:
                country = country['title']
            else:
                country = "NoneType"
            country = country.replace(",", "_")
            year = i.find('span', class_='seller-tag__year').get_text().strip()
            year = year.replace(",", "_")
            print(country + "," + year + ",")
            total += (country + "," + year + ",")

            # end of this product's row
            total += "\n"
        except Exception as ex:
            print("Caught exception: %s" % ex)
            total += "\n"  # keep the row terminated even when a field is missing
        count = count + 1
        #print(title)
    fw = open(file_name, "a", encoding="utf-8")
    fw.write(total)
    fw.close()

# Jump to the next results page by filling in the page-jump box and clicking Go
def next_page(page_number):
    try:
        input = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div/div[2]/input')))
        submit = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div/div[2]/a')))
        #input = wait.until(EC.presence_of_element_located(
        #    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        #submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        #wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        print("Now crawling page " + str(page_number) + " ...")
        scroll()  # scroll the newly loaded page
        get_products(page_number)
    except TimeoutException:
        next_page(page_number)

def main():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    url = "http://www.alibaba.com/Animal-Products_pid100003006"
    browse(url)
    get_products(1)
    #total = int(re.compile('(\d+)').search(total).group(1))
    # to crawl every page, use total + 1 as the upper bound of the range
    for i in range(2, 6):
        #next_page(i)
        browse(url + "?page=" + str(i))
        get_products(i)
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)

if __name__ == '__main__':
    main()
Crawling products site-wide from a category list
The category list alibaba_categary.csv used here is produced by the article Python beautifulsoup4库 解析阿里巴巴分类网址 (parsing the Alibaba category URLs with the beautifulsoup4 library).
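read_category_file() in the script below expects every line of alibaba_categary.csv to carry four comma-separated fields; the bracketed values here are placeholders for illustration only:

class1,class2,class3,class3_url
<level-1 category>,<level-2 category>,Holiday Gifts,<listing URL of that third-level category>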
Before each run, adjust two parameters: start_class3 and start_page.
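For example, to resume crawling from the 'Holiday Gifts' third-level category at results page 21 (the second batch of 20 pages), edit the two assignments near the top of the script; the values here are only an illustration:

start_class3 = 'Holiday Gifts'   # resume from this third-level category
start_page = 21                  # first results page of this run; end_page = start_page + 20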
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial parameter settings
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)

# Timestamp, starting point and output file name
now_time = time.strftime("%m%d%H%M%S", time.localtime())
start_class3 = 'Holiday Gifts'  # third-level category to start crawling from; defaults to the first one, change as needed
start_page = 1
end_page = start_page + 20      # usually left unchanged
file_name = 'Alibaba_' + now_time + "_" + start_class3 + "_" \
            + str(start_page) + "_" + str(end_page) + '.csv'
begin = 0  # flag: set to 1 once start_class3 has been reached

# Visit the site at the given url with the simulated browser
def browse(url):
    try:
        driver.get(url)
        scroll()
        #time.sleep(60)  # the site sometimes shows a CAPTCHA; leave 60 seconds to fill it in by hand
        print("Visited successfully!")
        #total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        #get_products()
        #return total.text
        return "ok"
    except TimeoutException:
        return browse(url)

def scroll():  # scroll the page down step by step to trigger lazy loading
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")
        time.sleep(0.25)
        #time.sleep(2)

# Extract the product information on the current page
def get_products(page_number, class1, class2, class3):
    #wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    div_list = soup.find_all('div', class_='organic-gallery-offer-outter')
    print(len(div_list))
    total = ""
    count = 1
    for i in div_list:
        try:
            total += str(page_number) + ","                       # which results page this product came from
            total += class1 + "," + class2 + "," + class3 + ","   # its level-1/2/3 categories

            # product image
            img = i.find('img')
            #print(str(count) + ":" + img['src'])
            total += img['src'] + ","

            # product title (commas replaced so they do not break the CSV columns)
            title = i.find('h4').get_text().strip()
            title = title.replace(",", "_")
            #print(str(count) + ":" + title)
            total += title + ","

            # price information
            detail = i.find('div', class_='organic-gallery-offer-section__price') \
                      .find_all('span')
            for d in detail:
                di = d.get_text().strip()
                di = di.replace(",", "_")
                #print(str(count) + ":" + di)
                total += di + ","

            # supplier information
            supplier = i.find('div', class_='organic-gallery-offer__seller-section').a
            s_name = supplier['title'];  s_name = s_name.replace(",", "_")
            s_href = supplier['href'];  s_href = s_href.replace(",", "_")
            #print(s_name + "," + s_href + ",")
            total += s_name + "," + s_href + ","

            # supplier country and years in business
            country = i.find('span', class_='seller-tag__country')
            if country is not None:
                country = country['title']
            else:
                country = "NoneType"
            country = country.replace(",", "_")
            year = i.find('span', class_='seller-tag__year')
            if year is not None:
                year = year.get_text().strip()
            else:
                year = "NoneType"
            year = year.replace(",", "_")
            #print(country + "," + year + ",")
            total += (country + "," + year + ",")

            # end of this product's row
            total += "\n"
        except Exception as ex:
            print("Caught exception: %s" % ex)
            total += "\n"  # keep the row terminated even when a field is missing
        count = count + 1
        #print(title)
    return total  # all product rows collected from this page

# Jump to the next results page by clicking the page-jump box
# (not used in this version: main() pages by appending ?page=N to the URL instead)
def next_page(page_number):
    try:
        input = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div/div[2]/input')))
        submit = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div/div[2]/a')))
        #input = wait.until(EC.presence_of_element_located(
        #    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
        #submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        #wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        print("Now crawling page " + str(page_number) + " ...")
        scroll()  # scroll the newly loaded page
        get_products(page_number)
    except TimeoutException:
        next_page(page_number)

def read_category_file(start_class3):
    cat_list = []  # list of category records
    fp = open('alibaba_categary.csv', "rt")
    # each line of the csv holds the fields class1,class2,class3,class3_url
    for line in fp:  # a file object can be iterated line by line
        d = {}
        data = line.split(',')
        d['class1'] = data[0]
        d['class2'] = data[1]
        d['class3'] = data[2]
        d['url'] = data[3].strip()  # strip the trailing newline from the url field
        if d['class3'] == 'View More':
            continue
        global begin
        if d['class3'] == start_class3:  # only start collecting once start_class3 is reached
            begin = 1
        elif begin == 0:
            continue
        cat_list.append(d)  # append this category record to the list
    fp.close()
    return cat_list

def get_category_info(class1, class2, class3, url):
    #url = "http://www.alibaba.com/Animal-Products_pid100003006"
    #total = int(re.compile('(\d+)').search(total).group(1))
    info = ""
    for i in range(start_page, end_page):  # later batches: (21,41); (41,61)
        #next_page(i)
        browse(url + "?page=" + str(i))
        info += get_products(i, class1, class2, class3)
    # write this category's rows to the file
    fw = open(file_name, "a", encoding="utf-8")
    fw.write(info)
    fw.close()

def main():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    # 'alibaba_categary.csv'
    cat_file = read_category_file(start_class3)  # read the categories, starting from start_class3
    for d in cat_file:
        # crawl each category's products; class1/2/3 are its level-1/2/3 category names
        get_category_info(d['class1'], d['class2'], d['class3'], d['url'])
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)

if __name__ == '__main__':
    main()