Python Crawler Case Study: Scraping the Alibaba International Site with Selenium
Training Steps
1. Use Selenium to simulate a visit to the website.
2. Scrape the product information on the current page.
3. Turn the page and scrape the next page's products, repeating until every page has been scraped (a sketch of this overall flow follows the list).
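The three steps map directly onto the functions developed below. As an overview, here is a minimal sketch of the whole flow, assuming the browse and get_products functions defined in the following sections and a known last page number:

def crawl(base_url, last_page):
    browse(base_url)       # step 1: open the site with Selenium
    get_products(1)        # step 2: scrape the first page of products
    for page in range(2, last_page + 1):
        # step 3: move to the next page (here via the ?page= URL parameter)
        browse(base_url + "?page=" + str(page))
        get_products(page)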
Training Process
Importing Packages and Initial Parameter Settings
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
from bs4 import BeautifulSoup

# Initial parameter settings
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)

# Timestamp and file name for saving the results
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
file_name = 'alibaba' + now_time + '.csv'
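Note that passing the chromedriver path positionally works in Selenium 3 but was removed in Selenium 4, where the path goes into a Service object instead. A minimal sketch of the equivalent Selenium 4 initialization, assuming the same chromedriver location:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: wrap the driver path in a Service object
service = Service("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
driver = webdriver.Chrome(service=service)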
Simulating a Visit to the Site by URL
# Simulate visiting the site at the given url
def browse(url):
    try:
        driver.get(url)
        scroll()
        # time.sleep(60)  # the site may show a captcha; leave 60 seconds to enter it manually
        print("Visit succeeded!")
        return "ok"
    except TimeoutException:
        return browse(url)

def scroll():  # simulate scrolling down the page, in ten increments
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight*" + str(i) + "/10)")
        # time.sleep(2)
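Because the product gallery lazy-loads as the page scrolls, the commented-out time.sleep(2) may be needed in practice. A hedged variant that pauses between increments and then waits for at least one product card (the pause length is an assumption; the selector matches the class used in get_products below):

def scroll_and_wait(pause=0.5):
    # Scroll in ten increments, pausing so lazy-loaded product cards can render
    for i in range(1, 11):
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight * arguments[0] / 10);", i)
        time.sleep(pause)
    # Wait until at least one product card is present in the DOM
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "div.organic-gallery-offer-outter")))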
Scraping Product Information
# Get the product information from the current page
def get_products(page_number):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    div_list = soup.find_all('div', class_='organic-gallery-offer-outter')
    print(len(div_list))
    total = ""
    count = 1
    for i in div_list:
        try:
            total += str(page_number) + ","  # record which page this row came from
            # product image
            img = i.find('img')
            total += img['src'] + ","
            # product title; replace commas so the CSV columns stay aligned
            title = i.find('h4').get_text().strip()
            title = title.replace(",", "_")
            total += title + ","
            # price information
            detail = i.find('div', class_='organic-gallery-offer-section__price') \
                      .find_all('span')
            for d in detail:
                di = d.get_text().strip()
                di = di.replace(",", "_")
                total += di + ","
            # supplier information
            supplier = i.find('div', class_='organic-gallery-offer__seller-section').a
            s_name = supplier['title'].replace(",", "_")
            s_href = supplier['href'].replace(",", "_")
            total += s_name + "," + s_href + ","
            # country and years in business
            country = i.find('span', class_='seller-tag__country')
            if country is not None:
                country = country['title']
            else:
                country = "NoneType"
            country = country.replace(",", "_")
            year = i.find('span', class_='seller-tag__year').get_text().strip()
            year = year.replace(",", "_")
            print(country + "," + year + ",")
            total += country + "," + year + ","
            # end of this product's row
            total += "\n"
        except Exception as ex:
            print("Exception occurred: %s" % ex)
        count = count + 1  # product counter, kept for debugging
    with open(file_name, "a", encoding="utf-8") as fw:
        fw.write(total)
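Building CSV rows by hand and replacing commas with underscores loses information; Python's csv module quotes such fields automatically. A minimal sketch of writing one product row this way (write_row and its parameters are illustrative names, not part of the original code):

import csv

def write_row(page_number, img_src, title, prices, s_name, s_href, country, year):
    # csv.writer quotes fields containing commas, so no replace(",", "_") is needed
    with open(file_name, "a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([page_number, img_src, title, *prices,
                         s_name, s_href, country, year])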
Jumping to the Next Page
# Jump to the next page through the pagination input box
def next_page(page_number):
    try:
        input_box = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/input")))
        submit = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/a")))
        input_box.clear()
        input_box.send_keys(page_number)
        submit.click()
        print("Now scraping page " + str(page_number) + " ...")
        scroll()  # scroll through the new page
        get_products(page_number)
    except TimeoutException:
        next_page(page_number)
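Both browse and next_page retry by calling themselves on TimeoutException, which can recurse without bound if the site keeps timing out. A bounded-retry variant of next_page is sketched below (the retry count of 3 is an assumption):

def next_page_bounded(page_number, retries=3):
    # Same steps as next_page, but give up after a fixed number of timeouts
    for attempt in range(1, retries + 1):
        try:
            input_box = wait.until(EC.presence_of_element_located(
                (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/input")))
            submit = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/a")))
            input_box.clear()
            input_box.send_keys(page_number)
            submit.click()
            scroll()
            get_products(page_number)
            return True
        except TimeoutException:
            print("Timeout on attempt %d for page %d" % (attempt, page_number))
    return False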
The main Function and Invocation
def main():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    url = "http://www.alibaba.com/Animal-Products_pid100003006"
    browse(url)
    get_products(1)
    # Pages 2-5 are reached through the ?page= URL parameter instead of next_page(),
    # which avoids depending on the pagination widget
    for i in range(2, 6):
        # next_page(i)  # alternative: jump through the pagination input box
        browse(url + "?page=" + str(i))
        get_products(i)
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)

if __name__ == '__main__':
    main()
Full Code
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
from bs4 import BeautifulSoup

# Initial parameter settings
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)

# Timestamp and file name for saving the results
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
file_name = 'alibaba' + now_time + '.csv'

# Simulate visiting the site at the given url
def browse(url):
    try:
        driver.get(url)
        scroll()
        # time.sleep(60)  # the site may show a captcha; leave 60 seconds to enter it manually
        print("Visit succeeded!")
        return "ok"
    except TimeoutException:
        return browse(url)

def scroll():  # simulate scrolling down the page, in ten increments
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight*" + str(i) + "/10)")
        # time.sleep(2)

# Get the product information from the current page
def get_products(page_number):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    div_list = soup.find_all('div', class_='organic-gallery-offer-outter')
    print(len(div_list))
    total = ""
    count = 1
    for i in div_list:
        try:
            total += str(page_number) + ","  # record which page this row came from
            # product image
            img = i.find('img')
            total += img['src'] + ","
            # product title; replace commas so the CSV columns stay aligned
            title = i.find('h4').get_text().strip()
            title = title.replace(",", "_")
            total += title + ","
            # price information
            detail = i.find('div', class_='organic-gallery-offer-section__price') \
                      .find_all('span')
            for d in detail:
                di = d.get_text().strip()
                di = di.replace(",", "_")
                total += di + ","
            # supplier information
            supplier = i.find('div', class_='organic-gallery-offer__seller-section').a
            s_name = supplier['title'].replace(",", "_")
            s_href = supplier['href'].replace(",", "_")
            total += s_name + "," + s_href + ","
            # country and years in business
            country = i.find('span', class_='seller-tag__country')
            if country is not None:
                country = country['title']
            else:
                country = "NoneType"
            country = country.replace(",", "_")
            year = i.find('span', class_='seller-tag__year').get_text().strip()
            year = year.replace(",", "_")
            print(country + "," + year + ",")
            total += country + "," + year + ","
            # end of this product's row
            total += "\n"
        except Exception as ex:
            print("Exception occurred: %s" % ex)
        count = count + 1  # product counter, kept for debugging
    with open(file_name, "a", encoding="utf-8") as fw:
        fw.write(total)

# Jump to the next page through the pagination input box
def next_page(page_number):
    try:
        input_box = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/input")))
        submit = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/a")))
        input_box.clear()
        input_box.send_keys(page_number)
        submit.click()
        print("Now scraping page " + str(page_number) + " ...")
        scroll()  # scroll through the new page
        get_products(page_number)
    except TimeoutException:
        next_page(page_number)

def main():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    url = "http://www.alibaba.com/Animal-Products_pid100003006"
    browse(url)
    get_products(1)
    # Pages 2-5 are reached through the ?page= URL parameter instead of next_page(),
    # which avoids depending on the pagination widget
    for i in range(2, 6):
        # next_page(i)  # alternative: jump through the pagination input box
        browse(url + "?page=" + str(i))
        get_products(i)
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)

if __name__ == '__main__':
    main()