Python Web Scraping Example: Scraping the Alibaba International Site with Selenium


Lab Steps

1. Use Selenium to simulate visiting the site

2. Scrape the product information on the page

3. Page forward and scrape the next page's products, repeating until every page has been crawled

Lab Procedure

Importing packages and setting initial parameters

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial parameter setup
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")  # path to chromedriver (Selenium 3 style)
wait = WebDriverWait(driver, 10)
# Timestamp and file name used when saving results
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
file_name = 'alibaba' + now_time + '.csv'
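
The driver construction above uses the Selenium 3 style, where the chromedriver path is passed positionally. If you run Selenium 4 or newer, the path goes through a Service object instead. A minimal sketch of the equivalent setup follows (it assumes the same local chromedriver path, which must be adjusted for your machine):

# Selenium 4-style initialization (sketch; adjust the chromedriver path to your machine)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait

service = Service("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
driver = webdriver.Chrome(service=service)
wait = WebDriverWait(driver, 10)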


Visiting the site by URL

# Visit the site at the given URL
def browse(url):
    try:
        driver.get(url)
        scroll()
        # time.sleep(60)  # the site may show a CAPTCHA; leave time to fill it in manually
        print("Visited successfully!")
        return "ok"
    except TimeoutException:
        return browse(url)  # retry on timeout (note: unbounded recursion)

def scroll():  # simulate scrolling down the page so lazy-loaded content renders
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")
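
scroll() only issues the scroll commands and returns immediately, so product cards may not have rendered by the time the page source is parsed. A cautious variant, sketched below, reuses the wait object to block until at least one product card is present; the selector is the same organic-gallery-offer-outter class parsed later in get_products() and may need updating if the page layout changes.

# Sketch: wait for product cards to appear before parsing (selector taken from get_products)
def wait_for_products():
    try:
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.organic-gallery-offer-outter")))
    except TimeoutException:
        print("No product cards appeared within the timeout.")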

Scraping product information

# Extract product information from the current page
def get_products(page_number):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    div_list = soup.find_all('div', class_='organic-gallery-offer-outter')
    print(len(div_list))
    total = ""

    count = 1
    for i in div_list:
        try:
            total += str(page_number) + ","  # record which page this row came from

            # product image
            img = i.find('img')
            total += img['src'] + ","

            # product title (commas replaced so the CSV stays intact)
            title = i.find('h4').get_text().strip()
            title = title.replace(",", "_")
            total += title + ","

            # price information
            detail = i.find('div', class_='organic-gallery-offer-section__price') \
                      .find_all('span')
            for d in detail:
                di = d.get_text().strip()
                di = di.replace(",", "_")
                total += di + ","

            # supplier name and link
            supplier = i.find('div', class_='organic-gallery-offer__seller-section').a
            s_name = supplier['title'].replace(",", "_")
            s_href = supplier['href'].replace(",", "_")
            total += s_name + "," + s_href + ","

            # supplier country and years in business
            country = i.find('span', class_='seller-tag__country')
            if country is not None:
                country = country['title']
            else:
                country = "NoneType"
            country = country.replace(",", "_")
            year = i.find('span', class_='seller-tag__year').get_text().strip()
            year = year.replace(",", "_")
            print(country + "," + year + ",")
            total += (country + "," + year + ",")

            # end of row
            total += "\n"

        except Exception as ex:
            print("Exception occurred: %s" % ex)

        count = count + 1

    # append this page's rows to the CSV file
    fw = open(file_name, "a", encoding="utf-8")
    fw.write(total)
    fw.close()
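
get_products() builds each row by hand and replaces commas with underscores so the output stays a valid CSV. The csv module is already imported but never used; as an alternative, a writer can quote fields automatically, which avoids altering titles and prices that contain commas. A minimal sketch, assuming the same fields are collected into a list first:

# Sketch: write one product per row with csv.writer instead of manual comma escaping
def save_row(fields):
    # fields: e.g. [page_number, img_src, title, price_parts..., s_name, s_href, country, year]
    with open(file_name, "a", encoding="utf-8", newline="") as f:
        csv.writer(f).writerow(fields)  # commas inside fields are quoted automatically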
  

Jumping to the next page

# Jump to the next page
def next_page(page_number):
    try:
        input = wait.until(EC.presence_of_element_located \
                           ((By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/input")))
        submit = wait.until(EC.element_to_be_clickable \
                            ((By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/a")))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        print("Now crawling page " + str(page_number) + "...")
        scroll()  # scroll the new page so it fully renders
        get_products(page_number)
    except TimeoutException:
        next_page(page_number)  # retry on timeout
        

The main() function and entry point

def main():
    start = time.perf_counter()
    url = "http://www.alibaba.com/Animal-Products_pid100003006"
    browse(url)
    get_products(1)
    # crawl pages 2-5 by visiting each page URL directly
    for i in range(2, 6):
        browse(url + "?page=" + str(i))
        get_products(i)

    elapsed = (time.perf_counter() - start)
    print("Time used:", elapsed)
   


if __name__ == '__main__':
    main()

Complete code

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial parameter setup
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)
# Timestamp and file name used when saving results
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
file_name = 'alibaba' + now_time + '.csv'

# Visit the site at the given URL
def browse(url):
    try:
        driver.get(url)
        scroll()
        # time.sleep(60)  # the site may show a CAPTCHA; leave time to fill it in manually
        print("Visited successfully!")
        return "ok"
    except TimeoutException:
        return browse(url)  # retry on timeout (note: unbounded recursion)

def scroll():  # simulate scrolling down the page so lazy-loaded content renders
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")

# Extract product information from the current page
def get_products(page_number):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    div_list = soup.find_all('div', class_='organic-gallery-offer-outter')
    print(len(div_list))
    total = ""

    count = 1
    for i in div_list:
        try:
            total += str(page_number) + ","  # record which page this row came from

            # product image
            img = i.find('img')
            total += img['src'] + ","

            # product title (commas replaced so the CSV stays intact)
            title = i.find('h4').get_text().strip()
            title = title.replace(",", "_")
            total += title + ","

            # price information
            detail = i.find('div', class_='organic-gallery-offer-section__price') \
                      .find_all('span')
            for d in detail:
                di = d.get_text().strip()
                di = di.replace(",", "_")
                total += di + ","

            # supplier name and link
            supplier = i.find('div', class_='organic-gallery-offer__seller-section').a
            s_name = supplier['title'].replace(",", "_")
            s_href = supplier['href'].replace(",", "_")
            total += s_name + "," + s_href + ","

            # supplier country and years in business
            country = i.find('span', class_='seller-tag__country')
            if country is not None:
                country = country['title']
            else:
                country = "NoneType"
            country = country.replace(",", "_")
            year = i.find('span', class_='seller-tag__year').get_text().strip()
            year = year.replace(",", "_")
            print(country + "," + year + ",")
            total += (country + "," + year + ",")

            # end of row
            total += "\n"

        except Exception as ex:
            print("Exception occurred: %s" % ex)

        count = count + 1

    # append this page's rows to the CSV file
    fw = open(file_name, "a", encoding="utf-8")
    fw.write(total)
    fw.close()

# Jump to the next page
def next_page(page_number):
    try:
        input = wait.until(EC.presence_of_element_located \
                           ((By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/input")))
        submit = wait.until(EC.element_to_be_clickable \
                            ((By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/a")))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        print("Now crawling page " + str(page_number) + "...")
        scroll()  # scroll the new page so it fully renders
        get_products(page_number)
    except TimeoutException:
        next_page(page_number)  # retry on timeout


def main():
    start = time.perf_counter()
    url = "http://www.alibaba.com/Animal-Products_pid100003006"
    browse(url)
    get_products(1)
    # crawl pages 2-5 by visiting each page URL directly
    for i in range(2, 6):
        browse(url + "?page=" + str(i))
        get_products(i)

    elapsed = (time.perf_counter() - start)
    print("Time used:", elapsed)


if __name__ == '__main__':
    main()


Scraping the whole site from a category list

The category list alibaba_categary.csv used here was obtained by parsing Alibaba's category page with the Python beautifulsoup4 library.

Before each run, adjust two parameters: start_class3 and start_page.
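
The category file itself is not produced by this script. Purely as an illustration of the general shape of such a generator, the sketch below fetches a category page with requests, walks hypothetical level-1/2/3 elements with beautifulsoup4, and writes class1,class2,class3,class3_url rows. The URL and every CSS selector here are placeholders, not Alibaba's real markup, and would have to be replaced after inspecting the actual category page.

# Rough sketch of building alibaba_categary.csv (URL and selectors are placeholders)
import csv
import requests
from bs4 import BeautifulSoup

def build_category_file(category_page_url, out_file="alibaba_categary.csv"):
    html = requests.get(category_page_url, timeout=30).text
    soup = BeautifulSoup(html, "html.parser")
    with open(out_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        # hypothetical structure: each level-1 block contains level-2 blocks,
        # which contain <a> links to the level-3 category pages
        for c1 in soup.select("div.level1"):           # placeholder selector
            class1 = c1.select_one("h3").get_text(strip=True)
            for c2 in c1.select("div.level2"):         # placeholder selector
                class2 = c2.select_one("h4").get_text(strip=True)
                for a in c2.select("a"):
                    writer.writerow([class1, class2, a.get_text(strip=True), a["href"]])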

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import time
from bs4 import BeautifulSoup
import csv

# Initial parameter setup
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)
# Timestamp and file name used when saving results
now_time = time.strftime("%m%d%H%M%S", time.localtime())
start_class3 = 'Holiday Gifts'  # start crawling from this third-level category; change as needed (by default, crawl from the first third-level category)
start_page = 1
end_page = start_page + 20  # usually left unchanged
file_name = 'Alibaba_' + now_time + "_" + start_class3 + "_" \
            + str(start_page) + "_" + str(end_page) + '.csv'
begin = 0  # flag: whether crawling has started

# Visit the site at the given URL
def browse(url):
    try:
        driver.get(url)
        scroll()
        # time.sleep(60)  # the site may show a CAPTCHA; leave time to fill it in manually
        print("Visited successfully!")
        return "ok"
    except TimeoutException:
        return browse(url)  # retry on timeout (note: unbounded recursion)

def scroll():  # simulate scrolling down the page so lazy-loaded content renders
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight*" + str(i) + "/10)")
        time.sleep(0.25)
        

        
# Extract product information from the current page
def get_products(page_number, class1, class2, class3):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    div_list = soup.find_all('div', class_='organic-gallery-offer-outter')
    print(len(div_list))
    total = ""

    count = 1
    for i in div_list:
        try:
            total += str(page_number) + ","  # record which page this row came from

            total += class1 + "," + class2 + "," + class3 + ","  # record its level-1/2/3 categories

            # product image
            img = i.find('img')
            total += img['src'] + ","

            # product title (commas replaced so the CSV stays intact)
            title = i.find('h4').get_text().strip()
            title = title.replace(",", "_")
            total += title + ","

            # price information
            detail = i.find('div', class_='organic-gallery-offer-section__price') \
                      .find_all('span')
            for d in detail:
                di = d.get_text().strip()
                di = di.replace(",", "_")
                total += di + ","

            # supplier name and link
            supplier = i.find('div', class_='organic-gallery-offer__seller-section').a
            s_name = supplier['title'].replace(",", "_")
            s_href = supplier['href'].replace(",", "_")
            total += s_name + "," + s_href + ","

            # supplier country and years in business
            country = i.find('span', class_='seller-tag__country')
            if country is not None:
                country = country['title']
            else:
                country = "NoneType"
            country = country.replace(",", "_")
            year = i.find('span', class_='seller-tag__year')
            if year is not None:
                year = year.get_text().strip()
            else:
                year = "NoneType"
            year = year.replace(",", "_")
            total += (country + "," + year + ",")

            # end of row
            total += "\n"

        except Exception as ex:
            print("Exception occurred: %s" % ex)
            total += "\n"

        count = count + 1

    return total  # all rows gathered from this page



    
# Jump to the next page (note: not called by main() in this version, which visits page URLs directly)
def next_page(page_number):
    try:
        input = wait.until(EC.presence_of_element_located \
                           ((By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/input")))
        submit = wait.until(EC.element_to_be_clickable \
                            ((By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/a")))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        print("Now crawling page " + str(page_number) + "...")
        scroll()  # scroll the new page so it fully renders
        get_products(page_number)
    except TimeoutException:
        next_page(page_number)  # retry on timeout


def read_category_file(start_class3):
    cat_list = []  # list of category records
    fp = open('alibaba_categary.csv', "rt")
    # each CSV line has the fields class1,class2,class3,class3_url
    for line in fp:                 # a file object can be iterated line by line
        d = {}
        data = line.split(',')
        d['class1'] = data[0]
        d['class2'] = data[1]
        d['class3'] = data[2]
        d['url'] = data[3]
        if d['class3'] == 'View More':
            continue
        global begin
        if d['class3'] == start_class3:  # only start crawling once start_class3 is reached
            begin = 1
        elif begin == 0:
            continue
        cat_list.append(d)  # append each parsed record to the list
    fp.close()
    return cat_list

def get_category_info(class1, class2, class3, url):
    info = ""
    for i in range(start_page, end_page):  # e.g. (1, 21); use (21, 41), (41, 61), ... on later runs
        browse(url + "?page=" + str(i))
        info += get_products(i, class1, class2, class3)

    # write this category's rows to the file
    fw = open(file_name, "a", encoding="utf-8")
    fw.write(info)
    fw.close()
    
    
def main():
    start = time.perf_counter()

    cat_file = read_category_file(start_class3)  # read categories, starting from the third-level category start_class3

    for d in cat_file:
        # crawl each category's products; class1/class2/class3 are the level-1/2/3 category names
        get_category_info(d['class1'], d['class2'], d['class3'], d['url'])

    elapsed = (time.perf_counter() - start)
    print("Time used:", elapsed)


if __name__ == '__main__':
    main()