Python Crawler Case Study: Scraping the Alibaba International Site with Selenium
Training Steps
1. Use Selenium to simulate a visit to the website.
2. Scrape the product information on the current page.
3. Turn the page and scrape the next page's products, repeating until every page has been scraped (a sketch of this overall flow follows the list).
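The three steps map directly onto the functions developed below. As an overview, here is a minimal sketch of the whole flow, assuming the browse and get_products functions defined in the following sections and a known last page number:

def crawl(base_url, last_page):
    browse(base_url)       # step 1: open the site with Selenium
    get_products(1)        # step 2: scrape the first page of products
    for page in range(2, last_page + 1):
        # step 3: move to the next page (here via the ?page= URL parameter)
        browse(base_url + "?page=" + str(page))
        get_products(page)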
Training Process
Importing Packages and Initial Parameter Settings
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
from bs4 import BeautifulSoup

# Initial parameter settings
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)

# Timestamp and file name for saving the results
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
file_name = 'alibaba' + now_time + '.csv'
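Note that passing the chromedriver path positionally works in Selenium 3 but was removed in Selenium 4, where the path goes into a Service object instead. A minimal sketch of the equivalent Selenium 4 initialization, assuming the same chromedriver location:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: wrap the driver path in a Service object
service = Service("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
driver = webdriver.Chrome(service=service)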
Simulating a Visit to the Site by URL
# Simulate visiting the site at the given url
def browse(url):
    try:
        driver.get(url)
        scroll()
        # time.sleep(60)  # the site may show a captcha; leave 60 seconds to enter it manually
        print("Visit succeeded!")
        return "ok"
    except TimeoutException:
        return browse(url)

def scroll():  # simulate scrolling down the page, in ten increments
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight*" + str(i) + "/10)")
        # time.sleep(2)
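Because the product gallery lazy-loads as the page scrolls, the commented-out time.sleep(2) may be needed in practice. A hedged variant that pauses between increments and then waits for at least one product card (the pause length is an assumption; the selector matches the class used in get_products below):

def scroll_and_wait(pause=0.5):
    # Scroll in ten increments, pausing so lazy-loaded product cards can render
    for i in range(1, 11):
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight * arguments[0] / 10);", i)
        time.sleep(pause)
    # Wait until at least one product card is present in the DOM
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "div.organic-gallery-offer-outter")))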
Scraping Product Information
# Get the product information from the current page
def get_products(page_number):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    div_list = soup.find_all('div', class_='organic-gallery-offer-outter')
    print(len(div_list))
    total = ""
    count = 1
    for i in div_list:
        try:
            total += str(page_number) + ","  # record which page this row came from
            # product image
            img = i.find('img')
            total += img['src'] + ","
            # product title; replace commas so the CSV columns stay aligned
            title = i.find('h4').get_text().strip()
            title = title.replace(",", "_")
            total += title + ","
            # price information
            detail = i.find('div', class_='organic-gallery-offer-section__price') \
                      .find_all('span')
            for d in detail:
                di = d.get_text().strip()
                di = di.replace(",", "_")
                total += di + ","
            # supplier information
            supplier = i.find('div', class_='organic-gallery-offer__seller-section').a
            s_name = supplier['title'].replace(",", "_")
            s_href = supplier['href'].replace(",", "_")
            total += s_name + "," + s_href + ","
            # country and years in business
            country = i.find('span', class_='seller-tag__country')
            if country is not None:
                country = country['title']
            else:
                country = "NoneType"
            country = country.replace(",", "_")
            year = i.find('span', class_='seller-tag__year').get_text().strip()
            year = year.replace(",", "_")
            print(country + "," + year + ",")
            total += country + "," + year + ","
            # end of this product's row
            total += "\n"
        except Exception as ex:
            print("Exception occurred: %s" % ex)
        count = count + 1  # product counter, kept for debugging
    with open(file_name, "a", encoding="utf-8") as fw:
        fw.write(total)
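Building CSV rows by hand and replacing commas with underscores loses information; Python's csv module quotes such fields automatically. A minimal sketch of writing one product row this way (write_row and its parameters are illustrative names, not part of the original code):

import csv

def write_row(page_number, img_src, title, prices, s_name, s_href, country, year):
    # csv.writer quotes fields containing commas, so no replace(",", "_") is needed
    with open(file_name, "a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([page_number, img_src, title, *prices,
                         s_name, s_href, country, year])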
Jumping to the Next Page
# Jump to the next page through the pagination input box
def next_page(page_number):
    try:
        input_box = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/input")))
        submit = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/a")))
        input_box.clear()
        input_box.send_keys(page_number)
        submit.click()
        print("Now scraping page " + str(page_number) + " ...")
        scroll()  # scroll through the new page
        get_products(page_number)
    except TimeoutException:
        next_page(page_number)
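Both browse and next_page retry by calling themselves on TimeoutException, which can recurse without bound if the site keeps timing out. A bounded-retry variant of next_page is sketched below (the retry count of 3 is an assumption):

def next_page_bounded(page_number, retries=3):
    # Same steps as next_page, but give up after a fixed number of timeouts
    for attempt in range(1, retries + 1):
        try:
            input_box = wait.until(EC.presence_of_element_located(
                (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/input")))
            submit = wait.until(EC.element_to_be_clickable(
                (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/a")))
            input_box.clear()
            input_box.send_keys(page_number)
            submit.click()
            scroll()
            get_products(page_number)
            return True
        except TimeoutException:
            print("Timeout on attempt %d for page %d" % (attempt, page_number))
    return False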
The main Function and Invocation
def main():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    url = "http://www.alibaba.com/Animal-Products_pid100003006"
    browse(url)
    get_products(1)
    # Pages 2-5 are reached through the ?page= URL parameter instead of next_page(),
    # which avoids depending on the pagination widget
    for i in range(2, 6):
        # next_page(i)  # alternative: jump through the pagination input box
        browse(url + "?page=" + str(i))
        get_products(i)
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)

if __name__ == '__main__':
    main()
Full Code
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
from bs4 import BeautifulSoup

# Initial parameter settings
driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")
wait = WebDriverWait(driver, 10)

# Timestamp and file name for saving the results
now_time = time.strftime("%Y%m%d%H%M%S", time.localtime())
file_name = 'alibaba' + now_time + '.csv'

# Simulate visiting the site at the given url
def browse(url):
    try:
        driver.get(url)
        scroll()
        # time.sleep(60)  # the site may show a captcha; leave 60 seconds to enter it manually
        print("Visit succeeded!")
        return "ok"
    except TimeoutException:
        return browse(url)

def scroll():  # simulate scrolling down the page, in ten increments
    for i in range(1, 11):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight*" + str(i) + "/10)")
        # time.sleep(2)

# Get the product information from the current page
def get_products(page_number):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    div_list = soup.find_all('div', class_='organic-gallery-offer-outter')
    print(len(div_list))
    total = ""
    count = 1
    for i in div_list:
        try:
            total += str(page_number) + ","  # record which page this row came from
            # product image
            img = i.find('img')
            total += img['src'] + ","
            # product title; replace commas so the CSV columns stay aligned
            title = i.find('h4').get_text().strip()
            title = title.replace(",", "_")
            total += title + ","
            # price information
            detail = i.find('div', class_='organic-gallery-offer-section__price') \
                      .find_all('span')
            for d in detail:
                di = d.get_text().strip()
                di = di.replace(",", "_")
                total += di + ","
            # supplier information
            supplier = i.find('div', class_='organic-gallery-offer__seller-section').a
            s_name = supplier['title'].replace(",", "_")
            s_href = supplier['href'].replace(",", "_")
            total += s_name + "," + s_href + ","
            # country and years in business
            country = i.find('span', class_='seller-tag__country')
            if country is not None:
                country = country['title']
            else:
                country = "NoneType"
            country = country.replace(",", "_")
            year = i.find('span', class_='seller-tag__year').get_text().strip()
            year = year.replace(",", "_")
            print(country + "," + year + ",")
            total += country + "," + year + ","
            # end of this product's row
            total += "\n"
        except Exception as ex:
            print("Exception occurred: %s" % ex)
        count = count + 1  # product counter, kept for debugging
    with open(file_name, "a", encoding="utf-8") as fw:
        fw.write(total)

# Jump to the next page through the pagination input box
def next_page(page_number):
    try:
        input_box = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/input")))
        submit = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//*[@id=\"root\"]/div/div[4]/div/div[2]/a")))
        input_box.clear()
        input_box.send_keys(page_number)
        submit.click()
        print("Now scraping page " + str(page_number) + " ...")
        scroll()  # scroll through the new page
        get_products(page_number)
    except TimeoutException:
        next_page(page_number)

def main():
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    url = "http://www.alibaba.com/Animal-Products_pid100003006"
    browse(url)
    get_products(1)
    # Pages 2-5 are reached through the ?page= URL parameter instead of next_page(),
    # which avoids depending on the pagination widget
    for i in range(2, 6):
        # next_page(i)  # alternative: jump through the pagination input box
        browse(url + "?page=" + str(i))
        get_products(i)
    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)

if __name__ == '__main__':
    main()