“按照给出的网站商品爬取”的版本间的差异

来自CloudWiki
跳转至: 导航搜索
url无规律
用selenium爬取
第1行: 第1行:
 
==url无规律时==
 
==url无规律时==
 
===用selenium爬取===
 
===用selenium爬取===
import requests
+
    class RedSprid():
import time
+
        def __init__(self):
from lxml import etree
+
            self.option = ChromeOptions()
from selenium import webdriver
+
            self.option.add_experimental_option('excludeSwitches', ['enable-automation'])
# 实现无可视化界面
+
            # 实现无可视化界面
from selenium.webdriver.firefox.options import Options
+
            # self.opt = Options()
# 实现规避检测
+
            # self.opt.add_argument('--headless')
from selenium.webdriver import ChromeOptions
+
            # self.opt.add_argument('--disable-gpu')
  
 +
            self.url = 'http://880098.cn/'
 +
            # self.driver = webdriver.Firefox(firefox_options=self.opt, options=self.option)
 +
            self.driver = webdriver.Firefox()
 +
        def get_data(self):
 +
            self.driver.get(self.url)
 +
            time.sleep(5)
 +
            html = self.driver.page_source
 +
            self.element = etree.HTML(html)
 +
            div_list = self.element.xpath('//div[@id="floor1"]/div[2]/div')[1:]
 +
            div_list2 = self.element.xpath('//div[@id="floor2"]/div[2]/div')[1:]
 +
            with open("red.txt", "w", encoding='utf-8') as f:
 +
                for div in div_list:
 +
                    price = div.xpath('./div/div[2]/text()')[0]
 +
                    name = div.xpath('./div/div[1]/a/text()')[0]
 +
                    url = div.xpath('./a/@href')[0]
 +
                    info = requests.get(url)
 +
                    html = info.text
 +
                    ele = etree.HTML(html)
 +
                    u = url.split("/")[-1].split(".")[0]
 +
                    num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0]
 +
                    look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0]
 +
                    comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0]
 +
                    xp = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()'
 +
                    data = ele.xpath(xp)[1]
 +
                    wr = "id:" + str(u) + "," + str(name) + "," + str(
 +
                        price) + "," + "累计销量" + num + "," + "浏览次数" + look + "," + "累计评价" + comm + "," + "库存:" + data
 +
                    nn = str(wr)
 +
                    f.write(nn)
 +
                    f.write("\n")
  
class RedSprid():
+
                for div in div_list2:
    def __init__(self):
+
                    price = div.xpath('./div/div[2]/text()')[0]
        self.option = ChromeOptions()
+
                    name = div.xpath('./div/div[1]/a/text()')[0]
        self.option.add_experimental_option('excludeSwitches', ['enable-automation'])
+
                    url = div.xpath('./a/@href')[0]
        # 实现无可视化界面
+
                    info = requests.get(url)
        # self.opt = Options()
+
                    html = info.text
        # self.opt.add_argument('--headless')
+
                    ele = etree.HTML(html)
        # self.opt.add_argument('--disable-gpu')
+
                    xp = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()'
 +
                    data = ele.xpath(xp)[1]
 +
                    num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0]
 +
                    look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0]
 +
                    comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0]
 +
                    u = url.split("/")[-1].split(".")[0]
 +
                    wr = "id:" + str(u) + ","+ str(name) + ","+ str(price) + ","+ "累计销量" + num + ","+"浏览次数" + look + ","+"累计评价"+comm + ","+ "库存:" + data
 +
                    nn = str(wr)
  
        self.url = 'http://880098.cn/'
+
                     f.write(nn)
        # self.driver = webdriver.Firefox(firefox_options=self.opt, options=self.option)
+
                    f.write("\n")
        self.driver = webdriver.Firefox()
 
    def get_data(self):
 
        self.driver.get(self.url)
 
        time.sleep(5)
 
        html = self.driver.page_source
 
        self.element = etree.HTML(html)
 
        div_list = self.element.xpath('//div[@id="floor1"]/div[2]/div')[1:]
 
        div_list2 = self.element.xpath('//div[@id="floor2"]/div[2]/div')[1:]
 
        with open("red.txt", "w", encoding='utf-8') as f:
 
            for div in div_list:
 
                price = div.xpath('./div/div[2]/text()')[0]
 
                name = div.xpath('./div/div[1]/a/text()')[0]
 
                url = div.xpath('./a/@href')[0]
 
                info = requests.get(url)
 
                html = info.text
 
                ele = etree.HTML(html)
 
                u = url.split("/")[-1].split(".")[0]
 
                num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0]
 
                look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0]
 
                comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0]
 
                xp = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()'
 
                data = ele.xpath(xp)[1]
 
                wr = "id:" + str(u) + "," + str(name) + "," + str(
 
                     price) + "," + "累计销量" + num + "," + "浏览次数" + look + "," + "累计评价" + comm + "," + "库存:" + data
 
                nn = str(wr)
 
                f.write(nn)
 
                f.write("\n")
 
  
            for div in div_list2:
 
                price = div.xpath('./div/div[2]/text()')[0]
 
                name = div.xpath('./div/div[1]/a/text()')[0]
 
                url = div.xpath('./a/@href')[0]
 
                info = requests.get(url)
 
                html = info.text
 
                ele = etree.HTML(html)
 
                xp = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()'
 
                data = ele.xpath(xp)[1]
 
                num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0]
 
                look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0]
 
                comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0]
 
                u = url.split("/")[-1].split(".")[0]
 
                wr = "id:" + str(u) + ","+ str(name) + ","+ str(price) + ","+ "累计销量" + num + ","+"浏览次数" + look + ","+"累计评价"+comm + ","+ "库存:" + data
 
                nn = str(wr)
 
  
                f.write(nn)
+
            # salenum = self.element.xpath('/html/body/div[5]/div/div/div[2]/ul/li[1]/div/p[2]/span/text()')
                f.write("\n")
 
  
  
        # salenum = self.element.xpath('/html/body/div[5]/div/div/div[2]/ul/li[1]/div/p[2]/span/text()')
 
  
  
 
+
    red = RedSprid()
 
+
    red.get_data()
red = RedSprid()
 
red.get_data()
 

2020年11月8日 (日) 13:45的版本

url无规律时

用selenium爬取

   class RedSprid():
       """Scrape the products listed on the shop home page into ``red.txt``.

       The home page is rendered with a Selenium-driven Firefox; every product
       card found under the ``floor1`` and ``floor2`` containers is followed
       with ``requests`` to its detail page, and one comma-separated line per
       product is written to ``red.txt`` (UTF-8, overwritten on each run).

       Third-party modules are imported inside the methods that use them so
       the class is self-contained: the listing carries no module-level
       import lines.
       """

       # Seconds to wait for a product detail page before giving up; without
       # a timeout a stalled connection would block the crawl forever.
       REQUEST_TIMEOUT = 10

       # Absolute XPath of the n-th statistics entry on a detail page
       # (1, 2 and 3 are written out as 累计销量, 浏览次数 and 累计评价).
       _STAT_XPATH = '/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[%d]/div/span[2]/text()'

       # Union of the three candidate spans in the order form; the SECOND
       # text node of the result is what gets written out as 库存 (stock).
       _STOCK_XPATH = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()'

       def __init__(self):
           """Prepare the start URL and launch a Firefox WebDriver session."""
           from selenium import webdriver
           from selenium.webdriver import ChromeOptions

           # NOTE(review): these Chrome options are built but never handed to
           # the Firefox driver below, so they have no effect; the attribute is
           # kept only so existing users of ``self.option`` keep working.
           self.option = ChromeOptions()
           self.option.add_experimental_option('excludeSwitches', ['enable-automation'])
           self.url = 'http://880098.cn/'
           # Headless mode is not enabled: the browser window is visible.
           self.driver = webdriver.Firefox()

       def close(self):
           """Shut down the browser session started in ``__init__``."""
           self.driver.quit()

       @staticmethod
       def _pick(nodes, index, default=""):
           """Return ``nodes[index]``, or *default* when too few nodes matched."""
           return nodes[index] if len(nodes) > index else default

       @staticmethod
       def _item_id(link):
           """Return the last path segment of *link* up to its first dot."""
           return link.split("/")[-1].split(".")[0]

       @staticmethod
       def _format_line(item_id, name, price, num, look, comm, data):
           """Build the text line written to ``red.txt`` for one product."""
           fields = (
               "id:" + str(item_id),
               str(name),
               str(price),
               "累计销量" + str(num),
               "浏览次数" + str(look),
               "累计评价" + str(comm),
               "库存:" + str(data),
           )
           return ",".join(fields)

       def _fetch_detail(self, link):
           """Download the detail page behind *link* and return its parsed tree."""
           import requests
           from lxml import etree
           from urllib.parse import urljoin

           # urljoin leaves absolute links untouched and resolves relative
           # ones against the shop's start URL.
           response = requests.get(urljoin(self.url, link), timeout=self.REQUEST_TIMEOUT)
           return etree.HTML(response.text)

       def _product_line(self, div):
           """Return the output line for one product card, or None if it has no link."""
           link = self._pick(div.xpath('./a/@href'), 0, None)
           if link is None:
               # Nothing to follow: the card cannot be resolved to a product.
               return None
           price = self._pick(div.xpath('./div/div[2]/text()'), 0)
           name = self._pick(div.xpath('./div/div[1]/a/text()'), 0)

           ele = self._fetch_detail(link)
           num = self._pick(ele.xpath(self._STAT_XPATH % 1), 0)
           look = self._pick(ele.xpath(self._STAT_XPATH % 2), 0)
           comm = self._pick(ele.xpath(self._STAT_XPATH % 3), 0)
           data = self._pick(ele.xpath(self._STOCK_XPATH), 1)
           return self._format_line(self._item_id(link), name, price, num, look, comm, data)

       def get_data(self):
           """Crawl both product floors and write one line per product to ``red.txt``.

           A field whose XPath matches nothing is written as an empty string
           instead of aborting the whole crawl; a card without a link is
           skipped. Network errors raised by ``requests`` propagate to the
           caller.
           """
           import time
           from lxml import etree

           self.driver.get(self.url)
           # Fixed pause before reading the page source (presumably to let
           # the page finish rendering — confirm whether an explicit wait fits).
           time.sleep(5)
           self.element = etree.HTML(self.driver.page_source)

           with open("red.txt", "w", encoding='utf-8') as f:
               for floor_id in ("floor1", "floor2"):
                   # The first child div of each floor is skipped; only the
                   # remaining ones are treated as product cards.
                   cards = self.element.xpath('//div[@id="%s"]/div[2]/div' % floor_id)[1:]
                   for div in cards:
                       line = self._product_line(div)
                       if line is not None:
                           f.write(line + "\n")


           # salenum = self.element.xpath('/html/body/div[5]/div/div/div[2]/ul/li[1]/div/p[2]/span/text()')



   # Script entry: run one crawl, then always release the browser.
   red = RedSprid()
   try:
       red.get_data()
   finally:
       # Quit the WebDriver even when the crawl raises, so no Firefox
       # process is left running after the script ends.
       red.driver.quit()