“按照给出的网站商品爬取”的版本间的差异

来自CloudWiki
跳转至: 导航搜索
(创建页面,内容为“==url无规律== ===用selenium爬取=== <nowiki>import requests import time from lxml import etree from selenium import webdriver # 实现无可视化界面 from se…”)
 
url无规律
第1行: 第1行:
==url无规律==
+
==url无规律时==
 
===用selenium爬取===
 
===用selenium爬取===
<nowiki>import requests
+
import requests
 
import time
 
import time
 
from lxml import etree
 
from lxml import etree
第76行: 第76行:
  
 
red = RedSprid()
 
red = RedSprid()
red.get_data()</nowiki>
+
red.get_data()

2020年11月8日 (日) 13:43的版本

url无规律时

用selenium爬取

import requests
import time
from lxml import etree
from selenium import webdriver

# 实现无可视化界面 (headless mode)

from selenium.webdriver.firefox.options import Options

# 实现规避检测 (evade automation detection)

from selenium.webdriver import ChromeOptions


class RedSprid():
    """Scrape product listings from an online shop front page.

    Selenium (Firefox) renders the JavaScript-driven front page; each
    product's detail page is then fetched with ``requests`` and parsed
    with ``lxml``.  One summary line per product is written to
    ``red.txt`` (id, name, price, sales, views, reviews, stock).
    """

    # Detail-page XPath whose second match ([1]) is the stock count.
    # Hoisted to a class constant: it was duplicated verbatim in both
    # per-floor loops of the original.
    STOCK_XP = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()'

    def __init__(self):
        # Chrome option to hide the automation banner.  NOTE(review): the
        # driver below is Firefox and never receives this option -- kept
        # as in the original; confirm which browser is intended.
        self.option = ChromeOptions()
        self.option.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.url = 'http://880098.cn/'
        self.driver = webdriver.Firefox()

    def _scrape_item(self, div, f):
        """Fetch one product's detail page and write its summary line to *f*."""
        price = div.xpath('./div/div[2]/text()')[0]
        name = div.xpath('./div/div[1]/a/text()')[0]
        url = div.xpath('./a/@href')[0]
        # Detail pages are static HTML; plain requests is enough here.
        info = requests.get(url)
        ele = etree.HTML(info.text)
        # Product id = detail-page filename without its extension.
        u = url.split("/")[-1].split(".")[0]
        num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0]
        look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0]
        comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0]
        data = ele.xpath(self.STOCK_XP)[1]
        wr = ("id:" + str(u) + "," + str(name) + "," + str(price) + ","
              + "累计销量" + num + "," + "浏览次数" + look + ","
              + "累计评价" + comm + "," + "库存:" + data)
        f.write(wr)
        f.write("\n")

    def get_data(self):
        """Render the front page and scrape both product floors into red.txt."""
        try:
            self.driver.get(self.url)
            time.sleep(5)  # crude wait for the JS-rendered page to finish loading
            html = self.driver.page_source
            self.element = etree.HTML(html)
            # [1:] skips the first (template/header) div of each floor.
            div_list = self.element.xpath('//div[@id="floor1"]/div[2]/div')[1:]
            div_list2 = self.element.xpath('//div[@id="floor2"]/div[2]/div')[1:]
            with open("red.txt", "w", encoding='utf-8') as f:
                # One shared helper replaces the two near-identical loops
                # of the original (same XPaths, same output format).
                for div in div_list + div_list2:
                    self._scrape_item(div, f)
        finally:
            # Fix: the original never closed the browser (leaked a Firefox
            # process per run).
            self.driver.quit()



# Fix: the original collapsed two statements onto one line (a syntax
# error as written).  The __main__ guard keeps importing this module
# side-effect free (no browser launch on import).
if __name__ == "__main__":
    red = RedSprid()
    red.get_data()