“按照给出的网站商品爬取”的版本间的差异
来自CloudWiki
(→url无规律) |
(→用selenium爬取) |
||
第1行: | 第1行: | ||
==url无规律时== | ==url无规律时== | ||
===用selenium爬取=== | ===用selenium爬取=== | ||
− | + | class RedSprid(): | |
− | + | def __init__(self): | |
− | + | self.option = ChromeOptions() | |
− | + | self.option.add_experimental_option('excludeSwitches', ['enable-automation']) | |
− | # 实现无可视化界面 | + | # 实现无可视化界面 |
− | + | # self.opt = Options() | |
− | # | + | # self.opt.add_argument('--headless') |
− | + | # self.opt.add_argument('--disable-gpu') | |
+ | self.url = 'http://880098.cn/' | ||
+ | # self.driver = webdriver.Firefox(firefox_options=self.opt, options=self.option) | ||
+ | self.driver = webdriver.Firefox() | ||
+ | def get_data(self): | ||
+ | self.driver.get(self.url) | ||
+ | time.sleep(5) | ||
+ | html = self.driver.page_source | ||
+ | self.element = etree.HTML(html) | ||
+ | div_list = self.element.xpath('//div[@id="floor1"]/div[2]/div')[1:] | ||
+ | div_list2 = self.element.xpath('//div[@id="floor2"]/div[2]/div')[1:] | ||
+ | with open("red.txt", "w", encoding='utf-8') as f: | ||
+ | for div in div_list: | ||
+ | price = div.xpath('./div/div[2]/text()')[0] | ||
+ | name = div.xpath('./div/div[1]/a/text()')[0] | ||
+ | url = div.xpath('./a/@href')[0] | ||
+ | info = requests.get(url) | ||
+ | html = info.text | ||
+ | ele = etree.HTML(html) | ||
+ | u = url.split("/")[-1].split(".")[0] | ||
+ | num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0] | ||
+ | look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0] | ||
+ | comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0] | ||
+ | xp = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()' | ||
+ | data = ele.xpath(xp)[1] | ||
+ | wr = "id:" + str(u) + "," + str(name) + "," + str( | ||
+ | price) + "," + "累计销量" + num + "," + "浏览次数" + look + "," + "累计评价" + comm + "," + "库存:" + data | ||
+ | nn = str(wr) | ||
+ | f.write(nn) | ||
+ | f.write("\n") | ||
− | + | for div in div_list2: | |
− | + | price = div.xpath('./div/div[2]/text()')[0] | |
− | + | name = div.xpath('./div/div[1]/a/text()')[0] | |
− | + | url = div.xpath('./a/@href')[0] | |
− | + | info = requests.get(url) | |
− | + | html = info.text | |
− | + | ele = etree.HTML(html) | |
− | + | xp = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()' | |
+ | data = ele.xpath(xp)[1] | ||
+ | num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0] | ||
+ | look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0] | ||
+ | comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0] | ||
+ | u = url.split("/")[-1].split(".")[0] | ||
+ | wr = "id:" + str(u) + ","+ str(name) + ","+ str(price) + ","+ "累计销量" + num + ","+"浏览次数" + look + ","+"累计评价"+comm + ","+ "库存:" + data | ||
+ | nn = str(wr) | ||
− | + | f.write(nn) | |
− | + | f.write("\n") | |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | + | # salenum = self.element.xpath('/html/body/div[5]/div/div/div[2]/ul/li[1]/div/p[2]/span/text()') | |
− | |||
− | |||
− | + | red = RedSprid() | |
− | + | red.get_data() | |
− | red = RedSprid() | ||
− | red.get_data() |
2020年11月8日 (日) 13:45的版本
url无规律时
用selenium爬取
# Absolute location of the info panel on a product detail page.
_DETAIL_ROOT = '/html/body/div[5]/div[2]/div[2]/div[2]'
# The i-th statistic in the panel's <ul>; written out below as
# cumulative sales (1), view count (2) and cumulative reviews (3).
_STAT_XPATH = _DETAIL_ROOT + '/ul/li[{}]/div/span[2]/text()'
# Union of the text nodes of three candidate form rows; the second text
# node of the combined result is what gets written as the stock value.
_STOCK_XPATH = ' | '.join(
    _DETAIL_ROOT + '/dl/dd/div[2]/div[3]/form/div[1]/div[%d]/dd/span//text()' % row
    for row in (1, 3, 4)
)


class RedSprid():
    """Scrape the products of two home-page "floors" into ``red.txt``.

    The home page is rendered in a Selenium-driven Firefox instance; each
    product's detail page is then fetched with ``requests`` and parsed
    with ``lxml``.  One comma-separated line is written per product.

    The instance owns a browser process.  Call :meth:`close` when done,
    or use the instance as a context manager.
    """

    # Seconds to wait for a product detail page.  ``requests`` applies no
    # timeout by default, so a stalled server would hang the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Prepare browser options, the start URL and launch Firefox."""
        # NOTE: these Chrome options are built but not handed to the
        # Firefox driver created below; kept so ``self.option`` still exists.
        self.option = ChromeOptions()
        self.option.add_experimental_option('excludeSwitches', ['enable-automation'])
        # To run without a visible browser window:
        # self.opt = Options()
        # self.opt.add_argument('--headless')
        # self.opt.add_argument('--disable-gpu')
        self.url = 'http://880098.cn/'
        # self.driver = webdriver.Firefox(firefox_options=self.opt, options=self.option)
        self.driver = webdriver.Firefox()

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Shut the browser down when the ``with`` block exits."""
        self.close()

    def close(self):
        """Quit the browser; calling this more than once is harmless."""
        driver, self.driver = self.driver, None
        if driver is not None:
            driver.quit()

    def get_data(self):
        """Crawl both floors and write one line per product to ``red.txt``.

        The parsed home page is stored on ``self.element``.  The output
        file is truncated on every call.

        Raises:
            IndexError: if an expected node is missing from a listing
                card or from a product detail page.
            requests.exceptions.RequestException: if a detail page cannot
                be fetched within ``REQUEST_TIMEOUT`` seconds.
        """
        self.driver.get(self.url)
        # Fixed pause before reading the page source -- presumably to let
        # the page finish rendering; confirm before shortening.
        time.sleep(5)
        self.element = etree.HTML(self.driver.page_source)
        # The first child <div> of each floor is skipped; only the
        # remaining ones are treated as product cards.
        floors = [
            self.element.xpath('//div[@id="floor1"]/div[2]/div')[1:],
            self.element.xpath('//div[@id="floor2"]/div[2]/div')[1:],
        ]
        with open("red.txt", "w", encoding='utf-8') as f:
            for cards in floors:
                for card in cards:
                    f.write(self._product_line(card) + "\n")
        # salenum = self.element.xpath('/html/body/div[5]/div/div/div[2]/ul/li[1]/div/p[2]/span/text()')

    def _product_line(self, card):
        """Build the output line for one listing card.

        Reads price, name and detail-page link from the card, downloads
        the detail page and extracts the three statistics and the stock
        value from it.
        """
        price = card.xpath('./div/div[2]/text()')[0]
        name = card.xpath('./div/div[1]/a/text()')[0]
        url = card.xpath('./a/@href')[0]
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        detail = etree.HTML(response.text)
        # The id is the last path segment of the link without its extension.
        product_id = url.split("/")[-1].split(".")[0]
        num = detail.xpath(_STAT_XPATH.format(1))[0]
        look = detail.xpath(_STAT_XPATH.format(2))[0]
        comm = detail.xpath(_STAT_XPATH.format(3))[0]
        data = detail.xpath(_STOCK_XPATH)[1]
        return (
            f"id:{product_id},{name},{price},"
            f"累计销量{num},浏览次数{look},累计评价{comm},库存:{data}"
        )


red = RedSprid()
try:
    red.get_data()
finally:
    # Always release the Firefox process, even if the crawl fails midway.
    red.close()