“按照给出的网站商品爬取”的版本间的差异
来自CloudWiki
(→用selenium爬取) |
(→url无规律时) |
||
(未显示2个用户的2个中间版本) | |||
第1行: | 第1行: | ||
==url无规律时== | ==url无规律时== | ||
− | === | + | 用selenium爬取 |
+ | |||
+ | ===__init__函数=== | ||
+ | 作用: | ||
+ | |||
+ | 代码: | ||
+ | |||
+ | ===get_data函数=== | ||
+ | 作用: | ||
+ | |||
+ | 代码: | ||
+ | ===XXX函数=== | ||
+ | 作用: | ||
+ | |||
+ | 代码: | ||
+ | |||
+ | ===完整代码=== | ||
+ | |||
+ | |||
class RedSprid(): | class RedSprid(): | ||
def __init__(self): | def __init__(self): | ||
第67行: | 第85行: | ||
red = RedSprid() | red = RedSprid() | ||
red.get_data() | red.get_data() | ||
+ | |||
+ | ==url有规律时== | ||
+ | |||
+ | ===用简单的requests包和xpath定位爬取=== | ||
+ | import requests | ||
+ | from lxml import etree | ||
+ | for i in range(1,30): | ||
+ | url = "http://880098.cn/index.php?s=/index/goods/index/id/%d.html" | ||
+ | page = i | ||
+ | data = requests.get(url%page) | ||
+ | html = data.text | ||
+ | if "资源不存在或已被删除" in html: | ||
+ | break | ||
+ | else: | ||
+ | ele = etree.HTML(html) | ||
+ | |||
+ | name = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/h1/text()')[0] | ||
+ | oldprice = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/div/div[1]/dd/b/text()')[0] | ||
+ | nowprice = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/div/div[2]/dd/b/text()')[0] | ||
+ | num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0] | ||
+ | look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0] | ||
+ | comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0] | ||
+ | xp = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()' | ||
+ | data = ele.xpath(xp)[1] | ||
+ | print("id"+str(page)+","+name+",原价"+oldprice+",现价"+nowprice+",累计销量"+num+",浏览次数"+look+",累计评价"+comm+",库存"+data) |
2020年11月18日 (三) 02:39的最新版本
url无规律时
用selenium爬取
__init__函数
作用:初始化爬虫对象——设置目标网址 http://880098.cn/ ，并启动 Firefox 浏览器驱动。
代码:
get_data函数
作用:用浏览器打开首页并等待 5 秒，解析 floor1、floor2 两个楼层的商品列表，逐个请求商品详情页，把 id、名称、价格、累计销量、浏览次数、累计评价和库存写入 red.txt。
代码:
XXX函数
作用:
代码:
完整代码
class RedSprid():
    """Scrape the product listing of http://880098.cn/ into ``red.txt``.

    The home page is loaded through a Selenium-driven Firefox; every product
    found on it is then fetched with ``requests`` and parsed with lxml.

    NOTE(review): no import lines are visible in this excerpt. The class
    relies on ``ChromeOptions``, ``webdriver``, ``time``, ``requests`` and
    ``etree`` being imported at module level.
    """

    # Seconds to wait for a product detail page before giving up.
    _REQUEST_TIMEOUT = 10

    # Absolute XPath of the n-th statistics entry on a product detail page
    # (1 = cumulative sales, 2 = view count, 3 = cumulative reviews).
    _STAT_XPATH = '/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[%d]/div/span[2]/text()'

    # Union of the three places the specification/stock text may appear.
    _STOCK_XPATH = (
        '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text()'
        ' | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text()'
        ' | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()'
    )

    def __init__(self):
        """Set the target URL and start the Firefox driver."""
        # Kept from the original code: this Chrome-style option object is
        # built but is not passed to the Firefox driver created below.
        self.option = ChromeOptions()
        self.option.add_experimental_option('excludeSwitches', ['enable-automation'])
        # Headless mode (Options with --headless / --disable-gpu) was left
        # disabled in the original, so a visible browser window is opened.
        self.url = 'http://880098.cn/'
        self.driver = webdriver.Firefox()

    @staticmethod
    def _goods_id(url):
        """Return the product id: last path segment of *url* up to its first dot."""
        return url.split("/")[-1].split(".")[0]

    @staticmethod
    def _format_record(goods_id, name, price, num, look, comm, stock):
        """Build one comma-separated output line (without the trailing newline)."""
        return ",".join([
            "id:" + str(goods_id),
            str(name),
            str(price),
            "累计销量" + num,
            "浏览次数" + look,
            "累计评价" + comm,
            "库存:" + stock,
        ])

    def _scrape_product(self, div):
        """Fetch the detail page for one listing ``div`` and return its output line.

        Raises:
            IndexError: if an expected node is missing from the listing or
                the detail page (same behaviour as the original code).
        """
        price = div.xpath('./div/div[2]/text()')[0]
        name = div.xpath('./div/div[1]/a/text()')[0]
        url = div.xpath('./a/@href')[0]

        # A timeout keeps one unresponsive page from hanging the whole run.
        info = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        ele = etree.HTML(info.text)

        num = ele.xpath(self._STAT_XPATH % 1)[0]
        look = ele.xpath(self._STAT_XPATH % 2)[0]
        comm = ele.xpath(self._STAT_XPATH % 3)[0]
        # The original reads the second text node of the union as the stock.
        stock = ele.xpath(self._STOCK_XPATH)[1]

        return self._format_record(self._goods_id(url), name, price, num, look, comm, stock)

    def get_data(self):
        """Scrape every product of floors 1 and 2 and write them to ``red.txt``.

        The browser is only needed to obtain the home page source, so the
        driver is quit as soon as that is done (also on failure). As a
        consequence this method can be called once per instance.
        """
        try:
            self.driver.get(self.url)
            # Fixed wait for the page to finish rendering.
            time.sleep(5)
            html = self.driver.page_source
        finally:
            self.driver.quit()

        self.element = etree.HTML(html)
        # The first child div of each floor is skipped, as in the original.
        div_list = self.element.xpath('//div[@id="floor1"]/div[2]/div')[1:]
        div_list2 = self.element.xpath('//div[@id="floor2"]/div[2]/div')[1:]

        with open("red.txt", "w", encoding='utf-8') as f:
            # Both floors share one layout, so one loop replaces the two
            # duplicated loops of the original.
            for div in div_list + div_list2:
                f.write(self._scrape_product(div))
                f.write("\n")
# Entry point: build the spider and run a single scrape.
red = RedSprid()
red.get_data()
url有规律时
用简单的requests包和xpath定位爬取
import requests
from lxml import etree

# Product detail pages are addressed by a consecutive numeric id.
url = "http://880098.cn/index.php?s=/index/goods/index/id/%d.html"

# Absolute XPath prefix shared by every field read from the detail page.
base = '/html/body/div[5]/div[2]/div[2]'

# Union of the three places the specification/stock text may appear.
xp = (
    '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text()'
    ' | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text()'
    ' | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()'
)

# Walk ids 1..29 and print one comma-separated line per product.
for page in range(1, 30):
    # A timeout keeps one unresponsive page from hanging the whole run.
    response = requests.get(url % page, timeout=10)
    html = response.text

    # The site answers with this message for a missing id; the crawl stops
    # at the first one, as in the original script.
    if "资源不存在或已被删除" in html:
        break

    ele = etree.HTML(html)

    name = ele.xpath(base + '/div[1]/h1/text()')[0]
    oldprice = ele.xpath(base + '/div[2]/div/div[1]/dd/b/text()')[0]
    nowprice = ele.xpath(base + '/div[2]/div/div[2]/dd/b/text()')[0]
    num = ele.xpath(base + '/div[2]/ul/li[1]/div/span[2]/text()')[0]
    look = ele.xpath(base + '/div[2]/ul/li[2]/div/span[2]/text()')[0]
    comm = ele.xpath(base + '/div[2]/ul/li[3]/div/span[2]/text()')[0]
    # The original reads the second text node of the union as the stock.
    stock = ele.xpath(xp)[1]

    print(",".join([
        "id" + str(page),
        name,
        "原价" + oldprice,
        "现价" + nowprice,
        "累计销量" + num,
        "浏览次数" + look,
        "累计评价" + comm,
        "库存" + stock,
    ]))