查看“按照给出的网站商品爬取”的源代码
←
按照给出的网站商品爬取
跳转至:
导航
,
搜索
因为以下原因,您没有权限编辑本页:
您所请求的操作仅限于该用户组的用户使用:
用户
您可以查看与复制此页面的源代码。
==url无规律时== ===用selenium爬取=== class RedSprid(): def __init__(self): self.option = ChromeOptions() self.option.add_experimental_option('excludeSwitches', ['enable-automation']) # 实现无可视化界面 # self.opt = Options() # self.opt.add_argument('--headless') # self.opt.add_argument('--disable-gpu') self.url = 'http://880098.cn/' # self.driver = webdriver.Firefox(firefox_options=self.opt, options=self.option) self.driver = webdriver.Firefox() def get_data(self): self.driver.get(self.url) time.sleep(5) html = self.driver.page_source self.element = etree.HTML(html) div_list = self.element.xpath('//div[@id="floor1"]/div[2]/div')[1:] div_list2 = self.element.xpath('//div[@id="floor2"]/div[2]/div')[1:] with open("red.txt", "w", encoding='utf-8') as f: for div in div_list: price = div.xpath('./div/div[2]/text()')[0] name = div.xpath('./div/div[1]/a/text()')[0] url = div.xpath('./a/@href')[0] info = requests.get(url) html = info.text ele = etree.HTML(html) u = url.split("/")[-1].split(".")[0] num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0] look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0] comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0] xp = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()' data = ele.xpath(xp)[1] wr = "id:" + str(u) + "," + str(name) + "," + str( price) + "," + "累计销量" + num + "," + "浏览次数" + look + "," + "累计评价" + comm + "," + "库存:" + data nn = str(wr) f.write(nn) f.write("\n") for div in div_list2: price = div.xpath('./div/div[2]/text()')[0] name = div.xpath('./div/div[1]/a/text()')[0] url = div.xpath('./a/@href')[0] info = requests.get(url) html = info.text ele = etree.HTML(html) xp = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()' data = ele.xpath(xp)[1] num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0] look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0] comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0] u = url.split("/")[-1].split(".")[0] wr = "id:" + str(u) + ","+ str(name) + ","+ str(price) + ","+ "累计销量" + num + ","+"浏览次数" + look + ","+"累计评价"+comm + ","+ "库存:" + data nn = str(wr) f.write(nn) f.write("\n") # salenum = self.element.xpath('/html/body/div[5]/div/div/div[2]/ul/li[1]/div/p[2]/span/text()') red = RedSprid() red.get_data() ==url有规律时== ===用简单的requests包和xpath定位爬取=== import requests from lxml import etree for i in range(1,30): url = "http://880098.cn/index.php?s=/index/goods/index/id/%d.html" page = i data = requests.get(url%page) html = data.text if "资源不存在或已被删除" in html: break else: ele = etree.HTML(html) name = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/h1/text()')[0] oldprice = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/div/div[1]/dd/b/text()')[0] nowprice = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/div/div[2]/dd/b/text()')[0] num = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()')[0] look = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()')[0] comm = ele.xpath('/html/body/div[5]/div[2]/div[2]/div[2]/ul/li[3]/div/span[2]/text()')[0] xp = '/html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[1]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[3]/dd/span//text() | /html/body/div[5]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div[4]/dd/span//text()' data = ele.xpath(xp)[1] print("id"+str(page)+","+name+",原价"+oldprice+",现价"+nowprice+",累计销量"+num+",浏览次数"+look+",累计评价"+comm+",库存"+data)
返回至
按照给出的网站商品爬取
。
导航菜单
个人工具
登录
命名空间
页面
讨论
变种
视图
阅读
查看源代码
查看历史
更多
搜索
导航
首页
最近更改
随机页面
帮助
工具
链入页面
相关更改
特殊页面
页面信息