数据爬取

来自CloudWiki
跳转至: 导航、搜索
 import requests
  from lxml import etree
  # Crawl the goods detail pages with ids 1..29 and print one summary line per
  # product (name, original price, current price, sales, views, reviews, stock).
  # The crawl stops at the first id whose page contains the site's
  # "resource does not exist or was deleted" message.
  url = "http://880098.cn/index.php?s=/index/goods/index/id/%d.html"

  # Shared XPath prefixes, built once instead of being repeated in every query.
  detail = '/html/body/div[5]/div[2]/div[2]'
  info = detail + '/div[2]'
  form = info + '/dl/dd/div[2]/div[3]/form/div[1]'
  # Union of the text nodes under three sibling rows of the purchase form.
  stock_xp = ' | '.join([
      form + '/div[1]/dd/span//text()',
      form + '/div[3]/dd/span//text()',
      form + '/div[4]/dd/span//text()',
  ])

  for page in range(1, 30):
      # A timeout keeps a single stalled connection from hanging the crawl
      # forever; requests waits indefinitely when none is given.
      response = requests.get(url % page, timeout=10)
      html = response.text
      if "资源不存在或已被删除" in html:
          break

      ele = etree.HTML(html)
      try:
          name = ele.xpath(detail + '/div[1]/h1/text()')[0]
          oldprice = ele.xpath(info + '/div/div[1]/dd/b/text()')[0]
          nowprice = ele.xpath(info + '/div/div[2]/dd/b/text()')[0]
          num = ele.xpath(info + '/ul/li[1]/div/span[2]/text()')[0]
          look = ele.xpath(info + '/ul/li[2]/div/span[2]/text()')[0]
          comm = ele.xpath(info + '/ul/li[3]/div/span[2]/text()')[0]
          # The second text node of the union is used as the stock value.
          # NOTE(review): presumably the first node is a label - confirm
          # against the live page markup.
          stock = ele.xpath(stock_xp)[1]
      except IndexError:
          # A page whose layout lacks one of the expected nodes should not
          # abort the remaining ids; report it and move on.
          print("id" + str(page) + ",页面结构不符,已跳过")
          continue

      print(f"id{page},{name},原价{oldprice},现价{nowprice},累计销量{num},浏览次数{look},累计评价{comm},库存{stock}")