2020红亚杯:商城系统爬虫

来自CloudWiki
跳转至: 导航搜索

题目要求

爬虫技术原理

爬虫代码

代码:

   import requests
   from lxml import etree
   data = []
   import pandas as pd

   # URL template for a goods detail page; %d is replaced by the numeric goods id.
   urls = 'http://47.92.4.116/index.php?s=/index/goods/index/id/%d.html'

   # Absolute XPath prefix shared by every field scraped from the detail page.
   base = '/html/body/div[4]/div[2]/div[2]'


   def first_text(ele, path):
       """Return the first text node matched by *path* in *ele*, or "" if none."""
       found = ele.xpath(path)
       return str(found[0]) if found else ""


   # Crawl goods ids 1..1299 and write one comma-separated line per existing
   # product to data.txt: id,title,price,look,sale,kucun
   with open("data.txt", "w", encoding='utf-8') as f:
       for page in range(1, 1300):
           url = urls % page
           # A timeout keeps one stalled connection from hanging the whole crawl.
           response = requests.get(url, timeout=10)
           html = response.text
           if "资源不存在或已被删除" in html:
               print(str(page)+"页无数据")
               # Nothing was scraped for this id, so no row is written. Writing
               # here would repeat the previous product's fields under a new id
               # (or raise NameError if the very first page is missing).
               continue

           print("开始爬"+str(page)+"页")
           ele = etree.HTML(html)

           title = first_text(ele, base + '/div[1]/h1/text()').strip()
           # The first character of the price text is dropped (presumably a
           # currency symbol -- confirm against the live page).
           price = first_text(ele, base + '/div[2]/div/div/dd/b/text()')[1:]
           look = first_text(ele, base + '/div[2]/ul/li[2]/div/span[2]/text()')
           sale = first_text(ele, base + '/div[2]/ul/li[1]/div/span[2]/text()')
           kucun = first_text(
               ele,
               base + '/div[2]/dl/dd/div[2]/div[3]/form/div[1]/div/dd/span/span/text()',
           )

           # NOTE(review): fields are joined with bare commas, so a title that
           # itself contains a comma will shift columns for downstream readers.
           f.write(",".join([str(page), title, price, look, sale, kucun]) + "\n")