2020红亚杯:商城系统爬虫
来自CloudWiki
题目要求
爬虫技术原理
爬虫代码
代码:
"""Crawl goods detail pages of the shop site and save key fields to data.txt.

For every goods id in the configured range the script downloads the detail
page, skips ids the site reports as missing, and writes one CSV row per page:
page id, title, price, view count, sales count, stock.
"""
import csv

import pandas as pd
import requests
from lxml import etree

# Not used by the crawl below; kept because code outside this section may
# still refer to it.
data = []

# URL template for a goods detail page; %d is replaced by the goods id.
urls = 'http://47.92.4.116/index.php?s=/index/goods/index/id/%d.html'

FIRST_PAGE = 1
LAST_PAGE = 1299  # inclusive; same ids as the original range(1, 1300)

# Without a timeout a single stalled connection blocks the crawl forever.
REQUEST_TIMEOUT = 10  # seconds

# Text the site returns in the page body when a goods id does not exist.
NOT_FOUND_MARKER = "资源不存在或已被删除"

# Absolute XPaths of the text nodes holding each field on the detail page.
TITLE_XPATH = '/html/body/div[4]/div[2]/div[2]/div[1]/h1/text()'
PRICE_XPATH = '/html/body/div[4]/div[2]/div[2]/div[2]/div/div/dd/b/text()'
LOOK_XPATH = '/html/body/div[4]/div[2]/div[2]/div[2]/ul/li[2]/div/span[2]/text()'
SALE_XPATH = '/html/body/div[4]/div[2]/div[2]/div[2]/ul/li[1]/div/span[2]/text()'
KUCUN_XPATH = (
    '/html/body/div[4]/div[2]/div[2]/div[2]/dl/dd/div[2]/div[3]'
    '/form/div[1]/div/dd/span/span/text()'
)


def _first_text(tree, xpath):
    """Return the first text node matched by *xpath*, or "" when none match."""
    matches = tree.xpath(xpath)
    return matches[0] if matches else ""


with open("data.txt", "w", encoding='utf-8') as f:
    # csv.writer quotes fields containing commas or quotes, so such a title
    # can no longer shift the columns of its row. "\n" keeps the original
    # line ending.
    writer = csv.writer(f, lineterminator="\n")
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        url = urls % page
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Report the failure and move on instead of aborting the crawl.
            print(f"{page}页请求失败: {exc}")
            continue

        html = response.text
        if NOT_FOUND_MARKER in html:
            print(f"{page}页无数据")
            continue

        print(f"开始爬{page}页")
        tree = etree.HTML(html)

        title = str(_first_text(tree, TITLE_XPATH)).strip()
        # Drop the first character of the price text (presumably the
        # currency symbol -- confirm against the live page).
        price = _first_text(tree, PRICE_XPATH)[1:]
        look = _first_text(tree, LOOK_XPATH)
        sale = _first_text(tree, SALE_XPATH)
        kucun = _first_text(tree, KUCUN_XPATH)

        writer.writerow([page, title, price, look, sale, kucun])