Python爬虫案例:爬取二手房信息
来自CloudWiki
代码功能
爬取某网站二手房信息
代码
# coding:utf-8
# Scrape second-hand housing listings from 58.com and append them to a text
# file, one listing per line in the form: price,area,rooms,subway,location
#
# Text is handled as unicode end to end and encoded once when written, so the
# script runs on both Python 2.7 and Python 3 without touching the
# interpreter's default encoding.
import contextlib
import io
import re
import sys
import urllib

try:  # Python 2
    import urllib2
    from urllib2 import URLError
except ImportError:  # Python 3
    import urllib.request as urllib2
    from urllib.error import URLError

from bs4 import BeautifulSoup

url1 = 'http://cs.58.com/ershoufang/pn'
addr1 = "/home/hadoop/58fang-ml-line1.txt"

USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20131029 Firefox/17.0'

# District name -> numeric code written to the output file.
# Kaifu:5 Furong:5 Tianxin:5 Yuhua:4 Yuelu:3 Xingsha:2 Wangcheng:1
# Changsha outskirts:1. Districts not listed here are written unchanged.
LOCATION_CODES = {
    u"开福": u"5",
    u"芙蓉": u"5",
    u"天心": u"5",
    u"雨花": u"4",
    u"岳麓": u"3",
    u"星沙": u"2",
    u"望城": u"1",
    u"长沙周边": u"1",
}

# [0-9] rather than \d so that Python 2 and Python 3 match the same characters.
_INTEGER_RE = re.compile(r"[0-9]+")
_NUMBER_RE = re.compile(r"[0-9]+(?:\.[0-9]+)?")


def _first_number(text):
    """Return the first (possibly decimal) number in *text*, or None if absent."""
    match = _NUMBER_RE.search(text)
    return match.group(0) if match else None


def _count_rooms(text):
    """Return the sum of all integers found in *text* as a string ("0" if none)."""
    digits = _INTEGER_RE.findall(text.replace(u" ", u""))
    return u"%d" % sum(int(d) for d in digits)


def _parse_listing(house):
    """Convert one listing element into an output line.

    Returns the line (including the trailing newline), or None when the
    listing lacks one of the required fields and must be skipped.
    """
    spans = house.select('.list-info > .baseinfo > span ')
    links = house.select('.list-info > .baseinfo > span > a ')
    units = house.select('.price > .unit')
    # The fourth span (floor information) is not exported, but listings
    # without it are still skipped. Checking the length before indexing also
    # protects the spans[0] / spans[1] accesses below.
    if len(spans) <= 3 or len(links) <= 1 or not units:
        return None

    # Extract the numeric part instead of cutting a fixed number of bytes
    # off the end, which breaks as soon as the unit suffix changes.
    area = _first_number(spans[1].get_text())
    price = _first_number(units[0].get_text())
    if area is None or price is None:
        return None

    rooms = _count_rooms(spans[0].get_text())

    # subway: near subway means 1, far from subway means 0.
    # The flag is set when the second ".baseinfo" block holds more than one
    # span; a listing without a second block counts as "0".
    info_blocks = house.select('.list-info > .baseinfo')
    near_subway = len(info_blocks) > 1 and len(info_blocks[1].select('span')) > 1
    subway = u"1" if near_subway else u"0"

    district = links[1].get_text().strip()
    location = LOCATION_CODES.get(district, district)

    return u",".join([price, area, rooms, subway, location]) + u"\n"


def startscrapy(page, url, addr):
    """Fetch one result page and append its listings to the file *addr*.

    page -- page number, appended to *url* to build the request URL
    url  -- URL prefix of the listing pages
    addr -- path of the output file, opened in append mode as UTF-8

    Network errors (URLError and its subclasses) are printed and the page
    is skipped. Returns None.
    """
    url = url + str(page)
    headers = {'User-Agent': USER_AGENT}
    try:
        request = urllib2.Request(url, headers=headers)
        # closing() releases the connection on both Python 2 and Python 3.
        with contextlib.closing(urllib2.urlopen(request)) as response:
            content = response.read().decode('utf-8')
    except URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return

    soup = BeautifulSoup(content, "html.parser")
    parsed = (_parse_listing(house) for house in soup.select('.house-list-wrap > li '))
    rows = [row for row in parsed if row is not None]
    if rows:
        # Open the file once per page and close it deterministically.
        with io.open(addr, 'a', encoding='utf-8') as out:
            out.writelines(rows)


def main():
    """Scrape result pages 2 through 70 into addr1."""
    # NOTE(review): page 1 is never fetched -- confirm this is intended.
    for page in range(2, 71):
        startscrapy(page, url=url1, addr=addr1)
        print("scrapying:" + url1 + str(page))


if __name__ == "__main__":
    main()