Python爬虫案例:爬取二手房信息

来自CloudWiki
跳转至: 导航搜索

代码功能

爬取58同城长沙站(cs.58.com)的二手房信息,逐页解析房源列表,并将价格、面积、房间数、是否近地铁、区域评分以逗号分隔的形式追加写入文本文件。


代码

# coding:utf-8
import urllib
import urllib2
import re
import sys
from bs4 import BeautifulSoup

# Python 2 only: sys.setdefaultencoding is removed from the sys namespace at
# interpreter start-up, so the module has to be reloaded to get it back.
reload(sys)
# Make implicit str <-> unicode conversions use UTF-8 instead of ASCII.
sys.setdefaultencoding("utf-8")
# Base listing URL; the page number is appended directly to this string.
url1 = 'http://cs.58.com/ershoufang/pn'
# Text file that the scraped rows are appended to.
addr1="/home/hadoop/58fang-ml-line1.txt"

# District name -> numeric score written to the output instead of the name.
# Kaifu: 5, Furong: 5, Tianxin: 5, Yuhua: 4, Yuelu: 3, Xingsha: 2,
# Wangcheng: 1, Changsha outskirts: 1.  Unknown districts are left as-is.
_LOCATION_SCORES = {
    "开福": "5",
    "芙蓉": "5",
    "天心": "5",
    "雨花": "4",
    "岳麓": "3",
    "星沙": "2",
    "望城": "1",
    "长沙周边": "1",
}


def _tag_text(tag):
    """Return the tag's single text child as UTF-8 bytes, or None if it has none."""
    text = tag.string
    if text is None:
        return None
    return text.encode('utf-8')


def _parse_house(house):
    """Turn one listing element into a CSV line, or None if data is missing.

    The line has the form ``price,area,room,subway,location\\n``.
    """
    # Run each selector once instead of re-querying the tree for every field.
    spans = house.select('.list-info > .baseinfo > span ')
    links = house.select('.list-info > .baseinfo > span > a ')
    units = house.select('.price > .unit')
    info_rows = house.select('.list-info > .baseinfo')

    # Listings lacking any expected element are skipped rather than letting
    # an IndexError abort the whole crawl.  The "> 3" span check is kept from
    # the original (it guarded the floor field) so the same listings are
    # filtered out even though the floor value itself is never written.
    if len(spans) <= 3 or len(links) <= 1 or not units or len(info_rows) <= 1:
        return None

    room = _tag_text(spans[0])
    area = _tag_text(spans[1])
    location = _tag_text(links[1])
    price = _tag_text(units[0])
    if room is None or area is None or location is None or price is None:
        return None

    # subway: "1" when the second info row carries more than one span
    # (near a subway), otherwise "0".
    subway = "1" if len(info_rows[1].select('span')) > 1 else "0"

    # room: total of every number in the layout text; "0" when there is none.
    room = str(sum(int(n) for n in re.findall(r'\d+', room.replace(' ', ''))))

    # area: drop the trailing 5 bytes (the square-metre suffix).
    # NOTE(review): assumes the suffix is always exactly 5 UTF-8 bytes - confirm
    # against the live page.
    area = area[:-5]

    # price: drop the trailing 7 bytes (the unit suffix); same caveat as area.
    price = price[:-7]

    location = location.strip()
    location = _LOCATION_SCORES.get(location, location)

    return ",".join([price, area, room, subway, location]) + "\n"


def startscrapy(page, url, addr):
    """Scrape one listing page and append its rows to a text file.

    Args:
        page: Page number; appended to ``url`` to form the request URL.
        url: Base listing URL without the page number.
        addr: Path of the output file, opened in append mode.

    Network errors (``urllib2.URLError`` and subclasses) are reported on
    stdout and the page is skipped; nothing is written in that case.
    """
    url = url + str(page)
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/20131029 Firefox/17.0'
    headers = {'User-Agent': user_agent}

    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            content = response.read().decode('utf-8')
        finally:
            response.close()
    except urllib2.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return

    soup = BeautifulSoup(content, "html.parser")
    lines = []
    for house in soup.select('.house-list-wrap > li '):
        line = _parse_house(house)
        if line is not None:
            lines.append(line)

    # Open the output once per page and let the context manager close it;
    # the original re-opened the file for every listing and never closed it.
    if lines:
        with open(addr, 'a') as out:
            out.writelines(lines)
# Crawl listing pages 2 through 70, reporting each page URL once it is done.
for page in range(2, 71):
    startscrapy(page, url1, addr1)
    print("scrapying:" + "%s%d" % (url1, page))