Python解析网页

来自CloudWiki
跳转至: 导航搜索

题目背景

本章的实践项目是获取安居客网站上北京二手房的数据。本项目需要获取前10页二手房源的名称、总价、单价、几房几厅、面积、楼层、建造年份、联系人、地址和标签。网页地址为: https://beijing.anjuke.com/sale/ 。注意:下文示例代码请求的是济南站( https://jinan.anjuke.com/sale/ ),如需抓取北京的数据,请将代码中网址里的 jinan 改为 beijing。


网站分析

Big1-2.png


代码

解析首页

import time
import random
import requests
from bs4 import BeautifulSoup
# Fetch the first listing page and print one line per second-hand house.
# User-Agent string of a desktop Chrome browser, sent with the request.
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}


# NOTE(review): this URL targets the Jinan sub-site; confirm the intended city.
link = 'https://jinan.anjuke.com/sale/'
# A timeout keeps a stalled connection from hanging the script forever.
r = requests.get(link, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page into zero rows.
r.raise_for_status()

soup = BeautifulSoup(r.text, "html.parser")
# Each listing on the page is an <li class="list-item"> element.
house_list = soup.find_all('li', class_="list-item")

for house in house_list:
    name = house.find('div', class_='house-title').a.text.strip()
    price = house.find('span', class_='price-det').text.strip()
    price_area = house.find('span', class_='unit-price').text.strip()

    # Look the details row up once; rooms, area, floor and year all live in it.
    details = house.find('div', class_='details-item')
    no_room = details.span.text
    # Presumably the odd indices skip the whitespace text nodes that sit
    # between the child tags -- verify against the live markup.
    area = details.contents[3].text
    floor = details.contents[5].text
    year = details.contents[7].text

    # Drop the first character of the broker text (presumably a leading
    # icon/whitespace character -- confirm).
    broker = house.find('span', class_='brokername').text[1:]

    address = house.find('span', class_='comm-address').text.strip()
    # Collapse the nbsp + newline + indentation run inside the address
    # into two plain spaces.
    address = address.replace('\xa0\xa0\n                    ', '  ')

    tags = [tag.text for tag in house.find_all('span', class_='item-tags')]
    print(name, price, price_area, no_room, area, floor, year, broker, address, tags)
        

解析前10页(含首页)

import time
import random
import requests
from bs4 import BeautifulSoup
# Fetch listing pages p1..p10 and print one line per second-hand house,
# pausing a random interval between page requests.
# User-Agent string of a desktop Chrome browser, sent with every request.
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}

# Number of listing pages to fetch; page N is requested as .../sale/pN.
last_page = 10

for i in range(1, last_page + 1):
    # NOTE(review): this URL targets the Jinan sub-site; confirm the intended city.
    link = 'https://jinan.anjuke.com/sale/p' + str(i)
    # A timeout keeps a stalled connection from hanging the whole crawl.
    r = requests.get(link, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page into zero rows.
    r.raise_for_status()
    print('现在爬取的是第', i, '页')

    soup = BeautifulSoup(r.text, "html.parser")
    # Each listing on the page is an <li class="list-item"> element.
    house_list = soup.find_all('li', class_="list-item")

    for house in house_list:
        name = house.find('div', class_='house-title').a.text.strip()
        price = house.find('span', class_='price-det').text.strip()
        price_area = house.find('span', class_='unit-price').text.strip()

        # Look the details row up once; rooms, area, floor and year all live in it.
        details = house.find('div', class_='details-item')
        no_room = details.span.text
        # Presumably the odd indices skip the whitespace text nodes that sit
        # between the child tags -- verify against the live markup.
        area = details.contents[3].text
        floor = details.contents[5].text
        year = details.contents[7].text

        # Drop the first character of the broker text (presumably a leading
        # icon/whitespace character -- confirm).
        broker = house.find('span', class_='brokername').text[1:]

        address = house.find('span', class_='comm-address').text.strip()
        # Collapse the nbsp + newline + indentation run inside the address
        # into two plain spaces.
        address = address.replace('\xa0\xa0\n                    ', '  ')

        # The loop variable is named `tag` so it cannot be confused with the
        # page index `i`.
        tags = [tag.text for tag in house.find_all('span', class_='item-tags')]
        print(name, price, price_area, no_room, area, floor, year, broker, address, tags)

    # Wait a random 10-18 seconds between page requests; there is nothing to
    # wait for once the last page has been processed.
    if i < last_page:
        time.sleep(random.randint(10, 18))

返回 大数据分析