Python解析网页
来自CloudWiki
题目背景
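本章的实践项目是获取安居客网站上北京二手房的数据。本项目需要获取前10页二手房源的名称、价格、几房几厅、大小、建造年份、联系人、地址、标签。网页地址为:https://beijing.anjuke.com/sale/。(注意:下文示例代码实际请求的是济南站 https://jinan.anjuke.com/sale/,并且还额外输出了单价和楼层;如需抓取北京的数据,请将代码链接中的 jinan 改为 beijing。)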
网站分析
代码
解析首页
"""Fetch the first Anjuke second-hand listing page and print every listing."""
import time
import random

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
link = 'https://jinan.anjuke.com/sale/'

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def _text_of(node):
    """Return ``node.text``, or ``''`` when the element was not found."""
    return node.text if node is not None else ''


def _child_text(node, index):
    """Return the text of ``node.contents[index]``, or ``''`` if unavailable.

    Covers a missing parent (``None``), a child list that is too short, and a
    child object that has no ``text`` attribute.
    """
    try:
        return node.contents[index].text
    except (AttributeError, IndexError):
        return ''


def _parse_house(house):
    """Extract the printed fields from one ``<li class="list-item">`` element.

    Args:
        house: the BeautifulSoup tag for a single listing.

    Returns:
        A tuple ``(name, price, price_area, no_room, area, floor, year,
        broker, address, tags)``. Every field is a string except ``tags``,
        which is a list of strings. A field whose element is missing from the
        listing is returned as ``''`` instead of raising, so one incomplete
        listing does not abort the whole run.
    """
    title = house.find('div', class_='house-title')
    name = _text_of(title.a if title is not None else None).strip()
    price = _text_of(house.find('span', class_='price-det')).strip()
    price_area = _text_of(house.find('span', class_='unit-price')).strip()

    # Look the details container up once instead of once per field.
    details = house.find('div', class_='details-item')
    no_room = _text_of(details.span if details is not None else None)
    # Children 3, 5 and 7 are read for area, floor and year; presumably the
    # children in between are separator nodes -- verify against the page markup.
    area = _child_text(details, 3)
    floor = _child_text(details, 5)
    year = _child_text(details, 7)

    # The first character of the broker text is dropped, as in the original
    # script (presumably a leading icon or whitespace character -- confirm).
    broker = _text_of(house.find('span', class_='brokername'))[1:]

    # Collapse every whitespace run (including non-breaking spaces and line
    # breaks) into a single space rather than matching one exact literal,
    # which only works for one particular indentation of the page's HTML.
    raw_address = _text_of(house.find('span', class_='comm-address'))
    address = ' '.join(raw_address.split())

    tags = [tag.text for tag in house.find_all('span', class_='item-tags')]
    return (name, price, price_area, no_room, area, floor, year, broker,
            address, tags)


r = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
house_list = soup.find_all('li', class_="list-item")
for house in house_list:
    print(*_parse_house(house))
解析剩余页面
"""Fetch Anjuke second-hand listing pages 1-10 and print every listing."""
import time
import random

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}

# Pages are requested from FIRST_PAGE to LAST_PAGE inclusive.
FIRST_PAGE = 1
LAST_PAGE = 10

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def _text_of(node):
    """Return ``node.text``, or ``''`` when the element was not found."""
    return node.text if node is not None else ''


def _child_text(node, index):
    """Return the text of ``node.contents[index]``, or ``''`` if unavailable.

    Covers a missing parent (``None``), a child list that is too short, and a
    child object that has no ``text`` attribute.
    """
    try:
        return node.contents[index].text
    except (AttributeError, IndexError):
        return ''


def _parse_house(house):
    """Extract the printed fields from one ``<li class="list-item">`` element.

    Args:
        house: the BeautifulSoup tag for a single listing.

    Returns:
        A tuple ``(name, price, price_area, no_room, area, floor, year,
        broker, address, tags)``. Every field is a string except ``tags``,
        which is a list of strings. A field whose element is missing from the
        listing is returned as ``''`` instead of raising, so one incomplete
        listing does not abort the whole crawl.
    """
    title = house.find('div', class_='house-title')
    name = _text_of(title.a if title is not None else None).strip()
    price = _text_of(house.find('span', class_='price-det')).strip()
    price_area = _text_of(house.find('span', class_='unit-price')).strip()

    # Look the details container up once instead of once per field.
    details = house.find('div', class_='details-item')
    no_room = _text_of(details.span if details is not None else None)
    # Children 3, 5 and 7 are read for area, floor and year; presumably the
    # children in between are separator nodes -- verify against the page markup.
    area = _child_text(details, 3)
    floor = _child_text(details, 5)
    year = _child_text(details, 7)

    # The first character of the broker text is dropped, as in the original
    # script (presumably a leading icon or whitespace character -- confirm).
    broker = _text_of(house.find('span', class_='brokername'))[1:]

    # Collapse every whitespace run (including non-breaking spaces and line
    # breaks) into a single space rather than matching one exact literal,
    # which only works for one particular indentation of the page's HTML.
    raw_address = _text_of(house.find('span', class_='comm-address'))
    address = ' '.join(raw_address.split())

    tags = [tag.text for tag in house.find_all('span', class_='item-tags')]
    return (name, price, price_area, no_room, area, floor, year, broker,
            address, tags)


for i in range(FIRST_PAGE, LAST_PAGE + 1):
    link = 'https://jinan.anjuke.com/sale/p' + str(i)
    r = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    r.raise_for_status()
    print ('现在爬取的是第', i, '页')
    soup = BeautifulSoup(r.text, "html.parser")
    house_list = soup.find_all('li', class_="list-item")
    for house in house_list:
        print(*_parse_house(house))

    # Pause a random 10-18 seconds between page requests to keep the request
    # rate low; there is nothing left to fetch after the last page, so the
    # final pause is skipped.
    if i < LAST_PAGE:
        time_period = random.randint(10, 18)
        time.sleep(time_period)
返回 大数据分析