Pandas.read_html() 获取静态网页表格数据
来自CloudWiki
目录
需求
获取 http://www.air-level.com/air/xian/ 的空气质量指数表格数据。
代码
import pandas as pd

# Parse every <table> on the page; the first row of each table is its header.
tables = pd.read_html(
    "http://www.air-level.com/air/xian/",
    encoding='utf-8',
    header=0,
)

# The first table on the page is the one exported.
df = tables[0]

# Write it to an Excel workbook without the DataFrame index column.
df.to_excel('xian_tianqi.xlsx', index=False)
最后
read_html() 仅支持静态网页解析。对于动态页面,可以先通过其他方法获取页面加载完成后的 HTML 文本(例如 response.text),再将其传入 read_html() 来获取表格数据。
案例:爬取微博热搜
# -*- coding: UTF-8 -*-
"""
@File    :微博热搜榜.py
@Author  :叶庭云
@Date    :2020/9/18 15:01

Once a minute, read the Weibo hot-search table and append rows 1-10 of it,
stamped with the current time, to ``datas.csv``.
"""
import logging
import os
import time
from datetime import datetime

import pandas as pd
import schedule

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Number of batches this process has appended to datas.csv.
count = 0


def get_content():
    """Fetch the hot-search table and append its rows 1-10 to ``datas.csv``.

    The '关键词' column is split on a single space: the first piece stays in
    '关键词' and the second piece becomes the new '热度' column. A '时间'
    column holding the current local time is added to every row.

    The CSV header is written only when ``datas.csv`` does not exist yet or is
    empty, so restarting the script does not insert a second header row.

    A failed download or an unexpected table layout is logged and the run is
    skipped; the exception is not propagated, so the scheduler keeps running.
    """
    global count  # module-level batch counter
    print('----------- 正在爬取数据 -------------')
    url = 'https://s.weibo.com/top/summary?cate=realtimehot&sudaref=s.weibo.com&display=0&retcode=6102'

    try:
        # Rows 1..10 of the first table on the page (row 0 is skipped --
        # presumably a pinned entry without a rank; verify against the page).
        # .copy() gives an independent frame so the column assignments below
        # do not trigger SettingWithCopyWarning.
        df = pd.read_html(url)[0][1:11][['序号', '关键词']].copy()
        time_ = datetime.now().strftime("%Y/%m/%d %H:%M")  # current local time
        df['序号'] = df['序号'].apply(int)
        # Split once and reuse: piece 0 is kept as the keyword, piece 1 is
        # stored as the heat value.
        parts = df['关键词'].str.split(' ', expand=True)
        df['热度'] = parts[1]
        df['关键词'] = parts[0]
    except (OSError, ValueError, LookupError):
        # OSError covers urllib network errors, ValueError covers "no tables
        # found" and bad rank values, LookupError covers missing columns/rows.
        logging.exception('Failed to fetch or parse the hot-search table; skipping this run')
        return

    df['时间'] = time_  # scalar is broadcast to every row

    csv_path = 'datas.csv'
    # Decide on the header from the file itself rather than from in-process
    # state, so that a restart appends data rows only.
    write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    df.to_csv(csv_path, mode='a+', index=False, header=write_header)
    count += 1


# Scheduled scraping: run get_content once every minute.
schedule.every(1).minutes.do(get_content)

while True:
    schedule.run_pending()
    # Sleep between polls; without it this loop spins a CPU core at 100%.
    time.sleep(1)