“采集分析Discuz论坛上的数据”的版本间的差异
来自CloudWiki
201708010242(讨论 | 贡献) |
201708010242(讨论 | 贡献) (→爬取结果) |
||
(未显示另一用户的1个中间版本) | |||
第30行: | 第30行: | ||
from urllib.parse import urlparse | from urllib.parse import urlparse | ||
from urllib.parse import parse_qs | from urllib.parse import parse_qs | ||
− | import | + | import sys |
import csv | import csv | ||
− | |||
# 获取网页源代码 | # 获取网页源代码 | ||
− | def get_url_content( | + | def get_url_content(url): |
− | + | response = requests.get(url) | |
− | |||
− | |||
− | response = requests.get(url | ||
if response.status_code == 200: | if response.status_code == 200: | ||
if "抱歉,指定的主题不存在或已被删除或正在被审核" in response.text: | if "抱歉,指定的主题不存在或已被删除或正在被审核" in response.text: | ||
第51行: | 第47行: | ||
def parse_post_data(html_text): | def parse_post_data(html_text): | ||
soup_object = BeautifulSoup(html_text, 'lxml') | soup_object = BeautifulSoup(html_text, 'lxml') | ||
− | + | title = soup_object.title.string | |
− | + | url = soup_object.link['href'] | |
− | + | parsed_url = urlparse(url) | |
− | + | query_string_object = parse_qs(parsed_url.query) | |
− | + | tid = query_string_object['tid'][0] | |
− | + | user_list = get_post_userlist(soup_object) | |
− | + | content_list = get_post_content_list(soup_object) | |
− | + | for i in range(len(content_list)): | |
− | + | content_list[i]["user_info"] = user_list[i] | |
− | + | post_content_info = { | |
− | + | 'title':title, | |
− | + | 'url':url, | |
− | + | 'tid':tid, | |
− | + | 'author':user_list[0], | |
− | + | 'content':content_list[0]['content'], | |
− | + | 'comments':content_list | |
− | + | } | |
− | + | return post_content_info | |
− | + | ||
− | + | ||
− | + | ||
− | + | def get_post_userlist(post_soup_object): | |
− | + | user_info_doms = post_soup_object.select(".authi") | |
− | + | user_list = [] | |
− | + | for i in range(len(user_info_doms)): | |
+ | if i % 2 == 0: | ||
+ | user_name = user_info_doms[i].a['href'] | ||
+ | uid = parse_qs(user_info_doms[i].a['href'])['uid'][0] | ||
+ | user_list.append({"user_name": user_name, "uid": uid}) | ||
+ | return user_list | ||
− | def | + | def get_post_content_list(post_soup_object): |
− | + | content_object_list = post_soup_object.select('.t_f') | |
− | + | content_list = [] | |
− | + | for i in range(len(content_object_list)): | |
− | + | postmessage_id = content_object_list[i]['id'] | |
− | + | tid = postmessage_id.split('_')[1] | |
− | + | content = content_object_list[i].text.split() | |
− | + | content_list.append({"tid":tid,"content":content}) | |
+ | return content_list | ||
+ | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
def main(): | def main(): | ||
− | + | max_tid = input('输入最大文章') | |
− | + | heads = ['title', 'url', 'tid', 'author', 'content', 'comments'] | |
− | for page in range( | + | for page in range(int(max_tid)): |
− | + | url = 'http://114.112.74.138/forum.php?mod=viewthread&tid='+str(page)+'&extra=page%3D3' | |
− | + | ret = get_url_content(url) | |
− | + | if ret != False: | |
− | + | row = '' | |
− | + | data = parse_post_data(ret) | |
− | + | for item in heads: | |
− | + | row = row + str(data.get(item)) +'\t' | |
− | + | row = row+'\n' | |
− | + | with open('hongya.txt','a+') as f: | |
− | + | f.write(row) | |
− | + | else: | |
− | print( | + | print("not found") |
− | + | ||
− | |||
if __name__ == '__main__': | if __name__ == '__main__': | ||
main()</nowiki> | main()</nowiki> | ||
− | |||
− | |||
== 爬取结果 == | == 爬取结果 == | ||
− | [[文件: | + | [[文件:QQ截图20181203191332.png]] |
2018年12月3日 (一) 11:14的最新版本
.
目标环境
- Discuz!是目前国内知名的开源 php 社交系统。它的基础架构采用 PHP+MySQL 实现;适用于各种服务器环境的高效论坛系统。直接访问目标站点 ip 即可进入论坛主页。论坛的默认模块包含 5800+条主 题帖及 1700+条回复帖,共计 7500+条有效回复内容;包含 550+会员。其中涉及到的信息包含:论坛版块、发帖人、回帖人、发帖人 ID、发帖人 名称、回帖人 ID、回帖人名称、用户头像、发帖内容、回帖内容、发帖 ID、回 帖 ID 等。
逻辑图
逻辑关系为: (一)论坛版块对应多个帖子 (二)用户对应多个发帖 (三)用户对应多个回帖 (四)发帖对应多个回帖 (五)发帖包含:发帖 id、发帖标题、发帖内容、发帖人 id (六)回帖包含:发帖 id、回帖 id、回帖内容、回帖人 id (七)用户包含:用户 id、名称、头像
操作方法
代码实现
import requests from bs4 import BeautifulSoup from urllib.parse import urlparse from urllib.parse import parse_qs import sys import csv # 获取网页源代码 def get_url_content(url): response = requests.get(url) if response.status_code == 200: if "抱歉,指定的主题不存在或已被删除或正在被审核" in response.text: return False else: return response.text else: return False def parse_post_data(html_text): soup_object = BeautifulSoup(html_text, 'lxml') title = soup_object.title.string url = soup_object.link['href'] parsed_url = urlparse(url) query_string_object = parse_qs(parsed_url.query) tid = query_string_object['tid'][0] user_list = get_post_userlist(soup_object) content_list = get_post_content_list(soup_object) for i in range(len(content_list)): content_list[i]["user_info"] = user_list[i] post_content_info = { 'title':title, 'url':url, 'tid':tid, 'author':user_list[0], 'content':content_list[0]['content'], 'comments':content_list } return post_content_info def get_post_userlist(post_soup_object): user_info_doms = post_soup_object.select(".authi") user_list = [] for i in range(len(user_info_doms)): if i % 2 == 0: user_name = user_info_doms[i].a['href'] uid = parse_qs(user_info_doms[i].a['href'])['uid'][0] user_list.append({"user_name": user_name, "uid": uid}) return user_list def get_post_content_list(post_soup_object): content_object_list = post_soup_object.select('.t_f') content_list = [] for i in range(len(content_object_list)): postmessage_id = content_object_list[i]['id'] tid = postmessage_id.split('_')[1] content = content_object_list[i].text.split() content_list.append({"tid":tid,"content":content}) return content_list def main(): max_tid = input('输入最大文章') heads = ['title', 'url', 'tid', 'author', 'content', 'comments'] for page in range(int(max_tid)): url = 'http://114.112.74.138/forum.php?mod=viewthread&tid='+str(page)+'&extra=page%3D3' ret = get_url_content(url) if ret != False: row = '' data = parse_post_data(ret) for item in heads: row = row + str(data.get(item)) +'\t' row = row+'\n' with open('hongya.txt','a+') as f: f.write(row) else: print("not found") if __name__ == '__main__': main()