“采集分析Discuz论坛上的数据”的版本间的差异
来自CloudWiki
第25行: | 第25行: | ||
代码实现 | 代码实现 | ||
+ | |||
+ | <nowiki>import requests | ||
+ | from bs4 import BeautifulSoup | ||
+ | from urllib.parse import urlparse | ||
+ | from urllib.parse import parse_qs | ||
+ | import re | ||
+ | import csv | ||
+ | |||
+ | |||
+ | # 获取网页源代码 | ||
+ | def get_url_content(page): | ||
+ | url = 'http://www.discuz.net/forum.php?mod=guide&view=new&page='+page | ||
+ | headers = { | ||
+ | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'} | ||
+ | response = requests.get(url, headers=headers) | ||
+ | if response.status_code == 200: | ||
+ | if "抱歉,指定的主题不存在或已被删除或正在被审核" in response.text: | ||
+ | return False | ||
+ | else: | ||
+ | return response.text | ||
+ | else: | ||
+ | return False | ||
+ | |||
+ | |||
+ | def parse_post_data(html_text): | ||
+ | soup_object = BeautifulSoup(html_text, 'lxml') | ||
+ | newlist = soup_object.select('.bm_c table tbody tr') | ||
+ | for new in newlist: | ||
+ | try: | ||
+ | title = new.select('.common a')[0].text | ||
+ | url = 'http://www.discuz.net/' + new.select('.common a')[0]['href'] | ||
+ | detail = get_url_deatil_content(url) | ||
+ | plate = new.select('.by')[0].select('a')[0].text | ||
+ | author = new.select('.by')[1].select('a')[0].text | ||
+ | authorHome = 'http://www.discuz.net/' + new.select('.by')[1].select('cite a')[0]['href'] | ||
+ | parsed_url = urlparse(authorHome) | ||
+ | query_string_object = parse_qs(parsed_url.query) | ||
+ | uid = query_string_object['uid'][0] | ||
+ | commNum = new.select('.num em')[0].text | ||
+ | post_content_info = { | ||
+ | 'title': title, | ||
+ | 'url': url, | ||
+ | 'uid': uid, | ||
+ | 'plate': plate, | ||
+ | 'author': author, | ||
+ | 'authorHome': authorHome, | ||
+ | 'commNum': commNum | ||
+ | } | ||
+ | yield post_content_info | ||
+ | except: | ||
+ | pass | ||
+ | |||
+ | |||
+ | def get_url_deatil_content(url): | ||
+ | headers = { | ||
+ | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'} | ||
+ | response = requests.get(url, headers=headers) | ||
+ | if response.status_code == 200: | ||
+ | return response.text | ||
+ | else: | ||
+ | return 'error' | ||
+ | |||
+ | |||
+ | def parse_deatil_content(detail): | ||
+ | soup = BeautifulSoup(detail, 'lxml') | ||
+ | commList = soup.select('.t_f') | ||
+ | userList = soup.select('.authi a.xw1') | ||
+ | userArr = [] | ||
+ | commArr = [] | ||
+ | for i in range(len(userList)): | ||
+ | auth = userList[i].text | ||
+ | userArr.append(auth) | ||
+ | for i in range(len(commList)): | ||
+ | comm = commList[i].text | ||
+ | comm = re.sub(r'[^\u4e00-\u9fa5]+', '', comm) | ||
+ | commArr.append(comm) | ||
+ | return [userArr, commArr] | ||
+ | |||
+ | def save_csv(line): | ||
+ | with open('info.csv','w',newline='') as f: | ||
+ | writer = csv.writer(f) | ||
+ | writer.writerow(line) | ||
+ | |||
+ | def main(): | ||
+ | f = open('info.csv', 'w', newline='') | ||
+ | writer = csv.writer(f) | ||
+ | for page in range(12): | ||
+ | html = get_url_content(str(page)) | ||
+ | data = parse_post_data(html) | ||
+ | for info in data: | ||
+ | dataUrl = info['url'] | ||
+ | nextHtml = get_url_deatil_content(dataUrl) | ||
+ | user, comm = parse_deatil_content(nextHtml) | ||
+ | info['user'] = user | ||
+ | info['comm'] = comm | ||
+ | line = [] | ||
+ | for k in info: | ||
+ | line.append(str(info[k])) | ||
+ | print(line) | ||
+ | writer.writerow(line) | ||
+ | f.close() | ||
+ | if __name__ == '__main__': | ||
+ | main()</nowiki> |
2018年11月19日 (一) 09:57的版本
.
目标环境
- Discuz!是目前国内知名的开源 php 社交系统。它的基础架构采用 PHP+MySQL 实现;适用于各种服务器环境的高效论坛系统。直接访问目标站点 ip 即可进入论坛主页。论坛的默认模块包含 5800+条主 题帖及 1700+条回复帖,共计 7500+条有效回复内容;包含 550+会员。其中涉及到的信息包含:论坛版块、发帖人、回帖人、发帖人 ID、发帖人 名称、回帖人 ID、回帖人名称、用户头像、发帖内容、回帖内容、发帖 ID、回 帖 ID 等。
逻辑图
逻辑关系为: (一)论坛版块对应多个帖子 (二)用户对应多个发帖 (三)用户对应多个回帖 (四)发帖对应多个回帖 (五)发帖包含:发帖 id、发帖标题、发帖内容、发帖人 id (六)回帖包含:发帖 id、回帖 id、回帖内容、回帖人 id (七)用户包含:用户 id、名称、头像
操作方法
代码实现
import requests from bs4 import BeautifulSoup from urllib.parse import urlparse from urllib.parse import parse_qs import re import csv # 获取网页源代码 def get_url_content(page): url = 'http://www.discuz.net/forum.php?mod=guide&view=new&page='+page headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'} response = requests.get(url, headers=headers) if response.status_code == 200: if "抱歉,指定的主题不存在或已被删除或正在被审核" in response.text: return False else: return response.text else: return False def parse_post_data(html_text): soup_object = BeautifulSoup(html_text, 'lxml') newlist = soup_object.select('.bm_c table tbody tr') for new in newlist: try: title = new.select('.common a')[0].text url = 'http://www.discuz.net/' + new.select('.common a')[0]['href'] detail = get_url_deatil_content(url) plate = new.select('.by')[0].select('a')[0].text author = new.select('.by')[1].select('a')[0].text authorHome = 'http://www.discuz.net/' + new.select('.by')[1].select('cite a')[0]['href'] parsed_url = urlparse(authorHome) query_string_object = parse_qs(parsed_url.query) uid = query_string_object['uid'][0] commNum = new.select('.num em')[0].text post_content_info = { 'title': title, 'url': url, 'uid': uid, 'plate': plate, 'author': author, 'authorHome': authorHome, 'commNum': commNum } yield post_content_info except: pass def get_url_deatil_content(url): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'} response = requests.get(url, headers=headers) if response.status_code == 200: return response.text else: return 'error' def parse_deatil_content(detail): soup = BeautifulSoup(detail, 'lxml') commList = soup.select('.t_f') userList = soup.select('.authi a.xw1') userArr = [] commArr = [] for i in range(len(userList)): auth = userList[i].text userArr.append(auth) for i in range(len(commList)): comm = commList[i].text comm = re.sub(r'[^\u4e00-\u9fa5]+', '', comm) commArr.append(comm) return [userArr, commArr] def save_csv(line): with open('info.csv','w',newline='') as f: writer = csv.writer(f) writer.writerow(line) def main(): f = open('info.csv', 'w', newline='') writer = csv.writer(f) for page in range(12): html = get_url_content(str(page)) data = parse_post_data(html) for info in data: dataUrl = info['url'] nextHtml = get_url_deatil_content(dataUrl) user, comm = parse_deatil_content(nextHtml) info['user'] = user info['comm'] = comm line = [] for k in info: line.append(str(info[k])) print(line) writer.writerow(line) f.close() if __name__ == '__main__': main()