Collecting and Analyzing Data from a Discuz Forum


Target Environment

Discuz! is a well-known open-source PHP community platform in China. It is built on PHP and MySQL and runs as an efficient forum system in a wide range of server environments. Visiting the target site's IP address directly brings up the forum homepage. The forum's default boards hold more than 5,800 topic posts and more than 1,700 replies, over 7,500 valid items in total, contributed by more than 550 members. The information involved includes: forum board, poster, replier, poster ID, poster name, replier ID, replier name, user avatar, post content, reply content, post ID, reply ID, and so on.
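For orientation, the short sketch below shows the shape of a Discuz thread URL on such a site and how the thread id (tid) can be read from its query string. Only the standard library is used, and the tid value 42 is made up for illustration:

from urllib.parse import urlparse, parse_qs

# A typical Discuz thread URL (the tid value here is illustrative).
sample_url = 'http://114.112.74.138/forum.php?mod=viewthread&tid=42&extra=page%3D3'

query = parse_qs(urlparse(sample_url).query)
print(query['mod'][0])  # 'viewthread' -> page type
print(query['tid'][0])  # '42'         -> the thread id the crawler iterates over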


Logic Diagram

The logical relationships are as follows (a minimal data-model sketch appears after the list):
(1) A forum board corresponds to multiple posts.
(2) A user corresponds to multiple posts.
(3) A user corresponds to multiple replies.
(4) A post corresponds to multiple replies.
(5) A post contains: post id, post title, post content, poster id.
(6) A reply contains: post id, reply id, reply content, replier id.
(7) A user contains: user id, name, avatar.
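A minimal sketch of these entities as plain Python dictionaries; all ids, names, and field keys here are invented purely for illustration (the crawler below uses its own dictionary keys):

# Hypothetical records illustrating the relationships above.
user = {'uid': '1', 'name': 'example_user', 'avatar': 'avatar.png'}

post = {'tid': '42', 'title': 'example title',
        'content': 'example post body', 'author_uid': user['uid']}

reply = {'tid': post['tid'], 'pid': '1001',
         'content': 'example reply body', 'replier_uid': user['uid']}

# One board holds many posts; one post holds many replies.
board = {'name': 'example board', 'posts': [post]}
post['replies'] = [reply]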


Procedure

Code Implementation

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import parse_qs

# Fetch the page source; return the HTML text, or False if the thread
# does not exist (Discuz returns HTTP 200 with an error notice in that case).
def get_url_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        # This Chinese sentence is the Discuz "topic does not exist / has been
        # deleted / is under moderation" notice, so it must be matched verbatim.
        if "抱歉,指定的主题不存在或已被删除或正在被审核" in response.text:
            return False
        else:
            return response.text
    else:
        return False


# Parse one thread page into a dict holding the title, URL, tid, author,
# first-floor content, and the full list of floors (post plus replies).
def parse_post_data(html_text):
    soup_object = BeautifulSoup(html_text, 'lxml')
    title = soup_object.title.string
    # Relies on the first <link> element in the page head pointing at the
    # canonical thread URL, whose query string carries the thread id (tid).
    url = soup_object.link['href']
    parsed_url = urlparse(url)
    query_string_object = parse_qs(parsed_url.query)
    tid = query_string_object['tid'][0]
    user_list = get_post_userlist(soup_object)
    content_list = get_post_content_list(soup_object)
    # Attach the matching user info to each floor.
    for i in range(len(content_list)):
        content_list[i]["user_info"] = user_list[i]
    post_content_info = {
        'title': title,
        'url': url,
        'tid': tid,
        'author': user_list[0],
        'content': content_list[0]['content'],
        'comments': content_list
    }
    return post_content_info



# Collect the poster/replier of every floor. The ".authi" block appears twice
# per floor in the default Discuz template, so only every other match is used.
def get_post_userlist(post_soup_object):
    user_info_doms = post_soup_object.select(".authi")
    user_list = []
    for i in range(len(user_info_doms)):
        if i % 2 == 0:
            # The anchor text is the user name; its href
            # (home.php?mod=space&uid=N) carries the numeric uid.
            user_name = user_info_doms[i].a.text
            uid = parse_qs(urlparse(user_info_doms[i].a['href']).query)['uid'][0]
            user_list.append({"user_name": user_name, "uid": uid})
    return user_list


# Collect the text of every floor. Each ".t_f" element has an id of the form
# "postmessage_<pid>", where <pid> is the id of that individual post.
def get_post_content_list(post_soup_object):
    content_object_list = post_soup_object.select('.t_f')
    content_list = []
    for i in range(len(content_object_list)):
        postmessage_id = content_object_list[i]['id']
        pid = postmessage_id.split('_')[1]
        # Collapse runs of whitespace in the post body into single spaces.
        content = ' '.join(content_object_list[i].text.split())
        content_list.append({"pid": pid, "content": content})
    return content_list





# Crawl threads with tid = 0 .. max_tid - 1 and append one tab-separated
# record per existing thread to hongya.txt.
def main():
    max_tid = input('Enter the maximum thread id: ')
    heads = ['title', 'url', 'tid', 'author', 'content', 'comments']
    for page in range(int(max_tid)):
        url = 'http://114.112.74.138/forum.php?mod=viewthread&tid=' + str(page) + '&extra=page%3D3'
        ret = get_url_content(url)
        if ret:
            row = ''
            data = parse_post_data(ret)
            for item in heads:
                row = row + str(data.get(item)) + '\t'
            row = row + '\n'
            with open('hongya.txt', 'a+', encoding='utf-8') as f:
                f.write(row)
        else:
            print("not found")

if __name__ == '__main__':
    main()
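Running the script appends one tab-separated record per existing thread to hongya.txt. A minimal read-back sketch, assuming the column order written by main() above:

import csv

# Each row follows the 'heads' order: title, url, tid, author, content, comments.
with open('hongya.txt', encoding='utf-8') as f:
    for record in csv.reader(f, delimiter='\t'):
        title, url, tid = record[0], record[1], record[2]
        print(tid, title)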

Crawl Results

[Screenshot of the crawl output: QQ截图20181203191332.png]