Using the Python beautifulsoup4 Library


Overview of the beautifulsoup4 Library

beautifulsoup4 (imported as bs4) is a Python library for parsing HTML and XML documents. It turns page source into a tree of Python objects, making it easy to locate tags and extract their attributes and text.

Installing beautifulsoup4

Run the following at the command line:

pip3 install beautifulsoup4

Note: the package name on PyPI is beautifulsoup4, while the module you import is bs4 (pip3 install bs4 installs only a stub that pulls in beautifulsoup4).
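To confirm the installation, you can print the installed version from the command line (a quick check, assuming a standard Python 3 setup):

python3 -c "import bs4; print(bs4.__version__)"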

Usage of the beautifulsoup4 Library

Basic usage

>>> import requests
>>> from bs4 import BeautifulSoup
>>> r = requests.get("http://www.baidu.com") # fetch a web page
>>> r.encoding = 'utf-8'

>>> soup = BeautifulSoup(r.text, "html.parser")
>>> type(soup)
<class 'bs4.BeautifulSoup'>
  • Error analysis: omitting the parser argument triggers a warning:
>>> soup = BeautifulSoup(r.text) # soup is a BeautifulSoup object, but no parser is given

Warning (from warnings module):
  File "C:\Users\thinkpad\AppData\Local\Programs\Python\Python36-32\lib\site-packages\bs4\__init__.py", line 181
    markup_type=markup_type))
UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

The code that caused this warning is on line 1 of the file <string>. To get rid of this warning, change code that looks like this:

 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html.parser")
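To silence the warning, always name the parser explicitly. html.parser ships with the standard library; lxml is a faster third-party alternative that must be installed separately. A minimal sketch continuing the session above:

>>> soup = BeautifulSoup(r.text, "html.parser")  # standard-library parser, no extra install
>>> # soup = BeautifulSoup(r.text, "lxml")       # faster; requires: pip3 install lxml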

Parsing web page elements

>>> soup.head

<head><meta content="text/html;charset=utf-8" http-equiv="content-type"/><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="always" name="referrer"/><link href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css" rel="stylesheet" type="text/css"/><title>百度一下,你就知道</title></head>

>>> title = soup.title

>>> title

<title>百度一下,你就知道</title>

>>> type(title)

<class 'bs4.element.Tag'>

>>> soup.p

<p id="lh"> <a href="http://home.baidu.com">关于百度</a> <a href="http://ir.baidu.com">About Baidu</a> </p>

>>> soup.a

<a class="mnav" href="http://news.baidu.com" name="tj_trnews">新闻</a>

>>> soup.a.name

'a'

>>> soup.a.attrs

{'href': 'http://news.baidu.com', 'name': 'tj_trnews', 'class': ['mnav']}

>>> soup.a.string

'新闻'

>>> title = soup.title

>>> title.name

'title'

>>> title.string

'百度一下,你就知道'

>>> soup.p.contents

[' ', <a href="http://home.baidu.com">关于百度</a>, ' ', <a href="http://ir.baidu.com">About Baidu</a>, ' ']
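Tag objects also support dictionary-style access to attributes and navigation up and down the parse tree. A short sketch continuing the same session:

>>> soup.a['href']          # subscript access to a single attribute
'http://news.baidu.com'
>>> soup.title.parent.name  # move up to the enclosing tag
'head'
>>> for child in soup.p.children:  # iterate over the <p> tag's direct children
...     print(child)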

Traversing web page tags

>>> a = soup.find_all('a')
>>> len(a)
11
>>> soup.find_all('script')
[<script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">登录</a>');</script>]
>>> import re
>>> soup.find_all('script',{'src':re.compile('jquery')}) # use the re module to match a fragment of the src attribute
[]
>>> soup.find_all(string=re.compile('百度')) # filter text strings containing '百度'
['百度一下,你就知道', '关于百度', '使用百度前必读']
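A common pattern combines find_all with attribute access to pull every link out of a page. A minimal sketch using the same soup object:

>>> for link in soup.find_all('a'):
...     print(link.get('href'), link.string)  # URL and link text of each <a> tag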

Web crawler example

The following code crawls the page http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html and prints a ranking of Chinese universities.

#e23.1CrawUnivRanking.py
import requests
from bs4 import BeautifulSoup
allUniv = []
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()      # raise an exception on HTTP errors (4xx/5xx)
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return ""
def fillUnivList(soup):
    data = soup.find_all('tr')    # each table row holds one university
    for tr in data:
        ltd = tr.find_all('td')
        if len(ltd) == 0:         # skip header rows that contain no <td> cells
            continue
        singleUniv = []
        for td in ltd:
            singleUniv.append(td.string)
        allUniv.append(singleUniv)
def printUnivList(num):
    # column headers: rank, university name, province, total score, enrollment size
    print("{:^4}{:^10}{:^5}{:^8}{:^10}".format("排名","学校名称","省市","总分","培养规模"))
    for i in range(num):
        u = allUniv[i]
        print("{:^4}{:^10}{:^5}{:^8}{:^10}".format(u[0],u[1],u[2],u[3],u[6]))
def main():
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    printUnivList(100)
main()
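One fragility worth noting: td.string returns None whenever a cell contains nested tags instead of plain text. If the target page wraps cell contents in extra markup, a safer variant of the inner loop (a sketch, not part of the original example) collects the cell's full text instead:

        for td in ltd:
            singleUniv.append(td.get_text(strip=True))  # concatenate all text inside the cell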

Back to Python Web Crawlers and Automation