第1个网络爬虫

准备工作

在 Windows 的命令行中输入如下命令，安装 requests 库和 bs4 库：

pip3 install requests
pip3 install bs4

爬取流程

第一步：获取页面

初始代码：

#!/usr/bin/python
# coding: utf-8

import requests
# Fetch one page, show the encoding stored on the response, then force UTF-8.
link = "http://www.sohu.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}

r = requests.get(link, headers=headers)
print(r.encoding)     # print the page's encoding as held in r.encoding
r.encoding = 'utf-8'  # set the encoding to utf-8 before reading r.text
print(r.text)

第一步（续）：加入编码方式的判断

r.encoding 保存的是网页的编码方式，上面的代码将它打印了出来。大部分网页都是 utf-8 的，将编码设为 utf-8 就可以正常显示；但是也有很多网页不是，例如：

link = "http://www.sina.com.cn"#ISO-8859-1
link = "http://www.baidu.com"#ISO-8859-1
link = "http://www.163.com"#GBK
link = "http://www.qq.com"#GB2312
link = "http://www.sohu.com"  #utf-8
link = "http://www.taobao.com"#utf-8

对于这些网页，我们只能先判断一下其编码方式，然后根据其编码方式进行打印：

#!/usr/bin/python
# coding: utf-8

import requests
import sys
from bs4 import BeautifulSoup     #从bs4这个库中导入BeautifulSoup

link = "http://www.baidu.com"  # ISO-8859-1
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}
r = requests.get(link, headers=headers)
print(r.encoding)

# Reported encoding -> encoding actually used to decode the body.
# Any encoding not listed here is left exactly as it was reported.
decode_as = {
    'gb2312': 'gb2312',     # pages such as Tencent
    'GBK': 'GBK',           # pages such as NetEase
    'ISO-8859-1': 'utf-8',  # pages such as Baidu and Sina
}
if r.encoding in decode_as:
    r.encoding = decode_as[r.encoding]

# To keep special characters from causing trouble, replace every code point
# from 0x10000 upwards with the U+FFFD placeholder.
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
text = r.text.translate(non_bmp_map)

print(text)

第二步：解析页面

# Parse the page text with BeautifulSoup, using the "html.parser" parser.
soup = BeautifulSoup(text, "html.parser")
# Find every <a> tag in the parsed page and print the result.
link = soup.find_all("a")
print (link)

第三步：储存数据

# Append the parsed page to dst as UTF-8 text. The with-statement closes the
# file when the block ends, so no explicit close() call is required.
with open(dst, "a+", encoding='utf-8') as f:
    f.write(str(soup))

当我们保存文件的时候，一定要注明文件的编码：encoding='utf-8'

完整代码

import requests
from bs4 import BeautifulSoup

#!/usr/bin/python
# coding: utf-8

import requests
import sys
from bs4 import BeautifulSoup     #从bs4这个库中导入BeautifulSoup

def download(link, dst):
    """Fetch the page at *link* and append its parsed HTML to *dst*.

    link: URL of the site to download.
    dst: path of the file the page is saved to; it is opened in "a+" mode,
        so existing content is kept and the page is added after it.

    The encoding stored on the response is printed before the page is decoded.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    }
    r = requests.get(link, headers=headers)
    print(r.encoding)

    # Reported encoding -> encoding actually used to decode the body.
    # Any encoding not listed here is left exactly as it was reported.
    decode_as = {
        'gb2312': 'gb2312',     # pages such as Tencent
        'GBK': 'GBK',           # pages such as NetEase
        'ISO-8859-1': 'utf-8',  # pages such as Baidu and Sina
    }
    if r.encoding in decode_as:
        r.encoding = decode_as[r.encoding]

    # To keep special characters from causing trouble, replace every code
    # point from 0x10000 upwards with the U+FFFD placeholder.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    text = r.text.translate(non_bmp_map)

    soup = BeautifulSoup(text, "html.parser")

    # The with-statement closes the file on exit.
    with open(dst, "a+", encoding='utf-8') as f:
        f.write(str(soup))

# Download one page into 1.txt using download() defined above.
link = "http://www.eastmoney.com/"
dst = "1.txt"
download(link, dst)

改进版代码——加入编码表

由于requests库在猜测网页编码时有时会出错，因此可以建立一个编码表。

对于猜测不出编码的网页，可以先查询编码表中是否记录了该网站的编码，如果有，就按查到的编码对网页进行解码。

编码表的格式为：

http://news.hexun.com/	gb2312

以下的代码体现的就是这种思想：

#!/usr/bin/python
# coding: utf-8

import os
import sys
import time

import requests
from bs4 import BeautifulSoup     #从bs4这个库中导入BeautifulSoup

def load(code_file):
    """Load the site-encoding table stored in *code_file*.

    The table tells the downloader which encoding a site uses. Each useful
    line holds a URL followed by an encoding name, separated by whitespace,
    for example::

        http://news.hexun.com/    gb2312

    Blank lines and lines that lack an encoding field are skipped instead of
    raising IndexError. Fields after the second one are ignored, and when a
    URL is listed more than once the last entry wins.

    Returns a dict mapping each URL to its encoding name.
    """
    code = {}
    with open(code_file, 'r') as fp:
        for line in fp:
            info = line.split()
            # A usable entry needs both a URL and an encoding.
            if len(info) < 2:
                continue
            code[info[0]] = info[1]
    return code

def lookup(url, code_dict):
    """Return the encoding recorded for *url* in *code_dict*.

    When *url* has no entry, the string 'Not Exists.' is returned instead.
    """
    if url in code_dict:
        return code_dict[url]
    return 'Not Exists.'
        
     

def download(link, dst):
    """Fetch the page at *link* and write its parsed HTML to *dst*.

    link: URL of the site to download.
    dst: path of the file the page is saved to; it is opened in "w" mode,
        so any existing content is replaced.

    The URL and the encoding stored on the response are printed first. When
    that encoding is 'ISO-8859-1', the table in "code_file.txt" is consulted
    and, if it has an entry for *link*, that encoding is used instead.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    }
    r = requests.get(link, headers=headers)
    print(link)
    print(r.encoding)

    # 'gb2312' (pages such as Tencent) and 'GBK' (pages such as NetEase) are
    # used exactly as reported. 'ISO-8859-1' is treated as "the encoding
    # could not be guessed", so the encoding table is queried for this site.
    if r.encoding == 'ISO-8859-1':
        table = load("code_file.txt")
        site_code = lookup(link, table)
        if site_code != 'Not Exists.':
            r.encoding = site_code

    # To keep special characters from causing trouble, replace every code
    # point from 0x10000 upwards with the U+FFFD placeholder.
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
    text = r.text.translate(non_bmp_map)

    soup = BeautifulSoup(text, "html.parser")

    # The with-statement closes the file on exit.
    with open(dst, "w", encoding='utf-8') as f:
        f.write(str(soup))

# Download each site into src/<index>_<YYYYMMDD>.txt, numbering from 1.
links = [
    "http://news.hexun.com/",
    "http://finance.qq.com",
    "http://finance.jrj.com.cn/",
]
date = time.strftime('%Y%m%d', time.localtime())

# Build the output path with os.path.join instead of a hard-coded backslash so
# the files land inside the directory on every platform, and create the
# directory up front so the first write does not fail when it is missing.
out_dir = "src"
os.makedirs(out_dir, exist_ok=True)

for i, link in enumerate(links, start=1):
    dst = str(i) + "_" + date + ".txt"
    download(link, os.path.join(out_dir, dst))

#'''

参考文档：《Python网络爬虫：从入门到实践》第9章解决网页乱码

返回大数据分析