“Big Yellow”的版本间的差异
来自CloudWiki
(→知识点二) |
(→动态网页抓取) |
||
第121行: | 第121行: | ||
get_movies() | get_movies() | ||
===动态网页抓取=== | ===动态网页抓取=== | ||
+ | *1.实例 抓取淘宝中iPhone8手机的评论 | ||
+ | import requests | ||
+ | link ="""https://rate.tmall.com/list_detail_rate.htm?itemId=560745175443&spuId=878124235&sellerId=2616970884& | ||
+ | order=3&currentPage=1 | ||
+ | &append=0&content=1&tagId=&posi=&picture= | ||
+ | &ua=098%23E1hvDQvUvbpvUvCkvvvvvjiPPFSpAj18RL5h0jivPmPU1jibPLswsjnjPLFUzjt8RphvCvvvphvPvpvhvv2MMQhCvvO | ||
+ | v9hCvvvvEvpCWvwPX0B0XRfJ0Io3EAp0zWdUZEcqhQ8TZHdUfbzc6%2Bu64de%2BRfJoKHdoJwZ2WlE%2B7RqwiLO2v5fVQKoZHlR9t%2BFuTWDAvD46XdigDNdyCvm9v | ||
+ | vhCvvvvvvvvvpdIvvvHSvvCVB9vv9LvvvhXVvvmCjvvvByOvvUhwuphvmvvvpo8VUqbGkphvC9hvpyPO08wCvvpvvhHh&isg=BKamCFPcwTtefJd2rGHI0j-U9xwibXQEg_aanpBPkkmk | ||
+ | E0Yt-Bc6UYzBbw-fu-JZ&needFold=0&_ksTS=1527427656035_398&callback=jsonp399""" | ||
+ | headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/66.0.3359.181 Safari/537.36'} | ||
+ | r = requests.get(link,headers =headers) | ||
+ | print(r.text) |
2018年5月27日 (日) 13:45的最新版本
知识点一
if简用
- 用if语句写一个简易的密码输入小程序
def account_login():
    """Ask for a password and report whether the login succeeded.

    Prints 'Login success!' when the input equals '12345'; otherwise prints
    an error message and asks again.
    """
    # input() returns what the user typed as a string.
    password = input('Password:')
    # True only when the typed password is exactly '12345'.
    password_correct = password == '12345'
    # Test the bool itself: comparing it with the string '12345' is never
    # true, which would make a successful login impossible.
    if password_correct:
        print('Login success!')
    else:
        print('Wrong password or invalid input!')
        account_login()  # prompt again after a failed attempt


account_login()  # run the program
- 用if语句写一个简易的修改密码的小程序
# Password store: the first element is the reset code, the last element is
# the password currently in force (new passwords are appended).
password_list = ['*#*#', '123456']


def account_login():
    """Prompt for a password; log in, start a password change, or retry."""
    entered = input('password:')

    # The most recently stored password logs the user in.
    if entered == password_list[-1]:
        print('Login success!')
        return

    if entered == password_list[0]:
        # The reset code was typed: store a new password at the end of the list.
        password_list.append(input('Enter a new password:'))
        print('Your password has changed successfully!')
    else:
        print('Wrong password or invalid input!')

    # Both the reset path and the failure path ask for the password again.
    account_login()


account_login()
循环简用
- 小小循环
# Show every number from 1 to 10 next to the result of adding one to it.
# range(1, 11) stops before 11, so the values actually visited are 1..10.
for value in range(1, 11):
    successor = value + 1
    print(str(value) + '+1=', successor)
循环,if合用
- 让歌曲和歌手匹配
# Song titles that will be matched with their performers.
songslist = ['Holy Diver', 'Thunderstruck', 'Rebel Rebel']

for song in songslist:  # visit each title in turn
    if song == 'Holy Diver':
        print(song, ' - Dio')
    elif song == 'Thunderstruck':
        # The band is AC/DC (the name was previously misspelled).
        print(song, '-AC/DC')
    elif song == 'Rebel Rebel':
        print(song, '- David Bowie')
嵌套循环
- 简易的乘法表
# Print the 9 x 9 multiplication table, one product per line.
factors = range(1, 10)  # 1..9; the stop value 10 is excluded
for left in factors:
    for right in factors:
        product = left * right
        print('{} X {} = {}'.format(left, right, product))
知识点二
python爬虫
- 我的第一个爬虫
import requests
from bs4 import BeautifulSoup

# Page to fetch and the User-Agent header sent along with the request.
link = "http://www.santostang.com"
headers = {'User-Agent': 'Mozilla/5.0(Windows; U; Windows NT 6.1;en-US;rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'}

r = requests.get(link, headers=headers)
soup = BeautifulSoup(r.text, "lxml")  # parse the returned HTML with lxml

# Text of the link inside the first <h1 class="post-title"> element.
title = soup.find("h1", class_="post-title").a.text.strip()
print(title)

# Append the title to title.txt. The encoding is given explicitly so the
# result does not depend on the platform default, and the with-statement
# closes the file on exit, so no separate close() call is needed.
with open('title.txt', "a+", encoding="utf-8") as f:
    f.write(title)
静态网页抓取
- 1.获取响应内容
import requests

r = requests.get('http://www.santostang.com')

print("文本编码:", r.encoding)
print("响应状态码:", r.status_code)
print("字符串方式的响应体:", r.text)

# r.text        - the server's response body as a string, decoded according
#                 to the character encoding of the response.
# r.encoding    - the text encoding used for the response content.
# r.status_code - the HTTP status code: 200 means the request succeeded,
#                 4xx indicates a client error, 5xx a server error.
# r.content     - the response body as bytes; gzip and deflate transfer
#                 encodings are decoded automatically.
# r.json()      - the JSON decoder built into Requests.
- 2.传递URL参数
import requests

# Query parameters handed to requests via params=, which adds them to the URL.
key_dict = dict(key1='value1', key2='value2')
r = requests.get(url='http://httpbin.org/get', params=key_dict)

# r.url is the final URL that was requested, r.text the response body.
print("URL已经正确编码", r.url)
print("字符串方式的响应体:\n", r.text)
- 3.定制请求头
import requests

# Request headers sent instead of the library defaults. The User-Agent is a
# well-formed Chrome string: parentheses around the platform details and a
# numeric build number.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
    'Host': 'www.santostang.com',
}

r = requests.get('http://www.santostang.com/', headers=headers)
print("响应状态码:", r.status_code)
- 4.发送post请求
import requests

# Fields passed via data=, i.e. sent in the body of the POST request.
key_dict = dict(key1='values1', key2='values2')
r = requests.post(url='http://httpbin.org/post', data=key_dict)

print(r.text)  # show the response body
- 5.超时返回
import requests

link = "http://www.santostang.com/"

# timeout=20: if the server has not answered within 20 seconds, an
# exception is raised instead of waiting any longer.
request_options = {'timeout': 20}
r = requests.get(link, **request_options)
- 6.爬取豆瓣网top250
import requests
from bs4 import BeautifulSoup


def get_movies():
    """Collect the movie names listed on the Douban Top 250 pages.

    Requests ten result pages (offsets 0, 25, ..., 225), prints the HTTP
    status code of each page and, at the end, the collected names.

    Returns:
        list: the movie names in the order they were found.
    """
    headers = {
        # Well-formed Chrome User-Agent (the platform details must be closed
        # with a parenthesis before "AppleWebKit").
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        'Host': 'movie.douban.com',
    }
    dymc_list = []
    for i in range(0, 10):
        # "start" is the offset of the first entry shown on the page.
        link = 'https://movie.douban.com/top250?start=' + str(i * 25)
        r = requests.get(link, headers=headers, timeout=10)
        print(str(i + 1), "页响应码状态:", r.status_code)
        soup = BeautifulSoup(r.text, "lxml")
        # Every <div class="hd"> contributes the text of the first <span>
        # inside its first link, stripped of surrounding whitespace.
        dymc_list.extend(
            each.a.span.text.strip()
            for each in soup.find_all('div', class_='hd')
        )
    print(dymc_list)
    return dymc_list


get_movies()
动态网页抓取
- 1.实例 抓取淘宝中iPhone8手机的评论
import requests

# Review-list endpoint for one item; the callback parameter suggests the
# response is JSONP rather than plain JSON.
# The URL is assembled with implicit string concatenation so that it contains
# no line breaks or spaces, and the page parameter is spelled "&currentPage"
# (it had been corrupted by the HTML entity "&curren").
link = (
    "https://rate.tmall.com/list_detail_rate.htm?itemId=560745175443&spuId=878124235&sellerId=2616970884&"
    "order=3&currentPage=1"
    "&append=0&content=1&tagId=&posi=&picture="
    "&ua=098%23E1hvDQvUvbpvUvCkvvvvvjiPPFSpAj18RL5h0jivPmPU1jibPLswsjnjPLFUzjt8RphvCvvvphvPvpvhvv2MMQhCvvO"
    "v9hCvvvvEvpCWvwPX0B0XRfJ0Io3EAp0zWdUZEcqhQ8TZHdUfbzc6%2Bu64de%2BRfJoKHdoJwZ2WlE%2B7RqwiLO2v5fVQKoZHlR9t%2BFuTWDAvD46XdigDNdyCvm9v"
    "vhCvvvvvvvvvpdIvvvHSvvCVB9vv9LvvvhXVvvmCjvvvByOvvUhwuphvmvvvpo8VUqbGkphvC9hvpyPO08wCvvpvvhHh&isg=BKamCFPcwTtefJd2rGHI0j-U9xwibXQEg_aanpBPkkmk"
    "E0Yt-Bc6UYzBbw-fu-JZ&needFold=0&_ksTS=1527427656035_398&callback=jsonp399"
)

# Browser-like User-Agent sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}

r = requests.get(link, headers=headers)
print(r.text)  # raw response body