Python beautifulsoup4 Library: Parsing Alibaba Category URLs
Lab objective
Extract the URL of each product category on Alibaba's category overview page.
Lab steps
Find all links
import requests
from bs4 import BeautifulSoup

# Fetch Alibaba's category overview page
r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
#print(soup.head)

# Collect every <a> tag on the page
a = soup.find_all('a')

# Dump the raw tag list to a file for inspection
fw = open('sample.txt', "w", encoding="utf-8")
fw.write(str(a))
fw.close()
print(a)
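Dumping the whole tag list shows the raw material, but the stated goal is the URLs themselves. A minimal follow-up sketch, assuming nothing beyond the page fetch above (a.get() and urljoin are standard bs4/standard-library calls; the output filename is made up for illustration), that keeps only the href values and resolves relative links to absolute URLs:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = "https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2"
r = requests.get(base)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

urls = []
for a in soup.find_all('a'):
    href = a.get('href')                  # some <a> tags carry no href; .get() returns None then
    if href:
        urls.append(urljoin(base, href))  # resolve relative paths against the page URL

with open('sample_urls.txt', 'w', encoding='utf-8') as fw:
    fw.write('\n'.join(urls))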
Find all level-1 categories
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

# Each level-1 category name sits in an <h3 class="big-title">
h3_list = soup.find_all('h3', class_='big-title')
title = ""
for i in h3_list:
    title += i.get_text().strip() + "\n"

fw = open('sample.txt', "w", encoding="utf-8")
fw.write(title)
fw.close()
#print(h3_list)
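If the response comes back without any <h3 class="big-title"> elements, the page may simply be served differently to non-browser clients. A hedged variation using only standard requests features (the User-Agent value is an ordinary browser string, not anything Alibaba documents as required):

import requests
from bs4 import BeautifulSoup

# Present a browser-like User-Agent; many sites serve reduced markup to bare scripts
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
r = requests.get(
    "https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2",
    headers=headers,
    timeout=10,
)
r.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
print(len(soup.find_all('h3', class_='big-title')))  # sanity check: should be non-zero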
Find all level-2 categories
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

# Each level-2 category is an <h4 class="sub-title"> wrapping a single link
my_list = soup.find_all('h4', class_='sub-title')
title = ""
for i in my_list:
    #print(i)
    title += i.a.string.strip() + "\n"
print(title)

fw = open('sample2.txt', "w", encoding="utf-8")
fw.write(title)
fw.close()
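The same selection can also be written as a CSS selector; soup.select() is standard BeautifulSoup, and this sketch is equivalent to the find_all() call above under the same markup assumptions:

import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

# 'h4.sub-title a' matches the <a> inside every <h4 class="sub-title">
names = [a.get_text().strip() for a in soup.select('h4.sub-title a')]
print('\n'.join(names))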
Find all level-3 categories
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

# The level-3 links of each level-2 category live in <div class="sub-item-cont-wrapper">
my_list = soup.find_all('div', class_='sub-item-cont-wrapper')
title = ""
for i in my_list:
    #print(i)
    link = i.find_all('a')
    for k in link:
        # get_text() is safer than .string, which is None when a tag holds more than one child node
        title += k.get_text().strip() + "\n"
    title += "\n"  # blank line between level-2 groups
print(title)

fw = open('sample3.txt', "w", encoding="utf-8")
fw.write(title)
fw.close()
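The .string versus get_text() distinction is worth seeing in isolation; a self-contained demo with invented toy markup:

from bs4 import BeautifulSoup

demo = BeautifulSoup('<a>Plain</a><a>Hot <b>Sale</b></a>', 'html.parser')
links = demo.find_all('a')
print(links[0].string)      # 'Plain'    -- a single text child, so .string works
print(links[1].string)      # None       -- two children (text + <b>), so .string gives up
print(links[1].get_text())  # 'Hot Sale' -- get_text() concatenates all descendant text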
Find all level-1 and level-2 categories
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
#soup = soup.prettify()

h3_list = soup.find_all('h3', class_='big-title')
title = ""
for i in h3_list:
    title += "Level-1: " + i.get_text().strip() + "\n"
    print("Level-1: " + i.get_text().strip())
    #print(i.next_sibling.next_sibling.h4.a.get_text().strip())
    # next_sibling is the whitespace text node right after the <h3>;
    # next_sibling.next_sibling is the <div> holding this category's sub-tree
    for sibling in i.next_sibling.next_sibling:
        if sibling.name == 'div':
            print("  Level-2: " + sibling.h4.a.get_text().strip())
            title += "  Level-2: " + sibling.h4.a.get_text().strip() + "\n"

fw = open('sample4.txt', "w", encoding="utf-8")
fw.write(title)
fw.close()
#print(h3_list)
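The double next_sibling hop is the non-obvious part of this step; a tiny self-contained demo with invented markup that mimics the page's h3-plus-wrapper pattern:

from bs4 import BeautifulSoup

html = '<h3 class="big-title">Apparel</h3>\n<div><h4>Dresses</h4></div>'
s = BeautifulSoup(html, 'html.parser')
h3 = s.h3
print(repr(h3.next_sibling))              # '\n' -- the newline text node between the two tags
print(h3.next_sibling.next_sibling.name)  # 'div' -- the element the loop actually wants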
Find all level-1, level-2, and level-3 categories
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
#soup = soup.prettify()

h3_list = soup.find_all('h3', class_='big-title')
title = ""
for i in h3_list:
    title += "Level-1: " + i.get_text().strip() + "\n"
    print("Level-1: " + i.get_text().strip())
    # hop over the whitespace text node to reach the category's wrapper <div>
    for sibling in i.next_sibling.next_sibling:
        if sibling.name == 'div':
            print("  Level-2: " + sibling.h4.a.get_text().strip())
            title += "  Level-2: " + sibling.h4.a.get_text().strip() + "\n"
            # the first <div> under each level-2 block holds its level-3 links
            for a in sibling.div.find_all('a'):
                print("    Level-3: " + a.get_text().strip() + " " + a['href'])
                title += "    Level-3: " + a.get_text().strip() + " " + a['href'] + "\n"

fw = open('sample5.txt', "w", encoding="utf-8")
fw.write(title)
fw.close()
#print(h3_list)
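A text dump is fine for inspection, but the hierarchy is easier to reuse as structured data. A hedged final sketch that reuses the exact class names and traversal above together with the standard-library csv module; the output filename and column names are made up for illustration:

import csv
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

# Collect one (level-1, level-2, level-3, url) row per level-3 link
rows = []
for h3 in soup.find_all('h3', class_='big-title'):
    level1 = h3.get_text().strip()
    for sibling in h3.next_sibling.next_sibling:
        if sibling.name == 'div':
            level2 = sibling.h4.a.get_text().strip()
            for a in sibling.div.find_all('a'):
                rows.append([level1, level2, a.get_text().strip(), a['href']])

with open('categories.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['level1', 'level2', 'level3', 'url'])
    writer.writerows(rows)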