Python beautifulsoup4 Library: Parsing Alibaba Category URLs


Objective

Extract the URLs of Alibaba's individual product categories.

Steps

Finding all links
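
First request the category page and list every <a> tag it contains; the raw dump shows what the page offers before the selectors are narrowed in the later steps.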

import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

#print(soup.head)
# Collect every <a> tag on the page and save the raw tag list for inspection.
a = soup.find_all('a')
fw = open('sample.txt', "w", encoding="utf-8")
fw.write(str(a))
fw.close()
print(a)
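
The dump above writes whole tags. When only the link targets matter, each tag's href attribute can be read with Tag.get, which returns None for tags that have no href; a minimal sketch:

# Pull just the link targets out of the tag list collected above.
for tag in a:
    href = tag.get('href')
    if href:
        print(href)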

Finding all level-1 categories
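
On this page every top-level category name sits in an <h3> element with class big-title, so filtering find_all by that class isolates the level-1 headings.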

import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

h3_list = soup.find_all('h3', class_='big-title')
title = ""
for i in h3_list:
    title += i.get_text().strip() + "\n"

fw = open('sample.txt', "w", encoding="utf-8")
fw.write(title)
fw.close()
#print(h3_list)
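
find_all with the class_ keyword matches tags by CSS class; the same list can also be fetched with a CSS selector through soup.select, an equivalent sketch:

# Equivalent query written as a CSS selector.
h3_list = soup.select('h3.big-title')
for h3 in h3_list:
    print(h3.get_text().strip())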

Finding all level-2 categories
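
Level-2 category names are links inside <h4> elements with class sub-title; the loop takes the text of each heading's <a> child.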

import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

my_list = soup.find_all('h4', class_='sub-title')
title = ""
for i in my_list:
    #print(i)
    # get_text() tolerates nested tags inside the link; .string would
    # return None there and .strip() would then raise AttributeError.
    title += i.a.get_text().strip() + "\n"

print(title)
fw = open('sample2.txt', "w", encoding="utf-8")
fw.write(title)
fw.close()
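
Note: Tag.string returns None as soon as a tag contains anything other than a single text node, which would make .strip() raise an AttributeError; get_text(), used above, concatenates all nested text and is the safer call for link labels.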

Finding all level-3 categories
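
Each group of level-3 links is wrapped in a <div> with class sub-item-cont-wrapper; every <a> inside such a wrapper is one level-3 category.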

import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")

my_list = soup.find_all('div', class_='sub-item-cont-wrapper')
title = ""
for i in my_list:
    #print(i)
    link = i.find_all('a')
    for k in link:
        # get_text() rather than .string, for the same reason as above
        title += k.get_text().strip() + "\n"

    title += "\n"

print(title)
fw = open('sample3.txt', "w", encoding="utf-8")
fw.write(title)
fw.close()
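
To capture the targets of the level-3 links as well, each tag's href can be read alongside its text; a sketch under the same page-structure assumption:

# Variant that records each level-3 link's target URL next to its name.
for i in my_list:
    for k in i.find_all('a'):
        name = k.get_text().strip()
        href = k.get('href') or ""   # some <a> tags may lack an href
        print(name, href)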

Finding all level-1 and level-2 categories
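
Here each level-1 heading and the container holding its level-2 entries are adjacent siblings in the markup. With html.parser the whitespace between two tags is itself a text node, so next_sibling is applied twice: once onto the text node, once more onto the container tag. Iterating that container then yields its children, and each <div> child is one level-2 block.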

import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
#soup = soup.prettify()

h3_list = soup.find_all('h3', class_='big-title')
title = ""
for i in h3_list:
    title += "Level-1 category: " + i.get_text().strip() + "\n"
    print("Level-1 category: " + i.get_text().strip())
    # Step over the whitespace text node to the container tag, then walk
    # its children; each <div> child is one level-2 block.
    for child in i.next_sibling.next_sibling:
        if child.name == 'div':
            print("    Level-2 category: " + child.h4.a.get_text().strip())
            title += "    Level-2 category: " + child.h4.a.get_text().strip() + "\n"

fw = open('sample4.txt', "w", encoding="utf-8")
fw.write(title)
fw.close()
#print(h3_list)
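
The doubled next_sibling step depends on exactly one whitespace text node sitting between the heading and its container; find_next_sibling() skips text nodes and lands on the next tag directly. A minimal sketch of the same walk using that call:

# More robust variant: find_next_sibling() returns the heading's next
# tag sibling, skipping any whitespace text nodes in between.
for h3 in soup.find_all('h3', class_='big-title'):
    container = h3.find_next_sibling()
    if container is None:
        continue
    print("Level-1 category: " + h3.get_text().strip())
    for sub in container.find_all('h4', class_='sub-title'):
        print("    Level-2 category: " + sub.get_text().strip())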

Finding all level-1, level-2, and level-3 categories
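
The final step repeats the sibling walk above and descends one level further: inside every level-2 block, the nested <div> holds the level-3 links, and both the link text and the href are recorded.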

import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2")
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, "html.parser")
#soup = soup.prettify()

h3_list = soup.find_all('h3', class_='big-title')
title = ""
for i in h3_list:
    title += "Level-1 category: " + i.get_text().strip() + "\n"
    print("Level-1 category: " + i.get_text().strip())
    for child in i.next_sibling.next_sibling:
        if child.name == 'div':
            print("    Level-2 category: " + child.h4.a.get_text().strip())
            title += "    Level-2 category: " + child.h4.a.get_text().strip() + "\n"
            # The nested <div> inside each level-2 block holds the level-3
            # links; record each link's text together with its href.
            for a in child.div.find_all('a'):
                print("        Level-3 category: " + a.get_text().strip() + " " + a['href'])
                title += "        Level-3 category: " + a.get_text().strip() + " " + a['href'] + "\n"

fw = open('sample5.txt', "w", encoding="utf-8")
fw.write(title)
fw.close()
#print(h3_list)
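
A practical caveat, not part of the original steps: large sites often serve reduced or empty markup to clients without a browser User-Agent, so if the selectors above come back empty, retrying the request with a browser-like header may help, for example:

# Retry with a browser-like User-Agent header (header value is illustrative).
headers = {'User-Agent': 'Mozilla/5.0'}
r = requests.get("https://www.alibaba.com/Products?spm=a2700.8293689.HomeLeftCategory.9.471167afqSKCB2",
                 headers=headers)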