Python Crawler Example: Scraping the First 10 Pages of Baidu News with Selenium


Straight to the code. The script opens Baidu News, searches for a keyword, prints the headline of every result on the page, then clicks through the pagination links and scrapes each following page:

from selenium import webdriver  # import the packages
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

import time
#from pyquery import PyQuery as pq
#import re
#from lxml import etree

driver = webdriver.Chrome("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe")  # launch Chrome (Selenium 3 style: chromedriver path as the first argument)
wait = WebDriverWait(driver, 10)  # explicit wait with a 10-second timeout

def search():
    driver.get('https://news.baidu.com')  # open the Baidu News home page

    elem = driver.find_element_by_xpath('//*[@id="ww"]')  # locate the search input box (id="ww")
    elem.send_keys('泰国', Keys.ENTER)  # type the search keyword ('Thailand') and simulate pressing Enter

    #s = etree.HTML(driver.page_source)
    #print(s.xpath('//*[@id="1"]/text()'))  # alternative: extract with lxml XPath


def find_result():
    soup = BeautifulSoup(driver.page_source, "html.parser")

    titlelist = soup.find_all("h3", {'class': 'c-title'})  # result headlines

    x = 0
    for title in titlelist:
        print(title.text)
        x += 1
    print('%d titles on this page' % x)  # report how many headlines were found

    #driver.close()

def next_page(page_number):
    time.sleep(10)  # crude fixed wait for the results page to finish loading
    elem = driver.find_elements_by_link_text(page_number)  # pagination links labeled with the page number
    for e in elem:
        print(e)
    if elem:  # guard against the link not being found, which would raise IndexError
        elem[0].click()

    #submit = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, '2')))
    #submit.click()
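
# A hypothetical wait-based variant (a sketch of the commented-out idea above):
# block until the pagination link is clickable instead of sleeping a fixed 10 seconds.
def next_page_by_wait(page_number):
    submit = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, page_number)))
    submit.click()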

def main():
    search()
    find_result()           # page 1
    for i in range(2, 11):  # pages 2 through 10, matching the title
        num = str(i)
        next_page(num)
        find_result()

if __name__ == '__main__':
    main()
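
The listing above uses the Selenium 3 API: the find_element_by_* and find_elements_by_* helpers were removed in Selenium 4 in favor of find_element(By..., ...), and the chromedriver path moved to a Service object (or is resolved automatically by Selenium Manager). Below is a minimal sketch of the same search-and-paginate steps under Selenium 4, assuming chromedriver is discoverable on your PATH; the id="ww" locator and the keyword are carried over from the code above:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()  # Selenium 4.6+ resolves chromedriver automatically
wait = WebDriverWait(driver, 10)

driver.get('https://news.baidu.com')
box = wait.until(EC.presence_of_element_located((By.ID, 'ww')))  # wait for the search box
box.send_keys('泰国', Keys.ENTER)                                 # search, as in search() above

link = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, '2')))  # explicit wait replaces time.sleep(10)
link.click()

The explicit waits make the fixed time.sleep(10) in next_page() unnecessary, and they fail fast with a TimeoutException when an element never appears.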



