Python Crawler Example: Using Selenium to Scrape the First 10 Pages of Baidu News
Straight to the code. The script opens Baidu News, searches for the keyword '泰国' (Thailand), prints the title of every result on the current page, and then clicks through the pagination links up to page 10:
from selenium import webdriver                          # Selenium core
from selenium.webdriver.chrome.service import Service   # Selenium 4 wrapper for the driver path
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

# Launch Chrome. Selenium 4 passes the chromedriver path through a Service
# object; adjust the path to wherever chromedriver lives on your machine.
driver = webdriver.Chrome(service=Service("C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe"))
wait = WebDriverWait(driver, 10)

def search():
    driver.get('https://news.baidu.com')                   # open the Baidu News home page
    elem = driver.find_element(By.XPATH, '//*[@id="ww"]')  # locate the search input box
    elem.send_keys('泰国', Keys.ENTER)                      # type the keyword and press Enter

def find_result():
    # Parse the rendered page and print the title of every search result.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    titlelist = soup.find_all("h3", {'class': 'c-title'})
    for title in titlelist:
        print(title.text)

def next_page(page_number):
    time.sleep(10)                                           # crude wait for the page to finish loading
    elems = driver.find_elements(By.LINK_TEXT, page_number)  # pagination link labelled with the page number
    elems[0].click()

def main():
    search()
    find_result()
    for i in range(2, 11):  # pages 2 through 10, as the title promises
        next_page(str(i))
        find_result()

if __name__ == '__main__':
    main()
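How it works: search() opens news.baidu.com, finds the search box (id "ww"), and submits the keyword; find_result() feeds the rendered HTML to BeautifulSoup and prints every h3.c-title result title; next_page() clicks the numbered pagination link; main() ties them together across pages 1 to 10.

The fixed time.sleep(10) in next_page() is the weakest part: it is either needlessly slow or, on a bad connection, not slow enough. A sturdier variant uses the WebDriverWait/expected_conditions pair already created above to block only until the pagination link is actually clickable. This is a minimal sketch rather than the article's code; next_page_waited is a hypothetical name, and it assumes the driver and wait objects defined in the script:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

def next_page_waited(page_number):
    # Block (up to the 10-second timeout configured on `wait`) until the
    # pagination link labelled with the page number is clickable, then click it.
    link = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, page_number)))
    link.click()

Swapping next_page for next_page_waited in main() leaves the rest of the flow unchanged; wrapping the wait.until call in a try/except for selenium.common.exceptions.TimeoutException lets the crawler stop cleanly if a page link never appears.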