Python爬虫：Selenium定位页面元素

我们在实际使用浏览器的时候，很重要的操作有输入文本、点击确定等等。对此，Selenium提供了一系列的方法来方便我们实现以上操作。常说的8种定位页面元素的操作方式，我们一一演示一下！

我们以百度首页的搜索框节点为例，在搜索框中输入python并搜索。

搜索框的html结构如下（id为kw，name为wd，class为s_ipt），下面先通过id来定位：

<input id="kw" name="wd" class="s_ipt" value="" maxlength="255" autocomplete="off">


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import time
# By must be imported before it can be used as a locator strategy
from selenium.webdriver.common.by import By

# Raw string: the Windows path contains backslashes (\P, \G, \C, ...) that a
# normal string literal would try to parse as (invalid) escape sequences.
driver_path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
# Selenium 4 receives the chromedriver location through a Service object;
# passing the path positionally to Chrome() is deprecated.
browser = webdriver.Chrome(service=Service(driver_path))
browser.get(r'https://www.baidu.com')
time.sleep(2)

# Type "python" into the search box, located by its id attribute, and submit
search = browser.find_element(By.ID, 'kw')
search.send_keys('python')
search.send_keys(Keys.ENTER)

time.sleep(2)

# Closing the browser is left disabled here; call browser.quit() when done.

name定位

根据name属性定位（旧版写法为find_element_by_name()，Selenium 4中写作find_element(By.NAME, ...)），这里name属性是 wd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import time

# By must be imported before it can be used as a locator strategy
from selenium.webdriver.common.by import By

# Raw string: the Windows path contains backslashes (\P, \G, \C, ...) that a
# normal string literal would try to parse as (invalid) escape sequences.
driver_path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
# Selenium 4 receives the chromedriver location through a Service object;
# passing the path positionally to Chrome() is deprecated.
browser = webdriver.Chrome(service=Service(driver_path))
browser.get(r'https://www.baidu.com')
# A single pause, matching the other examples (the sleep was duplicated here).
time.sleep(2)

# Type "python" into the search box, located by its name attribute, and submit
search = browser.find_element(By.NAME, 'wd')
search.send_keys('python')
search.send_keys(Keys.ENTER)

time.sleep(2)

# Closing the browser is left disabled here; call browser.quit() when done.

class定位

根据class属性定位（旧版写法为find_element_by_class_name()，Selenium 4中写作find_element(By.CLASS_NAME, ...)），这里class属性是s_ipt

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import time
# By must be imported before it can be used as a locator strategy
from selenium.webdriver.common.by import By


# Raw string: the Windows path contains backslashes (\P, \G, \C, ...) that a
# normal string literal would try to parse as (invalid) escape sequences.
driver_path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
# Selenium 4 receives the chromedriver location through a Service object;
# passing the path positionally to Chrome() is deprecated.
browser = webdriver.Chrome(service=Service(driver_path))

browser.get(r'https://www.baidu.com')
time.sleep(2)

# Type "python" into the search box, located by its class attribute, and submit
search = browser.find_element(By.CLASS_NAME, 's_ipt')
search.send_keys('python')
search.send_keys(Keys.ENTER)

time.sleep(2)

# Closing the browser is left disabled here; call browser.quit() when done.

tag定位

我们知道HTML是通过tag来定义功能的，比如input是输入，table是表格等等。每个元素其实就是一个tag，一个tag往往用来定义一类功能，我们查看百度首页的html代码，可以看到有很多同类tag，所以其实很难通过tag去区分不同的元素。


from selenium import webdriver
import time
# By must be imported before it can be used as a locator strategy
from selenium.webdriver.common.by import By


browser = webdriver.Chrome()

try:
    browser.get(r'https://www.baidu.com')
    time.sleep(2)

    # find_element_by_tag_name() was removed in Selenium 4.3; use the generic
    # find_element(By.TAG_NAME, ...) form instead.
    # find_element returns only the FIRST matching <input>, which is not
    # necessarily the search box, so send_keys may raise here.
    browser.find_element(By.TAG_NAME, 'input').send_keys('python')
    time.sleep(2)
finally:
    # Close the browser even when the lookup or typing above raises
    browser.close()

由于页面上存在多个input，find_element只会返回第一个匹配的元素，而它往往并不是搜索框（例如可能是一个隐藏的input），因此以上代码很可能会报错。

link定位

这种方法顾名思义就是用来定位文本链接的，比如百度首页上方的分类模块链接。

find_element_by_link_text()

以网盘为例

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import time
# By must be imported before it can be used as a locator strategy
from selenium.webdriver.common.by import By

# Raw string: the Windows path contains backslashes (\P, \G, \C, ...) that a
# normal string literal would try to parse as (invalid) escape sequences.
driver_path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
# Selenium 4 receives the chromedriver location through a Service object;
# passing the path positionally to Chrome() is deprecated.
browser = webdriver.Chrome(service=Service(driver_path))


browser.get(r'https://www.baidu.com')
time.sleep(2)

# Click the link whose visible text is exactly '网盘'
browser.find_element(By.LINK_TEXT, '网盘').click()
time.sleep(2)

# quit() closes every browser window and ends the driver session
# (close() would only close the current window).
browser.quit()

partial定位

有时候一个超链接的文本很长，我们如果全部输入，既麻烦，又显得代码很不美观，这时候我们就可以只截取一部分字符串，用这种方法模糊匹配了。

find_element_by_partial_link_text()

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import time
# By must be imported before it can be used as a locator strategy
from selenium.webdriver.common.by import By


path = r'C:\Users\maxin\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe'
# Selenium 4 receives the chromedriver location through a Service object;
# passing the path positionally to Chrome() is deprecated.
browser = webdriver.Chrome(service=Service(path))

browser.get(r'https://www.baidu.com')
time.sleep(2)

# Click the first link whose text contains '闻' (intended target: the 新闻 link)
browser.find_element(By.PARTIAL_LINK_TEXT, '闻').click()
time.sleep(2)

# quit() closes every browser window and ends the driver session
# (close() would only close the current window).
browser.quit()

xpath定位

前面介绍的几种定位方法都是在理想状态下，有一定使用范围的，那就是：在当前页面中，每个元素都有一个唯一的id或name或class或超链接文本的属性，那么我们就可以通过这个唯一的属性值来定位他们。

但是在实际工作中并非有这么美好，那么这个时候我们就只能通过xpath或者css来定位了。

Chrome浏览器获取XPATH的方法：按F12打开开发者工具，在Elements面板中右键目标节点，选择 Copy → Copy XPath。

find_element_by_xpath()

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import time
# By must be imported before it can be used as a locator strategy
from selenium.webdriver.common.by import By

# Raw string: the Windows path contains backslashes (\P, \G, \C, ...) that a
# normal string literal would try to parse as (invalid) escape sequences.
driver_path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
# Selenium 4 receives the chromedriver location through a Service object;
# passing the path positionally to Chrome() is deprecated.
browser = webdriver.Chrome(service=Service(driver_path))

browser.get(r'https://www.baidu.com')
time.sleep(2)

# Type "python" into the search box, located by an XPath expression, and submit
search = browser.find_element(By.XPATH, '//*[@id="kw"]')
search.send_keys('python')
search.send_keys(Keys.ENTER)

time.sleep(2)

# Closing the browser is left disabled here; call browser.quit() when done.

css定位

这种方法相对xpath要简洁些，定位速度也要快些。

find_element_by_css_selector()

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import time
# By must be imported before it can be used as a locator strategy
from selenium.webdriver.common.by import By

path = r'C:\Users\maxin\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe'
# Selenium 4 receives the chromedriver location through a Service object;
# passing the path positionally to Chrome() is deprecated.
browser = webdriver.Chrome(service=Service(path))

browser.get(r'https://www.baidu.com')
time.sleep(2)

# Type "python" into the search box, located by a CSS selector, and submit
search = browser.find_element(By.CSS_SELECTOR, '#kw')
search.send_keys('python')
search.send_keys(Keys.ENTER)

time.sleep(2)

# Closing the browser is left disabled here; call browser.quit() when done.

find_element的By定位

除了上述的8种定位方法，Selenium还提供了一个通用的方法find_element()，这个方法有两个参数：定位方式和定位值。在Selenium 4中，find_element_by_*系列方法已被弃用（自4.3版本起被移除），推荐统一使用这种写法。

# 使用前先导入By类
from selenium.webdriver.common.by import By

前面8种定位方式用find_element()书写，汇总如下：

# The eight locator strategies, each written as a find_element(by, value) call
# with explicit keyword arguments.
browser.find_element(by=By.ID, value='kw')                   # id attribute
browser.find_element(by=By.NAME, value='wd')                 # name attribute
browser.find_element(by=By.CLASS_NAME, value='s_ipt')        # class attribute
browser.find_element(by=By.TAG_NAME, value='input')          # tag name
browser.find_element(by=By.LINK_TEXT, value='新闻')            # exact link text
browser.find_element(by=By.PARTIAL_LINK_TEXT, value='闻')     # part of the link text
browser.find_element(by=By.XPATH, value='//*[@id="kw"]')     # XPath expression
browser.find_element(by=By.CSS_SELECTOR, value='#kw')        # CSS selector

多个元素

如果定位的目标元素在网页中不止一个，那么则需要用到find_elements，得到的结果会是列表形式。简单来说，就是element后面多了复数标识s，其他操作一致。