python爬虫 - selenium

🕷️ 简介

Selenium是一个自动化测试工具,可以模拟浏览器操作,适合爬取动态网页。

🚀 基本示例

import csv
import os

from bs4 import BeautifulSoup
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Configure Chrome options
options = webdriver.ChromeOptions()
options.add_argument('--headless')           # headless mode
options.add_argument('--disable-gpu')        # disable GPU
options.add_argument('--no-sandbox')         # disable sandbox
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-extensions')
options.add_argument('--ignore-certificate-errors')
# NOTE: Selenium does not accept a requests-style headers dict — the original
# `headers = {...}` was dead code. Set the User-Agent via a Chrome flag instead.
options.add_argument(
    '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
)

# Create the Chrome webdriver (undetected-chromedriver evades Cloudflare checks)
driver = uc.Chrome(options=options)

# Target URL
url = 'https://www.coingecko.com/en'

try:
    # Send the request
    driver.get(url)

    # Explicit wait for the page to finish loading.
    # The original timeout of 3000 seconds (~50 minutes) was almost certainly
    # a typo for 30; also, the original bare `except:` quit the driver and then
    # fell through to `driver.page_source`, crashing on the dead session.
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.ID, 'unobtrusive-flash-messages'))
    )

    # Parse the rendered HTML
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Locate the cryptocurrency table; guard against layout changes instead of
    # raising an opaque AttributeError on `None.find_all`.
    table = soup.find('table', {'class': 'table-scrollable'})
    if table is None:
        raise RuntimeError('crypto table not found - page layout may have changed')

    # Data processing...
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        # extract data...
except TimeoutException:
    print('timed out waiting for the page to load')
finally:
    # Always release the browser, even when an error occurred above.
    driver.quit()

📦 常用Options

| 参数 | 说明 |
| --- | --- |
| --headless | 无头模式 |
| --disable-gpu | 禁用GPU |
| --no-sandbox | 禁用沙盒 |
| --disable-extensions | 禁用扩展 |
| --ignore-certificate-errors | 忽略证书错误 |

🔧 等待元素

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Explicit wait: block for up to 10 seconds until the element is present
# in the DOM, then hand it back.
wait = WebDriverWait(driver, 10)
element = wait.until(EC.presence_of_element_located((By.ID, 'my-id')))

作者:spike

分类: Python

创作时间:2026-02-23

更新时间:2026-02-23