🕷️ 简介
使用线程池并发抓取页面可以提高爬虫效率，适合 I/O 密集型任务。
🚀 多线程爬虫示例
import logging
import threading
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from datetime import datetime

import openpyxl
import requests
from fake_useragent import UserAgent
from lxml import etree
# Logging setup: INFO level, records formatted as "<time> - <level>: <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
# Source of random User-Agent strings; get_data reads ua.random for every request.
# NOTE(review): presumably path= names a local fake_useragent data file and
# verify_ssl=False relaxes certificate checks when that data is fetched —
# confirm against the installed fake_useragent version.
ua = UserAgent(verify_ssl=False, path='fake_useragent.json')
# Create the Excel workbook that collects the scraped rows.
wb = openpyxl.Workbook()
sheet = wb.active
# Header row (product name, latest quote, unit, number of quotes, quote time);
# get_data appends one row with the same five columns per scraped item.
sheet.append(['品名', '最新报价', '单位', '报价数', '报价时间'])
# Reference point for the elapsed-time report printed at the end of run().
start = datetime.now()
# Serializes writes to the shared worksheet. get_data runs on several pool
# threads at once, and Worksheet.append appears to be a read-modify-write on
# the sheet's row counter, so unsynchronized calls could overwrite rows.
_SHEET_LOCK = threading.Lock()


def get_data(page, timeout=10):
    """Scrape one quote-list page and append its rows to the shared sheet.

    Args:
        page: Page number substituted into the listing URL.
        timeout: Seconds to wait for the server before giving up on the
            page; without it a stalled connection would block a pool
            worker indefinitely.

    Returns:
        None. Rows are written to the module-level ``sheet``. A page that
        cannot be fetched or parsed is logged and skipped so that one bad
        page does not abort the whole crawl.
    """
    url = f'https://www.zhongnongwang.com/quote/product-htm-page-{page}.html'
    headers = {
        "Accept-Encoding": "gzip",
        "User-Agent": ua.random
    }
    try:
        rep = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        rep.raise_for_status()
    except requests.RequestException:
        logging.exception('Failed to fetch page %s (%s)', page, url)
        return

    html = etree.HTML(rep.text)
    if html is None:
        # The parser may yield None for an empty body; nothing to extract.
        logging.warning('Page %s returned no parsable HTML (%s)', page, url)
        return

    items = html.xpath('/html/body/div[10]/table/tr[@align="center"]')
    logging.info(f'该页有多少条信息:{len(items)}')

    rows = []
    for item in items:
        name = ''.join(item.xpath('.//td[1]/a/text()'))
        price = ''.join(item.xpath('.//td[3]/text()'))
        unit = ''.join(item.xpath('.//td[4]/text()'))
        nums = ''.join(item.xpath('.//td[5]/text()'))
        time_ = ''.join(item.xpath('.//td[6]/text()'))
        row = [name, price, unit, nums, time_]
        rows.append(row)
        logging.info(row)

    # Parse first, then write the whole page under the lock: the critical
    # section stays short and a page's rows end up contiguous in the sheet.
    with _SHEET_LOCK:
        for row in rows:
            sheet.append(row)
def run(*, first_page=1, last_page=50, max_workers=6, filename='data2.xlsx'):
    """Crawl a range of pages concurrently, then save the workbook.

    Args:
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).
        max_workers: Number of threads in the pool.
        filename: Path of the Excel file the workbook is saved to.

    The defaults reproduce the original behavior: pages 1-50, six threads,
    output written to ``data2.xlsx``. The elapsed time is measured from the
    module-level ``start`` timestamp and printed at the end.
    """
    # Thread pool: the network waits of different pages overlap.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pages_by_future = {
            executor.submit(get_data, page): page
            for page in range(first_page, last_page + 1)
        }
        wait(pages_by_future, return_when=ALL_COMPLETED)

    # An exception raised inside a worker is stored in its Future and is
    # never shown unless someone asks for it; report it so failed pages do
    # not disappear silently.
    for future, page in pages_by_future.items():
        error = future.exception()
        if error is not None:
            logging.error('Page %s failed: %r', page, error, exc_info=error)

    wb.save(filename=filename)
    delta = (datetime.now() - start).total_seconds()
    print(f'用时:{delta}s')
# Script entry point: starts the crawl as soon as this file is executed.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this file would also start the crawl — confirm that is intended.
run()
📦 关键组件
| 组件 | 说明 |
|---|---|
| `ThreadPoolExecutor` | 线程池执行器 |
| `executor.submit` | 向线程池提交任务 |
| `wait` | 等待任务完成 |
| `ALL_COMPLETED` | `wait` 的 `return_when` 取值：所有任务完成后才返回 |