Python 爬虫 - 多线程爬虫

🕷️ 简介

使用线程池并发抓取页面以提高爬虫效率，适合 I/O 密集型任务。

🚀 多线程爬虫示例

import logging
import threading
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from datetime import datetime

import openpyxl
import requests
from fake_useragent import UserAgent
from lxml import etree

# Logging: INFO level, each record prefixed with its timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Source of randomized User-Agent strings; get_data reads `ua.random` once per request.
# NOTE(review): `verify_ssl` and `path` look like keyword arguments of older
# fake_useragent releases - confirm the installed version still accepts them.
ua = UserAgent(verify_ssl=False, path='fake_useragent.json')

# Workbook whose active sheet collects the scraped rows. The first row is the
# column header: product name, latest quote, unit, number of quotes, quote time.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.append(['品名', '最新报价', '单位', '报价数', '报价时间'])

# Wall-clock start, taken when the module is loaded; run() reports the elapsed
# seconds relative to this value.
start = datetime.now()


# `sheet` is shared by every worker thread, and openpyxl makes no thread-safety
# guarantee for Worksheet.append, so all writes to it go through this lock.
_SHEET_LOCK = threading.Lock()

# Relative XPath of each output column inside one table row, in sheet order:
# name, price, unit, number of quotes, quote time.
_CELL_XPATHS = (
    './/td[1]/a/text()',
    './/td[3]/text()',
    './/td[4]/text()',
    './/td[5]/text()',
    './/td[6]/text()',
)


def get_data(page, timeout=10):
    """Fetch one quote-listing page and append its rows to the shared sheet.

    Args:
        page: Page number substituted into the listing URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        None. Rows are written to the module-level ``sheet`` and logged.
        A page that cannot be fetched or parsed is logged and skipped so the
        remaining pages are still collected.
    """
    url = f'https://www.zhongnongwang.com/quote/product-htm-page-{page}.html'
    headers = {
        "Accept-Encoding": "gzip",
        "User-Agent": ua.random
    }
    try:
        rep = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx as failures instead of parsing an error page.
        rep.raise_for_status()
    except requests.RequestException:
        logging.exception('第%s页请求失败:%s', page, url)
        return

    html = etree.HTML(rep.text)
    # etree.HTML can return None when the body holds no parsable markup.
    if html is None:
        logging.warning('第%s页没有可解析的内容:%s', page, url)
        return

    items = html.xpath('/html/body/div[10]/table/tr[@align="center"]')
    logging.info(f'该页有多少条信息:{len(items)}')

    # Parse first, outside the lock, so the lock is held only for the writes.
    rows = [
        [''.join(item.xpath(cell_path)) for cell_path in _CELL_XPATHS]
        for item in items
    ]

    # Append the whole page under one lock acquisition: concurrent appends
    # cannot clobber each other and rows of one page stay contiguous.
    with _SHEET_LOCK:
        for row in rows:
            sheet.append(row)

    for row in rows:
        logging.info(row)


def run():
    """Scrape pages 1-50 with a six-thread pool and save the results.

    Submits one ``get_data`` task per page, waits for all of them, reports
    any task that ended with an exception, then writes the workbook to
    ``data2.xlsx`` and prints the elapsed time measured from ``start``.
    """
    with ThreadPoolExecutor(max_workers=6) as executor:
        future_tasks = [executor.submit(get_data, i) for i in range(1, 51)]
        wait(future_tasks, return_when=ALL_COMPLETED)

    # An exception raised inside a worker is stored on its future and stays
    # invisible unless asked for; report it so a failed page is not lost
    # silently. Tasks were submitted in page order, starting at page 1.
    for page, future in enumerate(future_tasks, start=1):
        error = future.exception()
        if error is not None:
            logging.error('第%s页抓取失败', page, exc_info=error)

    wb.save(filename='data2.xlsx')
    delta = (datetime.now() - start).total_seconds()
    print(f'用时:{delta}s')


# Start the crawl only when executed as a script, so that importing this
# module does not trigger network requests.
if __name__ == '__main__':
    run()

📦 关键组件

| 组件 | 说明 |
| --- | --- |
| ThreadPoolExecutor | 线程池执行器 |
| executor.submit | 向线程池提交任务，返回 Future 对象 |
| wait | 等待任务完成 |
| ALL_COMPLETED | wait 的 return_when 取值：所有任务完成后才返回 |

作者:spike

分类: Python

创作时间:2026-02-23

更新时间:2026-02-23