python爬虫 - 协程爬虫

🕷️ 简介

使用asyncio和aiohttp实现异步爬虫,效率最高。

🚀 协程爬虫示例

import aiohttp
import asyncio
import logging
from fake_useragent import UserAgent
from lxml import etree
import openpyxl
from datetime import datetime

# Logging configuration: timestamped INFO-level messages (used by Spider.parse
# to report how many rows each page yielded).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Randomized User-Agent provider backed by a local data file.
# NOTE(review): `verify_ssl` is deprecated/removed in recent fake_useragent
# releases — confirm the installed version still accepts it.
ua = UserAgent(verify_ssl=False, path='fake_useragent.json')

# Workbook that collects the scraped rows; the header lists product name,
# latest price, unit, number of quotes, and quote time (saved as data3.xlsx
# by the __main__ block).
wb = openpyxl.Workbook()
sheet = wb.active
sheet.append(['品名', '最新报价', '单位', '报价数', '报价时间'])

# Wall-clock start time, used to report total elapsed time at exit.
start = datetime.now()


class Spider(object):
    """Asynchronous crawler for zhongnongwang.com quote listing pages.

    Fetches listing pages concurrently with aiohttp, parses each table row
    with lxml XPath, and appends the extracted fields to the module-level
    ``sheet`` worksheet.
    """

    def __init__(self):
        # Request headers shared by all fetches; the User-Agent is randomized
        # once per Spider instance via fake_useragent.
        self.header = {
            "Accept-Encoding": "gzip",
            "User-Agent": ua.random
        }

    async def scrape(self, url):
        """GET *url* and return the decoded response body.

        Uses ``async with`` for both the session and the response so they
        are closed even when the request or body read raises — the original
        leaked the session on any exception because ``close()`` was only
        reached on the success path.
        """
        async with aiohttp.ClientSession(
            headers=self.header,
            connector=aiohttp.TCPConnector(ssl=False)
        ) as session:
            async with session.get(url) as response:
                return await response.text()

    async def scrape_index(self, page):
        """Download listing page number *page* and hand its HTML to parse()."""
        url = f'https://www.zhongnongwang.com/quote/product-htm-page-{page}.html'
        text = await self.scrape(url)
        await self.parse(text)

    async def parse(self, text):
        """Extract quote rows from page HTML *text* and append them to ``sheet``.

        Each centered table row yields: product name, latest price, unit,
        number of quotes, and quote time.
        """
        html = etree.HTML(text)
        items = html.xpath('/html/body/div[10]/table/tr[@align="center"]')
        logging.info(f'该页有多少条信息:{len(items)}')

        for item in items:
            name = ''.join(item.xpath('.//td[1]/a/text()'))
            price = ''.join(item.xpath('.//td[3]/text()'))
            unit = ''.join(item.xpath('.//td[4]/text()'))
            nums = ''.join(item.xpath('.//td[5]/text()'))
            time_ = ''.join(item.xpath('.//td[6]/text()'))
            sheet.append([name, price, unit, nums, time_])
            logging.info([name, price, unit, nums, time_])

    def main(self):
        """Crawl listing pages 1-50 concurrently and block until all finish."""
        async def _run():
            # Fan out one coroutine per page and wait for them all.
            await asyncio.gather(
                *(self.scrape_index(page) for page in range(1, 51))
            )

        # asyncio.run creates and closes the loop itself — the modern
        # replacement for the deprecated get_event_loop()/run_until_complete
        # pattern used originally.
        asyncio.run(_run())


if __name__ == '__main__':
    # Run the crawl, persist everything collected in the workbook, then
    # report total wall-clock time since module import.
    crawler = Spider()
    crawler.main()
    wb.save('data3.xlsx')
    elapsed_seconds = (datetime.now() - start).total_seconds()
    print("用时:{:.3f}s".format(elapsed_seconds))

📦 关键组件

组件 | 说明
aiohttp.ClientSession | 异步HTTP会话
asyncio.ensure_future | 创建协程任务
asyncio.gather | 并发执行协程
asyncio.get_event_loop | 获取事件循环

作者:spike

分类: Python

创作时间:2026-02-23

更新时间:2026-02-23