🕷️ 简介
使用 asyncio 和 aiohttp 实现异步爬虫:多个请求在同一个事件循环中并发执行,网络等待期间不会阻塞其他请求,因此批量抓取页面时效率很高。
🚀 协程爬虫示例
import aiohttp
import asyncio
import logging
from fake_useragent import UserAgent
from lxml import etree
import openpyxl
from datetime import datetime
# Logging setup: every record is printed as "<time> - <level>: <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)
# Source of random User-Agent strings for the request headers.
# NOTE(review): the accepted keyword arguments depend on the installed
# fake_useragent version — confirm these are still supported.
ua = UserAgent(path='fake_useragent.json', verify_ssl=False)
# Workbook that collects the scraped rows; the first row is the column header.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.append([
    '品名',
    '最新报价',
    '单位',
    '报价数',
    '报价时间',
])
# Wall-clock start of the run.
start = datetime.now()
class Spider(object):
    """Asynchronous scraper for the zhongnongwang.com product quote listing.

    Each listing page is downloaded with aiohttp, parsed with lxml, and every
    quote row found is appended to the module-level ``sheet`` worksheet.
    """

    def __init__(self):
        # The User-Agent is drawn once from the module-level ``ua`` object and
        # then reused for every request made through this instance.
        self.header = {
            "Accept-Encoding": "gzip",
            "User-Agent": ua.random
        }

    async def scrape(self, url):
        """Download ``url`` and return the response body decoded as text.

        A fresh session is opened for each call. Both the session and the
        response are used as async context managers so they are released even
        when the request or the body decoding raises.
        """
        async with aiohttp.ClientSession(
            headers=self.header,
            connector=aiohttp.TCPConnector(ssl=False),
        ) as session:
            async with session.get(url) as response:
                return await response.text()

    async def scrape_index(self, page):
        """Download listing page number ``page`` and record its quote rows."""
        url = f'https://www.zhongnongwang.com/quote/product-htm-page-{page}.html'
        text = await self.scrape(url)
        await self.parse(text)

    async def parse(self, text):
        """Extract the quote rows from the page HTML ``text``.

        Every matching table row is appended to the module-level ``sheet`` as
        ``[name, price, unit, quote count, quote time]`` and logged.
        """
        html = etree.HTML(text)
        if html is None:
            # etree.HTML can yield None when the document is empty; without
            # this guard the xpath call below fails with AttributeError.
            logging.warning('页面内容为空,无法解析')
            return
        items = html.xpath('/html/body/div[10]/table/tr[@align="center"]')
        logging.info(f'该页有多少条信息:{len(items)}')
        for item in items:
            name = ''.join(item.xpath('.//td[1]/a/text()'))
            price = ''.join(item.xpath('.//td[3]/text()'))
            unit = ''.join(item.xpath('.//td[4]/text()'))
            nums = ''.join(item.xpath('.//td[5]/text()'))
            time_ = ''.join(item.xpath('.//td[6]/text()'))
            row = [name, price, unit, nums, time_]
            sheet.append(row)
            logging.info(row)

    async def _scrape_all(self):
        """Scrape listing pages 1 through 50 concurrently and wait for all."""
        await asyncio.gather(
            *(self.scrape_index(page) for page in range(1, 51))
        )

    def main(self):
        """Run the whole crawl to completion on a new event loop.

        ``asyncio.run`` creates the loop, runs the crawl and closes the loop
        afterwards, so no task is created before a loop is running.
        """
        asyncio.run(self._scrape_all())
if __name__ == '__main__':
    spider = Spider()
    try:
        spider.main()
    finally:
        # Write the workbook even when the crawl aborts with an exception, so
        # the rows collected before the failure are not lost.
        wb.save('data3.xlsx')
    # Report the elapsed wall-clock time since the module-level ``start``.
    delta = (datetime.now() - start).total_seconds()
    print(f"用时:{delta:.3f}s")
📦 关键组件
| 组件 | 说明 |
|---|---|
| aiohttp.ClientSession | 异步HTTP会话 |
| asyncio.ensure_future | 创建协程任务 |
| asyncio.gather | 并发执行协程 |
| asyncio.get_event_loop | 获取事件循环 |