Python 3.7 high-concurrency asynchronous crawler
Scrapes the draw data from 2015-11-03 through 2018-12-01 and writes it to Excel; the whole run finishes in under 30 seconds!
# -*- coding: utf-8 -*-
import asyncio, aiohttp
import async_timeout, time, re
import pandas as pd

# A semaphore could cap the coroutine count so the site isn't crawled too fast;
# see the sketch after the code. Request headers:
headers = {"User-Agent": "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6"}

async def get_html(url, sess):
    async with async_timeout.timeout(20):  # cap each request at 20 s
        async with sess.get(url, headers=headers) as res:
            if res.status == 200:
                text = await res.text(encoding='gb2312')
                return text

async def crawl_spider(url):
    async with aiohttp.ClientSession() as sess:
        text = await get_html(url, sess)
        parse_html(text)

def parse_html(text):
    global df
    if text:
        data_list = r.findall(str(text))  # each match: (expect, opencode, opentime)
        # DataFrame.append was removed in pandas 2.0; concat works everywhere
        df = pd.concat([df, pd.DataFrame(data_list)])

def crawl():
    date_list = pd.date_range('2015-11-03', '2018-12-01')
    date_list = [pd.Timestamp(x).strftime("%Y%m%d") for x in date_list.values]
    start_url = 'http://kaijiang.500.com/static/info/kaijiang/xml/jsk3/{}.xml?_A=UAHNMCOE{}'
    tasks = []  # start empty; requesting the unformatted start_url would just waste a request
    for date in date_list:
        rand_time = str(int(round(time.time() * 1000)))  # millisecond timestamp as a cache-buster
        url = start_url.format(date, rand_time)
        tasks.append(crawl_spider(url))
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()

if __name__ == '__main__':
    t = time.time()
    # each XML row looks like <row expect="..." opencode="..." opentime="..."/>
    r = re.compile(r'<row\s*expect="(\d+)"\s*opencode="([^"]+)"\s*opentime="([^"]+)"\s*/>')
    df = pd.DataFrame()
    crawl()
    with pd.ExcelWriter(r'output.xlsx') as writer:  # write the results to Excel; change the path as needed
        df.columns = ['期号', '号码', '时间']  # issue number, winning numbers, draw time
        df.to_excel(writer, sheet_name='Sheet1', index=False, header=True)
    print(time.time() - t)
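The comment in the code mentions throttling with a semaphore, but none is ever created, and the code also opens a fresh ClientSession for every URL. A minimal sketch of both ideas, assuming a cap of 20 concurrent requests (the cap, the main() wrapper, and url_list are illustrative, not from the original post); headers and parse_html are the ones defined above:

import asyncio, aiohttp

async def main(urls):
    sem = asyncio.Semaphore(20)                  # illustrative cap, not in the original code
    async with aiohttp.ClientSession() as sess:  # one shared session reuses TCP connections
        async def fetch(url):
            async with sem:                      # at most 20 requests in flight at once
                async with sess.get(url, headers=headers) as res:
                    if res.status == 200:
                        return await res.text(encoding='gb2312')
        pages = await asyncio.gather(*(fetch(u) for u in urls))
        for page in pages:
            parse_html(page)

# asyncio.run(main(url_list))  # url_list: the formatted URLs built in crawl()

With gather the pages come back in submission order, so parsing no longer depends on which response happens to arrive first.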