|
- # -*- coding: utf-8 -*-
- # version: Python 3.7.0
import re
import sys

import requests
from lxml import etree
# One shared session so cookies set by the first request are sent on later ones.
session = requests.Session()

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.99 Safari/537.36"
    ),
}
def get_html(url, post_data, timeout=10):
    """POST ``post_data`` to ``url`` and pass the response body to ``parse_html``.

    Args:
        url: Address of the page to request.
        post_data: Form fields sent as the POST body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the call can block indefinitely on a stalled server.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    with session.post(url=url, data=post_data, headers=headers,
                      timeout=timeout) as res:
        # Fail loudly on 4xx/5xx instead of parsing an error page as data.
        res.raise_for_status()
        parse_html(res.text)
def parse_html(text):
    """Print the cell texts of every row in the ``list-pub`` table body.

    Each ``<tr>`` under ``<tbody id="list-pub">`` is printed as a list holding
    one stripped string per ``<td>``: the cell's first direct text node, or an
    empty string when the cell has no direct text.

    Args:
        text: HTML source of a result page.
    """
    # Nothing to parse; skip instead of handing lxml an empty document.
    if not text or not text.strip():
        return

    # Parse the data here.
    tree = etree.HTML(text)
    if tree is None:
        return

    for tr in tree.xpath('//tbody[@id="list-pub"]/tr'):
        row = []
        for td in tr.xpath('./td'):
            texts = td.xpath('./text()')
            # A cell holding only child elements (or nothing at all) has no
            # direct text node; indexing [0] unconditionally would raise
            # IndexError and abort the whole page.
            row.append(texts[0].strip() if texts else '')
        print(row)
def main(timeout=10):
    """Fetch the index page, then request and print result pages.

    The index page supplies the anti-forgery token and the number of the last
    page; each result page is then requested through ``get_html``.

    Args:
        timeout: Seconds to wait for the index page before giving up.

    Raises:
        requests.RequestException: If the index request times out, fails, or
            returns an HTTP error status.
    """
    index_res = session.get('http://218.12.43.28:2018/pub/gongshi',
                            headers=headers, timeout=timeout)
    index_res.raise_for_status()
    htm_str = index_res.text

    token = re.search(r'(?s)<input name="__RequestVerificationToken".*?value="([^"]+)"', htm_str)
    pages = re.search(r"<a href='/pub/gongshi\?pageIndex=(\d+)'>>>", htm_str)
    if not (pages and token):
        # Report the failure instead of exiting silently with no output.
        print('Could not find the verification token or the page count '
              'on the index page.', file=sys.stderr)
        return

    # The total page count is read dynamically from the index page (4686 pages
    # when this was written); print(pages.group(1)) to inspect it.
    base_url = 'http://218.12.43.28:2018/pub/GongShiSearch'
    # The token is the same for every page, so build the payload once.
    post_data = {'__RequestVerificationToken': token.group(1)}
    for p in range(1, int(pages.group(1)) + 1):
        url = '{}?pageIndex={}'.format(base_url, p)
        get_html(url, post_data)
        # NOTE(review): the loop stops after the first page. Presumably left
        # in for a trial run; remove the break to crawl every page.
        break
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()
复制代码
|
-
评分
-
1
查看全部评分
-
|