|
本帖最后由 sheeboard 于 2021-1-28 11:53 编辑
可以参考下面用 selenium 和 BeautifulSoup 实现的示例代码：
# Search csres.com for every name listed in one column of test.xlsx and
# collect the rows of each search-result table into a single DataFrame
# named `result`.
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# Column of test.xlsx holding the names to search for; the same label is used
# to tag each scraped row with the name that produced it.
SOURCE_COLUMN = '已有的标名名称'
# Labels given to the cells of each scraped result row.
RESULT_COLUMNS = ['标准名称', '标准编号', '发布部门', '实施日期', '状态']

url = 'http://www.csres.com/'
df = pd.read_excel('test.xlsx')
# dropna() returns a new Series, so the column inside df is left untouched.
target = df[SOURCE_COLUMN].dropna()

driver = webdriver.Firefox(executable_path='./geckodriver')
# One DataFrame per searched name; concatenated once after the loop instead of
# growing a DataFrame on every iteration.
frames = []
try:
    for name in target:
        driver.get(url)
        fill = driver.find_element_by_id('keyword')
        fill.clear()
        fill.send_keys(name)
        driver.find_element_by_name('submit2').click()

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table', attrs={"class": "heng"})
        if table is None:
            # No result table on this page: nothing to record for this name.
            continue

        # The first <tr> is skipped (presumably the header row -- confirm
        # against the live page); slicing also tolerates a table with no rows.
        data = [
            [cell.text.strip() for cell in row.find_all('td')]
            for row in table.find_all('tr')[1:]
        ]
        tempdf = pd.DataFrame(data, columns=RESULT_COLUMNS)
        tempdf[SOURCE_COLUMN] = name
        frames.append(tempdf)
finally:
    # Always close the browser, even if a lookup fails part-way through.
    driver.quit()

# pd.concat raises on an empty list, so fall back to an empty frame.
result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
复制代码
|
评分
-
1
查看全部评分
-
|