|
本帖最后由 引子玄 于 2012-9-3 09:03 编辑
gongnl 发表于 2012-9-2 21:07
liucqa 大师,经过一天一夜的努力,对原实例更新,去除 QueryTables.Add,完成网页的抓取,请赐教
看看你的正则:(看后觉得很烦琐,有不通用的Private,还有一大堆复杂的正则。尽管抓得很好,但还是不喜欢这种类型的抓取代码)
Option Explicit
Public aryDatainfo, n&, viewstate$, eventvalidation$, txtDay$, dicMapIndex As Object, t As Date
Sub main()
Dim objWinhttp As Object, URL$
Dim i&, j&, strAllItemName, aryIndex
Dim tn$
'基础数据:中文与拼音缩写一一对应。汉字为工作表名称(提前建好),拼音缩写为Post提交的参数
strAllItemName = "地基与基础工程,djyjc,建筑幕墙工程,jzmq,建筑智能化工程,jzznh,建筑装修装饰工程,jzzxzs,桥梁工程,ql,消防设施工程,xfss,城市及道路照明,csjdlzm,机电设备安装工程,jdaz"
aryIndex = Split(strAllItemName, ",")
Set dicMapIndex = CreateObject("Scripting.Dictionary")
For i = 0 To UBound(aryIndex) Step 2
dicMapIndex(aryIndex(i)) = aryIndex(i + 1) '建立工作表名称与post参数的字典索引
Next
Application.ScreenUpdating = False
Application.DisplayStatusBar = True
t = Timer
tn = Sheets("Main").Cells(11, 4)
'Debug.Print tn
'Set objWinhttp = CreateObject("WinHttp.WinHttpRequest.5.1")
Set objWinhttp = CreateObject("Microsoft.XMLHTTP")
URL = "http://www.gzgcjg.com/gzqypjtx/Estimate/ZY/MainQueryMarkZY.aspx?clearPaging=true"
Call getPostData(objWinhttp, URL) '取Post参数
Call getMultiPageData(objWinhttp, URL, tn) '取得网页数据,写入对应工作表(工作表请提前建好)
'Call getMultiPageData(objWinhttp, URL, "地基与基础工程")
Application.StatusBar = False
Application.ScreenUpdating = True
MsgBox "检索完成" & vbCrLf & vbCrLf & "共用时: " & vbCrLf & Format(Timer - t, "0.00秒"), vbOKOnly, "提示"
ErrHandle:
Set objWinhttp = Nothing
Set dicMapIndex = Nothing
End Sub
Private Sub getPostData(objWinhttp As Object, URL As String)
Dim tt$
'第一次调用取得post的几个参数
Application.StatusBar = "正在获取Post参数..."
With objWinhttp
' .Option(6) = 0
.Open "GET", URL, False
.setRequestHeader "Connection", "Keep-Alive"
.send
'COOKIE = Split(.getResponseHeader("Set-Cookie"), ";")(0)
tt = .responsetext
With CreateObject("new:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}") '调试用,数据放入剪贴板
.SetText tt
.PutInClipboard
End With
End With
If CheckServerError(objWinhttp.Status, tt) Then
Application.StatusBar = False
Application.ScreenUpdating = True
Set objWinhttp = Nothing
Set dicMapIndex = Nothing
End
End If
viewstate = Split(Split(tt, "__VIEWSTATE"" value=""", -1, vbTextCompare)(1), """", -1, vbTextCompare)(0) '取得VIEWSTATE的Post参数
eventvalidation = Split(Split(tt, "__EVENTVALIDATION"" value=""", -1, vbTextCompare)(1), """", -1, vbTextCompare)(0) '取得EVENTVALIDATION的Post参数
txtDay = Split(Split(tt, "ctl00$cph_content$txtDay"" type=""text"" value=""", -1, vbTextCompare)(1), """", -1, vbTextCompare)(0) '提交日期,默认当日
End Sub
Private Sub getMultiPageData(objWinhttp As Object, URL As String, strItemName As String)
Dim tt$, postdata$, drptitle$, drpZzdj$, PagingForwardTo$, ScriptManager2, eventtarget$, page%, Maxpage%, sht As Worksheet
Dim i As Integer, last_row As Integer
On Error Resume Next
Set sht = ThisWorkbook.Sheets(strItemName)
If Err > 0 Then
With ThisWorkbook
Set sht = ThisWorkbook.Worksheets.Add(after:=.Worksheets(.Worksheets.Count))
sht.Name = strItemName
End With
'MsgBox "未找到对应工作表:" & strItemName: Exit Sub
End If
sht.Cells.Clear
sht.Activate
sht.Range("a1:j1") = Split("排名 企业名称 市场行为 质量安全 建设单位 其他 总分 平均分 计算日期 评价类别", " ")
On Error GoTo 0
'ScriptManager2 = "ctl00$UpdatePanel1|ctl00$cph_content$drpZzdj" '切换专业资质的选择(首页),post参数
ScriptManager2 = "ctl00$UpdatePanel1|ctl00$cph_content$GridViewPaging1$btnForwardToPage"
eventtarget = "ctl00$cph_content$drpZzdj" '切换专业资质的选择(首页),post参数
drptitle = "ECB034F6E40C4E33A5F8C7587D112CB6" '这个参数在专业资质的页面是不变的,其他链接没有核对。
drpZzdj = dicMapIndex(strItemName)
PagingForwardTo = 1 'post提交的页码
i = 1
n = 1 '记录输出数据的数组下标
Maxpage = 1 '最大页码
last_row = 2
'***************打开选择的专业资质页面*******************
Application.StatusBar = "正在获取" & strItemName & "的首页..." & ",累计用时" & Format(Timer - t, "0.0秒"): DoEvents
tt = ""
With objWinhttp
' .Option(6) = 1
Do
postdata = "ctl00$ScriptManager2=" & ScriptManager2 & _
"&__EVENTTARGET=" & EncodePostdata(eventtarget) & _
"&__EVENTARGUMENT=" & _
"&__LASTFOCUS=" & _
"&__VIEWSTATE=" & EncodePostdata(viewstate) & _
"&__VIEWSTATEENCRYPTED=" & _
"&__EVENTVALIDATION=" & EncodePostdata(eventvalidation) & _
"&ctl00$WidthPixel=" & _
"&ctl00$HeightPixel=" & _
"&ctl00$cph_content$drpTitle=" & drptitle & _
"&ctl00$cph_content$drpZzdj=" & drpZzdj & _
"&ctl00$cph_content$txtEnterName=" & _
"&ctl00$cph_content$txtDay=" & txtDay & _
"&ctl00$cph_content$GridViewPaging1$txtGridViewPagingForwardTo=" & i & _
"&ctl00$cph_content$GridViewPaging1$btnForwardToPage=Go"
.Open "POST", URL, False
.setRequestHeader "Referer", "http://www.gzgcjg.com/gzqypjtx/Estimate/ZY/MainQueryMarkZY.aspx?clearPaging=true"
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.setRequestHeader "x-microsoftajax", "Delta=True"
.setRequestHeader "cache-Control", "no-cache"
.setRequestHeader "Accept-Encoding", "gzip,deflate"
.setRequestHeader "User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; AskTbPTV2/5.9.1.14019)"
.setRequestHeader "Content-Length", Len(postdata)
.setRequestHeader "Connection", "Keep-Alive"
.setRequestHeader "Pragma", "no-cache"
.send (postdata)
tt = .responsetext
'With CreateObject("new:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}") '调试用,数据放入剪贴板
' .SetText tt
' .PutInClipboard
'End With
' Application.StatusBar = "将源文件拷到本地..."
' Call makehtm(tt, ThisWorkbook.Path)
If i = 1 Then
viewstate = Split(Split(tt, "__VIEWSTATE"" value=""", -1, vbTextCompare)(1), """", -1, vbTextCompare)(0) '取得VIEWSTATE的Post参数
eventvalidation = Split(Split(tt, "__EVENTVALIDATION"" value=""", -1, vbTextCompare)(1), """", -1, vbTextCompare)(0) '取得EVENTVALIDATION的Post参数
Maxpage = Val(Split(Split(tt, "共<font color='red'>", -1, vbTextCompare)(2), "</font>页", -1, vbTextCompare)(0))
'Debug.Print " 页码:" & Maxpage
'MsgBox strItemName & " 共有 " & Maxpage & " 页数据"
End If
Application.StatusBar = strItemName & " 共有 " & Maxpage & " 页数据,正则提取第 " & i & " 页数据..."
' With sht.QueryTables.Add(Connection:= _
' "URL;file:///" & Replace(ThisWorkbook.Path, " ", "%20") & "/temp_source.html", _
' Destination:=sht.Range("A" & last_row + 1))
' .WebTables = """ctl00_cph_content_GridView1"""
' .Refresh BackgroundQuery:=False
' End With
Call inputsheets(tt, strItemName, last_row)
last_row = sht.[A65536].End(xlUp).Row + 1
i = i + 1
Loop Until i > Maxpage
End With
If CheckServerError(objWinhttp.Status, tt) Then
Application.StatusBar = False
Application.ScreenUpdating = True
Set objWinhttp = Nothing
Set dicMapIndex = Nothing
End
End If
viewstate = Split(Split(tt, "__VIEWSTATE"" value=""", -1, vbTextCompare)(1), """", -1, vbTextCompare)(0)
eventvalidation = Split(Split(tt, "__EVENTVALIDATION"" value=""", -1, vbTextCompare)(1), """", -1, vbTextCompare)(0)
Maxpage = Val(Split(Split(tt, "共<font color='red'>", -1, vbTextCompare)(2), "</font>页", -1, vbTextCompare)(0))
MsgBox Maxpage & "页提取完成"
End Sub
Private Function CheckServerError(Status, tt As String) As Boolean
If InStr(1, tt, "__VIEWSTATE"" value=""", vbTextCompare) = 0 Then
MsgBox "数据错误,网站返回的数据已放入剪贴板,请在记事本打开ctrl_v粘贴查看。" & vbCrLf & "错误码:" & Status
With CreateObject("new:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}") '调试用,数据放入剪贴板
.SetText tt
.PutInClipboard
End With
CheckServerError = True
Else
CheckServerError = False
End If
End Function
|
|