|
[广告] Excel易用宝 - 提升Excel的操作效率 · Excel / WPS表格插件 ★ 免费下载 ★ ★ 使用帮助★
本帖最后由 氿啊 于 2021-8-22 14:57 编辑
刘老师大神您好,我遇到很严重的问题向您请教
举例我需要取得
https://search.douban.com/movie/ ... =tt0770442&cat=1002
这个页面的数据。
而根据传统方式,
Sub 测试1
Set HTML = CreateObject("htmlfile")
Set http = CreateObject("Msxml2.XMLHTTP")
http.Open "GET", "https://search.douban.com/movie/subject_search?search_text=tt0770442&cat=1002", False
http.send
tt = http.responseText ' 得到数据
Window.clipboardData.SetData "text", tt '写入剪贴板
End Sub
或
Sub 测试2()
URL="https://search.douban.com/movie/subject_search?search_text=tt0770442&cat=1002"
With CreateObject("Microsoft.XMLHTTP")
.Open "GET", URL, False '要抓取的链接,"GET"尽量用大写,以免某些系统不兼容
.send
tt = .responseText
With CreateObject("new:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}") 'DataObject对象,数据放入剪贴板,记事本观察数据
.SetText tt '因为XMLHTTP默认是UTF-8,不能识别gb2312,会发现数据乱码
.PutInClipboard '所以不能采用.responsetext对象来得到字符串
End With
End With
End Sub
两种方式均无法取得想要的数据,搜索结束产生数据位置显示为"正在搜索…"
例如我想要的数据,在tt = .responseText或tt = http.responseText 的tt中并不存在。我用360加载网页后,F12调试中能看到想要的数据,在Fiddle抓包也看不到相应数据,所以很是困惑。我要的数据如下图: 必须完整加载网页后才能显示。
但是通过VBA程序打开网页方式,并加载完成后,就可以取得想要的数据,再对数据进行处理提取。
代码很长,是那种嵌套引用方式,老实讲,这段代码并非我写,而是修改的他人的代码,照猫画虎勉强使用,但画出来的虎它毕竟不是虎,它每次都需要打开网页,加载后才能找到相应的数据,效率极其低下。老师能否提供个思路,在我这种情况下,在不打开网页的情况下收集网页加载后的完整数据,只要字符串或者说HTML完整文档有了,就可以用您教的方法取得相应数据。后面有相应附件
Public Const lngStartRow As Long = 2 '起始输入行
Dim n As Long
Dim objDic As Object
Dim strRef As String, objIE As Object
Dim IsOpen As Boolean
Dim objframe As Object
Sub 网页元素分析()
URL = "https://search.douban.com/movie/subject_search?search_text=tt0770442&cat=1002" '测试
Set objIE = FindWin(URL) '先查找该网页是否已打开
If objIE Is Nothing Then
Set objIE = CreateObject("internetexplorer.application")
With objIE
.Visible = False
.Navigate URL '打开网页
Do While .ReadyState <> 4 Or .Busy
DoEvents
Loop
End With
Else
IsOpen = True
End If
Application.ScreenUpdating = False
Set objDic = CreateObject("scripting.dictionary")
DoEvents
Call FindFrame(objIE.Document.Frames, ".Document.") '寻找每个frame的内容
DoEvents
Cells.WrapText = False '单元格取消自动换行
Application.ScreenUpdating = True
Set objDic = Nothing
Set objIE = Nothing
MsgBox "完毕!"
End Sub
Sub FindFrame(ByVal objframe As Object, ByVal CellName As String)
'递归查找frame
Dim i As Long
DoEvents
Call OutPutAllCell(objframe, CellName) '输出元素内容
For i = 0 To objframe.Length - 1
objDic.RemoveAll
Call FindFrame(objframe(i), CellName & "frames(" & i & ").Document.")
Next
End Sub
Sub OutPutAllCell(ByVal objframe As Object, ByVal CellName As String)
'输出元素属性
Dim subitem As Object
Dim strCode As String
Dim strID As String
Dim j As Integer
Dim 元素代码(), 长度(), 标识(), 名字(), 标识名(), type值(), 值(), href(), 内部数据()
On Error Resume Next
n = 0
For Each subitem In objframe.Document.all
n = n + 1
objDic(subitem.tagName) = objDic(subitem.tagName) + 1
strCode = "(" & subitem.tagName & ")" & "(" & objDic(subitem.tagName) - 1 & ")"
strID = subitem.ID
If strID = "" Then strID = subitem.Name
ThisWorkbook.Sheets("Sheet1").Cells(n + 2, 1).Value = strCode '将数据放入第一个表格检测
ThisWorkbook.Sheets("Sheet1").Cells(n + 2, 2).Value = subitem.all.Length
For j = 3 To ThisWorkbook.Sheets("Sheet1").Cells(2, Columns.Count).End(xlToLeft).Column
ThisWorkbook.Sheets("Sheet1").Cells(n + 2, j).Value = CallByName(subitem, ThisWorkbook.Sheets("Sheet1").Cells(2, j).Value, VbGet)
Next
Next
Set subitem = Nothing
End Sub
Function FindWin(ByVal strRef As String) As Object
'找寻已打开的网页
Dim objWin As Object
For Each objWin In CreateObject("Shell.Application").Windows
Do While objWin.ReadyState <> 4 Or objWin.Busy
DoEvents
Loop
If LCase(TypeName(objWin.Document)) = "htmldocument" Then
If objWin.LocationURL = strRef Then
Set FindWin = objWin
Exit For
End If
End If
Next
Set objWin = Nothing
End Function
|
|