|
本帖最后由 duquancai 于 2016-7-19 05:58 编辑
请你测试:
Sub WordVBA网抓()
Dim strText$, t$, t1$, t2$, t3$, RegMatch, i%
Application.ScreenUpdating = False
With CreateObject("msxml2.xmlhttp")
For i = 2 To 395
.Open "GET", "http://www.tcm100.com/user/hxsjshl/zzbook" & i & ".htm", False
.send
Do While .readyState <> 4
DoEvents
Loop
strText = strText + .responseText
Next
End With
With CreateObject("VBScript.Regexp")
.Global = True
.Pattern = "<td[\s\S]*?<div\s*class='title'[\s\S]*?>([\s\S]+?)</div>[\s\S]*?<div\s*class='title'>([\s\S]+?)</div>[\s\S]*?<div\s* class='content'>([\s\S]+?)</div>[\s\S]*?</td>"
For Each RegMatch In .Execute(strText)
t1 = RegMatch.SubMatches(0)
t2 = RegMatch.SubMatches(1)
t3 = RegMatch.SubMatches(2)
t = t & t1 & Chr(13) & t2 & Chr(13) & t3 & Chr(13)
Next
.Pattern = "(?:<a\s*href=[\s\S]+?>)|</a>": t = .Replace(t, "")
.Pattern = "(?!<br>)(?: )+": t = .Replace(t, " ")
.Pattern = "<br>\s+": t = .Replace(t, Chr(13))
End With
Application.ScreenUpdating = True
Documents.Add.Content.Text = t
End Sub
|
|