用许多办法可以实现,下面的代码,针对新浪读书网,过滤了前面、后面许多无用的文字:
Option Explicit Function bytes2BSTR(vIn) Dim strReturn, i, ThisCharCode, innerCode, Hight8, Low8, NextCharCode strReturn = "" For i = 1 To LenB(vIn) ThisCharCode = AscB(MidB(vIn, i, 1)) If ThisCharCode < &H80 Then strReturn = strReturn & Chr(ThisCharCode) Else NextCharCode = AscB(MidB(vIn, i + 1, 1)) strReturn = strReturn & Chr(CLng(ThisCharCode) * &H100 + CInt(NextCharCode)) i = i + 1 End If Next bytes2BSTR = strReturn End Function Function viewHtmlCode(url) Dim baoxmlhttp As Object Set baoxmlhttp = CreateObject("Msxml2.xmlhttp") With baoxmlhttp .Open "GET", url, False, "", "" .Send viewHtmlCode = bytes2BSTR(.ResponseBody) End With Set baoxmlhttp = Nothing End Function Function stripHTML(strHTML) Dim objRegExp As Object, strOutput$ Set objRegExp = CreateObject("vbscript.RegExp") objRegExp.IgnoreCase = True objRegExp.Global = True objRegExp.Pattern = "<(.|\n)+?>" strHTML = VBA.Replace(strHTML, "<p>", vbCr) strOutput = objRegExp.Replace(strHTML, "") strOutput = Replace(strOutput, "<", "<") strOutput = Replace(strOutput, ">", ">") strOutput = Replace(strOutput, vbCr & vbCr, vbCr) strOutput = Replace(strOutput, "nbsp;", "") stripHTML = strOutput Set objRegExp = Nothing End Function
Function copyHtmlText(url As String) As String Dim SourceStr$, pStart#, pEnd#, strLen# SourceStr = viewHtmlCode(url) '网页的代码 pStart = VBA.InStr(1, SourceStr, "<div id=article>", vbTextCompare) pEnd = VBA.InStr(1, SourceStr, "发表评论", vbTextCompare) strLen = VBA.Len(SourceStr) SourceStr = VBA.Mid$(SourceStr, pStart, pEnd - pStart) copyHtmlText = stripHTML(SourceStr) End Function
Sub getText() Dim url As String With Selection url = "http://book.sina.com.cn/nzt/his/maozedongjuece/1.shtml" .EndKey unit:=wdStory, Extend:=wdMove .Text = copyHtmlText(url) url = "http://book.sina.com.cn/nzt/his/maozedongjuece/2.shtml" .EndKey unit:=wdStory, Extend:=wdMove .Text = copyHtmlText(url) url = "http://book.sina.com.cn/nzt/his/maozedongjuece/3.shtml" .EndKey unit:=wdStory, Extend:=wdMove .Text = copyHtmlText(url) End With End Sub
|