|
本帖最后由 bluexuemei 于 2015-1-3 17:57 编辑
关于提取文档程序
1,有的价格不正确,不知道为什么,已经在附件标注黄色。(当时没有考虑到千分号,已修改)
2,有一个宝贝名称死活出不来,很奇怪。(附件倒数第二行)(中间那个黑点惹的祸,导致转码失败)
3,第4行的名称为粗体字,原因是?(是你不小心按了工具栏中的字体加粗)完整程序如下:- Sub getwebdata提取文本() '提取文本
- ' http://www.artonx.org/data/asr/Ruby-2.1.1.msi
- Dim url$, y$, info, bname$, ojs As Object
- Set ojs = CreateObject("scriptcontrol"): ojs.Language = "rubyscript"
- For Each cell In Selection
- url = cell.Value
- y = ojs.eval("require 'open-uri';$r='decTxtAucPrice,StartPrice,decTxtBuyPrice,EndTime,SellerID'.split(',');$s=open(%Q(" & url & "),&:read).to_s")
- info = ojs.eval("$r.map{|p|$s.scan(/#{p}"">([\s\S]*?)</)[0]}.flatten.map{|x|x.to_s.encode('gbk').gsub(/\s+/,'')}.map.with_index{|x,i|i<3?x[/[\d,]+/]:x}")
- bname = ojs.eval("$s.scan(/title>(.+?)(?=\-)/)[0][0].split(//).select{|x|x.ord!=12539}.join.encode('gbk')")
- cell(1, 2).Resize(1, 5) = info
- cell(1, -1) = bname
- Next
- Set ojs = Nothing
- 'Stop
- End Sub
- Sub getwebpicture提取图片() '提取图片
- 'http://www.artonx.org/data/asr/Ruby-2.1.1.msi
- Dim url$, bname$, img, cell As Range, ojs As Object
- Dim nUrl As String, localFilename As String
- Dim XmlHttp As Object, ayrHttpBody() As Byte
- Set ojs = CreateObject("scriptcontrol"): ojs.Language = "rubyscript": ojs.timeout = -1
- Set XmlHttp = CreateObject("Microsoft.XMLHTTP")
- On Error Resume Next
- For Each cell In Selection
- url = cell.Value
- y = ojs.eval("require 'open-uri';$s=open(%Q(" & url & "),&:read)")
- bname = ojs.eval("$s.scan(/title>(.+?)(?=\-)/)[0][0].split(//).select{|x|x.ord!=12539}.join.encode('gbk')")
- img = ojs.eval("$s.scan(/<img src\=""(.+?.jpg)/).flatten")
- Debug.Print img
- subfolder = Trim(Replace(bname, "/", "")) & Replace(Round(Rnd(), 3), ".", "")
- MkDir ThisWorkbook.Path & "" & subfolder
- For i = 0 To UBound(img)
- nUrl = img(i)
- localFilename = ThisWorkbook.Path & "" & subfolder & "" & i + 1 & ".jpg"
- XmlHttp.Open "GET", nUrl, True '异步下载
- XmlHttp.Send
- Do Until XmlHttp.ReadyState = 4
- DoEvents
- Loop
- If XmlHttp.Status = 200 Then
- ayrHttpBody() = XmlHttp.ResponseBody
- Open localFilename For Binary As #1
- Put #1, , ayrHttpBody()
- Close #1
- End If
- Next
- Next
- Set XmlHttp = Nothing
- Set ojs = Nothing
- 'Stop
- End Sub
- Sub addcomment添加图片批注() '添加图片批注
- ' http://www.artonx.org/data/asr/Ruby-2.1.1.msi
- Dim url$, bname$, img$, cell As Range, ojs As Object
- Set ojs = CreateObject("scriptcontrol"): ojs.Language = "rubyscript"
- On Error Resume Next
- For Each cell In Selection
- url = cell.Value
- y = ojs.eval("require 'open-uri';$s=open(%Q(" & url & "),&:read)")
- bname = ojs.eval("$s.scan(/title>(.+?)(?=\-)/)[0][0].encode('gbk')")
- img = ojs.eval("$s.scan(/<img src\=""(.+?.jpg)/)[0][0]")
- With cell(1, -1)
- .ClearComments
- .addcomment
- .Comment.Shape.Fill.UserPicture img
- .Comment.Shape.Height = 450
- .Comment.Shape.Width = 600
- End With
-
- Next
- Set ojs = Nothing
- 'Stop
- End Sub
复制代码 |
|