抓取工具无法解析第一页的内容
问题描述:
我写了一些代码,用来从 yell.com 解析不同商店的名称、地址和电话号码。只要给我的抓取工具提供一个链接,它就会解析全部内容,无论结果分布在多少个页面上。但是,我发现的唯一问题是,它总是跳过第一页的内容:如果一共有 10 页,我的抓取工具只会抓取后面的 9 页。也许只需稍作调整就能解决这个问题。下面是完整的代码。提前致谢。
' Scrapes listing details from a yell.com search and writes them to worksheet cells.
' Fetches the search URL, collects the <a> links inside the "row pagination" block,
' then requests each of those links and parses the listings on the returned page.
' NOTE: listings are only parsed from pages reached through a pagination link; the
' page returned by the initial request is used solely to read the pagination links.
Sub YellUK()
Const mlink = "https://www.yell.com"
Dim http As New MSXML2.XMLHTTP60, html As New HTMLDocument, htm As New HTMLDocument
Dim post As HTMLHtmlElement, page As Object, newlink As String
' Initial request: the response is loaded into html to discover the pagination links.
With http
.Open "GET", "https://www.yell.com/ucs/UcsSearchAction.do?keywords=pizza&location=United+Kingdom&scrambleSeed=1426936001", False
.send
html.body.innerHTML = .responseText
End With
' All anchors of the first "row pagination" element.
Set page = html.getElementsByClassName("row pagination")(0).getElementsByTagName("a")
' Length - 2 leaves out the last anchor of the collection.
For i = 0 To page.Length - 2
' Strip the "about:" prefix from the href and prepend the site root
' (presumably the prefix appears because the document has no base URL - TODO confirm).
newlink = mlink & Replace(page(i).href, "about:", "")
With http
.Open "GET", newlink, False
.send
htm.body.innerHTML = .responseText
End With
' One worksheet row per "js-LocalBusiness" element; x counts listings across all pages.
For Each post In htm.getElementsByClassName("js-LocalBusiness")
x = x + 1
' Column 1: text of the first link inside the title row.
With post.getElementsByClassName("row businessCapsule--title")(0).getElementsByTagName("a")
If .Length Then Cells(x + 1, 1) = .Item(0).innerText
End With
' Columns 2-4: text of the 2nd, 3rd and 4th <span> inside the address block.
With post.getElementsByClassName("col-sm-10 col-md-11 col-lg-12 businessCapsule--address")(0).getElementsByTagName("span")
If .Length > 1 Then Cells(x + 1, 2) = .Item(1).innerText
End With
With post.getElementsByClassName("col-sm-10 col-md-11 col-lg-12 businessCapsule--address")(0).getElementsByTagName("span")
If .Length > 2 Then Cells(x + 1, 3) = .Item(2).innerText
End With
With post.getElementsByClassName("col-sm-10 col-md-11 col-lg-12 businessCapsule--address")(0).getElementsByTagName("span")
If .Length > 3 Then Cells(x + 1, 4) = .Item(3).innerText
End With
' Column 5: text of the second "businessCapsule--tel" element, when there is one.
With post.getElementsByClassName("businessCapsule--tel")
If .Length > 1 Then Cells(x + 1, 5) = .Item(1).innerText
End With
Next post
Next i
End Sub
下面是存放各分页页码及“下一页”链接的元素:
<!-- Pagination control: the currently selected page is a <span>; every other page is an <a>; the final <a> is the "Next" link. -->
<div class="row pagination">
<div class="col-sm-24">
<span class="pagination--page is-selected">1</span>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=2" data-tracking="DISPLAY:PAGINATION:NUMBER">2</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=3" data-tracking="DISPLAY:PAGINATION:NUMBER">3</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=4" data-tracking="DISPLAY:PAGINATION:NUMBER">4</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=5" data-tracking="DISPLAY:PAGINATION:NUMBER">5</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=6" data-tracking="DISPLAY:PAGINATION:NUMBER">6</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=7" data-tracking="DISPLAY:PAGINATION:NUMBER">7</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=8" data-tracking="DISPLAY:PAGINATION:NUMBER">8</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=9" data-tracking="DISPLAY:PAGINATION:NUMBER">9</a>
<a class="pagination--page" rel="nofollow" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=10" data-tracking="DISPLAY:PAGINATION:NUMBER">10</a>
<a rel="nofollow" class="pagination--next" href="/ucs/UcsSearchAction.do?location=United+Kingdom&keywords=pizza&scrambleSeed=721890588&pageNum=2" data-tracking="DISPLAY:PAGINATION:NEXT">Next</a>
</div>
</div>
答
问题在于第一页处于已选中状态,因此它在分页控件中没有对应的锚点(<a> 元素)。解决办法是先处理第一页,然后再通过分页链接处理其余页面。希望有所帮助(HTH)。
Option Explicit
' Scrapes business listings from a yell.com search and writes them to the worksheet.
'
' Loads the first results page and writes its listings via GetPageData, then follows
' every numbered pagination link and writes the listings of each remaining page.
' When the results fit on one page (no "row pagination" block in the response) only
' that page is processed instead of failing on a missing element.
' Uses early-bound MSXML2 and MSHTML types, so the project needs those references.
Sub YellUK()
    Const mlink = "https://www.yell.com"
    Dim http As New MSXML2.XMLHTTP60
    Dim html As New HTMLDocument, html2 As New HTMLDocument
    Dim pagers As Object, page As Object, newlink As String
    Dim i As Long, x As Long    ' x = number of listings written so far (shared with GetPageData)

    With http
        .Open "GET", "https://www.yell.com/ucs/UcsSearchAction.do?keywords=pizza&location=United+Kingdom&scrambleSeed=1426936001", False
        .send
        html.body.innerHTML = .responseText
    End With

    ' First page first: it is the selected page, so 'row pagination' has no 'a' for it
    ' and it must be parsed straight from the response already loaded.
    GetPageData x, html

    ' No pagination block means there are no further pages to fetch.
    Set pagers = html.getElementsByClassName("row pagination")
    If pagers.Length = 0 Then Exit Sub
    Set page = pagers.Item(0).getElementsByTagName("a")

    ' Remaining pages. Stop one short of the end: the last anchor is the "Next" link,
    ' which repeats one of the numbered links.
    For i = 0 To page.Length - 2
        ' hrefs read from this document carry an "about:" prefix; strip it and
        ' prepend the site root to get an absolute URL.
        newlink = mlink & Replace(page(i).href, "about:", "")
        With http
            .Open "GET", newlink, False
            .send
            html2.body.innerHTML = .responseText
        End With
        GetPageData x, html2
    Next i
End Sub
' Writes one worksheet row for every business listing found in a results page.
'
' Parameters:
'   x    - running count of listings written so far. Incremented once per listing and
'          passed ByRef so the caller's counter carries over from page to page.
'          Each listing is written to row x + 1.
'   html - the parsed results page to read "js-LocalBusiness" listings from.
'
' Columns written: 1 = text of the first link in the title row,
'                  2-4 = text of the 2nd, 3rd and 4th <span> of the address block,
'                  5 = text of the 2nd "businessCapsule--tel" element.
' A cell is left untouched when its source element is missing, so a single incomplete
' listing does not abort the whole run.
Private Sub GetPageData(ByRef x, ByRef html As HTMLDocument)
    Const ADDRESS_CLASS = "col-sm-10 col-md-11 col-lg-12 businessCapsule--address"
    Dim post As HTMLHtmlElement
    Dim holders As Object, spans As Object
    Dim col As Long

    For Each post In html.getElementsByClassName("js-LocalBusiness")
        x = x + 1

        ' Title: guard the holder lookup before dereferencing its first element.
        Set holders = post.getElementsByClassName("row businessCapsule--title")
        If holders.Length > 0 Then
            With holders.Item(0).getElementsByTagName("a")
                If .Length Then Cells(x + 1, 1) = .Item(0).innerText
            End With
        End If

        ' Address: fetch the span collection once; column N takes span index N - 1.
        Set holders = post.getElementsByClassName(ADDRESS_CLASS)
        If holders.Length > 0 Then
            Set spans = holders.Item(0).getElementsByTagName("span")
            For col = 2 To 4
                If spans.Length > col - 1 Then Cells(x + 1, col) = spans.Item(col - 1).innerText
            Next col
        End If

        ' Telephone: the second matching element is the one written out.
        With post.getElementsByClassName("businessCapsule--tel")
            If .Length > 1 Then Cells(x + 1, 5) = .Item(1).innerText
        End With
    Next post
End Sub
编辑:也许可以这样做:当 i = -1 时构造第一页的链接,
之后的迭代再依次处理后续各页。
' Single-loop variant: the extra iteration i = -1 builds a URL for the first page,
' every later iteration uses the pagination link at index i unchanged.
' Relies on http, htm, page, mlink and newlink being declared by the enclosing procedure.
For i = -1 To page.Length - 2
If i = -1 Then
' Start from the first pagination link and replace its final character with "1".
' NOTE(review): this only yields a valid first-page URL when that link ends in a
' single-digit page number - confirm against the actual href.
newlink = mlink & Replace(page(i + 1).href, "about:", "")
newlink = Left(newlink, Len(newlink) - 1) & "1"
Else
newlink = mlink & Replace(page(i).href, "about:", "")
End If
Debug.Print i & ", " & newlink ' Prints the links for all the pages
With http
.Open "GET", newlink, False
.send
htm.body.innerHTML = .responseText
End With
' Get page data here ...
Next i
谢谢您,先生,有您参与讨论总是令人非常高兴。您的解决方案确实抓取到了全部内容。难道不能在一个子过程中完成整件事吗? – SIM
不客气!这是可以做到的,例如先准备好所有 URL(包括第一页的),然后遍历这些 URL;否则就按我写的那样做:第一页是当前已加载的页面,它在分页控件中没有对应的 'a' 元素(这也合乎逻辑,因为分页控件只包含指向其他页面的链接)。 – dee
先生,能否请您再详细说明一下,我该如何在一个子过程中完成全部工作? – SIM