Scraping book information from JD.com (京东商城)
Search for "python" on the JD site.
Each result page contains 60 books, but only the first 30 are present when the page first loads; the remaining 30 are loaded by JavaScript once you scroll down to the bottom of the page.
Open Chrome DevTools (F12) and switch to the Console.
Experimenting in the Console: pick an element near the bottom of the page (such as the pagination bar) with a document.getElementsBy... call and scroll it into view; the number of book entries on the page then grows to 60.
Stepping through the result pages and watching the URL, the page parameter takes only odd values: page=1 for the first page, page=3 for the second, page=5 for the third, and so on, so the URL of every page can be inferred (a small sketch of the mapping follows).
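To make the pattern concrete, here is a minimal sketch of that URL generation (the base URL is a trimmed-down version of the search URL used by the spider below; capping at 20 pages is just this example's choice):

base_url = 'https://search.jd.com/Search?keyword=python&enc=utf-8'
# result page 1 -> page=1, result page 2 -> page=3, result page 3 -> page=5, ...
urls = ['%s&page=%s' % (base_url, 2 * i + 1) for i in range(20)]
print(urls[0])  # https://search.jd.com/Search?keyword=python&enc=utf-8&page=1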
Coding
First, start the Splash service:
docker run -p 8050:8050 scrapinghub/splash
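Before configuring the project, it helps to confirm that Splash is reachable. A minimal sanity check, assuming the requests library is installed (render.html is one of Splash's standard HTTP endpoints):

import requests

# Ask Splash to render the search page and return the resulting HTML.
resp = requests.get('http://localhost:8050/render.html',
                    params={'url': 'https://search.jd.com/Search?keyword=python&enc=utf-8',
                            'wait': 2})
print(resp.status_code)  # 200 means Splash rendered the page successfully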
settings.py
SPLASH_URL = "http://localhost:8050"

# Enable the two scrapy-splash downloader middlewares and adjust the order of HttpCompressionMiddleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Use the Splash-aware deduplication filter
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

# Needed to support cache_args (optional)
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
jd_book.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy_splash import SplashRequest
lua_script = '''
function main(splash)
    splash:go(splash.args.url)
    splash:wait(2)
    -- scroll the pager at the bottom into view so the remaining books are loaded
    splash:runjs("document.getElementsByClassName('page')[0].scrollIntoView(true)")
    splash:wait(2)
    return splash:html()
end
'''
class JdBookSpider(scrapy.Spider):
    name = 'jd_book'
    allowed_domains = ['search.jd.com']
    base_url = 'https://search.jd.com/Search?keyword=python&enc=utf-8&suggest=1.his.0.0&wq=&pvid=01cb0e938125479e99934b7e17c11f6a'

    def start_requests(self):
        # Fetch the first result page with a plain Request, only to build the page URLs.
        yield Request(self.base_url, callback=self.parse_urls, dont_filter=True)

    def parse_urls(self, response):
        # There are about 64,000 results in total; we do not crawl all of them here.
        # total = int(response.css('span#J_resCount::text').extract_first())
        # pageNum = total // 60 + (1 if total % 60 else 0)
        pageNum = 20
        for i in range(pageNum):
            # The page parameter takes odd values: 1, 3, 5, ...
            url = '%s&page=%s' % (self.base_url, 2 * i + 1)
            # Render each page with Splash so the JavaScript-loaded books appear.
            yield SplashRequest(url,
                                endpoint='execute',
                                args={'lua_source': lua_script},
                                cache_args=['lua_source'])

    def parse(self, response):
        # Each book is an <li class="gl-item"> inside <ul class="gl-warp clearfix">.
        for sel in response.css('ul.gl-warp.clearfix>li.gl-item'):
            yield {
                'name': sel.css('div.p-name').xpath('string(.//em)').extract_first(),
                'price': sel.css('div.p-price i::text').extract_first(),
            }
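Before launching the crawl, the Lua script can also be tried directly against Splash's execute endpoint to check that the scrolled page really exposes all 60 books. A minimal sketch, assuming requests is installed and the snippet is run next to jd_book.py so its lua_script string can be imported:

import requests
from jd_book import lua_script  # the Lua script defined above

url = 'https://search.jd.com/Search?keyword=python&enc=utf-8&page=1'
html = requests.get('http://localhost:8050/execute',
                    params={'lua_source': lua_script, 'url': url}).text
# Each book entry is an <li class="gl-item">, so expect a count of roughly 60.
print(html.count('gl-item'))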
Run
scrapy crawl jd_book -o books.csv