Python Web Scraping, Part 4: Selenium and JD.com Product Listings

Preface

If a scraper is a program that mimics a browser, sending requests to a server and pulling back data, then with Selenium we can go a step further and drive a real browser to collect the data for us. For using Selenium from Python, see the official Selenium Documentation or the Chinese translation, Selenium with Python中文翻译文档.
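
As a minimal sketch of what driving a browser looks like (assuming Selenium 4 and a chromedriver on your PATH), the following opens JD's home page headlessly and prints the page title:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')         # no visible browser window
driver = webdriver.Chrome(options=options)
driver.get('https://www.jd.com/')
print(driver.title)                        # title of the fully rendered page
driver.quit()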

Steps

This time we take JD.com as the example and scrape its product listing data. The steps are:

  1. Open the home page
  2. Search for a keyword to reach the first results page
  3. Scroll the page down (see the sketch after this list)
  4. Grab the page source and parse it
  5. Save the data
  6. Turn the page and repeat steps 3, 4 and 5
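
Step 3 matters because JD renders only the upper part of each results page up front and fills in the rest as you scroll. A minimal sketch of the scrolling trick, assuming a driver that is already on a results page (the 500px step and 2s pause are tuning knobs, not fixed values):

import time

# Scroll down in ~500px increments so the lazy-loaded items have time to render.
for i in range(1, 12):
    driver.execute_script(f"document.documentElement.scrollTop = {500 * i};")
    time.sleep(2)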

In the end we get a JSON record for every product in the listing. Selenium has one obvious drawback, it is slow, but it handles dynamic pages effortlessly; used sensibly, Python + Selenium can accomplish a great deal.
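
The fixed time.sleep(2) pauses in the code below are the main source of that slowness. A common alternative, shown here only as a hedged sketch (the selector '#J_goodsList .gl-item' is taken from the listing markup used later), is an explicit wait that returns as soon as the product nodes exist:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block until at least one product node is present, up to 10 seconds.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList .gl-item'))
)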

Full Code

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
import time
import json


class JD:
    def __init__(self):
        self.url = 'https://www.jd.com/'
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')  # run Chrome without a visible window
        # Selenium takes the options object via the `options` keyword argument.
        self.driver = webdriver.Chrome(options=self.chrome_options)
#        self.driver.maximize_window()
    
    def search(self, keyword):
        # Type the keyword into the search box (id="key") and press Enter.
        element = self.driver.find_element(By.ID, 'key')
        element.send_keys(keyword)
        element.send_keys(Keys.RETURN)
    
    def scroll_down(self):
        # JD lazy-loads the lower part of each results page, so scroll down
        # in steps and pause to give the new items time to render.
        for i in range(1, 12):
            js = "var q=document.documentElement.scrollTop=" + str(500 * i)
            self.driver.execute_script(js)
            time.sleep(2)
    
    def next_page(self):
        # Return the "next page" button if there is one, else None.
        buttons = self.driver.find_elements(By.CLASS_NAME, 'pn-next')
        return buttons[0] if buttons else None
    
    def parse_page(self):
        page_source = self.driver.page_source
        soup = bs(page_source, 'lxml')
        contents = soup.select('#J_goodsList > ul.gl-warp > li.gl-item > div.gl-i-wrap')
        for content in contents:
            # Not every item carries ad copy, so guard against a missing tag.
            ad = content.select_one('div.p-name > a > i.promo-words')
            result = {
                    'product_name': content.select_one('div.p-name > a > em').get_text(),
                    'product_ad': ad.get_text() if ad else '',
                    # hrefs are protocol-relative ("//item.jd.com/..."), so prepend "https:".
                    'product_url': 'https:' + content.select_one('div.p-name > a')['href'],
                    'comment_num': content.select_one('.p-commit strong a').get_text(),
                    'tags': [i.get_text() for i in content.select('.p-icons i')],
                    'store_name': content.select_one('div.p-shop > span.J_im_icon > a').get_text(),
                    'store_url': 'https:' + content.select_one('div.p-shop > span.J_im_icon > a')['href'],
#                    'pic': 'https:' + content.select_one('div.p-img > a > img')['src']
                    }
            yield result
    
    def on_save(self, content):
        # Append each record as one JSON object per line (JSON Lines).
        if content:
            with open('E:/spiders/JD/jd.txt', 'a', encoding='utf-8') as f:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')
    
    def run(self, keyword):
        # 1. Open the home page
        self.driver.get(self.url)
        # 2. Search for the keyword, landing on the first results page
        self.search(keyword)
        time.sleep(2)
        # 3. Scroll down so the lazy-loaded items render
        self.scroll_down()
        # 4./5. Parse the page source and save each record
        for result in self.parse_page():
            self.on_save(result)
            print(result['product_url'], 'saved')
        # 6. Turn the page and repeat steps 3, 4 and 5
        next_one = self.next_page()
        while next_one:
            next_one.click()
            time.sleep(2)
            self.scroll_down()
            for result in self.parse_page():
                self.on_save(result)
                print(result['product_url'], 'saved')
            next_one = self.next_page()
        self.driver.quit()

if __name__ == '__main__':
    jd = JD()
    jd.run('男装')  # search keyword: "menswear"
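
Since on_save writes one JSON object per line, the output file can be read back afterwards with a couple of lines:

import json

with open('E:/spiders/JD/jd.txt', encoding='utf-8') as f:
    records = [json.loads(line) for line in f]
print(len(records), 'records loaded')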