Python Web Scraping, Part 4: Selenium and JD Product Listings
Introduction
If an ordinary crawler works by imitating a browser, sending requests to the server and collecting the responses, then with Selenium we can go one step further and drive a real browser that fetches the data for us automatically. For details on using Selenium from Python, see the Selenium Documentation and the Chinese translation of Selenium with Python.
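A minimal sketch of what that looks like, separate from the crawler below and assuming chromedriver is installed and on your PATH:

from selenium import webdriver

driver = webdriver.Chrome()          # assumes chromedriver is on your PATH
driver.get('https://www.jd.com/')    # load the JD home page in a real browser
print(driver.title)                  # title of the page as the browser rendered it
driver.quit()                        # always close the browser when done

Everything the browser can see, including content produced by JavaScript, is available once the page has rendered, which is exactly what we rely on below.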
Steps
This time we take JD as the example and scrape its product list data. The steps are:
- Open the home page
- Search for a keyword and land on the first page of results
- Scroll down the page so the lazily loaded items are rendered
- Grab the page source and parse it
- Save the data
- Turn to the next page and repeat steps 3, 4 and 5
Following these steps we end up with one JSON record per product. Selenium does have one very obvious drawback: it is slow. But it handles dynamic pages with almost no effort, and used sensibly, Python plus Selenium can get a lot done.
Full code
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs
import time
import json


class JD:
    def __init__(self):
        self.url = 'https://www.jd.com/'
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')
        # Pass the options by keyword; the first positional argument of
        # webdriver.Chrome() is the driver path, not the options object.
        self.driver = webdriver.Chrome(options=self.chrome_options)
        # self.driver.maximize_window()

    def search(self, keyword):
        # Type the keyword into the search box and submit it.
        element = self.driver.find_element_by_id('key')
        element.send_keys(keyword)
        element.send_keys(Keys.RETURN)

    def scroll_down(self):
        # Scroll down in steps so the lazily loaded items get rendered.
        for i in range(1, 12):
            js = "var q=document.documentElement.scrollTop=" + str(500 * i)
            self.driver.execute_script(js)
            time.sleep(2)

    def next_page(self):
        # Return the "next page" button, or None on the last page.
        next_one = self.driver.find_elements_by_class_name('pn-next')
        next_one = next_one[0] if len(next_one) > 0 else None
        return next_one

    def parse_page(self):
        # Parse the current page source and yield one dict per product.
        page_source = self.driver.page_source
        soup = bs(page_source, 'lxml')
        contents = soup.select('#J_goodsList > ul.gl-warp > li.gl-item > div.gl-i-wrap')
        for content in contents:
            # The hrefs in the list are protocol-relative ("//item.jd.com/..."),
            # so only "https:" needs to be prepended.
            result = {
                'product_name': content.select_one('div.p-name > a > em').get_text(),
                'product_ad': content.select_one('div.p-name > a > i.promo-words').get_text(),
                'product_url': 'https:' + content.select_one('div.p-name > a')['href'],
                'comment_num': content.select_one('.p-commit strong a').get_text(),
                'tags': [i.get_text() for i in content.select('.p-icons i')],
                'store_name': content.select_one('div.p-shop > span.J_im_icon > a').get_text(),
                'store_url': 'https:' + content.select_one('div.p-shop > span.J_im_icon > a')['href'],
                # 'pic': 'https:' + content.select_one('div.p-img > a > img')['src']
            }
            yield result

    def on_save(self, content):
        # Append one product record to the output file as a line of JSON.
        if content:
            with open('E:/spiders/JD/jd.txt', 'a', encoding='utf-8') as f:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')

    def run(self, keyword):
        # 1. Open the home page
        self.driver.get(self.url)
        # 2. Search for the keyword and land on the first results page
        self.search(keyword)
        time.sleep(2)
        self.scroll_down()
        # 3. Grab the page source and parse it
        # 4. Save the data
        for result in self.parse_page():
            self.on_save(result)
            print(result['product_url'], 'saved')
        # 5. Turn to the next page and repeat steps 3, 4 and 5
        next_one = self.next_page()
        while next_one:
            next_one.click()
            time.sleep(2)
            self.scroll_down()
            for result in self.parse_page():
                self.on_save(result)
                print(result['product_url'], 'saved')
            next_one = self.next_page()
        self.driver.quit()


if __name__ == '__main__':
    jd = JD()
    jd.run('男装')
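As noted above, the obvious cost of this approach is speed, and the fixed time.sleep(2) calls contribute to it. One possible refinement, shown here only as a sketch and not part of the code above, is to use Selenium's explicit waits, which return as soon as the element you are waiting for appears instead of always sleeping for the full interval. For example, after submitting the search or clicking the next-page button, the crawler could wait for the product list container (#J_goodsList, the same container parse_page selects from):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_goods_list(driver, timeout=10):
    # Hypothetical helper, not in the original code: block until the
    # #J_goodsList container is present, or raise TimeoutException
    # after `timeout` seconds.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, 'J_goodsList'))
    )

Calling a helper like this in place of the time.sleep(2) after search() and next_one.click() would usually shorten the wait on each page without making the crawler less reliable.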