python爬虫(五):实战 【5. 使用正则爬亚马逊价格】

使用正则表达式直接在页面源码中匹配价格,写法更简单。

import requests

import re

# Search-results page to scrape.
url = 'https://www.amazon.cn/s/field-keywords=spark'

# Disguise the crawler as a regular browser.
head = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# NOTE: requests picks a proxy by URL scheme. Only an "http" entry is given
# here, so the https:// URL above is not routed through this proxy.
proxy_id = { "http": "http://61.135.155.82:443"}

# NOTE(review): these look like values captured from one browser session and
# have presumably expired -- replace them with your own before running.
cookie = {
    'session-id':'459-4568418-5692641',
    'ubid-acbcn':'459-5049899-3055220',
    'x-wl-uid':'1AK7YMFc9IzusayDn2fT6Topjz3iAOpR3EeA2UQSqco8fo5PbK2aCpyBA/fdPMfKFqZRHc4IeyuU=',
    'session-token':'"OH1wPvfOj6Tylq2nnJcdn5wyxycR/lqyGsGU3+lUtU4mbC0ZD9s8/4Oihd1BlskUQG8zRbLVs9vfWXuiJmnRlDT4x35ircp2uLxOLNYQ4j5pzdFJIqqoZUnhHSJUq2yK80P3LqH8An7faXRCPW9BIqX1wu0WmHlSS9vYAPKA/2SGdV9b//EljYjIVCBjOuR/dKRiYEeGK3li0RJOVz7+vMWg7Rnzbx89QxlbCp0WyquZyVxG6f2mNw=="',
    'csm-hit':'tb:0J5M3DH92ZKHNKA0QBAF+b-0J5M3DH92ZKHNKA0QBAF|1544276572483&adb:adblk_no',
    'session-id-time':'2082787201l',
}

# Price rule derived from the sample "¥59.50": the yen sign, one or more
# digits, a literal dot, then exactly two digits. A raw string keeps "\d" and
# "\." from being read as (invalid) Python string escapes.
PRICE_PATTERN = re.compile(r'¥\d+\.\d{2}')


def fetch_html(page_url, timeout=10):
    """Download *page_url* and return its body decoded as text.

    The request is sent with the module-level ``head``, ``proxy_id`` and
    ``cookie`` settings.

    Args:
        page_url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The response body as ``str``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(
        page_url,
        headers=head,
        proxies=proxy_id,
        cookies=cookie,
        timeout=timeout,
    )
    # Fail loudly on an error page instead of silently finding no prices.
    response.raise_for_status()
    # apparent_encoding is the encoding guessed from the response content.
    response.encoding = response.apparent_encoding
    return response.text


def find_prices(html):
    """Return every price string of the form ``¥<digits>.<2 digits>`` in *html*.

    Args:
        html: Page source to search.

    Returns:
        A list of the matched price strings in document order; empty when
        nothing matches.
    """
    return PRICE_PATTERN.findall(html)


if __name__ == '__main__':
    html = fetch_html(url)
    # Print the result: outside an interactive session a bare expression
    # would be evaluated and thrown away.
    print(find_prices(html))

 

结果:

python爬虫(五):实战 【5. 使用正则爬亚马逊价格】