# Ajax 加载爬取练习(1):微博爬取
# 第三天(2):Ajax 网页练习(仿照崔庆才的微博爬取例子),爬取了杨幂最近 300 条微博数据。
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import sys
# Translation table for str.translate(): every code point above the Basic
# Multilingual Plane is mapped to U+FFFD (the replacement character).
non_bmp_map = {
    code_point: 0xfffd
    for code_point in range(0x10000, sys.maxunicode + 1)
}

# Card-list endpoint of the mobile site; everything except the page number is
# fixed here.  The percent-encoded ``q`` value inside ``lfid`` decodes to the
# search term that the account (uid/value 1195242865) was found under.
base_url = (
    'https://m.weibo.cn/api/container/getIndex'
    '?uid=1195242865'
    '&luicode=10000011'
    '&lfid=100103type%3D1%26q%3D%E6%9D%A8%E5%B9%82'
    '&sudaref=m.weibo.cn'
    '&display=0'
    '&retcode=6102'
    '&type=uid'
    '&value=1195242865'
    '&containerid=1076031195242865'
)

# Request headers sent with every page fetch.
headers = {
    'Host': 'm.weibo.cn',
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
}

# Highest page number requested by the driver loop (pages 1..max_page).
max_page = 30
def get_page(page):
    """Fetch one page of the card list from the mobile JSON endpoint.

    Args:
        page: Page number; appended to ``base_url`` as ``&page=<page>``.

    Returns:
        A ``(payload, page)`` tuple, where ``payload`` is the decoded JSON
        body, when the server answers with status 200 and a JSON body.
        ``None`` in every other case (non-200 status, network error,
        timeout, or a body that is not valid JSON).
    """
    url = base_url + '&page=' + str(page)
    try:
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would freeze the whole crawl on a single page.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # RequestException covers ConnectionError as well as the Timeout
        # errors that the timeout above can now raise.
        print('Error', e.args)
        return None

    if response.status_code != 200:
        return None

    try:
        return response.json(), page
    except ValueError as e:
        # A 200 response whose body is not JSON (e.g. an HTML page) makes
        # .json() raise a ValueError subclass; treat it as a failed fetch.
        print('Error', e.args)
        return None
def parse_page(json, page: int):
    """Yield a summary dict for each post card in one API response.

    Args:
        json: Decoded response body (a dict), or a falsy value when the
            fetch failed, in which case nothing is yielded.
        page: Page number the payload belongs to; only used to skip the
            second card of page 1.

    Yields:
        Dicts with the keys ``id``, ``text``, ``attitudes``, ``comments``
        and ``reposts``.  ``text`` is the post HTML reduced to plain text
        with non-BMP characters replaced; it is ``''`` when the post has no
        text.  A separator line is printed before each yielded dict.
    """
    if not json:
        return

    # Either level may be missing or null in an error payload.
    cards = (json.get('data') or {}).get('cards') or []
    for index, card in enumerate(cards):
        # The second card on the first page is deliberately skipped
        # (presumably it is not a regular post -- verify against the API).
        if page == 1 and index == 1:
            continue

        mblog = card.get('mblog')
        if not mblog:
            # Cards without an 'mblog' entry carry no post data; passing
            # their missing text to pq() would raise.
            continue

        html = mblog.get('text')
        text = pq(html).text().translate(non_bmp_map) if html else ''

        weibo = {
            'id': mblog.get('id'),
            'text': text,
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }
        print('=' * 150)
        yield weibo
if __name__ == '__main__':
    # Crawl pages 1..max_page and print every parsed post.
    for page in range(1, max_page + 1):
        fetched = get_page(page)
        # get_page returns None when the request fails or the status is not
        # 200; unpacking None with * would raise TypeError, so skip the page.
        if not fetched:
            continue
        for result in parse_page(*fetched):
            print(result)
# 结果