Crawling WeChat Official Accounts via the Sogou Interface
Here we use Sogou's WeChat search interface to crawl official-account articles.
The proxy pool is the one maintained earlier with Redis and Flask, filled with free proxies scraped from the web (see the earlier articles if you want the details).
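For reference, here is a minimal sketch of what that proxy-pool interface could look like. It assumes the checked proxies sit in a Redis set named 'proxies' as 'host:port' strings; the key name and the random pick are illustrative, not the exact implementation from the earlier article.

# Minimal Flask + Redis proxy-pool sketch (illustrative; the Redis set
# name 'proxies' is an assumption, not the earlier article's exact code)
from flask import Flask
import redis

app = Flask(__name__)
r = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)

@app.route('/get')
def get_proxy():
    # Return one random 'host:port' string, or an empty body if the pool is dry
    return r.srandmember('proxies') or ''

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)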
The main file is below; comments have been added wherever possible.
from urllib.parse import urlencode
import pymongo
import requests
from lxml.etree import XMLSyntaxError
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq
from config import *
# Connect to MongoDB
client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]
# Base URL for Sogou's WeChat search
base_url = 'http://weixin.sogou.com/weixin?'
# Request headers. The Cookie is required; without it only the first 10 pages of results are accessible.
headers = {
'Cookie': 'SUID=F6177C7B3220910A000000058E4D679; SUV=1491392122762346; ABTEST=1|1491392129|v1; SNUID=0DED8681FBFEB69230E6BF3DFB2F8D6B; [email protected]@@@@@@@@@; LSTMV=189%2C31; LCLKINT=1805; weixinIndexVisited=1; SUIR=0DED8681FBFEB69230E6BF3DFB2F8D6B; JSESSIONID=aaa-BcHIDk9xYdr4odFSv; PHPSESSID=afohijek3ju93ab6l0eqeph902; sct=21; IPLOC=CN; ppinf=5|1491580643|1492790243|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOER8Y3J0OjEwOjE0OTE1ODA2NDN8cmVmbmljazoyNzolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOER8dXNlcmlkOjQ0Om85dDJsdUJfZWVYOGRqSjRKN0xhNlBta0RJODRAd2VpeGluLnNvaHUuY29tfA; pprdig=j7ojfJRegMrYrl96LmzUhNq-RujAWyuXT_H3xZba8nNtaj7NKA5d0ORq-yoqedkBg4USxLzmbUMnIVsCUjFciRnHDPJ6TyNrurEdWT_LvHsQIKkygfLJH-U2MJvhwtHuW09enCEzcDAA_GdjwX6_-_fqTJuv9w9Gsw4rF9xfGf4; sgid=; ppmdig=1491580643000000d6ae8b0ebe76bbd1844c993d1ff47cea',
'Host': 'weixin.sogou.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
}
# No proxy is used initially
proxy = None
def get_proxy():
    # Fetch one proxy from the Flask-style proxy-pool interface
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            return response.text
        # Non-200 response: no proxy available
        return None
    except ConnectionError:
        return None
def get_html(url, count=1):
    print('Crawling', url)
    print('Trying Count', count)
    global proxy
    # Cap the number of retries per URL
    if count >= MAX_COUNT:
        print('Tried Too Many Counts')
        return None
    try:
        # If a proxy has been enabled and is available
        if proxy:
            # Proxies in the pool are stored without a scheme; add it here
            proxies = {
                'http': 'http://' + proxy
            }
            # allow_redirects=False is required: requests follows redirects
            # automatically, and we need to see the 302 ourselves
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        # Request succeeded: return the page
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # Need Proxy
            # A 302 means the current IP is blocked (or the proxy is
            # overused), so switch to a fresh proxy
            print('302')
            # Ask the pool for a new proxy
            proxy = get_proxy()
            if proxy:
                # Got a proxy; retry the page, carrying the retry count
                # forward so MAX_COUNT is still honored
                print('Using Proxy', proxy)
                return get_html(url, count + 1)
            else:
                print('Get Proxy Failed')
                # Pool is empty; give up
                return None
    except ConnectionError as e:
        print('Error Occurred', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)
def get_index(keyword, page):
    # Query-string part of the URL
    data = {
        'query': keyword,
        'type': 2,
        'page': page
    }
    # URL-encode the parameters
    queries = urlencode(data)
    url = base_url + queries
    html = get_html(url)
    return html
def parse_index(html):
    # Parse the index page with pyquery and yield each article link
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')
def get_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None
def parse_detail(html):
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        date = doc('#post-date').text()
        nickname = doc('#js_profile_qrcode > div > strong').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat
        }
    except XMLSyntaxError:
        return None
def save_to_mongo(data):
    # Deduplicate by title: upsert so existing articles are updated in place
    if db['fenjin'].update_one({'title': data['title']}, {'$set': data}, upsert=True):
        print('Saved to Mongo', data['title'])
    else:
        print('Saved to Mongo Failed', data['title'])
def main():
    # Pages of search results to crawl (11 through 14 here)
    for page in range(11, 15):
        # Fetch the index page for the configured keyword
        html = get_index(KEYWORD, page)
        # With the index page in hand, extract the article links
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                article_html = get_detail(article_url)
                if article_html:
                    # Parse the article content
                    article_data = parse_detail(article_html)
                    print(article_data)
                    # All done: save to the database
                    if article_data:
                        save_to_mongo(article_data)
if __name__ == '__main__':
    main()
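After a run, the saved articles can be spot-checked straight from MongoDB. A quick sketch, reusing the same connection settings and the 'fenjin' collection from save_to_mongo above:

# Spot-check the stored articles (same settings as the crawler above)
import pymongo
client = pymongo.MongoClient('localhost')
db = client['weixin']
for doc in db['fenjin'].find().limit(5):
    print(doc['title'], doc['date'])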
The configuration file, config.py, is as follows:
PROXY_POOL_URL = 'http://127.0.0.1:5000/get'  # proxy-pool endpoint
KEYWORD = '风景'  # search keyword
MONGO_URI = 'localhost'
MONGO_DB = 'weixin'
MAX_COUNT = 5  # max retries per URL
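Before starting the crawler it is worth confirming that the proxy pool actually answers. A one-off check, assuming the pool is already running on port 5000:

import requests
resp = requests.get('http://127.0.0.1:5000/get')
# Expect a bare 'host:port' string in the body, e.g. '121.31.x.x:8123'
print(resp.status_code, resp.text)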