python 爬虫 github个人信息页
首先,目标页面长这样
我们要爬出里面头像的url
但是想要进入这个页面需要先登录,获取cookie
所以先访问登录接口:https://github.com/session
这个接口需要的参数比较全:用户名、密码、authenticity_token、cookie 都要给
请求成功后,从返回的响应对象里获取cookie
再带上这个cookie去请求个人信息页
成功获得个人信息页之后用xpath把里面的头像url提取出来
结果
以下是完整代码:
开发工具pycharm
解释器版本:Python 3.6
需要装的包:requests, lxml
import requests
from lxml import etree
class Github_Spider(object):
    """Log in to GitHub, save the profile settings page, and extract the avatar URL.

    Intended call order: get_cookie() -> get_profile_page(cookies) ->
    get_profile_pic().
    """

    # Browser User-Agent sent with every request.
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
    # Local file the profile page is written to and later parsed from.
    PAGE_FILE = '爬虫--github登录.html'
    # Seconds to wait on the network; requests waits indefinitely without it.
    TIMEOUT = 10

    def get_profile_page(self, cookies):
        """Download the profile settings page and save it to PAGE_FILE.

        Args:
            cookies: Cookies sent with the request, e.g. the value returned
                by get_cookie().

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        url = 'https://github.com/settings/profile'
        headers = {"User-Agent": self.USER_AGENT}
        response = requests.get(
            url, headers=headers, cookies=cookies, timeout=self.TIMEOUT
        )
        # Do not save an error page and report success.
        response.raise_for_status()
        # Raw bytes are written unchanged; get_profile_pic() decodes them as UTF-8.
        with open(self.PAGE_FILE, 'wb') as f:
            f.write(response.content)
        print('保存成功')

    def get_cookie(self):
        """POST the login form and return the cookies collected during login.

        The identifier 自己找值哟 below is a placeholder that is not defined
        anywhere in the visible file: replace each occurrence with your own
        value (request Cookie header, authenticity_token, user name,
        password), otherwise this method raises NameError.

        Returns:
            The cookie jar of the session used for the login request. It
            also contains cookies set by intermediate redirect responses,
            which `response.cookies` of the final response would not.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
        """
        url = 'https://github.com/session'
        headers = {
            "User-Agent": self.USER_AGENT,
            "Cookie": 自己找值哟,
            "Referer": "https://github.com/login"
        }
        data = {
            "commit": "Sign in",
            "utf8": "✓",
            "authenticity_token": 自己找值哟,
            "login": 自己找值哟,
            "password": 自己找值哟,
        }
        # A Session accumulates every Set-Cookie seen while redirects are
        # followed, so nothing set on an intermediate response is lost.
        with requests.Session() as session:
            response = session.post(
                url, headers=headers, data=data, timeout=self.TIMEOUT
            )
            response.raise_for_status()
            # NOTE(review): a 2xx status alone does not prove the credentials
            # were accepted - confirm against the saved page if in doubt.
            print('登录成功')
            return session.cookies

    def get_profile_pic(self):
        """Parse the saved profile page, print the avatar URL(s) and return them.

        Returns:
            The result of the XPath query for the `src` attribute of every
            <img> whose class attribute is exactly "avatar rounded-2".
        """
        with open(self.PAGE_FILE, 'r', encoding='utf-8') as f:
            html = f.read()
        eroot = etree.HTML(html)
        img_url = eroot.xpath('//img[@class="avatar rounded-2"]/@src')
        print(img_url)
        return img_url
if __name__ == '__main__':
    # Run the whole flow: log in, save the profile page, print the avatar URL.
    spider = Github_Spider()
    spider.get_profile_page(spider.get_cookie())
    spider.get_profile_pic()