QQ空间相册爬虫
QQ空间相册爬虫
目标:
- 不声不响地进入别人的空间(直接进入内存消耗巨大,速度慢)
- 获取可以获取的所有的照片
- 获取.gif格式的照片(未实现)
- 获取视频(未实现,但可获取视频封面照片)
空间相册分析:
- 首先,不可操作的相册显然不在我们考虑的范围
- 对于可操作的相册,我将其分为两个部分。之所以这样划分,是因为我在处理“你有权限进入的相册”时遇到了一些难题;与此同时,我也意识到了自己的薄弱之处(我极其讨厌模拟请求形式的爬虫技术,话多了……)
- 由于可见相册的这两种情况难以统一控制,所以处理图片时采用了比较 low 的方法(用浏览器打开图片后截图),并未使用直接下载图片的方法
流程图:
Json分析:
相册列表Json文件:
照片Json文件:
展开看看:
不好意思,让你看到我恶心的动态了
请求链接分析:
相册列表链接:
相册链接:
注*:
- 链接中“没用的关键字”(例如源码里随便填写的 t 和 _ 参数),意思就是你写什么都行(TX的工程师= =。。。)
- 相册链接有两种,我就只用一种了哈~
源码:
#encoding:utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re
import json
import importlib,sys
importlib.reload(sys)
import sys
import os
class Preprocessor:
    """Selenium-driven Qzone album scraper.

    ``startSpider`` logs in through a real Chrome window, asks the photo CGI
    endpoints for the target user's album list and per-album photo lists
    (both are JSONP responses rendered by the browser), saves each response
    as a ``.json`` file in the working directory, and finally opens every
    photo URL in the browser and stores a screenshot of it under
    ``./image``.
    """

    # Pattern that locates the inline script defining ``window.g_qzonetoken``
    # in the logged-in page source.  Raw string: the original non-raw literal
    # relied on invalid escape sequences such as ``\.`` and ``\(``.
    _QZONETOKEN_RE = re.compile(
        r'window\.g_qzonetoken = \(function\(\)\{ try\{return (.*?);\} catch\(e\)')

    # Directory that receives the screenshots.
    _IMAGE_DIR = './image'

    def Analysis_Json(self):
        """Delegate to ``jsonInfor.AnalysisJson().Analysis_Json()``.

        NOTE(review): ``jsonInfor`` is not imported in the visible part of
        this file, so calling this method raises ``NameError`` unless the
        name is provided elsewhere -- confirm and add the import.
        """
        jsonInfor.AnalysisJson().Analysis_Json()

    def startSpider(self, user_qq='******', user_password='******',
                    target_qq='******'):
        """Log in, download album/photo metadata and screenshot every photo.

        Args:
            user_qq: QQ number used to log in.  The default is the same
                placeholder the script shipped with; replace it or pass a
                real value.
            user_password: Password for ``user_qq`` (placeholder default).
            target_qq: QQ number whose albums are scraped (placeholder
                default).

        Side effects:
            * writes ``photoList.json`` and one ``shine<index>photos.json``
              per fetched album into the current directory;
            * writes ``./image/image<N>.png`` for every photo;
            * prints the extracted ``g_qzonetoken``, the computed ``g_tk``
              and the index of each fetched album.

        Raises:
            RuntimeError: if ``g_qzonetoken`` cannot be found in the page
                after login (typically the login did not succeed).
            ValueError: if a CGI response is not in the expected JSONP
                wrapper.
            KeyError: if the ``p_skey`` cookie is missing.
        """
        # NOTE(review): ``executable_path`` is kept from the original call;
        # recent Selenium 4 releases expect a ``Service`` object instead --
        # confirm against the installed Selenium version.
        driver = webdriver.Chrome(executable_path='chromedriver.exe')
        try:
            self._login(driver, user_qq, user_password)

            # ---- obtain g_qzonetoken and g_tk
            g_qzonetoken = self._extract_qzonetoken(driver.page_source)
            cookie = {elem['name']: elem['value']
                      for elem in driver.get_cookies()}
            gtk = self.getGTK(cookie)
            print(g_qzonetoken)
            print(gtk)

            # ---- album list
            driver.get(self._album_list_url(gtk, target_qq, user_qq))
            album_json = self._save_jsonp(driver.page_source, 'photoList.json')
            # presumably null when the user has no albums -- treat as empty
            albums = album_json['data']['albumListModeSort'] or []

            # ---- photo list of every album that is not question-protected
            photo_lists = []
            for index, album in enumerate(albums):
                # An album carrying a non-empty ``question`` needs an answer
                # we do not have, so it is skipped.  Albums with no
                # ``question`` key or an empty one are fetched.
                if str(album.get('question', '')) != '':
                    continue
                driver.get(self._photo_list_url(
                    gtk, target_qq, user_qq, album['id'], album['total']))
                photo_lists.append(self._save_jsonp(
                    driver.page_source, 'shine' + str(index) + 'photos.json'))
                print(index)

            # ---- screenshot every photo of every fetched album
            # Only albums that were actually downloaded are visited; the
            # original ``range(photoShine-1)`` skipped the last album and
            # tried to open files of skipped albums.
            os.makedirs(self._IMAGE_DIR, exist_ok=True)
            pic_num = 0
            for photo_json in photo_lists:
                # presumably ``photoList`` is null/absent for empty albums
                for photo in photo_json['data'].get('photoList') or []:
                    driver.get(photo['url'])
                    driver.get_screenshot_as_file(
                        self._IMAGE_DIR + '/image' + str(pic_num) + '.png')
                    pic_num += 1
        finally:
            # Always release the browser process, even when scraping fails.
            driver.quit()

    def getGTK(self, cookie):
        """Compute the ``g_tk`` request token from the ``p_skey`` cookie.

        Starting from 5381, each character ``c`` updates the running value
        with ``h += (h << 5) + ord(c)``; the result is masked to 31 bits.

        Args:
            cookie: mapping of cookie name to value; must contain ``p_skey``.

        Returns:
            The token as a non-negative ``int`` not larger than 0x7fffffff.

        Raises:
            KeyError: if ``p_skey`` is not in ``cookie``.
        """
        hashes = 5381
        for letter in cookie['p_skey']:
            hashes += (hashes << 5) + ord(letter)
        return hashes & 0x7fffffff

    def _login(self, driver, user_qq, user_password):
        """Open the Qzone login page and submit the account/password form."""
        driver.get('https://qzone.qq.com/')
        driver.switch_to.frame('login_frame')
        # Switch from the default login panel to the account/password form.
        driver.find_element(By.ID, 'switcher_plogin').click()
        account_box = driver.find_element(By.ID, 'u')
        account_box.clear()
        account_box.send_keys(user_qq)
        password_box = driver.find_element(By.ID, 'p')
        password_box.clear()
        password_box.send_keys(user_password)
        driver.find_element(By.ID, 'login_button').click()
        # Fixed pause kept from the original so the post-login page can load.
        time.sleep(2)

    def _extract_qzonetoken(self, html):
        """Return the double-quoted ``g_qzonetoken`` value found in ``html``."""
        match = self._QZONETOKEN_RE.search(html)
        pieces = match.group(1).split('"') if match else []
        if len(pieces) < 2:
            raise RuntimeError(
                'g_qzonetoken not found in page source; login probably failed')
        return pieces[1]

    @staticmethod
    def _album_list_url(gtk, target_qq, user_qq):
        """Build the album-list CGI URL (``fcg_list_album_v3``)."""
        return ('https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/'
                'fcgi-bin/fcg_list_album_v3?g_tk=' + str(gtk)
                + '&callback=shine0_Callback'
                '&t=111111111&hostUin=' + str(target_qq)
                + '&uin=' + str(user_qq) + '&appid=4&inCharset=utf-8'
                # ``&notice`` had been mangled into an HTML entity here.
                '&outCharset=utf-8&source=qzone&plat=qzone&format=jsonp&notice=0&filter=1'
                '&handset=4&pageNumModeSort=40&pageNumModeClass=15&needUserInfo=1&idcNum=4'
                '&callbackFun=shine0&_=111111111111')

    @staticmethod
    def _photo_list_url(gtk, target_qq, user_qq, album_id, total):
        """Build the photo-list CGI URL (``cgi_list_photo``) for one album.

        ``total`` is sent as ``pageNum`` so a single request asks for every
        photo of the album.
        """
        return ('https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/cgi_list_photo?'
                'g_tk=' + str(gtk)
                + '&callback=shine0_Callback&t=111111111&mode=0&idcNum=4'
                '&hostUin=' + str(target_qq) + '&topicId=' + str(album_id)
                + '&noTopic=0&uin=' + str(user_qq) + '&pageStart=0'
                '&pageNum=' + str(total)
                # ``&notice`` had been mangled into an HTML entity here.
                + '&skipCmtCount=0&singleurl=1&batchId=&notice=0&appid=4&inCharset=utf-8'
                '&outCharset=utf-8&source=qzone&plat=qzone&outstyle=json&format=jsonp&json_esc=1'
                '&question=&answer=&callbackFun=shine0&_=1538206042361')

    @staticmethod
    def _extract_jsonp(html):
        """Strip the browser's ``<pre>`` wrapper and the JSONP callback.

        Returns the text between ``shine0_Callback(`` and its closing
        parenthesis.

        Raises:
            ValueError: if ``html`` does not contain the expected wrapper.
        """
        try:
            body = html.split('pre-wrap;">')[1]
            body = body.split(';</pre></body></html>')[0]
            # [:-1] drops the ")" that closes the callback invocation.
            return body.split('shine0_Callback(')[1][:-1]
        except IndexError:
            raise ValueError('page is not a shine0_Callback JSONP response')

    def _save_jsonp(self, html, path):
        """Write the JSON payload of ``html`` to ``path`` and return it parsed."""
        payload = self._extract_jsonp(html)
        with open(path, 'w+', encoding='utf-8') as handle:
            handle.write(payload)
        # A leading BOM would make json.loads fail; drop it before parsing.
        if payload.startswith(u'\ufeff'):
            payload = payload[1:]
        return json.loads(payload)
if __name__ == '__main__':
    # Script entry point: run one complete scrape, then report completion.
    Preprocessor().startSpider()
    print("OK")