QQ空间相册爬虫

QQ空间相册爬虫

目标:

  1. 不声不响地进入别人空间(直接进入内存消耗巨大,速度慢)
  2. 获取可以获取的所有的照片
  3. 获取.gif格式的照片(未实现)
  4. 获取视频(未实现,但可获取视频封面照片)

空间相册分析:
 

QQ空间相册爬虫

  • 首先,不可操作的相册显然不在我们考虑的范围
  • 对于可操作的相册,我将其分为两个部分,因为在操作“你有权限进入的相册”时,我遇到了一些难题;与此同时,我也意识到了自己的薄弱之处(我极其讨厌模拟请求形式的爬虫技术,话多了……)
  • 由于可见相册的这两种情况难以统一处理,所以在处理图片时采用了比较low的方法(截图),并未使用图像下载的方法

流程图:

QQ空间相册爬虫

Json分析:

相册列表Json文件:

QQ空间相册爬虫

照片Json文件:

QQ空间相册爬虫

展开看看:

QQ空间相册爬虫

不好意思,让你看到我恶心的动态了

请求链接分析:

相册列表链接:

QQ空间相册爬虫

相册链接:

QQ空间相册爬虫

注*:

  1. 没用的关键字,意思就是你写什么都行(TX的工程师= =。。。)
  2. 相册链接有两种,我就只用一种了哈~

源码:

#encoding:utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re
import json
import importlib,sys
importlib.reload(sys)
import sys
import os

class Preprocessor:
    """Selenium-driven crawler that screenshots the photos of a Qzone user.

    The crawler logs in with an account/password, downloads the album list and
    the per-album photo lists as JSON files into the working directory, and
    then opens every photo URL in the browser and saves a screenshot of it.
    """

    def Analysis_Json(self):
        """Delegate to ``jsonInfor.AnalysisJson().Analysis_Json()``.

        NOTE(review): ``jsonInfor`` is not imported in the visible part of this
        file, so this call presumably raises ``NameError`` unless the module is
        brought into scope elsewhere -- confirm.
        """
        jsonInfor.AnalysisJson().Analysis_Json()

    def startSpider(self):
        """Log in, dump the album/photo JSON and screenshot every photo.

        Side effects (all relative to the current working directory):
          * ``photoList.json`` -- the album list of the target account.
          * ``shine<index>photos.json`` -- the photo list of album ``<index>``
            (only for albums that are not protected by a question).
          * ``./image/image<n>.png`` -- one screenshot per photo URL.

        Raises:
            RuntimeError: if ``g_qzonetoken`` cannot be found in the page
                source after the login form was submitted.
            KeyError: if the ``p_skey`` cookie is missing (see ``getGTK``).
        """
        user_qq = '******'        # fill in your own QQ number here
        user_password = '******'  # fill in your own QQ password here
        target_qq = '******'      # fill in the QQ number to crawl here

        # NOTE(review): recent Selenium 4 releases reportedly dropped the
        # ``executable_path`` keyword -- confirm against the installed version.
        driver = webdriver.Chrome(executable_path='chromedriver.exe')
        try:
            self._login(driver, user_qq, user_password)

            # --------------- obtain g_qzonetoken and g_tk
            page = driver.page_source
            token_match = re.search(
                r'window\.g_qzonetoken = \(function\(\)\{ try\{return (.*?);\} catch\(e\)',
                page,
            )
            if token_match is None:
                raise RuntimeError(
                    'g_qzonetoken not found in the page source; '
                    'the login probably did not succeed'
                )
            # The token is the first double-quoted string inside the match.
            g_qzonetoken = token_match.group(0).split('"')[1]

            cookie = {elem['name']: elem['value'] for elem in driver.get_cookies()}
            gtk = self.getGTK(cookie)
            print(g_qzonetoken)
            print(gtk)

            album_list_url = (
                'https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/'
                'fcgi-bin/fcg_list_album_v3?g_tk=' + str(gtk) + '&callback=shine0_Callback'
                '&t=111111111&hostUin=' + str(target_qq) + '&uin=' + str(user_qq) + '&appid=4&inCharset=utf-8'
                '&outCharset=utf-8&source=qzone&plat=qzone&format=jsonp&notice=0&filter=1'
                '&handset=4&pageNumModeSort=40&pageNumModeClass=15&needUserInfo=1&idcNum=4'
                '&callbackFun=shine0&_=111111111111'
            )
            album_json = self._fetch_jsonp(driver, album_list_url, 'photoList.json')
            albums = (album_json.get('data') or {}).get('albumListModeSort') or []

            # Phase 1: download the photo list of every accessible album.
            album_photo_json = []
            for index, album in enumerate(albums):
                # A non-empty ``question`` marks an album that asks for an
                # answer before it can be listed; those are skipped.
                if str(album.get('question', '')) != '':
                    continue
                total = str(album['total'])
                album_id = str(album['id'])

                photos_url = (
                    'https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/cgi_list_photo?'
                    'g_tk=' + str(gtk) + '&callback=shine0_Callback&t=111111111&mode=0&idcNum=4'
                    '&hostUin=' + str(target_qq) + '&topicId=' + str(album_id) + '&noTopic=0&uin=' + str(user_qq) + '&pageStart=0'
                    '&pageNum=' + str(total) + '&skipCmtCount=0&singleurl=1&batchId=&notice=0&appid=4&inCharset=utf-8'
                    '&outCharset=utf-8&source=qzone&plat=qzone&outstyle=json&format=jsonp&json_esc=1'
                    '&question=&answer=&callbackFun=shine0&_=1538206042361'
                )
                album_photo_json.append(
                    self._fetch_jsonp(
                        driver, photos_url, 'shine' + str(index) + 'photos' + '.json'
                    )
                )
                print(index)

            # Phase 2: open every photo and screenshot it.  Only the albums
            # actually downloaded above are visited (including the last one).
            os.makedirs('./image', exist_ok=True)
            pic_num = 0
            for photo_json in album_photo_json:
                photos = (photo_json.get('data') or {}).get('photoList') or []
                for photo in photos:
                    driver.get(photo['url'])
                    driver.get_screenshot_as_file('./image/image' + str(pic_num) + '.png')
                    pic_num += 1
        finally:
            # Always release the browser and the chromedriver process.
            driver.quit()

    @staticmethod
    def _login(driver, qq_number, password):
        """Open the Qzone login page and submit the account/password form."""
        driver.get('https://qzone.qq.com/')

        # The login form is located inside the ``login_frame`` frame.
        driver.switch_to.frame('login_frame')
        # Presumably switches the form to account/password login -- confirm.
        driver.find_element(By.ID, 'switcher_plogin').click()

        account_box = driver.find_element(By.ID, 'u')
        account_box.clear()
        account_box.send_keys(qq_number)
        password_box = driver.find_element(By.ID, 'p')
        password_box.clear()
        password_box.send_keys(password)

        driver.find_element(By.ID, 'login_button').click()
        # Give the post-login page a moment to load before reading it.
        time.sleep(2)

    @staticmethod
    def _extract_jsonp(page_source):
        """Return the JSON text wrapped in ``shine0_Callback(...)``.

        The page source is expected to hold the response inside a
        ``<pre ... pre-wrap;">`` element that ends with
        ``;</pre></body></html>``; the trailing ``)`` of the callback is
        dropped.

        NOTE(review): ``page_source`` may HTML-escape characters such as ``&``
        inside the ``<pre>`` text -- verify that photo URLs come out intact.

        Raises:
            IndexError: if one of the expected markers is missing.
        """
        body = page_source.split('pre-wrap;">')[1].split(';</pre></body></html>')[0]
        return body.split('shine0_Callback(')[1][:-1]

    def _fetch_jsonp(self, driver, url, path):
        """Load *url*, save its JSONP payload to *path* and return it parsed."""
        driver.get(url)
        payload = self._extract_jsonp(driver.page_source)
        with open(path, 'w', encoding='utf-8') as out_file:
            out_file.write(payload)
        # Drop a leading byte-order mark, which ``json.loads`` rejects.
        if payload.startswith('\ufeff'):
            payload = payload[1:]
        return json.loads(payload)

    def getGTK(self, cookie):
        """Compute the ``g_tk`` request parameter from the ``p_skey`` cookie.

        Starting from 5381, every character ``c`` of ``cookie['p_skey']``
        updates the hash with ``hash += (hash << 5) + ord(c)``; the result is
        masked to 31 bits.

        Args:
            cookie: mapping of cookie names to values; must contain ``p_skey``.

        Returns:
            An ``int`` in the range ``0 .. 0x7fffffff``.

        Raises:
            KeyError: if ``cookie`` has no ``p_skey`` entry.
        """
        hashes = 5381
        for letter in cookie['p_skey']:
            hashes += (hashes << 5) + ord(letter)
        return hashes & 0x7fffffff
     
if __name__ == '__main__':
    # Script entry point: run one full crawl, then report completion.
    Preprocessor().startSpider()
    print("OK")

效果:

QQ空间相册爬虫