喜马拉雅爬取
接口地址:https://www.ximalaya.com/revision/play/album?albumId=11219907&pageNum=1&sort=-1&pageSize=30 。这个 URL 找了好久,佛了……
点击“全部播放”按钮后,会出现右边箭头所指的 URL;它返回的是一个 JSON 格式的字符串,直接解析就可以了。
code:
import json
import os
import re

import requests
from bs4 import BeautifulSoup
# Download every track of Ximalaya album 11219907 as numbered .m4a files.
#
# The script requests the first three pages of the album's JSON track
# listing, then fetches each track's audio URL and stores it in save_dir.

# Browser-like User-Agent sent with every request below.
headers_1 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6824.400 QQBrowser/10.3.3127.400'
}

# Folder that receives the downloaded files. A raw string is used so the
# backslash can never be interpreted as the start of an escape sequence.
save_dir = r'E:\岳云鹏相声'
os.makedirs(save_dir, exist_ok=True)

# Seconds to wait on a stalled connection before giving up; without a
# timeout requests would block forever.
request_timeout = 30

# Listing pages 1..3, requested with a page size of 30.
urls = [
    'https://www.ximalaya.com/revision/play/album?albumId=11219907&pageNum='
    + str(i) + '&sort=-1&pageSize=30'
    for i in range(1, 4)
]

# Running track number across all pages; used as the file-name prefix.
x = 1
for u in urls:
    response = requests.get(u, headers=headers_1, timeout=request_timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    dic = json.loads(response.text)
    # dic['data']['tracksAudioPlay'] is iterated as a list of track dicts.
    for content in dic['data']['tracksAudioPlay']:
        print(x, content['trackName'], '正在下载', content['src'])
        if not content['src']:
            # requests.get cannot fetch a missing URL; skip this track
            # rather than crash, but keep the numbering in step.
            x += 1
            continue
        # Download before opening the file so that a failed request does
        # not leave an empty or truncated file behind.
        audio = requests.get(content['src'], headers=headers_1,
                             timeout=request_timeout)
        audio.raise_for_status()
        # Replace characters that Windows forbids in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', content['trackName'])
        target = os.path.join(save_dir, '%d%s.m4a' % (x, safe_name))
        with open(target, 'wb') as f:
            f.write(audio.content)
        x += 1