python获取DeepMind发布的文章
以下提供了两种下载方法:一种是用 requests 库(配合 progressbar 显示进度)直接下载,另一种是通过 COM 接口调用迅雷下载。
#coding:utf-8
import os
import time

from lxml import etree
import progressbar
import requests

import win32com
from win32com.client import Dispatch
# Request headers: present a desktop Chrome user-agent so the site serves the
# normal listing HTML. The original concatenation was missing the space before
# 'Chrome', which produced the malformed UA "...Gecko)Chrome/67...".
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' +
                  'Chrome/67.0.3396.87 Safari/537.36'
}
# Check on the site how many listing pages exist first -- 19 pages at the time
# of writing; bump the upper bound of range() if the site grows.
urls = ['https://deepmind.com/research/publications/?page={}'.format(i)
        for i in range(1, 20)]
# Directory where PDFs are saved (also passed to Thunder as the save path).
# NOTE(review): no trailing separator -- download() originally concatenated
# path+title_name directly; verify the resulting file locations.
path = 'I:\\DeepMind'
# COM handle to the Thunder (Xunlei) download manager used to queue tasks.
o = Dispatch("ThunderAgent.Agent64.1")
def download(title_name, download_src):
    """Fetch *download_src* and save it as <path>/<title_name>.pdf,
    showing a console progress bar while the body streams in.

    Args:
        title_name: file name (without extension) to save under ``path``.
        download_src: direct URL of the PDF.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # stream=True is essential: without it requests buffers the entire body
    # before iter_content() runs, and the progress bar shows nothing useful.
    data = requests.get(download_src, headers=header, stream=True)
    data.raise_for_status()
    # Content-Length may be absent; int(None) in the original raised TypeError.
    total_length = int(data.headers.get("Content-Length") or 0)
    # os.path.join fixes the original 'I:\DeepMind' + name concatenation,
    # which wrote files like 'I:\DeepMindtitle.pdf'.
    with open(os.path.join(path, title_name + '.pdf'), 'wb') as f:
        widgets = ['Progress: ', progressbar.Percentage(), ' ',
                   progressbar.Bar(marker='#', left='[', right=']'), ' ',
                   progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
        # Only show the bar when the total size is known; maxval must be > 0.
        pbar = None
        if total_length:
            pbar = progressbar.ProgressBar(widgets=widgets, maxval=total_length).start()
        received = 0
        # 8 KiB chunks instead of chunk_size=1: same bytes written, several
        # orders of magnitude fewer Python-level iterations and write calls.
        for chunk in data.iter_content(chunk_size=8192):
            if chunk:
                received += len(chunk)
                f.write(chunk)
                if pbar:
                    # Clamp: servers occasionally send more than advertised.
                    pbar.update(min(received, total_length))
        if pbar:
            pbar.finish()
    return
# Walk every listing page, extract each publication's title and PDF link,
# and queue the download in Thunder.
# The original used page/count/count1 without initializing them first,
# which raised NameError on the very first iteration.
page = 0    # listing pages processed
count = 0   # total articles seen
count1 = 0  # articles whose preferred (second) footer link was empty
for url in urls:
    page += 1
    # Send the same browser header here too; the bare get() in the original
    # exposed the default python-requests UA.
    res = requests.get(url, headers=header)
    selector = etree.HTML(res.text)
    articles = selector.xpath('//div[@class="listing--list-items"]/article')
    for article in articles:
        count += 1
        title_name = article.xpath('div/div[2]/header/div/h1')[0].text
        # Prefer the second footer link (usually the PDF); fall back to the
        # first when it is missing or blank.
        r = article.xpath('div/div[2]/footer/a[2]/@href')
        download_src = r[0] if r else ''
        if download_src.strip() == '':
            count1 += 1
            v = article.xpath('div/div[2]/footer/a[1]/@href')
            download_src = v[0] if v else ''
        # Queue in Thunder once, whichever link was found (the original
        # duplicated this code in both branches).
        if download_src.strip() != '':
            o.AddTask(download_src, title_name + '.pdf', path, "", "", -1, 0, 5)
            o.CommitTasks()
            # Brief pause so Thunder can register each task before the next.
            time.sleep(3)
            # download(title_name, download_src)  # alternative: pure-Python download
        print(title_name)
        print(download_src)
    print(page)
print(count)
有些发布的条目不是论文、没有可下载的 PDF,迅雷里会提示任务超时,属正常现象,不必担心。