python获取DeepMind发布的文章

以下提供了两种下载方法:一种是用 requests 库配合进度条直接下载,另一种是通过 COM 接口调用迅雷(Thunder)下载

#coding:utf-8

import os
import time

import progressbar
import requests
import win32com
from lxml import etree
from win32com.client import Dispatch


# Browser-like User-Agent so the site serves the scripted requests normally.
# BUG FIX: the original concatenation dropped the space between "Gecko)" and
# "Chrome/", yielding a malformed UA string ("...Gecko)Chrome/67...").
header = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
	              'Chrome/67.0.3396.87 Safari/537.36'
}

# First check the publications site manually to see how many result pages
# exist — 19 at the time of writing (hence range(1, 20)); bump the upper
# bound if DeepMind adds more pages.

urls=['https://deepmind.com/research/publications/?page={}'.format(str(i))
		for i in range(1,20)]
# Destination directory for the downloaded PDFs (also handed to Thunder).
# NOTE(review): no trailing separator — verify how callers join it with
# file names before reusing this constant.
path='I:\\DeepMind'

# Thunder (Xunlei) download-manager COM automation object; tasks queued on
# it are downloaded by the Thunder desktop application (Windows only).
o = Dispatch("ThunderAgent.Agent64.1")
def download(title_name, download_src):
	"""Download one PDF from *download_src* into *path*, showing a progress bar.

	Fixes over the original:
	- ``stream=True`` so the body is actually streamed instead of being
	  fully buffered by requests before iteration.
	- ``os.path.join`` — the original ``path+title_name`` had no separator
	  and wrote files like ``I:\\DeepMindfoo.pdf``.
	- 8 KiB chunks instead of ``chunk_size=1`` (one syscall per byte).
	- Missing ``Content-Length`` no longer raises ``TypeError``.
	- ``raise_for_status`` surfaces HTTP errors instead of saving an
	  error page as a PDF.
	"""
	response = requests.get(download_src, headers=header, stream=True)
	response.raise_for_status()
	# Header may be absent (chunked transfer); fall back to 0.
	total_length = int(response.headers.get("Content-Length") or 0)
	file_path = os.path.join(path, title_name + '.pdf')
	widgets = ['Progress: ', progressbar.Percentage(), ' ',
	           progressbar.Bar(marker='#', left='[', right=']'), ' ',
	           progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
	# maxval must be positive; use 1 as a harmless floor when the size is
	# unknown (the bar is then only indicative).
	pbar = progressbar.ProgressBar(widgets=widgets, maxval=total_length or 1).start()
	received = 0
	with open(file_path, 'wb') as f:
		for chunk in response.iter_content(chunk_size=8192):
			if chunk:
				received += len(chunk)
				f.write(chunk)
				# Guard against servers that send more bytes than advertised.
				pbar.update(min(received, total_length) if total_length else received)
	pbar.finish()
	return
	
# BUG FIX: page/count/count1 were read before ever being assigned
# (``page=page+1`` raised NameError on the first iteration).
page = 0    # listing pages processed
count = 0   # total articles seen
count1 = 0  # articles without a direct (footer a[2]) download link

for url in urls:
	page = page + 1
	# Pass the browser-like header here too, consistent with download().
	res = requests.get(url, headers=header)
	selector = etree.HTML(res.text)
	articles = selector.xpath('//div[@class="listing--list-items"]/article')

	for article in articles:
		count = count + 1
		title_name = article.xpath('div/div[2]/header/div/h1')[0].text
		# Preferred link: second footer anchor (usually the PDF).
		r = article.xpath('div/div[2]/footer/a[2]/@href')
		download_src = r[0] if r else ''
		if download_src.strip() == '':
			count1 = count1 + 1
			# Fall back to the first footer anchor.
			v = article.xpath('div/div[2]/footer/a[1]/@href')
			download_src = v[0] if v else ''
			if download_src.strip() != '':
				# Alternative: download(title_name, download_src) uses the
				# built-in requests-based downloader instead of Thunder.
				o.AddTask(download_src, title_name + '.pdf', path, "", "", -1, 0, 5)
				o.CommitTasks()
				time.sleep(3)  # be polite: throttle task submission
		else:
			# Alternative: download(title_name, download_src)
			o.AddTask(download_src, title_name + '.pdf', path, "", "", -1, 0, 5)
			o.CommitTasks()
			time.sleep(3)

		print(title_name)
		print(download_src)

	print(page)

print(count)


    
   

有些发布的不是论文,在迅雷里会提示任务超时,不必担心

python获取DeepMind发布的文章