# 爬取猫眼电影 (Crawl the Maoyan movie board)
import os
import re
from itertools import chain
from urllib import request
from urllib.request import urlopen
def getPageHtml(url):
    """Fetch *url* and return the raw response body.

    The request is sent with a desktop Firefox ``User-Agent`` header rather
    than urllib's default one.

    Args:
        url: Address of the resource to download, as a URL string.

    Returns:
        The undecoded response body as ``bytes``; callers decode it
        themselves when they need text.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``HTTPError`` for HTTP error statuses).
    """
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"
    req = request.Request(url, headers={'User-Agent': user_agent})
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body is read, instead of lingering until garbage
    # collection (this function is called once per page and once per image).
    with urlopen(req) as response:
        return response.read()
def getfilename(text):
    """Extract the movie titles from the HTML of a board page.

    A title is the link text inside a ``<p class="name">`` element, e.g.
    ``<p class="name"><a href="/films/1297" ...>肖申克的救赎</a></p>``.

    Args:
        text: HTML source of the page as a string.

    Returns:
        List of title strings in the order they appear; empty when the
        markup contains no matching element.
    """
    title_regex = re.compile(r'<p class="name"><.*?>(.*?)</a></p>')
    return title_regex.findall(text)
def getfiletime(text):
    """Extract the release-date lines from the HTML of a board page.

    Each result is the full content of a ``<p class="releasetime">``
    element, label included, e.g. ``上映时间:1994-10-14(美国)``.

    Args:
        text: HTML source of the page as a string.

    Returns:
        List of release-time strings in page order; empty when none match.
    """
    release_regex = r'<p class="releasetime">(.*?)</p>'
    return [found.group(1) for found in re.finditer(release_regex, text)]
def getstar(text):
    """Extract the cast lines from the HTML of a board page.

    The cast sits in a ``<p class="star">`` element whose content is wrapped
    in line breaks and indentation in the page source, for example::

        <p class="star">
                主演:宋在浩,李顺才,尹秀晶
        </p>

    Args:
        text: HTML source of the page as a string, with or without its
            original line breaks.

    Returns:
        List of cast strings (label included) with surrounding whitespace
        removed, in page order; empty when nothing matches.
    """
    pattern = r'<p class="star">(.*?)</p>'
    # re.S lets "." cross line breaks, so the element is found even when the
    # caller has not stripped newlines from the page first.
    matches = re.findall(pattern, text, re.S)
    # strip() with no argument removes the padding spaces as well as any
    # newline, tab or carriage return left around the cast text.
    return [cast.strip() for cast in matches]
def get_imgurl(text):
    """Extract the poster image URLs from the HTML of a board page.

    A poster is an ``<img>`` tag that carries the address in ``data-src``
    and ends with ``class="board-img" />``, e.g.
    ``<img data-src="http://p0.meituan.net/movie/..." alt="辩护人" class="board-img" />``.

    Args:
        text: HTML source of the page as a string.

    Returns:
        List of image URL strings in page order; empty when none match.
    """
    # Accept both http:// and https:// so posters served over TLS are not
    # silently skipped.
    pattern = r'<img data-src="(https?://.*?)" .*? class="board-img" />'
    return re.findall(pattern, text)
def main():
    """Crawl ten pages of the Maoyan board, print each entry, save the posters.

    For every page the titles, cast lines, release-time lines and poster
    URLs are extracted with the helper functions in this file. The entries
    are printed one per line, then every poster is downloaded to
    ``img1/img<N>.jpg`` (N starting at 1, in crawl order).

    Raises:
        urllib.error.URLError: If a page or an image cannot be fetched.
        OSError: If the output directory or an image file cannot be written.
    """
    # One list per field; each element holds the matches of a single page.
    filename = []
    filetime = []
    filestar = []
    fileurl = []

    for i in range(10):
        # "%d0" appends a zero to the page index: offset=00, 10, ..., 90.
        url = 'http://maoyan.com/board/4?offset=%d0' % i
        # Newlines are removed because the extraction patterns are written
        # against single-line markup.
        text = getPageHtml(url).decode('utf-8').replace("\n", "")
        filestar.append(getstar(text))
        filename.append(getfilename(text))
        filetime.append(getfiletime(text))
        fileurl.append(get_imgurl(text))

    # Pair the fields up page by page, then entry by entry. zip() stops at
    # the shortest list, so an entry missing a field shortens that page.
    for names, stars, times, urls in zip(filename, filestar, filetime, fileurl):
        for name, star, released, img_url in zip(names, stars, times, urls):
            print(name, star, released, img_url)

    # The output directory must exist before the first open() call.
    os.makedirs("img1", exist_ok=True)

    # Flatten the per-page URL lists and number the images from 1.
    for index, img_url in enumerate(chain.from_iterable(fileurl), start=1):
        # Download first so a failed request does not leave an empty file.
        content = getPageHtml(img_url)
        with open("img1/img%d.jpg" % index, 'wb') as f:
            f.write(content)
# Start the crawl only when this file is executed as a script, so that
# importing it does not trigger network requests or file writes.
if __name__ == "__main__":
    main()
# 图片: (Images:)