python requests bs4练习
豆瓣评论分析:
# 1). 获取豆瓣最新上映的所有电影的前10页评论信息;
# 2). 清洗数据;
# 3). 分析每个电影评论信息分析绘制成词云, 保存为png图片,文件名为: 电影名.png;
import re
import jieba
import requests
import wordcloud
from bs4 import BeautifulSoup
def getpagecomments(id, pageNum):
    """Fetch one page (20 items) of short comments for a Douban movie.

    :param id: Douban movie subject id (string).
    :param pageNum: 1-based page number; each page holds 20 comments.
    :return: all comment texts on the page joined into one string.
    """
    start = (pageNum - 1) * 20
    url = "https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P" % (id, start)
    # timeout keeps one hung connection from stalling the whole crawl
    content = requests.get(url, timeout=10).text
    soup = BeautifulSoup(content, 'html5lib')
    commentsList = soup.find_all('span', class_='short')
    # str.join avoids quadratic += string concatenation
    return "".join(tag.text for tag in commentsList)
def getID():
    """Scrape Douban's 'now playing' page (Xi'an) for current movies.

    :return: list of dicts, one per movie, with keys 'title' and 'id'
             taken from each <li>'s data-title / id attributes.
    """
    url = 'https://movie.douban.com/cinema/nowplaying/xian/'
    # timeout keeps a hung connection from blocking the script
    content = requests.get(url, timeout=10).text
    soup = BeautifulSoup(content, 'html5lib')
    nowplaying_movie_list = soup.find_all('li', class_='list-item')
    # each list item carries title and subject id as tag attributes
    return [{'title': item['data-title'], 'id': item['id']}
            for item in nowplaying_movie_list]
# For each now-playing movie: fetch 3 pages of comments, keep only
# Chinese characters and latin letters, segment with jieba, and render
# a word cloud saved as ./font/<title>.png.
# (removed unused `threads = []` left over from an abandoned threading attempt)
for movie in getID():
    movie_id = movie['id']  # renamed from `id` to avoid shadowing the builtin
    # str.join avoids quadratic += concatenation across pages
    comments = "".join(
        getpagecomments(movie_id, pageNum) for pageNum in range(1, 4)
    )
    # keep CJK runs and latin-letter runs only (drops punctuation/digits)
    pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')
    newComments = ''.join(re.findall(pattern, comments))
    result = jieba.lcut(newComments)
    print("切分结果:", result)
    wc = wordcloud.WordCloud(
        background_color='snow',
        font_path='./font/msyh.ttf',  # CJK-capable font required for Chinese text
        min_font_size=5,
        max_font_size=55,
        width=200,
    )
    wc.generate(",".join(result))
    wc.to_file('./font/%s.png' % movie['title'])
爬取慕课网所有关于python的课程名及描述信息, 并通过词云进行分析展示;
- 网址: https://www.imooc.com/search/course?words=python
import re
import jieba
import requests
import wordcloud
from bs4 import BeautifulSoup
def getclassinform(page):
    """Scrape one page of imooc's course search results for 'python'.

    :param page: 1-based page number of the search results.
    :return: all course titles followed by all course descriptions,
             joined into a single string.
    """
    url = 'https://www.imooc.com/search/course?words=python&page=%d' % (page)
    # timeout keeps a hung connection from blocking the crawl
    content = requests.get(url, timeout=10).text
    soup = BeautifulSoup(content, 'html5lib')
    titles = soup.find_all('a', attrs={'class': "course-detail-title"})
    cards = soup.find_all('div', attrs={'class': "course-item"})
    parts = [tag.text for tag in titles]
    # the first <p> inside each course card is its short description
    parts.extend(card.find('p').text for card in cards)
    # str.join avoids quadratic += string concatenation
    return "".join(parts)
# Crawl 2 pages of imooc python-course titles/descriptions, clean the
# text down to Chinese characters and latin letters, segment with jieba,
# and save the resulting word cloud as python.png.
# str.join avoids quadratic += concatenation across pages
comments = "".join(getclassinform(page) for page in range(1, 3))
# keep CJK runs and latin-letter runs only (drops punctuation/digits)
pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')
newComments = ''.join(re.findall(pattern, comments))
result = jieba.lcut(newComments)
print("切分结果:", result)
wc = wordcloud.WordCloud(
    background_color='snow',
    font_path='./font/msyh.ttf',  # CJK-capable font required for Chinese text
    min_font_size=5,
    max_font_size=55,
    width=300,
)
wc.generate(",".join(result))
wc.to_file('python.png')
python爬取今日百度热点前10的新闻;
import requests
from bs4 import BeautifulSoup
def getnews(limit=10):
    """Print the titles of today's top Baidu trending searches.

    :param limit: number of entries to print. Defaults to 10, matching
                  the task statement ("top 10 news"); the original code
                  printed every entry on the page.
    """
    url = 'http://top.baidu.com/buzz?b=1'
    # use .content (bytes) so BeautifulSoup/lxml can detect the page's
    # own (GBK-style) encoding instead of requests mis-guessing it;
    # timeout keeps a hung connection from blocking the script
    content = requests.get(url, timeout=10).content
    soup = BeautifulSoup(content, 'lxml')
    news = soup.find_all('a', class_='list-title')
    for item in news[:limit]:
        print(item.text)
getnews()