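# Crawl the full text of the novel "Romance of the Three Kingdoms" (三国演义), count word frequencies, and generate a word cloud.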
import time
from collections import Counter
from multiprocessing import Pool

import jieba
import requests
from bs4 import BeautifulSoup
from PIL import Image
from wordcloud import WordCloud
# Request headers sent with every page fetch; the User-Agent is a desktop
# Chrome 70 string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.110 Safari/537.36'
    ),
}
def get_info(url):
    """Download one chapter page and append its text to the novel file.

    The page is parsed with BeautifulSoup (lxml parser) and the text of every
    ``<p>`` directly inside ``div.content`` is appended, without separators,
    to ``D:/三国演义.txt`` (GBK encoded).

    Args:
        url: Address of the chapter page to fetch.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, verify=False, timeout=30)
    # Fail loudly instead of silently scraping an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    paragraphs = soup.select('div.content > p')
    if not paragraphs:
        # Nothing to write; leave the output file untouched.
        return

    # Open the output file once per page rather than once per paragraph.
    # NOTE(review): characters that GBK cannot encode will raise
    # UnicodeEncodeError here; the encoding is kept because get_words()
    # reads the file back as GBK.
    with open('D:/三国演义.txt', 'a', encoding='gbk') as fp:
        fp.writelines(paragraph.get_text() for paragraph in paragraphs)
def get_words():
    """Return the most frequent words of the downloaded novel.

    Reads ``D:/三国演义.txt`` (GBK encoded), segments it with ``jieba.lcut``,
    drops single-character tokens and a fixed set of stop words, and keeps
    the (at most) 100 most frequent remaining words, most frequent first.
    The result is also printed.

    Returns:
        str: The selected words joined by single spaces.
    """
    # Frequent but uninformative words that would otherwise dominate.
    stopwords = {'二人', '却说', '不能', '不可', '如此', '左右',
                 '次日', '大喜', '忽然', '此人', '今日', '于是', '一人'}

    # Use a context manager so the file handle is always closed.
    with open("D:/三国演义.txt", "r", encoding='gbk') as fp:
        txt = fp.read()

    counts = Counter(
        word for word in jieba.lcut(txt)
        if len(word) != 1 and word not in stopwords
    )

    # most_common() simply returns fewer entries when there are fewer than
    # 100 distinct words, instead of indexing past the end of the list.
    top_words = [word for word, _ in counts.most_common(100)]

    # The words must be space separated.
    text1 = ' '.join(top_words)
    print(text1)
    return text1
def create(imgFile, s):
    """Render a word cloud shaped by the white areas of a template image.

    A word cloud with the same pixel size as ``imgFile`` is generated from
    ``s``. Every pixel that is pure white in the template is then painted
    white in the cloud, and the result is saved to ``D:/result.jpg``.

    Args:
        imgFile: Path of the template image.
        s: Text handed to ``WordCloud.generate`` (space separated words).
    """
    # Convert to RGB so every pixel is a 3-tuple: grayscale and palette
    # images would otherwise yield ints, which cannot be sliced or compared
    # against an RGB triple. The context manager closes the source file.
    with Image.open(imgFile) as source:
        im = source.convert('RGB')
    w, h = im.size

    # Create the WordCloud object; the first argument is the font file.
    # NOTE(review): random_state=False presumably acts as a fixed seed of 0
    # inside WordCloud -- confirm against the library before changing it.
    wc = WordCloud(
        r'C:\windows\fonts\simfang.ttf', width=w, height=h,
        background_color='white', font_step=3,
        random_state=False, prefer_horizontal=0.9
    )
    t = wc.generate(s).to_image()

    # Pixel-access objects avoid a getpixel()/putpixel() call per pixel.
    white = (255, 255, 255)
    template_px = im.load()
    cloud_px = t.load()
    for x in range(w):
        for y in range(h):
            if template_px[x, y] == white:
                cloud_px[x, y] = white

    t.save('D:/result.jpg')
# 测试
# chs = string.ascii_letters + string.digits + string.punctuation
# s = [''.join((random.choice(chs) for i in range(8))) for j in range(650)]
# s = ''.join(s)
if __name__ == '__main__':
    # Chapter pages 1 through 120 of the novel.
    chapter_urls = []
    for chapter in range(1, 121):
        chapter_urls.append(
            'http://www.zggdwx.com/sanguo/{}.html'.format(chapter))

    # Author's note: downloading with a 4-process Pool
    # (pool.map(get_info, urls)) took about 10 seconds versus about 13
    # seconds for a single process, and made the computer's fan very loud.
    # The sequential loop below is what actually runs.
    started = time.time()
    for chapter_url in chapter_urls:
        get_info(chapter_url)
    finished = time.time()
    print('爬取花费时间', finished - started)

    # Build the top-word string and render it onto the template image.
    top_words_text = get_words()
    create('D:/12345.jpg', top_words_text)
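# Original image (原图): the template picture passed to create(), D:/12345.jpg
# Result (结果): the generated word cloud saved by create(), D:/result.jpg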