# 爬取三国演义小说全文进行词频统计并生成词云

import time
from collections import Counter
from multiprocessing import Pool

import jieba
import requests
from bs4 import BeautifulSoup
from PIL import Image
from wordcloud import WordCloud

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

def get_info(url):
    """Download one chapter page and append its text to the corpus file.

    The text of every ``<p>`` that is a direct child of ``div.content`` is
    appended, without any separator, to ``D:/三国演义.txt`` (GBK-encoded).

    Args:
        url: Address of the chapter page to fetch.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, verify=False, timeout=30)
    # Fail loudly rather than writing an error page into the corpus.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    paragraphs = [node.get_text() for node in soup.select('div.content > p')]
    if not paragraphs:
        return

    # Open the file once per page instead of once per paragraph.
    # errors='ignore' drops characters GBK cannot represent instead of
    # aborting the crawl with UnicodeEncodeError.
    with open('D:/三国演义.txt', 'a', encoding='gbk', errors='ignore') as fp:
        fp.writelines(paragraphs)

def get_words(path="D:/三国演义.txt", top_n=100):
    """Return the most frequent words of the corpus as one space-joined string.

    The GBK-encoded text at ``path`` is segmented with ``jieba.lcut``.
    Single-character tokens and a fixed list of frequent filler words are
    discarded; the remaining words are ranked by frequency, highest first.
    The result is also printed.

    Args:
        path: Corpus file to read.
        top_n: Maximum number of words to return. Fewer are returned when the
            text does not contain that many distinct qualifying words.

    Returns:
        The selected words joined by single spaces.
    """
    # Frequent words that carry no meaning for the word cloud.
    stopwords = {
        '二人', '却说', '不能', '不可', '如此', '左右',
        '次日', '大喜', '忽然', '此人', '今日', '于是', '一人',
    }

    with open(path, "r", encoding='gbk') as fp:
        txt = fp.read()

    counts = Counter(
        word for word in jieba.lcut(txt)
        if len(word) != 1 and word not in stopwords
    )

    # most_common() returns at most top_n entries, so a short text no longer
    # raises IndexError the way indexing a fixed range of 100 did.
    top_words = [word for word, _ in counts.most_common(top_n)]

    # The separator must be a space so the word cloud can split the words.
    text1 = ' '.join(top_words)
    print(text1)
    return text1

def create(imgFile, s, out_path='D:/result.jpg',
           font_path=r'C:\windows\fonts\simfang.ttf'):
    """Render a word cloud shaped by the non-white area of a template image.

    A word cloud the same size as the template is generated from ``s``; every
    pixel that is pure white in the template is then painted white in the
    cloud, and the result is saved to ``out_path``.

    Args:
        imgFile: Path of the template image.
        s: Space-separated words to draw.
        out_path: Where the finished image is written.
        font_path: Font file handed to ``WordCloud``.
    """
    white = (255, 255, 255)

    # Normalise to RGB so grayscale, palette and RGBA templates all yield
    # 3-tuples per pixel; previously a non-tuple pixel value broke the
    # ``[:3]`` slice.
    with Image.open(imgFile) as im:
        template = im.convert('RGB')
    w, h = template.size

    # Build the word cloud object.
    # NOTE(review): random_state=False presumably acts as integer seed 0 —
    # confirm against the wordcloud version in use.
    wc = WordCloud(
        font_path, width=w, height=h,
        background_color='white', font_step=3,
        random_state=False, prefer_horizontal=0.9
    )
    cloud = wc.generate(s).to_image()

    # Blank out every cloud pixel whose template pixel is pure white.
    src = template.load()
    dst = cloud.load()
    for x in range(w):
        for y in range(h):
            if src[x, y] == white:
                dst[x, y] = white

    cloud.save(out_path)

# 测试
# chs = string.ascii_letters + string.digits + string.punctuation
# s = [''.join((random.choice(chs) for i in range(8))) for j in range(650)]
# s = ''.join(s)

def main():
    """Crawl all 120 chapters, then build and save the word cloud."""
    urls = [
        'http://www.zggdwx.com/sanguo/{}.html'.format(i) for i in range(1, 121)]

    # get_info() only ever appends, so start from an empty corpus file;
    # otherwise every re-run would duplicate the text of the previous one.
    with open('D:/三国演义.txt', 'w', encoding='gbk'):
        pass

    # Author's note: a 4-process Pool (pool.map(get_info, urls)) took about
    # 10 s versus about 13 s sequentially, at the cost of a noisy fan, so the
    # sequential loop is used.
    started = time.perf_counter()
    for url in urls:
        get_info(url)
    print('爬取花费时间', time.perf_counter() - started)

    text1 = get_words()
    create('D:/12345.jpg', text1)


if __name__ == '__main__':
    main()
# 原图
# 爬取三国演义小说全文进行词频统计并生成词云
# 结果
# 爬取三国演义小说全文进行词频统计并生成词云

# 相关推荐