python爬虫之内涵段子

段子爬取

这次解析使用的是 re 模块(正则表达式)。

import requests
import re

class Spider:
    """Interactive crawler that saves jokes from neihan-8.com list pages.

    Each call to ``getConnect`` downloads one list page, extracts the joke
    bodies with a regular expression, strips the leftover markup and appends
    the text to ``段子.txt`` in the current working directory.

    Attributes:
        page: Number of the next list page to download (starts at 1).
        switch: Loop flag for ``startWork``; set to False once the user
            asks to stop.
    """

    # Captures the inner HTML of each joke container. A raw string keeps
    # ``\s`` from being treated as an (invalid) string escape, and re.S lets
    # "." match newlines so multi-line jokes are captured whole.
    _CONTENT_PATTERN = re.compile(r'<div\sclass="f18 mb20">(.*?)</div>', re.S)

    # Markup fragments deleted from every joke before it is written out.
    # The closing-quote entity includes its ";" so no stray semicolon remains.
    _STRIP_TOKENS = ('<p>', '</p>', '<br>', '<br />', '&ldquo;', '&rdquo;')

    # Seconds to wait for the server, so a stalled connection cannot hang
    # the interactive loop indefinitely.
    _REQUEST_TIMEOUT = 10

    def __init__(self):
        self.page = 1
        self.switch = True

    def getConnect(self):
        """Download the current list page and pass its jokes to ``dealPage``.

        The URL is built from ``self.page`` and printed before the request is
        sent. The response body is decoded as gb2312 before matching.

        Raises:
            requests.RequestException: Propagated from ``requests.get``,
                including a timeout after ``_REQUEST_TIMEOUT`` seconds.
        """
        url = "https://www.neihan-8.com/article/list_5_" + str(self.page) + ".html"
        print(url)
        headers = {"User-Agent":  "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
        response = requests.get(url, headers=headers, timeout=self._REQUEST_TIMEOUT)
        response.encoding = 'gb2312'

        # One list entry per joke container found on the page.
        content_list = self._CONTENT_PATTERN.findall(response.text)
        self.dealPage(content_list)

    def dealPage(self, content_list):
        """Strip markup from each joke and append it to the output file.

        Args:
            content_list: Iterable of raw HTML fragments, one per joke.
        """
        for content in content_list:
            for token in self._STRIP_TOKENS:
                content = content.replace(token, '')
            self.writePage(content)

    def writePage(self, content):
        """Append one joke to ``段子.txt`` (UTF-8).

        Args:
            content: Cleaned joke text; written as-is, no separator is added.
        """
        # The with-statement closes the file; no explicit close() is needed.
        with open("段子.txt", 'a', encoding="utf8") as f:
            f.write(content)

    def startWork(self):
        """Run the interactive loop: one page per Enter, stop on ``quit``.

        After each successful download ``self.page`` is advanced by one.
        Typing ``quit`` clears ``self.switch`` and returns immediately,
        without fetching another page.
        """
        while self.switch:
            a = input("继续爬取按回车,停止爬取输入quit:")
            if a == "quit":
                self.switch = False
                break
            self.getConnect()
            self.page += 1


if __name__ == "__main__":
    # Script entry point: build a crawler and start its interactive loop.
    Spider().startWork()

python爬虫之内涵段子