Python爬虫之内涵段子
段子爬取
这次解析使用的是 re(正则表达式)模块。
import requests
import re
class Spider:
    """Interactive scraper that appends jokes from neihan-8.com to a text file.

    ``getConnect`` downloads one listing page and extracts the joke bodies
    with a regular expression, ``dealPage`` strips leftover markup,
    ``writePage`` appends the text to ``段子.txt`` and ``startWork`` drives the
    whole thing page by page under user control.

    Attributes:
        page: Number of the next listing page to download (starts at 1).
        switch: Loop flag for ``startWork``; set to False when the user quits.
    """

    # Lazily captures the inner HTML of every joke container. re.S lets "."
    # match the line breaks inside a joke. Compiled once for all calls; the
    # raw string keeps "\s" from being read as an (invalid) string escape.
    _CONTENT_PATTERN = re.compile(r'<div\sclass="f18 mb20">(.*?)</div>', re.S)

    # Fragments removed from each joke, applied in this order. The complete
    # entities (with ";") come before the bare "&rdquo" so that no stray
    # semicolon is left behind in the saved text.
    _STRIP_TOKENS = (
        '<p>',
        '</p>',
        '<br>',
        '<br />',
        '&ldquo;',
        '&rdquo;',
        '“',
        '&rdquo',
    )

    def __init__(self):
        self.page = 1
        self.switch = True

    def getConnect(self):
        """Download listing page ``self.page`` and store the jokes found on it.

        The extracted joke fragments are handed to ``dealPage``. Any exception
        raised by ``requests.get`` (including a timeout) propagates to the
        caller.
        """
        url = "https://www.neihan-8.com/article/list_5_" + str(self.page) + ".html"
        print(url)
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
        # Without a timeout an unresponsive server would block forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Force GB2312 decoding instead of the encoding requests infers.
        response.encoding = 'gb2312'
        content_list = self._CONTENT_PATTERN.findall(response.text)
        self.dealPage(content_list)

    def dealPage(self, content_list):
        """Strip leftover markup from each joke and write it to the file.

        Args:
            content_list: Iterable of raw joke HTML fragments (strings).
        """
        for content in content_list:
            for token in self._STRIP_TOKENS:
                content = content.replace(token, "")
            self.writePage(content)

    def writePage(self, content):
        """Append one cleaned joke to ``段子.txt`` (UTF-8).

        Args:
            content: Text to append; it is written as-is, without a separator.
        """
        # The with-statement closes the file; no explicit close() is needed.
        with open("段子.txt", 'a', encoding="utf8") as f:
            f.write(content)

    def startWork(self):
        """Run the interactive loop: Enter fetches a page, "quit" stops.

        After each successful fetch ``self.page`` is advanced by one. Typing
        "quit" clears ``self.switch`` and stops immediately, without
        downloading another page.
        """
        while self.switch:
            answer = input("继续爬取按回车,停止爬取输入quit:")
            if answer == "quit":
                self.switch = False
                break
            self.getConnect()
            self.page += 1
if __name__ == "__main__":
    # Script entry point: build a scraper and hand control to its
    # interactive loop.
    Spider().startWork()