爬内涵段子,开心一刻
使用 Python 2 爬取笑话。
# coding=utf-8
import re
import urllib2
class Pacong(object):
    """Interactive scraper for the joke listing pages on xiaohua.zol.com.cn.

    Every downloaded page is reduced to its joke texts, which are appended
    to a local text file named after a running counter.
    """

    # Captures the body of each joke container; re.S lets "." span newlines.
    # Compiled once here instead of on every clear() call.
    _SUMMARY_PATTERN = re.compile(
        r'<div\sclass="summary-text">(.*?)</div>', re.S)

    def __init__(self, begin=1):
        """Set up the scraper.

        :param begin: number of the first listing page to download.
        """
        self.begin = begin     # listing page requested by the next get_html()
        self.confirm = True    # loop flag read by command()
        self.filename = 1      # counter used to name the next output file

    def get_html(self):
        """Download listing page ``self.begin`` and pass its markup to clear()."""
        url = "http://xiaohua.zol.com.cn/aiqing/" + str(self.begin) + ".html"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            raw = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        # The payload is decoded as GBK and re-encoded as UTF-8 before parsing.
        html = raw.decode("gbk").encode("utf-8")
        self.clear(html)

    def clear(self, html):
        """Extract the joke texts from *html* and append them to one file.

        All jokes found in a single call go to the same file; the file
        counter advances once per call, even when nothing matched.

        :param html: page markup to search.
        """
        filenames = "第" + str(self.filename) + "页.txt"
        for joke in self._SUMMARY_PATTERN.findall(html):
            # Paragraph tags become spaces, then every space is removed.
            text = (joke.replace("<p>", " ")
                        .replace("</p>......", " ")
                        .replace("</p>", " ")
                        .replace(" ", ""))
            self.writePage(text, filenames)
        self.filename += 1

    def writePage(self, i, filenames):
        """Append the cleaned text *i* to the local file *filenames*."""
        with open(filenames, "a+") as f:
            f.write(i)

    def command(self):
        """Fetch pages one after another until the user types ``exit``."""
        while self.confirm:
            # Use this instance rather than a module-level global, so the
            # class works regardless of the name the caller binds it to.
            self.get_html()
            com = raw_input("是否继续爬去网页:(是按任意键,退出输入exit)")
            if com == "exit":
                break
            self.begin += 1
# Script entry point: start the interactive crawl from the first page.
if __name__ == '__main__':
    duanzi = Pacong()
    duanzi.command()