import requests
from lxml import etree
class QiuBaiSpider(object):
    """Scraper for the text section of qiushibaike.com.

    List pages are fetched per page number; each joke's detail page is
    fetched once, its text appended to '糗事百科.txt', and its relative URL
    recorded in '已下载段子网址.txt' so subsequent runs skip duplicates.
    """

    def __init__(self):
        # '{}' placeholder is filled with the page number by get_url_list().
        self.url_temp = 'https://www.qiushibaike.com/text/page/{}/'
        # Desktop Chrome UA so the site serves the normal HTML layout.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    def get_url_list(self, page_num):
        """Return list-page URLs for pages 1 .. page_num - 1 (end exclusive)."""
        return [self.url_temp.format(i) for i in range(1, page_num)]

    def paser_one_url(self, url1):
        """Fetch a list page and return its body decoded as text."""
        response = requests.get(url1, headers=self.headers)
        return response.content.decode()

    def paser_two_url(self, url2):
        """Fetch a joke detail page and return its raw response bytes."""
        response = requests.get(url2, headers=self.headers)
        return response.content

    def _load_visited(self):
        """Return the set of already-downloaded joke paths (one per line).

        A missing file (first run) yields an empty set instead of crashing
        with FileNotFoundError, which the original 'r'-mode open did.
        """
        try:
            with open('已下载段子网址.txt', 'r', encoding='utf-8') as f:
                return {line.rstrip('\n') for line in f}
        except FileNotFoundError:
            return set()

    def get_page_url(self, html_str):
        """Parse one list page, download each new joke, and persist it.

        Jokes whose relative URL is already recorded in '已下载段子网址.txt'
        are skipped.
        """
        num = 1
        html = etree.HTML(html_str)
        self.div_list = html.xpath('//div[@id="content-left"]/div/a[1]/@href')
        # Load the visited set once per page instead of re-reading the
        # tracking file on every joke (original re-opened it per iteration).
        visited = self._load_visited()
        for href in self.div_list:
            if href in visited:
                print('已下载')
                continue
            url2 = 'https://www.qiushibaike.com' + href
            url2_html = self.paser_two_url(url2)
            html2 = etree.HTML(url2_html)
            data_list = html2.xpath('//div[@id="single-next-link"]/div[@class="content"]/text()')
            data = ''.join(data_list)
            print('-' * 1000)
            print('%s.' % num)
            print(data)
            with open('糗事百科.txt', 'a', encoding='utf-8') as f:
                f.write(str(num) + '.')
                f.write(data)
                f.write('\n\n\n')
            num += 1
            print('保存成功')
            print('-' * 1000)
            # Record the URL so reruns (and later pages) skip this joke.
            with open('已下载段子网址.txt', 'a', encoding='utf-8') as f:
                f.write(href)
                f.write('\n')
            visited.add(href)

    def run(self):
        """Prompt for a page count, then scrape that many list pages."""
        page_num = int(input('请输入要爬取的页数:'))
        # +1 because get_url_list's range end is exclusive.
        url_list = self.get_url_list(page_num + 1)
        print(url_list)
        num = 1
        for url1 in url_list:
            print('第%s页' % num)
            html_str = self.paser_one_url(url1)
            self.get_page_url(html_str)
            # Blank separator between pages in the output file.
            with open('糗事百科.txt', 'a', encoding='utf-8') as f:
                f.write('\n\n\n')
            num += 1
# Script entry point: build a spider and start the interactive crawl.
if __name__ == '__main__':
    QiuBaiSpider().run()
