python写个爬小说的小爬虫
安装python;
安装 BeautifulSoup:打开 Windows 的命令窗口,输入
$ easy_install beautifulsoup4 或者
$ pip install beautifulsoup4
安装html5lib 页面解析器
$ apt-get install python-html5lib
$ easy_install html5lib
$ pip install html5lib
当然你也可以安装其它的解析器,详见 Beautiful Soup 文档: https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
我爬的是笔趣阁里的药皇,打开他的目录
查看代码:在 Chrome 中按 F12 键可以直接查看它的页面结构
如下:
a标签就是我们需要的东西 div 就是它的外包装
代码如下:
from bs4 import BeautifulSoup
import requests
if __name__ == "__main__":
    # Site root. Chapter hrefs are assumed to be site-relative paths, so each
    # one is prefixed with this value to form an absolute URL.
    server = "http://www.xbiquge.la"
    # Table-of-contents page of the novel.
    url = 'http://www.xbiquge.la/8/8917/'
    # A timeout keeps the script from hanging forever on a stalled connection.
    req = requests.get(url, timeout=10)
    # Force UTF-8 decoding; without it the Chinese text comes out garbled.
    req.encoding = 'utf-8'
    bf = BeautifulSoup(req.text, "html5lib")
    # The chapter links are wrapped in <div id="list">.
    chapter_list = bf.find('div', id="list")
    if chapter_list is None:
        raise SystemExit('chapter list <div id="list"> not found at ' + url)
    # Search inside the tag that was found instead of re-parsing its markup,
    # and skip anchors that carry no href (they cannot be turned into a URL).
    for each in chapter_list.find_all('a', href=True):
        print(each.string, server + each.get('href'))  # chapter title, URL
根据上面得到的网址获取章节内容,原理跟上面相同:
from bs4 import BeautifulSoup
import requests
if __name__ == '__main__':
    # URL of a single chapter page.
    target = "http://www.xbiquge.la/8/8917/3960324.html"
    # A timeout keeps the script from hanging forever on a stalled connection.
    req = requests.get(url=target, timeout=10)
    # Force UTF-8 decoding; without it the Chinese text comes out garbled.
    req.encoding = 'utf-8'
    # Name the parser explicitly: omitting it makes BeautifulSoup pick
    # whichever parser happens to be installed (and emit a warning), so the
    # parse tree could differ between machines.
    bf = BeautifulSoup(req.text, "html5lib")
    # The chapter body is wrapped in <div id="content">.
    content = bf.find('div', id='content')
    if content is None:
        raise SystemExit('chapter body <div id="content"> not found at ' + target)
    # Each run of four non-breaking spaces (presumably the paragraph indent
    # used by the page) is turned into a blank line.
    print(content.text.replace('\xa0' * 4, '\n\n'))
把这两个东西整合组装在一起,代码如下:
from bs4 import BeautifulSoup
import requests,sys
"""
说明:下载《笔趣阁》网络小说《药皇》
Parameters:
无
Returns:
无
Modify:
2017-09-13
"""
class downloader(object):
    """Collect the chapter list of one novel and save chapters to a text file.

    Typical use: call get_download_url() once to fill ``names``, ``urls`` and
    ``nums``, then for every chapter fetch the text with get_contents() and
    append it to the output file with writer().
    """

    def __init__(self):
        # Site root, prefixed to the chapter hrefs found on the index page.
        self.server = "http://www.xbiquge.la"
        # Table-of-contents page of the novel.
        self.target = "http://www.xbiquge.la/8/8917/"
        self.names = []  # chapter titles
        self.urls = []   # chapter URLs, parallel to ``names``
        self.nums = 0    # number of chapters

    def get_download_url(self):
        """Fetch the index page and populate ``names``, ``urls`` and ``nums``.

        Raises:
            IndexError: if the page contains no ``<div id="list">``.
        """
        # A timeout keeps the call from hanging forever on a stalled connection.
        req = requests.get(url=self.target, timeout=10)
        # Force UTF-8 decoding; without it the Chinese text comes out garbled.
        req.encoding = 'utf-8'
        soup = BeautifulSoup(req.text, "html5lib")
        chapter_list = soup.find('div', id="list")
        if chapter_list is None:
            # IndexError is what indexing an empty result list used to raise,
            # so existing callers that catch it keep working.
            raise IndexError('chapter list <div id="list"> not found at ' + self.target)
        # Search inside the found tag directly (no second parse) and ignore
        # anchors without an href, which cannot be turned into a URL.
        anchors = chapter_list.find_all('a', href=True)
        # Rebuild the lists instead of appending so that calling this method
        # again does not duplicate chapters and leave ``nums`` out of sync.
        # get_text() always yields a string, even for anchors with nested tags.
        self.names = [anchor.get_text() for anchor in anchors]
        self.urls = [self.server + anchor['href'] for anchor in anchors]
        self.nums = len(anchors)

    def get_contents(self, target):
        """Download one chapter page and return its text.

        Parameters:
            target: URL of the chapter page.

        Returns:
            The text of ``<div id="content">`` with each run of four
            non-breaking spaces replaced by a blank line.

        Raises:
            IndexError: if the page contains no ``<div id="content">``.
        """
        req = requests.get(url=target, timeout=10)
        req.encoding = 'utf-8'
        soup = BeautifulSoup(req.text, "html5lib")
        content = soup.find('div', id='content')
        if content is None:
            raise IndexError('chapter body <div id="content"> not found at ' + target)
        return content.text.replace('\xa0' * 4, '\n\n')

    def writer(self, name, path, text):
        """Append one chapter to the output file.

        Parameters:
            name: chapter title, written on its own line.
            path: file to append to; created if it does not exist.
            text: chapter body, either a string or an iterable of strings.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            if isinstance(text, str):
                # One write instead of writelines(), which would iterate the
                # string and write it one character at a time.
                f.write(text)
            else:
                f.writelines(text)
            f.write('\n\n')
if __name__ == "__main__":
    # Download every chapter in index order and append it to one text file,
    # showing a single-line progress indicator while doing so.
    dl = downloader()
    dl.get_download_url()
    print('《药皇》开始下载')
    total = dl.nums
    # Walk titles and URLs in lockstep; ``done`` counts finished chapters
    # starting at 1 so the indicator reaches 100% after the last one.
    for done, (name, url) in enumerate(zip(dl.names, dl.urls), start=1):
        dl.writer(name, '药皇.txt', dl.get_contents(url))
        # Scale the fraction to 0-100 so the number matches the '%' sign;
        # the trailing '\r' makes the next update overwrite this line.
        sys.stdout.write(" 已下载:%0.3f%%" % (100.0 * done / total) + '\r')
        sys.stdout.flush()
    print('《药皇》已经下载完成')
遇到的问题:中文变成乱码。解决办法:在请求之后加上 req.encoding = 'utf-8' 就可以了。
__name__ == "__main__" 是什么意思?直接执行当前文件时,__name__ 的值就是 "__main__",两者相等;如果当前文件只是被其他文件导入,__name__ 的值是模块名,两者就不相等。也就是说,"__main__" 表示被直接执行的是哪个 *.py 文件,这个判断用来区分文件是被直接运行还是被导入。