Python crawler: scraping Douban Top 250 book information
1. Preparing the crawler libraries
BeautifulSoup
Requests
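Both libraries install from PyPI with pip install requests beautifulsoup4. The whole scraper is built on one basic pattern: fetch a page with Requests, then hand the HTML to BeautifulSoup for parsing. A minimal sketch of that pattern against the Top 250 entry page (the User-Agent header mirrors the one the full script in section 3 sends):

import requests
from bs4 import BeautifulSoup

# Fetch the first Top 250 page and parse it; a User-Agent header is sent,
# matching the full script below.
resp = requests.get('https://book.douban.com/top250?start=0',
                    headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'html.parser')
print(resp.status_code, soup.title.get_text(strip=True))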
2. Crawling the pages
(1) Analyze the URLs of the Douban Books Top 250 pages (see the sketch after this list):
Page 1 is https://book.douban.com/top250?start=0
Page 2 is https://book.douban.com/top250?start=25
Page 3 is https://book.douban.com/top250?start=50
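The only thing that changes from page to page is the start query parameter, which grows by 25; with 250 books and 25 per page there are ten list pages in all. A small sketch of generating them, derived directly from the pattern above:

# Build the ten list-page URLs: start = 0, 25, 50, ..., 225.
base_url = 'https://book.douban.com/top250'
page_urls = [base_url + '?start=' + str(start) for start in range(0, 250, 25)]
for url in page_urls:
    print(url)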
(2) Analyze where the book information sits in the page source:
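Every book on a list page occupies its own <table> inside the <div class="indent"> container: the title link sits in <div class="pl2">, the author/publisher/price line in <p class="pl">, the score in <span class="rating_nums">, the rating count in <span class="pl"> and the one-sentence quote in <span class="inq">. A short sketch that reads a couple of these tags from the first page, using the same selectors as the full script in section 3:

import requests
from bs4 import BeautifulSoup

# Print the title and score of every book on the first list page.
resp = requests.get('https://book.douban.com/top250?start=0',
                    headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(resp.text, 'html.parser')
for table in soup.find('div', class_='indent').find_all('table'):
    title = table.find('div', class_='pl2').find('a').get_text()
    score = table.find('span', class_='rating_nums').get_text()
    print("".join(title.split()), score)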
3. Code
import requests
from bs4 import BeautifulSoup
import time


class top250books():
    def html(self, href):
        # 250 books, 25 per page, so start goes 0, 25, ..., 225.
        max_span = 250
        for page in range(0, int(max_span), 25):
            page_url = href + '?start=' + str(page)
            self.parse(page_url)

    def parse(self, url):
        html = self.request(url)
        # Every book on a list page sits in its own <table> inside <div class="indent">.
        all_tables = BeautifulSoup(html.text, 'html.parser').find('div', class_='indent').find_all('table')
        for table in all_tables:
            time.sleep(0.5)  # pause between books to go easy on the server
            title = table.find('div', class_='pl2').find('a').get_text()
            info = table.find('p', class_='pl').get_text()               # author / publisher / price
            rating_nums = table.find('span', class_='rating_nums').get_text()
            rating_people = table.find('span', class_='pl').get_text()   # number of ratings
            quote_tag = table.find('span', class_='inq')                 # some books have no quote
            quote = quote_tag.get_text() if quote_tag else ''
            # Append one UTF-8 line per book to test.txt, fields separated by spaces.
            with open('test.txt', 'ab') as f:
                f.write(("".join(title.split()) + " ").encode('utf-8'))
                f.write(("".join(info.split()) + " ").encode('utf-8'))
                f.write(("".join(rating_nums.split()) + " ").encode('utf-8'))
                f.write(("".join(rating_people.split()) + " ").encode('utf-8'))
                f.write(("".join(quote.split()) + " " + '\n').encode('utf-8'))
            print("".join(title.split()))

    def request(self, url):
        headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8"
        }
        content = requests.get(url, headers=headers)
        return content


books = top250books()
books.html('https://book.douban.com/top250')
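The script walks the ten list pages in order, pauses half a second between books, and appends one UTF-8 encoded line per book to test.txt, with the title, publishing information, score, rating count and quote separated by spaces; the title is also printed to the console as a simple progress indicator.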