python爬取豆瓣top250图书信息

python爬虫之爬取豆瓣top250图书信息

1. 爬虫库准备

    BeautifulSoup

    Requests

2. 爬取网页

(1)分析豆瓣图书TOP 250的url:

          第一页是https://book.douban.com/top250?start=0

          第二页是https://book.douban.com/top250?start=25

          第三页是https://book.douban.com/top250?start=50

(2)分析网页源代码中图书信息的所在位置:

          python爬取豆瓣top250图书信息

3. 代码

import requests
from bs4 import BeautifulSoup
import time


class top250books():
    """Scrape Douban's Top 250 books list and append one line per book to test.txt.

    Entry point is ``html(href)`` with the list's base URL; it walks all ten
    result pages (25 books per page), parses each book's title, publication
    info, rating, rating count and one-line quote, and writes them UTF-8
    encoded to ``test.txt``.
    """

    def html(self, href):
        """Iterate over every results page under *href* and parse each one.

        Douban paginates with ``?start=0, 25, 50, ... 225`` (25 books/page).
        """
        max_span = 250
        for page in range(0, max_span, 25):
            page_url = href + '?start=' + str(page)
            self.parse(page_url)

    def parse(self, url):
        """Extract the book entries from one results page and append them to test.txt.

        Each book sits in its own ``<table>`` inside ``div.indent``.
        """
        html = self.request(url)
        all_tables = BeautifulSoup(html.text, 'html.parser').find('div', class_='indent').find_all('table')
        # Open the output file once per page rather than once per book, and
        # let the context manager guarantee it is closed even on error.
        with open('test.txt', 'ab') as f:
            for table in all_tables:
                time.sleep(0.5)  # throttle requests/parsing to be polite to the server
                title = table.find('div', class_='pl2').find('a').get_text()
                info = table.find('p', class_='pl').get_text()
                rating_nums = table.find('span', class_='rating_nums').get_text()
                rating_people = table.find('span', class_='pl').get_text()
                # Some books have no one-line quote; find() then returns None,
                # which previously crashed the scrape with AttributeError.
                quote_tag = table.find('span', class_='inq')
                quote = quote_tag.get_text() if quote_tag is not None else ''
                # "".join(x.split()) collapses all internal whitespace/newlines.
                f.write(("".join(title.split()) + " ").encode('utf-8'))
                f.write(("".join(info.split()) + " ").encode('utf-8'))
                f.write(("".join(rating_nums.split()) + " ").encode('utf-8'))
                f.write(("".join(rating_people.split()) + " ").encode('utf-8'))
                f.write(("".join(quote.split()) + " " + '\n').encode('utf-8'))
                print("".join(title.split()))

    def request(self, url):
        """GET *url* with a browser User-Agent and return the requests Response.

        Raises requests.HTTPError on a non-2xx status and
        requests.Timeout if the server stalls for more than 10 seconds.
        """
        headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8"
        }
        # timeout prevents a hung connection from blocking the scrape forever.
        content = requests.get(url, headers=headers, timeout=10)
        content.raise_for_status()  # fail loudly instead of parsing an error page
        return content


# Guard the scrape behind the main-module check so that importing this
# module (e.g. for testing or reuse) does not trigger network I/O.
if __name__ == '__main__':
    books = top250books()
    books.html('https://book.douban.com/top250')