Scraping taoche.com with the Scrapy framework

We crawl the used-car listing pages and detail pages for every province on taoche.com.

First, create a project:

scrapy startproject day0513

Then, inside the project directory, generate the spider:

scrapy genspider taoche taoche.com
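
After these two commands, Scrapy generates its standard project layout (genspider adds taoche.py under spiders/):

day0513/
├── scrapy.cfg
└── day0513/
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── taoche.py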

Define the fields to store in items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Day0513Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # Listing-page fields
    name = scrapy.Field()             # title
    registered_date = scrapy.Field()  # registration date
    mileage = scrapy.Field()          # mileage
    city = scrapy.Field()             # city
    price = scrapy.Field()            # original price
    new_price = scrapy.Field()        # sale price
    detail_url = scrapy.Field()       # detail-page link

    # Detail-page fields
    displacement = scrapy.Field()     # engine displacement
    transmission = scrapy.Field()     # transmission
    brand_type = scrapy.Field()       # brand and model
    loc_of_lic = scrapy.Field()       # license-plate location
    oil_wear = scrapy.Field()         # fuel consumption
    engine = scrapy.Field()           # engine
    three_high = scrapy.Field()       # length/width/height
    drive_type = scrapy.Field()       # drive type
    body_way = scrapy.Field()         # body type
    che_level = scrapy.Field()        # vehicle class
    trunk_cap = scrapy.Field()        # trunk capacity
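
A scrapy.Item behaves like a dict, which is why the pipeline shown later can serialize it with dict(item). A quick illustration (the field values here are made-up samples):

item = Day0513Item(name='Audi A4L', city='Beijing')  # sample values for illustration
item['price'] = '25.0万'
print(dict(item))  # {'name': 'Audi A4L', 'city': 'Beijing', 'price': '25.0万'}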

In the spider, Scrapy routes the responses for the start URLs to the default parse callback, so that callback cannot be reused for the detail pages; instead we define a dedicated detail-page callback and hand the partially collected data over to it in dict-like form through the request's meta argument.
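
The spider imports CITY_CODE and CAR_CODE_LIST from a helper module day0513/spiders/city.py that is not shown here. A minimal sketch of its shape, with assumed sample values (the real file lists every taoche.com city subdomain and car-type slug):

# day0513/spiders/city.py -- hypothetical minimal version
# City subdomains, as in https://beijing.taoche.com/ (sample values only)
CITY_CODE = ['beijing', 'shanghai', 'guangzhou']
# Car-brand/type URL slugs (sample values only)
CAR_CODE_LIST = ['audi', 'bmw', 'dazhong']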

The spider code:

# -*- coding: utf-8 -*-
import scrapy
from day0513.spiders.city import CITY_CODE, CAR_CODE_LIST
from day0513.items import Day0513Item


class TaocheSpider(scrapy.Spider):
    name = 'taoche'
    allowed_domains = ['taoche.com']
    start_urls = []
    # Build one start URL per city/car-type combination
    for city in CITY_CODE:
        for car in CAR_CODE_LIST:
            url = f'https://{city}.taoche.com/{car}/'
            start_urls.append(url)

    def parse(self, response):
        # Read the maximum page number from the pagination bar
        max_page = response.xpath('//div[@class="paging-box the-pages"]/div/a[last()-1]/text()').extract()
        max_page = int(self.get_value(max_page) or 1)

        # Request every listing page
        for i in range(1, max_page + 1):
            url = response.url + '?page=%d#pagetag' % i
            yield scrapy.Request(url=url, callback=self.parse_1)

    # Listing-page parser
    def parse_1(self, response):
        # Each <li> in the grid is one car listing
        car_info_list = response.xpath('//ul[@class="gongge_ul"]/li')
        for car in car_info_list:
            # Title
            name = car.xpath('./div[@class="gongge_main"]/a/span/text()').extract()
            name = self.get_value(name)
            # Mileage
            mileage = car.xpath('./div[2]/p/i[2]/text()').extract()
            mileage = self.get_value(mileage)
            # City
            city = car.xpath('./div[2]/p/i[3]/span/text()').extract()
            city = self.get_value([i.strip() for i in city])
            # Original price
            price = car.xpath('./div[2]/div[1]/i[3]/text()').extract()
            price = self.get_value(price)
            # Sale price
            new_price = car.xpath('./div[2]/div[1]/i[2]//text()').extract()
            new_price = ''.join(new_price)
            # Registration date
            registered_date = car.xpath('./div[2]/p/i[1]/text()').extract()
            registered_date = self.get_value(registered_date)
            # Detail-page link (skip this listing if none was found)
            detail_url = car.xpath('./div[2]/a/@href').extract()
            detail_url = self.get_value(detail_url)
            if not detail_url:
                continue
            detail_url = 'https:' + detail_url

            # Fill the item with the listing-page fields
            item = Day0513Item()
            item['name'] = name
            item['mileage'] = mileage
            item['city'] = city
            item['price'] = price
            item['new_price'] = new_price
            item['registered_date'] = registered_date
            item['detail_url'] = detail_url

            # Hand the partially filled item to the detail-page callback via meta
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'data': item},
                encoding='utf-8',
                dont_filter=True,  # bypass the duplicate-request filter
            )

    # Detail-page parser
    def parse_detail(self, response):
        print(response.url)  # debug: show which detail page is being parsed

        # Right-hand parameter column
        li_box = response.xpath('//div[@class="row parameter-configure"]//div[2]/ul')[0]
        # Engine displacement
        displacement = li_box.xpath('./li[1]//text()').extract()
        displacement = ''.join([i.strip() for i in displacement])
        # Fuel consumption
        oil_wear = li_box.xpath('./li[2]//text()').extract()
        oil_wear = ''.join(oil_wear)
        # Length/width/height
        three_high = li_box.xpath('./li[3]//text()').extract()
        three_high = ''.join(three_high)
        # Body type
        body_way = li_box.xpath('./li[4]//text()').extract()
        body_way = ''.join(body_way)
        # Trunk capacity
        trunk_cap = li_box.xpath('./li[5]//text()').extract()
        trunk_cap = ''.join(trunk_cap)

        # Left-hand parameter column
        ul = response.xpath('//div[@class="row parameter-configure"]//div[1]/ul')[0]
        # Brand and model (note the relative './li[1]'; a bare '/li[1]' matches nothing)
        brand_type = ul.xpath('./li[1]/span//text()').extract()
        brand_type = ''.join(brand_type)
        # License-plate location
        loc_of_lic = ul.xpath('./li[2]//text()').extract()
        loc_of_lic = ''.join([i.strip() for i in loc_of_lic])
        # Engine
        engine = ul.xpath('./li[3]//text()').extract()
        engine = ''.join(engine)
        # Drive type
        drive_type = ul.xpath('./li[4]//text()').extract()
        drive_type = ''.join(drive_type)
        # Vehicle class
        che_level = ul.xpath('./li[5]//text()').extract()
        che_level = ''.join([i.strip() for i in che_level])
        # Transmission
        transmission = response.xpath('//div[@class="summary-attrs"]/dl[3]//text()').extract()
        transmission = ''.join([i.strip() for i in transmission])

        # Retrieve the item passed from the listing page and complete it
        item = response.meta['data']
        item['displacement'] = displacement
        item['oil_wear'] = oil_wear
        item['three_high'] = three_high
        item['body_way'] = body_way
        item['trunk_cap'] = trunk_cap
        item['brand_type'] = brand_type
        item['loc_of_lic'] = loc_of_lic
        item['engine'] = engine
        item['drive_type'] = drive_type
        item['che_level'] = che_level
        item['transmission'] = transmission
        yield item


    def get_value(self, value):
        # Return the first extracted value, or '' when the XPath matched nothing
        return value[0] if value else ''

Finally, in pipelines.py, write the scraped data to a taoche.txt file:

import json

class Day0513Pipeline(object):
    def open_spider(self, spider):
        # Open the output file once when the spider starts
        self.fp = open('taoche.txt', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        # Append each item as one JSON line
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()
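
For the pipeline to take effect, it also has to be enabled in settings.py; 300 is the conventional priority value:

# settings.py
ITEM_PIPELINES = {
    'day0513.pipelines.Day0513Pipeline': 300,
}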

 

Run result: start the crawl with scrapy crawl taoche; each scraped car is appended to taoche.txt as one line of JSON.
