Scraping taoche.com with the Scrapy framework
This walkthrough crawls the used-car listing pages and detail pages for each province on taoche.com.
First, create a project:
scrapy startproject day0513
Then, inside the project directory, generate the main spider:
scrapy genspider taoche taoche.com
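These two commands produce the standard Scrapy project layout, shown here for orientation. Note that spiders/city.py is not generated by Scrapy; it is a constants module we add by hand and import later:

day0513/
    scrapy.cfg
    day0513/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            taoche.py      (created by genspider)
            city.py        (hand-written constants module, used below)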
Next, define the fields to be stored in items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class Day0513Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()

    # listing-page fields
    name = scrapy.Field()             # title
    registered_date = scrapy.Field()  # registration date
    mileage = scrapy.Field()          # mileage
    city = scrapy.Field()             # city
    price = scrapy.Field()            # original price
    new_price = scrapy.Field()        # sale price
    detail_url = scrapy.Field()       # detail-page link

    # detail-page fields
    displacement = scrapy.Field()     # engine displacement
    # = scrapy.Field()                # reference price (left unused)
    transmission = scrapy.Field()     # transmission
    brand_type = scrapy.Field()       # brand and model
    loc_of_lic = scrapy.Field()       # license-plate location
    oil_wear = scrapy.Field()         # fuel consumption
    engine = scrapy.Field()           # engine
    three_high = scrapy.Field()       # length/width/height
    drive_type = scrapy.Field()       # drive type
    body_way = scrapy.Field()         # body type
    che_level = scrapy.Field()        # vehicle class
    trunk_cap = scrapy.Field()        # trunk capacity
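A scrapy.Item behaves like a dict, which is what lets us carry it between callbacks later and serialize it in the pipeline. A quick illustration with a placeholder value:

item = Day0513Item()
item['name'] = 'Audi A4L'      # placeholder value, just to show assignment
print(dict(item))              # {'name': 'Audi A4L'}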
In the spider, Scrapy routes the start_urls responses to the default parse callback, so the detail pages cannot reuse that function. Instead, we define a separate detail-page callback and pass the data collected so far to it as a dict-like item through the request's meta.
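A minimal sketch of that handoff, with placeholder values; the names match the real spider below:

def parse_1(self, response):
    item = Day0513Item()
    item['detail_url'] = 'https://beijing.taoche.com/...'  # illustrative URL
    yield scrapy.Request(
        url=item['detail_url'],
        callback=self.parse_detail,
        meta={'data': item},       # carry the half-filled item to the next callback
    )

def parse_detail(self, response):
    item = response.meta['data']   # the same item object, retrieved from meta
    item['engine'] = '...'         # fill in detail-page fields here
    yield item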
The full spider code is as follows:
# -*- coding: utf-8 -*-
import scrapy

from day0513.spiders.city import CITY_CODE, CAR_CODE_LIST
from day0513.items import Day0513Item


class TaocheSpider(scrapy.Spider):
    name = 'taoche'
    allowed_domains = ['taoche.com']

    # Build the start URLs: one listing page per city and car brand.
    start_urls = []
    for city in CITY_CODE:
        for car in CAR_CODE_LIST:
            url = f'https://{city}.taoche.com/{car}/'
            start_urls.append(url)

    def parse(self, response):
        # Read the maximum page number from the pager on the first listing page.
        max_page = response.xpath(
            '//div[@class="paging-box the-pages"]/div/a[last()-1]/text()').extract()
        max_page = self.get_value(max_page)
        # Request every page of the listing.
        for i in range(1, int(max_page) + 1):
            url = response.url + '?page=%d#pagetag' % i
            yield scrapy.Request(url=url, callback=self.parse_1)
    # Listing-page callback: parse one page of car entries.
    def parse_1(self, response):
        # Each <li> holds one car's summary info.
        car_info_list = response.xpath('//ul[@class="gongge_ul"]/li')
        for car in car_info_list:
            # title
            name = car.xpath('./div[@class="gongge_main"]/a/span/text()').extract()
            name = self.get_value(name)
            # mileage
            mileage = car.xpath('./div[2]/p/i[2]/text()').extract()
            mileage = self.get_value(mileage)
            # city
            city = car.xpath('./div[2]/p/i[3]/span/text()').extract()
            city = self.get_value([i.strip() for i in city])
            # original price
            price = car.xpath('./div[2]/div[1]/i[3]/text()').extract()
            price = self.get_value(price)
            # sale price
            new_price = car.xpath('./div[2]/div[1]/i[2]//text()').extract()
            new_price = ''.join(new_price)
            # registration date
            registered_date = car.xpath('./div[2]/p/i[1]/text()').extract()
            registered_date = self.get_value(registered_date)
            # detail-page link (protocol-relative in the page source)
            detail_url = car.xpath('./div[2]/a/@href').extract()
            detail_url = 'https:' + self.get_value(detail_url)

            # Fill the listing-page fields, then hand the item to the
            # detail-page callback through meta.
            item = Day0513Item()
            item['name'] = name
            item['mileage'] = mileage
            item['city'] = city
            item['price'] = price
            item['new_price'] = new_price
            item['registered_date'] = registered_date
            item['detail_url'] = detail_url
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'data': item},
                encoding='utf-8',
                dont_filter=True
            )
    # Detail-page callback: parse the parameter tables.
    def parse_detail(self, response):
        print(response.url)
        li_box = response.xpath('//div[@class="row parameter-configure"]//div[2]/ul')[0]
        # engine displacement
        displacement = li_box.xpath('./li[1]//text()').extract()
        displacement = ''.join([i.strip() for i in displacement])
        # fuel consumption
        oil_wear = li_box.xpath('./li[2]//text()').extract()
        oil_wear = ''.join(oil_wear)
        # length/width/height
        three_high = li_box.xpath('./li[3]//text()').extract()
        three_high = ''.join(three_high)
        # body type
        body_way = li_box.xpath('./li[4]//text()').extract()
        body_way = ''.join(body_way)
        # trunk capacity
        trunk_cap = li_box.xpath('./li[5]//text()').extract()
        trunk_cap = ''.join(trunk_cap)

        ul = response.xpath('//div[@class="row parameter-configure"]//div[1]/ul')[0]
        # brand and model
        brand_type = ul.xpath('./li[1]/span//text()').extract()
        brand_type = ''.join(brand_type)
        # license-plate location
        loc_of_lic = ul.xpath('./li[2]//text()').extract()
        loc_of_lic = ''.join([i.strip() for i in loc_of_lic])
        # engine
        engine = ul.xpath('./li[3]//text()').extract()
        engine = ''.join(engine)
        # drive type
        drive_type = ul.xpath('./li[4]//text()').extract()
        drive_type = ''.join(drive_type)
        # vehicle class
        che_level = ul.xpath('./li[5]//text()').extract()
        che_level = ''.join([i.strip() for i in che_level])
        # transmission
        transmission = response.xpath('//div[@class="summary-attrs"]/dl[3]//text()').extract()
        transmission = ''.join([i.strip() for i in transmission])

        # Retrieve the item carried over from the listing page and
        # fill in the detail-page fields.
        item = response.meta['data']
        item['displacement'] = displacement
        item['oil_wear'] = oil_wear
        item['three_high'] = three_high
        item['body_way'] = body_way
        item['trunk_cap'] = trunk_cap
        item['brand_type'] = brand_type
        item['loc_of_lic'] = loc_of_lic
        item['engine'] = engine
        item['drive_type'] = drive_type
        item['che_level'] = che_level
        item['transmission'] = transmission
        yield item

    # Return the first extracted value, or 1 as a fallback when the
    # XPath matched nothing.
    def get_value(self, value):
        if value:
            value = value[0]
        else:
            value = 1
        return value
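The spider imports CITY_CODE and CAR_CODE_LIST from day0513/spiders/city.py, a hand-written constants module not shown above. A minimal sketch of its shape; the city subdomain and brand slugs here are illustrative examples, not the full lists:

# day0513/spiders/city.py
# City subdomain slugs and car-brand URL slugs used to build the start URLs.
CITY_CODE = ['beijing', 'shanghai', 'guangzhou', 'shenzhen']
CAR_CODE_LIST = ['audi', 'bmw', 'benz', 'dazhong']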
Finally, in pipelines.py, write the scraped data to a taoche.txt file:
import json


class Day0513Pipeline(object):
    def process_item(self, item, spider):
        # Append each item to taoche.txt as one JSON object per line.
        with open('taoche.txt', 'a', encoding='utf-8') as fp:
            fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item
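For this pipeline to run, it must be enabled in settings.py. A minimal configuration; the priority value 300 is an arbitrary conventional choice, and ROBOTSTXT_OBEY = False is shown because the site's robots.txt may otherwise block the crawl (check whether that fits your use case):

# settings.py
ITEM_PIPELINES = {
    'day0513.pipelines.Day0513Pipeline': 300,
}
ROBOTSTXT_OBEY = False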
Run the spider with scrapy crawl taoche from the project root; the results accumulate in taoche.txt, one JSON object per car.