04--定义item,使用meta在不同解析函数之间传递参数
以爬取阳光政务平台为例
创建项目
scrapy startproject sunshine
cd sunshine
scrapy genspider yg wz.sun0769.com
文件结构
定义item
到items.py中
只有在items.py中定义的字段才能在yg.py中使用
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class SunshineItem(scrapy.Item):
    """Container for one entry scraped from the list page and its detail page."""

    # Filled in from the list page.
    title = scrapy.Field()         # title attribute of the entry's link
    href = scrapy.Field()          # href of the entry's link (detail page URL)
    publish_date = scrapy.Field()  # text of the last cell in the entry's row

    # Filled in from the detail page.
    content_img = scrapy.Field()   # list of image URLs found in the content cell
    content = scrapy.Field()       # list of text fragments from the content cell
yg.py
注意meta的用法:要在不同的解析函数(如parse和parse_detail)之间传递item,需要把item放进Request的meta参数中
# -*- coding: utf-8 -*-
import scrapy
from sunshine.items import SunshineItem
class YgSpider(scrapy.Spider):
    """Crawl the question list and follow every entry to its detail page.

    ``parse`` extracts the fields available on the list page and hands the
    partially filled item to ``parse_detail`` through ``Request.meta``;
    ``parse_detail`` adds the remaining fields and yields the finished item.
    """

    name = 'yg'
    allowed_domains = ['sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    def parse(self, response):
        """Handle a list page.

        Yields one detail-page request per table row that has a link, then a
        request for the next list page when a ">" link is present.
        """
        # Each <tr> of the inner table is one entry.
        rows = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in rows:
            item = SunshineItem()
            item['title'] = tr.xpath("./td[2]/a[2]/@title").extract_first()
            item['href'] = tr.xpath("./td[2]/a[2]/@href").extract_first()
            item['publish_date'] = tr.xpath("./td[last()]/text()").extract_first()

            if item['href'] is None:
                # No link in this row, so there is no detail page to visit;
                # building a Request from None would raise.
                continue

            # The item is still incomplete: pass it along in ``meta`` so the
            # detail callback can finish filling it in.
            yield scrapy.Request(
                response.urljoin(item['href']),  # also resolves relative links
                callback=self.parse_detail,
                meta={"item": item},
            )

        # Pagination: the predicate must wrap the comparison, i.e.
        # a[text()='>'], otherwise the XPath expression is invalid.
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Handle a detail page: complete the item received via ``meta``."""
        item = response.meta['item']
        # extract() returns every match as a list; extract_first() only the first.
        item['content'] = response.xpath("//td[@class='txt16_3']//text()").extract()
        img_srcs = response.xpath("//td[@class='txt16_3']//img/@src").extract()
        # Resolve against the page URL instead of prepending a fixed host, so
        # absolute and relative ``src`` values both produce valid URLs.
        item['content_img'] = [response.urljoin(src) for src in img_srcs]
        yield item
处理数据
因为爬到的content中含有一些我们不需要的字符(如\xa0和空白符),需要把它们删除
到管道pipelines.py中
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
class SunshinePipeline(object):
    """Item pipeline that removes whitespace noise from the ``content`` field."""

    # Compiled once for all items. A raw string is used because "\s" is an
    # invalid escape sequence in a normal string literal.
    _NOISE_RE = re.compile(r"\xa0|\s")

    def process_item(self, item, spider):
        """Clean ``item['content']``, print the item, and return it.

        Args:
            item: mapping-like item whose ``content`` is an iterable of strings.
            spider: the spider that produced the item (unused).

        Returns:
            The same item, with ``content`` replaced by the cleaned list.
        """
        item['content'] = self.process_content(item['content'])
        print(item)
        return item

    def process_content(self, content):
        """Return ``content`` with whitespace removed and empty strings dropped.

        Args:
            content: iterable of strings.

        Returns:
            A new list holding each string with every ``\\xa0`` and whitespace
            character removed, omitting strings that end up empty.
        """
        stripped = (self._NOISE_RE.sub("", text) for text in content)
        return [text for text in stripped if text]

    # Backward-compatible alias for the original, misspelled method name.
    process_codntent = process_content
第一个方法process_item调用第二个方法来清洗content中的字符串
注意
使用管道的时候需要在settings.py中取消ITEM_PIPELINES的注释,管道才会生效
在settings.py中定义LOG_LEVEL,并把USER_AGENT改成自己浏览器的User-Agent