Complete project | Crawling jobbole.com with Scrapy and saving the data
Today we use the Scrapy framework to crawl the article content of 伯乐在线 (jobbole.com) and save it to a database. One thing to watch out for: saving the thumbnail images may raise an error if Python's Pillow module is not installed; installing it fixes the problem. An ItemLoader is also used to tidy up how the item is populated. The full code follows.
The spider (main program)
import scrapy
import re
import datetime, time
from bo_bole.items import BoBoleItem
from scrapy.http import Request
from urllib import parse
from scrapy.loader import ItemLoader
from bo_bole.until.command import get_md5  # helper that turns a string into its md5 hash
from bo_bole.items import ArticleItemLoader  # custom ItemLoader
class BoleSpider(scrapy.Spider):
    name = 'bole'
    allowed_domains = ['jobbole.com']
    start_urls = ['http://python.jobbole.com/all-posts/']

    def parse(self, response):
        # Collect the article URLs (and thumbnail URLs) from the list page
        div_lists = response.xpath('//div[@class="grid-8"]/div[@class="post floated-thumb"]')
        for url_list in div_lists:
            img_url = url_list.xpath('.//div[@class="post-thumb"]/a/img/@src').get()
            detail_url = url_list.xpath('.//div[@class="post-meta"]/p/a[1]/@href').get()
            yield Request(url=parse.urljoin(response.url, detail_url), meta={'font_image': img_url}, callback=self.parse_detail)
        # Extract the next-page URL and keep crawling
        next_url = response.css('.next.page-numbers::attr(href)').extract_first('')
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        # Extract the concrete fields of a single article
        article_item = BoBoleItem()  # instantiate the item
        front_img_url = response.meta.get('font_image', '')
        title = response.xpath('//div[@class="entry-header"]/h1/text()').get()
        creat_date = response.xpath('//div[@class="entry-meta"]/p[@class="entry-meta-hide-on-mobile"]/text()').get().strip()
        create_time = re.sub(' |·', '', creat_date)
        list_resove = response.xpath('//div[@class="entry-meta"]/p[@class="entry-meta-hide-on-mobile"]/a[1]/text()').get()
        old_writer = response.xpath('//div[@class="copyright-area"]/a[1]/text()').get()
        zan_num = response.xpath('//div[@class="post-adds"]/span[1]//h10/text()').get()
        if zan_num:
            zan_num = int(zan_num)
        else:
            zan_num = 0
        article_item['url_object_id'] = get_md5(response.url)
        article_item['title'] = title
        article_item['url'] = response.url
        # Convert the date string into a date object
        try:
            creat_time = datetime.datetime.strptime(create_time, '%Y/%m/%d').date()
        except Exception as e:
            creat_time = datetime.datetime.now().date()
        article_item['creat_time'] = creat_time
        article_item['list_resove'] = list_resove
        article_item['old_writer'] = old_writer
        article_item['zan_num'] = zan_num
        article_item['front_img_url'] = [front_img_url]

        # Populate the item through ItemLoader instead; the manual assignments above are
        # kept for comparison, but the loader result below is what actually gets yielded.
        item_loader = ArticleItemLoader(item=BoBoleItem(), response=response)
        # The three loading methods:
        # item_loader.add_css()
        # item_loader.add_xpath()
        # item_loader.add_value()
        item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_xpath('list_resove', '//div[@class="entry-meta"]/p[@class="entry-meta-hide-on-mobile"]/a[1]/text()')
        item_loader.add_xpath('old_writer', '//div[@class="copyright-area"]/a[1]/text()')
        item_loader.add_xpath('zan_num', '//div[@class="post-adds"]/span[1]//h10/text()')
        item_loader.add_value('front_img_url', [front_img_url])
        item_loader.add_xpath('creat_time', '//div[@class="entry-meta"]/p[@class="entry-meta-hide-on-mobile"]/text()')
        article_item = item_loader.load_item()
        yield article_item
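The get_md5 helper imported above (from bo_bole.until.command) is not shown in this post. A minimal sketch of what such a helper typically looks like, assuming it only needs to hash the URL string into a fixed-length id:

import hashlib

def get_md5(url):
    # md5 requires bytes, so encode str input first
    if isinstance(url, str):
        url = url.encode('utf-8')
    # A 32-character hex digest, used as url_object_id
    return hashlib.md5(url).hexdigest()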
Item design
import datetime
import re
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
def data_str_date(value):
    '''
    Custom processor: convert a date string into a datetime.date
    '''
    create_time = re.sub(' |·', '', value)
    try:
        creat_time = datetime.datetime.strptime(create_time, '%Y/%m/%d').date()
    except Exception as e:
        creat_time = datetime.datetime.now().date()
    return creat_time

def get_nums(value):
    '''
    Convert the vote count from str to int, defaulting to 0
    '''
    if value:
        nums = int(value)
    else:
        nums = 0
    return nums

class ArticleItemLoader(ItemLoader):
    # Custom loader: every field gets this output processor by default
    default_output_processor = TakeFirst()  # TakeFirst() keeps only the first value

class BoBoleItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    creat_time = scrapy.Field(
        input_processor=MapCompose(data_str_date)
    )
    url_object_id = scrapy.Field()  # md5 of the article url
    list_resove = scrapy.Field()
    old_writer = scrapy.Field()
    zan_num = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    front_img_url = scrapy.Field()
    front_img_path = scrapy.Field()  # local path of the downloaded image
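A quick standalone check of what these processors actually do (a sketch, assuming it is run in the same module so data_str_date is in scope):

from scrapy.loader.processors import MapCompose, TakeFirst

# MapCompose applies data_str_date to every extracted value;
# TakeFirst (the default_output_processor above) then keeps only the first result.
dates = MapCompose(data_str_date)([' 2018/06/15 ·', ' 2018/06/16 ·'])
print(dates)               # [datetime.date(2018, 6, 15), datetime.date(2018, 6, 16)]
print(TakeFirst()(dates))  # 2018-06-15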
Pipeline: saving a JSON file with Scrapy's exporter
from scrapy.exporters import JsonLinesItemExporter
class JsonExporterPipeline(object):
    '''
    Export with the file exporters Scrapy provides; JsonLinesItemExporter is used here,
    and the other exporter types are used the same way
    '''
    # Use Scrapy's json exporter to write a json file
    def __init__(self):
        self.file = open('articleexporter.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.file, encoding='utf-8', ensure_ascii=False)  # instantiate the exporter
        self.exporter.start_exporting()

    # Stop exporting and close the file
    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    # Do the actual export
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
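As the docstring says, the other built-in exporters follow the same start/export/finish pattern. For example, swapping in CsvItemExporter would produce a CSV file instead (a sketch, not part of this project; the file name is arbitrary):

from scrapy.exporters import CsvItemExporter

class CsvExporterPipeline(object):
    # Same lifecycle as the JSON version above, just a different exporter class
    def __init__(self):
        self.file = open('articleexporter.csv', 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item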
Pipeline: saving the thumbnails
from scrapy.pipelines.images import ImagesPipeline
class ArticleImage(ImagesPipeline):
    '''
    Image pipeline that records where the thumbnail was saved
    '''
    def item_completed(self, results, item, info):
        for ok, value in results:
            if ok:  # value only carries a 'path' when the download succeeded
                item['front_img_path'] = value['path']
        return item
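If you want the image pipeline to skip very small thumbnails, Scrapy also supports minimum-size filters in settings.py (optional, not used in this project):

# Optional image filters for ImagesPipeline / ArticleImage
IMAGES_MIN_HEIGHT = 100  # drop images shorter than 100 px
IMAGES_MIN_WIDTH = 100   # drop images narrower than 100 px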
Pipeline: custom JSON export
import codecs
import json

class JsonWithEncodingPipeline(object):
    '''
    Write the scraped data to a json file with a hand-rolled export
    '''
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(lines)
        return item

    def close_spider(self, spider):  # Scrapy calls close_spider, not spider_closed
        self.file.close()
Pipeline: synchronous MySQL insert
import MySQLdb

class MysqlPipeline(object):
    '''
    Synchronous insert into MySQL; fine when the amount of scraped data is small
    '''
    def __init__(self):
        # MySQLdb.connect(host, user, password, dbname, charset='utf8', use_unicode=True)
        self.conn = MySQLdb.connect('localhost', 'root', 'root', 'spider', charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into article_bole (title, url, creat_time, url_object_id, list_resove, old_writer, zan_num, front_img_url, front_img_path)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql,
                            (
                                item['title'],
                                item['url'],
                                item['creat_time'],
                                item['url_object_id'],
                                item['list_resove'],
                                item['old_writer'],
                                item['zan_num'],
                                item['front_img_url'][0],
                                item['front_img_path'],
                            ))
        self.conn.commit()
        return item
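The insert statement assumes an article_bole table already exists in the spider database. A minimal sketch of a schema matching those nine columns, run once with the same MySQLdb connection (the column types are my assumptions, adjust them to your data):

import MySQLdb

conn = MySQLdb.connect('localhost', 'root', 'root', 'spider', charset='utf8', use_unicode=True)
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS article_bole (
        title          VARCHAR(255) NOT NULL,
        url            VARCHAR(300) NOT NULL,
        creat_time     DATE,
        url_object_id  VARCHAR(50)  NOT NULL PRIMARY KEY,
        list_resove    VARCHAR(255),
        old_writer     VARCHAR(255),
        zan_num        INT,
        front_img_url  VARCHAR(300),
        front_img_path VARCHAR(300)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()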
Pipeline: asynchronous MySQL insert
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

class MysqlTwistedPipeline(object):
    '''
    Asynchronous insert into MySQL using Twisted's adbapi connection pool.
    Configure the MySQL parameters in settings.py
    and register this class in ITEM_PIPELINES there.
    '''
    def __init__(self, dbpool):
        self.dbpool = dbpool

    # Build the pipeline from the values in settings.py
    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        '''
        Run the MySQL insert asynchronously through Twisted
        '''
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle insert errors
        return item

    def handle_error(self, failure, item, spider):
        # Error handling for the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # The actual insert, executed inside the connection pool
        insert_sql = """
            insert into article_bole (title, url, creat_time, url_object_id, list_resove, old_writer, zan_num, front_img_url, front_img_path)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        cursor.execute(insert_sql,
                       (
                           item['title'],
                           item['url'],
                           item['creat_time'],
                           item['url_object_id'],
                           item['list_resove'],
                           item['old_writer'],
                           item['zan_num'],
                           item['front_img_url'][0],
                           item['front_img_path'],
                       ))
settings.py configuration
import os

ITEM_PIPELINES = {
    'bo_bole.pipelines.BoBolePipeline': 2,
    # 'scrapy.pipelines.images.ImagesPipeline': 100,
    # 'bo_bole.pipelines.ArticleImage': 1,
    # 'bo_bole.pipelines.MysqlPipeline': 3,         # synchronous MySQL pipeline
    # 'bo_bole.pipelines.MysqlTwistedPipeline': 3,  # asynchronous MySQL pipeline
}

IMAGES_URLS_FIELD = 'front_img_url'  # which item field holds the image URLs to download
project_dir = os.path.abspath(os.path.dirname(__file__))  # base directory for downloaded images
IMAGES_STORE = os.path.join(project_dir, 'images')

# Needed by the asynchronous MySQL pipeline
MYSQL_HOST = "localhost"
MYSQL_DBNAME = "spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = 'root'
That adds up to quite a bit of code, but the functionality is fairly complete and everything above has been tested and works. Two screenshots of the results are shown below: