神经网络学习速率设置指南(CLR Callback,LRFinder,SGDR等最新的学习率设置方案)附完整代码解析
周期性学习率(CLR)
循环学习率(CLR)是一种学习率调整策略:它让学习率从基础值(下界)出发,在下界与上界之间周期性地往返变化。通常,周期的频率是恒定的,但振幅可以在每个周期或每次小批量迭代时动态缩放。
from keras.callbacks import *
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input
from keras.optimizers import *
import matplotlib.pyplot as plt
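'''Cyclical learning rate (CLR) is a learning-rate schedule that moves the rate
back and forth between a base value and an upper bound.
The cycle frequency is usually constant, while the amplitude can be rescaled
on every cycle or on every mini-batch iteration.
'''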
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).

    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper
    (https://arxiv.org/abs/1506.01186). The amplitude of the cycle can be
    scaled on a per-iteration or per-cycle basis.

    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each
        cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(iterations) at each
        iteration, where iterations are counted since the last reset.
    For more detail, please see paper.

    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                           step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```

    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                           step_size=2000., scale_fn=clr_fn,
                           scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```

    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where
            0 <= scale_fn(x) <= 1 for all x >= 0.
            When given, the mode parameter is ignored.
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on the
            cycle number or on the iteration counter
            (training iterations since the last reset).
            Only used together with a custom scale_fn; the built-in
            modes pick their own scale_mode. Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            built-in policies.

    # Attributes
        history: dict of lists. 'lr' and 'iterations' are appended after
            every batch, together with every key found in the batch logs.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1 / (2. ** (x - 1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma ** (x)
                self.scale_mode = 'iterations'
            else:
                # Without this guard scale_fn would stay undefined and the
                # failure would only surface as an AttributeError mid-training.
                raise ValueError(
                    "mode must be one of 'triangular', 'triangular2' or "
                    "'exp_range' when scale_fn is None, got %r" % (mode,))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.trn_iterations = 0.
        self.history = {}
        # _reset() initialises clr_iterations to 0.
        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.

        Optional boundary/step size adjustment: every argument that is not
        None replaces the corresponding attribute. trn_iterations and
        history are left untouched.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current value of clr_iterations."""
        # 1-based index of the cycle we are in; one cycle is 2 * step_size.
        cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size))
        # Distance from the cycle's peak, in units of step_size.
        x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1)
        amplitude = (self.max_lr - self.base_lr) * np.maximum(0, (1 - x))
        if self.scale_mode == 'cycle':
            scale_arg = cycle
        else:
            scale_arg = self.clr_iterations
        return self.base_lr + amplitude * self.scale_fn(scale_arg)

    def on_train_begin(self, logs=None):
        """Set the optimizer lr to base_lr, or to clr() when resuming."""
        logs = logs or {}
        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the lr used so far and install the next one.

        The first positional argument is unused (Keras passes the batch
        index in this position); its name is kept for compatibility.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1
        # Log the lr that was in effect before it is overwritten below.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)
        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        K.set_value(self.model.optimizer.lr, self.clr())
if __name__ == '__main__':
    # Demo: train a tiny binary classifier on random data and plot the
    # learning rate that CyclicLR applied at every iteration.
    #
    # One epoch is one full pass over the training set, so the number of
    # iterations per epoch is np.ceil(n_samples / batch_size).
    # Here: 2,000,000 samples / batch_size 2000 = 1000 iterations per epoch,
    # i.e. 10,000 iterations over the 10 epochs below.
    inp = Input(shape=(15,))
    x = Dense(10, activation='relu')(inp)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inp, x)
    X = np.random.rand(2000000, 15)
    Y = np.random.randint(0, 2, size=2000000)
    clr_triangular = CyclicLR(mode='triangular')
    model.compile(optimizer=SGD(0.1), loss='binary_crossentropy', metrics=['accuracy'])
    # `epochs` replaces the legacy Keras 1 keyword `nb_epoch`.
    model.fit(X, Y, batch_size=2000, epochs=10, callbacks=[clr_triangular], verbose=0)
    plt.figure()
    plt.plot(clr_triangular.history['iterations'], clr_triangular.history['lr'])
    plt.xlabel('Training Iterations')
    plt.ylabel('Learning Rate')
    plt.title("CLR - 'triangular' Policy")
    plt.show()
    # Alternative demo: 'triangular2' policy with a manual reset in between.
    # clr_triangular = CyclicLR(mode='triangular2')
    # model.compile(optimizer=SGD(), loss='binary_crossentropy', metrics=['accuracy'])
    # model.fit(X, Y, batch_size=2000, epochs=20, callbacks=[clr_triangular], verbose=0)
    # clr_triangular._reset()
    # model.fit(X, Y, batch_size=2000, epochs=10, callbacks=[clr_triangular], verbose=0)
    # plt.xlabel('Training Iterations')
    # plt.ylabel('Learning Rate')
    # plt.title("'triangular2' Policy Reset at 20000 Iterations")
    # plt.plot(clr_triangular.history['iterations'], clr_triangular.history['lr'])
这个类的参数包括:
base_lr:初始学习率,也是周期中的下边界。它会覆盖优化器的 lr。默认值为 0.001。
max_lr:周期中的上边界。在功能上,它定义了周期的振幅(max_lr - base_lr)。任意时刻的 lr 等于 base_lr 加上按某个比例缩放后的振幅;因此,取决于缩放函数,实际可能达不到 max_lr。默认值为 0.006。
step_size:每半个周期的训练迭代次数。作者建议设定 step_size = (2-8) x (每个 epoch 的训练迭代次数)。默认值为 2000。
mode:{'triangular', 'triangular2', 'exp_range'} 之一,取值对应代码文档字符串中详述的策略。如果 scale_fn 不是 None,则忽略该参数。默认值为 'triangular'。
gamma:'exp_range' 缩放函数中的常数,即 gamma^(cycle iterations)。默认值为 1。
scale_fn:自定义缩放策略,由单参数的 lambda 函数定义,要求对所有 x >= 0 满足 0 <= scale_fn(x) <= 1。使用此参数时,mode 参数将被忽略。默认值为 None。
scale_mode:{'cycle', 'iterations'}。定义 scale_fn 是按周期编号还是按周期迭代次数(自周期开始以来的训练迭代次数)求值。默认值为 'cycle'。
详情可参照:https://github.com/bckenstler/CLR
寻找最优学习速率范围
编写一个 Keras 回调函数:让学习速率在给定范围内线性变化,并记录每个学习速率所对应的损失值。
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from keras import backend as K
from keras.callbacks import Callback,ModelCheckpoint
import matplotlib.pyplot as plt
import numpy as np
# Keras callback that sweeps the learning rate linearly over a fixed range and records the loss observed at each step.
class LRFinder(Callback):
    """Learning-rate range test: sweep the lr linearly and log the loss.

    The optimizer's learning rate starts at min_lr and is moved linearly
    towards max_lr after every batch, reaching max_lr after
    steps_per_epoch * epochs batches. The lr in effect, the iteration
    number and every key of the batch logs are stored in `history`.

    # Arguments
        min_lr: lower bound of the learning rate range.
        max_lr: upper bound of the learning rate range.
        steps_per_epoch: number of mini-batches per epoch. Required.
        epochs: number of epochs the experiment runs for. Required.

    # Raises
        TypeError: if steps_per_epoch or epochs is not given.
        ValueError: if steps_per_epoch * epochs is not positive.
    """

    def __init__(self, min_lr=1e-7, max_lr=1e-4, steps_per_epoch=None, epochs=None):
        super().__init__()
        # Both values are needed to know how long the sweep is; fail with a
        # clear message instead of an opaque "NoneType * NoneType" error.
        if steps_per_epoch is None or epochs is None:
            raise TypeError(
                "LRFinder requires both `steps_per_epoch` and `epochs`")
        total_iterations = steps_per_epoch * epochs
        if total_iterations <= 0:
            # clr() divides by this value.
            raise ValueError(
                "steps_per_epoch * epochs must be positive, got %r"
                % (total_iterations,))
        self.min_lr = min_lr
        self.max_lr = max_lr
        self.total_iterations = total_iterations
        self.iteration = 0
        self.history = {}

    def clr(self):
        """Calculate the learning rate for the current iteration."""
        # Fraction of the sweep completed so far.
        x = self.iteration / self.total_iterations
        return self.min_lr + (self.max_lr - self.min_lr) * x

    def on_train_begin(self, logs=None):
        """Initialize the learning rate to the minimum value at the start of training."""
        logs = logs or {}
        K.set_value(self.model.optimizer.lr, self.min_lr)

    def on_batch_end(self, epoch, logs=None):
        """Record previous batch statistics and update the learning rate.

        The first positional argument is unused; its name is kept for
        compatibility.
        """
        logs = logs or {}
        self.iteration += 1
        # Log the lr that was in effect before it is overwritten below.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.iteration)
        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        K.set_value(self.model.optimizer.lr, self.clr())

    def plot_lr(self, save_path="../images/images/plot_lr.png"):
        """Helper function to quickly inspect the learning rate schedule.

        # Arguments
            save_path: where the figure is written before it is shown;
                pass None to skip saving.
        """
        plt.plot(self.history['iterations'], self.history['lr'])
        plt.yscale('log')
        plt.xlabel('Iteration')
        plt.ylabel('Learning rate')
        if save_path is not None:
            plt.savefig(save_path)
        plt.show()

    def plot_loss(self, save_path="../images/images/plot_loss.png"):
        """Helper function to quickly observe the learning rate experiment results.

        Plots the recorded 'loss' values against the learning rate.

        # Arguments
            save_path: where the figure is written before it is shown;
                pass None to skip saving.
        """
        plt.plot(self.history['lr'], self.history['loss'])
        plt.xscale('log')
        plt.xlabel('Learning rate')
        plt.ylabel('Loss')
        if save_path is not None:
            plt.savefig(save_path)
        plt.show()
if __name__ == '__main__':
    # LRFinder arguments:
    #   min_lr: The lower bound of the learning rate range for the experiment.
    #   max_lr: The upper bound of the learning rate range for the experiment.
    #   steps_per_epoch: Number of mini-batches in the dataset.
    #   epochs: Number of epochs to run experiment. Usually between 2 and 4
    #       epochs is sufficient.
    a = 200  # presumably the number of training samples -- confirm
    batch_size = 40
    epochs = 3
    # True division before ceil so a trailing partial batch is counted;
    # `a // batch_size` would already have floored the value.
    lr_finder = LRFinder(min_lr=1e-7, max_lr=1e-4,
                         steps_per_epoch=int(np.ceil(a / batch_size)),
                         epochs=epochs)
设置一个学习速率表(步衰减)
学习速率退火的最流行方式是「步衰减」(Step Decay),其中学习率经过一定数量的训练 epochs 后下降了一定的百分比。
import numpy as np
from keras.callbacks import LearningRateScheduler
def step_decay_schedule(initial_lr=1e-3, decay_factor=0.75, step_size=10):
    """Create a LearningRateScheduler that applies a step-decay schedule.

    The learning rate for a given epoch is
    initial_lr * decay_factor ** floor(epoch / step_size).

    # Arguments
        initial_lr: learning rate used while epoch < step_size.
        decay_factor: multiplier applied once per completed step.
        step_size: number of epochs in one step.

    # Returns
        A LearningRateScheduler wrapping the schedule function.
    """
    def _lr_for_epoch(epoch):
        # Number of whole steps completed before this epoch.
        completed_steps = np.floor(epoch / step_size)
        return initial_lr * (decay_factor ** completed_steps)

    return LearningRateScheduler(_lr_for_epoch)
if __name__ == '__main__':
    # Demo: build a scheduler that decays the lr by 0.75 every 2 epochs
    # and print the resulting callback object.
    lr_sched = step_decay_schedule(
        initial_lr=1e-4,
        decay_factor=0.75,
        step_size=2,
    )
    print(lr_sched)
带有重启的随机梯度下降
带有热重启的随机梯度下降(SGDR)与周期性方法很相似:它把一个激进的退火调度与周期性的「重启」结合起来,每次重启时将学习率恢复到初始学习率。
class SGDRScheduler(Callback):
    '''Schedule learning rates with restarts

    A simple restart technique for stochastic gradient descent.
    The learning rate decays after each batch and periodically resets to its
    initial value. Optionally, the learning rate is additionally reduced by a
    fixed factor at a predefined set of epochs.

    After every batch the optimizer lr is set to
        0.5 * lr * (1 + cos(pi * batches_since_restart /
                            (batches_per_epoch * epochs_to_restart)))
    where lr is the optimizer's learning rate read at the start of training
    (times lr_fac for every reduction epoch already passed).

    # Arguments
        epochsize: Number of samples per epoch during training.
        batchsize: Number of samples per batch during training.
        epochs_to_restart: Initial number of epochs before restarts.
        mult_factor: Increase of epochs_to_restart after each restart.
        lr_fac: Decrease of learning rate at epochs given in
                lr_reduction_epochs.
        lr_reduction_epochs: Fixed list of epochs at which to reduce
                             learning rate.

    # Attributes
        lr_log: list of every learning rate value set by this callback.

    # References
        - [SGDR: Stochastic Gradient Descent with Restarts](http://arxiv.org/abs/1608.03983)
    '''

    def __init__(self,
                 epochsize,
                 batchsize,
                 epochs_to_restart=2,
                 mult_factor=2,
                 lr_fac=0.1,
                 lr_reduction_epochs=(60, 120, 160)):
        super(SGDRScheduler, self).__init__()
        # Own epoch counter; incremented in on_epoch_begin, so it is 0
        # during the first epoch.
        self.epoch = -1
        self.batch_since_restart = 0
        # Epoch count (1-based) at whose end the next restart happens.
        self.next_restart = epochs_to_restart
        self.epochsize = epochsize
        self.batchsize = batchsize
        self.epochs_to_restart = epochs_to_restart
        self.mult_factor = mult_factor
        self.batches_per_epoch = self.epochsize / self.batchsize
        self.lr_fac = lr_fac
        self.lr_reduction_epochs = lr_reduction_epochs
        self.lr_log = []

    def on_train_begin(self, logs=None):
        """Remember the optimizer's current lr as the restart value."""
        self.lr = K.get_value(self.model.optimizer.lr)

    def on_epoch_begin(self, epoch, logs=None):
        """Advance the internal epoch counter (the `epoch` argument is unused)."""
        self.epoch += 1

    def on_batch_end(self, batch, logs=None):
        """Apply cosine annealing based on the batches seen since the last restart."""
        # Position inside the current restart period: 0 right after a restart.
        fraction_to_restart = self.batch_since_restart / \
            (self.batches_per_epoch * self.epochs_to_restart)
        lr = 0.5 * self.lr * (1 + np.cos(fraction_to_restart * np.pi))
        K.set_value(self.model.optimizer.lr, lr)
        self.batch_since_restart += 1
        self.lr_log.append(lr)

    def on_epoch_end(self, epoch, logs=None):
        """Trigger a restart and/or a base-lr reduction when their epoch is reached."""
        if self.epoch + 1 == self.next_restart:
            # Restart: reset the batch counter and lengthen the next period.
            self.batch_since_restart = 0
            self.epochs_to_restart *= self.mult_factor
            self.next_restart += self.epochs_to_restart
        if (self.epoch + 1) in self.lr_reduction_epochs:
            self.lr *= self.lr_fac