周期性学习率（CLR）

神经网络学习速率设置指南（CLR Callback，LRFinder，SGDR等最新的学习率设置方案）附完整代码解析
循环学习率（CLR）是一种学习率调整策略：学习率以基值（base_lr）为下界、max_lr 为上界，在两者之间周期性地来回变化。通常，周期的频率是恒定的，而振幅可以在每个周期或每个小批量迭代中动态地缩放。

from keras.callbacks import *
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input
from keras.optimizers import *
import matplotlib.pyplot as plt

'''循环学习率是学习率调整的策略，其在周期性质中将学习率从基值增加。
   通常，周期的频率是恒定的，但是振幅通常在每个周期或每个小批量迭代中动态地缩放。
    '''
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(iterations) at each
        iteration.
    For more detail, please see paper.

    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```

    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on the
            cycle number or on the iteration counter
            (training iterations since the last call to
            `_reset`). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            # Built-in policies choose both the scaling function and the
            # quantity (cycle number vs. iteration count) it is evaluated on.
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                # Cycle numbers start at 1, so the first cycle is unscaled.
                self.scale_fn = lambda x: 1 / (2. ** (x - 1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma ** (x)
                self.scale_mode = 'iterations'
            else:
                # Fail here instead of leaving scale_fn/scale_mode unset,
                # which would only surface later as an AttributeError.
                raise ValueError(
                    "Unknown mode %r; expected 'triangular', 'triangular2' "
                    "or 'exp_range', or pass a custom scale_fn." % (mode,))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        # clr_iterations restarts on _reset(); trn_iterations never does.
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        # Per-batch record: 'lr', 'iterations', plus every key Keras logs.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current value of clr_iterations."""
        # 1-based index of the current full cycle (one cycle = 2 * step_size).
        cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size))
        # Distance from the peak of the current cycle, in units of step_size:
        # 1 at the cycle boundaries, 0 at the peak.
        x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr - self.base_lr) * np.maximum(0, (1 - x)) * self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr - self.base_lr) * np.maximum(0, (1 - x)) * self.scale_fn(
                self.clr_iterations)

    def on_train_begin(self, logs=None):
        """Set the optimizer lr: base_lr on a fresh cycle, else the current CLR value."""
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the finished batch and set the lr for the next one.

        The first positional argument is named `epoch` in this signature; the
        method does not use it.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        # Read the lr before updating it, so the recorded value is the one
        # that was in effect for the batch that just finished.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())

if __name__ == '__main__':

    # One epoch is one full pass over the training set, so the number of
    # iterations (mini-batches) per epoch is np.ceil(n_samples / batch_size).
    # Below: 2,000,000 samples / batch_size 2000 = 1000 iterations per epoch,
    # i.e. 10,000 iterations over the 10 epochs of this demo.
    inp = Input(shape=(15,))
    x = Dense(10, activation='relu')(inp)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inp, x)

    # Random features and binary labels; the demo only plots the lr schedule.
    X = np.random.rand(2000000, 15)
    Y = np.random.randint(0, 2, size=2000000)

    clr_triangular = CyclicLR(mode='triangular')
    # The lr passed to SGD is replaced by the callback's base_lr when
    # training begins.
    model.compile(optimizer=SGD(0.1), loss='binary_crossentropy', metrics=['accuracy'])
    # `epochs` is the Keras 2+ name of the argument formerly called `nb_epoch`.
    model.fit(X, Y, batch_size=2000, epochs=10, callbacks=[clr_triangular], verbose=0)
    plt.figure()
    plt.plot(clr_triangular.history['iterations'], clr_triangular.history['lr'])
    plt.xlabel('Training Iterations')
    plt.ylabel('Learning Rate')
    plt.title("CLR - 'triangular' Policy")
    plt.show()

    # clr_triangular = CyclicLR(mode='triangular2')
    # model.compile(optimizer=SGD(), loss='binary_crossentropy', metrics=['accuracy'])
    # model.fit(X, Y, batch_size=2000, nb_epoch=20, callbacks=[clr_triangular], verbose=0)
    # clr_triangular._reset()
    # model.fit(X, Y, batch_size=2000, nb_epoch=10, callbacks=[clr_triangular], verbose=0)
    # plt.xlabel('Training Iterations')
    # plt.ylabel('Learning Rate')
    # plt.title("'triangular2' Policy Reset at 20000 Iterations")
    # plt.plot(clr_triangular.history['iterations'], clr_triangular.history['lr'])

这个类的参数包括：

base_lr：初始学习率，这是周期中的下限。这会覆盖优化器lr。默认值为0.001。
max_lr：循环中的上边界。在功能上，它定义了循环幅度（max_lr - base_lr）。任一周期的 lr 等于 base_lr 与经过某种缩放的幅度之和；因此，根据缩放函数的不同，max_lr 实际上可能无法达到。默认0.006。
step_size：每半个周期的训练迭代次数。作者建议设定step_size = (2-8) x (training iterations in epoch)。默认2000。
mode：其中一个{‘triangular’, ‘triangular2’, ‘exp_range’}。值对应于下面详述的策略。如果scale_fn不是None，则忽略该参数。默认’triangular’。
gamma：‘exp_range’缩放功能常数，gamma^(cycle iterations)。默认1。
scale_fn：自定义缩放策略，由只接受单个参数的 lambda 函数定义，并且对所有 x >= 0 满足 0 <= scale_fn(x) <= 1。使用此参数时，将忽略 mode 参数。默认None。
scale_mode：{‘cycle’, ‘iterations’}。定义是否scale_fn根据循环次数或循环迭代进行评估（自循环开始后的训练迭代）。默认是’cycle’。

详情可参照：https://github.com/bckenstler/CLR

寻找最优学习速率范围

神经网络学习速率设置指南（CLR Callback，LRFinder，SGDR等最新的学习率设置方案）附完整代码解析

编写一个 Keras 回调函数：让学习速率在给定范围内线性变化，并记录每个学习速率所对应的损失值。

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from keras import backend as K
from keras.callbacks import Callback,ModelCheckpoint
import matplotlib.pyplot as plt
import numpy as np

# Keras 回调函数，就是追踪与一个在确定范围内变化的线性的学习速率相搭配的损失函数。
class LRFinder(Callback):
    """Learning-rate range test callback.

    Increases the optimizer's learning rate linearly from `min_lr` towards
    `max_lr` over `steps_per_epoch * epochs` batches, recording the learning
    rate and every logged metric after each batch in `self.history`.

    # Arguments
        min_lr: lower bound of the learning-rate range.
        max_lr: upper bound of the learning-rate range.
        steps_per_epoch: number of batches per epoch. Required.
        epochs: number of epochs the experiment runs for. Required.

    # Raises
        TypeError: if `steps_per_epoch` or `epochs` is left as None.
    """

    def __init__(self, min_lr=1e-7, max_lr=1e-4, steps_per_epoch=None, epochs=None):
        super().__init__()

        # Both values are needed to size the schedule. Without this check the
        # multiplication below fails with an opaque NoneType TypeError.
        if steps_per_epoch is None or epochs is None:
            raise TypeError(
                "LRFinder requires both steps_per_epoch and epochs to be set")

        self.min_lr = min_lr
        self.max_lr = max_lr
        self.total_iterations = steps_per_epoch * epochs
        self.iteration = 0
        # Per-batch record: 'lr', 'iterations', plus every key Keras logs.
        self.history = {}

    def clr(self):
        '''Calculate the learning rate.'''
        # Fraction of the experiment completed so far.
        x = self.iteration / self.total_iterations
        return self.min_lr + (self.max_lr - self.min_lr) * x

    def on_train_begin(self, logs=None):
        '''Initialize the learning rate to the minimum value at the start of training.'''
        logs = logs or {}
        K.set_value(self.model.optimizer.lr, self.min_lr)

    def on_batch_end(self, epoch, logs=None):
        '''Record previous batch statistics and update the learning rate.

        The first positional argument is named `epoch` in this signature; the
        method does not use it.
        '''
        logs = logs or {}
        self.iteration += 1

        # Read the lr before updating it, so it pairs with this batch's logs.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.iteration)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())

    def plot_lr(self):
        '''Helper function to quickly inspect the learning rate schedule.

        Plots lr against iteration with a log-scaled y axis, saves the figure
        to "../images/images/plot_lr.png" and shows it. The target directory
        is not created here.
        '''
        plt.plot(self.history['iterations'], self.history['lr'])
        plt.yscale('log')
        plt.xlabel('Iteration')
        plt.ylabel('Learning rate')
        plt.savefig("../images/images/plot_lr.png")
        plt.show()

    def plot_loss(self):
        '''Helper function to quickly observe the learning rate experiment results.

        Plots loss against lr with a log-scaled x axis, saves the figure to
        "../images/images/plot_loss.png" and shows it. Requires 'loss' to have
        been present in the batch logs. The target directory is not created
        here.
        '''
        plt.plot(self.history['lr'], self.history['loss'])
        plt.xscale('log')
        plt.xlabel('Learning rate')
        plt.ylabel('Loss')
        plt.savefig("../images/images/plot_loss.png")
        plt.show()

if __name__ == '__main__':
    # LRFinder arguments:
    #   min_lr: The lower bound of the learning rate range for the experiment.
    #   max_lr: The upper bound of the learning rate range for the experiment.
    #   steps_per_epoch: Number of mini-batches in the dataset.
    #   epochs: Number of epochs to run the experiment; usually between 2 and 4.

    a = 200  # presumably the number of training samples -- confirm against caller
    batch_size = 40
    epochs = 3
    # True division before ceil, so a partial final batch counts as a step.
    # With `a // batch_size` the ceil had no effect.
    lr_finder = LRFinder(min_lr=1e-7, max_lr=1e-4, steps_per_epoch=np.ceil(a / batch_size),
                         epochs=epochs)

设置一个学习速率表（步衰减）

神经网络学习速率设置指南（CLR Callback，LRFinder，SGDR等最新的学习率设置方案）附完整代码解析

学习速率退火的最流行方式是「步衰减」（Step Decay），其中学习率经过一定数量的训练 epochs 后下降了一定的百分比。

import numpy as np
from keras.callbacks import LearningRateScheduler
def step_decay_schedule(initial_lr=1e-3, decay_factor=0.75, step_size=10):
    """Build a LearningRateScheduler that applies a step-decay schedule.

    The learning rate starts at `initial_lr` and is multiplied by
    `decay_factor` once for every `step_size` completed epochs.

    # Arguments
        initial_lr: learning rate used for the first `step_size` epochs.
        decay_factor: multiplier applied at each decay step.
        step_size: number of epochs between two decay steps.

    # Returns
        A `LearningRateScheduler` wrapping the schedule function.
    """
    def schedule(epoch):
        # Number of whole decay steps completed before this epoch.
        completed_steps = np.floor(epoch / step_size)
        decay = decay_factor ** completed_steps
        return initial_lr * decay

    scheduler = LearningRateScheduler(schedule)
    return scheduler

if __name__ == '__main__':
    # Build a scheduler that multiplies the lr by 0.75 every 2 epochs,
    # starting from 1e-4, and print the resulting callback object.
    decay_config = dict(initial_lr=1e-4, decay_factor=0.75, step_size=2)
    lr_sched = step_decay_schedule(**decay_config)
    print(lr_sched)

带有重启的随机梯度下降

神经网络学习速率设置指南（CLR Callback，LRFinder，SGDR等最新的学习率设置方案）附完整代码解析

带有热重启的随机梯度下降（SGDR）与周期性方法很相似：它将激进的退火策略与周期性的「重启」结合起来，每次重启时学习率回到最初的初始学习率。

class SGDRScheduler(Callback):
    '''Schedule learning rates with restarts.

    A simple restart technique for stochastic gradient descent.
    The learning rate decays after each batch along a cosine curve and
    periodically resets to its initial value. Optionally, the learning rate
    is additionally reduced by a fixed factor at a predefined set of epochs.

    # Arguments
        epochsize: Number of samples per epoch during training.
        batchsize: Number of samples per batch during training.
        epochs_to_restart: Initial number of epochs before restarts.
        mult_factor: Increase of epochs_to_restart after each restart.
        lr_fac: Decrease of learning rate at epochs given in
                lr_reduction_epochs.
        lr_reduction_epochs: Fixed list of epochs at which to reduce
                             learning rate.

    # References
        - [SGDR: Stochastic Gradient Descent with Restarts](http://arxiv.org/abs/1608.03983)
    '''

    def __init__(self,
                 epochsize,
                 batchsize,
                 epochs_to_restart=2,
                 mult_factor=2,
                 lr_fac=0.1,
                 lr_reduction_epochs=(60, 120, 160)):
        super(SGDRScheduler, self).__init__()
        # Own epoch counter, incremented in on_epoch_begin (first epoch is 0).
        self.epoch = -1
        self.batch_since_restart = 0
        # Number of completed epochs at which the next restart happens.
        self.next_restart = epochs_to_restart
        self.epochsize = epochsize
        self.batchsize = batchsize
        self.epochs_to_restart = epochs_to_restart
        self.mult_factor = mult_factor
        self.batches_per_epoch = self.epochsize / self.batchsize
        self.lr_fac = lr_fac
        self.lr_reduction_epochs = lr_reduction_epochs
        # Learning rate set after every batch, in order.
        self.lr_log = []

    def on_train_begin(self, logs=None):
        """Remember the optimizer's lr as the value each restart returns to."""
        self.lr = K.get_value(self.model.optimizer.lr)

    def on_epoch_begin(self, epoch, logs=None):
        """Advance the internal epoch counter."""
        self.epoch += 1

    def on_batch_end(self, batch, logs=None):
        """Apply cosine annealing based on progress since the last restart."""
        # Progress through the current restart period.
        fraction_to_restart = self.batch_since_restart / \
            (self.batches_per_epoch * self.epochs_to_restart)
        lr = 0.5 * self.lr * (1 + np.cos(fraction_to_restart * np.pi))
        K.set_value(self.model.optimizer.lr, lr)

        self.batch_since_restart += 1
        self.lr_log.append(lr)

    def on_epoch_end(self, epoch, logs=None):
        """Trigger a restart and/or a fixed lr reduction when due."""
        if self.epoch + 1 == self.next_restart:
            # Restart: reset progress and lengthen the next period.
            self.batch_since_restart = 0
            self.epochs_to_restart *= self.mult_factor
            self.next_restart += self.epochs_to_restart

        if (self.epoch + 1) in self.lr_reduction_epochs:
            # Lower the lr that later restarts return to.
            self.lr *= self.lr_fac

神经网络学习速率设置指南（CLR Callback，LRFinder，SGDR等最新的学习率设置方案）附完整代码解析

周期性学习率（CLR）

寻找最优学习速率范围

设置一个学习速率表（步衰减）

带有重启的随机梯度下降

相关推荐