1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
//! Provides the trainers for the Layers.
//!
//! The optimal state of a neural network would be the one where
//! for any given input to the network, it would produce an output perfectly
//! matching the target function. In that state the loss function would have its
//! [global minimum][minimum].
//! This statement can also be reversed to *if we manage to minimize
//! the loss function of the network, we map the target function*.
//!
//! We can change the way a network works by adjusting its individual
//! [weights][weight]. So to optimize the network we want to adjust
//! the weights in a way that the loss function will be minimized.
//! If we want to know how to correctly adjust a single weight,
//! we have to get to know the effect of that weight
//! on the loss function (= the *gradient*).
//! This can be done via a method called [*backpropagation*][backprop].
//!
//! There are different methods of how a Solver solves for the minimum of the
//! loss function. They mostly differ in two ways:
//!
//! - How to execute the backpropagation to compute the gradient.
//! - How to compute the weight update from the gradient.
//!
//! [layer]: ../layer/index.html
//! [loss]: ../layers/loss/index.html
//! [weight]: https://en.wikipedia.org/wiki/Synaptic_weight
//! [minimum]: http://mathworld.wolfram.com/GlobalMinimum.html
//! [backprop]: https://en.wikipedia.org/wiki/Backpropagation

#[allow(unused_import_braces)]
pub use self::sgd::Momentum;
pub mod sgd;

use crate::co::{IBackend, SharedTensor};
use crate::layer::*;
use crate::solver::*;
use crate::util::*;

trait SGDSolver<SolverB: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32>>: ISolver<SolverB, NetB> {
    fn compute_update_value(
        &mut self,
        config: &SolverConfig,
        weight_blob: &ArcLock<SharedTensor<f32>>,
        history_blob_id: usize,
        global_lr: &f32,
        blob_lr: &f32,
    );

    /// [Clip gradients][1] when they exceed [SolverConfig.clip_gradients][2].
    /// [1]: http://arxiv.org/abs/1211.5063
    /// [2]: ../solver/struct.SolverConfig.html
    ///
    /// [Gradient norm clipping][1] is a technique used when dealing with
    /// [Recurrent Neural Networks][3].
    /// When the [L2 norm][4] of the gradients exceeds a threshold it is "clipped"
    /// to that threshold. The naming can be misleading since the gradients are not
    /// actually clipped (as in cut off), but rescaled to the threshold.
    ///
    /// [3]: https://en.wikipedia.org/wiki/Recurrent_neural_network
    /// [4]: https://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm
    #[allow(unused_must_use)]
    fn clip_gradients<B: IBackend + LayerOps<f32> + 'static>(&self, config: &SolverConfig, net: &mut Layer<B>) {
        // skip clipping gradients if SolverConfig.clip_gradients is set to None
        if let Some(clip_threshold) = config.clip_gradients {
            let native = native_backend();

            let net_gradients = net.learnable_weights_gradients();
            let mut sumsq_diff = 0f32;
            let backend = self.backend();
            for net_gradient in net_gradients.clone() {
                let gradient = net_gradient.read().unwrap();
                // PERF: preallocate tensor once
                let mut result = SharedTensor::new(&[1]);
                // gradient.sumsq_diff(self.backend(), &mut result);
                self.backend().dot(&gradient, &gradient, &mut result);

                let sumsq_diff_slice = result.read(native.device()).unwrap().as_slice::<f32>();
                sumsq_diff += sumsq_diff_slice[0];
            }
            let l2norm_diff = sumsq_diff.sqrt();
            if l2norm_diff > clip_threshold {
                let scale_factor = clip_threshold / l2norm_diff;
                info!(
                    "Gradient clipping: scaling down gradients (L2 norm {} > {})
                        by scale factor {}",
                    l2norm_diff, clip_threshold, scale_factor
                );

                let mut scale_shared = native_scalar(scale_factor);

                for weight_gradient in net_gradients {
                    let mut gradient = weight_gradient.write().unwrap();
                    backend.scal(&mut scale_shared, &mut gradient);
                }
            }
        }
    }

    /// Scale the gradient to counteract the [SolverConfig.minibatch_size][1]
    /// [1]: ../solver/struct.SolverConfig.html
    ///
    /// To counteract that we are accumulating the gradients over multiple samples,
    /// we need to scale the gradients down to the equivalent of a single sample.</br>
    /// E.g. with a `minibatch_size` of 4 we need to scale the gradient by 0.25 (= 1/4).
    fn normalize(&self, config: &SolverConfig, weight_blob: &ArcLock<SharedTensor<f32>>) {
        if config.minibatch_size > 1 {
            let scale_factor = 1f32 / config.minibatch_size as f32;
            let mut gradient = weight_blob.write().unwrap();
            let native = native_backend();

            let mut scale_factor_shared = native_scalar(scale_factor);
            // self.backend().scal_plain(&scale_factor_shared, &mut gradient).unwrap();
            self.backend().scal(&mut scale_factor_shared, &mut gradient).unwrap();
        }
    }

    /// [Regularize][1] the gradient according to the configured [RegularizationMethod][2].
    /// [1]: https://cs231n.github.io/neural-networks-2/#reg
    /// [2]: ../solver/enum.RegularizationMethod.html
    fn regularize(
        &self,
        config: &SolverConfig,
        weight_gradient: &ArcLock<SharedTensor<f32>>,
        blob_weight_decay: Option<f32>,
    ) {
        if let Some(global_weight_decay) = config.weight_decay {
            if let Some(regularization_method) = config.regularization_method {
                match blob_weight_decay {
                    Some(weight_decay_mult) => {
                        let local_decay = global_weight_decay * weight_decay_mult;
                        match regularization_method {
                            RegularizationMethod::L2 => {
                                let native = native_backend();
                                let decay_shared = native_scalar(local_decay);
                                let gradient = &mut weight_gradient.write().unwrap();
                                // gradient.regularize_l2(self.backend(), &decay_shared);
                                // backend.axpy_plain(&decay_shared, &self.data, &mut self.diff).unwrap();
                                // TODO: solver
                                unimplemented!();
                            }
                        }
                    }
                    None => {
                        error!("Weight decay multiplier for gradient missing.");
                    }
                }
            }
        }
    }
}