
Commit d9d3e80

blester125 authored and neubig committed
Fix glorot initialization for convolutional kernels (#1420)
* fix glorot initialization for convs
* Update Docs
* Updated benchmark results
1 parent 4948b5c commit d9d3e80

File tree: 4 files changed, +16 -4 lines changed


dynet/param-init.cc

Lines changed: 11 additions & 2 deletions
@@ -25,8 +25,17 @@ void ParameterInitIdentity::initialize_params(Tensor & values) const {
 
 void ParameterInitGlorot::initialize_params(Tensor & values) const {
   int dims = 0, dim_len = values.d.nd - (lookup ? 1 : 0);
-  for (int i = 0; i < dim_len; ++i) dims += values.d[i];
-  float my_scale = gain * sqrt(3 * dim_len) / sqrt(dims);
+  float my_scale = 0.0;
+  if (dim_len == 4) {
+    // For a convolution the parameter tensor is (H, W, In, Out).
+    int receptive_field = values.d[0] * values.d[1];
+    // Other frameworks compute m + n by multiplying each by the kernel size.
+    dims = values.d[2] * receptive_field + values.d[3] * receptive_field;
+    my_scale = gain * sqrt(6) / sqrt(dims);
+  } else {
+    for (int i = 0; i < dim_len; ++i) dims += values.d[i];
+    my_scale = gain * sqrt(3 * dim_len) / sqrt(dims);
+  }
   TensorTools::randomize_uniform(values, -my_scale, my_scale);
 }
 
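
For reference, a small Python sketch of the new scale computation (glorot_scale is a hypothetical stand-alone helper, not part of DyNet); it mirrors the two branches of initialize_params above:

import math

def glorot_scale(shape, gain=1.0, lookup=False):
    """Half-width of the uniform Glorot sampling range for a tensor shape."""
    dim_len = len(shape) - (1 if lookup else 0)
    if dim_len == 4:
        # Convolutional kernel laid out as (H, W, In, Out): both fan-in and
        # fan-out are scaled by the receptive field H * W.
        receptive_field = shape[0] * shape[1]
        dims = shape[2] * receptive_field + shape[3] * receptive_field
        return gain * math.sqrt(6) / math.sqrt(dims)
    # Generic case: sum the dimensions, as before this commit.
    dims = sum(shape[:dim_len])
    return gain * math.sqrt(3 * dim_len) / math.sqrt(dims)

# A 3x3 kernel with 16 input and 32 output channels:
# dims = 16*9 + 32*9 = 432, so the bound is sqrt(6/432) ~= 0.118.
print(glorot_scale((3, 3, 16, 32)))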

dynet/param-init.h

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ struct ParameterInitIdentity : public ParameterInit {
  * \ingroup params
  * \brief Initialize with the methods described in [Glorot, 2010](http://www.jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf?hc_location=ufi)
  * \details In order to preserve the variance of the forward and backward flow across layers, the parameters \f$\theta\f$ are initialized such that \f$\mathrm{Var}(\theta)=\frac 2 {n_1+n_2}\f$ where \f$n_1,n_2\f$ are the input and output dim.
+ * \details In the case of 4d tensors (common in convolutional networks) of shape \f$XH,XW,XC,N\f$ the weights are sampled from \f$\mathcal U([-g\sqrt{\frac 6 {d}},g\sqrt{\frac 6 {d}}])\f$ where \f$d = XC * (XH * XW) + N * (XH * XW)\f$
  * Important note : The underlying distribution is uniform (not gaussian)
  *
  * *Note:* This is also known as **Xavier initialization**
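
A worked instance of the 4d formula, with illustrative sizes not taken from this commit: a \f$3 \times 3\f$ kernel with 16 input maps and 32 output maps gives

d = 16 \cdot (3 \cdot 3) + 32 \cdot (3 \cdot 3) = 432, \qquad g\sqrt{\tfrac{6}{d}} = \sqrt{\tfrac{6}{432}} \approx 0.118 \quad (g = 1),

so the weights are drawn uniformly from roughly \f$[-0.118, 0.118]\f$.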

examples/mnist/basic-mnist-benchmarks/README.md

Lines changed: 2 additions & 2 deletions
@@ -67,5 +67,5 @@ Batch size: 64, learning rate: 0.01.
 | OS | Device | Framework | Speed | Accuracy (After 20 Epochs)|
 | --- | --- | --- | --- | --- |
 | Ubuntu 16.04 | GeForce GTX 1080 Ti | PyTorch | ~ 4.49±0.11 s per epoch | 98.95% |
-| Ubuntu 16.04 | GeForce GTX 1080 Ti | DyNet (autobatch) | ~ 8.58±0.09 s per epoch | 99.14% |
-| Ubuntu 16.04 | GeForce GTX 1080 Ti | DyNet (minibatch) | ~ 4.13±0.13 s per epoch | 99.16% |
+| Ubuntu 16.04 | GeForce GTX 1080 Ti | DyNet (autobatch) | ~ 8.58±0.09 s per epoch | 98.98% |
+| Ubuntu 16.04 | GeForce GTX 1080 Ti | DyNet (minibatch) | ~ 4.13±0.13 s per epoch | 98.99% |

python/_dynet.pyx

Lines changed: 2 additions & 0 deletions
@@ -519,6 +519,8 @@ cdef class GlorotInitializer(PyInitializer):
 
     If the dimensions of the parameter matrix are :math:`m,n`, the weights are sampled from :math:`\mathcal U([-g\sqrt{\\frac{6}{m+n}},g\sqrt{\\frac{6}{m+n}}])`
 
+    In the case of 4d tensors (common in convolutional networks) of shape :math:`XH,XW,XC,N` the weights are sampled from :math:`\mathcal U([-g\sqrt{\\frac{6}{d}},g\sqrt{\\frac{6}{d}}])` where :math:`d = XC * (XH * XW) + N * (XH * XW)`
+
     The gain :math:`g` depends on the activation function :
 
     * :math:`\\text{tanh}` : 1.0
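
A usage sketch against the Python bindings (assuming DyNet is installed as the dynet package; the 4d shape below is an illustrative conv kernel, not taken from this commit):

import dynet as dy

m = dy.ParameterCollection()
# (H, W, In, Out) conv kernel: GlorotInitializer now folds the 3x3
# receptive field into the fan-in + fan-out sum when picking the range.
W = m.add_parameters((3, 3, 16, 32), init=dy.GlorotInitializer(gain=1.0))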
