00001 /*************************************************************************** 00002 * @file gnn_hessian.c 00003 * @brief Hessian Matrix Evaluation Routines Implementation. 00004 * 00005 * @date : 19-09-03 15:37 00006 * @author : Pedro Ortega C. <peortega@dcc.uchile.cl> 00007 * Copyright 2003 Pedro Ortega C. 00008 ****************************************************************************/ 00009 /* 00010 * This program is free software; you can redistribute it and/or modify 00011 * it under the terms of the GNU General Public License as published by 00012 * the Free Software Foundation; either version 2 of the License, or 00013 * (at your option) any later version. 00014 * 00015 * This program is distributed in the hope that it will be useful, 00016 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00018 * GNU Library General Public License for more details. 00019 * 00020 * You should have received a copy of the GNU General Public License 00021 * along with this program; if not, write to the Free Software 00022 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 00023 */ 00024 00025 00026 00027 /** 00028 * @brief Evaluation of Hessian Matrix. 00029 * @defgroup gnn_hessian_doc Hessian Matrix Evaluation Routines. 00030 * @ingroup libgnn_node 00031 * 00032 * The routines presented in this module deal with the computation of the 00033 * second derivatives of the error, given by 00034 * 00035 * \f[ \frac{\partial^2 E}{\partial w_i \partial w_j} \f] 00036 * 00037 * where both indexes run over all parameters \f$i,j=1, \ldots, l\f$. These 00038 * elements form the Hessian Matrix, which plays a very important role in 00039 * several aspects of neural computing, like, per example: 00040 * - Some parameter optimization algorithms use the information of the 00041 * error surface's second-order properties. 00042 * - Pruning algorithms use the Hessian in order to determine the relevance 00043 * of the parameters. 00044 * Although the Hessian Matrix does hold important information about a 00045 * modelling function, its computation can be very expensive, both in 00046 * time and memory requirements. 00047 * 00048 * It is important to note that some of these algorithms assume special 00049 * error functions. Nevertheless, sometimes this assumption does not 00050 * heavilly contradict the situation encountered in e.g. pruning procedures 00051 * using another cost function. 00052 */ 00053 00054 00055 00056 /******************************************/ 00057 /* Include Files */ 00058 /******************************************/ 00059 00060 #include <gsl/gsl_blas.h> 00061 #include "gnn_evaluation.h" 00062 #include "gnn_hessian.h" 00063 #include <math.h> 00064 00065 00066 00067 /******************************************/ 00068 /* Public Interface */ 00069 /******************************************/ 00070 00071 /** 00072 * @brief Fast multiplication by the Hessian matrix. 00073 * @ingroup gnn_hessian_doc 00074 * 00075 * Some applications do not require the Hessian explicitally. Instead, 00076 * what really is needed is a fast method for computing \f$Hv\f$, where 00077 * \f$H\f$ is the Hessian matrix (which is assumed to be symmetric, so that 00078 * \f$Hv = v^TH\f$) and 00079 * \f$v\f$ is a vector. This numerical method computes this multiplication 00080 * using central differences: 00081 * 00082 * \f[ v^T H 00083 * = \frac{1}{2\epsilon} 00084 * \left [ 00085 * \frac{\partial E}{\partial w}(w + \epsilon v) - 00086 * \frac{\partial E}{\partial w}(w - \epsilon v) 00087 * \right ] 00088 * \f] 00089 * 00090 * where \f$\epsilon\f$ is a small value that gives the required precision, 00091 * since the error resulting from this approximation is \f$O(\epsilon^2)\f$. 00092 * 00093 * @param grad A pointer to a \ref gnn_grad buffer. 00094 * @param v A pointer to the vector \f$v\f$. 00095 * @param vH A pointer to a vector of size \f$l \times 1\f$, 00096 * where \f$l\f$ is the number of parameters involved 00097 * in the \ref gnn_node. The resulting multiplication 00098 * will be stored in this vector. 00099 * @param eps The required precision \f$\epsilon\f$. 00100 * @return Returns 0 if succeeded. 00101 */ 00102 int 00103 gnn_hessian_vector_mult (gnn_line *line, 00104 const gsl_vector *v, gsl_vector *vH, double eps) 00105 { 00106 size_t j; 00107 gnn_grad *grad; 00108 00109 assert (line != NULL); 00110 assert (v != NULL); 00111 assert (vH != NULL); 00112 assert (v != vH); 00113 00114 /* get gnn_grad */ 00115 grad = gnn_line_get_grad (line); 00116 00117 /* check sizes */ 00118 if (grad->l != v->size) 00119 { 00120 GSL_ERROR ("the vector's size should match the number of parameters", 00121 GSL_EINVAL); 00122 } 00123 if (grad->l != vH->size) 00124 { 00125 GSL_ERROR ("the result's size should match the number of parameters", 00126 GSL_EINVAL); 00127 } 00128 if (grad->P == GNN_INFTY) 00129 { 00130 GSL_ERROR ("the dataset should be finite", GSL_EINVAL); 00131 } 00132 00133 /* check precision */ 00134 if (eps <= 0.0) 00135 { 00136 GSL_ERROR ("eps should be stricly positive", GSL_EINVAL); 00137 } 00138 00139 /* set direction */ 00140 gnn_line_set_direction (line, v); 00141 00142 /* compute gradient step forward */ 00143 gnn_line_all (line, eps, gnnLineDE); 00144 gsl_vector_memcpy (vH, GNN_GRAD_DW (grad)); 00145 00146 /* compute gradient step backward */ 00147 gnn_line_all (line, -eps, gnnLineDE); 00148 gsl_vector_sub (vH, GNN_GRAD_DW (grad)); 00149 00150 /* finish approximation */ 00151 gsl_vector_scale (vH, 1 / (2*eps)); 00152 00153 /* restore original parameters */ 00154 gnn_line_all (line, 0.0, gnnLineE); 00155 00156 return 0; 00157 } 00158 00159 /** 00160 * @brief Finite differences diagonal Hessian matrix approximation. 00161 * @ingroup gnn_hessian_doc 00162 * 00163 * This numerical efficient method computes the diagonal values of the Hessian 00164 * matrix using the central differentes method, given by 00165 * 00166 * \f[ \frac{\partial^2 E}{\partial w_i^2} 00167 * = \frac{1}{2\epsilon} 00168 * \left [ 00169 * \frac{\partial E}{\partial w_i}(w_i + \epsilon) - 00170 * \frac{\partial E}{\partial w_i}(w_i - \epsilon) 00171 * \right ] 00172 * \f] 00173 * 00174 * where \f$\epsilon\f$ is a small value that gives the required precision, 00175 * since the error resulting from this approximation is \f$O(\epsilon^2)\f$. 00176 * 00177 * The same as above written in matrix notation is: 00178 * 00179 * \f[ H = \frac{1}{2\epsilon} 00180 * \left [ g(w + \epsilon I) - g(w - \epsilon I) \right ] 00181 * \f] 00182 * 00183 * where \f$H\f$ is the Hessian matrix and 00184 * \f$g(w_0) = \frac{\partial E}{\partial w}(w_0)\f$. Note that this 00185 * requires a single forward and backward step for evaluation, since 00186 * the assumption of the diagonal Hessian is equivalent to an uncorrelated 00187 * parameter space. 00188 * 00189 * @param grad A pointer to a \ref gnn_grad buffer. 00190 * @param H A pointer to a vector of size \f$l \times 1\f$, 00191 * where \f$l\f$ is the number of parameters involved 00192 * in the \ref gnn_node. The resulting diagonal terms of the 00193 * Hessian will be stored in this vector. 00194 * @param eps The required precision \f$\epsilon\f$. 00195 * @return Returns 0 if succeeded. 00196 */ 00197 int 00198 gnn_hessian_diagonal (gnn_line *line, gsl_vector *H, double eps) 00199 { 00200 gnn_grad *grad; 00201 gsl_vector *d; 00202 00203 assert (line != NULL); 00204 assert (H != NULL); 00205 00206 /* get gnn_grad */ 00207 grad = gnn_line_get_grad (line); 00208 00209 /* check sizes */ 00210 if (grad->l != H->size) 00211 { 00212 GSL_ERROR ("the Hessian's size should be l, with l the number " 00213 "of parameters", GSL_EINVAL); 00214 } 00215 if (grad->P == GNN_INFTY) 00216 { 00217 GSL_ERROR ("the dataset should be finite", GSL_EINVAL); 00218 } 00219 00220 /* check precision */ 00221 if (eps <= 0.0) 00222 { 00223 GSL_ERROR ("eps should be stricly positive", GSL_EINVAL); 00224 } 00225 00226 /* compute direction */ 00227 d = (gsl_vector *) gnn_line_get_direction (line); 00228 gsl_vector_set_all (d, 1.0); 00229 00230 /* compute gradient step forward */ 00231 gnn_line_all (line, eps, gnnLineDE); 00232 gsl_vector_memcpy (H, GNN_GRAD_DW (grad)); 00233 00234 /* compute gradient step backward */ 00235 gnn_line_all (line, -eps, gnnLineDE); 00236 gsl_vector_sub (H, GNN_GRAD_DW (grad)); 00237 00238 /* finish approximation */ 00239 gsl_vector_scale (H, 1 / (2*eps)); 00240 00241 /* restore original parameters */ 00242 gnn_line_all (line, 0.0, gnnLineE); 00243 00244 return 0; 00245 } 00246 00247 /** 00248 * @brief Initializes the Levenberg-Marquadt Hessian approximation. 00249 * @ingroup gnn_hessian_doc 00250 * 00251 * This function initializes the Hessian for the Levenberg-Marquadt 00252 * approximation. 00253 * 00254 * For further details, see \ref gnn_hessian_levenberg_marquadt. 00255 * 00256 * @param grad A pointer to a \ref gnn_grad buffer. 00257 * @param H A pointer to a matrix of size \f$l \times l\f$, 00258 * where \f$l\f$ is the number of parameters involved 00259 * in the \ref gnn_node. The resulting Hessian will be 00260 * stored in this matrix. 00261 * @return Returns 0 if succeeded. 00262 */ 00263 int 00264 gnn_hessian_levenberg_marquadt_init (gnn_grad *grad, gsl_matrix *H) 00265 { 00266 assert (grad != NULL); 00267 assert (H != NULL); 00268 00269 /* check sizes */ 00270 if (grad->l != H->size1 || grad->l != H->size2) 00271 { 00272 GSL_ERROR ("the Hessian's size should be l x l, with l the number " 00273 "of parameters", GSL_EINVAL); 00274 } 00275 if (grad->P == GNN_INFTY) 00276 { 00277 GSL_ERROR ("the dataset should be finite", GSL_EINVAL); 00278 } 00279 00280 /* clear Hessian */ 00281 gsl_matrix_set_zero (H); 00282 00283 return 0; 00284 } 00285 00286 /** 00287 * @brief Levenberg-Marquadt Hessian approximation. 00288 * @ingroup gnn_hessian_doc 00289 * 00290 * This function computes the k-th iteration of the Levenberg-Marquadt Hessian 00291 * approximation, using the k-th pattern. 00292 * 00293 * The Levenberg-Marquadt approximation (also known as the outer product 00294 * approximation) of the Hessian matrix is given by 00295 * 00296 * \f[ \frac{\partial^2 E}{\partial w_i \partial w_j} 00297 * = \sum_{k=1}^P \frac{\partial y^k}{\partial w_i} 00298 * \frac{\partial y^k}{\partial w_j} 00299 * \f] 00300 * 00301 * where the sum runs over all pattern outputs \f$y^k\f$, \f$k=1,\ldots,P\f$. 00302 * It is very important to note that this approximation is only valid for 00303 * an error function of the form 00304 * \f[ E = \frac{1}{2} \sum_{k=1}^P (y^k - t^k)^2, \f] 00305 * that is, for an sum-of-squares error function (and its equivalent forms). 00306 * 00307 * The previous formula can be written in matrix form, giving the sequential 00308 * procedure for building the Hessian: 00309 * 00310 * \f[ H_P = \sum_{k=1}^P \frac{\partial E}{\partial w}^k 00311 * \frac{\partial E}{\partial w}^k 00312 * \f] 00313 * 00314 * which is the way used by \ref libgnn. 00315 * 00316 * @warning Evaluation??? Shouldn't it be handled completely by a gnn_eval 00317 * or gnn_grad?? 00318 * 00319 * @param grad A pointer to a \ref gnn_grad buffer. 00320 * @param H A pointer to a matrix of size \f$l \times l\f$, 00321 * where \f$l\f$ is the number of parameters involved 00322 * in the \ref gnn_node. The resulting Hessian will be 00323 * stored in this matrix. 00324 * @param k The index of the pattern to be used. 00325 * @return Returns 0 if succeeded. 00326 */ 00327 int 00328 gnn_hessian_levenberg_marquadt (gnn_grad *grad, gsl_matrix *H, size_t k) 00329 { 00330 double p; 00331 gsl_vector *x; 00332 gsl_vector *t; 00333 gsl_matrix_view G; 00334 00335 assert (grad != NULL); 00336 assert (H != NULL); 00337 00338 /* check sizes */ 00339 if (grad->l != H->size1 || grad->l != H->size2) 00340 { 00341 GSL_ERROR ("the Hessian's size should be l x l, with l the number " 00342 "of parameters", GSL_EINVAL); 00343 } 00344 if (grad->P == GNN_INFTY) 00345 { 00346 GSL_ERROR ("the dataset should be finite", GSL_EINVAL); 00347 } 00348 00349 /* check batch bounds */ 00350 if (k < 0 || k >= grad->P) 00351 { 00352 GSL_ERROR ("pattern index out of bounds", GSL_EINVAL); 00353 } 00354 00355 /* get pattern */ 00356 gnn_dataset_get (grad->data, k, &x, &t, &p); 00357 00358 /* compute gradient */ 00359 gnn_node_evaluate_init (grad->node); 00360 gnn_node_evaluate_f (grad->node, x, grad->y); 00361 grad->e = gnn_criterion_evaluate_e (grad->crit, grad->y, t); 00362 gnn_criterion_evaluate_dy (grad->crit, grad->dy); 00363 gnn_node_evaluate_dx (grad->node, grad->dy, grad->dx); 00364 gnn_node_evaluate_dw (grad->node, grad->dw); 00365 00366 /* scale dw gradient */ 00367 gsl_vector_scale (grad->dw, p); 00368 00369 /* get matrix view from gradient */ 00370 G = gsl_matrix_view_vector (grad->dw, grad->l, 1); 00371 00372 /* compute g*gT and sum to Hessian */ 00373 gsl_blas_dgemm (CblasNoTrans, CblasTrans, 00374 1.0, &(G.matrix), &(G.matrix), 1.0, H); 00375 00376 return 0; 00377 } 00378 00379 /** 00380 * @brief Levenberg-Marquadt Hessian Inverse approximation initialization. 00381 * @ingroup gnn_hessian_doc 00382 * 00383 * This function initializes the outer product inverse approximation routine. 00384 * For further details, please refer to \ref gnn_hessian_inverse. 00385 * 00386 * @param grad A pointer to a \ref gnn_grad buffer. 00387 * @param H A pointer to a matrix of size \f$l \times l\f$, 00388 * where \f$l\f$ is the number of parameters involved 00389 * in the \ref gnn_node. The resulting inverse Hessian will be 00390 * stored in this matrix. 00391 * @param alpha A small value for \f$\alpha\f$. 00392 * @return Returns 0 if succeeded. 00393 */ 00394 int 00395 gnn_hessian_inverse_init (gnn_grad *grad, gsl_matrix *H, double alpha) 00396 { 00397 assert (grad != NULL); 00398 assert (H != NULL); 00399 00400 /* check sizes */ 00401 if (grad->l != H->size1 || grad->l != H->size2) 00402 { 00403 GSL_ERROR ("the Hessian's size should be l x l, with l the number " 00404 "of parameters", GSL_EINVAL); 00405 } 00406 if (grad->P == GNN_INFTY) 00407 { 00408 GSL_ERROR ("the dataset should be finite", GSL_EINVAL); 00409 } 00410 00411 /* clear Hessian */ 00412 gsl_matrix_set_identity (H); 00413 gsl_matrix_scale (H, alpha); 00414 00415 return 0; 00416 } 00417 00418 /** 00419 * @brief Levenberg-Marquadt Hessian Inverse approximation. 00420 * @ingroup gnn_hessian_doc 00421 * 00422 * This function computes the k-th iteration of the outer-product inverse 00423 * Hessian approximation, using the k-th pattern. 00424 * 00425 * The outer product approximation used in \ref gnn_hessian_levenberg_marquadt 00426 * can be used to develop an efficient procedure for computing the inverse 00427 * of the Hessian matrix. 00428 * 00429 * Recalling the formula for the outer product approximation 00430 * 00431 * \f[ H_P = \sum_{k=1}^P g^k (g^k)^T \f] 00432 * 00433 * where \f$g^k = \frac{\partial E}{\partial w}^k\f$, it is easy to see that 00434 * this leads to a sequential form for computing the Hessian: 00435 * 00436 * \f[ H_{N+1} = H_{N} + g^{N+1} (g^{N+1})^T \f] 00437 * 00438 * Using the following matrix identity 00439 * 00440 * \f[ (A + BC)^{-1} = A^{-1} - A^{-1}B(I+CA^{-1}B)^{-1}CA^{-1} \f] 00441 * 00442 * where \f$I\f$ is the identity matrix, and setting \f$H_{N} = A\f$, 00443 * \f$g^{N+1}=B\f$ and \f$(g^{N+1})^T=C\f$, then the previous formula 00444 * can be rewritten in order to obtain an iterative procedure for building 00445 * the inverse of the Hessian: 00446 * 00447 * \f[ 00448 * H^{-1}_{N+1} = H^{-1}_N - 00449 * \frac{ H^{-1}_N g^{N+1} (g^{N+1})^T H^{-1}_N } 00450 * { 1 + (g^{N+1})^T H^{-1}_N g^{N+1} } 00451 * \f] 00452 * 00453 * The procedure can be initialized by \f$ H_0 = \alpha I \f$, where 00454 * \f$\alpha\f$ is a small value (because the procedure actually seeks 00455 * to approximate \f$ H + \alpha I \f$). 00456 * 00457 * It is important to note that, as in the case for 00458 * \ref gnn_hessian_levenberg_marquadt, this approximation is only valid 00459 * for a sum-of-squares error function. 00460 * 00461 * @warning Evaluation??? Shouldn't it be handled completely by a gnn_eval 00462 * or gnn_grad?? 00463 * 00464 * @param grad A pointer to a \ref gnn_grad buffer. 00465 * @param H A pointer to a matrix of size \f$l \times l\f$, 00466 * where \f$l\f$ is the number of parameters involved 00467 * in the \ref gnn_node. The resulting inverse Hessian will be 00468 * stored in this matrix. 00469 * @param k The index of the pattern to be used. 00470 * @return Returns 0 if succeeded. 00471 */ 00472 int 00473 gnn_hessian_inverse (gnn_grad *grad, gsl_matrix *H, size_t k) 00474 { 00475 double p; 00476 gsl_vector *x; 00477 gsl_vector *t; 00478 gsl_matrix_view Hg; 00479 00480 assert (grad != NULL); 00481 assert (H != NULL); 00482 00483 /* check sizes */ 00484 if (grad->l != H->size1 || grad->l != H->size2) 00485 { 00486 GSL_ERROR ("the Hessian's size should be l x l, with l the number " 00487 "of parameters", GSL_EINVAL); 00488 } 00489 if (grad->P == GNN_INFTY) 00490 { 00491 GSL_ERROR ("the dataset should be finite", GSL_EINVAL); 00492 } 00493 00494 /* check batch bounds */ 00495 if (k < 0 || k >= grad->P) 00496 { 00497 GSL_ERROR ("pattern index out of bounds", GSL_EINVAL); 00498 } 00499 00500 /* get pattern */ 00501 gnn_dataset_get (grad->data, k, &x, &t, &p); 00502 00503 /* compute gradient */ 00504 gnn_node_evaluate_init (grad->node); 00505 gnn_node_evaluate_f (grad->node, x, grad->y); 00506 grad->e = gnn_criterion_evaluate_e (grad->crit, grad->y, t); 00507 gnn_criterion_evaluate_dy (grad->crit, grad->dy); 00508 gnn_node_evaluate_dx (grad->node, grad->dy, grad->dx); 00509 gnn_node_evaluate_dw (grad->node, grad->dw); 00510 00511 /* scale dw gradient */ 00512 gsl_vector_scale (grad->dw, p); 00513 00514 /* COMPUTE FORMULA */ 00515 { 00516 double v; 00517 00518 /* h = H^-1 * g */ 00519 gsl_blas_dgemv (CblasNoTrans, 1.0, H, grad->dw, 0.0, grad->h); 00520 00521 /* get matrix view from vector */ 00522 Hg = gsl_matrix_view_vector (grad->h, grad->l, 1); 00523 00524 /* v = 1 + g^T * h */ 00525 gsl_blas_ddot (grad->dw, grad->dw, &v); 00526 v = 1.0 + v; 00527 00528 /* compute (h*h^T / v) and sum to Hessian */ 00529 gsl_blas_dgemm (CblasNoTrans, CblasTrans, 00530 1.0, &(Hg.matrix), &(Hg.matrix), -1.0 / v, H); 00531 } 00532 00533 return 0; 00534 } 00535 00536 /** 00537 * @brief Finite differences Hessian matrix approximation. 00538 * @ingroup gnn_hessian_doc 00539 * 00540 * This numerical efficient method computes the values of the Hessian matrix 00541 * using the central differentes method, given by 00542 * 00543 * \f[ \frac{\partial^2 E}{\partial w_i \partial w_j} 00544 * = \frac{1}{2\epsilon} 00545 * \left [ 00546 * \frac{\partial E}{\partial w_i}(w_j + \epsilon) - 00547 * \frac{\partial E}{\partial w_i}(w_j - \epsilon) 00548 * \right ] 00549 * \f] 00550 * 00551 * where \f$\epsilon\f$ is a small value that gives the required precision, 00552 * since the error resulting from this approximation is \f$O(\epsilon^2)\f$. 00553 * 00554 * @param line A pointer to a \ref gnn_line buffer. 00555 * @param H A pointer to a matrix of size \f$l \times l\f$, 00556 * where \f$l\f$ is the number of parameters involved 00557 * in the \ref gnn_node. The resulting Hessian will be 00558 * stored in this matrix. 00559 * @param eps The required precision \f$\epsilon\f$. 00560 * @return Returns 0 if succeeded. 00561 */ 00562 int 00563 gnn_hessian_finite_differences (gnn_line *line, gsl_matrix *H, double eps) 00564 { 00565 size_t j; 00566 gnn_grad *grad; 00567 00568 assert (line != NULL); 00569 assert (H != NULL); 00570 00571 /* get gnn_grad */ 00572 grad = gnn_line_get_grad (line); 00573 00574 /* check sizes */ 00575 if (grad->l != H->size1 || grad->l != H->size2) 00576 { 00577 GSL_ERROR ("the Hessian's size should be l x l, with l the number " 00578 "of parameters", GSL_EINVAL); 00579 } 00580 if (grad->P == GNN_INFTY) 00581 { 00582 GSL_ERROR ("the dataset should be finite", GSL_EINVAL); 00583 } 00584 00585 /* check precision */ 00586 if (eps <= 0.0) 00587 { 00588 GSL_ERROR ("eps should be stricly positive", GSL_EINVAL); 00589 } 00590 00591 /* compute Hessian */ 00592 for (j=0; j<grad->l; ++j) 00593 { 00594 gsl_vector *d; 00595 gsl_vector_view Hj; 00596 00597 /* compute direction */ 00598 d = (gsl_vector *) gnn_line_get_direction (line); 00599 gsl_vector_set_all (d, 0.0); 00600 gsl_vector_set (d, j, 1.0); 00601 00602 /* get Hessian row view */ 00603 Hj = gsl_matrix_row (H, j); 00604 00605 /* compute gradient step forward */ 00606 gnn_line_all (line, eps, gnnLineDE); 00607 gsl_vector_memcpy (&(Hj.vector), GNN_GRAD_DW (grad)); 00608 00609 /* compute gradient step backward */ 00610 gnn_line_all (line, -eps, gnnLineDE); 00611 gsl_vector_sub (&(Hj.vector), GNN_GRAD_DW (grad)); 00612 00613 /* finish approximation */ 00614 gsl_vector_scale (&(Hj.vector), 1 / (2*eps)); 00615 } 00616 00617 /* recover original parameters */ 00618 gnn_line_all (line, 0.0, gnnGradE); 00619 00620 return 0; 00621 } 00622 00623 00624 00625
1.2.18