/***************************************************************************
 *  @file gnn_dataset.c
 *  @brief Data Sets Implementation.
 *
 *  @date   : 23-08-03 21:12, 22-09-03 02:28
 *  @author : Pedro Ortega C. <peortega@dcc.uchile.cl>
 *  Copyright  2003  Pedro Ortega C.
 ****************************************************************************/
/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */



/**
 * @defgroup gnn_dataset_doc gnn_dataset : Datasets for Training.
 * @ingroup libgnn_dataset
 *
 * The \ref gnn_dataset type defines a common interface for handling pattern
 * sets.
 *
 * In \ref libgnn, a pattern is defined as a triple
 * \f[ (x, t, p) \f]
 * where \f$x \in \mathbb{R}^n\f$ is the input pattern or feature vector,
 * \f$t \in \mathbb{R}^m\f$ is the output pattern or target vector and
 * \f$p \in \mathbb{R}\f$ is the pattern weight or pattern relevance.
 *
 * Datasets are sets of \f$P\f$ training patterns, which could be used for
 * training (as mentioned) or for model validation and testing.
 * To identify a particular pattern, they are indexed by
 * \f$k=0,1, \ldots, P-1\f$ and
 * written as \f$x^k\f$, \f$t^k\f$ and \f$p_k\f$ (note that the vectors
 * carry a superscript and the scalar value a subscript). Schematically,
 * a dataset can be illustrated by the following figure:
 *
 * <img src="images/gnn_dataset1.png">
 *
 * Patterns are commonly obtained from the observation of the real system
 * to be modelled, e.g. plants or other phenomena, or sometimes they are
 * artificially constructed. In \ref libgnn, the parts of a pattern
 * (input, output, weight) can each be sampled from a different source.
 *
 * The \ref gnn_dataset and its functions provide a common interface, or
 * protocol, for different types of datasets. A dataset can be just a
 * sampler from three different sources (input, output and weight),
 * or a shuffler, or a preprocessor, etc. They just provide a logical
 * view of the underlying sample sources, and they have to manage them
 * without mixing them up.
 *
 * libgnn's design for handling datasets is very simple. A dataset, as an
 * "abstract object", has some properties and can do some things:
 *
 * - Its patterns have a fixed input and output size \f$n\f$ and \f$m\f$
 *   respectively.
 * - It contains a fixed, positive amount of patterns \f$(x^k, t^k, p_k)\f$.
 * - It can be reset. This should update its internal structure. What this
 *   means in practice depends on the particular dataset.
 * - It can retrieve a full pattern located at a specific position
 *   (given by its index).
 *
 * <b>What is the purpose of a dataset?</b>
 *
 * Datasets (as an abstraction) exist because there are many ways to
 * get samples, and there are many possible sources. For example, there
 * could be a dataset which samples its patterns from a disk, from RAM,
 * from a serial port, etc.
 * Or even worse, the three sources themselves could be
 * heterogeneous. Also, the <i>sampling method</i> could vary.
 *
 * <img src="images/gnn_dataset3.png">
 *
 * Datasets <b>can</b> (but aren't forced to) be made of \ref gnn_input s.
 * As a particular example, \ref gnn_simple_set is built upon three samplers
 * (one each for inputs, targets and weights).
 *
 * <b>What does a trainer do in order to sample from a data set?</b>
 *
 * The order in which a trainer calls the functions on a \ref gnn_dataset is
 * the one illustrated in the following flow diagram:
 *
 * <img src="images/gnn_dataset2.png">
 *
 * The important steps are marked with a bold border.
 *
 * <b>How to implement the gnn_dataset interface with a custom dataset?</b>
 *
 * It's simple. Create a new C datatype, which should contain a \ref gnn_dataset
 * structure as its first member (so that a pointer to your datatype can also
 * be used as a pointer to a \ref gnn_dataset):
 *
 \code
    typedef struct _my_dataset my_dataset;

    struct _my_dataset
    {
        gnn_dataset set; // The underlying gnn_dataset

        // Other things your dataset needs...
        ...
    };

 \endcode
 *
 * Then, implement the 3 needed functions: reset, get and destroy, conforming
 * to the calling parameter specification:
 *
 \code
    int
    my_dataset_reset (gnn_dataset *set);

    int
    my_dataset_get (gnn_dataset *set,
                    size_t       k,
                    gsl_vector **x,
                    gsl_vector **t,
                    double      *p);

    void
    my_dataset_destroy (gnn_dataset *set);
 \endcode
 *
 * And finally, create a constructor which <b>must call</b> the
 * \ref gnn_dataset_init function:
 *
 \code
    gnn_dataset *
    my_dataset_new ()
    {
        my_dataset  *myset; // a pointer to the dataset to be created
        gnn_dataset *set;   // a pointer to the same dataset, but viewed as a
                            // gnn_dataset

        // allocate memory for the dataset
        myset = (my_dataset *) malloc (sizeof (my_dataset));

        // view the new dataset as a gnn_dataset
        set = (gnn_dataset *) myset;

        // initialize the dataset
        gnn_dataset_init (set, N_OF_PATTERNS, INPUT_SIZE, TARGET_SIZE,
                          my_dataset_reset, my_dataset_get, my_dataset_destroy);

        // do other initialization your dataset might need...
        ...

        return set;
    }
 \endcode
 *
 * That's it!
00157 */ 00158 00159 00160 00161 /******************************************/ 00162 /* Include Files */ 00163 /******************************************/ 00164 00165 #include "gnn_dataset.h" 00166 #include "gnn_utilities.h" 00167 00168 00169 00170 /******************************************/ 00171 /* Static Declaration */ 00172 /******************************************/ 00173 00174 static int 00175 gnn_dataset_default_reset (gnn_dataset *set); 00176 00177 static int 00178 gnn_dataset_default_get (gnn_dataset *set, 00179 size_t i, 00180 gsl_vector **x, 00181 gsl_vector **t, 00182 double *p); 00183 00184 void 00185 gnn_dataset_default_destroy (gnn_dataset *set); 00186 00187 00188 00189 /******************************************/ 00190 /* Static Implementation */ 00191 /******************************************/ 00192 00193 /** 00194 * @brief Default "reset" function for a dataset. 00195 * @ingroup gnn_dataset_doc 00196 * 00197 * This function is the default "reset" function for a dataset. It does nothing. 00198 * 00199 * @param set A pointer to a \ref gnn_dataset. 00200 * @return 0 if succeeded. 00201 */ 00202 static int 00203 gnn_dataset_default_reset (gnn_dataset *set) 00204 { 00205 assert (set != NULL); 00206 00207 return 0; 00208 } 00209 00210 /** 00211 * @brief Default "destroy" function for a dataset. 00212 * @ingroup gnn_dataset_doc 00213 * 00214 * This function is the default "destroy" function for a dataset. It assumes 00215 * that there isn't any additional data for the specific dataset type, so 00216 * it actually just returns. 00217 * 00218 * @param set A pointer to a \ref gnn_dataset. 00219 */ 00220 void 00221 gnn_dataset_default_destroy (gnn_dataset *set) 00222 { 00223 return; 00224 } 00225 00226 00227 00228 /******************************************/ 00229 /* Public Interface */ 00230 /******************************************/ 00231 00232 /** 00233 * @brief Initializes a \ref gnn_dataset. 
 * @ingroup gnn_dataset_doc
 *
 * This function initializes a given dataset, setting its properties and
 * installing its functions.
 *
 * If the reset or destroy functions aren't provided, then the default
 * functions \ref gnn_dataset_default_reset
 * and \ref gnn_dataset_default_destroy are installed respectively. The
 * "get" function is mandatory and can't be omitted.
 *
 * As an example, suppose that you have made your own extension to the
 * \ref gnn_dataset datatype, which you called "my_dataset_type". Also,
 * suppose you have already coded the appropriate "reset", "get" and "destroy"
 * functions for your special dataset. Then,
 * \code
   my_dataset_type *myset; // a pointer to the dataset to be created
   gnn_dataset     *set;   // a pointer to the same dataset, but viewed as a
                           // gnn_dataset

   // allocate memory for the dataset
   myset = (my_dataset_type *) malloc (sizeof (my_dataset_type));

   // view the new dataset as a gnn_dataset
   set = (gnn_dataset *) myset;

   // initialize the dataset
   gnn_dataset_init (set, 100, 5, 2,
                     my_dataset_reset, my_dataset_get, my_dataset_destroy);
 * \endcode
 * would initialize your dataset for 100 patterns, whose inputs and outputs
 * are of size 5 and 2 respectively.
 *
 * @param set     A pointer to a \ref gnn_dataset.
 * @param size    The number of patterns that it contains.
 * @param n       The size of the inputs.
 * @param m       The size of the outputs.
 * @param reset   A pointer to the dataset's "reset" function.
 * @param get     A pointer to the dataset's "get" function.
 * @param destroy A pointer to the dataset's "destroy" function.
 * @return 0 if succeeded.
00271 */ 00272 int 00273 gnn_dataset_init (gnn_dataset *set, 00274 size_t size, 00275 size_t n, 00276 size_t m, 00277 gnn_dataset_reset_type reset, 00278 gnn_dataset_get_type get, 00279 gnn_dataset_destroy_type destroy) 00280 { 00281 assert (set != NULL); 00282 assert (get != NULL); 00283 00284 /* check sizes */ 00285 if (n < 1 || m < 1 || size < 0) 00286 { 00287 GSL_ERROR ("datasets should have stricly positive number of" 00288 "patterns, input and output sizes", GSL_EINVAL); 00289 } 00290 00291 /* set fields */ 00292 set->size = size; 00293 set->n = n; 00294 set->m = m; 00295 set->reset = gnn_dataset_default_reset; 00296 set->get = get; 00297 set->destroy = gnn_dataset_default_destroy; 00298 00299 /* install functions */ 00300 if (reset != NULL) 00301 set->reset = reset; 00302 if (destroy != NULL) 00303 set->destroy = destroy; 00304 00305 return 0; 00306 } 00307 00308 00309 /** 00310 * @brief Destroy a dataset. 00311 * @ingroup gnn_dataset_doc 00312 * 00313 * This function destroys the dataset. 00314 * 00315 * @param set A pointer to a \ref gnn_dataset. 00316 */ 00317 void 00318 gnn_dataset_destroy (gnn_dataset *set) 00319 { 00320 assert (set != NULL); 00321 00322 set->destroy (set); 00323 free (set); 00324 } 00325 00326 /** 00327 * @brief Reset a dataset. 00328 * @ingroup gnn_dataset_doc 00329 * 00330 * This function resets the dataset. 00331 * 00332 * @param set A pointer to a \ref gnn_dataset. 00333 * @return 0 if succeeded. 00334 */ 00335 int 00336 gnn_dataset_reset (gnn_dataset *set) 00337 { 00338 assert (set != NULL); 00339 00340 set->reset (set); 00341 } 00342 00343 /** 00344 * @brief Gets the i-th pattern. 00345 * @ingroup gnn_dataset_doc 00346 * 00347 * This function returns pointers to the pattern located atstores the dataset's i-th pattern into the buffers "x" and "t" 00348 * (which should be both gsl_vector of the correct size) and its corresponding 00349 * weight into the location pointed by "weight". 
00350 * 00351 * Note that what "the i-th pattern" means depends on the underlying 00352 * implementation. Also, i should be within 0 and the dataset's size. 00353 * 00354 * @param set A pointer to a \ref gnn_dataset. 00355 * @param i The index of the pattern to be retrieved. 00356 * @param x A pointer to the gsl_vector where the input pattern should 00357 * be placed in. 00358 * @param t A pointer to the gsl_vector where the output pattern should 00359 * be placed in. 00360 * @param weight A pointer to a double where the pattern's weight should be 00361 * placed in. 00362 * @return 0 if succeeded. 00363 */ 00364 int 00365 gnn_dataset_get (gnn_dataset *set, 00366 size_t k, 00367 gsl_vector **x, 00368 gsl_vector **t, 00369 double *weight) 00370 { 00371 assert (set != NULL); 00372 00373 set->get (set, k, x, t, weight); 00374 } 00375 00376 /** 00377 * @brief Gets the size of the dataset. 00378 * @ingroup gnn_dataset_doc 00379 * 00380 * This function returns the dataset's size. 00381 * 00382 * @param set A pointer to a \ref gnn_dataset. 00383 * @return Returns the size. 00384 */ 00385 size_t 00386 gnn_dataset_get_size (gnn_dataset *set) 00387 { 00388 assert (set != NULL); 00389 00390 return set->size; 00391 } 00392 00393 /** 00394 * @brief Gets the input size of the dataset. 00395 * @ingroup gnn_dataset_doc 00396 * 00397 * This function returns the size of the pattern's input vector size. 00398 * 00399 * @param set A pointer to a \ref gnn_dataset. 00400 * @return Returns the size. 00401 */ 00402 size_t 00403 gnn_dataset_input_get_size (gnn_dataset *set) 00404 { 00405 assert (set != NULL); 00406 00407 return set->n; 00408 } 00409 00410 /** 00411 * @brief Gets the output size of the dataset. 00412 * @ingroup gnn_dataset_doc 00413 * 00414 * This function returns the size of the pattern's output vector size. 00415 * 00416 * @param set A pointer to a \ref gnn_dataset. 00417 * @return Returns the size. 
00418 */ 00419 size_t 00420 gnn_dataset_output_get_size (gnn_dataset *set) 00421 { 00422 assert (set != NULL); 00423 00424 return set->m; 00425 } 00426 00427
1.2.18