OpenNN  2.2
Open Neural Networks Library
data_set.h
1 /****************************************************************************************************************/
2 /* */
3 /* OpenNN: Open Neural Networks Library */
4 /* www.artelnics.com/opennn */
5 /* */
6 /* D A T A S E T C L A S S H E A D E R */
7 /* */
8 /* Roberto Lopez */
9 /* Artelnics - Making intelligent use of data */
11 /* */
12 /****************************************************************************************************************/
13 
14 #ifndef __DATASET_H__
15 #define __DATASET_H__
16 
17 // System includes
18 
19 #include <iostream>
20 #include <fstream>
21 #include <string>
22 #include <sstream>
23 #include <cmath>
24 #include <algorithm>
25 #include <cstdlib>
26 #include <stdexcept>
27 #include <ctime>
28 #include <exception>
29 
30 // OpenNN includes
31 
32 #include "vector.h"
33 #include "matrix.h"
34 
35 #include "missing_values.h"
36 #include "variables.h"
37 #include "instances.h"
38 
39 // TinyXml includes
40 
41 #include "../tinyxml2/tinyxml2.h"
42 
43 namespace OpenNN
44 {
45 
49 
/// Data set class.
/// Declares access to a data matrix (see get_data) together with the settings used to
/// read it from a data file: file name, header line, separator, missing values label,
/// lags number, autoassociation flag and angular variables.
/// NOTE(review): the member index that accompanies this listing names further members
/// (e.g. header_line, separator, autoassociation, angular_variables, data, variables,
/// instances, missing_values) whose declarations do not appear in the listing below;
/// confirm against the real header before relying on this listing as complete.
50 class DataSet
51 {
52 
53 public:
54 
55  // DEFAULT CONSTRUCTOR
56 
57  explicit DataSet(void);
58 
59  // DATA CONSTRUCTOR
60 
    /// Constructs a data set from a matrix of doubles.
61  explicit DataSet(const Matrix<double>&);
62 
63  // VARIABLES AND INSTANCES CONSTRUCTOR
64 
    /// NOTE(review): the order of the two size arguments (variables vs. instances)
    /// cannot be told from this declaration - confirm against data_set.cpp.
65  explicit DataSet(const size_t&, const size_t&);
66 
67  // INPUTS, TARGETS AND INSTANCES NUMBERS CONSTRUCTOR
68 
    /// NOTE(review): argument order is not visible here - confirm against data_set.cpp.
69  explicit DataSet(const size_t&, const size_t&, const size_t&);
70 
71  // XML CONSTRUCTOR
72 
    /// Constructs a data set from a TinyXML document.
73  explicit DataSet(const tinyxml2::XMLDocument&);
74 
75  // FILE CONSTRUCTOR
76 
    /// Constructs a data set from a string; presumably a file name - TODO confirm.
77  explicit DataSet(const std::string&);
78 
79  // COPY CONSTRUCTOR
80 
81  DataSet(const DataSet&);
82 
83  // DESTRUCTOR
84 
    /// Destructor. Virtual, so the class may be deleted through a base pointer.
85  virtual ~DataSet(void);
86 
87  // ASSIGNMENT OPERATOR
88 
89  DataSet& operator = (const DataSet&);
90 
91  // EQUAL TO OPERATOR
92 
93  bool operator == (const DataSet&) const;
94 
95  // ENUMERATIONS
96 
98 
    /// Enumeration of available separators for the data file.
99  enum Separator{Space, Tab, Comma, Semicolon};
100 
102 
    /// Enumeration of available methods for scaling and unscaling the data.
103  enum ScalingUnscalingMethod{MinimumMaximum, MeanStandardDeviation};
104 
106 
    /// Enumeration of the units used for angular variables.
107  enum AngularUnits{Radians, Degrees};
108 
109  // METHODS
110 
111  // Get methods
112 
    /// Returns the name of the data file.
113  const std::string& get_data_file_name(void) const;
114 
    /// Returns true if the first line of the data file has a header with the names of the variables.
115  const bool& get_header_line(void) const;
116 
    /// Returns the separator to be used in the data file.
117  const Separator& get_separator(void) const;
    /// Returns the string which will be used as separator in the data file.
118  std::string get_separator_string(void) const;
    /// Returns the string which will be used as separator in the data file.
119  std::string write_separator(void) const;
120 
    /// Returns the string which will be used as label for the missing values in the data file.
121  const std::string& get_missing_values_label(void) const;
122 
    /// Returns the number of lags to be used in a time series prediction application.
123  const size_t& get_lags_number(void) const;
    /// Presumably returns the autoassociation flag - TODO confirm.
124  const bool& get_autoassociation(void) const;
125 
    /// Presumably returns the indices of the angular variables - TODO confirm.
126  const Vector<size_t>& get_angular_variables(void) const;
    /// Returns the units used for the angular variables (Radians or Degrees).
127  const AngularUnits& get_angular_units(void) const;
128 
    /// Static helper mapping a string to a ScalingUnscalingMethod value.
    /// NOTE(review): accepted strings and error behaviour are not visible here.
129  static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string&);
130 
    /// Returns a reference to the missing values object in the data set.
131  const MissingValues& get_missing_values(void) const;
133 
    /// Returns a constant reference to the variables object composing this data set object.
134  const Variables& get_variables(void) const;
136 
    /// Returns a constant reference to the instances object composing this data set object.
137  const Instances& get_instances(void) const;
139 
    /// Presumably returns the flag controlling display of messages to screen - TODO confirm.
140  const bool& get_display(void) const;
141 
142  // Data methods
143 
    /// Returns true if the data matrix is empty, and false otherwise.
144  bool empty(void) const;
145 
    /// Returns a constant reference to a matrix of doubles; presumably the data matrix.
146  const Matrix<double>& get_data(void) const;
147 
151 
154 
161 
162  // Instance methods
163 
    // Both overloads return a vector of doubles by value. The second takes an extra
    // vector of indices; presumably the variables to pick - TODO confirm.
164  Vector<double> get_instance(const size_t&) const;
165  Vector<double> get_instance(const size_t&, const Vector<size_t>&) const;
166 
167  // Variable methods
168 
    // Same shape as the instance getters above; the extra vector of indices presumably
    // selects instances - TODO confirm.
169  Vector<double> get_variable(const size_t&) const;
170  Vector<double> get_variable(const size_t&, const Vector<size_t>&) const;
171 
172  // Set methods
173 
    // The set overloads mirror the argument lists of the constructors declared above.
    /// Sets zero instances and zero variables in the data set.
174  void set(void);
175  void set(const Matrix<double>&);
176  void set(const size_t&, const size_t&);
177  void set(const size_t&, const size_t&, const size_t&);
178  void set(const DataSet&);
179  void set(const tinyxml2::XMLDocument&);
180  void set(const std::string&);
181 
182  // Data methods
183 
184  void set_data(const Matrix<double>&);
185 
186  void set_instances_number(const size_t&);
187  void set_variables_number(const size_t&);
188 
189  void set_data_file_name(const std::string&);
190 
    /// Sets if the data file contains a header with the names of the variables.
191  void set_header_line(const bool&);
192 
    // The separator can be given either as an enumeration value or as a string.
193  void set_separator(const Separator&);
194  void set_separator(const std::string&);
195 
196  void set_missing_values_label(const std::string&);
197 
198  void set_lags_number(const size_t&);
199  void set_autoassociation(const bool&);
200 
203 
204  // Utilities
205 
206  void set_display(const bool&);
207 
208  void set_default(void);
209 
210  // Instance methods
211 
212  void set_instance(const size_t&, const Vector<double>&);
213 
214  // Data resizing methods
215 
216  void add_instance(const Vector<double>&);
217  void subtract_instance(const size_t&);
218 
219  void append_variable(const Vector<double>&);
220  void subtract_variable(const size_t&);
221 
224 
225  // Initialization methods
226 
227  void initialize_data(const double&);
228 
    // Defaults: uniform range [-1.0, 1.0]; normal with mean 0.0 and standard deviation 1.0.
229  void randomize_data_uniform(const double& minimum = -1.0, const double& maximum = 1.0);
230  void randomize_data_normal(const double& mean = 0.0, const double& standard_deviation = 1.0);
231 
232  // Statistics methods
233 
235 
237 
241 
244 
248 
249  // Correlation methods
250 
252 
253  // Histogram methods
254 
    /// Returns one histogram per entry of the returned vector; the argument defaults to 10.
    /// NOTE(review): the argument is presumably the number of bins - TODO confirm.
255  Vector< Histogram<double> > calculate_data_histograms(const size_t& = 10) const;
256 
257  // Filtering methods
258 
260 
261 
262  // Data scaling
263 
266 
269 
    // The string argument presumably names a ScalingUnscalingMethod - TODO confirm.
    // One overload takes the statistics as input, the other returns a vector of statistics.
270  void scale_data(const std::string&, const Vector< Statistics<double> >&);
271 
272  Vector< Statistics<double> > scale_data(const std::string&);
273 
274  // Input variables scaling
275 
278 
281 
282  Vector< Statistics<double> > scale_inputs(const std::string&);
283  void scale_inputs(const std::string&, const Vector< Statistics<double> >&);
284 
285  // Target variables scaling
286 
289 
292 
293  Vector< Statistics<double> > scale_targets(const std::string&);
294  void scale_targets(const std::string&, const Vector< Statistics<double> >&);
295 
296  // Data unscaling
297 
300 
301  // Input variables unscaling
302 
305 
306  // Target variables unscaling
307 
310 
311  // Pattern recognition methods
312 
314 
316 
317  void balance_data(const double&);
318 
320 
321  // Serialization methods
322 
    /// Returns a string representation of the current data set object.
323  std::string to_string(void) const;
324 
    /// Prints to the screen in text format the members of the data set object.
325  void print(void) const;
    /// Prints to the screen in text format the main numbers from the data set object.
326  void print_summary(void) const;
327 
    /// Serializes the data set object into an XML document of the TinyXML library.
    /// NOTE(review): returns a raw pointer; ownership is not stated here - confirm who deletes it.
328  tinyxml2::XMLDocument* to_XML(void) const;
329  void from_XML(const tinyxml2::XMLDocument&);
330 
331  void save(const std::string&) const;
332  void load(const std::string&);
333 
    /// Prints to the screen the values of the data matrix.
334  void print_data(void) const;
335  void print_data_preview(void) const;
336 
    /// Saves to the data file the values of the data matrix.
337  void save_data(void) const;
338 
339  bool has_data(void) const;
340 
341  // Data load methods
342 
    /// Loads the data file.
343  void load_data(void);
344 
346 
348 
349  void convert_time_series(void);
350  void convert_autoassociation(void);
351 
352  void convert_angular_variable_degrees(const size_t&);
353  void convert_angular_variable_radians(const size_t&);
354 
357 
358  void convert_angular_variables(void);
359 
360  // Missing values
361 
    /// Sets all the instances with missing values to "Unused".
362  void scrub_missing_values_unuse(void);
    /// Substitutes all the missing values by the mean of the corresponding variable.
363  void scrub_missing_values_mean(void);
364  void scrub_missing_values(void);
365 
366  // String utilities
367 
    // NOTE(review): count_tokens takes a non-const reference while the other string
    // utilities take const references; whether it modifies its argument is not visible here.
368  size_t count_tokens(std::string&) const;
369 
370  Vector<std::string> get_tokens(const std::string&) const;
371 
372  bool is_numeric(const std::string&) const;
373 
    // trim takes its argument by non-const reference; get_trimmed returns a new string.
374  void trim(std::string&) const;
375 
376  std::string get_trimmed(const std::string&) const;
377 
378  std::string prepend(const std::string&, const std::string&) const;
379 
380  // Vector string utilities
381 
382  bool is_numeric(const Vector<std::string>&) const;
383  bool is_not_numeric(const Vector<std::string>&) const;
384  bool is_mixed(const Vector<std::string>&) const;
385 
386 private:
387 
388  // MEMBERS
389 
391 
    /// Data file name.
392  std::string data_file_name;
393 
395 
397 
399 
401 
403 
    /// Missing values label.
404  std::string missing_values_label;
405 
407 
    /// Number of lags.
408  size_t lags_number;
409 
411 
413 
415 
417 
419 
421 
425 
427 
429 
431 
433 
435 
437 
439 
441 
    /// Display messages to screen.
442  bool display;
443 
444  // METHODS
445 
    // NOTE(review): the index argument is passed by value here, unlike the
    // const size_t& convention used by the other declarations in this class.
446  size_t get_column_index(const Vector< Vector<std::string> >&, const size_t) const;
447 
448  void check_separator(const std::string&) const;
449 
450  size_t count_data_file_columns_number(void) const;
451  void check_header_line(void);
453 
454  void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&);
455 
458 
459 };
460 
461 }
462 
463 #endif
464 
465 // OpenNN: Open Neural Networks Library.
466 // Copyright (c) 2005-2015 Roberto Lopez.
467 //
468 // This library is free software; you can redistribute it and/or
469 // modify it under the terms of the GNU Lesser General Public
470 // License as published by the Free Software Foundation; either
471 // version 2.1 of the License, or any later version.
472 //
473 // This library is distributed in the hope that it will be useful,
474 // but WITHOUT ANY WARRANTY; without even the implied warranty of
475 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
476 // Lesser General Public License for more details.
477 
478 // You should have received a copy of the GNU Lesser General Public
479 // License along with this library; if not, write to the Free Software
480 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
void unscale_inputs_minimum_maximum(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2404
void from_XML(const tinyxml2::XMLDocument &)
Definition: data_set.cpp:2601
void randomize_data_normal(const double &mean=0.0, const double &standard_deviation=1.0)
Definition: data_set.cpp:2469
Separator
Enumeration of available separators for the data file.
Definition: data_set.h:99
void save(const std::string &) const
Definition: data_set.cpp:2845
Variables * get_variables_pointer(void)
Returns a pointer to the variables object composing this data set object.
Definition: data_set.cpp:212
Vector< Vector< std::string > > set_from_data_file(void)
Definition: data_set.cpp:3344
void unscale_data_minimum_maximum(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2376
bool has_data(void) const
Definition: data_set.cpp:4051
Matrix< double > arrange_training_input_data(void) const
Definition: data_set.cpp:589
void set_data(const Matrix< double > &)
Definition: data_set.cpp:1025
Matrix< double > calculate_linear_correlations(void) const
Definition: data_set.cpp:1769
void scrub_missing_values_mean(void)
Substitutes all the missing values by the mean of the corresponding variable.
Definition: data_set.cpp:4380
void add_instance(const Vector< double > &)
Definition: data_set.cpp:1307
void set(void)
Sets zero instances and zero variables in the data set.
Definition: data_set.cpp:813
Vector< Statistics< double > > calculate_targets_statistics(void) const
Definition: data_set.cpp:1705
const Separator & get_separator(void) const
Returns the separator to be used in the data file.
Definition: data_set.cpp:315
void set_default(void)
Definition: data_set.cpp:999
void unscale_data_mean_standard_deviation(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2363
void convert_angular_variable_degrees(const size_t &)
Definition: data_set.cpp:4142
std::string write_separator(void) const
Returns the string which will be used as separator in the data file.
Definition: data_set.cpp:372
const Variables & get_variables(void) const
Returns a constant reference to the variables object composing this data set object.
Definition: data_set.cpp:202
Vector< size_t > angular_variables
Indices of angular variables.
Definition: data_set.h:416
void set_data_file_name(const std::string &)
Definition: data_set.cpp:1078
void read_instance(const std::string &, const Vector< Vector< std::string > > &, const size_t &)
Definition: data_set.cpp:3210
std::string data_file_name
Data file name.
Definition: data_set.h:392
size_t get_column_index(const Vector< Vector< std::string > > &, const size_t) const
Definition: data_set.cpp:2992
Matrix< double > arrange_target_data(void) const
Definition: data_set.cpp:572
const std::string & get_missing_values_label(void) const
Returns the string which will be used as label for the missing values in the data file...
Definition: data_set.cpp:419
Vector< size_t > unuse_repeated_instances(void)
Definition: data_set.cpp:1497
void unscale_inputs_mean_standard_deviation(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2389
bool operator==(const DataSet &) const
Definition: data_set.cpp:179
bool is_not_numeric(const Vector< std::string > &) const
Definition: data_set.cpp:4641
AngularUnits angular_units
Units of angular variables.
Definition: data_set.h:420
void balance_data(const double &)
Definition: data_set.cpp:3896
const Vector< size_t > & get_angular_variables(void) const
Definition: data_set.cpp:452
void append_variable(const Vector< double > &)
Definition: data_set.cpp:1377
size_t count_data_file_columns_number(void) const
Definition: data_set.cpp:3045
void initialize_data(const double &)
Definition: data_set.cpp:2447
void set_missing_values_label(const std::string &)
Definition: data_set.cpp:1146
Matrix< double > arrange_testing_target_data(void) const
Definition: data_set.cpp:669
Matrix< double > arrange_input_data(void) const
Definition: data_set.cpp:555
void print_data(void) const
Prints to the screen the values of the data matrix.
Definition: data_set.cpp:2903
void print_summary(void) const
Prints to the screen in text format the main numbers from the data set object.
Definition: data_set.cpp:2824
std::string get_separator_string(void) const
Returns the string which will be used as separator in the data file.
Definition: data_set.cpp:325
ScalingUnscalingMethod
Enumeration of available methods for scaling and unscaling the data.
Definition: data_set.h:103
void set_angular_variables(const Vector< size_t > &)
Definition: data_set.cpp:1201
void check_separator(const std::string &) const
Definition: data_set.cpp:3018
bool is_numeric(const std::string &) const
Definition: data_set.cpp:4542
void print_data_preview(void) const
Definition: data_set.cpp:2917
const MissingValues & get_missing_values(void) const
Returns a reference to the missing values object in the data set.
Definition: data_set.cpp:275
Vector< double > calculate_training_target_data_mean(void) const
Returns the mean values of the target variables on the training instances.
Definition: data_set.cpp:1719
bool empty(void) const
Returns true if the data matrix is empty, and false otherwise.
Definition: data_set.cpp:253
void scrub_missing_values_unuse(void)
Sets all the instances with missing values to "Unused".
Definition: data_set.cpp:4365
Vector< Statistics< double > > scale_inputs_mean_standard_deviation(void)
Definition: data_set.cpp:2036
Vector< Statistics< double > > scale_targets(const std::string &)
Definition: data_set.cpp:2289
Vector< Histogram< double > > calculate_data_histograms(const size_t &=10) const
Definition: data_set.cpp:1558
MissingValues * get_missing_values_pointer(void)
Returns a pointer to the missing values object in the data set.
Definition: data_set.cpp:285
std::string prepend(const std::string &, const std::string &) const
Definition: data_set.cpp:4607
void convert_angular_variables(void)
Definition: data_set.cpp:4329
tinyxml2::XMLDocument * to_XML(void) const
Serializes the data set object into an XML document of the TinyXML library.
Definition: data_set.cpp:2479
const bool & get_header_line(void) const
Returns true if the first line of the data file has a header with the names of the variables...
Definition: data_set.cpp:305
void print(void) const
Prints to the screen in text format the members of the data set object.
Definition: data_set.cpp:2811
std::string to_string(void) const
Returns a string representation of the current data set object.
Definition: data_set.cpp:2788
void set_display(const bool &)
Definition: data_set.cpp:986
Separator separator
Separator character.
Definition: data_set.h:400
void scale_data(const std::string &, const Vector< Statistics< double > > &)
Definition: data_set.cpp:1944
const std::string & get_data_file_name(void) const
Returns the name of the data file.
Definition: data_set.cpp:295
Vector< std::string > arrange_autoassociation_names(const Vector< std::string > &) const
Definition: data_set.cpp:3594
Vector< Statistics< double > > scale_data_mean_standard_deviation(void)
Definition: data_set.cpp:1879
Vector< Statistics< double > > scale_inputs(const std::string &)
Definition: data_set.cpp:2118
bool autoassociation
Autoassociation flag.
Definition: data_set.h:412
Vector< double > get_instance(const size_t &) const
Definition: data_set.cpp:684
void read_from_data_file(const Vector< Vector< std::string > > &)
Performs a second data file read in which the data is set.
Definition: data_set.cpp:3497
void unscale_targets_mean_standard_deviation(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2419
DataSet & operator=(const DataSet &)
Definition: data_set.cpp:143
void set_header_line(const bool &)
Sets if the data file contains a header with the names of the variables.
Definition: data_set.cpp:1088
void set_autoassociation(const bool &)
Definition: data_set.cpp:1190
MissingValues missing_values
Missing values object.
Definition: data_set.h:438
void balance_target_class_distribution(void)
Definition: data_set.cpp:3998
size_t count_tokens(std::string &) const
Definition: data_set.cpp:4446
const AngularUnits & get_angular_units(void) const
Returns the units used for the angular variables (Radians or Degrees).
Definition: data_set.cpp:462
Instances * get_instances_pointer(void)
Returns a pointer to the instances object composing this data set object.
Definition: data_set.cpp:232
Matrix< double > arrange_generalization_data(void) const
Definition: data_set.cpp:520
void unscale_targets_minimum_maximum(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2434
Vector< std::string > get_tokens(const std::string &) const
Definition: data_set.cpp:4497
size_t lags_number
Number of lags.
Definition: data_set.h:408
const bool & get_display(void) const
Definition: data_set.cpp:243
bool header_line
Header which contains variables name.
Definition: data_set.h:396
Vector< size_t > unuse_constant_variables(void)
Definition: data_set.cpp:1456
const Matrix< double > & get_data(void) const
Definition: data_set.cpp:265
void load(const std::string &)
Definition: data_set.cpp:2880
void convert_time_series(void)
Definition: data_set.cpp:3607
Vector< Statistics< double > > calculate_data_statistics(void) const
Definition: data_set.cpp:1577
void randomize_data_uniform(const double &minimum=-1.0, const double &maximum=1.0)
Definition: data_set.cpp:2458
Vector< double > calculate_distances(void) const
Definition: data_set.cpp:3864
Vector< double > calculate_testing_target_data_mean(void) const
Returns the mean values of the target variables on the testing instances.
Definition: data_set.cpp:1751
Vector< Statistics< double > > scale_data_minimum_maximum(void)
Definition: data_set.cpp:1863
Matrix< double > arrange_training_data(void) const
Definition: data_set.cpp:502
bool is_mixed(const Vector< std::string > &) const
Definition: data_set.cpp:4660
Matrix< double > data
Definition: data_set.h:426
void trim(std::string &) const
Definition: data_set.cpp:4569
void convert_autoassociation(void)
Definition: data_set.cpp:3629
Matrix< double > arrange_testing_data(void) const
Definition: data_set.cpp:538
void set_instances_number(const size_t &)
Definition: data_set.cpp:1225
Matrix< double > calculate_data_statistics_matrix(void) const
Definition: data_set.cpp:1591
void convert_angular_variables_radians(const Vector< size_t > &)
Definition: data_set.cpp:4282
Variables variables
Variables object (inputs and target variables).
Definition: data_set.h:430
std::string missing_values_label
Missing values label.
Definition: data_set.h:404
const size_t & get_lags_number(void) const
Returns the number of lags to be used in a time series prediction application.
Definition: data_set.cpp:429
Vector< Statistics< double > > calculate_testing_instances_statistics(void) const
Definition: data_set.cpp:1663
Vector< Statistics< double > > calculate_inputs_statistics(void) const
Definition: data_set.cpp:1684
void check_header_line(void)
Definition: data_set.cpp:3088
AngularUnits
Enumeration of the units used for angular variables.
Definition: data_set.h:107
std::string get_trimmed(const std::string &) const
Definition: data_set.cpp:4585
void set_lags_number(const size_t &)
Definition: data_set.cpp:1176
Vector< Statistics< double > > scale_targets_mean_standard_deviation(void)
Definition: data_set.cpp:2207
Vector< double > get_variable(const size_t &) const
Definition: data_set.cpp:749
Matrix< double > arrange_testing_input_data(void) const
Definition: data_set.cpp:653
void subtract_variable(const size_t &)
Definition: data_set.cpp:1418
Vector< Statistics< double > > scale_inputs_minimum_maximum(void)
Definition: data_set.cpp:2084
Matrix< double > arrange_training_target_data(void) const
Definition: data_set.cpp:605
Vector< Statistics< double > > calculate_generalization_instances_statistics(void) const
Definition: data_set.cpp:1642
bool display
Display messages to screen.
Definition: data_set.h:442
Vector< Statistics< double > > calculate_training_instances_statistics(void) const
Definition: data_set.cpp:1621
void convert_angular_variable_radians(const size_t &)
Definition: data_set.cpp:4188
Vector< std::string > read_header_line(void) const
Returns the name of the columns in the data set as a list of strings.
Definition: data_set.cpp:3164
Matrix< double > get_generalization_target_data(void) const
Definition: data_set.cpp:637
Vector< double > calculate_generalization_target_data_mean(void) const
Returns the mean values of the target variables on the generalization instances.
Definition: data_set.cpp:1735
void set_angular_units(AngularUnits &)
Sets the units of the angular variables (Radians or Degrees).
Definition: data_set.cpp:1211
Matrix< double > get_generalization_input_data(void) const
Definition: data_set.cpp:621
Vector< std::string > arrange_time_series_names(const Vector< std::string > &) const
Definition: data_set.cpp:3563
virtual ~DataSet(void)
Destructor.
Definition: data_set.cpp:132
void set_instance(const size_t &, const Vector< double > &)
Definition: data_set.cpp:1258
void set_separator(const Separator &)
Definition: data_set.cpp:1099
void load_data(void)
This method loads the data file.
Definition: data_set.cpp:3643
const bool & get_autoassociation(void) const
Definition: data_set.cpp:440
void scrub_missing_values(void)
Definition: data_set.cpp:4407
Vector< Statistics< double > > scale_targets_minimum_maximum(void)
Definition: data_set.cpp:2272
void subtract_instance(const size_t &)
Definition: data_set.cpp:1344
Vector< size_t > filter_data(const Vector< double > &, const Vector< double > &)
Definition: data_set.cpp:4072
void convert_angular_variables_degrees(const Vector< size_t > &)
Definition: data_set.cpp:4234
void set_variables_number(const size_t &)
Definition: data_set.cpp:1242
Vector< size_t > calculate_target_class_distribution(void) const
Definition: data_set.cpp:3789
Instances instances
Instances object (training, generalization and testing instances).
Definition: data_set.h:434
static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string &)
Definition: data_set.cpp:473
const Instances & get_instances(void) const
Returns a constant reference to the instances object composing this data set object.
Definition: data_set.cpp:222
void save_data(void) const
Saves to the data file the values of the data matrix.
Definition: data_set.cpp:2954