OpenNN  2.2
Open Neural Networks Library
data_set.cpp
1 /****************************************************************************************************************/
2 /* */
3 /* OpenNN: Open Neural Networks Library */
4 /* www.artelnics.com/opennn */
5 /* */
6 /* D A T A S E T C L A S S */
7 /* */
8 /* Roberto Lopez */
9 /* Artelnics - Making intelligent use of data */
11 /* */
12 /****************************************************************************************************************/
13 
14 // OpenNN includes
15 
16 #include "data_set.h"
17 
18 
19 namespace OpenNN
20 {
21 
22 // DEFAULT CONSTRUCTOR
23 
26 
// Body of the default constructor (its signature line is not visible in this listing).
// Calls set() with no arguments and then set_default() to apply the default member values.
28 {
29  set();
30 
31  set_default();
32 }
33 
34 
35 // DATA CONSTRUCTOR
36 
40 
// Body of the data constructor (its signature line is not visible in this listing).
// Hands `data` to set() and then applies the default member values with set_default().
42 {
43  set(data);
44 
45  set_default();
46 }
47 
48 
49 // GENERAL CONSTRUCTOR
50 
57 
58 DataSet::DataSet(const size_t& new_variables_number, const size_t& new_instances_number)
59 {
60  set(new_variables_number, new_instances_number);
61 
62  set_default();
63 }
64 
65 
66 // INSTANCES, INPUTS AND TARGETS NUMBERS CONSTRUCTORS
67 
74 
75 DataSet::DataSet(const size_t& new_inputs_number, const size_t& new_targets_number, const size_t& new_instances_number)
76 {
77  set(new_inputs_number, new_targets_number, new_instances_number);
78 
79 
80  set_default();
81 }
82 
83 
84 // XML CONSTRUCTOR
85 
88 
89 DataSet::DataSet(const tinyxml2::XMLDocument& data_set_document)
90 {
91  set_default();
92 
93  from_XML(data_set_document);
94 }
95 
96 
97 // FILE CONSTRUCTOR
98 
102 
103 DataSet::DataSet(const std::string& file_name)
104 {
105  set();
106 
107  set_default();
108 
109  load(file_name);
110 
111 }
112 
113 
114 // COPY CONSTRUCTOR
115 
119 
120 DataSet::DataSet(const DataSet& other_data_set)
121 {
122  set_default();
123 
124  set(other_data_set);
125 }
126 
127 
128 // DESTRUCTOR
129 
131 
// Body of the destructor (its signature line is not visible in this listing).
// Empty: no explicit clean-up is performed here.
133 {
134 }
135 
136 
137 // ASSIGNMENT OPERATOR
138 
142 
143 DataSet& DataSet::operator = (const DataSet& other_data_set)
144 {
145  if(this != &other_data_set)
146  {
147  data_file_name = other_data_set.data_file_name;
148 
149  // Data matrix
150 
151  data = other_data_set.data;
152 
153  // Variables
154 
155  variables = other_data_set.variables;
156 
157  // Instances
158 
159  instances = other_data_set.instances;
160 
161  // Utilities
162 
163  display = other_data_set.display;
164  }
165 
166  return(*this);
167 }
168 
169 
170 // EQUAL TO OPERATOR
171 
172 // bool operator == (const DataSet&) const method
173 
178 
179 bool DataSet::operator == (const DataSet& other_data_set) const
180 {
181  if(data_file_name == other_data_set.data_file_name
182  && data == other_data_set.data
183  && variables == other_data_set.variables
184  && instances == other_data_set.instances
185  && display == other_data_set.display)
186  {
187  return(true);
188  }
189  else
190  {
191  return(false);
192  }
193 }
194 
195 
196 // METHODS
197 
198 // const Variables& get_variables(void) const
199 
201 
// Body of get_variables (signature line not visible in this listing).
// Returns the `variables` member.
203 {
204  return(variables);
205 }
206 
207 
208 // Variables* get_variables_pointer(void) const
209 
211 
// Body of get_variables_pointer (signature line not visible in this listing).
// Returns the address of the `variables` member.
213 {
214  return(&variables);
215 }
216 
217 
218 // const Instances& get_instances(void) const
219 
221 
// Body of get_instances (signature line not visible in this listing).
// Returns the `instances` member.
223 {
224  return(instances);
225 }
226 
227 
228 // Instances* get_instances_pointer(void)
229 
231 
// Body of get_instances_pointer (signature line not visible in this listing).
// Returns the address of the `instances` member.
233 {
234  return(&instances);
235 }
236 
237 
238 // const bool& get_display(void) const method
239 
242 
243 const bool& DataSet::get_display(void) const
244 {
245  return(display);
246 }
247 
248 
249 // bool empty(void) const method
250 
252 
253 bool DataSet::empty(void) const
254 {
255  return(data.empty());
256 }
257 
258 
259 // const Matrix<double>& get_data(void) const method
260 
264 
// Body of get_data (signature line not visible in this listing).
// Returns the `data` matrix member.
266 {
267  return(data);
268 }
269 
270 
271 // const MissingValues& get_missing_values(void) const method
272 
274 
// Body of get_missing_values (signature line not visible in this listing).
// Returns the `missing_values` member.
276 {
277  return(missing_values);
278 }
279 
280 
281 // MissingValues* get_missing_values_pointer(void) method
282 
284 
// Body of get_missing_values_pointer (signature line not visible in this listing).
// Returns the address of the `missing_values` member.
286 {
287  return(&missing_values);
288 }
289 
290 
291 // const std::string& get_data_file_name(void) const method
292 
294 
295 const std::string& DataSet::get_data_file_name(void) const
296 {
297  return(data_file_name);
298 }
299 
300 
301 // const bool& get_header_line(void) const
302 
304 
305 const bool& DataSet::get_header_line(void) const
306 {
307  return(header_line);
308 }
309 
310 
311 // const Separator& get_separator(void) const
312 
314 
// Body of get_separator (signature line not visible in this listing).
// Returns the `separator` member.
316 {
317  return(separator);
318 }
319 
320 
321 // std::string get_separator_string(void) const
322 
324 
325 std::string DataSet::get_separator_string(void) const
326 {
327  switch(separator)
328  {
329  case Space:
330  {
331  return(" ");
332  }
333  break;
334 
335  case Tab:
336  {
337  return("\t");
338  }
339  break;
340 
341  case Comma:
342  {
343  return(",");
344  }
345  break;
346 
347  case Semicolon:
348  {
349  return(";");
350  }
351  break;
352 
353  default:
354  {
355  std::ostringstream buffer;
356 
357  buffer << "OpenNN Exception: DataSet class.\n"
358  << "std::string get_separator_string(void) const method.\n"
359  << "Unknown separator.\n";
360 
361  throw std::logic_error(buffer.str());
362  }
363  break;
364  }
365 }
366 
367 
368 // std::string write_separator(void) const
369 
371 
372 std::string DataSet::write_separator(void) const
373 {
374  switch(separator)
375  {
376  case Space:
377  {
378  return("Space");
379  }
380  break;
381 
382  case Tab:
383  {
384  return("Tab");
385  }
386  break;
387 
388  case Comma:
389  {
390  return("Comma");
391  }
392  break;
393 
394  case Semicolon:
395  {
396  return("Semicolon");
397  }
398  break;
399 
400  default:
401  {
402  std::ostringstream buffer;
403 
404  buffer << "OpenNN Exception: DataSet class.\n"
405  << "std::string write_separator(void) const method.\n"
406  << "Unknown separator.\n";
407 
408  throw std::logic_error(buffer.str());
409  }
410  break;
411  }
412 }
413 
414 
415 // const std::string& get_missing_values_label(void) const
416 
418 
419 const std::string& DataSet::get_missing_values_label(void) const
420 {
421  return(missing_values_label);
422 }
423 
424 
425 // const size_t& get_lags_number(void) const
426 
428 
429 const size_t& DataSet::get_lags_number(void) const
430 {
431  return(lags_number);
432 }
433 
434 
435 // const bool& get_autoassociation(void) const
436 
439 
440 const bool& DataSet::get_autoassociation(void) const
441 {
442  return(autoassociation);
443 }
444 
445 
446 // const Vector<size_t>& get_angular_variables(void) const method
447 
451 
// Body of get_angular_variables (signature line not visible in this listing).
// Returns the `angular_variables` member.
453 {
454  return(angular_variables);
455 }
456 
457 
458 // const AngularUnits& get_angular_units(void) const method
459 
461 
// Body of get_angular_units (signature line not visible in this listing).
// Returns the `angular_units` member.
463 {
464  return(angular_units);
465 }
466 
467 
468 // static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string&) method
469 
472 
473 DataSet::ScalingUnscalingMethod DataSet::get_scaling_unscaling_method(const std::string& scaling_unscaling_method)
474 {
475  if(scaling_unscaling_method == "MinimumMaximum")
476  {
477  return(MinimumMaximum);
478  }
479  else if(scaling_unscaling_method == "MeanStandardDeviation")
480  {
481  return(MeanStandardDeviation);
482  }
483  else
484  {
485  std::ostringstream buffer;
486 
487  buffer << "OpenNN Exception: DataSet class.\n"
488  << "static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string).\n"
489  << "Unknown scaling-unscaling method: " << scaling_unscaling_method << ".\n";
490 
491  throw std::logic_error(buffer.str());
492  }
493 }
494 
495 
496 // Matrix<double> arrange_training_data(void) const method
497 
501 
// Body of arrange_training_data (signature line not visible in this listing).
// Returns the submatrix of `data` made of the rows given by instances.arrange_training_indices()
// and the columns in variables_indices.
// NOTE(review): Vector<size_t>(0, 1, variables_number-1) presumably builds 0, 1, ..., variables_number-1
// (first, step, last) - confirm against the Vector class.
503 {
504  const size_t variables_number = variables.get_variables_number();
505 
506  Vector<size_t> variables_indices(0, 1, (int)variables_number-1);
507 
508  const Vector<size_t> training_indices = instances.arrange_training_indices();
509 
510  return(data.arrange_submatrix(training_indices, variables_indices));
511 }
512 
513 
514 // Matrix<double> arrange_generalization_data(void) const method
515 
519 
// Body of arrange_generalization_data (signature line not visible in this listing).
// Returns the submatrix of `data` made of the rows given by instances.arrange_generalization_indices()
// and the columns in variables_indices.
// NOTE(review): Vector<size_t>(0, 1, variables_number-1) presumably builds 0, 1, ..., variables_number-1
// (first, step, last) - confirm against the Vector class.
521 {
522  const size_t variables_number = variables.get_variables_number();
523 
524  const Vector<size_t> generalization_indices = instances.arrange_generalization_indices();
525 
526  Vector<size_t> variables_indices(0, 1, (int)variables_number-1);
527 
528  return(data.arrange_submatrix(generalization_indices, variables_indices));
529 }
530 
531 
532 // Matrix<double> arrange_testing_data(void) const method
533 
537 
// Body of arrange_testing_data (signature line not visible in this listing).
// Returns the submatrix of `data` made of the rows given by instances.arrange_testing_indices()
// and the columns in variables_indices.
// NOTE(review): Vector<size_t>(0, 1, variables_number-1) presumably builds 0, 1, ..., variables_number-1
// (first, step, last) - confirm against the Vector class.
539 {
540  const size_t variables_number = variables.get_variables_number();
541  Vector<size_t> variables_indices(0, 1, (int)variables_number-1);
542 
543  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
544 
545  return(data.arrange_submatrix(testing_indices, variables_indices));
546 }
547 
548 
549 // Matrix<double> arrange_input_data(void) const method
550 
554 
// Body of arrange_input_data (signature line not visible in this listing).
// Returns the submatrix of `data` made of the rows in `indices` and the columns given by
// variables.arrange_inputs_indices().
// NOTE(review): Vector<size_t>(0, 1, instances_number-1) presumably builds 0, 1, ..., instances_number-1
// (first, step, last) - confirm against the Vector class.
556 {
557  const size_t instances_number = instances.get_instances_number();
558  Vector<size_t> indices(0, 1, (int)instances_number-1);
559 
560  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
561 
562  return(data.arrange_submatrix(indices, inputs_indices));
563 }
564 
565 
566 // Matrix<double> arrange_target_data(void) const method
567 
571 
// Body of arrange_target_data (signature line not visible in this listing).
// Returns the submatrix of `data` made of the rows in `indices` and the columns given by
// variables.arrange_targets_indices().
// NOTE(review): the last index is cast with (size_t) here, whereas the sibling arrange_* bodies use (int);
// with zero instances, instances_number-1 wraps around as an unsigned value - confirm this is intended.
573 {
574  const size_t instances_number = instances.get_instances_number();
575  Vector<size_t> indices(0, 1, (size_t)instances_number-1);
576 
577  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
578 
579  return(data.arrange_submatrix(indices, targets_indices));
580 }
581 
582 
583 // Matrix<double> arrange_training_input_data(void) const method
584 
588 
// Body of arrange_training_input_data (signature line not visible in this listing).
// Returns the submatrix of `data` restricted to the training rows and the input columns.
590 {
591  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
592 
593  const Vector<size_t> training_indices = instances.arrange_training_indices();
594 
595  return(data.arrange_submatrix(training_indices, inputs_indices));
596 }
597 
598 
599 // Matrix<double> arrange_training_target_data(void) const method
600 
604 
// Body of arrange_training_target_data (signature line not visible in this listing).
// Returns the submatrix of `data` restricted to the training rows and the target columns.
606 {
607  const Vector<size_t> training_indices = instances.arrange_training_indices();
608 
609  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
610 
611  return(data.arrange_submatrix(training_indices, targets_indices));
612 }
613 
614 
615 // Matrix<double> get_generalization_input_data(void) const method
616 
620 
// Body of the generalization input data method (signature line not visible in this listing).
// Returns the submatrix of `data` restricted to the generalization rows and the input columns.
622 {
623  const Vector<size_t> generalization_indices = instances.arrange_generalization_indices();
624 
625  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
626 
627  return(data.arrange_submatrix(generalization_indices, inputs_indices));
628 }
629 
630 
631 // Matrix<double> get_generalization_target_data(void) const method
632 
636 
// Body of the generalization target data method (signature line not visible in this listing).
// Returns the submatrix of `data` restricted to the generalization rows and the target columns.
638 {
639  const Vector<size_t> generalization_indices = instances.arrange_generalization_indices();
640 
641  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
642 
643  return(data.arrange_submatrix(generalization_indices, targets_indices));
644 }
645 
646 
647 // Matrix<double> arrange_testing_input_data(void) const method
648 
652 
// Body of arrange_testing_input_data (signature line not visible in this listing).
// Returns the submatrix of `data` restricted to the testing rows and the input columns.
654 {
655  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
656 
657  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
658 
659  return(data.arrange_submatrix(testing_indices, inputs_indices));
660 }
661 
662 
663 // Matrix<double> arrange_testing_target_data(void) const method
664 
668 
// Body of arrange_testing_target_data (signature line not visible in this listing).
// Returns the submatrix of `data` restricted to the testing rows and the target columns.
670 {
671  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
672 
673  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
674 
675  return(data.arrange_submatrix(testing_indices, targets_indices));
676 }
677 
678 
679 // Vector<double> get_instance(const size_t&) const method
680 
683 
684 Vector<double> DataSet::get_instance(const size_t& i) const
685 {
686  // Control sentence (if debug)
687 
688  #ifndef NDEBUG
689 
690  const size_t instances_number = instances.get_instances_number();
691 
692  if(i >= instances_number)
693  {
694  std::ostringstream buffer;
695 
696  buffer << "OpenNN Exception: DataSet class.\n"
697  << "Vector<double> get_instance(const size_t&) const method.\n"
698  << "Index of instance must be less than number of instances.\n";
699 
700  throw std::logic_error(buffer.str());
701  }
702 
703  #endif
704 
705  // Get instance
706 
707  return(data.arrange_row(i));
708 }
709 
710 
711 // Vector<double> get_instance(const size_t&, const Vector<size_t>&) const method
712 
716 
717 Vector<double> DataSet::get_instance(const size_t& instance_index, const Vector<size_t>& variables_indices) const
718 {
719  // Control sentence (if debug)
720 
721  #ifndef NDEBUG
722 
723  const size_t instances_number = instances.get_instances_number();
724 
725  if(instance_index >= instances_number)
726  {
727  std::ostringstream buffer;
728 
729  buffer << "OpenNN Exception: DataSet class.\n"
730  << "Vector<double> get_instance(const size_t&, const Vector<size_t>&) const method.\n"
731  << "Index of instance must be less than number of instances.\n";
732 
733  throw std::logic_error(buffer.str());
734  }
735 
736  #endif
737 
738  // Get instance
739 
740  return(data.arrange_row(instance_index, variables_indices));
741 }
742 
743 
744 // Vector<double> get_variable(const size_t&) const method
745 
748 
749 Vector<double> DataSet::get_variable(const size_t& i) const
750 {
751  // Control sentence (if debug)
752 
753  #ifndef NDEBUG
754 
755  const size_t variables_number = variables.get_variables_number();
756 
757  if(i >= variables_number)
758  {
759  std::ostringstream buffer;
760 
761  buffer << "OpenNN Exception: DataSet class.\n"
762  << "Vector<double> get_variable(const size_t&) const method.\n"
763  << "Index of variable must be less than number of instances.\n";
764 
765  throw std::logic_error(buffer.str());
766  }
767 
768  #endif
769 
770  // Get variable
771 
772  return(data.arrange_column(i));
773 }
774 
775 
776 // Vector<double> get_variable(const size_t&, const Vector<size_t>&) const method
777 
781 
782 Vector<double> DataSet::get_variable(const size_t& variable_index, const Vector<size_t>& instances_indices) const
783 {
784  // Control sentence (if debug)
785 
786  #ifndef NDEBUG
787 
788  const size_t variables_number = variables.get_variables_number();
789 
790  if(variable_index >= variables_number)
791  {
792  std::ostringstream buffer;
793 
794  buffer << "OpenNN Exception: DataSet class.\n"
795  << "Vector<double> get_variable(const size_t&, const Vector<double>&) const method.\n"
796  << "Index of variable must be less than number of instances.\n";
797 
798  throw std::logic_error(buffer.str());
799  }
800 
801  #endif
802 
803  // Get variable
804 
805  return(data.arrange_column(variable_index, instances_indices));
806 }
807 
808 
809 // void set(void) method
810 
812 
813 void DataSet::set(void)
814 {
815  data_file_name = "";
816 
817  data.set();
818 
819  variables.set();
820  instances.set();
821 
823 
824  display = true;
825 }
826 
827 
828 // void set(const Matrix<double>&) method
829 
832 
833 void DataSet::set(const Matrix<double>& new_data)
834 {
835  data_file_name = "";
836 
837  const size_t variables_number = new_data.get_columns_number();
838  const size_t instances_number = new_data.get_rows_number();
839 
840  data = new_data;
841 
842  variables.set(variables_number);
843  instances.set(instances_number);
844 
845  missing_values.set(instances_number, variables_number);
846 
847  display = true;
848 }
849 
850 
851 // void set(const size_t&, const size_t&) method
852 
858 
859 void DataSet::set(const size_t& new_variables_number, const size_t& new_instances_number)
860 {
861  // Control sentence (if debug)
862 
863  #ifndef NDEBUG
864 
865  if(new_variables_number == 0)
866  {
867  std::ostringstream buffer;
868 
869  buffer << "OpenNN Exception: DataSet class.\n"
870  << "void set(const size_t&, const size_t&) method.\n"
871  << "Number of variables must be greater than zero.\n";
872 
873  throw std::logic_error(buffer.str());
874  }
875 
876  if(new_instances_number == 0)
877  {
878  std::ostringstream buffer;
879 
880  buffer << "OpenNN Exception: DataSet class.\n"
881  << "void set(const size_t&, const size_t&) method.\n"
882  << "Number of instances must be greater than zero.\n";
883 
884  throw std::logic_error(buffer.str());
885  }
886 
887  #endif
888 
889  data.set(new_instances_number, new_variables_number);
890 
891  instances.set(new_instances_number);
892 
893  variables.set(new_variables_number);
894 
895  missing_values.set(new_instances_number, new_variables_number);
896 
897  display = true;
898 }
899 
900 
901 // void set(const size_t&, const size_t&, const size_t&) method
902 
909 
910 void DataSet::set(const size_t& new_inputs_number, const size_t& new_targets_number, const size_t& new_instances_number)
911 {
912  data_file_name = "";
913 
914  const size_t new_variables_number = new_inputs_number + new_targets_number;
915 
916  data.set(new_instances_number, new_variables_number);
917 
918  variables.set(new_inputs_number, new_targets_number);
919 
920  instances.set(new_instances_number);
921 
922  missing_values.set(new_instances_number, new_variables_number);
923 
924  display = true;
925 }
926 
927 
928 // void set(const DataSet& other_data_set)
929 
932 
933 void DataSet::set(const DataSet& other_data_set)
934 {
935  data_file_name = other_data_set.data_file_name;
936 
937  header_line = other_data_set.header_line;
938 
939  separator = other_data_set.separator;
940 
942 
943  data = other_data_set.data;
944 
945  variables = other_data_set.variables;
946 
947  instances = other_data_set.instances;
948 
949  missing_values = other_data_set.missing_values;
950 
951  display = other_data_set.display;
952 }
953 
954 
955 // void set(const tinyxml2::XMLDocument&) method
956 
959 
960 void DataSet::set(const tinyxml2::XMLDocument& data_set_document)
961 {
962  set_default();
963 
964  from_XML(data_set_document);
965 }
966 
967 
968 // void set(const std::string&) method
969 
972 
973 void DataSet::set(const std::string& file_name)
974 {
975  load(file_name);
976 }
977 
978 
979 // void set_display(const bool&) method
980 
985 
986 void DataSet::set_display(const bool& new_display)
987 {
988  display = new_display;
989 }
990 
991 
992 // void set_default(void) method
993 
998 
// Body of set_default (signature line not visible in this listing).
// Assigns the default member values: no header line, Space separator, "?" as missing values
// label, zero lags, autoassociation off, Degrees as angular units and display on.
1000 {
1001  header_line = false;
1002 
1003  separator = Space;
1004 
1005  missing_values_label = "?";
1006 
1007  lags_number = 0;
1008 
1009  autoassociation = false;
1010 
1011  angular_units = Degrees;
1012 
1013  display = true;
1014 }
1015 
1016 
1017 // void set_data(const Matrix<double>&) method
1018 
1024 
// Replaces the data matrix with a copy of new_data.
// The size checks against the current numbers of instances and variables are commented out
// below, so in the visible code a matrix of any size is accepted and the variables and
// instances objects are not updated here.
// NOTE(review): the inner line numbering of this listing skips two lines after the assignment;
// confirm against the original file whether further statements belong there.
1025 void DataSet::set_data(const Matrix<double>& new_data)
1026 {
1027  // Control sentence (if debug)
1028 /*
1029  #ifndef NDEBUG
1030 
1031  const size_t rows_number = new_data.get_rows_number();
1032  const size_t instances_number = instances.get_instances_number();
1033 
1034  if(rows_number != instances_number)
1035  {
1036  std::ostringstream buffer;
1037 
1038  buffer << "OpenNN Exception: DataSet class.\n"
1039  << "void set_data(const Matrix<double>&) method.\n"
1040  << "Number of rows (" << rows_number << ") must be equal to number of instances (" << instances_number << ").\n";
1041 
1042  throw std::logic_error(buffer.str());
1043  }
1044 
1045  const size_t columns_number = new_data.get_columns_number();
1046  const size_t variables_number = variables.get_variables_number();
1047 
1048  if(columns_number != variables_number)
1049  {
1050  std::ostringstream buffer;
1051 
1052  buffer << "OpenNN Exception: DataSet class.\n"
1053  << "void set_data(const Matrix<double>&) method.\n"
1054  << "Number of columns (" << columns_number << ") must be equal to number of variables (" << variables_number << ").\n";
1055 
1056  throw std::logic_error(buffer.str());
1057  }
1058 
1059  #endif
1060 */
1061  // Set data
1062 
1063  data = new_data;
1064 
1067 
1068 }
1069 
1070 
1071 // void set_data_file_name(const std::string&) method
1072 
1077 
1078 void DataSet::set_data_file_name(const std::string& new_data_file_name)
1079 {
1080  data_file_name = new_data_file_name;
1081 }
1082 
1083 
1084 // void set_header_line(const bool&) method
1085 
1087 
1088 void DataSet::set_header_line(const bool& new_header_line)
1089 {
1090  header_line = new_header_line;
1091 }
1092 
1093 
1094 // void set_separator(const Separator&) method
1095 
1098 
1099 void DataSet::set_separator(const Separator& new_separator)
1100 {
1101  separator = new_separator;
1102 }
1103 
1104 
1105 // void set_separator(const std::string&) method
1106 
1109 
1110 void DataSet::set_separator(const std::string& new_separator)
1111 {
1112  if(new_separator == "Space")
1113  {
1114  separator = Space;
1115  }
1116  else if(new_separator == "Tab")
1117  {
1118  separator = Tab;
1119  }
1120  else if(new_separator == "Comma")
1121  {
1122  separator = Comma;
1123  }
1124  else if(new_separator == "Semicolon")
1125  {
1126  separator = Semicolon;
1127  }
1128  else
1129  {
1130  std::ostringstream buffer;
1131 
1132  buffer << "OpenNN Exception: DataSet class.\n"
1133  << "void set_separator(const std::string&) method.\n"
1134  << "Unknown separator: " << new_separator << ".\n";
1135 
1136  throw std::logic_error(buffer.str());
1137  }
1138 }
1139 
1140 
1141 // void set_missing_values_label(const std::string&) method
1142 
1145 
1146 void DataSet::set_missing_values_label(const std::string& new_missing_values_label)
1147 {
1148  // Control sentence (if debug)
1149 
1150  #ifndef NDEBUG
1151 
1152  if(get_trimmed(new_missing_values_label).empty())
1153  {
1154  std::ostringstream buffer;
1155 
1156  buffer << "OpenNN Exception: DataSet class.\n"
1157  << "void set_missing_values_label(const std::string&) method.\n"
1158  << "Missing values label cannot be empty.\n";
1159 
1160  throw std::logic_error(buffer.str());
1161  }
1162 
1163  #endif
1164 
1165 
1166  missing_values_label = new_missing_values_label;
1167 }
1168 
1169 
1170 // void set_lags_number(const size_t&)
1171 
1175 
1176 void DataSet::set_lags_number(const size_t& new_lags_number)
1177 {
1178  lags_number = new_lags_number;
1179 }
1180 
1181 
1182 // void set_autoassociation(const bool&)
1183 
1189 
1190 void DataSet::set_autoassociation(const bool& new_autoassociation)
1191 {
1192  autoassociation = new_autoassociation;
1193 }
1194 
1195 
1196 // void set_angular_variables(const Vector<size_t>&)
1197 
1200 
1201 void DataSet::set_angular_variables(const Vector<size_t>& new_angular_variables)
1202 {
1203  angular_variables = new_angular_variables;
1204 }
1205 
1206 
1207 // void set_angular_units(AngularUnits&)
1208 
1210 
// Body of set_angular_units (signature line not visible in this listing).
// Stores new_angular_units in the `angular_units` member.
1212 {
1213  angular_units = new_angular_units;
1214 }
1215 
1216 
1217 
1218 // void set_instances_number(const size_t&) method
1219 
1224 
1225 void DataSet::set_instances_number(const size_t& new_instances_number)
1226 {
1227  const size_t variables_number = variables.get_variables_number();
1228 
1229  data.set(new_instances_number, variables_number);
1230 
1231  instances.set(new_instances_number);
1232 }
1233 
1234 
1235 // void set_variables_number(const size_t&) method
1236 
1241 
1242 void DataSet::set_variables_number(const size_t& new_variables_number)
1243 {
1244  const size_t instances_number = instances.get_instances_number();
1245 
1246  data.set(instances_number, new_variables_number);
1247 
1248  variables.set(new_variables_number);
1249 }
1250 
1251 
1252 // void set_instance(const size_t&, const Vector<double>&)
1253 
1257 
1258 void DataSet::set_instance(const size_t& instance_index, const Vector<double>& instance)
1259 {
1260  // Control sentence (if debug)
1261 
1262  #ifndef NDEBUG
1263 
1264  const size_t instances_number = instances.get_instances_number();
1265 
1266  if(instance_index >= instances_number)
1267  {
1268  std::ostringstream buffer;
1269 
1270  buffer << "OpenNN Exception: DataSet class.\n"
1271  << "void set_instance(const size_t&, const Vector<double>&) method.\n"
1272  << "Index of instance must be less than number of instances.\n";
1273 
1274  throw std::logic_error(buffer.str());
1275  }
1276 
1277  const size_t size = instance.size();
1278  const size_t variables_number = variables.get_variables_number();
1279 
1280  if(size != variables_number)
1281  {
1282  std::ostringstream buffer;
1283 
1284  buffer << "OpenNN Exception: DataSet class.\n"
1285  << "void set_instance(const size_t&, const Vector<double>&) method.\n"
1286  << "Size (" << size << ") must be equal to number of variables (" << variables_number << ").\n";
1287 
1288  throw std::logic_error(buffer.str());
1289  }
1290 
1291  #endif
1292 
1293  // Set instance
1294 
1295  data.set_row(instance_index, instance);
1296 }
1297 
1298 
1299 // void add_instance(const Vector<double>&) method
1300 
1306 
// Body of add_instance (signature line not visible in this listing).
// Appends `instance` as a new row of the data matrix and re-sets the instances object with
// one more instance. In debug builds the size of `instance` must equal the number of
// variables, otherwise std::logic_error is thrown.
1308 {
1309  // Control sentence (if debug)
1310 
1311  #ifndef NDEBUG
1312 
1313  const size_t size = instance.size();
1314  const size_t variables_number = variables.get_variables_number();
1315 
1316  if(size != variables_number)
1317  {
1318  std::ostringstream buffer;
1319 
1320  buffer << "OpenNN Exception: DataSet class.\n"
1321  << "void add_instance(const Vector<double>&) method.\n"
1322  << "Size of instance must be equal to number of variables.\n";
1323 
1324  throw std::logic_error(buffer.str());
1325  }
1326 
1327  #endif
1328 
1329  const size_t instances_number = instances.get_instances_number();
1330 
1331  data.append_row(instance);
1332 
1333  instances.set(instances_number+1);
1334 }
1335 
1336 
1337 // void subtract_instance(const size_t&) method
1338 
1343 
1344 void DataSet::subtract_instance(const size_t& instance_index)
1345 {
1346  const size_t instances_number = instances.get_instances_number();
1347 
1348  // Control sentence (if debug)
1349 
1350  #ifndef NDEBUG
1351 
1352  if(instance_index >= instances_number)
1353  {
1354  std::ostringstream buffer;
1355 
1356  buffer << "OpenNN Exception: DataSet class.\n"
1357  << "void subtract_instance(size_t) method.\n"
1358  << "Index of instance must be less than number of instances.\n";
1359 
1360  throw std::logic_error(buffer.str());
1361  }
1362 
1363  #endif
1364 
1365  data.subtract_row(instance_index);
1366 
1367  instances.set_instances_number(instances_number-1);
1368 
1369 }
1370 
1371 
1372 // void append_variable(const Vector<double>&) method
1373 
1376 
// Body of append_variable (signature line not visible in this listing).
// Appends `variable` as a new column of the data matrix and raises the number of variables
// by one. In debug builds the size of `variable` must equal the number of instances,
// otherwise std::logic_error is thrown.
// A copy of the enlarged matrix is taken before set_variables_number() because that method
// calls data.set(...) on the data matrix; set_data() then puts the copy back.
1378 {
1379  // Control sentence (if debug)
1380 
1381  #ifndef NDEBUG
1382 
1383  const size_t size = variable.size();
1384  const size_t instances_number = instances.get_instances_number();
1385 
1386  if(size != instances_number)
1387  {
1388  std::ostringstream buffer;
1389 
1390  buffer << "OpenNN Exception: DataSet class.\n"
1391  << "void append_variable(const Vector<double>&) method.\n"
1392  << "Size of variable must be equal to number of instances.\n";
1393 
1394  throw std::logic_error(buffer.str());
1395  }
1396 
1397  #endif
1398 
1399  const size_t variables_number = variables.get_variables_number();
1400 
1401  data.append_column(variable);
1402 
1403  Matrix<double> new_data(data);
1404 
1405  const size_t new_variables_number = variables_number + 1;
1406 
1407  set_variables_number(new_variables_number);
1408 
1409  set_data(new_data);
1410 }
1411 
1412 
1413 // void subtract_variable(const size_t&) method
1414 
1417 
1418 void DataSet::subtract_variable(const size_t& variable_index)
1419 {
1420  const size_t variables_number = variables.get_variables_number();
1421 
1422  // Control sentence (if debug)
1423 
1424  #ifndef NDEBUG
1425 
1426  if(variable_index >= variables_number)
1427  {
1428  std::ostringstream buffer;
1429 
1430  buffer << "OpenNN Exception: DataSet class.\n"
1431  << "void subtract_variable(size_t) method.\n"
1432  << "Index of variable must be less than number of variables.\n";
1433 
1434  throw std::logic_error(buffer.str());
1435  }
1436 
1437  #endif
1438 
1439  data.subtract_column(variable_index);
1440 
1441  Matrix<double> new_data(data);
1442 
1443  const size_t new_variables_number = variables_number - 1;
1444 
1445  set_variables_number(new_variables_number);
1446 
1447  set_data(new_data);
1448 }
1449 
1450 
1451 // Vector<size_t> unuse_constant_variables(void) method
1452 
1455 
// Body of unuse_constant_variables (signature line not visible in this listing).
// Walks over all the variables; each one that is currently used and whose data column
// satisfies is_constant(1.0e-6) is marked as Variables::Unused and its index is collected.
// Returns the collected indices. In debug builds std::logic_error is thrown if there are
// no variables.
// NOTE(review): 1.0e-6 is presumably the tolerance used by is_constant - confirm.
1457 {
1458  const size_t variables_number = variables.get_variables_number();
1459 
1460  // Control sentence (if debug)
1461 
1462  #ifndef NDEBUG
1463 
1464  if(variables_number == 0)
1465  {
1466  std::ostringstream buffer;
1467 
1468  buffer << "OpenNN Exception: DataSet class.\n"
1469  << "Vector<size_t> unuse_constant_variables(void) method.\n"
1470  << "Number of variables is zero.\n";
1471 
1472  throw std::logic_error(buffer.str());
1473  }
1474 
1475  #endif
1476 
1477  Vector<size_t> constant_variables;
1478 
1479  for(size_t i = 0; i < variables_number; i++)
1480  {
1481  if(variables.is_used(i) && data.arrange_column(i).is_constant(1.0e-6))
1482  {
1483  variables.set_use(i, Variables::Unused);
1484  constant_variables.push_back(i);
1485  }
1486  }
1487 
1488  return(constant_variables);
1489 }
1490 
1491 
1492 // Vector<size_t> unuse_repeated_instances(void) method
1493 
1496 
// Body of unuse_repeated_instances (signature line not visible in this listing).
// Compares every instance i with every later instance j; each j that is not already unused
// and is equal to instance i is marked as Instances::Unused and its index is collected.
// Returns the collected indices. In debug builds std::logic_error is thrown if there are
// no instances.
// NOTE(review): the outer loop runs under "#pragma omp parallel for" with only i, instance_i
// and instance_j private, while instances.set_use() and repeated_instances.push_back() write
// to shared objects with no synchronisation visible here - this looks like a data race when
// OpenMP is enabled; confirm and protect the shared writes if so.
// NOTE(review): the exception text below names "unuse_repeated_indices" rather than this method.
1498 {
1499  const size_t instances_number = instances.get_instances_number();
1500 
1501  // Control sentence (if debug)
1502 
1503  #ifndef NDEBUG
1504 
1505  if(instances_number == 0)
1506  {
1507  std::ostringstream buffer;
1508 
1509  buffer << "OpenNN Exception: DataSet class.\n"
1510  << "Vector<size_t> unuse_repeated_indices(void) method.\n"
1511  << "Number of instances is zero.\n";
1512 
1513  throw std::logic_error(buffer.str());
1514  }
1515 
1516  #endif
1517 
1518  Vector<size_t> repeated_instances;
1519 
1520  Vector<double> instance_i;
1521  Vector<double> instance_j;
1522 
1523  int i = 0;
1524 
1525  #pragma omp parallel for private(i, instance_i, instance_j)
1526 
1527  for(i = 0; i < (int)instances_number; i++)
1528  {
1529  instance_i = get_instance(i);
1530 
1531  for(size_t j = i+1; j < instances_number; j++)
1532  {
1533  instance_j = get_instance(j);
1534 
1535  if(instances.get_use(j) != Instances::Unused
1536  && instance_j == instance_i)
1537  {
1538  instances.set_use(j, Instances::Unused);
1539  repeated_instances.push_back(j);
1540  }
1541  }
1542  }
1543 
1544  return(repeated_instances);
1545 }
1546 
1547 
1548 // Vector<Histogram> calculate_data_histograms(const size_t&) const method
1549 
1557 
// Body of calculate_data_histograms (signature line not visible in this listing).
// Returns data.calculate_histograms_missing_values(missing_indices, bins_number).
// NOTE(review): the declaration of missing_indices is not visible in this listing.
1559 {
1561 
1562  return(data.calculate_histograms_missing_values(missing_indices, bins_number));
1563 }
1564 
1565 
1566 // Vector< Vector<double> > calculate_data_statistics(void) const method
1567 
1576 
// Body of calculate_data_statistics (signature line not visible in this listing).
// Returns data.calculate_statistics_missing_values(missing_indices).
// NOTE(review): the declaration of missing_indices is not visible in this listing.
1578 {
1580 
1581  return(data.calculate_statistics_missing_values(missing_indices));
1582 }
1583 
1584 
1585 // Matrix<double> calculate_data_statistics_matrix(void) const method
1586 
1590 
// Body of calculate_data_statistics_matrix (signature line not visible in this listing).
// Computes the statistics of the data with missing values taken into account and packs them
// into a matrix with one row per variable and 4 columns; row i is data_statistics[i].to_vector().
// NOTE(review): the declaration of missing_indices is not visible in this listing; the meaning
// of the 4 columns depends on Statistics::to_vector() - confirm there.
1592 {
1594 
1595  const Vector< Statistics<double> > data_statistics = data.calculate_statistics_missing_values(missing_indices);
1596 
1597  const size_t variables_number = variables.get_variables_number();
1598 
1599  Matrix<double> data_statistics_matrix(variables_number, 4);
1600 
1601  for(size_t i = 0; i < variables_number; i++)
1602  {
1603  data_statistics_matrix.set_row(i, data_statistics[i].to_vector());
1604  }
1605 
1606  return(data_statistics_matrix);
1607 }
1608 
1609 
1610 // Vector< Statistics<double> > calculate_training_instances_statistics(void) const method
1611 
1620 
// Body of calculate_training_instances_statistics (signature line not visible in this listing).
// Returns data.calculate_rows_statistics_missing_values() for the training instances indices.
// NOTE(review): the declaration of missing_indices is not visible in this listing.
1622 {
1623  const Vector<size_t> training_indices = instances.arrange_training_indices();
1624 
1626 
1627  return(data.calculate_rows_statistics_missing_values(training_indices, missing_indices));
1628 }
1629 
1630 
1631 // Vector< Statistics<double> > calculate_generalization_instances_statistics(void) const method
1632 
1641 
// Body of calculate_generalization_instances_statistics (signature line not visible in this listing).
// Returns data.calculate_rows_statistics_missing_values() for the generalization instances indices.
// NOTE(review): the declaration of missing_indices is not visible in this listing.
1643 {
1644  const Vector<size_t> generalization_indices = instances.arrange_generalization_indices();
1645 
1647 
1648  return(data.calculate_rows_statistics_missing_values(generalization_indices, missing_indices));
1649 }
1650 
1651 
1652 // Vector< Statistics<double> > calculate_testing_instances_statistics(void) const method
1653 
1662 
// Body of calculate_testing_instances_statistics (signature line not visible in this listing).
// Returns data.calculate_rows_statistics_missing_values() for the testing instances indices.
// NOTE(review): the declaration of missing_indices is not visible in this listing.
1664 {
1665  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
1666 
1668 
1669  return(data.calculate_rows_statistics_missing_values(testing_indices, missing_indices));
1670 }
1671 
1672 
1673 // Vector< Statistics<double> > calculate_inputs_statistics(void) const method
1674 
/// Returns data.calculate_columns_statistics_missing_values() evaluated on the column indices
/// given by variables.arrange_inputs_indices().
/// NOTE(review): the signature and the declaration of missing_indices are not visible in this listing.
1683 
1685 {
1686  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
1687 
1689 
1690  return(data.calculate_columns_statistics_missing_values(inputs_indices, missing_indices));
1691 }
1692 
1693 
1694 // Vector< Statistics<double> > calculate_targets_statistics(void) const method
1695 
/// Returns data.calculate_columns_statistics_missing_values() evaluated on the column indices
/// given by variables.arrange_targets_indices().
/// NOTE(review): the signature and the declaration of missing_indices are not visible in this listing.
1704 
1706 {
1707  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
1708 
1710 
1711  return(data.calculate_columns_statistics_missing_values(targets_indices, missing_indices));
1712 }
1713 
1714 
1715 // Vector<double> calculate_training_target_data_mean(void) const method
1716 
/// Returns data.calculate_mean_missing_values() restricted to the rows from
/// instances.arrange_training_indices() and the columns from variables.arrange_targets_indices().
/// NOTE(review): the signature and the declaration of missing_indices are not visible in this listing.
1718 
1720 {
1721  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
1722 
1723  const Vector<size_t> training_indices = instances.arrange_training_indices();
1724 
1726 
1727  return(data.calculate_mean_missing_values(training_indices, targets_indices, missing_indices));
1728 }
1729 
1730 
1731 // Vector<double> calculate_generalization_target_data_mean(void) const method
1732 
/// Returns data.calculate_mean_missing_values() restricted to the rows from
/// instances.arrange_generalization_indices() and the columns from variables.arrange_targets_indices().
/// NOTE(review): the signature and the declaration of missing_indices are not visible in this listing.
1734 
1736 {
1737  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
1738 
1739  const Vector<size_t> generalization_indices = instances.arrange_generalization_indices();
1740 
1742 
1743  return(data.calculate_mean_missing_values(generalization_indices, targets_indices, missing_indices));
1744 }
1745 
1746 
1747 // Vector<double> calculate_testing_target_data_mean(void) const method
1748 
/// Returns data.calculate_mean_missing_values() restricted to the rows from
/// instances.arrange_testing_indices() and the columns from variables.arrange_targets_indices().
/// NOTE(review): the signature and the declaration of missing_indices are not visible in this listing.
1750 
1752 {
1753  const Vector<size_t> testing_indices = instances.arrange_testing_indices();
1754 
1755  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
1756 
1758 
1759  return(data.calculate_mean_missing_values(testing_indices, targets_indices, missing_indices));
1760 }
1761 
1762 
1763 // Matrix<double> calculate_linear_correlations(void) const method
1764 
/// Fills an (inputs number) x (targets number) matrix in which element (i,j) is
/// calculate_linear_correlation() between the data column of the i-th input variable
/// and the data column of the j-th target variable.
/// The active code path does not take missing values into account: the
/// missing-values variant of the correlation call is commented out below.
/// NOTE(review): the signature is not visible in this listing.
1768 
1770 {
1771  const size_t inputs_number = variables.count_inputs_number();
1772  const size_t targets_number = variables.count_targets_number();
1773 
1774  const Vector<size_t> input_indices = variables.arrange_inputs_indices();
1775  const Vector<size_t> target_indices = variables.arrange_targets_indices();
1776 
1777  size_t input_index;
1778  size_t target_index;
1779 
1780  const size_t instances_number = instances.get_instances_number();
1781 
1783 
      // Both vectors are overwritten by arrange_column() inside the loops
1784  Vector<double> input_variable(instances_number);
1785  Vector<double> target_variable(instances_number);
1786 
1787  Matrix<double> linear_correlations(inputs_number, targets_number);
1788 
1789  for(size_t i = 0; i < inputs_number; i++)
1790  {
1791  input_index = input_indices[i];
1792 
1793  input_variable = data.arrange_column(input_index);
1794 
1795  for(size_t j = 0; j < targets_number; j++)
1796  {
1797  target_index = target_indices[j];
1798 
1799  target_variable = data.arrange_column(target_index);
1800 
1801 // linear_correlations(i,j) = input_variable.calculate_linear_correlation_missing_values(target_variable, missing_indices[target_index]);
1802  linear_correlations(i,j) = input_variable.calculate_linear_correlation(target_variable);
1803  }
1804  }
1805 
1806  return(linear_correlations);
1807 }
1808 
1809 
1810 // void scale_data_mean_standard_deviation(const Vector< Statistics<double> >&) const method
1811 
/// Scales the whole data matrix by calling data.scale_mean_standard_deviation(data_statistics).
/// In debug builds (NDEBUG not defined) a std::logic_error is thrown when the size of
/// data_statistics differs from the number of data columns.
/// When display is set, a warning is printed for every variable whose standard
/// deviation is below 1.0e-99; the loop only warns, it does not alter the statistics.
/// NOTE(review): the signature is not visible in this listing - data_statistics is
/// presumably the method argument; confirm against the header.
1816 
1818 {
1819  // Control sentence (if debug)
1820 
1821  #ifndef NDEBUG
1822 
1823  std::ostringstream buffer;
1824 
1825  const size_t columns_number = data.get_columns_number();
1826 
1827  const size_t statistics_size = data_statistics.size();
1828 
1829  if(statistics_size != columns_number)
1830  {
1831  buffer << "OpenNN Exception: DataSet class.\n"
1832  << "void scale_data_mean_standard_deviation(const Vector< Statistics<double> >&) method.\n"
1833  << "Size of statistics must be equal to number of columns.\n";
1834 
1835  throw std::logic_error(buffer.str());
1836  }
1837 
1838  #endif
1839 
1840  const size_t variables_number = variables.get_variables_number();
1841 
      // Warn about variables with (numerically) zero standard deviation
1842  for(size_t i = 0; i < variables_number; i++)
1843  {
1844  if(display && data_statistics[i].standard_deviation < 1.0e-99)
1845  {
1846  std::cout << "OpenNN Warning: DataSet class.\n"
1847  << "void scale_data_mean_standard_deviation(const Vector< Statistics<Type> >&) method.\n"
1848  << "Standard deviation of variable " << i << " is zero.\n"
1849  << "That variable won't be scaled.\n";
1850  }
1851  }
1852 
1853  data.scale_mean_standard_deviation(data_statistics);
1854 }
1855 
1856 
1857 // Vector< Statistics<double> > scale_data_minimum_maximum(void) method
1858 
/// Computes the data statistics with calculate_data_statistics(), scales the data with
/// scale_data_minimum_maximum(data_statistics) and returns the statistics that were used.
/// NOTE(review): the signature is not visible in this listing.
1862 
1864 {
1865  const Vector< Statistics<double> > data_statistics = calculate_data_statistics();
1866 
1867  scale_data_minimum_maximum(data_statistics);
1868 
1869  return(data_statistics);
1870 }
1871 
1872 
1873 // Vector< Statistics<double> > scale_data_mean_standard_deviation(void) method
1874 
/// Computes the data statistics with calculate_data_statistics(), scales the data with
/// scale_data_mean_standard_deviation(data_statistics) and returns the statistics that were used.
/// NOTE(review): the signature is not visible in this listing.
1878 
1880 {
1881  const Vector< Statistics<double> > data_statistics = calculate_data_statistics();
1882 
1883  scale_data_mean_standard_deviation(data_statistics);
1884 
1885  return(data_statistics);
1886 }
1887 
1888 
1889 // void scale_data_minimum_maximum(const Vector< Statistics<double> >&) method
1890 
/// Scales the whole data matrix by calling data.scale_minimum_maximum(data_statistics).
/// In debug builds (NDEBUG not defined) a std::logic_error is thrown when the size of
/// data_statistics differs from the number of variables.
/// When display is set, a warning is printed for every variable whose range
/// (maximum - minimum) is below 1.0e-99; the loop only warns, it does not alter the statistics.
/// NOTE(review): the signature is not visible in this listing - data_statistics is
/// presumably the method argument; confirm against the header.
1895 
1897 {
1898  const size_t variables_number = variables.get_variables_number();
1899 
1900  // Control sentence (if debug)
1901 
1902  #ifndef NDEBUG
1903 
1904  std::ostringstream buffer;
1905 
1906  const size_t statistics_size = data_statistics.size();
1907 
1908  if(statistics_size != variables_number)
1909  {
1910  buffer << "OpenNN Exception: DataSet class.\n"
1911  << "void scale_data_minimum_maximum(const Vector< Statistics<double> >&) method.\n"
1912  << "Size of data statistics must be equal to number of variables.\n";
1913 
1914  throw std::logic_error(buffer.str());
1915  }
1916 
1917  #endif
1918 
      // Warn about variables with (numerically) zero range
1919  for(size_t i = 0; i < variables_number; i++)
1920  {
1921  if(display && data_statistics[i].maximum-data_statistics[i].minimum < 1.0e-99)
1922  {
1923  std::cout << "OpenNN Warning: DataSet class.\n"
1924  << "void scale_data_minimum_maximum(const Vector< Statistics<Type> >&) method.\n"
1925  << "Range of variable " << i << " is zero.\n"
1926  << "That variable won't be scaled.\n";
1927  }
1928  }
1929 
1930 
1931  data.scale_minimum_maximum(data_statistics);
1932 }
1933 
1934 
1935 // void scale_data(const std::string&, const Vector< Statistics<double> >&) method
1936 
1943 
1944 void DataSet::scale_data(const std::string& scaling_unscaling_method_string, const Vector< Statistics<double> >& data_statistics)
1945 {
1946  switch(get_scaling_unscaling_method(scaling_unscaling_method_string))
1947  {
1948  case MinimumMaximum:
1949  {
1950  scale_data_minimum_maximum(data_statistics);
1951  }
1952  break;
1953 
1954  case MeanStandardDeviation:
1955  {
1956  scale_data_mean_standard_deviation(data_statistics);
1957  }
1958  break;
1959 
1960  default:
1961  {
1962  std::ostringstream buffer;
1963 
1964  buffer << "OpenNN Exception: DataSet class\n"
1965  << "void scale_data(const std::string&, const Vector< Vector<double> >&) method.\n"
1966  << "Unknown data scaling and unscaling method.\n";
1967 
1968  throw std::logic_error(buffer.str());
1969  }
1970  break;
1971  }
1972 }
1973 
1974 
1975 // Vector< Statistics<double> > scale_data(void) method
1976 
1979 
1980 Vector< Statistics<double> > DataSet::scale_data(const std::string& scaling_unscaling_method)
1981 {
1982  const Vector< Statistics<double> > statistics = data.calculate_statistics();
1983 
1984  switch(get_scaling_unscaling_method(scaling_unscaling_method))
1985  {
1986  case MinimumMaximum:
1987  {
1988  scale_data_minimum_maximum(statistics);
1989  }
1990  break;
1991 
1992  case MeanStandardDeviation:
1993  {
1995  }
1996  break;
1997 
1998  default:
1999  {
2000  std::ostringstream buffer;
2001 
2002  buffer << "OpenNN Exception: DataSet class\n"
2003  << "Vector< Statistics<double> > scale_data(const std::string&) method.\n"
2004  << "Unknown scaling and unscaling method.\n";
2005 
2006  throw std::logic_error(buffer.str());
2007  }
2008  break;
2009  }
2010 
2011  return(statistics);
2012 }
2013 
2014 
2015 // void scale_inputs_mean_standard_deviation(const Vector< Statistics<double> >&) method
2016 
/// Calls data.scale_columns_mean_standard_deviation() with inputs_statistics and the column
/// indices returned by variables.arrange_inputs_indices().
/// NOTE(review): the signature is not visible in this listing - inputs_statistics is
/// presumably the method argument; confirm against the header.
2021 
2023 {
2024  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
2025 
2026  data.scale_columns_mean_standard_deviation(inputs_statistics, inputs_indices);
2027 }
2028 
2029 
2030 // Vector< Statistics<double> > scale_inputs_mean_standard_deviation(void) method
2031 
/// Computes the input statistics with calculate_inputs_statistics(), passes them to
/// scale_inputs_mean_standard_deviation(inputs_statistics) and returns them.
/// In debug builds (NDEBUG not defined) a std::logic_error is thrown when data.empty() is true.
/// NOTE(review): the signature is not visible in this listing.
2035 
2037 {
2038  // Control sentence (if debug)
2039 
2040  #ifndef NDEBUG
2041 
2042  if(data.empty())
2043  {
2044  std::ostringstream buffer;
2045 
2046  buffer << "OpenNN Exception: DataSet class.\n"
2047  << "Vector< Statistics<double> > scale_inputs_mean_standard_deviation(void) method.\n"
2048  << "Data file is not loaded.\n";
2049 
2050  throw std::logic_error(buffer.str());
2051  }
2052 
2053  #endif
2054 
2055  const Vector< Statistics<double> > inputs_statistics = calculate_inputs_statistics();
2056 
2057  scale_inputs_mean_standard_deviation(inputs_statistics);
2058 
2059  return(inputs_statistics);
2060 }
2061 
2062 
2063 // void scale_inputs_minimum_maximum(const Vector< Statistics<double> >&) method
2064 
/// Calls data.scale_columns_minimum_maximum() with inputs_statistics and the column
/// indices returned by variables.arrange_inputs_indices().
/// NOTE(review): the signature is not visible in this listing - inputs_statistics is
/// presumably the method argument; confirm against the header.
2069 
2071 {
2072  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
2073 
2074  data.scale_columns_minimum_maximum(inputs_statistics, inputs_indices);
2075 }
2076 
2077 
2078 // Vector< Statistics<double> > scale_inputs_minimum_maximum(void) method
2079 
/// Computes the input statistics with calculate_inputs_statistics(), passes them to
/// scale_inputs_minimum_maximum(inputs_statistics) and returns them.
/// In debug builds (NDEBUG not defined) a std::logic_error is thrown when data.empty() is true.
/// NOTE(review): the signature is not visible in this listing.
2083 
2085 {
2086  // Control sentence (if debug)
2087 
2088  #ifndef NDEBUG
2089 
2090  if(data.empty())
2091  {
2092  std::ostringstream buffer;
2093 
2094  buffer << "OpenNN Exception: DataSet class.\n"
2095  << "Vector< Statistics<double> > scale_inputs_minimum_maximum(void) method.\n"
2096  << "Data file is not loaded.\n";
2097 
2098  throw std::logic_error(buffer.str());
2099  }
2100 
2101  #endif
2102 
2103  const Vector< Statistics<double> > inputs_statistics = calculate_inputs_statistics();
2104 
2105  scale_inputs_minimum_maximum(inputs_statistics);
2106 
2107  return(inputs_statistics);
2108 }
2109 
2110 
2111 // Vector< Vector<double> > scale_inputs(const std::string&) method
2112 
2117 
2118 Vector< Statistics<double> > DataSet::scale_inputs(const std::string& scaling_unscaling_method)
2119 {
2120  switch(get_scaling_unscaling_method(scaling_unscaling_method))
2121  {
2122  case MinimumMaximum:
2123  {
2124  return(scale_inputs_minimum_maximum());
2125  }
2126  break;
2127 
2128  case MeanStandardDeviation:
2129  {
2131  }
2132  break;
2133 
2134  default:
2135  {
2136  std::ostringstream buffer;
2137 
2138  buffer << "OpenNN Exception: DataSet class\n"
2139  << "Vector< Statistics<double> > scale_inputs(void) method.\n"
2140  << "Unknown scaling and unscaling method.\n";
2141 
2142  throw std::logic_error(buffer.str());
2143  }
2144  break;
2145  }
2146 }
2147 
2148 
2149 // void scale_inputs(const std::string&, const Vector< Statistics<double> >&) method
2150 
2154 
2155 void DataSet::scale_inputs(const std::string& scaling_unscaling_method, const Vector< Statistics<double> >& inputs_statistics)
2156 {
2157  switch(get_scaling_unscaling_method(scaling_unscaling_method))
2158  {
2159  case MinimumMaximum:
2160  {
2161  scale_inputs_minimum_maximum(inputs_statistics);
2162  }
2163  break;
2164 
2165  case MeanStandardDeviation:
2166  {
2167  scale_inputs_mean_standard_deviation(inputs_statistics);
2168  }
2169  break;
2170 
2171  default:
2172  {
2173  std::ostringstream buffer;
2174 
2175  buffer << "OpenNN Exception: DataSet class\n"
2176  << "void scale_inputs(const std::string&, const Vector< Statistics<double> >&) method.\n"
2177  << "Unknown scaling and unscaling method.\n";
2178 
2179  throw std::logic_error(buffer.str());
2180  }
2181  break;
2182  }
2183 }
2184 
2185 
2186 // void scale_targets_mean_standard_deviation(const Vector< Statistics<double> >&)
2187 
/// Calls data.scale_columns_mean_standard_deviation() with targets_statistics and the column
/// indices returned by variables.arrange_targets_indices().
/// NOTE(review): the signature is not visible in this listing - targets_statistics is
/// presumably the method argument; confirm against the header.
2192 
2194 {
2195  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
2196 
2197  data.scale_columns_mean_standard_deviation(targets_statistics, targets_indices);
2198 }
2199 
2200 
2201 // Vector< Statistics<double> > scale_targets_mean_standard_deviation(void) method
2202 
/// Computes the target statistics with calculate_targets_statistics(), passes them to
/// scale_targets_mean_standard_deviation(targets_statistics) and returns them.
/// In debug builds (NDEBUG not defined) a std::logic_error is thrown when data.empty() is true.
/// NOTE(review): the signature is not visible in this listing.
2206 
2208 {
2209  // Control sentence (if debug)
2210 
2211  #ifndef NDEBUG
2212 
2213  if(data.empty())
2214  {
2215  std::ostringstream buffer;
2216 
2217  buffer << "OpenNN Exception: DataSet class.\n"
2218  << "Vector< Statistics<double> > scale_targets_mean_standard_deviation(void) method.\n"
2219  << "Data file is not loaded.\n";
2220 
2221  throw std::logic_error(buffer.str());
2222  }
2223 
2224  #endif
2225 
2226  const Vector< Statistics<double> > targets_statistics = calculate_targets_statistics();
2227 
2228  scale_targets_mean_standard_deviation(targets_statistics);
2229 
2230  return(targets_statistics);
2231 }
2232 
2233 
2234 // void scale_targets_minimum_maximum(const Vector< Statistics<double> >&) method
2235 
/// Calls data.scale_columns_minimum_maximum() with targets_statistics and the column
/// indices returned by variables.arrange_targets_indices().
/// In debug builds (NDEBUG not defined) a std::logic_error is thrown when data.empty() is true.
/// NOTE(review): the signature is not visible in this listing - targets_statistics is
/// presumably the method argument. The debug message below names the (void) overload
/// rather than this one; confirm and correct the message text if so.
2240 
2242 {
2243  // Control sentence (if debug)
2244 
2245  #ifndef NDEBUG
2246 
2247  if(data.empty())
2248  {
2249  std::ostringstream buffer;
2250 
2251  buffer << "OpenNN Exception: DataSet class.\n"
2252  << "Vector< Statistics<double> > scale_targets_minimum_maximum(void) method.\n"
2253  << "Data file is not loaded.\n";
2254 
2255  throw std::logic_error(buffer.str());
2256  }
2257 
2258  #endif
2259 
2260  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
2261 
2262  data.scale_columns_minimum_maximum(targets_statistics, targets_indices);
2263 }
2264 
2265 
2266 // Vector< Statistics<double> > scale_targets_minimum_maximum(void) method
2267 
/// Computes the target statistics with calculate_targets_statistics(), passes them to
/// scale_targets_minimum_maximum(targets_statistics) and returns them.
/// NOTE(review): the signature is not visible in this listing.
2271 
2273 {
2274  const Vector< Statistics<double> > targets_statistics = calculate_targets_statistics();
2275 
2276  scale_targets_minimum_maximum(targets_statistics);
2277 
2278  return(targets_statistics);
2279 }
2280 
2281 
2282 // Vector< Statistics<double> > scale_targets(const std::string&) method
2283 
2288 
2289 Vector< Statistics<double> > DataSet::scale_targets(const std::string& scaling_unscaling_method)
2290 {
2291  switch(get_scaling_unscaling_method(scaling_unscaling_method))
2292  {
2293  case MinimumMaximum:
2294  {
2296  }
2297  break;
2298 
2299  case MeanStandardDeviation:
2300  {
2302  }
2303  break;
2304 
2305  default:
2306  {
2307  std::ostringstream buffer;
2308 
2309  buffer << "OpenNN Exception: DataSet class\n"
2310  << "Vector< Statistics<double> > scale_targets(const std::string&) method.\n"
2311  << "Unknown scaling and unscaling method.\n";
2312 
2313  throw std::logic_error(buffer.str());
2314  }
2315  break;
2316  }
2317 }
2318 
2319 
2320 // void scale_targets(const std::string&, const Vector< Statistics<double> >&) method
2321 
2324 
2325 void DataSet::scale_targets(const std::string& scaling_unscaling_method, const Vector< Statistics<double> >& targets_statistics)
2326 {
2327  switch(get_scaling_unscaling_method(scaling_unscaling_method))
2328  {
2329  case MinimumMaximum:
2330  {
2331  scale_targets_minimum_maximum(targets_statistics);
2332  }
2333  break;
2334 
2335  case MeanStandardDeviation:
2336  {
2337  scale_targets_mean_standard_deviation(targets_statistics);
2338  }
2339  break;
2340 
2341  default:
2342  {
2343  std::ostringstream buffer;
2344 
2345  buffer << "OpenNN Exception: DataSet class\n"
2346  << "void scale_targets(const std::string&, const Vector< Statistics<double> >&) method.\n"
2347  << "Unknown scaling and unscaling method.\n";
2348 
2349  throw std::logic_error(buffer.str());
2350  }
2351  break;
2352  }
2353 }
2354 
2355 
2356 // void unscale_data_mean_standard_deviation(const Vector< Statistics<double> >&) method
2357 
/// Forwards data_statistics to data.unscale_mean_standard_deviation().
/// NOTE(review): the signature is not visible in this listing - data_statistics is
/// presumably the method argument; confirm against the header.
2362 
2364 {
2365  data.unscale_mean_standard_deviation(data_statistics);
2366 }
2367 
2368 
2369 // void unscale_data_minimum_maximum(const Vector< Statistics<double> >&) method
2370 
/// Forwards data_statistics to data.unscale_minimum_maximum().
/// NOTE(review): the signature is not visible in this listing - data_statistics is
/// presumably the method argument; confirm against the header.
2375 
2377 {
2378  data.unscale_minimum_maximum(data_statistics);
2379 }
2380 
2381 
2382 // void unscale_inputs_mean_standard_deviation(const Vector< Statistics<double> >&) method
2383 
/// Calls data.unscale_columns_mean_standard_deviation() with data_statistics and the column
/// indices returned by variables.arrange_inputs_indices().
/// NOTE(review): the signature is not visible in this listing - data_statistics is
/// presumably the method argument; confirm against the header.
2388 
2390 {
2391  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
2392 
2393  data.unscale_columns_mean_standard_deviation(data_statistics, inputs_indices);
2394 }
2395 
2396 
2397 // void unscale_inputs_minimum_maximum(const Vector< Statistics<double> >&) method
2398 
/// Calls data.unscale_columns_minimum_maximum() with data_statistics and the column
/// indices returned by variables.arrange_inputs_indices().
/// NOTE(review): the signature is not visible in this listing - data_statistics is
/// presumably the method argument; confirm against the header.
2403 
2405 {
2406  const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
2407 
2408  data.unscale_columns_minimum_maximum(data_statistics, inputs_indices);
2409 }
2410 
2411 
2412 // void unscale_targets_mean_standard_deviation(const Vector< Statistics<double> >&) method
2413 
/// Calls data.unscale_columns_mean_standard_deviation() with data_statistics and the column
/// indices returned by variables.arrange_targets_indices().
/// NOTE(review): the signature is not visible in this listing - data_statistics is
/// presumably the method argument; confirm against the header.
2418 
2420 {
2421  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
2422 
2423  data.unscale_columns_mean_standard_deviation(data_statistics, targets_indices);
2424 }
2425 
2426 
2427 // void unscale_targets_minimum_maximum(const Vector< Statistics<double> >&) method
2428 
/// Calls data.unscale_columns_minimum_maximum() with data_statistics and the column
/// indices returned by variables.arrange_targets_indices().
/// NOTE(review): the signature is not visible in this listing - data_statistics is
/// presumably the method argument; confirm against the header.
2433 
2435 {
2436  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
2437 
2438  data.unscale_columns_minimum_maximum(data_statistics, targets_indices);
2439 }
2440 
2441 
2442 // void initialize_data(const double& value) method
2443 
2446 
2447 void DataSet::initialize_data(const double& new_value)
2448 {
2449  data.initialize(new_value);
2450 }
2451 
2452 
2453 // void randomize_data_uniform(const double&, const double&) method
2454 
2457 
2458 void DataSet::randomize_data_uniform(const double& minimum, const double& maximum)
2459 {
2460  data.randomize_uniform(minimum, maximum);
2461 }
2462 
2463 
2464 // void randomize_data_normal(const double&, const double&) method
2465 
2468 
2469 void DataSet::randomize_data_normal(const double& mean, const double& standard_deviation)
2470 {
2471  data.randomize_normal(mean, standard_deviation);
2472 }
2473 
2474 
2475 // tinyxml2::XMLDocument* to_XML(void) const method
2476 
2478 
2479 tinyxml2::XMLDocument* DataSet::to_XML(void) const
2480 {
2481  tinyxml2::XMLDocument* document = new tinyxml2::XMLDocument;
2482 
2483  std::ostringstream buffer;
2484 
2485  // Data set
2486 
2487  tinyxml2::XMLElement* data_set_element = document->NewElement("DataSet");
2488  document->InsertFirstChild(data_set_element);
2489 
2490  tinyxml2::XMLElement* element = NULL;
2491  tinyxml2::XMLText* text = NULL;
2492 
2493  // Data file
2494 
2495  tinyxml2::XMLElement* data_file_element = document->NewElement("DataFile");
2496 
2497  data_set_element->InsertFirstChild(data_file_element);
2498 
2499  // Data file name
2500  {
2501  element = document->NewElement("DataFileName");
2502  data_file_element->LinkEndChild(element);
2503 
2504  text = document->NewText(data_file_name.c_str());
2505  element->LinkEndChild(text);
2506  }
2507 
2508  // Header line
2509  {
2510  element = document->NewElement("HeaderLine");
2511  data_file_element->LinkEndChild(element);
2512 
2513  buffer.str("");
2514  buffer << header_line;
2515 
2516  text = document->NewText(buffer.str().c_str());
2517  element->LinkEndChild(text);
2518  }
2519 
2520  // Separator
2521  {
2522  element = document->NewElement("Separator");
2523  data_file_element->LinkEndChild(element);
2524 
2525  text = document->NewText(write_separator().c_str());
2526  element->LinkEndChild(text);
2527  }
2528 
2529  // Missing values label
2530  {
2531  element = document->NewElement("MissingValuesLabel");
2532  data_file_element->LinkEndChild(element);
2533 
2534  text = document->NewText(missing_values_label.c_str());
2535  element->LinkEndChild(text);
2536  }
2537 
2538  // Variables
2539  {
2540  element = document->NewElement("Variables");
2541  data_set_element->LinkEndChild(element);
2542 
2543  const tinyxml2::XMLDocument* variables_document = variables.to_XML();
2544 
2545  const tinyxml2::XMLElement* variables_element = variables_document->FirstChildElement("Variables");
2546 
2547  DeepClone(element, variables_element, document, NULL);
2548 
2549  delete variables_document;
2550  }
2551 
2552  // Instances
2553  {
2554  element = document->NewElement("Instances");
2555  data_set_element->LinkEndChild(element);
2556 
2557  const tinyxml2::XMLDocument* instances_document = instances.to_XML();
2558 
2559  const tinyxml2::XMLElement* instances_element = instances_document->FirstChildElement("Instances");
2560 
2561  DeepClone(element, instances_element, document, NULL);
2562 
2563  delete instances_document;
2564  }
2565 
2566  // Missing values
2567  {
2568  element = document->NewElement("MissingValues");
2569  data_set_element->LinkEndChild(element);
2570 
2571  const tinyxml2::XMLDocument* missing_values_document = missing_values.to_XML();
2572 
2573  const tinyxml2::XMLElement* missing_values_element = missing_values_document->FirstChildElement("MissingValues");
2574 
2575  DeepClone(element, missing_values_element, document, NULL);
2576 
2577  delete missing_values_document;
2578  }
2579 
2580  // Display
2581  {
2582  element = document->NewElement("Display");
2583  data_set_element->LinkEndChild(element);
2584 
2585  buffer.str("");
2586  buffer << display;
2587 
2588  text = document->NewText(buffer.str().c_str());
2589  element->LinkEndChild(text);
2590  }
2591 
2592  return(document);
2593 }
2594 
2595 
2596 // void from_XML(const tinyxml2::XMLDocument&) method
2597 
2600 
2601 void DataSet::from_XML(const tinyxml2::XMLDocument& data_set_document)
2602 {
2603 
2604  std::ostringstream buffer;
2605 
2606  // Data set element
2607 
2608  const tinyxml2::XMLElement* data_set_element = data_set_document.FirstChildElement("DataSet");
2609 
2610  if(!data_set_element)
2611  {
2612  buffer << "OpenNN Exception: DataSet class.\n"
2613  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
2614  << "Data set element is NULL.\n";
2615 
2616  throw std::logic_error(buffer.str());
2617  }
2618 
2619  // Data file
2620 
2621  const tinyxml2::XMLElement* data_file_element = data_set_element->FirstChildElement("DataFile");
2622 
2623  if(!data_file_element)
2624  {
2625  buffer << "OpenNN Exception: DataSet class.\n"
2626  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
2627  << "Data file element is NULL.\n";
2628 
2629  throw std::logic_error(buffer.str());
2630  }
2631 
2632 
2633  // Data file name
2634  {
2635  const tinyxml2::XMLElement* data_file_name_element = data_file_element->FirstChildElement("DataFileName");
2636 
2637  if(!data_file_name_element)
2638  {
2639  buffer << "OpenNN Exception: DataSet class.\n"
2640  << "void from_XML(const tinyxml2::XMLDocument&) method.\n"
2641  << "Data file name element is NULL.\n";
2642 
2643  throw std::logic_error(buffer.str());
2644  }
2645 
2646  if(data_file_name_element->GetText())
2647  {
2648  const std::string new_data_file_name = data_file_name_element->GetText();
2649 
2650  set_data_file_name(new_data_file_name);
2651  }
2652  }
2653 
2654  // Header line
2655  {
2656  const tinyxml2::XMLElement* header_element = data_file_element->FirstChildElement("HeaderLine");
2657 
2658  if(header_element)
2659  {
2660  const std::string new_header_string = header_element->GetText();
2661 
2662  try
2663  {
2664  set_header_line(new_header_string != "0");
2665  }
2666  catch(const std::logic_error& e)
2667  {
2668  std::cout << e.what() << std::endl;
2669  }
2670  }
2671  }
2672 
2673  // Separator
2674  {
2675  const tinyxml2::XMLElement* separator_element = data_file_element->FirstChildElement("Separator");
2676 
2677  if(separator_element)
2678  {
2679  if(separator_element->GetText())
2680  {
2681  const std::string new_separator = separator_element->GetText();
2682 
2683  set_separator(new_separator);
2684  }
2685  else
2686  {
2687  set_separator("Space");
2688  }
2689  }
2690  else
2691  {
2692  set_separator("Space");
2693  }
2694  }
2695 
2696  // Missing values label
2697  {
2698  const tinyxml2::XMLElement* missing_values_label_element = data_file_element->FirstChildElement("MissingValuesLabel");
2699 
2700  if(missing_values_label_element)
2701  {
2702  if(missing_values_label_element->GetText())
2703  {
2704  const std::string new_missing_values_label = missing_values_label_element->GetText();
2705 
2706  set_missing_values_label(new_missing_values_label);
2707  }
2708  }
2709  }
2710 
2711  // Variables
2712  {
2713  const tinyxml2::XMLElement* variables_element = data_set_element->FirstChildElement("Variables");
2714 
2715  if(variables_element)
2716  {
2717  tinyxml2::XMLDocument variables_document;
2718 
2719  tinyxml2::XMLElement* variables_element_clone = variables_document.NewElement("Variables");
2720  variables_document.InsertFirstChild(variables_element_clone);
2721 
2722  DeepClone(variables_element_clone, variables_element, &variables_document, NULL);
2723 
2724  variables.from_XML(variables_document);
2725  }
2726  }
2727 
2728  // Instances
2729  {
2730  const tinyxml2::XMLElement* instances_element = data_set_element->FirstChildElement("Instances");
2731 
2732  if(instances_element)
2733  {
2734  tinyxml2::XMLDocument instances_document;
2735 
2736  tinyxml2::XMLElement* instances_element_clone = instances_document.NewElement("Instances");
2737  instances_document.InsertFirstChild(instances_element_clone);
2738 
2739  DeepClone(instances_element_clone, instances_element, &instances_document, NULL);
2740 
2741  instances.from_XML(instances_document);
2742  }
2743  }
2744 
2745  // Missing values
2746  {
2747  const tinyxml2::XMLElement* missing_values_element = data_set_element->FirstChildElement("MissingValues");
2748 
2749  if(missing_values_element)
2750  {
2751  tinyxml2::XMLDocument missing_values_document;
2752 
2753  tinyxml2::XMLElement* missing_values_element_clone = missing_values_document.NewElement("MissingValues");
2754  missing_values_document.InsertFirstChild(missing_values_element_clone);
2755 
2756  DeepClone(missing_values_element_clone, missing_values_element, &missing_values_document, NULL);
2757 
2758  missing_values.from_XML(missing_values_document);
2759  }
2760  }
2761 
2762  // Display
2763  {
2764  const tinyxml2::XMLElement* display_element = data_set_element->FirstChildElement("Display");
2765 
2766  if(display_element)
2767  {
2768  const std::string new_display_string = display_element->GetText();
2769 
2770  try
2771  {
2772  set_display(new_display_string != "0");
2773  }
2774  catch(const std::logic_error& e)
2775  {
2776  std::cout << e.what() << std::endl;
2777  }
2778  }
2779  }
2780 
2781 }
2782 
2783 
2784 // std::string to_string(void) const method
2785 
2787 
2788 std::string DataSet::to_string(void) const
2789 {
2790  std::ostringstream buffer;
2791 
2792  buffer << "Data set object\n"
2793  << "Data file name: " << data_file_name << "\n"
2794  << "Header line: " << header_line << "\n"
2795  << "Separator: " << separator << "\n"
2796  << "Missing values label: " << missing_values_label << "\n"
2797  << "Data:\n" << data << "\n"
2798  << "Display: " << display << "\n"
2799  << variables.to_string()
2800  << instances.to_string()
2802 
2803  return(buffer.str());
2804 }
2805 
2806 
2807 // void print(void) const method
2808 
2810 
2811 void DataSet::print(void) const
2812 {
2813  if(display)
2814  {
2815  std::cout << to_string();
2816  }
2817 }
2818 
2819 
2820 // void print_summary(void) const method
2821 
2823 
2824 void DataSet::print_summary(void) const
2825 {
2826  if(display)
2827  {
2828  const size_t variables_number = variables.get_variables_number();
2829  const size_t instances_number = instances.get_instances_number();
2830  const size_t missing_values_number = missing_values.get_missing_values_number();
2831 
2832  std::cout << "Data set object summary:\n"
2833  << "Number of variables: " << variables_number << "\n"
2834  << "Number of instances: " << instances_number << "\n"
2835  << "Number of missing values: " << missing_values_number << std::endl;
2836  }
2837 }
2838 
2839 
2840 // void save(const std::string&) const method
2841 
2844 
2845 void DataSet::save(const std::string& file_name) const
2846 {
2847  tinyxml2::XMLDocument* document = to_XML();
2848 
2849  document->SaveFile(file_name.c_str());
2850 
2851  delete document;
2852 }
2853 
2854 
2855 // void load(const std::string&) method
2856 
2879 
2880 void DataSet::load(const std::string& file_name)
2881 {
2882  tinyxml2::XMLDocument document;
2883 
2884  if(document.LoadFile(file_name.c_str()))
2885  {
2886  std::ostringstream buffer;
2887 
2888  buffer << "OpenNN Exception: DataSet class.\n"
2889  << "void load(const std::string&) method.\n"
2890  << "Cannot load XML file " << file_name << ".\n";
2891 
2892  throw std::logic_error(buffer.str());
2893  }
2894 
2895  from_XML(document);
2896 }
2897 
2898 
2899 // void print_data(void) const method
2900 
2902 
2903 void DataSet::print_data(void) const
2904 {
2905  if(display)
2906  {
2907  std::cout << data << std::endl;
2908  }
2909 }
2910 
2911 
2912 // void print_data_preview(void) const method
2913 
/// When display is set, prints rows of the data matrix to standard output: row 0 if there is
/// at least one instance, row 1 if there are at least two, and the last row
/// (index instances_number-1) if there are at least three.
/// NOTE(review): the signature is not visible in this listing.
2916 
2918 {
2919  if(display)
2920  {
2921  const size_t instances_number = instances.get_instances_number();
2922 
2923  if(instances_number > 0)
2924  {
2925  const Vector<double> first_instance = data.arrange_row(0);
2926 
2927  std::cout << "First instance:\n"
2928  << first_instance << std::endl;
2929  }
2930 
2931  if(instances_number > 1)
2932  {
2933  const Vector<double> second_instance = data.arrange_row(1);
2934 
2935  std::cout << "Second instance:\n"
2936  << second_instance << std::endl;
2937  }
2938 
2939  if(instances_number > 2)
2940  {
2941  const Vector<double> last_instance = data.arrange_row(instances_number-1);
2942 
      // The label is 1-based (instances_number) while the row index is 0-based
2943  std::cout << "Instance " << instances_number << ":\n"
2944  << last_instance << std::endl;
2945  }
2946  }
2947 }
2948 
2949 
2950 // void save_data(void) const method
2951 
2953 
2954 void DataSet::save_data(void) const
2955 {
2956  std::ofstream file(data_file_name.c_str());
2957 
2958  if(!file.is_open())
2959  {
2960  std::ostringstream buffer;
2961 
2962  buffer << "OpenNN Exception: DataSet class.\n"
2963  << "void save_data(void) const method.\n"
2964  << "Cannot open data file.\n";
2965 
2966  throw std::logic_error(buffer.str());
2967  }
2968 
2969  if(header_line)
2970  {
2971  const Vector<std::string> variables_name = variables.arrange_names();
2972 
2973  file << variables_name << std::endl;
2974  }
2975 
2976  // Write data
2977 
2978  file << data;
2979 
2980  // Close file
2981 
2982  file.close();
2983 }
2984 
2985 
2986 // size_t get_column_index(const Vector< Vector<std::string> >&, const size_t) const method
2987 
2991 
2992 size_t DataSet::get_column_index(const Vector< Vector<std::string> >& nominal_labels, const size_t column_index) const
2993 {
2994  size_t variable_index = 0;
2995 
2996  for(size_t i = 0; i < column_index; i++)
2997  {
2998  if(nominal_labels[i].size() <= 2)
2999  {
3000  variable_index++;
3001  }
3002  else
3003  {
3004  variable_index += nominal_labels[i].size();
3005  }
3006  }
3007 
3008  return variable_index;
3009 }
3010 
3011 
3012 // void check_separator(const std::string&) method
3013 
3017 
3018 void DataSet::check_separator(const std::string& line) const
3019 {
3020  if(line.empty())
3021  {
3022  return;
3023  }
3024 
3025  const std::string separator_string = get_separator_string();
3026 
3027  if(line.find(separator_string) == std::string::npos)
3028  {
3029  std::ostringstream buffer;
3030 
3031  buffer << "OpenNN Exception: DataSet class.\n"
3032  << "void check_separator(const std::string&) method.\n"
3033  << "Separator '" << write_separator() << "' not found in data file " << data_file_name << ".\n";
3034 
3035  throw std::logic_error(buffer.str());
3036  }
3037 }
3038 
3039 
3040 // size_t count_data_file_columns_number(void) const method
3041 
/// Reads the file named by data_file_name line by line and returns the token count of a line.
/// Tabs are replaced by spaces unless the separator is Tab, each line is trimmed, and empty
/// lines are skipped. Every remaining line is passed to check_separator() and then to
/// count_tokens(); columns_number is overwritten each time, so the value returned is that of
/// the last non-empty line (0 when there is none).
/// NOTE(review): the signature is not visible in this listing.
3044 
3046 {
3047  std::ifstream file(data_file_name.c_str());
3048 
3049  std::string line;
3050 
3051  size_t columns_number = 0;
3052 
3053  while(file.good())
3054  {
3055  getline(file, line);
3056 
3057  if(separator != Tab)
3058  {
3059  std::replace(line.begin(), line.end(), '\t', ' ');
3060  }
3061 
3062  trim(line);
3063 
3064  if(line.empty())
3065  {
3066  continue;
3067  }
3068 
3069  check_separator(line);
3070 
3071  columns_number = count_tokens(line);
3072  }
3073 
3074  file.close();
3075 
3076  return columns_number;
3077 
3078 }
3079 
3080 
3081 // void check_header_line(void) method
3082 
3087 
3089 {
3090  std::ifstream file(data_file_name.c_str());
3091 
3092  std::string line;
3093  Vector<std::string> tokens;
3094 
3095  while(file.good())
3096  {
3097  getline(file, line);
3098 
3099  if(separator != Tab)
3100  {
3101  std::replace(line.begin(), line.end(), '\t', ' ');
3102  }
3103 
3104  trim(line);
3105 
3106  if(line.empty())
3107  {
3108  continue;
3109  }
3110 
3111  break;
3112  }
3113 
3114  file.close();
3115 
3116  check_separator(line);
3117 
3118  tokens = get_tokens(line);
3119 
3120  if(header_line && is_not_numeric(tokens))
3121  {
3122  return;
3123  }
3124  if(header_line && is_numeric(tokens))
3125  {
3126  if(display)
3127  {
3128  std::cout << "OpenNN Warning: DataSet class.\n"
3129  << "void check_header_line(void) method.\n"
3130  << "First line of data file interpreted as not header.\n";
3131  }
3132 
3133  header_line = false;
3134  }
3135  else if(header_line && is_mixed(tokens))
3136  {
3137  std::ostringstream buffer;
3138 
3139  buffer << "OpenNN Exception: DataSet class.\n"
3140  << "void check_header_line(void) method.\n"
3141  << "Header line contains numeric values: \n"
3142  << line << "\n";
3143 
3144  throw std::logic_error(buffer.str());
3145  }
3146  else if(!header_line && is_not_numeric(tokens))
3147  {
3148  if(display)
3149  {
3150  std::cout << "OpenNN Warning: DataSet class.\n"
3151  << "void check_header_line(void) method.\n"
3152  << "First line of data file interpreted as header.\n";
3153  }
3154 
3155  header_line = true;
3156  }
3157 }
3158 
3159 
3160 // Vector<std::string> read_header_line(void) const method
3161 
3163 
3165 {
3167 
3168  std::string line;
3169 
3170  std::ifstream file(data_file_name.c_str());
3171 
3172  // First line
3173 
3174  while(file.good())
3175  {
3176  getline(file, line);
3177 
3178  if(separator != Tab)
3179  {
3180  std::replace(line.begin(), line.end(), '\t', ' ');
3181  }
3182 
3183  trim(line);
3184 
3185  if(line.empty())
3186  {
3187  continue;
3188  }
3189 
3190  check_separator(line);
3191 
3192  header_line = get_tokens(line);
3193 
3194  break;
3195  }
3196 
3197  file.close();
3198 
3199  return header_line;
3200 }
3201 
3202 
3203 // void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method
3204 
3209 
3210 void DataSet::read_instance(const std::string& line, const Vector< Vector<std::string> >& nominal_labels, const size_t& instance_index)
3211 {
3212  // Control sentence (if debug)
3213 
3214  #ifndef NDEBUG
3215 
3216  const size_t instances_number = instances.get_instances_number();
3217 
3218  if(instance_index >= instances_number)
3219  {
3220  std::ostringstream buffer;
3221 
3222  buffer << "OpenNN Exception: DataSet class.\n"
3223  << "void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method.\n"
3224  << "Index of instance (" << instance_index << ") must be less than number of instances (" << instances_number << ").\n";
3225 
3226  throw std::logic_error(buffer.str());
3227  }
3228 
3229  #endif
3230 
3231  const Vector<std::string> tokens = get_tokens(line);
3232 
3233  #ifndef NDEBUG
3234 
3235  if(tokens.size() != nominal_labels.size())
3236  {
3237  std::ostringstream buffer;
3238 
3239  buffer << "OpenNN Exception: DataSet class.\n"
3240  << "void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method.\n"
3241  << "Size of tokens (" << tokens.size() << ") must be equal to size of names (" << nominal_labels.size() << ").\n";
3242 
3243  throw std::logic_error(buffer.str());
3244  }
3245 
3246  #endif
3247 
3248  size_t column_index;
3249 
3250  for(size_t j = 0; j < tokens.size(); j++)
3251  {
3252  column_index = get_column_index(nominal_labels, j);
3253 
3254  if(nominal_labels[j].size() == 0) // Numeric variable
3255  {
3256  if(tokens[j] != missing_values_label) // No missing values
3257  {
3258  data(instance_index, column_index) = atof(tokens[j].c_str());
3259  }
3260  else // Missing values
3261  {
3262  data(instance_index, column_index) = -99.9;
3263 
3264  missing_values.append(instance_index, column_index);
3265  }
3266  }
3267 
3268  else if(nominal_labels[j].size() == 2) // Binary variable
3269  {
3270  if(tokens[j] != missing_values_label) // No missing values
3271  {
3272  if(tokens[j] == "false" || tokens[j] == "False"|| tokens[j] == "FALSE"
3273  || tokens[j] == "negative"|| tokens[j] == "Negative"|| tokens[j] == "NEGATIVE")
3274  {
3275  data(instance_index, column_index) = 0.0;
3276  }
3277  else if(tokens[j] == "true" || tokens[j] == "True"|| tokens[j] == "TRUE"
3278  || tokens[j] == "positive"|| tokens[j] == "Positive"|| tokens[j] == "POSITIVE")
3279  {
3280  data(instance_index, column_index) = 1.0;
3281  }
3282  else if(tokens[j] == nominal_labels[j][0])
3283  {
3284  data(instance_index, column_index) = 0.0;
3285  }
3286  else if(tokens[j] == nominal_labels[j][1])
3287  {
3288  data(instance_index, column_index) = 1.0;
3289  }
3290  else
3291  {
3292  std::ostringstream buffer;
3293 
3294  buffer << "OpenNN Exception: DataSet class.\n"
3295  << "void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method.\n"
3296  << "Unknown token binary value.\n";
3297 
3298  throw std::logic_error(buffer.str());
3299  }
3300  }
3301  else // Missing values
3302  {
3303  data(instance_index, column_index) = -99.9;
3304 
3305  missing_values.append(instance_index, column_index);
3306  }
3307  }
3308 
3309  else // Nominal variable
3310  {
3311  if(tokens[j] != missing_values_label)
3312  {
3313  for(size_t k = 0; k < nominal_labels[j].size(); k++)
3314  {
3315  if(tokens[j] == nominal_labels[j][k])
3316  {
3317  data(instance_index, column_index+k) = 1.0;
3318  }
3319  else
3320  {
3321  data(instance_index, column_index+k) = 0.0;
3322  }
3323  }
3324  }
3325  else // Missing values
3326  {
3327  for(size_t k = 0; k < nominal_labels[j].size(); k++)
3328  {
3329  data(instance_index, column_index+k) = -99.9;
3330 
3331  missing_values.append(instance_index, column_index+k);
3332  }
3333  }
3334  }
3335  }
3336 }
3337 
3338 
3339 // Vector< Vector<std::string> > set_from_data_file(void) method
3340 
3343 
3345 {
3346  const size_t columns_number = count_data_file_columns_number();
3347 
3348  Vector< Vector<std::string> > nominal_labels(columns_number);
3349 
3350  std::string line;
3351  Vector<std::string> tokens;
3352 
3353  bool numeric;
3354 
3356 
3357  int instances_count;
3358 
3359  if(header_line)
3360  {
3361  instances_count = -1;
3362  }
3363  else
3364  {
3365  instances_count = 0;
3366  }
3367 
3368  std::ifstream file(data_file_name.c_str());
3369 
3370  // Rest of lines
3371 
3372  while(file.good())
3373  {
3374  getline(file, line);
3375 
3376  if(separator != Tab)
3377  {
3378  std::replace(line.begin(), line.end(), '\t', ' ');
3379  }
3380 
3381  trim(line);
3382 
3383  if(line.empty())
3384  {
3385  continue;
3386  }
3387 
3388  if(header_line && instances_count == -1)
3389  {
3390  instances_count = 0;
3391 
3392  continue;
3393  }
3394 
3395  check_separator(line);
3396 
3397  tokens = get_tokens(line);
3398 
3399  if(tokens.size() != columns_number)
3400  {
3401  std::ostringstream buffer;
3402 
3403  buffer << "OpenNN Exception: DataSet class.\n"
3404  << "Vector< Vector<std::string> > DataSet::set_from_data_file(void).\n"
3405  << "Row " << instances_count << ": Size of tokens (" << tokens.size() << ") is not equal to "
3406  << "number of columns (" << columns_number << ").\n";
3407 
3408  throw std::logic_error(buffer.str());
3409  }
3410 
3411  instances_count++;
3412 
3413  for(size_t j = 0; j < columns_number; j++)
3414  {
3415  numeric = is_numeric(tokens[j]);
3416 
3417  if(!numeric
3418  && tokens[j] != missing_values_label
3419  && !nominal_labels[j].contains(tokens[j]))
3420  {
3421  nominal_labels[j].push_back(tokens[j]);
3422  }
3423  }
3424  }
3425 
3426  file.close();
3427 
3428  size_t variables_count = 0;
3429 
3430  for(size_t i = 0; i < columns_number; i++)
3431  {
3432  if(nominal_labels[i].size() == 0 || nominal_labels[i].size() == 2)
3433  {
3434  variables_count++;
3435  }
3436  else
3437  {
3438  variables_count += nominal_labels[i].size();
3439  }
3440  }
3441 
3442  // Fix label case
3443 
3444  for(size_t i = 0; i < columns_number; i++)
3445  {
3446  if(nominal_labels[i].size() == instances_count)
3447  {
3448  std::ostringstream buffer;
3449 
3450  buffer << "OpenNN Exception: DataSet class.\n"
3451  << "Vector< Vector<std::string> > DataSet::set_from_data_file(void).\n"
3452  << "Column " << i << ": All elements are nominal and different. It contains meaningless data.\n";
3453 
3454  throw std::logic_error(buffer.str());
3455  }
3456  }
3457 
3458  // Set instances and variables number
3459 
3460  if(instances_count == 0 || variables_count == 0)
3461  {
3462  set();
3463 
3464  return(nominal_labels);
3465  }
3466 
3467  data.set(instances_count, variables_count);
3468 
3469  if(variables.get_variables_number() != variables_count)
3470  {
3471  variables.set(variables_count);
3472 
3473  if(nominal_labels[columns_number-1].size() > 2)
3474  {
3475  for(size_t i = variables_count-1; i >= variables_count - nominal_labels[columns_number-1].size(); i--)
3476  {
3477  variables.set_use(i, Variables::Target);
3478  }
3479  }
3480  }
3481 
3482  if(instances.get_instances_number() != instances_count)
3483  {
3484  instances.set(instances_count);
3485  }
3486 
3488 
3489  return(nominal_labels);
3490 }
3491 
3492 
3493 // void read_from_data_file(Vector< Vector<std::string> >&) method
3494 
3496 
3498 {
3499  std::ifstream file(data_file_name.c_str());
3500 
3501  file.clear();
3502  file.seekg(0, std::ios::beg);
3503 
3504  std::string line;
3505 
3506  if(header_line)
3507  {
3508  while(file.good())
3509  {
3510  getline(file, line);
3511 
3512  if(separator != Tab)
3513  {
3514  std::replace(line.begin(), line.end(), '\t', ' ');
3515  }
3516 
3517  trim(line);
3518 
3519  if(line.empty())
3520  {
3521  continue;
3522  }
3523 
3524  break;
3525  }
3526  }
3527 
3528  size_t i = 0;
3529 
3530 // #pragma omp parallel for private(i, line)
3531 
3532  while(file.good())
3533  {
3534  getline(file, line);
3535 
3536  if(separator != Tab)
3537  {
3538  std::replace(line.begin(), line.end(), '\t', ' ');
3539  }
3540 
3541  trim(line);
3542 
3543  if(line.empty())
3544  {
3545  continue;
3546  }
3547 
3548  // #pragma omp task
3549 
3550  read_instance(line, nominal_labels, i);
3551  i++;
3552  }
3553 
3554  file.close();
3555 }
3556 
3557 
3558 // Vector<std::string> arrange_time_series_prediction_names(const Vector<std::string>&) const method
3559 
3562 
3564 {
3565  Vector<std::string> time_series_prediction_names;
3566 /*
3567  Vector< Vector<std::string> > new_names((1+columns_number)*lags_number);
3568 
3569  for(size_t i = 0; i < 1+lags_number; i++)
3570  {
3571  for(size_t j = 0; j < names.size(); j++)
3572  {
3573  new_names[i+j] = names[j];
3574 
3575  if(i != lags_number)
3576  {
3577  for(size_t k = 0; k < names[j].size();k++)
3578  {
3579  new_names[i+j][k].append("_lag_").append(std::string::from_size_t(lags_number-i).c_str());
3580  }
3581  }
3582  }
3583  }
3584 */
3585  return(time_series_prediction_names);
3586 }
3587 
3588 
3589 // Vector<std::string> DataSet::arrange_autoassociation_names(const Vector<std::string>& names) const method
3590 
3593 
3595 {
3596  Vector<std::string> autoassociation_names;
3597 
3598  return(autoassociation_names);
3599 }
3600 
3601 
3602 // void convert_time_series(void) method
3603 
3606 
// Body of the convert_time_series(void) method named in the comment above.
// It returns at once when lags_number is zero.
// NOTE(review): the function header and the statements that followed the early return are not
// present in this listing - restore them from the upstream source before building.
3608 {
3609  if(lags_number == 0)
3610  {
3611  return;
3612  }
3613 
3615 
3617 
3619 
3621 }
3622 
3623 
3624 // void convert_autoassociation(void) method
3625 
3628 
// Body of the convert_autoassociation(void) method named in the comment above.
// NOTE(review): the function header and every statement of the body are not present in this
// listing - restore them from the upstream source before building.
3630 {
3632 
3634 
3636 }
3637 
3638 
3639 // void load_data(void) method
3640 
3642 
// Body of the load_data(void) method named in the comment above.
// Steps visible here:
//  1. Throw std::logic_error if the data file name is empty or the file cannot be opened.
//  2. set_from_data_file() sizes the data set and returns the nominal labels of each column.
//  3. read_from_data_file() fills the data matrix using those labels.
//  4. Column names come from the header line, or are generated as "variable_<i>" otherwise,
//     and are passed to variables.set_names together with the nominal labels.
// NOTE(review): the function header and the statements inside the three trailing if-blocks
// (angular variables, time series, autoassociation) are not present in this listing -
// presumably they call the matching convert_* methods; confirm against the upstream source.
3644 {
3645  if(data_file_name.empty())
3646  {
3647  std::ostringstream buffer;
3648 
3649  buffer << "OpenNN Exception: DataSet class.\n"
3650  << "void load_data(void) method.\n"
3651  << "Data file name has not been set.\n";
3652 
3653  throw std::logic_error(buffer.str());
3654  }
3655 
// The stream is opened only to verify that the file is readable; it is closed right away.
3656  std::ifstream file(data_file_name.c_str());
3657 
3658  if(!file.is_open())
3659  {
3660  std::ostringstream buffer;
3661 
3662  buffer << "OpenNN Exception: DataSet class.\n"
3663  << "void load_data(void) method.\n"
3664  << "Cannot open data file: " << data_file_name << "\n";
3665 
3666  throw std::logic_error(buffer.str());
3667  }
3668 
3669  file.close();
3670 
3671  const Vector< Vector<std::string> > nominal_labels = set_from_data_file();
3672 
3673  read_from_data_file(nominal_labels);
3674 
3675  // Variables name
3676 
3677  Vector<std::string> columns_name;
3678 
3679  if(header_line)
3680  {
3681  columns_name = read_header_line();
3682  }
3683  else
3684  {
// One generated name per file column.
3685  for(unsigned i = 0; i < nominal_labels.size(); i++)
3686  {
3687  std::ostringstream buffer;
3688 
3689  buffer << "variable_" << i;
3690 
3691  columns_name.push_back(buffer.str());
3692  }
3693  }
3694 
3695  variables.set_names(columns_name, nominal_labels);
3696 
3697  // Angular variables
3698 
3699  if(!angular_variables.empty())
3700  {
3702  }
3703 
3704  // Time series
3705 
3706  if(lags_number != 0)
3707  {
3709  }
3710 
3711  // Autoassociation
3712 
3713  if(autoassociation)
3714  {
3716  }
3717 }
3718 
3719 
3720 // void load_time_series_data(void) method
3721 
3723 /*
3724 void DataSet::load_time_series_data(void)
3725 {
3726  if(lags_number <= 0)
3727  {
3728  std::ostringstream buffer;
3729 
3730  buffer << "OpenNN Exception: DataSet class.\n"
3731  << "void load_time_series_data(void) const method.\n"
3732  << "Number of lags (" << lags_number << ") must be greater than zero.\n";
3733 
3734  throw std::logic_error(buffer.str());
3735  }
3736 
3737 
3738  if(header)
3739  {
3740 // Vector<std::string> columns_name;
3741 
3742 // variables.set_names(names);
3743  }
3744 
3745 
3746  const Matrix<double> time_series_data(data_file_name);
3747 
3748  const size_t rows_number = time_series_data.get_rows_number();
3749  const size_t columns_number = time_series_data.get_columns_number();
3750 
3751  const size_t instances_number = rows_number - lags_number;
3752  const size_t variables_number = columns_number*(1 + lags_number);
3753 
3754  set(variables_number, instances_number);
3755 
3756  Vector<double> row(rows_number);
3757 
3758  for(size_t i = 0; i < instances_number; i++)
3759  {
3760  row = time_series_data.arrange_row(i);
3761 
3762  for(size_t j = 1; j <= lags_number; j++)
3763  {
3764  row = row.assemble(time_series_data.arrange_row(i+j));
3765  }
3766 
3767  data.set_row(i, row);
3768  }
3769 
3770  // Variables
3771 
3772  Vector<Variables::Use> uses(variables_number);
3773 
3774  std::fill(uses.begin(), uses.begin()+lags_number*variables_number/(lags_number+1)-1, Variables::Use::Input);
3775  std::fill(uses.begin()+lags_number*variables_number/(lags_number+1), uses.end(), Variables::Use::Target);
3776 
3777  variables.set_uses(uses);
3778 
3779 }
3780 */
3781 
3782 // Vector<size_t> calculate_target_class_distribution(void) const method
3783 
3788 
3790 {
3791  // Control sentence (if debug)
3792 
3793  const size_t instances_number = instances.get_instances_number();
3794  const size_t targets_number = variables.count_targets_number();
3795  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
3796 
3797  Vector<size_t> class_distribution;
3798 
3799  if(targets_number == 1) // Two classes
3800  {
3801  class_distribution.set(2, 0);
3802 
3803  size_t target_index = targets_indices[0];
3804 
3805  for(size_t instance_index = 0; instance_index < instances_number; instance_index++)
3806  {
3807  if(instances.get_use(instance_index) != Instances::Unused)
3808  {
3809  if(data(instance_index,target_index) < 0.5)
3810  {
3811  class_distribution[0]++;
3812  }
3813  else
3814  {
3815  class_distribution[1]++;
3816  }
3817  }
3818  }
3819  }
3820  else // More than two classes
3821  {
3822  class_distribution.set(targets_number, 0);
3823 
3824  for(size_t i = 0; i < instances_number; i++)
3825  {
3826  if(instances.get_use(i) != Instances::Unused)
3827  {
3828  for(size_t j = 0; j < targets_number; j++)
3829  {
3830  if(data(i,targets_indices[j]) > 0.5)
3831  {
3832  class_distribution[j]++;
3833  }
3834  }
3835  }
3836  }
3837  }
3838 
3839  // Check data consistency
3840 
3841  const size_t used_instances_number = instances.count_used_instances_number();
3842 
3843  if(class_distribution.calculate_sum() != used_instances_number)
3844  {
3845  std::ostringstream buffer;
3846 
3847  buffer << "OpenNN Exception: DataSet class.\n"
3848  << "Vector<size_t> calculate_target_class_distribution(void) const method.\n"
3849  << "Sum of class distributions (" << class_distribution << ") is not equal to "
3850  << "number of used instances (" << used_instances_number << ")." << std::endl;
3851 
3852  throw std::logic_error(buffer.str());
3853  }
3854 
3855  return(class_distribution);
3856 }
3857 
3858 
3859 // Vector<double> calculate_distances(void) const method
3860 
3863 
3865 {
3866  const Matrix<double> data_statistics_matrix = calculate_data_statistics_matrix();
3867 
3868  const Vector<double> means = data_statistics_matrix.arrange_column(2);
3869  const Vector<double> standard_deviations = data_statistics_matrix.arrange_column(3);
3870 
3871  const size_t instances_number = instances.get_instances_number();
3872  Vector<double> distances(instances_number);
3873 
3874  const size_t variables_number = variables.get_variables_number();
3875  Vector<double> instance(variables_number);
3876 
3877  int i = 0;
3878 
3879  #pragma omp parallel for private(i, instance)
3880 
3881  for(i = 0; i < (int)instances_number; i++)
3882  {
3883  instance = data.arrange_row(i);
3884 
3885  distances[i] = (instance-means/standard_deviations).calculate_norm();
3886  }
3887 
3888  return(distances);
3889 }
3890 
3891 
3892 // void DataSet::balance_data(const double&) method
3893 
3895 
3896 void DataSet::balance_data(const double&)
3897 {
3898 /*
3899  const size_t instances_number = instances.count_used_instances_number();
3900 
3901  const Vector<double> distances = calculate_distances();
3902 
3903  // Sorted performance vector
3904 
3905  const Vector<size_t> rank = distances.calculate_greater_rank();
3906 
3907  // Perform linear ranking fitness assignment
3908  // Cannot do that without loop because of different types of fitness and rank vectors
3909 
3910  Vector<double> fitness(instances_number);
3911 
3912  const double selective_pressure = 1.5;
3913 
3914  for(size_t i = 0; i < instances_number; i++)
3915  {
3916  fitness[i] = selective_pressure*rank[i];
3917  }
3918 
3919  // Set selection vector to false
3920 
3921 // Vector<double> selection(instances_number, false);
3922 
3923  const size_t selection_target = instances_number/2;
3924 
3925  if(selection_target <= 0)
3926  {
3927  return;
3928  }
3929 
3930  // Cumulative fitness vector
3931 
3932  const Vector<double> cumulative_fitness = fitness.calculate_cumulative();
3933 
3934  const double fitness_sum = fitness.calculate_sum();
3935 
3936  // Select individuals until the desired number of selections is obtained
3937 
3938  size_t selection_count = 0;
3939 
3940  double pointer;
3941 
3942  while(selection_count != selection_target)
3943  {
3944  // Random number between 0 and total cumulative fitness
3945 
3946  pointer = calculate_random_uniform(0, fitness_sum);
3947 
3948  // Perform selection
3949 
3950  if(pointer < cumulative_fitness[0])
3951  {
3952  if(instances.is_used(0))
3953  {
3954  instances.set_use(0, "Unused");
3955  selection_count++;
3956  }
3957  }
3958  else
3959  {
3960  for(size_t i = 1; i < instances_number; i++)
3961  {
3962  if(pointer < cumulative_fitness[i] && pointer >= cumulative_fitness[i-1])
3963  {
3964  if(instances.is_used(i))
3965  {
3966  instances.set_use(0, "Unused");
3967  selection_count++;
3968  }
3969  }
3970  }
3971  }
3972  }
3973 
3974  // Control sentence (if debug)
3975 
3976  #ifndef NDEBUG
3977 
3978  if(instances.count_used_instances_number() != instances_number/2)
3979  {
3980  std::ostringstream buffer;
3981 
3982  buffer << "OpenNN Exception: DataSet class.\n"
3983  << "void perform_roulette_wheel_selection(void) method.\n"
3984  << "Selection count (" << instances.count_used_instances_number() << ") is not equal to half instances number (" << instances_number/2 << ").\n";
3985 
3986  throw std::logic_error(buffer.str());
3987  }
3988 
3989  #endif
3990 */
3991 }
3992 
3993 
3994 // void balance_target_class_distribution(void) method
3995 
3997 
3999 {
4000 /*
4001  const size_t instances_number = instances.count_used_instances_number();
4002 
4003  const size_t targets_number = variables.count_targets_number();
4004 
4005  const Vector<size_t> targets_indices = variables.arrange_targets_indices();
4006 
4007 // const size_t minimum = target_class_distribution.calculate_minimum();
4008 
4009 // const Matrix<double> target_data = arrange_target_data();
4010 
4011  Vector<size_t> target_class_distribution = calculate_target_class_distribution();
4012 
4013  size_t instance_index;
4014 
4015  Vector<double> instance;
4016 
4017  if(targets_number == 1) // Two classes
4018  {
4019  const size_t target_index = targets_indices[0];
4020 
4021  while(target_class_distribution[0] != target_class_distribution[1]);
4022  {
4023  instance_index = (size_t)calculate_random_uniform(0.0, instances_number);
4024 
4025  instance = get_instance(instance_index);
4026 
4027  if(instances.is_used(instance_index))
4028  {
4029  if(instance[target_index] == 0
4030  && target_class_distribution[0] > target_class_distribution[1])
4031  {
4032  instances.set_use(instance_index, Instances::Unused);
4033  }
4034  else if(instance[target_index] == 1
4035  && target_class_distribution[1] > target_class_distribution[0])
4036  {
4037  instances.set_use(instance_index, Instances::Unused);
4038  }
4039  }
4040  }
4041  }
4042 */
4043 }
4044 
4045 
4046 // bool has_data(void) const method
4047 
4050 
4051 bool DataSet::has_data(void) const
4052 {
4053  if(data.empty())
4054  {
4055  return(false);
4056  }
4057  else
4058  {
4059  return(true);
4060  }
4061 }
4062 
4063 
4064 // Vector<size_t> filter_data(const Vector<double>&, const Vector<double>&) method
4065 
4071 
// Body of the filter_data(const Vector<double>&, const Vector<double>&) method named in the
// comment above. Every used instance having a value below minimums[j] or above maximums[j] in
// some variable j is set as unused, and its index is appended to the vector that is returned.
// In debug builds std::logic_error is thrown when minimums or maximums do not have one element
// per variable.
// NOTE(review): the function header, one statement before the declaration of filtered_indices
// and the condition guarding the "continue" in the inner loop are not present in this listing -
// restore them from the upstream source before building.
4073 {
4074  const size_t variables_number = variables.get_variables_number();
4075 
4076  // Control sentence (if debug)
4077 
4078  #ifndef NDEBUG
4079 
4080  if(minimums.size() != variables_number)
4081  {
4082  std::ostringstream buffer;
4083 
4084  buffer << "OpenNN Exception: DataSet class.\n"
4085  << "Vector<size_t> filter_data(const Vector<double>&, const Vector<double>&) method.\n"
4086  << "Size of minimums (" << minimums.size() << ") is not equal to number of variables (" << variables_number << ").\n";
4087 
4088  throw std::logic_error(buffer.str());
4089  }
4090 
4091  if(maximums.size() != variables_number)
4092  {
4093  std::ostringstream buffer;
4094 
4095  buffer << "OpenNN Exception: DataSet class.\n"
4096  << "Vector<size_t> filter_data(const Vector<double>&, const Vector<double>&) method.\n"
4097  << "Size of maximums (" << maximums.size() << ") is not equal to number of variables (" << variables_number << ").\n";
4098 
4099  throw std::logic_error(buffer.str());
4100  }
4101 
4102  #endif
4103 
4105 
4106  Vector<size_t> filtered_indices;
4107 
4108  const size_t instances_number = instances.get_instances_number();
4109 
4110  for(size_t i = 0; i < instances_number; i++)
4111  {
4112  for(size_t j = 0; j < variables_number; j++)
4113  {
4115  {
4116  continue;
4117  }
4118 
4119  if(data(i,j) < minimums[j] || data(i,j) > maximums[j])
4120  {
// The is_used test keeps an instance from being reported more than once.
4121  if(instances.is_used(i))
4122  {
4123  filtered_indices.push_back(i);
4124 
4125  instances.set_use(i, Instances::Unused);
4126  }
4127  }
4128  }
4129  }
4130 
4131  return(filtered_indices);
4132 }
4133 
4134 
4135 // void convert_angular_variable_degrees(const size_t&) method
4136 
4141 
// Replaces the variable at variable_index by a pair of variables: the existing item is renamed
// with a "sin_" prefix and a copy renamed with a "cos_" prefix is inserted through
// insert_element(variable_index, ...). The data matrix is converted by
// data.convert_angular_variables_degrees(variable_index).
// In debug builds std::logic_error is thrown when variable_index is not less than the number of
// variables.
// NOTE(review): the statement declaring "items" is not present in this listing - presumably it
// obtains the items from the variables object; restore it from the upstream source.
4142 void DataSet::convert_angular_variable_degrees(const size_t& variable_index)
4143 {
4144  // Control sentence (if debug)
4145 
4146  #ifndef NDEBUG
4147 
4148  const size_t variables_number = variables.get_variables_number();
4149 
4150  if(variable_index >= variables_number)
4151  {
4152  std::ostringstream buffer;
4153 
4154  buffer << "OpenNN Exception: DataSet class.\n"
4155  << "void convert_angular_variable_degrees(const size_t&) method.\n"
4156  << "Index of variable (" << variable_index << ") must be less than number of variables (" << variables_number << ").\n";
4157 
4158  throw std::logic_error(buffer.str());
4159  }
4160 
4161  #endif
4162 
4164 
// Both new items start as copies of the original one; only the name differs.
4165  Variables::Item sin_item = items[variable_index];
4166  prepend("sin_", sin_item.name);
4167 
4168  Variables::Item cos_item = items[variable_index];
4169  prepend("cos_", cos_item.name);
4170 
4171  items[variable_index] = sin_item;
4172  items = items.insert_element(variable_index, cos_item);
4173 
4174  variables.set_items(items);
4175 
4176  data.convert_angular_variables_degrees(variable_index);
4177 
4178 }
4179 
4180 
4181 // void convert_angular_variable_radians(const size_t&) method
4182 
4187 
// Replaces the variable at variable_index by a pair of variables: the existing item is renamed
// with a "sin_" prefix and a copy renamed with a "cos_" prefix is inserted through
// insert_element(variable_index, ...). The data matrix is converted by
// data.convert_angular_variables_radians(variable_index).
// In debug builds std::logic_error is thrown when variable_index is not less than the number of
// variables.
// NOTE(review): the statement declaring "items" is not present in this listing - presumably it
// obtains the items from the variables object; restore it from the upstream source.
4188 void DataSet::convert_angular_variable_radians(const size_t& variable_index)
4189 {
4190  // Control sentence (if debug)
4191 
4192  #ifndef NDEBUG
4193 
4194  const size_t variables_number = variables.get_variables_number();
4195 
4196  if(variable_index >= variables_number)
4197  {
4198  std::ostringstream buffer;
4199 
4200  buffer << "OpenNN Exception: DataSet class.\n"
4201  << "void convert_angular_variable_radians(const size_t&) method.\n"
4202  << "Index of variable (" << variable_index << ") must be less than number of variables (" << variables_number << ").\n";
4203 
4204  throw std::logic_error(buffer.str());
4205  }
4206 
4207  #endif
4208 
4210 
// Both new items start as copies of the original one; only the name differs.
4211  Variables::Item sin_item = items[variable_index];
4212  prepend("sin_", sin_item.name);
4213 
4214  Variables::Item cos_item = items[variable_index];
4215  prepend("cos_", cos_item.name);
4216 
4217  items[variable_index] = sin_item;
4218  items = items.insert_element(variable_index, cos_item);
4219 
4220  variables.set_items(items);
4221 
4222  data.convert_angular_variables_radians(variable_index);
4223 
4224 }
4225 
4226 
4227 // void convert_angular_variables_degrees(const Vector<size_t>&)
4228 
4233 
// Body of the convert_angular_variables_degrees(const Vector<size_t>&) method named in the
// comment above. In debug builds std::logic_error is thrown when an element of indices is not
// less than the number of variables; the message prints the position i within indices, not the
// offending value.
// NOTE(review): the function header and the statement that uses "index" inside the final loop
// are not present in this listing - presumably the loop calls the single variable conversion;
// restore both from the upstream source.
4235 {
4236  // Control sentence (if debug)
4237 
4238  #ifndef NDEBUG
4239 
4240  const size_t variables_number = variables.get_variables_number();
4241 
4242  for(size_t i = 0; i < indices.size(); i++)
4243  {
4244  if(indices[i] >= variables_number)
4245  {
4246  std::ostringstream buffer;
4247 
4248  buffer << "OpenNN Exception: DataSet class.\n"
4249  << "void convert_angular_variables_degrees(const Vector<size_t>&) method.\n"
4250  << "Index (" << i << ") must be less than number of variables (" << variables_number << ").\n";
4251 
4252  throw std::logic_error(buffer.str());
4253  }
4254  }
4255 
4256  #endif
4257 
4258  size_t size = indices.size();
4259 
4260  unsigned count = 0;
4261 
4262  size_t index;
4263 
4264  for(size_t i = 0; i < size; i++)
4265  {
// count grows by one per iteration and is added to the requested index - presumably to
// compensate for the variable inserted by each earlier conversion, which would require
// indices to be in ascending order (TODO confirm).
4266  index = indices[i]+count;
4267 
4269 
4270  count++;
4271  }
4272 }
4273 
4274 
4275 // void convert_angular_variables_radians(const Vector<size_t>&)
4276 
4281 
// Body of the convert_angular_variables_radians(const Vector<size_t>&) method named in the
// comment above. In debug builds std::logic_error is thrown when an element of indices is not
// less than the number of variables; the message prints the position i within indices, not the
// offending value.
// NOTE(review): the function header and the statement that uses "index" inside the final loop
// are not present in this listing - presumably the loop calls the single variable conversion;
// restore both from the upstream source.
4283 {
4284  // Control sentence (if debug)
4285 
4286  #ifndef NDEBUG
4287 
4288  const size_t variables_number = variables.get_variables_number();
4289 
4290  for(size_t i = 0; i < indices.size(); i++)
4291  {
4292  if(indices[i] >= variables_number)
4293  {
4294  std::ostringstream buffer;
4295 
4296  buffer << "OpenNN Exception: DataSet class.\n"
4297  << "void convert_angular_variables_radians(const Vector<size_t>&) method.\n"
4298  << "Index (" << i << ") must be less than number of variables (" << variables_number << ").\n";
4299 
4300  throw std::logic_error(buffer.str());
4301  }
4302  }
4303 
4304  #endif
4305 
4306  size_t size = indices.size();
4307 
4308  unsigned count = 0;
4309 
4310  size_t index;
4311 
4312  for(size_t i = 0; i < size; i++)
4313  {
// count grows by one per iteration and is added to the requested index - presumably to
// compensate for the variable inserted by each earlier conversion, which would require
// indices to be in ascending order (TODO confirm).
4314  index = indices[i]+count;
4315 
4317 
4318  count++;
4319  }
4320 }
4321 
4322 
4323 // void convert_angular_variables(void) method
4324 
4328 
// Body of the convert_angular_variables(void) method named in the comment above.
// It dispatches on angular_units (Radians or Degrees) and throws std::logic_error for any other
// value.
// NOTE(review): the function header and the statement inside each of the two case blocks are not
// present in this listing - presumably each case calls the conversion for its unit; restore them
// from the upstream source.
4330 {
4331  switch(angular_units)
4332  {
4333  case DataSet::Radians:
4334  {
4336  }
4337  break;
4338 
4339  case DataSet::Degrees:
4340  {
4342  }
4343  break;
4344 
4345  default:
4346  {
4347  std::ostringstream buffer;
4348 
4349  buffer << "OpenNN Exception: DataSet class.\n"
4350  << "void convert_angular_variables(void) method.\n"
4351  << "Unknown angular units.\n";
4352 
4353  throw std::logic_error(buffer.str());
4354  }
4355  break;
4356  }
4357 
4358 }
4359 
4360 
4361 // void scrub_missing_values_unuse(void) method
4362 
4364 
4366 {
4367  const Vector<size_t> missing_instances = missing_values.arrange_missing_instances();
4368 
4369  for(size_t i = 0; i < missing_instances.size(); i++)
4370  {
4371  instances.set_use(missing_instances[i], Instances::Unused);
4372  }
4373 }
4374 
4375 
4376 // void scrub_missing_values_mean(void) method
4377 
4379 
// Body of the scrub_missing_values_mean(void) method named in the comment above.
// For every variable i, each instance listed in missing_indices[i] gets the value means[i],
// where means is the result of data.calculate_mean_missing_values(missing_indices).
// NOTE(review): the function header and the declaration of "missing_indices" are not present in
// this listing - restore them from the upstream source.
4381 {
4383 
4384  Vector<double> means = data.calculate_mean_missing_values(missing_indices);
4385 
4386  const size_t variables_number = variables.get_variables_number();
4387 
4388  size_t instance_index;
4389 
4390  for(size_t i = 0; i < variables_number; i++)
4391  {
4392  for(size_t j = 0; j < missing_indices[i].size(); j++)
4393  {
4394  instance_index = missing_indices[i][j];
4395  data(instance_index, i) = means[i];
4396  }
4397  }
4398 }
4399 
4400 
4401 // void scrub_missing_values(void) method
4402 
4406 
// Body of the scrub_missing_values(void) method named in the comment above.
// It dispatches on scrubbing_method (MissingValues::Unuse or MissingValues::Mean) and throws
// std::logic_error for any other value.
// NOTE(review): the function header, the declaration of "scrubbing_method" and the statement
// inside each of the two case blocks are not present in this listing - presumably each case calls
// the scrub method for its option; restore them from the upstream source.
4408 {
4410 
4411  switch(scrubbing_method)
4412  {
4413  case MissingValues::Unuse:
4414  {
4416  }
4417  break;
4418 
4419  case MissingValues::Mean:
4420  {
4422  }
4423  break;
4424 
4425  default:
4426  {
4427  std::ostringstream buffer;
4428 
4429  buffer << "OpenNN Exception: DataSet class\n"
4430  << "void scrub_missing_values(void) method.\n"
4431  << "Unknown scrubbing method.\n";
4432 
4433  throw std::logic_error(buffer.str());
4434  }
4435  break;
4436  }
4437 }
4438 
4439 
4440 // size_t count_tokens(std::string& str) const method
4441 
4445 
4446 size_t DataSet::count_tokens(std::string& str) const
4447 {
4448 // if(!(this->find(separator) != std::string::npos))
4449 // {
4450 // std::ostringstream buffer;
4451 //
4452 // buffer << "OpenNN Exception:\n"
4453 // << "std::string class.\n"
4454 // << "inline size_t count_tokens(const std::string&) const method.\n"
4455 // << "Separator not found in string: \"" << separator << "\".\n";
4456 //
4457 // throw std::logic_error(buffer.str());
4458 // }
4459 
4460  trim(str);
4461 
4462  size_t tokens_count = 0;
4463 
4464  // Skip delimiters at beginning.
4465 
4466  const std::string separator_string = get_separator_string();
4467 
4468  std::string::size_type last_pos = str.find_first_not_of(separator_string, 0);
4469 
4470  // Find first "non-delimiter".
4471 
4472  std::string::size_type pos = str.find_first_of(separator_string, last_pos);
4473 
4474  while (std::string::npos != pos || std::string::npos != last_pos)
4475  {
4476  // Found a token, add it to the vector
4477 
4478  tokens_count++;
4479 
4480  // Skip delimiters. Note the "not_of"
4481 
4482  last_pos = str.find_first_not_of(separator_string, pos);
4483 
4484  // Find next "non-delimiter"
4485 
4486  pos = str.find_first_of(separator_string, last_pos);
4487  }
4488 
4489  return(tokens_count);
4490 }
4491 
4492 
4496 
4497 Vector<std::string> DataSet::get_tokens(const std::string& str) const
4498 {
4499  const std::string new_string = get_trimmed(str);
4500 
4501  Vector<std::string> tokens;
4502 
4503  const std::string separator_string = get_separator_string();
4504 
4505  // Skip delimiters at beginning.
4506 
4507  std::string::size_type lastPos = new_string.find_first_not_of(separator_string, 0);
4508 
4509  // Find first "non-delimiter"
4510 
4511  std::string::size_type pos = new_string.find_first_of(separator_string, lastPos);
4512 
4513  while(std::string::npos != pos || std::string::npos != lastPos)
4514  {
4515  // Found a token, add it to the vector
4516 
4517  tokens.push_back(new_string.substr(lastPos, pos - lastPos));
4518 
4519  // Skip delimiters. Note the "not_of"
4520 
4521  lastPos = new_string.find_first_not_of(separator_string, pos);
4522 
4523  // Find next "non-delimiter"
4524 
4525  pos = new_string.find_first_of(separator_string, lastPos);
4526  }
4527 
4528  for(size_t i = 0; i < tokens.size(); i++)
4529  {
4530  trim(tokens[i]);
4531  }
4532 
4533  return(tokens);
4534 }
4535 
4536 
4537 // bool is_numeric(const std::string&) const method
4538 
4541 
4542 bool DataSet::is_numeric(const std::string& str) const
4543 {
4544  std::istringstream iss(str.data());
4545 
4546  double dTestSink;
4547 
4548  iss >> dTestSink;
4549 
4550  // was any input successfully consumed/converted?
4551 
4552  if(!iss)
4553  {
4554  return false;
4555  }
4556 
4557  // was all the input successfully consumed/converted?
4558 
4559  return(iss.rdbuf()->in_avail() == 0);
4560 }
4561 
4562 
4563 // void DataSet::trim(std::string&) const method
4564 
4568 
4569 void DataSet::trim(std::string& str) const
4570 {
4571  //prefixing spaces
4572 
4573  str.erase(0, str.find_first_not_of(' '));
4574 
4575  //surfixing spaces
4576 
4577  str.erase(str.find_last_not_of(' ') + 1);
4578 }
4579 
4580 
4584 
4585 std::string DataSet::get_trimmed(const std::string& str) const
4586 {
4587  std::string output(str);
4588 
4589  //prefixing spaces
4590 
4591  output.erase(0, output.find_first_not_of(' '));
4592 
4593  //surfixing spaces
4594 
4595  output.erase(output.find_last_not_of(' ') + 1);
4596 
4597  return(output);
4598 }
4599 
4600 
4601 // std::string prepend(const std::string&, const std::string&) const method
4602 
4606 
4607 std::string DataSet::prepend(const std::string& pre, const std::string& str) const
4608 {
4609  std::ostringstream buffer;
4610 
4611  buffer << pre << str;
4612 
4613  return(buffer.str());
4614 }
4615 
4616 
4617 // bool is_numeric(const Vector<std::string>&) const
4618 
4621 
4623 {
4624  for(size_t i = 0; i < v.size(); i++)
4625  {
4626  if(!is_numeric(v[i]))
4627  {
4628  return false;
4629  }
4630  }
4631 
4632  return true;
4633 }
4634 
4635 
4636 // bool is_not_numeric(const Vector<std::string>&) const
4637 
4640 
4642 {
4643  for(size_t i = 0; i < v.size(); i++)
4644  {
4645  if(is_numeric(v[i]))
4646  {
4647  return false;
4648  }
4649  }
4650 
4651  return true;
4652 }
4653 
4654 
4655 // bool is_mixed(const Vector<std::string>&) const
4656 
4659 
4661 {
4662  unsigned count_numeric = 0;
4663  unsigned count_not_numeric = 0;
4664 
4665  for(size_t i = 0; i < v.size(); i++)
4666  {
4667  if(is_numeric(v[i]))
4668  {
4669  count_numeric++;
4670  }
4671  else
4672  {
4673  count_not_numeric++;
4674  }
4675  }
4676 
4677  if(count_numeric > 0 && count_not_numeric > 0)
4678  {
4679  return true;
4680  }
4681  else
4682  {
4683  return false;
4684  }
4685 }
4686 
4687 }
4688 
4689 // OpenNN: Open Neural Networks Library.
4690 // Copyright (c) 2005-2015 Roberto Lopez.
4691 //
4692 // This library is free software; you can redistribute it and/or
4693 // modify it under the terms of the GNU Lesser General Public
4694 // License as published by the Free Software Foundation; either
4695 // version 2.1 of the License, or any later version.
4696 //
4697 // This library is distributed in the hope that it will be useful,
4698 // but WITHOUT ANY WARRANTY; without even the implied warranty of
4699 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4700 // Lesser General Public License for more details.
4701 
4702 // You should have received a copy of the GNU Lesser General Public
4703 // License along with this library; if not, write to the Free Software
4704 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
Vector< Statistics< T > > calculate_rows_statistics_missing_values(const Vector< size_t > &, const Vector< Vector< size_t > > &) const
Definition: matrix.h:3808
void unscale_inputs_minimum_maximum(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2404
void from_XML(const tinyxml2::XMLDocument &)
Definition: data_set.cpp:2601
const Use & get_use(const size_t &) const
Definition: instances.cpp:286
void randomize_data_normal(const double &mean=0.0, const double &standard_deviation=1.0)
Definition: data_set.cpp:2469
Vector< T > arrange_column(const size_t &) const
Definition: matrix.h:1580
void convert_time_series(const size_t &)
Separator
Enumeration of available separators for the data file.
Definition: data_set.h:99
void save(const std::string &) const
Definition: data_set.cpp:2845
Variables * get_variables_pointer(void)
Returns a pointer to the variables object composing this data set object.
Definition: data_set.cpp:212
Vector< Vector< std::string > > set_from_data_file(void)
Definition: data_set.cpp:3344
void scale_columns_mean_standard_deviation(const Vector< Statistics< T > > &, const Vector< size_t > &)
Definition: matrix.h:4143
Vector< Histogram< T > > calculate_histograms_missing_values(const Vector< Vector< size_t > > &, const size_t &=10) const
Definition: matrix.h:3924
void unscale_data_minimum_maximum(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2376
bool has_data(void) const
Definition: data_set.cpp:4051
Vector< Statistics< T > > calculate_statistics_missing_values(const Vector< Vector< size_t > > &) const
Definition: matrix.h:3694
Matrix< double > arrange_training_input_data(void) const
Definition: data_set.cpp:589
void set_data(const Matrix< double > &)
Definition: data_set.cpp:1025
tinyxml2::XMLDocument * to_XML(void) const
Definition: instances.cpp:851
ScrubbingMethod get_scrubbing_method(void) const
Returns the method to be used for dealing with the missing values.
Matrix< double > calculate_linear_correlations(void) const
Definition: data_set.cpp:1769
void scrub_missing_values_mean(void)
Substitutes all the missing values by the mean of the corresponding variable.
Definition: data_set.cpp:4380
void add_instance(const Vector< double > &)
Definition: data_set.cpp:1307
void set(void)
Sets zero instances and zero variables in the data set.
Definition: data_set.cpp:813
ScrubbingMethod
Enumeration of available methods for dealing with the missing values.
Vector< Statistics< double > > calculate_targets_statistics(void) const
Definition: data_set.cpp:1705
const Separator & get_separator(void) const
Returns the separator to be used in the data file.
Definition: data_set.cpp:315
void set_default(void)
Definition: data_set.cpp:999
void unscale_data_mean_standard_deviation(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2363
void convert_angular_variable_degrees(const size_t &)
Definition: data_set.cpp:4142
std::string write_separator(void) const
Returns the string which will be used as separator in the data file.
Definition: data_set.cpp:372
const Variables & get_variables(void) const
Returns a constant reference to the variables object composing this data set object.
Definition: data_set.cpp:202
Vector< size_t > angular_variables
Indices of angular variables.
Definition: data_set.h:416
Vector< size_t > arrange_missing_instances(void) const
Returns a vector with the indices of those instances with missing values.
void set_data_file_name(const std::string &)
Definition: data_set.cpp:1078
void read_instance(const std::string &, const Vector< Vector< std::string > > &, const size_t &)
Definition: data_set.cpp:3210
std::string data_file_name
Data file name.
Definition: data_set.h:392
size_t get_column_index(const Vector< Vector< std::string > > &, const size_t) const
Definition: data_set.cpp:2992
std::string to_string(void) const
Returns a string representation of the current instances object.
Definition: instances.cpp:1329
Matrix< double > arrange_target_data(void) const
Definition: data_set.cpp:572
bool is_constant(const double &=0.0) const
Definition: vector.h:1028
void set(void)
Sets an instances object with zero instances.
Definition: instances.cpp:581
const std::string & get_missing_values_label(void) const
Returns the string which will be used as label for the missing values in the data file...
Definition: data_set.cpp:419
void append_column(const Vector< T > &)
Definition: matrix.h:2123
Vector< size_t > unuse_repeated_instances(void)
Definition: data_set.cpp:1497
void set_instances_number(const size_t &)
Definition: instances.cpp:838
void convert_autoassociation(void)
void unscale_inputs_mean_standard_deviation(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2389
bool operator==(const DataSet &) const
Definition: data_set.cpp:179
void unscale_columns_mean_standard_deviation(const Vector< Statistics< T > > &, const Vector< size_t > &)
Definition: matrix.h:4446
bool is_not_numeric(const Vector< std::string > &) const
Definition: data_set.cpp:4641
AngularUnits angular_units
Units of angular variables.
Definition: data_set.h:420
Vector< double > calculate_mean_missing_values(const Vector< Vector< size_t > > &) const
Definition: matrix.h:3096
void balance_data(const double &)
Definition: data_set.cpp:3896
const Vector< size_t > & get_angular_variables(void) const
Definition: data_set.cpp:452
void set(void)
Sets the size of a vector to zero.
Definition: vector.h:656
void append_variable(const Vector< double > &)
Definition: data_set.cpp:1377
size_t count_data_file_columns_number(void) const
Definition: data_set.cpp:3045
void initialize_data(const double &)
Definition: data_set.cpp:2447
void set_missing_values_label(const std::string &)
Definition: data_set.cpp:1146
tinyxml2::XMLDocument * to_XML(void) const
Matrix< T > arrange_submatrix(const Vector< size_t > &, const Vector< size_t > &) const
Definition: matrix.h:1417
Matrix< double > arrange_testing_target_data(void) const
Definition: data_set.cpp:669
Matrix< double > arrange_input_data(void) const
Definition: data_set.cpp:555
void print_data(void) const
Prints to the screen the values of the data matrix.
Definition: data_set.cpp:2903
void print_summary(void) const
Prints to the screen in text format the main numbers from the data set object.
Definition: data_set.cpp:2824
std::string get_separator_string(void) const
Returns the string which will be used as separator in the data file.
Definition: data_set.cpp:325
ScalingUnscalingMethod
Enumeration of available methods for scaling and unscaling the data.
Definition: data_set.h:103
void set_angular_variables(const Vector< size_t > &)
Definition: data_set.cpp:1201
void convert_angular_variables_radians(const size_t &)
Definition: matrix.h:6274
void check_separator(const std::string &) const
Definition: data_set.cpp:3018
bool is_numeric(const std::string &) const
Definition: data_set.cpp:4542
void convert_time_series(const size_t &)
Definition: variables.cpp:1547
void print_data_preview(void) const
Definition: data_set.cpp:2917
const MissingValues & get_missing_values(void) const
Returns a reference to the missing values object in the data set.
Definition: data_set.cpp:275
Vector< double > calculate_training_target_data_mean(void) const
Returns the mean values of the target variables on the training instances.
Definition: data_set.cpp:1719
bool empty(void) const
Returns true if the data matrix is empty, and false otherwise.
Definition: data_set.cpp:253
void scrub_missing_values_unuse(void)
Sets all the instances with missing values to "Unused".
Definition: data_set.cpp:4365
Vector< Statistics< double > > scale_inputs_mean_standard_deviation(void)
Definition: data_set.cpp:2036
void scale_columns_minimum_maximum(const Vector< Statistics< T > > &, const Vector< size_t > &)
Definition: matrix.h:4314
Vector< Statistics< double > > scale_targets(const std::string &)
Definition: data_set.cpp:2289
Vector< Histogram< double > > calculate_data_histograms(const size_t &=10) const
Definition: data_set.cpp:1558
MissingValues * get_missing_values_pointer(void)
Returns a pointer to the missing values object in the data set.
Definition: data_set.cpp:285
std::string prepend(const std::string &, const std::string &) const
Definition: data_set.cpp:4607
void convert_angular_variables(void)
Definition: data_set.cpp:4329
tinyxml2::XMLDocument * to_XML(void) const
Serializes the data set object into a XML document of the TinyXML library.
Definition: data_set.cpp:2479
const bool & get_header_line(void) const
Returns true if the first line of the data file has a header with the names of the variables...
Definition: data_set.cpp:305
Vector< size_t > arrange_testing_indices(void) const
Returns the indices of the instances which will be used for testing.
Definition: instances.cpp:543
void print(void) const
Prints to the screen in text format the members of the data set object.
Definition: data_set.cpp:2811
std::string to_string(void) const
Returns a string representation of the current data set object.
Definition: data_set.cpp:2788
void convert_angular_variables_degrees(const size_t &)
Definition: matrix.h:6220
void set_display(const bool &)
Definition: data_set.cpp:986
Separator separator
Separator character.
Definition: data_set.h:400
void scale_data(const std::string &, const Vector< Statistics< double > > &)
Definition: data_set.cpp:1944
const std::string & get_data_file_name(void) const
Returns the name of the data file.
Definition: data_set.cpp:295
Vector< std::string > arrange_autoassociation_names(const Vector< std::string > &) const
Definition: data_set.cpp:3594
const size_t & get_columns_number(void) const
Returns the number of columns in the matrix.
Definition: matrix.h:1090
Vector< Statistics< double > > scale_data_mean_standard_deviation(void)
Definition: data_set.cpp:1879
Vector< size_t > arrange_targets_indices(void) const
Returns the indices of the target variables.
Definition: variables.cpp:519
Vector< Statistics< double > > scale_inputs(const std::string &)
Definition: data_set.cpp:2118
Vector< size_t > arrange_training_indices(void) const
Returns the indices of the instances which will be used for training.
Definition: instances.cpp:489
bool autoassociation
Autoassociation flag.
Definition: data_set.h:412
void convert_autoassociation(void)
Definition: matrix.h:6205
Vector< double > get_instance(const size_t &) const
Definition: data_set.cpp:684
void read_from_data_file(const Vector< Vector< std::string > > &)
Performs a second data file read in which the data is set.
Definition: data_set.cpp:3497
void unscale_targets_mean_standard_deviation(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2419
DataSet & operator=(const DataSet &)
Definition: data_set.cpp:143
void from_XML(const tinyxml2::XMLDocument &)
Definition: instances.cpp:916
void append(const size_t &, const size_t &)
void set_header_line(const bool &)
Sets if the data file contains a header with the names of the variables.
Definition: data_set.cpp:1088
void set_autoassociation(const bool &)
Definition: data_set.cpp:1190
void convert_time_series(const size_t &)
Definition: instances.cpp:1304
MissingValues missing_values
Missing values object.
Definition: data_set.h:438
void subtract_column(const size_t &)
Definition: matrix.h:2339
Vector< Statistics< T > > calculate_statistics(void) const
Definition: matrix.h:3652
void set_variables_number(const size_t &)
Definition: variables.cpp:1616
void balance_target_class_distribution(void)
Definition: data_set.cpp:3998
size_t count_tokens(std::string &) const
Definition: data_set.cpp:4446
const AngularUnits & get_angular_units(void) const
Returns the units used for the angular variables (Radians or Degrees).
Definition: data_set.cpp:462
Instances * get_instances_pointer(void)
Returns a pointer to the instances object composing this data set object.
Definition: data_set.cpp:232
Matrix< double > arrange_generalization_data(void) const
Definition: data_set.cpp:520
void unscale_targets_minimum_maximum(const Vector< Statistics< double > > &)
Definition: data_set.cpp:2434
Vector< std::string > get_tokens(const std::string &) const
Definition: data_set.cpp:4497
size_t lags_number
Number of lags.
Definition: data_set.h:408
void unscale_mean_standard_deviation(const Vector< Statistics< T > > &)
Definition: matrix.h:4368
const bool & get_display(void) const
Definition: data_set.cpp:243
bool header_line
Header which contains variables name.
Definition: data_set.h:396
Vector< size_t > unuse_constant_variables(void)
Definition: data_set.cpp:1456
void set_items(const Vector< Item > &)
Definition: variables.cpp:1011
void set(void)
Sets a missing values object with zero instances, variables and missing values.
const Matrix< double > & get_data(void) const
Definition: data_set.cpp:265
void load(const std::string &)
Definition: data_set.cpp:2880
void convert_time_series(void)
Definition: data_set.cpp:3607
bool is_missing_value(const size_t &, const size_t &) const
Vector< Statistics< double > > calculate_data_statistics(void) const
Definition: data_set.cpp:1577
void randomize_data_uniform(const double &minimum=-1.0, const double &maximum=1.0)
Definition: data_set.cpp:2458
void initialize(const T &)
Definition: matrix.h:2510
Vector< double > calculate_distances(void) const
Definition: data_set.cpp:3864
Vector< double > calculate_testing_target_data_mean(void) const
Returns the mean values of the target variables on the testing instances.
Definition: data_set.cpp:1751
Vector< Statistics< double > > scale_data_minimum_maximum(void)
Definition: data_set.cpp:1863
Matrix< double > arrange_training_data(void) const
Definition: data_set.cpp:502
bool is_mixed(const Vector< std::string > &) const
Definition: data_set.cpp:4660
void convert_time_series(const size_t &)
Definition: matrix.h:6173
Matrix< double > data
Definition: data_set.h:426
tinyxml2::XMLDocument * to_XML(void) const
Definition: variables.cpp:1664
void trim(std::string &) const
Definition: data_set.cpp:4569
void convert_autoassociation(void)
Definition: data_set.cpp:3629
Matrix< double > arrange_testing_data(void) const
Definition: data_set.cpp:538
void set_instances_number(const size_t &)
Definition: data_set.cpp:1225
void set(void)
Sets a variables object with zero variables.
Definition: variables.cpp:930
std::string to_string(void) const
Returns a string representation of the current MissingValues object.
void scale_minimum_maximum(const Vector< Statistics< T > > &)
Definition: matrix.h:4197
Matrix< double > calculate_data_statistics_matrix(void) const
Definition: data_set.cpp:1591
void convert_angular_variables_radians(const Vector< size_t > &)
Definition: data_set.cpp:4282
Variables variables
Variables object (inputs and target variables).
Definition: data_set.h:430
void set(void)
This method set the numbers of rows and columns of the matrix to zero.
Definition: matrix.h:1101
const size_t & get_rows_number(void) const
Returns the number of rows in the matrix.
Definition: matrix.h:1079
std::string missing_values_label
Missing values label.
Definition: data_set.h:404
const size_t & get_lags_number(void) const
Returns the number of lags to be used in a time series prediction application.
Definition: data_set.cpp:429
Vector< Statistics< double > > calculate_testing_instances_statistics(void) const
Definition: data_set.cpp:1663
void from_XML(const tinyxml2::XMLDocument &)
Definition: variables.cpp:1754
Vector< Statistics< double > > calculate_inputs_statistics(void) const
Definition: data_set.cpp:1684
void check_header_line(void)
Definition: data_set.cpp:3088
AngularUnits
Enumeration of the units used for angular variables.
Definition: data_set.h:107
std::string get_trimmed(const std::string &) const
Definition: data_set.cpp:4585
void set_lags_number(const size_t &)
Definition: data_set.cpp:1176
Vector< Statistics< double > > scale_targets_mean_standard_deviation(void)
Definition: data_set.cpp:2207
Vector< Statistics< T > > calculate_columns_statistics_missing_values(const Vector< size_t > &, const Vector< Vector< size_t > >) const
Definition: matrix.h:3866
Vector< double > get_variable(const size_t &) const
Definition: data_set.cpp:749
T calculate_linear_correlation(const Vector< T > &) const
Definition: vector.h:2675
Matrix< double > arrange_testing_input_data(void) const
Definition: data_set.cpp:653
void subtract_variable(const size_t &)
Definition: data_set.cpp:1418
Vector< Statistics< double > > scale_inputs_minimum_maximum(void)
Definition: data_set.cpp:2084
void randomize_uniform(const double &=-1.0, const double &=1.0)
Definition: matrix.h:2524
size_t count_inputs_number(void) const
Returns the number of input variables of the data set.
Definition: variables.cpp:249
void append_row(const Vector< T > &)
Definition: matrix.h:2080
void from_XML(const tinyxml2::XMLDocument &)
Matrix< double > arrange_training_target_data(void) const
Definition: data_set.cpp:605
Vector< Statistics< double > > calculate_generalization_instances_statistics(void) const
Definition: data_set.cpp:1642
bool display
Display messages to screen.
Definition: data_set.h:442
void set_use(const size_t &, const Use &)
Definition: instances.cpp:731
Vector< Statistics< double > > calculate_training_instances_statistics(void) const
Definition: data_set.cpp:1621
void convert_angular_variable_radians(const size_t &)
Definition: data_set.cpp:4188
Vector< std::string > read_header_line(void) const
Returns the name of the columns in the data set as a list of strings.
Definition: data_set.cpp:3164
Matrix< double > get_generalization_target_data(void) const
Definition: data_set.cpp:637
void convert_autoassociation(void)
Definition: variables.cpp:1589
Vector< T > insert_element(const size_t &, const T &) const
Definition: vector.h:4967
Vector< double > calculate_generalization_target_data_mean(void) const
Returns the mean values of the target variables on the generalization instances.
Definition: data_set.cpp:1735
Vector< std::string > arrange_names(void) const
Returns the names of all the variables in the data set.
Definition: variables.cpp:545
void set_use(const size_t &, const Use &)
Definition: variables.cpp:1078
void set_angular_units(AngularUnits &)
Sets the units of the angular variables (Radians or Degrees).
Definition: data_set.cpp:1211
size_t count_targets_number(void) const
Returns the number of target variables of the data set.
Definition: variables.cpp:271
Matrix< double > get_generalization_input_data(void) const
Definition: data_set.cpp:621
Vector< std::string > arrange_time_series_names(const Vector< std::string > &) const
Definition: data_set.cpp:3563
virtual ~DataSet(void)
Destructor.
Definition: data_set.cpp:132
void set_instance(const size_t &, const Vector< double > &)
Definition: data_set.cpp:1258
size_t get_missing_values_number(void) const
Returns the number of missing values in the data set.
void set_separator(const Separator &)
Definition: data_set.cpp:1099
void load_data(void)
This method loads the data file.
Definition: data_set.cpp:3643
const bool & get_autoassociation(void) const
Definition: data_set.cpp:440
void scrub_missing_values(void)
Definition: data_set.cpp:4407
bool is_used(const size_t &) const
Definition: instances.cpp:333
std::string to_string(void) const
Returns a string representation of the current variables object.
Definition: variables.cpp:1627
void unscale_columns_minimum_maximum(const Vector< Statistics< T > > &, const Vector< size_t > &)
Definition: matrix.h:4577
Vector< Statistics< double > > scale_targets_minimum_maximum(void)
Definition: data_set.cpp:2272
bool empty(void) const
Returns true if number of rows and columns is zero.
Definition: matrix.h:5908
void set_names(const Vector< std::string > &)
Definition: variables.cpp:1221
const Vector< Item > & get_items(void) const
Returns the vector of Item structures in the variables object.
Definition: variables.cpp:156
bool is_used(const size_t &) const
Definition: variables.cpp:431
void subtract_instance(const size_t &)
Definition: data_set.cpp:1344
Vector< size_t > filter_data(const Vector< double > &, const Vector< double > &)
Definition: data_set.cpp:4072
size_t count_used_instances_number(void) const
Definition: instances.cpp:374
T calculate_sum(void) const
Returns the sum of the elements in the vector.
Definition: vector.h:2005
void set_row(const size_t &, const Vector< T > &)
Definition: matrix.h:1691
void unscale_minimum_maximum(const Vector< Statistics< T > > &)
Definition: matrix.h:4493
void convert_angular_variables_degrees(const Vector< size_t > &)
Definition: data_set.cpp:4234
void scale_mean_standard_deviation(const Vector< Statistics< T > > &)
Definition: matrix.h:4028
void set_variables_number(const size_t &)
Definition: data_set.cpp:1242
size_t get_instances_number(void) const
Returns the number of instances in the data set.
Definition: instances.h:134
size_t get_variables_number(void) const
Returns the total number of variables in the data set.
Definition: variables.h:126
Vector< Vector< size_t > > arrange_missing_indices(void) const
Vector< size_t > arrange_generalization_indices(void) const
Returns the indices of the instances which will be used for generalization.
Definition: instances.cpp:516
Vector< size_t > calculate_target_class_distribution(void) const
Definition: data_set.cpp:3789
Vector< T > arrange_row(const size_t &) const
Definition: matrix.h:1505
Instances instances
Instances object (training, generalization and testing instances).
Definition: data_set.h:434
void subtract_row(const size_t &)
Definition: matrix.h:2283
void randomize_normal(const double &=0.0, const double &=1.0)
Definition: matrix.h:2593
static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string &)
Definition: data_set.cpp:473
std::string name
Name of a variable.
Definition: variables.h:103
Vector< size_t > arrange_inputs_indices(void) const
Returns the indices of the input variables.
Definition: variables.cpp:493
const Instances & get_instances(void) const
Returns a constant reference to the instances object composing this data set object.
Definition: data_set.cpp:222
void save_data(void) const
Saves to the data file the values of the data matrix.
Definition: data_set.cpp:2954