58 DataSet::DataSet(
const size_t& new_variables_number,
const size_t& new_instances_number)
60 set(new_variables_number, new_instances_number);
75 DataSet::DataSet(
const size_t& new_inputs_number,
const size_t& new_targets_number,
const size_t& new_instances_number)
77 set(new_inputs_number, new_targets_number, new_instances_number);
145 if(
this != &other_data_set)
355 std::ostringstream buffer;
357 buffer <<
"OpenNN Exception: DataSet class.\n"
358 <<
"std::string get_separator_string(void) const method.\n"
359 <<
"Unknown separator.\n";
361 throw std::logic_error(buffer.str());
402 std::ostringstream buffer;
404 buffer <<
"OpenNN Exception: DataSet class.\n"
405 <<
"std::string write_separator(void) const method.\n"
406 <<
"Unknown separator.\n";
408 throw std::logic_error(buffer.str());
475 if(scaling_unscaling_method ==
"MinimumMaximum")
477 return(MinimumMaximum);
479 else if(scaling_unscaling_method ==
"MeanStandardDeviation")
481 return(MeanStandardDeviation);
485 std::ostringstream buffer;
487 buffer <<
"OpenNN Exception: DataSet class.\n"
488 <<
"static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string).\n"
489 <<
"Unknown scaling-unscaling method: " << scaling_unscaling_method <<
".\n";
491 throw std::logic_error(buffer.str());
692 if(i >= instances_number)
694 std::ostringstream buffer;
696 buffer <<
"OpenNN Exception: DataSet class.\n"
697 <<
"Vector<double> get_instance(const size_t&) const method.\n"
698 <<
"Index of instance must be less than number of instances.\n";
700 throw std::logic_error(buffer.str());
725 if(instance_index >= instances_number)
727 std::ostringstream buffer;
729 buffer <<
"OpenNN Exception: DataSet class.\n"
730 <<
"Vector<double> get_instance(const size_t&, const Vector<size_t>&) const method.\n"
731 <<
"Index of instance must be less than number of instances.\n";
733 throw std::logic_error(buffer.str());
757 if(i >= variables_number)
759 std::ostringstream buffer;
761 buffer <<
"OpenNN Exception: DataSet class.\n"
762 <<
"Vector<double> get_variable(const size_t&) const method.\n"
763 <<
"Index of variable must be less than number of instances.\n";
765 throw std::logic_error(buffer.str());
790 if(variable_index >= variables_number)
792 std::ostringstream buffer;
794 buffer <<
"OpenNN Exception: DataSet class.\n"
795 <<
"Vector<double> get_variable(const size_t&, const Vector<double>&) const method.\n"
796 <<
"Index of variable must be less than number of instances.\n";
798 throw std::logic_error(buffer.str());
859 void DataSet::set(
const size_t& new_variables_number,
const size_t& new_instances_number)
865 if(new_variables_number == 0)
867 std::ostringstream buffer;
869 buffer <<
"OpenNN Exception: DataSet class.\n"
870 <<
"void set(const size_t&, const size_t&) method.\n"
871 <<
"Number of variables must be greater than zero.\n";
873 throw std::logic_error(buffer.str());
876 if(new_instances_number == 0)
878 std::ostringstream buffer;
880 buffer <<
"OpenNN Exception: DataSet class.\n"
881 <<
"void set(const size_t&, const size_t&) method.\n"
882 <<
"Number of instances must be greater than zero.\n";
884 throw std::logic_error(buffer.str());
889 data.
set(new_instances_number, new_variables_number);
910 void DataSet::set(
const size_t& new_inputs_number,
const size_t& new_targets_number,
const size_t& new_instances_number)
914 const size_t new_variables_number = new_inputs_number + new_targets_number;
916 data.
set(new_instances_number, new_variables_number);
1112 if(new_separator ==
"Space")
1116 else if(new_separator ==
"Tab")
1120 else if(new_separator ==
"Comma")
1124 else if(new_separator ==
"Semicolon")
1130 std::ostringstream buffer;
1132 buffer <<
"OpenNN Exception: DataSet class.\n"
1133 <<
"void set_separator(const std::string&) method.\n"
1134 <<
"Unknown separator: " << new_separator <<
".\n";
1136 throw std::logic_error(buffer.str());
1154 std::ostringstream buffer;
1156 buffer <<
"OpenNN Exception: DataSet class.\n"
1157 <<
"void set_missing_values_label(const std::string&) method.\n"
1158 <<
"Missing values label cannot be empty.\n";
1160 throw std::logic_error(buffer.str());
1229 data.
set(new_instances_number, variables_number);
1246 data.
set(instances_number, new_variables_number);
1266 if(instance_index >= instances_number)
1268 std::ostringstream buffer;
1270 buffer <<
"OpenNN Exception: DataSet class.\n"
1271 <<
"void set_instance(const size_t&, const Vector<double>&) method.\n"
1272 <<
"Index of instance must be less than number of instances.\n";
1274 throw std::logic_error(buffer.str());
1277 const size_t size = instance.size();
1280 if(size != variables_number)
1282 std::ostringstream buffer;
1284 buffer <<
"OpenNN Exception: DataSet class.\n"
1285 <<
"void set_instance(const size_t&, const Vector<double>&) method.\n"
1286 <<
"Size (" << size <<
") must be equal to number of variables (" << variables_number <<
").\n";
1288 throw std::logic_error(buffer.str());
1313 const size_t size = instance.size();
1316 if(size != variables_number)
1318 std::ostringstream buffer;
1320 buffer <<
"OpenNN Exception: DataSet class.\n"
1321 <<
"void add_instance(const Vector<double>&) method.\n"
1322 <<
"Size of instance must be equal to number of variables.\n";
1324 throw std::logic_error(buffer.str());
1352 if(instance_index >= instances_number)
1354 std::ostringstream buffer;
1356 buffer <<
"OpenNN Exception: DataSet class.\n"
1357 <<
"void subtract_instance(size_t) method.\n"
1358 <<
"Index of instance must be less than number of instances.\n";
1360 throw std::logic_error(buffer.str());
1383 const size_t size = variable.size();
1386 if(size != instances_number)
1388 std::ostringstream buffer;
1390 buffer <<
"OpenNN Exception: DataSet class.\n"
1391 <<
"void append_variable(const Vector<double>&) method.\n"
1392 <<
"Size of variable must be equal to number of instances.\n";
1394 throw std::logic_error(buffer.str());
1405 const size_t new_variables_number = variables_number + 1;
1426 if(variable_index >= variables_number)
1428 std::ostringstream buffer;
1430 buffer <<
"OpenNN Exception: DataSet class.\n"
1431 <<
"void subtract_variable(size_t) method.\n"
1432 <<
"Index of variable must be less than number of variables.\n";
1434 throw std::logic_error(buffer.str());
1443 const size_t new_variables_number = variables_number - 1;
1464 if(variables_number == 0)
1466 std::ostringstream buffer;
1468 buffer <<
"OpenNN Exception: DataSet class.\n"
1469 <<
"Vector<size_t> unuse_constant_variables(void) method.\n"
1470 <<
"Number of variables is zero.\n";
1472 throw std::logic_error(buffer.str());
1479 for(
size_t i = 0; i < variables_number; i++)
1484 constant_variables.push_back(i);
1488 return(constant_variables);
1505 if(instances_number == 0)
1507 std::ostringstream buffer;
1509 buffer <<
"OpenNN Exception: DataSet class.\n"
1510 <<
"Vector<size_t> unuse_repeated_indices(void) method.\n"
1511 <<
"Number of instances is zero.\n";
1513 throw std::logic_error(buffer.str());
1525 #pragma omp parallel for private(i, instance_i, instance_j)
1527 for(i = 0; i < (int)instances_number; i++)
1531 for(
size_t j = i+1; j < instances_number; j++)
1536 && instance_j == instance_i)
1539 repeated_instances.push_back(j);
1544 return(repeated_instances);
1601 for(
size_t i = 0; i < variables_number; i++)
1603 data_statistics_matrix.
set_row(i, data_statistics[i].to_vector());
1606 return(data_statistics_matrix);
1778 size_t target_index;
1787 Matrix<double> linear_correlations(inputs_number, targets_number);
1789 for(
size_t i = 0; i < inputs_number; i++)
1791 input_index = input_indices[i];
1795 for(
size_t j = 0; j < targets_number; j++)
1797 target_index = target_indices[j];
1806 return(linear_correlations);
1823 std::ostringstream buffer;
1827 const size_t statistics_size = data_statistics.size();
1829 if(statistics_size != columns_number)
1831 buffer <<
"OpenNN Exception: DataSet class.\n"
1832 <<
"void scale_data_mean_standard_deviation(const Vector< Statistics<double> >&) method.\n"
1833 <<
"Size of statistics must be equal to number of columns.\n";
1835 throw std::logic_error(buffer.str());
1842 for(
size_t i = 0; i < variables_number; i++)
1844 if(
display && data_statistics[i].standard_deviation < 1.0e-99)
1846 std::cout <<
"OpenNN Warning: DataSet class.\n"
1847 <<
"void scale_data_mean_standard_deviation(const Vector< Statistics<Type> >&) method.\n"
1848 <<
"Standard deviation of variable " << i <<
" is zero.\n"
1849 <<
"That variable won't be scaled.\n";
1869 return(data_statistics);
1885 return(data_statistics);
1904 std::ostringstream buffer;
1906 const size_t statistics_size = data_statistics.size();
1908 if(statistics_size != variables_number)
1910 buffer <<
"OpenNN Exception: DataSet class.\n"
1911 <<
"void scale_data_minimum_maximum(const Vector< Statistics<double> >&) method.\n"
1912 <<
"Size of data statistics must be equal to number of variables.\n";
1914 throw std::logic_error(buffer.str());
1919 for(
size_t i = 0; i < variables_number; i++)
1921 if(
display && data_statistics[i].maximum-data_statistics[i].minimum < 1.0e-99)
1923 std::cout <<
"OpenNN Warning: DataSet class.\n"
1924 <<
"void scale_data_minimum_maximum(const Vector< Statistics<Type> >&) method.\n"
1925 <<
"Range of variable " << i <<
" is zero.\n"
1926 <<
"That variable won't be scaled.\n";
1948 case MinimumMaximum:
1954 case MeanStandardDeviation:
1962 std::ostringstream buffer;
1964 buffer <<
"OpenNN Exception: DataSet class\n"
1965 <<
"void scale_data(const std::string&, const Vector< Vector<double> >&) method.\n"
1966 <<
"Unknown data scaling and unscaling method.\n";
1968 throw std::logic_error(buffer.str());
1986 case MinimumMaximum:
1992 case MeanStandardDeviation:
2000 std::ostringstream buffer;
2002 buffer <<
"OpenNN Exception: DataSet class\n"
2003 <<
"Vector< Statistics<double> > scale_data(const std::string&) method.\n"
2004 <<
"Unknown scaling and unscaling method.\n";
2006 throw std::logic_error(buffer.str());
2044 std::ostringstream buffer;
2046 buffer <<
"OpenNN Exception: DataSet class.\n"
2047 <<
"Vector< Statistics<double> > scale_inputs_mean_standard_deviation(void) method.\n"
2048 <<
"Data file is not loaded.\n";
2050 throw std::logic_error(buffer.str());
2059 return(inputs_statistics);
2092 std::ostringstream buffer;
2094 buffer <<
"OpenNN Exception: DataSet class.\n"
2095 <<
"Vector< Statistics<double> > scale_inputs_minimum_maximum(void) method.\n"
2096 <<
"Data file is not loaded.\n";
2098 throw std::logic_error(buffer.str());
2107 return(inputs_statistics);
2122 case MinimumMaximum:
2128 case MeanStandardDeviation:
2136 std::ostringstream buffer;
2138 buffer <<
"OpenNN Exception: DataSet class\n"
2139 <<
"Vector< Statistics<double> > scale_inputs(void) method.\n"
2140 <<
"Unknown scaling and unscaling method.\n";
2142 throw std::logic_error(buffer.str());
2159 case MinimumMaximum:
2165 case MeanStandardDeviation:
2173 std::ostringstream buffer;
2175 buffer <<
"OpenNN Exception: DataSet class\n"
2176 <<
"void scale_inputs(const std::string&, const Vector< Statistics<double> >&) method.\n"
2177 <<
"Unknown scaling and unscaling method.\n";
2179 throw std::logic_error(buffer.str());
2215 std::ostringstream buffer;
2217 buffer <<
"OpenNN Exception: DataSet class.\n"
2218 <<
"Vector< Statistics<double> > scale_targets_mean_standard_deviation(void) method.\n"
2219 <<
"Data file is not loaded.\n";
2221 throw std::logic_error(buffer.str());
2230 return(targets_statistics);
2249 std::ostringstream buffer;
2251 buffer <<
"OpenNN Exception: DataSet class.\n"
2252 <<
"Vector< Statistics<double> > scale_targets_minimum_maximum(void) method.\n"
2253 <<
"Data file is not loaded.\n";
2255 throw std::logic_error(buffer.str());
2278 return(targets_statistics);
2293 case MinimumMaximum:
2299 case MeanStandardDeviation:
2307 std::ostringstream buffer;
2309 buffer <<
"OpenNN Exception: DataSet class\n"
2310 <<
"Vector< Statistics<double> > scale_targets(const std::string&) method.\n"
2311 <<
"Unknown scaling and unscaling method.\n";
2313 throw std::logic_error(buffer.str());
2329 case MinimumMaximum:
2335 case MeanStandardDeviation:
2343 std::ostringstream buffer;
2345 buffer <<
"OpenNN Exception: DataSet class\n"
2346 <<
"void scale_targets(const std::string&, const Vector< Statistics<double> >&) method.\n"
2347 <<
"Unknown scaling and unscaling method.\n";
2349 throw std::logic_error(buffer.str());
2481 tinyxml2::XMLDocument* document =
new tinyxml2::XMLDocument;
2483 std::ostringstream buffer;
2487 tinyxml2::XMLElement* data_set_element = document->NewElement(
"DataSet");
2488 document->InsertFirstChild(data_set_element);
2490 tinyxml2::XMLElement* element = NULL;
2491 tinyxml2::XMLText* text = NULL;
2495 tinyxml2::XMLElement* data_file_element = document->NewElement(
"DataFile");
2497 data_set_element->InsertFirstChild(data_file_element);
2501 element = document->NewElement(
"DataFileName");
2502 data_file_element->LinkEndChild(element);
2505 element->LinkEndChild(text);
2510 element = document->NewElement(
"HeaderLine");
2511 data_file_element->LinkEndChild(element);
2516 text = document->NewText(buffer.str().c_str());
2517 element->LinkEndChild(text);
2522 element = document->NewElement(
"Separator");
2523 data_file_element->LinkEndChild(element);
2526 element->LinkEndChild(text);
2531 element = document->NewElement(
"MissingValuesLabel");
2532 data_file_element->LinkEndChild(element);
2535 element->LinkEndChild(text);
2540 element = document->NewElement(
"Variables");
2541 data_set_element->LinkEndChild(element);
2545 const tinyxml2::XMLElement* variables_element = variables_document->FirstChildElement(
"Variables");
2547 DeepClone(element, variables_element, document, NULL);
2549 delete variables_document;
2554 element = document->NewElement(
"Instances");
2555 data_set_element->LinkEndChild(element);
2559 const tinyxml2::XMLElement* instances_element = instances_document->FirstChildElement(
"Instances");
2561 DeepClone(element, instances_element, document, NULL);
2563 delete instances_document;
2568 element = document->NewElement(
"MissingValues");
2569 data_set_element->LinkEndChild(element);
2573 const tinyxml2::XMLElement* missing_values_element = missing_values_document->FirstChildElement(
"MissingValues");
2575 DeepClone(element, missing_values_element, document, NULL);
2577 delete missing_values_document;
2582 element = document->NewElement(
"Display");
2583 data_set_element->LinkEndChild(element);
2588 text = document->NewText(buffer.str().c_str());
2589 element->LinkEndChild(text);
2604 std::ostringstream buffer;
2608 const tinyxml2::XMLElement* data_set_element = data_set_document.FirstChildElement(
"DataSet");
2610 if(!data_set_element)
2612 buffer <<
"OpenNN Exception: DataSet class.\n"
2613 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
2614 <<
"Data set element is NULL.\n";
2616 throw std::logic_error(buffer.str());
2621 const tinyxml2::XMLElement* data_file_element = data_set_element->FirstChildElement(
"DataFile");
2623 if(!data_file_element)
2625 buffer <<
"OpenNN Exception: DataSet class.\n"
2626 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
2627 <<
"Data file element is NULL.\n";
2629 throw std::logic_error(buffer.str());
2635 const tinyxml2::XMLElement* data_file_name_element = data_file_element->FirstChildElement(
"DataFileName");
2637 if(!data_file_name_element)
2639 buffer <<
"OpenNN Exception: DataSet class.\n"
2640 <<
"void from_XML(const tinyxml2::XMLDocument&) method.\n"
2641 <<
"Data file name element is NULL.\n";
2643 throw std::logic_error(buffer.str());
2646 if(data_file_name_element->GetText())
2648 const std::string new_data_file_name = data_file_name_element->GetText();
2656 const tinyxml2::XMLElement* header_element = data_file_element->FirstChildElement(
"HeaderLine");
2660 const std::string new_header_string = header_element->GetText();
2666 catch(
const std::logic_error& e)
2668 std::cout << e.what() << std::endl;
2675 const tinyxml2::XMLElement* separator_element = data_file_element->FirstChildElement(
"Separator");
2677 if(separator_element)
2679 if(separator_element->GetText())
2681 const std::string new_separator = separator_element->GetText();
2698 const tinyxml2::XMLElement* missing_values_label_element = data_file_element->FirstChildElement(
"MissingValuesLabel");
2700 if(missing_values_label_element)
2702 if(missing_values_label_element->GetText())
2704 const std::string new_missing_values_label = missing_values_label_element->GetText();
2713 const tinyxml2::XMLElement* variables_element = data_set_element->FirstChildElement(
"Variables");
2715 if(variables_element)
2717 tinyxml2::XMLDocument variables_document;
2719 tinyxml2::XMLElement* variables_element_clone = variables_document.NewElement(
"Variables");
2720 variables_document.InsertFirstChild(variables_element_clone);
2722 DeepClone(variables_element_clone, variables_element, &variables_document, NULL);
2730 const tinyxml2::XMLElement* instances_element = data_set_element->FirstChildElement(
"Instances");
2732 if(instances_element)
2734 tinyxml2::XMLDocument instances_document;
2736 tinyxml2::XMLElement* instances_element_clone = instances_document.NewElement(
"Instances");
2737 instances_document.InsertFirstChild(instances_element_clone);
2739 DeepClone(instances_element_clone, instances_element, &instances_document, NULL);
2747 const tinyxml2::XMLElement* missing_values_element = data_set_element->FirstChildElement(
"MissingValues");
2749 if(missing_values_element)
2751 tinyxml2::XMLDocument missing_values_document;
2753 tinyxml2::XMLElement* missing_values_element_clone = missing_values_document.NewElement(
"MissingValues");
2754 missing_values_document.InsertFirstChild(missing_values_element_clone);
2756 DeepClone(missing_values_element_clone, missing_values_element, &missing_values_document, NULL);
2764 const tinyxml2::XMLElement* display_element = data_set_element->FirstChildElement(
"Display");
2768 const std::string new_display_string = display_element->GetText();
2774 catch(
const std::logic_error& e)
2776 std::cout << e.what() << std::endl;
2790 std::ostringstream buffer;
2792 buffer <<
"Data set object\n"
2797 <<
"Data:\n" <<
data <<
"\n"
2798 <<
"Display: " <<
display <<
"\n"
2803 return(buffer.str());
2832 std::cout <<
"Data set object summary:\n"
2833 <<
"Number of variables: " << variables_number <<
"\n"
2834 <<
"Number of instances: " << instances_number <<
"\n"
2835 <<
"Number of missing values: " << missing_values_number << std::endl;
2847 tinyxml2::XMLDocument* document =
to_XML();
2849 document->SaveFile(file_name.c_str());
2882 tinyxml2::XMLDocument document;
2884 if(document.LoadFile(file_name.c_str()))
2886 std::ostringstream buffer;
2888 buffer <<
"OpenNN Exception: DataSet class.\n"
2889 <<
"void load(const std::string&) method.\n"
2890 <<
"Cannot load XML file " << file_name <<
".\n";
2892 throw std::logic_error(buffer.str());
2907 std::cout <<
data << std::endl;
2923 if(instances_number > 0)
2927 std::cout <<
"First instance:\n"
2928 << first_instance << std::endl;
2931 if(instances_number > 1)
2935 std::cout <<
"Second instance:\n"
2936 << second_instance << std::endl;
2939 if(instances_number > 2)
2943 std::cout <<
"Instance " << instances_number <<
":\n"
2944 << last_instance << std::endl;
2960 std::ostringstream buffer;
2962 buffer <<
"OpenNN Exception: DataSet class.\n"
2963 <<
"void save_data(void) const method.\n"
2964 <<
"Cannot open data file.\n";
2966 throw std::logic_error(buffer.str());
2973 file << variables_name << std::endl;
2994 size_t variable_index = 0;
2996 for(
size_t i = 0; i < column_index; i++)
2998 if(nominal_labels[i].size() <= 2)
3004 variable_index += nominal_labels[i].size();
3008 return variable_index;
3027 if(line.find(separator_string) == std::string::npos)
3029 std::ostringstream buffer;
3031 buffer <<
"OpenNN Exception: DataSet class.\n"
3032 <<
"void check_separator(const std::string&) method.\n"
3035 throw std::logic_error(buffer.str());
3051 size_t columns_number = 0;
3055 getline(file, line);
3059 std::replace(line.begin(), line.end(),
'\t',
' ');
3076 return columns_number;
3097 getline(file, line);
3101 std::replace(line.begin(), line.end(),
'\t',
' ');
3128 std::cout <<
"OpenNN Warning: DataSet class.\n"
3129 <<
"void check_header_line(void) method.\n"
3130 <<
"First line of data file interpreted as not header.\n";
3137 std::ostringstream buffer;
3139 buffer <<
"OpenNN Exception: DataSet class.\n"
3140 <<
"void check_header_line(void) method.\n"
3141 <<
"Header line contains numeric values: \n"
3144 throw std::logic_error(buffer.str());
3150 std::cout <<
"OpenNN Warning: DataSet class.\n"
3151 <<
"void check_header_line(void) method.\n"
3152 <<
"First line of data file interpreted as header.\n";
3176 getline(file, line);
3180 std::replace(line.begin(), line.end(),
'\t',
' ');
3218 if(instance_index >= instances_number)
3220 std::ostringstream buffer;
3222 buffer <<
"OpenNN Exception: DataSet class.\n"
3223 <<
"void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method.\n"
3224 <<
"Index of instance (" << instance_index <<
") must be less than number of instances (" << instances_number <<
").\n";
3226 throw std::logic_error(buffer.str());
3235 if(tokens.size() != nominal_labels.size())
3237 std::ostringstream buffer;
3239 buffer <<
"OpenNN Exception: DataSet class.\n"
3240 <<
"void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method.\n"
3241 <<
"Size of tokens (" << tokens.size() <<
") must be equal to size of names (" << nominal_labels.size() <<
").\n";
3243 throw std::logic_error(buffer.str());
3248 size_t column_index;
3250 for(
size_t j = 0; j < tokens.size(); j++)
3254 if(nominal_labels[j].size() == 0)
3258 data(instance_index, column_index) = atof(tokens[j].c_str());
3262 data(instance_index, column_index) = -99.9;
3268 else if(nominal_labels[j].size() == 2)
3272 if(tokens[j] ==
"false" || tokens[j] ==
"False"|| tokens[j] ==
"FALSE"
3273 || tokens[j] ==
"negative"|| tokens[j] ==
"Negative"|| tokens[j] ==
"NEGATIVE")
3275 data(instance_index, column_index) = 0.0;
3277 else if(tokens[j] ==
"true" || tokens[j] ==
"True"|| tokens[j] ==
"TRUE"
3278 || tokens[j] ==
"positive"|| tokens[j] ==
"Positive"|| tokens[j] ==
"POSITIVE")
3280 data(instance_index, column_index) = 1.0;
3282 else if(tokens[j] == nominal_labels[j][0])
3284 data(instance_index, column_index) = 0.0;
3286 else if(tokens[j] == nominal_labels[j][1])
3288 data(instance_index, column_index) = 1.0;
3292 std::ostringstream buffer;
3294 buffer <<
"OpenNN Exception: DataSet class.\n"
3295 <<
"void read_instance(const std::string&, const Vector< Vector<std::string> >&, const size_t&) method.\n"
3296 <<
"Unknown token binary value.\n";
3298 throw std::logic_error(buffer.str());
3303 data(instance_index, column_index) = -99.9;
3313 for(
size_t k = 0; k < nominal_labels[j].size(); k++)
3315 if(tokens[j] == nominal_labels[j][k])
3317 data(instance_index, column_index+k) = 1.0;
3321 data(instance_index, column_index+k) = 0.0;
3327 for(
size_t k = 0; k < nominal_labels[j].size(); k++)
3329 data(instance_index, column_index+k) = -99.9;
3357 int instances_count;
3361 instances_count = -1;
3365 instances_count = 0;
3374 getline(file, line);
3378 std::replace(line.begin(), line.end(),
'\t',
' ');
3390 instances_count = 0;
3399 if(tokens.size() != columns_number)
3401 std::ostringstream buffer;
3403 buffer <<
"OpenNN Exception: DataSet class.\n"
3404 <<
"Vector< Vector<std::string> > DataSet::set_from_data_file(void).\n"
3405 <<
"Row " << instances_count <<
": Size of tokens (" << tokens.size() <<
") is not equal to "
3406 <<
"number of columns (" << columns_number <<
").\n";
3408 throw std::logic_error(buffer.str());
3413 for(
size_t j = 0; j < columns_number; j++)
3419 && !nominal_labels[j].contains(tokens[j]))
3421 nominal_labels[j].push_back(tokens[j]);
3428 size_t variables_count = 0;
3430 for(
size_t i = 0; i < columns_number; i++)
3432 if(nominal_labels[i].size() == 0 || nominal_labels[i].size() == 2)
3438 variables_count += nominal_labels[i].size();
3444 for(
size_t i = 0; i < columns_number; i++)
3446 if(nominal_labels[i].size() == instances_count)
3448 std::ostringstream buffer;
3450 buffer <<
"OpenNN Exception: DataSet class.\n"
3451 <<
"Vector< Vector<std::string> > DataSet::set_from_data_file(void).\n"
3452 <<
"Column " << i <<
": All elements are nominal and different. It contains meaningless data.\n";
3454 throw std::logic_error(buffer.str());
3460 if(instances_count == 0 || variables_count == 0)
3464 return(nominal_labels);
3467 data.
set(instances_count, variables_count);
3473 if(nominal_labels[columns_number-1].size() > 2)
3475 for(
size_t i = variables_count-1; i >= variables_count - nominal_labels[columns_number-1].size(); i--)
3489 return(nominal_labels);
3502 file.seekg(0, std::ios::beg);
3510 getline(file, line);
3514 std::replace(line.begin(), line.end(),
'\t',
' ');
3534 getline(file, line);
3538 std::replace(line.begin(), line.end(),
'\t',
' ');
3585 return(time_series_prediction_names);
3598 return(autoassociation_names);
3647 std::ostringstream buffer;
3649 buffer <<
"OpenNN Exception: DataSet class.\n"
3650 <<
"void load_data(void) method.\n"
3651 <<
"Data file name has not been set.\n";
3653 throw std::logic_error(buffer.str());
3660 std::ostringstream buffer;
3662 buffer <<
"OpenNN Exception: DataSet class.\n"
3663 <<
"void load_data(void) method.\n"
3666 throw std::logic_error(buffer.str());
3685 for(
unsigned i = 0; i < nominal_labels.size(); i++)
3687 std::ostringstream buffer;
3689 buffer <<
"variable_" << i;
3691 columns_name.push_back(buffer.str());
3799 if(targets_number == 1)
3801 class_distribution.
set(2, 0);
3803 size_t target_index = targets_indices[0];
3805 for(
size_t instance_index = 0; instance_index < instances_number; instance_index++)
3809 if(
data(instance_index,target_index) < 0.5)
3811 class_distribution[0]++;
3815 class_distribution[1]++;
3822 class_distribution.
set(targets_number, 0);
3824 for(
size_t i = 0; i < instances_number; i++)
3828 for(
size_t j = 0; j < targets_number; j++)
3830 if(
data(i,targets_indices[j]) > 0.5)
3832 class_distribution[j]++;
3843 if(class_distribution.
calculate_sum() != used_instances_number)
3845 std::ostringstream buffer;
3847 buffer <<
"OpenNN Exception: DataSet class.\n"
3848 <<
"Vector<size_t> calculate_target_class_distribution(void) const method.\n"
3849 <<
"Sum of class distributions (" << class_distribution <<
") is not equal to "
3850 <<
"number of used instances (" << used_instances_number <<
")." << std::endl;
3852 throw std::logic_error(buffer.str());
3855 return(class_distribution);
3879 #pragma omp parallel for private(i, instance)
3881 for(i = 0; i < (int)instances_number; i++)
3885 distances[i] = (instance-means/standard_deviations).calculate_norm();
4080 if(minimums.size() != variables_number)
4082 std::ostringstream buffer;
4084 buffer <<
"OpenNN Exception: DataSet class.\n"
4085 <<
"Vector<size_t> filter_data(const Vector<double>&, const Vector<double>&) method.\n"
4086 <<
"Size of minimums (" << minimums.size() <<
") is not equal to number of variables (" << variables_number <<
").\n";
4088 throw std::logic_error(buffer.str());
4091 if(maximums.size() != variables_number)
4093 std::ostringstream buffer;
4095 buffer <<
"OpenNN Exception: DataSet class.\n"
4096 <<
"Vector<size_t> filter_data(const Vector<double>&, const Vector<double>&) method.\n"
4097 <<
"Size of maximums (" << maximums.size() <<
") is not equal to number of variables (" << variables_number <<
").\n";
4099 throw std::logic_error(buffer.str());
4110 for(
size_t i = 0; i < instances_number; i++)
4112 for(
size_t j = 0; j < variables_number; j++)
4119 if(
data(i,j) < minimums[j] ||
data(i,j) > maximums[j])
4123 filtered_indices.push_back(i);
4131 return(filtered_indices);
4150 if(variable_index >= variables_number)
4152 std::ostringstream buffer;
4154 buffer <<
"OpenNN Exception: DataSet class.\n"
4155 <<
"void convert_angular_variable_degrees(const size_t&) method.\n"
4156 <<
"Index of variable (" << variable_index <<
") must be less than number of variables (" << variables_number <<
").\n";
4158 throw std::logic_error(buffer.str());
4171 items[variable_index] = sin_item;
4196 if(variable_index >= variables_number)
4198 std::ostringstream buffer;
4200 buffer <<
"OpenNN Exception: DataSet class.\n"
4201 <<
"void convert_angular_variable_radians(const size_t&) method.\n"
4202 <<
"Index of variable (" << variable_index <<
") must be less than number of variables (" << variables_number <<
").\n";
4204 throw std::logic_error(buffer.str());
4217 items[variable_index] = sin_item;
4242 for(
size_t i = 0; i < indices.size(); i++)
4244 if(indices[i] >= variables_number)
4246 std::ostringstream buffer;
4248 buffer <<
"OpenNN Exception: DataSet class.\n"
4249 <<
"void convert_angular_variables_degrees(const Vector<size_t>&) method.\n"
4250 <<
"Index (" << i <<
") must be less than number of variables (" << variables_number <<
").\n";
4252 throw std::logic_error(buffer.str());
4258 size_t size = indices.size();
4264 for(
size_t i = 0; i < size; i++)
4266 index = indices[i]+count;
4290 for(
size_t i = 0; i < indices.size(); i++)
4292 if(indices[i] >= variables_number)
4294 std::ostringstream buffer;
4296 buffer <<
"OpenNN Exception: DataSet class.\n"
4297 <<
"void convert_angular_variables_radians(const Vector<size_t>&) method.\n"
4298 <<
"Index (" << i <<
") must be less than number of variables (" << variables_number <<
").\n";
4300 throw std::logic_error(buffer.str());
4306 size_t size = indices.size();
4312 for(
size_t i = 0; i < size; i++)
4314 index = indices[i]+count;
4333 case DataSet::Radians:
4339 case DataSet::Degrees:
4347 std::ostringstream buffer;
4349 buffer <<
"OpenNN Exception: DataSet class.\n"
4350 <<
"void convert_angular_variables(void) method.\n"
4351 <<
"Unknown angular units.\n";
4353 throw std::logic_error(buffer.str());
4369 for(
size_t i = 0; i < missing_instances.size(); i++)
4388 size_t instance_index;
4390 for(
size_t i = 0; i < variables_number; i++)
4392 for(
size_t j = 0; j < missing_indices[i].size(); j++)
4394 instance_index = missing_indices[i][j];
4395 data(instance_index, i) = means[i];
4411 switch(scrubbing_method)
4413 case MissingValues::Unuse:
4419 case MissingValues::Mean:
4427 std::ostringstream buffer;
4429 buffer <<
"OpenNN Exception: DataSet class\n"
4430 <<
"void scrub_missing_values(void) method.\n"
4431 <<
"Unknown scrubbing method.\n";
4433 throw std::logic_error(buffer.str());
4462 size_t tokens_count = 0;
4468 std::string::size_type last_pos = str.find_first_not_of(separator_string, 0);
4472 std::string::size_type pos = str.find_first_of(separator_string, last_pos);
4474 while (std::string::npos != pos || std::string::npos != last_pos)
4482 last_pos = str.find_first_not_of(separator_string, pos);
4486 pos = str.find_first_of(separator_string, last_pos);
4489 return(tokens_count);
4507 std::string::size_type lastPos = new_string.find_first_not_of(separator_string, 0);
4511 std::string::size_type pos = new_string.find_first_of(separator_string, lastPos);
4513 while(std::string::npos != pos || std::string::npos != lastPos)
4517 tokens.push_back(new_string.substr(lastPos, pos - lastPos));
4521 lastPos = new_string.find_first_not_of(separator_string, pos);
4525 pos = new_string.find_first_of(separator_string, lastPos);
4528 for(
size_t i = 0; i < tokens.size(); i++)
4544 std::istringstream iss(str.data());
4559 return(iss.rdbuf()->in_avail() == 0);
4573 str.erase(0, str.find_first_not_of(
' '));
4577 str.erase(str.find_last_not_of(
' ') + 1);
4587 std::string output(str);
4591 output.erase(0, output.find_first_not_of(
' '));
4595 output.erase(output.find_last_not_of(
' ') + 1);
4609 std::ostringstream buffer;
4611 buffer << pre << str;
4613 return(buffer.str());
4624 for(
size_t i = 0; i < v.size(); i++)
4643 for(
size_t i = 0; i < v.size(); i++)
4662 unsigned count_numeric = 0;
4663 unsigned count_not_numeric = 0;
4665 for(
size_t i = 0; i < v.size(); i++)
4673 count_not_numeric++;
4677 if(count_numeric > 0 && count_not_numeric > 0)
Vector< Statistics< T > > calculate_rows_statistics_missing_values(const Vector< size_t > &, const Vector< Vector< size_t > > &) const
void unscale_inputs_minimum_maximum(const Vector< Statistics< double > > &)
void from_XML(const tinyxml2::XMLDocument &)
const Use & get_use(const size_t &) const
void randomize_data_normal(const double &mean=0.0, const double &standard_deviation=1.0)
Vector< T > arrange_column(const size_t &) const
void convert_time_series(const size_t &)
Separator
Enumeration of available separators for the data file.
void save(const std::string &) const
Variables * get_variables_pointer(void)
Returns a pointer to the variables object composing this data set object.
Vector< Vector< std::string > > set_from_data_file(void)
void scale_columns_mean_standard_deviation(const Vector< Statistics< T > > &, const Vector< size_t > &)
Vector< Histogram< T > > calculate_histograms_missing_values(const Vector< Vector< size_t > > &, const size_t &=10) const
void unscale_data_minimum_maximum(const Vector< Statistics< double > > &)
bool has_data(void) const
Vector< Statistics< T > > calculate_statistics_missing_values(const Vector< Vector< size_t > > &) const
Matrix< double > arrange_training_input_data(void) const
void set_data(const Matrix< double > &)
tinyxml2::XMLDocument * to_XML(void) const
ScrubbingMethod get_scrubbing_method(void) const
Returns the method to be used for dealing with the missing values.
Matrix< double > calculate_linear_correlations(void) const
void scrub_missing_values_mean(void)
Substitutes all the missing values by the mean of the corresponding variable.
void add_instance(const Vector< double > &)
void set(void)
Sets zero instances and zero variables in the data set.
ScrubbingMethod
Enumeration of available methods for dealing with the missing values.
Vector< Statistics< double > > calculate_targets_statistics(void) const
const Separator & get_separator(void) const
Returns the separator to be used in the data file.
void unscale_data_mean_standard_deviation(const Vector< Statistics< double > > &)
void convert_angular_variable_degrees(const size_t &)
std::string write_separator(void) const
Returns the string which will be used as separator in the data file.
const Variables & get_variables(void) const
Returns a constant reference to the variables object composing this data set object.
Vector< size_t > angular_variables
Indices of angular variables.
Vector< size_t > arrange_missing_instances(void) const
Returns a vector with the indices of those instances with missing values.
void set_data_file_name(const std::string &)
void read_instance(const std::string &, const Vector< Vector< std::string > > &, const size_t &)
std::string data_file_name
Data file name.
size_t get_column_index(const Vector< Vector< std::string > > &, const size_t) const
std::string to_string(void) const
Returns a string representation of the current instances object.
Matrix< double > arrange_target_data(void) const
bool is_constant(const double &=0.0) const
void set(void)
Sets an instances object with zero instances.
const std::string & get_missing_values_label(void) const
Returns the string which will be used as label for the missing values in the data file...
void append_column(const Vector< T > &)
Vector< size_t > unuse_repeated_instances(void)
void set_instances_number(const size_t &)
void convert_autoassociation(void)
void unscale_inputs_mean_standard_deviation(const Vector< Statistics< double > > &)
bool operator==(const DataSet &) const
void unscale_columns_mean_standard_deviation(const Vector< Statistics< T > > &, const Vector< size_t > &)
bool is_not_numeric(const Vector< std::string > &) const
AngularUnits angular_units
Units of angular variables.
Vector< double > calculate_mean_missing_values(const Vector< Vector< size_t > > &) const
void balance_data(const double &)
const Vector< size_t > & get_angular_variables(void) const
void set(void)
Sets the size of a vector to zero.
void append_variable(const Vector< double > &)
size_t count_data_file_columns_number(void) const
void initialize_data(const double &)
void set_missing_values_label(const std::string &)
tinyxml2::XMLDocument * to_XML(void) const
Matrix< T > arrange_submatrix(const Vector< size_t > &, const Vector< size_t > &) const
Matrix< double > arrange_testing_target_data(void) const
Matrix< double > arrange_input_data(void) const
void print_data(void) const
Prints to the screen the values of the data matrix.
void print_summary(void) const
Prints to the screen in text format the main numbers from the data set object.
std::string get_separator_string(void) const
Returns the string which will be used as separator in the data file.
ScalingUnscalingMethod
Enumeration of available methods for scaling and unscaling the data.
void set_angular_variables(const Vector< size_t > &)
void convert_angular_variables_radians(const size_t &)
void check_separator(const std::string &) const
bool is_numeric(const std::string &) const
void convert_time_series(const size_t &)
void print_data_preview(void) const
const MissingValues & get_missing_values(void) const
Returns a reference to the missing values object in the data set.
Vector< double > calculate_training_target_data_mean(void) const
Returns the mean values of the target variables on the training instances.
bool empty(void) const
Returns true if the data matrix is empty, and false otherwise.
void scrub_missing_values_unuse(void)
Sets all the instances with missing values to "Unused".
Vector< Statistics< double > > scale_inputs_mean_standard_deviation(void)
void scale_columns_minimum_maximum(const Vector< Statistics< T > > &, const Vector< size_t > &)
Vector< Statistics< double > > scale_targets(const std::string &)
Vector< Histogram< double > > calculate_data_histograms(const size_t &=10) const
MissingValues * get_missing_values_pointer(void)
Returns a pointer to the missing values object in the data set.
std::string prepend(const std::string &, const std::string &) const
void convert_angular_variables(void)
tinyxml2::XMLDocument * to_XML(void) const
Serializes the data set object into an XML document of the TinyXML library.
const bool & get_header_line(void) const
Returns true if the first line of the data file has a header with the names of the variables...
Vector< size_t > arrange_testing_indices(void) const
Returns the indices of the instances which will be used for testing.
void print(void) const
Prints to the screen in text format the members of the data set object.
std::string to_string(void) const
Returns a string representation of the current data set object.
void convert_angular_variables_degrees(const size_t &)
void set_display(const bool &)
Separator separator
Separator character.
void scale_data(const std::string &, const Vector< Statistics< double > > &)
const std::string & get_data_file_name(void) const
Returns the name of the data file.
Vector< std::string > arrange_autoassociation_names(const Vector< std::string > &) const
const size_t & get_columns_number(void) const
Returns the number of columns in the matrix.
Vector< Statistics< double > > scale_data_mean_standard_deviation(void)
Vector< size_t > arrange_targets_indices(void) const
Returns the indices of the target variables.
Vector< Statistics< double > > scale_inputs(const std::string &)
Vector< size_t > arrange_training_indices(void) const
Returns the indices of the instances which will be used for training.
bool autoassociation
Autoassociation flag.
void convert_autoassociation(void)
Vector< double > get_instance(const size_t &) const
void read_from_data_file(const Vector< Vector< std::string > > &)
Performs a second data file read in which the data is set.
void unscale_targets_mean_standard_deviation(const Vector< Statistics< double > > &)
DataSet & operator=(const DataSet &)
void from_XML(const tinyxml2::XMLDocument &)
void append(const size_t &, const size_t &)
void set_header_line(const bool &)
Sets if the data file contains a header with the names of the variables.
void set_autoassociation(const bool &)
void convert_time_series(const size_t &)
MissingValues missing_values
Missing values object.
void subtract_column(const size_t &)
Vector< Statistics< T > > calculate_statistics(void) const
void set_variables_number(const size_t &)
void balance_target_class_distribution(void)
size_t count_tokens(std::string &) const
const AngularUnits & get_angular_units(void) const
Returns the units used for the angular variables (Radians or Degrees).
Instances * get_instances_pointer(void)
Returns a pointer to the instances object composing this data set object.
Matrix< double > arrange_generalization_data(void) const
void unscale_targets_minimum_maximum(const Vector< Statistics< double > > &)
Vector< std::string > get_tokens(const std::string &) const
size_t lags_number
Number of lags.
void unscale_mean_standard_deviation(const Vector< Statistics< T > > &)
const bool & get_display(void) const
bool header_line
Header which contains the names of the variables.
Vector< size_t > unuse_constant_variables(void)
void set_items(const Vector< Item > &)
void set(void)
Sets a missing values object with zero instances, variables and missing values.
const Matrix< double > & get_data(void) const
void load(const std::string &)
void convert_time_series(void)
bool is_missing_value(const size_t &, const size_t &) const
Vector< Statistics< double > > calculate_data_statistics(void) const
void randomize_data_uniform(const double &minimum=-1.0, const double &maximum=1.0)
void initialize(const T &)
Vector< double > calculate_distances(void) const
Vector< double > calculate_testing_target_data_mean(void) const
Returns the mean values of the target variables on the testing instances.
Vector< Statistics< double > > scale_data_minimum_maximum(void)
Matrix< double > arrange_training_data(void) const
bool is_mixed(const Vector< std::string > &) const
void convert_time_series(const size_t &)
tinyxml2::XMLDocument * to_XML(void) const
void trim(std::string &) const
void convert_autoassociation(void)
Matrix< double > arrange_testing_data(void) const
void set_instances_number(const size_t &)
void set(void)
Sets a variables object with zero variables.
std::string to_string(void) const
Returns a string representation of the current MissingValues object.
void scale_minimum_maximum(const Vector< Statistics< T > > &)
Matrix< double > calculate_data_statistics_matrix(void) const
void convert_angular_variables_radians(const Vector< size_t > &)
Variables variables
Variables object (inputs and target variables).
void set(void)
This method sets the numbers of rows and columns of the matrix to zero.
const size_t & get_rows_number(void) const
Returns the number of rows in the matrix.
std::string missing_values_label
Missing values label.
const size_t & get_lags_number(void) const
Returns the number of lags to be used in a time series prediction application.
Vector< Statistics< double > > calculate_testing_instances_statistics(void) const
void from_XML(const tinyxml2::XMLDocument &)
Vector< Statistics< double > > calculate_inputs_statistics(void) const
void check_header_line(void)
AngularUnits
Enumeration of the units used for angular variables.
std::string get_trimmed(const std::string &) const
void set_lags_number(const size_t &)
Vector< Statistics< double > > scale_targets_mean_standard_deviation(void)
Vector< Statistics< T > > calculate_columns_statistics_missing_values(const Vector< size_t > &, const Vector< Vector< size_t > >) const
Vector< double > get_variable(const size_t &) const
T calculate_linear_correlation(const Vector< T > &) const
Matrix< double > arrange_testing_input_data(void) const
void subtract_variable(const size_t &)
Vector< Statistics< double > > scale_inputs_minimum_maximum(void)
void randomize_uniform(const double &=-1.0, const double &=1.0)
size_t count_inputs_number(void) const
Returns the number of input variables of the data set.
void append_row(const Vector< T > &)
void from_XML(const tinyxml2::XMLDocument &)
Matrix< double > arrange_training_target_data(void) const
Vector< Statistics< double > > calculate_generalization_instances_statistics(void) const
bool display
Display messages to screen.
void set_use(const size_t &, const Use &)
Vector< Statistics< double > > calculate_training_instances_statistics(void) const
void convert_angular_variable_radians(const size_t &)
Vector< std::string > read_header_line(void) const
Returns the names of the columns in the data set as a list of strings.
Matrix< double > get_generalization_target_data(void) const
void convert_autoassociation(void)
Vector< T > insert_element(const size_t &, const T &) const
Vector< double > calculate_generalization_target_data_mean(void) const
Returns the mean values of the target variables on the generalization instances.
Vector< std::string > arrange_names(void) const
Returns the names of all the variables in the data set.
void set_use(const size_t &, const Use &)
void set_angular_units(AngularUnits &)
Sets the units of the angular variables (Radians or Degrees).
size_t count_targets_number(void) const
Returns the number of target variables of the data set.
Matrix< double > get_generalization_input_data(void) const
Vector< std::string > arrange_time_series_names(const Vector< std::string > &) const
virtual ~DataSet(void)
Destructor.
void set_instance(const size_t &, const Vector< double > &)
size_t get_missing_values_number(void) const
Returns the number of missing values in the data set.
void set_separator(const Separator &)
void load_data(void)
This method loads the data file.
const bool & get_autoassociation(void) const
void scrub_missing_values(void)
bool is_used(const size_t &) const
std::string to_string(void) const
Returns a string representation of the current variables object.
void unscale_columns_minimum_maximum(const Vector< Statistics< T > > &, const Vector< size_t > &)
Vector< Statistics< double > > scale_targets_minimum_maximum(void)
bool empty(void) const
Returns true if number of rows and columns is zero.
void set_names(const Vector< std::string > &)
const Vector< Item > & get_items(void) const
Returns the vector of Item structures in the variables object.
bool is_used(const size_t &) const
void subtract_instance(const size_t &)
Vector< size_t > filter_data(const Vector< double > &, const Vector< double > &)
size_t count_used_instances_number(void) const
T calculate_sum(void) const
Returns the sum of the elements in the vector.
void set_row(const size_t &, const Vector< T > &)
void unscale_minimum_maximum(const Vector< Statistics< T > > &)
void convert_angular_variables_degrees(const Vector< size_t > &)
void scale_mean_standard_deviation(const Vector< Statistics< T > > &)
void set_variables_number(const size_t &)
size_t get_instances_number(void) const
Returns the number of instances in the data set.
size_t get_variables_number(void) const
Returns the total number of variables in the data set.
Vector< Vector< size_t > > arrange_missing_indices(void) const
Vector< size_t > arrange_generalization_indices(void) const
Returns the indices of the instances which will be used for generalization.
Vector< size_t > calculate_target_class_distribution(void) const
Vector< T > arrange_row(const size_t &) const
Instances instances
Instances object (training, generalization and testing instances).
void subtract_row(const size_t &)
void randomize_normal(const double &=0.0, const double &=1.0)
static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string &)
std::string name
Name of a variable.
Vector< size_t > arrange_inputs_indices(void) const
Returns the indices of the input variables.
const Instances & get_instances(void) const
Returns a constant reference to the instances object composing this data set object.
void save_data(void) const
Saves to the data file the values of the data matrix.