35 #include "missing_values.h"
36 #include "variables.h"
37 #include "instances.h"
41 #include "../tinyxml2/tinyxml2.h"
65 explicit DataSet(
const size_t&,
const size_t&);
69 explicit DataSet(
const size_t&,
const size_t&,
const size_t&);
73 explicit DataSet(
const tinyxml2::XMLDocument&);
77 explicit DataSet(
const std::string&);
144 bool empty(
void)
const;
176 void set(
const size_t&,
const size_t&);
177 void set(
const size_t&,
const size_t&,
const size_t&);
179 void set(
const tinyxml2::XMLDocument&);
180 void set(
const std::string&);
325 void print(
void)
const;
328 tinyxml2::XMLDocument*
to_XML(
void)
const;
329 void from_XML(
const tinyxml2::XMLDocument&);
331 void save(
const std::string&)
const;
332 void load(
const std::string&);
374 void trim(std::string&)
const;
378 std::string
prepend(
const std::string&,
const std::string&)
const;
void unscale_inputs_minimum_maximum(const Vector< Statistics< double > > &)
void from_XML(const tinyxml2::XMLDocument &)
void randomize_data_normal(const double &mean=0.0, const double &standard_deviation=1.0)
Separator
Enumeration of available separators for the data file.
void save(const std::string &) const
Variables * get_variables_pointer(void)
Returns a pointer to the variables object composing this data set object.
Vector< Vector< std::string > > set_from_data_file(void)
void unscale_data_minimum_maximum(const Vector< Statistics< double > > &)
bool has_data(void) const
Matrix< double > arrange_training_input_data(void) const
void set_data(const Matrix< double > &)
Matrix< double > calculate_linear_correlations(void) const
void scrub_missing_values_mean(void)
Substitutes all the missing values by the mean of the corresponding variable.
void add_instance(const Vector< double > &)
void set(void)
Sets zero instances and zero variables in the data set.
Vector< Statistics< double > > calculate_targets_statistics(void) const
const Separator & get_separator(void) const
Returns the separator to be used in the data file.
void unscale_data_mean_standard_deviation(const Vector< Statistics< double > > &)
void convert_angular_variable_degrees(const size_t &)
std::string write_separator(void) const
Returns the string which will be used as separator in the data file.
const Variables & get_variables(void) const
Returns a constant reference to the variables object composing this data set object.
Vector< size_t > angular_variables
Indices of angular variables.
void set_data_file_name(const std::string &)
void read_instance(const std::string &, const Vector< Vector< std::string > > &, const size_t &)
std::string data_file_name
Data file name.
size_t get_column_index(const Vector< Vector< std::string > > &, const size_t) const
Matrix< double > arrange_target_data(void) const
const std::string & get_missing_values_label(void) const
Returns the string which will be used as label for the missing values in the data file...
Vector< size_t > unuse_repeated_instances(void)
void unscale_inputs_mean_standard_deviation(const Vector< Statistics< double > > &)
bool operator==(const DataSet &) const
bool is_not_numeric(const Vector< std::string > &) const
AngularUnits angular_units
Units of angular variables.
void balance_data(const double &)
const Vector< size_t > & get_angular_variables(void) const
void append_variable(const Vector< double > &)
size_t count_data_file_columns_number(void) const
void initialize_data(const double &)
void set_missing_values_label(const std::string &)
Matrix< double > arrange_testing_target_data(void) const
Matrix< double > arrange_input_data(void) const
void print_data(void) const
Prints to the screen the values of the data matrix.
void print_summary(void) const
Prints to the screen in text format the main numbers from the data set object.
std::string get_separator_string(void) const
Returns the string which will be used as separator in the data file.
ScalingUnscalingMethod
Enumeration of available methods for scaling and unscaling the data.
void set_angular_variables(const Vector< size_t > &)
void check_separator(const std::string &) const
bool is_numeric(const std::string &) const
void print_data_preview(void) const
const MissingValues & get_missing_values(void) const
Returns a constant reference to the missing values object in the data set.
Vector< double > calculate_training_target_data_mean(void) const
Returns the mean values of the target variables on the training instances.
bool empty(void) const
Returns true if the data matrix is empty, and false otherwise.
void scrub_missing_values_unuse(void)
Sets all the instances with missing values to "Unused".
Vector< Statistics< double > > scale_inputs_mean_standard_deviation(void)
Vector< Statistics< double > > scale_targets(const std::string &)
Vector< Histogram< double > > calculate_data_histograms(const size_t &=10) const
MissingValues * get_missing_values_pointer(void)
Returns a pointer to the missing values object in the data set.
std::string prepend(const std::string &, const std::string &) const
void convert_angular_variables(void)
tinyxml2::XMLDocument * to_XML(void) const
Serializes the data set object into an XML document of the TinyXML library.
const bool & get_header_line(void) const
Returns true if the first line of the data file has a header with the names of the variables...
void print(void) const
Prints to the screen in text format the members of the data set object.
std::string to_string(void) const
Returns a string representation of the current data set object.
void set_display(const bool &)
Separator separator
Separator character.
void scale_data(const std::string &, const Vector< Statistics< double > > &)
const std::string & get_data_file_name(void) const
Returns the name of the data file.
Vector< std::string > arrange_autoassociation_names(const Vector< std::string > &) const
Vector< Statistics< double > > scale_data_mean_standard_deviation(void)
Vector< Statistics< double > > scale_inputs(const std::string &)
bool autoassociation
Autoassociation flag.
Vector< double > get_instance(const size_t &) const
void read_from_data_file(const Vector< Vector< std::string > > &)
Performs a second data file read in which the data is set.
void unscale_targets_mean_standard_deviation(const Vector< Statistics< double > > &)
DataSet & operator=(const DataSet &)
void set_header_line(const bool &)
Sets if the data file contains a header with the names of the variables.
void set_autoassociation(const bool &)
MissingValues missing_values
Missing values object.
void balance_target_class_distribution(void)
size_t count_tokens(std::string &) const
const AngularUnits & get_angular_units(void) const
Returns the units used for the angular variables (Radians or Degrees).
Instances * get_instances_pointer(void)
Returns a pointer to the instances object composing this data set object.
Matrix< double > arrange_generalization_data(void) const
void unscale_targets_minimum_maximum(const Vector< Statistics< double > > &)
Vector< std::string > get_tokens(const std::string &) const
size_t lags_number
Number of lags.
const bool & get_display(void) const
bool header_line
Header flag: true if the first line of the data file is a header with the names of the variables.
Vector< size_t > unuse_constant_variables(void)
const Matrix< double > & get_data(void) const
void load(const std::string &)
void convert_time_series(void)
Vector< Statistics< double > > calculate_data_statistics(void) const
void randomize_data_uniform(const double &minimum=-1.0, const double &maximum=1.0)
Vector< double > calculate_distances(void) const
Vector< double > calculate_testing_target_data_mean(void) const
Returns the mean values of the target variables on the testing instances.
Vector< Statistics< double > > scale_data_minimum_maximum(void)
Matrix< double > arrange_training_data(void) const
bool is_mixed(const Vector< std::string > &) const
void trim(std::string &) const
void convert_autoassociation(void)
Matrix< double > arrange_testing_data(void) const
void set_instances_number(const size_t &)
Matrix< double > calculate_data_statistics_matrix(void) const
void convert_angular_variables_radians(const Vector< size_t > &)
Variables variables
Variables object (inputs and target variables).
std::string missing_values_label
Missing values label.
const size_t & get_lags_number(void) const
Returns the number of lags to be used in a time series prediction application.
Vector< Statistics< double > > calculate_testing_instances_statistics(void) const
Vector< Statistics< double > > calculate_inputs_statistics(void) const
void check_header_line(void)
AngularUnits
Enumeration of the units used for angular variables.
std::string get_trimmed(const std::string &) const
void set_lags_number(const size_t &)
Vector< Statistics< double > > scale_targets_mean_standard_deviation(void)
Vector< double > get_variable(const size_t &) const
Matrix< double > arrange_testing_input_data(void) const
void subtract_variable(const size_t &)
Vector< Statistics< double > > scale_inputs_minimum_maximum(void)
Matrix< double > arrange_training_target_data(void) const
Vector< Statistics< double > > calculate_generalization_instances_statistics(void) const
bool display
Display messages to screen.
Vector< Statistics< double > > calculate_training_instances_statistics(void) const
void convert_angular_variable_radians(const size_t &)
Vector< std::string > read_header_line(void) const
Returns the names of the columns in the data set as a list of strings.
Matrix< double > get_generalization_target_data(void) const
Vector< double > calculate_generalization_target_data_mean(void) const
Returns the mean values of the target variables on the generalization instances.
void set_angular_units(AngularUnits &)
Sets the units of the angular variables (Radians or Degrees).
Matrix< double > get_generalization_input_data(void) const
Vector< std::string > arrange_time_series_names(const Vector< std::string > &) const
virtual ~DataSet(void)
Destructor.
void set_instance(const size_t &, const Vector< double > &)
void set_separator(const Separator &)
void load_data(void)
This method loads the data file.
const bool & get_autoassociation(void) const
void scrub_missing_values(void)
Vector< Statistics< double > > scale_targets_minimum_maximum(void)
void subtract_instance(const size_t &)
Vector< size_t > filter_data(const Vector< double > &, const Vector< double > &)
void convert_angular_variables_degrees(const Vector< size_t > &)
void set_variables_number(const size_t &)
Vector< size_t > calculate_target_class_distribution(void) const
Instances instances
Instances object (training, generalization and testing instances).
static ScalingUnscalingMethod get_scaling_unscaling_method(const std::string &)
const Instances & get_instances(void) const
Returns a constant reference to the instances object composing this data set object.
void save_data(void) const
Saves to the data file the values of the data matrix.