diff --git a/files_utils.c b/files_utils.c new file mode 100644 index 0000000000000000000000000000000000000000..9a8738f8f3439c74f711d7fa0f6cf59e4187c0e3 --- /dev/null +++ b/files_utils.c @@ -0,0 +1,46 @@ +// Project : K-means +// Author : Dario GENGA + +#include "files_utils.h" + + +FILE* open_file(char *file_path) { + // Open the file for reading; abort the program if it cannot be opened + FILE *fp = fopen(file_path, "r"); + if (fp == NULL) + { + perror("Error while opening the file"); + exit(EXIT_FAILURE); + } + return fp; +} + +int count_file_lines(char *file_path) { + FILE *fp = open_file(file_path); + int current_line = 0; + char *line = NULL; + size_t len = 0; + + while(getline(&line, &len, fp) != -1) { + // Count the lines that are not empty + if (strcmp(line, "\n") != 0) { + current_line++; + } + } + + fclose(fp); + free(line); + return current_line; +} + +/// Remove the trailing newline character of the string, if there is one. +/// \param str The string to modify in place; NULL and empty strings are left untouched. +/// \see https://siongui.github.io/2013/01/09/c-remove-string-trailing-newline-carriage-return/ +void remove_string_trailing_newline(char *str) { + if (str == NULL) + return; + + size_t length = strlen(str); + if (length > 0 && str[length-1] == '\n') + str[length-1] = '\0'; +} diff --git a/files_utils.h b/files_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..3d78c5704291e3216de1bcaec9d6f0b1cfc130e4 --- /dev/null +++ b/files_utils.h @@ -0,0 +1,26 @@ +// Project : K-means +// Author : Dario GENGA + + +#ifndef FILES_UTILS_H +#define FILES_UTILS_H + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/// Open a file. +/// \param file_path The path to the file. +/// \return The pointer of the FILE object; the program exits if the file cannot be opened. +FILE *open_file(char *file_path); + +/// Remove the trailing newline character of the string, if there is one. +/// \param str The string to modify in place; NULL and empty strings are left untouched. +void remove_string_trailing_newline(char *str); + +/// Count the non-empty lines in the file. +/// \param file_path The path to the file. +/// \return The number of non-empty lines in the file.
+int count_file_lines(char *file_path); + +#endif diff --git a/kmeans.c b/kmeans.c index 80f22599a333aa5b63ba7511e552a70be2cd6acc..fcf9aa76b66dffdb92347c7b7f9957b7e94a0371 100644 --- a/kmeans.c +++ b/kmeans.c @@ -3,16 +3,80 @@ #include "kmeans.h" +kmeans* kmeans_create_empty() { + kmeans* universe = malloc(sizeof *universe); + universe->points_array = NULL; + universe->nb_points = 0; + universe->k = 0; + universe->clusters_array = NULL; + + return universe; +} + +/// Allocate the clusters array of the universe and fill it with empty clusters. +/// \param universe The universe that owns the clusters; universe->k gives their number. +void initialize_clusters_array(kmeans* universe) { + universe->clusters_array = malloc(sizeof *universe->clusters_array * universe->k); + + for (int i = 0; i < universe->k; i++) { + universe->clusters_array[i] = cluster_create(NULL); + } +} + +/// Allocate the points array of the universe and set every entry to NULL. +/// \param universe The universe that owns the points; universe->nb_points gives their number. +void initialize_points_array(kmeans* universe) { + universe->points_array = malloc(sizeof *universe->points_array * universe->nb_points); + for (int i = 0; i < universe->nb_points; i++) { + universe->points_array[i] = NULL; + } +} + +/// Create a point from the values in a string. +/// \param line The string that contains the comma-separated values of the point. +/// \param dimensions The number of dimensions of the point. +/// \return The point created from the string, or NULL if the line contains no data.
+point* create_point_from_string(char *line, int dimensions) { + // Skip the parsing if the line doesn't contain data (its newline may already be stripped) + if (line[0] == '\0' || strcmp(line, "\n") == 0) { + return NULL; + } + + int i = 0; + point* p; + char *token; + const char separator[2] = ","; + double* data = malloc(sizeof(double) * dimensions); + + // Parse the line, never writing past the slots allocated for the point + token = strtok(line, separator); + while(token != NULL && i < dimensions) { + // Convert the string value to double, then save it + double value = atof(token); + data[i] = value; + + // Get the next value + token = strtok(NULL, separator); + i++; + } + // Verify the dimensions of the point: too few and too many values are both errors + if (i != dimensions || token != NULL) { + printf("Bad dimensions for the point.\n"); + exit(EXIT_FAILURE); + } + + // Create the point and return it + p = point_create(data); + return p; +} + kmeans* kmeans_create(int k, point** data, int nb_points) { kmeans* universe = malloc(sizeof(kmeans)); universe->points_array = data; universe->nb_points = nb_points; universe->k = k; - universe->clusters_array = malloc(sizeof(cluster) * k); - - for (int i = 0; i < k; i++) { - universe->clusters_array[i] = cluster_create(NULL); - } + universe->clusters_array = NULL; + initialize_clusters_array(universe); return universe; } @@ -35,8 +99,49 @@ void init_from_cmd_arguments(kmeans *universe) { } -void read_data_source(kmeans *universe, char* source_file) { +void read_data_source(kmeans* universe, char* source_file) { + char *line = NULL; + size_t len = 0; + ssize_t line_length; + int current_line = 0; + int dimensions = 0; + int point_index = 0; + + // Count the number of points + int nb_data = count_file_lines(source_file) - LINE_INDEX_CONTENT; // The first lines hold the dimensions and the number of clusters, not points + universe->nb_points = nb_data; + initialize_points_array(universe); + + // Open the file and read it, line by line + FILE *fp = open_file(source_file); + while((line_length = getline(&line, &len, fp)) != -1) { + // Remove newline char at end of line + remove_string_trailing_newline(line); + + // Get the number of dimensions for each point + if
(current_line == LINE_INDEX_DIMENSIONS) { + dimensions = (int)strtol(line, NULL, 10); + } + // Get the number of clusters + if (current_line == LINE_INDEX_CLUSTER) { + universe->k = (int)strtol(line, NULL, 10); + initialize_clusters_array(universe); + } + // Retrieve the data, without storing more points than were allocated + if (current_line >= LINE_INDEX_CONTENT && point_index < universe->nb_points) { + point* p = create_point_from_string(line, dimensions); + + if (p != NULL) { + universe->points_array[point_index] = p; + point_index++; + } + } + current_line++; + } + + fclose(fp); + free(line); } void write_data_output(kmeans *universe, char* output_file) { @@ -68,6 +173,10 @@ void destroy_point(point* p) { if (p->label != NULL) free(p->label); + if (p->value != NULL) { + free(p->value); + } + free(p); } } @@ -81,11 +190,15 @@ void destroy_cluster(cluster* clstr) { void destroy_universe(kmeans* kmeans) { if (kmeans != NULL) { - for (int i = 0; i < kmeans->nb_points; i++) { - destroy_point(kmeans->points_array[i]); + if (kmeans->points_array != NULL) { + for (int i = 0; i < kmeans->nb_points; i++) { + destroy_point(kmeans->points_array[i]); + } } - for (int i = 0; i < kmeans->k; i++) { - destroy_cluster(kmeans->clusters_array[i]); + if (kmeans->clusters_array != NULL) { + for (int i = 0; i < kmeans->k; i++) { + destroy_cluster(kmeans->clusters_array[i]); + } } free(kmeans->points_array); diff --git a/kmeans.h b/kmeans.h index 498de3dbef5e402a6646a6f7284100b1da6d9c3e..f41549582190bc3946125913c78162b3f93faffc 100644 --- a/kmeans.h +++ b/kmeans.h @@ -7,7 +7,13 @@ #include <stdio.h> #include <stdlib.h> #include <stdbool.h> +#include <string.h> #include <math.h> +#include "files_utils.h" + +#define LINE_INDEX_DIMENSIONS 0 +#define LINE_INDEX_CLUSTER 1 +#define LINE_INDEX_CONTENT 2 /// A group who contains points. typedef struct _cluster { @@ -41,6 +47,10 @@ typedef struct _kmeans { int nb_points; } kmeans; +/// Create an empty kmeans universe. +/// \return The kmeans object of the universe.
+kmeans* kmeans_create_empty(void); + /// Create the kmeans universe. /// \param k The number of clusters in the universe. /// \param data The array of points diff --git a/main.c b/main.c index c586a14e27a829bcb2550d6d20e522d036ed3ecc..e66bf4624231c5f4965d2f46e8729ce3b4d667f7 100644 --- a/main.c +++ b/main.c @@ -6,18 +6,10 @@ #include "kmeans.h" int main() { - // The code below verify the create and destroy methods. - double d = 5.5; - double b = 3.3; - point* gravity = point_create(&b); - point* p = point_create(&d); - cluster* c = cluster_create(gravity); + char* path = "./source_data.txt"; + kmeans *universe = kmeans_create_empty(); + read_data_source(universe, path); - point** points = malloc(sizeof(points) * 1); - points[0] = p; - kmeans* universe = kmeans_create(3, points, 1); - - destroy_cluster(c); destroy_universe(universe); return EXIT_SUCCESS; } diff --git a/makefile b/makefile index afe312ca80c25a4bdf2c387656b92d572ff7ec28..47c8687d6d885c75e17305bbd61a8977d23f226d 100644 --- a/makefile +++ b/makefile @@ -1,9 +1,11 @@ LIB=-lm CC=gcc -Wall -Wextra -g -main: kmeans.o main.o +main: files_utils.o kmeans.o main.o $(CC) $^ -fsanitize=address -fsanitize=leak -o $@ $(LIB) +files_utils.o: files_utils.c files_utils.h + $(CC) -c $< $(LIB) kmeans.o: kmeans.c kmeans.h $(CC) -c $< $(LIB) main.o: main.c diff --git a/source_data.txt b/source_data.txt new file mode 100644 index 0000000000000000000000000000000000000000..782668ae3766b02163601bbabe5890ab35bde1d4 --- /dev/null +++ b/source_data.txt @@ -0,0 +1,7 @@ +2 +3 +1,24 +2.3,33.65 +3,4 +5,34 +-1,5