From c4ba647889fdb4ea218115fc5d6a3b5903b51018 Mon Sep 17 00:00:00 2001
From: "dario.genga" <dario.genga@etu.hesge.ch>
Date: Tue, 31 May 2022 16:31:42 +0200
Subject: [PATCH] Add import of data from file

The universe can now be created from data in a text file.

Added files_utils, which contains functions to open a file, count its
non-empty lines and remove a trailing newline.

Added functions to create an empty kmeans universe and to initialize
the empty clusters array and points array.

Added a function that creates a point from a line of the text file.

Fixed a memory leak caused by the point value not being freed.
---
 files_utils.c   |  46 +++++++++++++++++
 files_utils.h   |  22 ++++++++
 kmeans.c        | 133 ++++++++++++++++++++++++++++++++++++++++++++----
 kmeans.h        |  10 ++++
 main.c          |  14 ++---
 makefile        |   4 +-
 source_data.txt |   7 +++
 7 files changed, 214 insertions(+), 22 deletions(-)
 create mode 100644 files_utils.c
 create mode 100644 files_utils.h
 create mode 100644 source_data.txt

diff --git a/files_utils.c b/files_utils.c
new file mode 100644
index 0000000..9a8738f
--- /dev/null
+++ b/files_utils.c
@@ -0,0 +1,46 @@
+// Project : K-means
+// Author : Dario GENGA
+
+#include "files_utils.h"
+
+
+FILE* open_file(char *file_path) {
+    // Read-only open; on failure print the errno reason and stop the program
+    FILE *fp = fopen(file_path, "r");
+    if (fp == NULL)
+    {
+        perror("Error while opening the file");
+        exit(EXIT_FAILURE);
+    }
+    return fp;
+}
+
+int count_file_lines(char *file_path) {
+    char *buffer = NULL;
+    size_t capacity = 0;
+    int non_empty = 0;
+    FILE *stream = open_file(file_path);
+
+    for (;;) {
+        if (getline(&buffer, &capacity, stream) == -1)
+            break;
+        // A line made only of "\n" is blank and is not counted
+        non_empty += (strcmp(buffer, "\n") != 0);
+    }
+
+    free(buffer);
+    fclose(stream);
+    return non_empty;
+}
+
+/// Remove the newline character at the end of the string, if there is one.
+/// \param str The string to modify in place. NULL and empty strings are left untouched.
+/// \see https://siongui.github.io/2013/01/09/c-remove-string-trailing-newline-carriage-return/
+void remove_string_trailing_newline(char *str) {
+    if (str == NULL)
+        return;
+    // size_t keeps strlen's full result; the length check stops "" from indexing str[-1]
+    size_t length = strlen(str);
+    if (length > 0 && str[length-1] == '\n')
+        str[length-1]  = '\0';
+}
diff --git a/files_utils.h b/files_utils.h
new file mode 100644
index 0000000..3d78c57
--- /dev/null
+++ b/files_utils.h
@@ -0,0 +1,22 @@
+// Project : K-means
+// Author : Dario GENGA
+
+#ifndef FILES_UTILS_H
+#define FILES_UTILS_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/// Open a file for reading.
+/// \param file_path The path to the file.
+/// \return The pointer of the FILE object.
+FILE *open_file(char *file_path);
+
+/// Count the number of non-empty lines in the file.
+/// \param file_path The path to the file.
+/// \return The number of non-empty lines in the file.
+int count_file_lines(char *file_path);
+
+/// Remove the newline character at the end of the string, if there is one.
+void remove_string_trailing_newline(char *str);
+#endif
diff --git a/kmeans.c b/kmeans.c
index 80f2259..fcf9aa7 100644
--- a/kmeans.c
+++ b/kmeans.c
@@ -3,16 +3,80 @@
 
 #include "kmeans.h"
 
+kmeans* kmeans_create_empty(void) {
+    kmeans* universe = malloc(sizeof *universe);
+    if (universe == NULL) { // stop here rather than write through a NULL pointer below
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+    *universe = (kmeans){ .points_array = NULL, .nb_points = 0, .k = 0, .clusters_array = NULL };
+    return universe;
+}
+
+/// Allocate the array of cluster pointers of the universe and create one cluster per entry.
+/// \param universe The universe that contains the clusters to create; k must already be set.
+void initialize_clusters_array(kmeans* universe) {
+    // The array stores pointers, so size it by element (cluster*), not by sizeof(cluster)
+    universe->clusters_array = malloc(sizeof *universe->clusters_array * universe->k);
+    for (int i = 0; i < universe->k; i++) {
+        universe->clusters_array[i] = cluster_create(NULL);
+    }
+}
+
+/// Allocate the array of point pointers of the universe (one point* per entry) and set every entry to NULL.
+/// \param universe The universe that contains the points; nb_points must already be set.
+void initialize_points_array(kmeans* universe) {
+    universe->points_array = malloc(sizeof *universe->points_array * universe->nb_points);
+    for (int i = 0; i < universe->nb_points; i++) {
+        universe->points_array[i] = NULL;
+    }
+}
+
+/// Create a point from the values in a string.
+/// \param line The string that contains the comma-separated data of the point; strtok modifies it.
+/// \param dimensions The number of dimensions in the point.
+/// \return The point created from the string, or NULL when the line holds no data.
+point* create_point_from_string(char *line, int dimensions) {
+    // Skip the parsing if the line doesn't contain data (its newline may already be stripped)
+    if (line[0] == '\0' || strcmp(line, "\n") == 0) {
+        return NULL;
+    }
+
+    int i = 0;
+    const char separator[2] = ",";
+    double* data = malloc(sizeof *data * dimensions);
+    if (data == NULL && dimensions > 0) {
+        perror("malloc");
+        exit(EXIT_FAILURE);
+    }
+
+    // Parse the line
+    for (char *token = strtok(line, separator); token != NULL; token = strtok(NULL, separator)) {
+        // Store the value only while there is room: extra values must not overflow data
+        if (i < dimensions) {
+            data[i] = atof(token);
+        }
+        i++;
+    }
+
+    // Verify the dimensions of the point (the program stops on a mismatch)
+    if (i != dimensions) {
+        printf("Bad dimensions for the point.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Create the point and return it
+    point* p = point_create(data);
+    return p;
+}
+
 kmeans* kmeans_create(int k, point** data, int nb_points) {
     kmeans* universe = malloc(sizeof(kmeans));
     universe->points_array = data;
     universe->nb_points = nb_points;
     universe->k = k;
-    universe->clusters_array = malloc(sizeof(cluster) * k);
-
-    for (int i = 0; i < k; i++) {
-        universe->clusters_array[i] = cluster_create(NULL);
-    }
+    universe->clusters_array = NULL;
+    initialize_clusters_array(universe);
 
     return universe;
 }
@@ -35,8 +99,49 @@ void init_from_cmd_arguments(kmeans *universe) {
 
 }
 
-void read_data_source(kmeans *universe, char* source_file) {
+void read_data_source(kmeans* universe, char* source_file) {
+    char *line = NULL;
+    size_t len = 0;
+    int current_line = 0;
+    int dimensions = 0;
+    int point_index = 0;
+
+    // Count the points: header lines are not data, and a too-short file gives zero, not a negative count
+    int nb_data = count_file_lines(source_file) - LINE_INDEX_CONTENT;
+    universe->nb_points = nb_data > 0 ? nb_data : 0;
+    initialize_points_array(universe);
+
+    // Open the file and read it, line by line
+    FILE *fp = open_file(source_file);
+    while (getline(&line, &len, fp) != -1) {
+        // Remove newline char at end of line
+        remove_string_trailing_newline(line);
+
+        if (current_line == LINE_INDEX_DIMENSIONS) {
+            // Get the number of dimensions for each data
+            dimensions = (int)strtol(line, NULL, 10);
+        } else if (current_line == LINE_INDEX_CLUSTER) {
+            // Get the number of cluster
+            universe->k = (int)strtol(line, NULL, 10);
+            initialize_clusters_array(universe);
+        } else if (current_line >= LINE_INDEX_CONTENT && line[0] != '\0') {
+            // Retrieve the data; empty lines are skipped because they were not counted either
+            point* p = create_point_from_string(line, dimensions);
+            if (p != NULL && point_index >= universe->nb_points) {
+                printf("Too many points in the file.\n");
+                exit(EXIT_FAILURE);
+            }
+            if (p != NULL) {
+                universe->points_array[point_index] = p;
+                point_index++;
+            }
+        }
 
+        current_line++;
+    }
+
+    fclose(fp);
+    free(line);
 }
 
 void write_data_output(kmeans *universe, char* output_file) {
@@ -68,6 +173,10 @@ void destroy_point(point* p) {
         if (p->label != NULL)
             free(p->label);
 
+        if (p->value != NULL) {
+            free(p->value);
+        }
+
         free(p);
     }
 }
@@ -81,11 +190,15 @@ void destroy_cluster(cluster* clstr) {
 
 void destroy_universe(kmeans* kmeans) {
     if (kmeans != NULL) {
-        for (int i = 0; i < kmeans->nb_points; i++) {
-            destroy_point(kmeans->points_array[i]);
+        if (kmeans->points_array != NULL) {
+            for (int i = 0; i < kmeans->nb_points; i++) {
+                destroy_point(kmeans->points_array[i]);
+            }
         }
-        for (int i = 0; i < kmeans->k; i++) {
-            destroy_cluster(kmeans->clusters_array[i]);
+        if (kmeans->clusters_array != NULL) {
+            for (int i = 0; i < kmeans->k; i++) {
+                destroy_cluster(kmeans->clusters_array[i]);
+            }
         }
 
         free(kmeans->points_array);
diff --git a/kmeans.h b/kmeans.h
index 498de3d..f415495 100644
--- a/kmeans.h
+++ b/kmeans.h
@@ -7,7 +7,13 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdbool.h>
+#include <string.h>
 #include <math.h>
+#include "files_utils.h"
+
+#define LINE_INDEX_DIMENSIONS 0
+#define LINE_INDEX_CLUSTER 1
+#define LINE_INDEX_CONTENT 2
 
 /// A group who contains points.
 typedef struct _cluster {
@@ -41,6 +47,10 @@ typedef struct _kmeans {
     int nb_points;
 } kmeans;
 
+/// Create an empty kmeans universe (no points and no clusters yet).
+/// \return The kmeans object of the universe.
+kmeans* kmeans_create_empty(void);
+
 /// Create the kmeans universe.
 /// \param k The number of clusters in the universe.
 /// \param data The array of points
diff --git a/main.c b/main.c
index c586a14..e66bf46 100644
--- a/main.c
+++ b/main.c
@@ -6,18 +6,10 @@
 #include "kmeans.h"
 
 int main() {
-    // The code below verify the create and destroy methods.
-    double d = 5.5;
-    double b = 3.3;
-    point* gravity = point_create(&b);
-    point* p = point_create(&d);
-    cluster* c = cluster_create(gravity);
+    char* path = "./source_data.txt";
+    kmeans *universe = kmeans_create_empty();
+    read_data_source(universe, path);
 
-    point** points = malloc(sizeof(points) * 1);
-    points[0] = p;
-    kmeans* universe = kmeans_create(3, points, 1);
-
-    destroy_cluster(c);
     destroy_universe(universe);
     return EXIT_SUCCESS;
 }
diff --git a/makefile b/makefile
index afe312c..47c8687 100644
--- a/makefile
+++ b/makefile
@@ -1,9 +1,11 @@
 LIB=-lm
 CC=gcc -Wall -Wextra -g
 
-main: kmeans.o main.o
+main: files_utils.o kmeans.o main.o
 	$(CC) $^ -fsanitize=address -fsanitize=leak -o $@ $(LIB)
 
+files_utils.o: files_utils.c files_utils.h
+	$(CC) -c $< $(LIB)
 kmeans.o: kmeans.c kmeans.h
 	$(CC) -c $< $(LIB)
 main.o: main.c
diff --git a/source_data.txt b/source_data.txt
new file mode 100644
index 0000000..782668a
--- /dev/null
+++ b/source_data.txt
@@ -0,0 +1,7 @@
+2
+3
+1,24
+2.3,33.65
+3,4
+5,34
+-1,5
-- 
GitLab