From 0078dbfb57d3ce25a322ce588f76aa0d61536952 Mon Sep 17 00:00:00 2001
From: "dario.genga" <dario.genga@etu.hesge.ch>
Date: Tue, 14 Jun 2022 15:59:14 +0200
Subject: [PATCH] Add clustering

---
 kmeans.c        | 168 ++++++++++++++++++++++++++++++++++++++++++++----
 kmeans.h        |  13 ++--
 main.c          |  18 +-----
 output_data.txt |  21 ++++--
 source_data.txt |  21 ++++--
 5 files changed, 200 insertions(+), 41 deletions(-)

diff --git a/kmeans.c b/kmeans.c
index 3d2547f..0702fb9 100644
--- a/kmeans.c
+++ b/kmeans.c
@@ -3,6 +3,17 @@
 
 #include "kmeans.h"
 
+int random_with_min_man_value(int min, int max) { // pseudo-random int in [min, max], both bounds inclusive
+    return (max < min) ? min : (rand() % (max - min + 1)) + min; // empty range yields min instead of a modulo by zero
+}
+
+void swap(int *x, int *y)
+{
+    const int old_x = *x, old_y = *y; // read both values before writing either (also fine when x == y)
+    *x = old_y;
+    *y = old_x;
+}
+
 kmeans* kmeans_create_empty() {
     kmeans* universe = malloc(sizeof(kmeans));
     universe->points_array = NULL;
@@ -46,14 +57,14 @@ point* create_point_from_string(char *line, int dimensions) {
     int i = 0;
     point* p;
     char *token;
-    const char separator[2] = ",";
-    double* data = malloc(sizeof(double) * dimensions);
+    const char separator[2] = SEPARATOR;
+    float* data = malloc(sizeof(float) * dimensions);
 
     // Parse the line
     token = strtok(line, separator);
     while(token != NULL) {
-        // Convert the string value to double, then save it
-        double value = atof(token);
+        // Convert the string value to float, then save it
+        float value = atof(token);
         data[i] = value;
 
         // Get the next value
@@ -67,7 +78,7 @@ point* create_point_from_string(char *line, int dimensions) {
     }
 
     // Create the point and return it
-    p = point_create(data);
+    p = point_create(data, dimensions);
     return p;
 }
 
@@ -82,11 +93,12 @@ kmeans* kmeans_create(int k, point** data, int nb_points) {
     return universe;
 }
 
-point* point_create(double* value) {
+point* point_create(float* value, int dimensions) { // allocates a point that keeps `value` by pointer (no copy); the malloc result is not checked
     point* p = malloc(sizeof(point));
     p->value = value;
     p->cluster = NULL;
     p->label = NULL;
+    p->dimensions = dimensions; // stored as supplied by the caller; not validated against `value` here
     return p;
 }
 
@@ -97,7 +109,7 @@ cluster* cluster_create(point* centroid) {
 }
 
 void init_from_cmd_arguments(kmeans *universe) {
-
+    (void)universe; // placeholder body: marks the parameter as intentionally unused
 }
 
 void read_data_source(kmeans* universe, char* source_file) {
@@ -181,24 +193,158 @@ void write_data_output(kmeans *universe, char* output_file) {
     fclose(file);
 }
 
+// Pick a not-yet-used point at random (one Fisher-Yates step) and return a new point holding a copy of its coordinates.
+point* choose_random_point_as_centroid(kmeans *universe, int* points_index_possible, int max_index) {
+    // Draw a slot among the still-available ones, then map it to the real point index
+    int random_slot = random_with_min_man_value(0, max_index);
+    int point_index = points_index_possible[random_slot];
+    float* value = malloc(sizeof(float) * universe->dimensions);
+
+    for (int d = 0; d < universe->dimensions; d++) {
+        value[d] = universe->points_array[point_index]->value[d];
+    }
+
+    // Move the used slot to the end of the available range,
+    // so when the caller decrements max_index the same point can't be chosen again
+    swap(&points_index_possible[random_slot], &points_index_possible[max_index]);
+    return point_create(value, universe->dimensions);
+}
+
 void init_clusters(kmeans *universe) {
+    // Pool of point indexes that can still be picked as a centroid (consumed from the end)
+    int* random_index_possible = malloc(sizeof(int) * universe->nb_points);
+    for (int i = 0; i < universe->nb_points; i++) {
+        random_index_possible[i] = i;
+    }
+    int max_index = universe->nb_points - 1; // last available slot; -1 when there are no points
 
+    // Choose a random centroid for each cluster; stop once every point has been used
+    for (int i = 0; i < universe->k && max_index >= 0; i++) {
+        universe->clusters_array[i]->centroid = choose_random_point_as_centroid(universe, random_index_possible, max_index);
+        max_index--;
+    }
+
+    free(random_index_possible);
 }
 
-double compute_distance(point* p1, point p2) {
-    return 0;
+float compute_euclidean_distance(point* p1, point* p2) {
+    float squared_sum = 0; // running sum of the squared per-axis differences
+    const int axes = p1->dimensions; // only p1's dimension count drives the loop
+
+    for (int axis = 0; axis < axes; axis++) {
+        squared_sum += pow(p1->value[axis] - p2->value[axis], 2);
+    }
+    float distance = sqrt(squared_sum);
+    return distance;
 }
 
-void compute_center_of_gravity(cluster* clstr, kmeans* universe) {
+float compute_manhattan_distance(point* p1, point* p2) {
+    float total = 0; // sum of the absolute per-axis differences
 
+    for (int axis = 0; axis < p1->dimensions; axis++) {
+        total += fabs(p1->value[axis] - p2->value[axis]);
+    }
+    return total;
 }
 
-void assign_points_to_cluster(point* p, kmeans* universe) {
+// Chebyshev distance: the largest absolute per-axis difference between p1 and p2.
+float compute_chebyshev_distance(point* p1, point* p2) {
+    float result = 0;
+    for (int i = 0; i < p1->dimensions; i++) {
+        float abs_diff = fabs(p1->value[i] - p2->value[i]); // kept as a float so fractional differences are not truncated
+        if (abs_diff > result) {
+            result = abs_diff;
+        }
+    }
+    return result;
+}
+
+float compute_distance(point* p1, point* p2) {
+    // Guard: a distance only makes sense between points of equal dimensionality
+    if (p1->dimensions != p2->dimensions) {
+        printf("The points don't have the same dimensions!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Euclidean is the metric in use; compute_manhattan_distance and
+    // compute_chebyshev_distance take the same arguments and can be swapped in
+    return compute_euclidean_distance(p1, p2);
+}
+
+// Move the centroid of clstr to the mean position of the points assigned to it.
+// Returns true if at least one centroid coordinate changed, false otherwise
+// (including when no point belongs to the cluster, in which case the centroid is left as is).
+bool compute_center_of_gravity(cluster* clstr, kmeans* universe) {
+    bool new_position = false;
+    int nb_points_in_cluster = 0;
+
+    // Per-dimension accumulator, zero-initialised by calloc
+    float* dimensions_average = calloc(universe->dimensions, sizeof(float));
+
+    // Sum the coordinates of every point that belongs to the cluster
+    for (int i = 0; i < universe->nb_points; i++) {
+        if (universe->points_array[i]->cluster == clstr) {
+            // Count the point once (not once per dimension) so the division below yields the mean
+            nb_points_in_cluster += 1;
+            for (int d = 0; d < universe->dimensions; d++) {
+                dimensions_average[d] += universe->points_array[i]->value[d];
+            }
+        }
+    }
+
+    // Turn the sums into averages and move the centroid where a coordinate differs
+    if (nb_points_in_cluster > 0) {
+        for (int i = 0; i < universe->dimensions; i++) {
+            dimensions_average[i] = dimensions_average[i] / nb_points_in_cluster;
+            if (clstr->centroid->value[i] != dimensions_average[i]) {
+                clstr->centroid->value[i] = dimensions_average[i];
+                new_position = true;
+            }
+        }
+    }
+
+    free(dimensions_average);
+    return new_position;
+}
 
+void assign_points_to_cluster(point* p, kmeans* universe) { // attaches p to the cluster whose centroid is nearest
+    float smallest_distance = 0; // only meaningful once clst has been set
+    cluster* clst = NULL; // stays NULL when the universe has no cluster
+    for (int i = 0; i < universe->k; i++) {
+         float distance = compute_distance(p, universe->clusters_array[i]->centroid);
+         if (clst == NULL || distance < smallest_distance) { // the first cluster seeds the minimum, so no arbitrary upper bound is needed
+             smallest_distance = distance;
+             clst = universe->clusters_array[i];
+         }
+    }
+    p->cluster = clst;
 }
 
 void start_clustering(kmeans* universe) {
+    bool centroid_moved = true; // true so that the first pass always runs
+    init_clusters(universe);
 
+    // Alternate assignment and centroid update until a full pass moves no centroid
+    while (centroid_moved) {
+        centroid_moved = false;
+
+        // Attach every point to its nearest centroid
+        for (int p = 0; p < universe->nb_points; p++) {
+            assign_points_to_cluster(universe->points_array[p], universe);
+        }
+
+        // Recompute each centroid and print where it ended up
+        printf("Clusters positions...\n");
+        for (int c = 0; c < universe->k; c++) {
+            cluster* current = universe->clusters_array[c];
+            centroid_moved = compute_center_of_gravity(current, universe) || centroid_moved;
+            printf("Cluster %d position : ", c);
+            for (int d = 0; d < universe->dimensions; d++) {
+                printf("%0.2f, ", current->centroid->value[d]);
+            }
+            printf("\n");
+        }
+    }
 }
 
 void destroy_point(point* p) {
diff --git a/kmeans.h b/kmeans.h
index 69b86f9..9d6022c 100644
--- a/kmeans.h
+++ b/kmeans.h
@@ -15,7 +15,7 @@
 #define LINE_INDEX_CLUSTER 1
 #define LINE_INDEX_CONTENT 2
 #define CLUSTER_SYMBOL "*"
-#define SEPARATOR ";"
+#define SEPARATOR ","
 
 /// A group who contains points.
 typedef struct _cluster {
@@ -28,7 +28,9 @@ typedef struct _cluster {
 /// A point in the universe, which represents a data.
 typedef struct _point {
     /// The coordinates of the point.
-    double* value;
+    float* value;
+    /// The number of dimensions of the point.
+    int dimensions;
     /// The cluster who contains the point.
     struct _cluster* cluster;
     /// The color to use to draw the point.
@@ -64,8 +66,9 @@ kmeans* kmeans_create(int k, point** data, int nb_points);
 
 /// Create a point.
 /// \param value The coordinates of the point.
+/// \param dimensions The number of dimensions of the point.
 /// \return The point objet initialized with its coordinates
-point* point_create(double* value);
+point* point_create(float* value, int dimensions);
 
 /// Create a cluster.
 /// \param centroid The point representing the center of gravity of the cluster.
@@ -95,12 +98,12 @@ void init_clusters(kmeans *universe);
 /// \param p1 The first point.
 /// \param p2 The second point.
 /// \return The distance between the two points.
-double compute_distance(point* p1, point p2);
+float compute_distance(point* p1, point* p2);
 
 /// Calculate the position of the center of gravity of the cluster.
 /// \param clstr The cluster.
 /// \param universe The universe who contains the points of the cluster.
-void compute_center_of_gravity(cluster* clstr, kmeans* universe);
+bool compute_center_of_gravity(cluster* clstr, kmeans* universe);
 
 /// Assign the point to the most coherent cluster.
 /// \param p The point to be assigned.
diff --git a/main.c b/main.c
index eacdf12..30334a0 100644
--- a/main.c
+++ b/main.c
@@ -3,31 +3,19 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <time.h>
 #include "kmeans.h"
 
 int main() {
+    srand(time(NULL)); // seed rand() from the current time before any random centroid is drawn
     char* path = "./source_data.txt";
     char* output = "./output_data.txt";
     kmeans *universe = kmeans_create_empty();
     read_data_source(universe, path);
 
-    // Custom clustering for testing
-    for (int i = 0; i < universe->nb_points; i++) {
-        int c = 0;
-        if (i % 2 == 0) {
-            c = 1;
-        }
-        else if (i % 3 == 0) {
-            c = 2;
-        }
-        else {
-            c = 0;
-        }
-        universe->points_array[i]->cluster = universe->clusters_array[c];
-    }
+    start_clustering(universe); // run the k-means loop on the loaded points before the result is written
 
     write_data_output(universe, output);
-
     destroy_universe(universe);
     return EXIT_SUCCESS;
 }
diff --git a/output_data.txt b/output_data.txt
index 1c8a7f5..174ff2c 100644
--- a/output_data.txt
+++ b/output_data.txt
@@ -1,10 +1,21 @@
 2
 3
 *
-2.30;33.65
+-1.00,-5.00
+-2.00,-4.00
+-3.00,-3.00
+-4.00,-2.00
+-5.00,-1.00
+-1.75,-2.25
 *
-1.00;24.00
-3.00;4.00
--1.00;5.00
+0.00,0.00
+3.10,-4.90
 *
-5.00;34.00
+1.00,1.00
+2.00,2.00
+3.00,3.00
+4.00,4.00
+5.00,5.00
+-2.25,4.75
+2.20,4.40
+4.00,2.00
diff --git a/source_data.txt b/source_data.txt
index 782668a..24bcf0a 100644
--- a/source_data.txt
+++ b/source_data.txt
@@ -1,7 +1,18 @@
 2
 3
-1,24
-2.3,33.65
-3,4
-5,34
--1,5
+0,0
+1,1
+2,2
+3,3
+4,4
+5,5
+-1,-5
+-2,-4
+-3,-3
+-4,-2
+-5,-1
+-2.25,4.75
+3.1,-4.9
+2.2,4.4
+-1.75,-2.25
+4,2
-- 
GitLab