From 0078dbfb57d3ce25a322ce588f76aa0d61536952 Mon Sep 17 00:00:00 2001 From: "dario.genga" <dario.genga@etu.hesge.ch> Date: Tue, 14 Jun 2022 15:59:14 +0200 Subject: [PATCH] Add clustering --- kmeans.c | 168 ++++++++++++++++++++++++++++++++++++++++++++---- kmeans.h | 13 ++-- main.c | 18 +----- output_data.txt | 21 ++++-- source_data.txt | 21 ++++-- 5 files changed, 200 insertions(+), 41 deletions(-) diff --git a/kmeans.c b/kmeans.c index 3d2547f..0702fb9 100644 --- a/kmeans.c +++ b/kmeans.c @@ -3,6 +3,17 @@ #include "kmeans.h" +int random_with_min_man_value(int min, int max) { + return (rand() % (max - min + 1)) + min; +} + +void swap(int *x, int *y) +{ + int tmp = *x; + *x = *y; + *y = tmp; +} + kmeans* kmeans_create_empty() { kmeans* universe = malloc(sizeof(kmeans)); universe->points_array = NULL; @@ -46,14 +57,14 @@ point* create_point_from_string(char *line, int dimensions) { int i = 0; point* p; char *token; - const char separator[2] = ","; - double* data = malloc(sizeof(double) * dimensions); + const char separator[2] = SEPARATOR; + float* data = malloc(sizeof(float) * dimensions); // Parse the line token = strtok(line, separator); while(token != NULL) { - // Convert the string value to double, then save it - double value = atof(token); + // Convert the string value to float, then save it + float value = atof(token); data[i] = value; // Get the next value @@ -67,7 +78,7 @@ point* create_point_from_string(char *line, int dimensions) { } // Create the point and return it - p = point_create(data); + p = point_create(data, dimensions); return p; } @@ -82,11 +93,12 @@ kmeans* kmeans_create(int k, point** data, int nb_points) { return universe; } -point* point_create(double* value) { +point* point_create(float* value, int dimensions) { point* p = malloc(sizeof(point)); p->value = value; p->cluster = NULL; p->label = NULL; + p->dimensions = dimensions; return p; } @@ -97,7 +109,7 @@ cluster* cluster_create(point* centroid) { } void init_from_cmd_arguments(kmeans *universe) { - + universe = universe; } void read_data_source(kmeans* universe, char* source_file) { @@ -181,24 +193,158 @@ void write_data_output(kmeans *universe, char* output_file) { fclose(file); } +// Choose a random point by using the Fisher-Yates algorithm +point* choose_random_point_as_centroid(kmeans *universe, int* points_index_possible, int max_index) { + int random_index = random_with_min_man_value(0, max_index); + float* value = malloc(sizeof(float) * universe->dimensions); + + for (int d = 0; d < universe->dimensions; d++) { + value[d] = universe->points_array[random_index]->value[d]; + } + + // Swap the selected point index with the last one, + // so when max_index is decremented we can't choose again the same point + swap(&points_index_possible[random_index], &points_index_possible[max_index]); + + point *centroid = point_create(value, universe->dimensions); + return centroid; +} + void init_clusters(kmeans *universe) { + // Create the index of each possible points that can be a centroid + int* random_index_possible = malloc(sizeof(int) * universe->nb_points); + for (int i = 0; i < universe->nb_points; i++) { + random_index_possible[i] = i; + } + int max_index = random_index_possible[universe->nb_points - 1]; + // Choose a random centroid for each cluster + for (int i = 0; i < universe->k; i++) { + universe->clusters_array[i]->centroid = choose_random_point_as_centroid(universe, random_index_possible, max_index); + max_index--; + } + + free(random_index_possible); } -double compute_distance(point* p1, point p2) { - return 0; +float compute_euclidean_distance(point* p1, point* p2) { + float sum = 0; + float result = 0; + + for (int i = 0; i < p1->dimensions; i++) { + sum += pow(p1->value[i] - p2->value[i], 2); + } + result = sqrt(sum); + return result; } -void compute_center_of_gravity(cluster* clstr, kmeans* universe) { +float compute_manhattan_distance(point* p1, point* p2) { + float result = 0; + for (int i = 0; i < p1->dimensions; i++) { + result += fabs(p1->value[i] - p2->value[i]); + } + return result; } -void assign_points_to_cluster(point* p, kmeans* universe) { +float compute_chebyshev_distance(point* p1, point* p2) { + float result = 0; + + for (int i = 0; i < p1->dimensions; i++) { + int abs_diff = fabs(p1->value[i] - p2->value[i]); + if (abs_diff > result) { + result = abs_diff; + } + } + return result; +} + +float compute_distance(point* p1, point* p2) { + if (p1->dimensions != p2->dimensions) { + printf("The points don't have the same dimensions!\n"); + exit(EXIT_FAILURE); + } + float euclidean = compute_euclidean_distance(p1, p2); + //float manhattan = compute_manhattan_distance(p1, p2); + //float chebyshev = compute_chebyshev_distance(p1, p2); + + return euclidean; +} + +bool compute_center_of_gravity(cluster* clstr, kmeans* universe) { + bool new_position = false; + int nb_points_in_cluster = 0; + + // Create an array to determine the center of gravity + float* dimensions_average = malloc(sizeof(float) * universe->dimensions); + for (int i = 0; i < universe->dimensions; i++) { + dimensions_average[i] = 0; + } + + // Parse each point in the cluster + for (int i = 0; i < universe->nb_points; i++) { + if (universe->points_array[i]->cluster == clstr) { + // Compute their position + for (int d = 0; d < universe->dimensions; d++) { + dimensions_average[d] += universe->points_array[i]->value[d]; + nb_points_in_cluster += 1; + } + } + } + + // Compute the center of gravity with the average position of each points in the cluster + if (nb_points_in_cluster > 0) { + for (int i = 0; i < universe->dimensions; i++) { + dimensions_average[i] = dimensions_average[i] / nb_points_in_cluster; + if (clstr->centroid->value[i] != dimensions_average[i]) { + clstr->centroid->value[i] = dimensions_average[i]; + new_position = true; + } + } + } + + free(dimensions_average); + return new_position; +} +void assign_points_to_cluster(point* p, kmeans* universe) { + float smallest_distance = 999; + cluster* clst; + for (int i = 0; i < universe->k; i++) { + float distance = compute_distance(p, universe->clusters_array[i]->centroid); + if (distance < smallest_distance) { + smallest_distance = distance; + clst = universe->clusters_array[i]; + } + } + p->cluster = clst; } void start_clustering(kmeans* universe) { + bool clustering_in_progress = false; + init_clusters(universe); + do { + clustering_in_progress = false; + + // Assign each points to their corresponding cluster + for (int i = 0; i < universe->nb_points; i++) { + assign_points_to_cluster(universe->points_array[i], universe); + } + + // Compute the new center of gravity for each cluster + printf("Clusters positions...\n"); + for (int i = 0; i < universe->k; i++) { + if (compute_center_of_gravity(universe->clusters_array[i], universe)) { + clustering_in_progress = true; + } + printf("Cluster %d position : ", i); + for (int j = 0; j < universe->dimensions; j++) { + printf("%0.2f, ", universe->clusters_array[i]->centroid->value[j]); + } + printf("\n"); + } + } while (clustering_in_progress); } void destroy_point(point* p) { diff --git a/kmeans.h b/kmeans.h index 69b86f9..9d6022c 100644 --- a/kmeans.h +++ b/kmeans.h @@ -15,7 +15,7 @@ #define LINE_INDEX_CLUSTER 1 #define LINE_INDEX_CONTENT 2 #define CLUSTER_SYMBOL "*" -#define SEPARATOR ";" +#define SEPARATOR "," /// A group who contains points. typedef struct _cluster { @@ -28,7 +28,9 @@ typedef struct _cluster { /// A point in the universe, which represents a data. typedef struct _point { /// The coordinates of the point. - double* value; + float* value; + /// The number of dimensions of the point. + int dimensions; /// The cluster who contains the point. struct _cluster* cluster; /// The color to use to draw the point. @@ -64,8 +66,9 @@ kmeans* kmeans_create(int k, point** data, int nb_points); /// Create a point. /// \param value The coordinates of the point. +/// \param dimensions The number of dimensions of the point. /// \return The point objet initialized with its coordinates -point* point_create(double* value); +point* point_create(float* value, int dimensions); /// Create a cluster. /// \param centroid The point representing the center of gravity of the cluster. @@ -95,12 +98,12 @@ void init_clusters(kmeans *universe); /// \param p1 The first point. /// \param p2 The second point. /// \return The distance between the two points. -double compute_distance(point* p1, point p2); +float compute_distance(point* p1, point* p2); /// Calculate the position of the center of gravity of the cluster. /// \param clstr The cluster. /// \param universe The universe who contains the points of the cluster. -void compute_center_of_gravity(cluster* clstr, kmeans* universe); +bool compute_center_of_gravity(cluster* clstr, kmeans* universe); /// Assign the point to the most coherent cluster. /// \param p The point to be assigned. diff --git a/main.c b/main.c index eacdf12..30334a0 100644 --- a/main.c +++ b/main.c @@ -3,31 +3,19 @@ #include <stdio.h> #include <stdlib.h> +#include <time.h> #include "kmeans.h" int main() { + srand(time(NULL)); char* path = "./source_data.txt"; char* output = "./output_data.txt"; kmeans *universe = kmeans_create_empty(); read_data_source(universe, path); - // Custom clustering for testing - for (int i = 0; i < universe->nb_points; i++) { - int c = 0; - if (i % 2 == 0) { - c = 1; - } - else if (i % 3 == 0) { - c = 2; - } - else { - c = 0; - } - universe->points_array[i]->cluster = universe->clusters_array[c]; - } + start_clustering(universe); write_data_output(universe, output); - destroy_universe(universe); return EXIT_SUCCESS; } diff --git a/output_data.txt b/output_data.txt index 1c8a7f5..174ff2c 100644 --- a/output_data.txt +++ b/output_data.txt @@ -1,10 +1,21 @@ 2 3 * -2.30;33.65 +-1.00,-5.00 +-2.00,-4.00 +-3.00,-3.00 +-4.00,-2.00 +-5.00,-1.00 +-1.75,-2.25 * -1.00;24.00 -3.00;4.00 --1.00;5.00 +0.00,0.00 +3.10,-4.90 * -5.00;34.00 +1.00,1.00 +2.00,2.00 +3.00,3.00 +4.00,4.00 +5.00,5.00 +-2.25,4.75 +2.20,4.40 +4.00,2.00 diff --git a/source_data.txt b/source_data.txt index 782668a..24bcf0a 100644 --- a/source_data.txt +++ b/source_data.txt @@ -1,7 +1,18 @@ 2 3 -1,24 -2.3,33.65 -3,4 -5,34 --1,5 +0,0 +1,1 +2,2 +3,3 +4,4 +5,5 +-1,-5 +-2,-4 +-3,-3 +-4,-2 +-5,-1 +-2.25,4.75 +3.1,-4.9 +2.2,4.4 +-1.75,-2.25 +4,2 -- GitLab