From 13fc6a107a63aea4740139189256cfc2022b6027 Mon Sep 17 00:00:00 2001
From: Boris Stefanovic <owldev@bluewin.ch>
Date: Sat, 18 Jun 2022 23:22:06 +0200
Subject: [PATCH] cleanup

---
 Makefile      |   3 ++
 src/cluster.c |  33 +++++++++++++++-
 src/io.c      |  17 +++++---
 src/kmeans.c  |  96 +++++++++++++++++++++++++++++++++++++++++---
 src/kmeans.h  |   5 +++
 src/main.c    | 107 +++++++++++++++++++++++++++++++++-----------------
 6 files changed, 211 insertions(+), 50 deletions(-)

diff --git a/Makefile b/Makefile
index 62d5a46..37e0985 100644
--- a/Makefile
+++ b/Makefile
@@ -55,5 +55,8 @@ clean:
 debug: ${DEBUG_TARGET}
 	./$< -i test/data.txt
 
+test: ${TARGET}
+	./$< -i test/data.txt -o ~/test_kmeans
+
 exec: ${TARGET}
 	./$<
diff --git a/src/cluster.c b/src/cluster.c
index e9bb226..5a173f6 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -52,8 +52,37 @@ void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) {
 
 
 bool cluster_update_center_int(cluster_int_t* cluster) {
-	//TODO
-	return true;
+	// save old center
+	vector_int_t* old_center = cluster->center;
+	assert(old_center != NULL);
+	assert(cluster != NULL);
+	// create new center
+	list_points_node_int_t* node = cluster->points->head;
+	// if cluster is empty
+	if (NULL == node) {
+		return false;  // center has not been changed
+	} else {
+		cluster->center = vector_create_int(node->point->dim);
+		// sum all values in center
+		while (node != NULL) {
+			vector_add_inplace_int(cluster->center, *(node->point));
+			node = node->next;
+		}
+		// divide by number of points
+		vector_div_inplace_int(cluster->center, (int_t) cluster->points->size);
+		// check whether center has changed
+		bool changed = false;
+		for (size_t p = 0; p < cluster->center->dim; ++p) {
+			if (cluster->center->data[p] == old_center->data[p]) {
+				changed = true;
+				break;
+			}
+		}
+		// destroy old center
+		vector_destroy_int(old_center);
+		// return true if center has changed
+		return changed;
+	}
 }
 
 bool cluster_update_center_fpt(cluster_fpt_t* cluster) {
diff --git a/src/io.c b/src/io.c
index cc7824d..2b873e4 100644
--- a/src/io.c
+++ b/src/io.c
@@ -1,6 +1,7 @@
 #define _GNU_SOURCE
 
 #include "io.h"
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -9,17 +10,21 @@
 
 
 int_t io_read_int(FILE* file) {
-	char* line;
-	size_t len;
+	char* line = NULL;
+	size_t len = 0;
 	getline(&line, &len, file);
-	return strtol(line, NULL, 10);
+	long res = strtol(line, NULL, 10);
+	free(line);
+	return res;
 }
 
 fpt_t io_read_fpt(FILE* file) {
-	char* line;
-	size_t len;
+	char* line = NULL;
+	size_t len = 0;
 	getline(&line, &len, file);
-	return strtod(line, NULL);
+	double res = strtod(line, NULL);
+	free(line);
+	return res;
 }
 
 
diff --git a/src/kmeans.c b/src/kmeans.c
index 9d596fb..348c08f 100644
--- a/src/kmeans.c
+++ b/src/kmeans.c
@@ -3,11 +3,49 @@
 #include "vector.h"
 
 
-cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) {
-	//TODO
-	return NULL;
+bool is_vector_in_centers_int(const vector_int_t* center, const cluster_int_t** clusters, const size_t i) {
+	for (size_t k = 0; k < i; ++k) {
+		if (vector_equals_int(clusters[k]->center, center)) {
+			return true;
+		}
+	}
+	return false;
 }
 
+cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) {
+	// check args and init
+	if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
+	cluster_int_t** clusters = calloc(nclusters, sizeof(vector_int_t*));
+	if (NULL == clusters) return NULL;
+	for (size_t k = 0; k < nclusters; ++k) {
+		clusters[k] = cluster_create_int(NULL);
+	}
+	// determine range in which we are working
+	vector_int_t* min = vector_copy_int(points[0]);
+	vector_int_t* max = vector_copy_int(points[0]);
+	for (size_t i = 1; i < point_count; ++i) {
+		for (size_t p = 0; p < max->dim; ++p) {
+			const int_t value = points[i]->data[p];
+			if (value < min->data[p]) min->data[p] = value;
+			if (value > max->data[p]) max->data[p] = value;
+		}
+	}
+	// until we have enough centers
+	for (size_t i = 0; i < nclusters; ++i) {
+		vector_int_t* center = vector_create_int(max->dim);
+		bool valid = false;
+		while (!valid) {
+			// initialise center values randomly, within the "multidimensional rectangle" of our set of points
+			for (size_t p = 0; p < center->dim; ++p) {
+				center->data[p] = rand_int_range(min->data[p], max->data[p]);  // TODO: create a rand_long_range(...) function
+			}
+			// check center is not already in clusters, although probability is extremely low...
+			valid = !is_vector_in_centers_int(center, (const cluster_int_t**) clusters, i);
+		}
+		clusters[i] = cluster_create_int(center);
+	}
+	return clusters;
+}
 
 bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t** clusters, const size_t i) {
 	for (size_t k = 0; k < i; ++k) {
@@ -18,7 +56,6 @@ bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t**
 	return false;
 }
 
-
 cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) {
 	// check args and init
 	if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
@@ -55,9 +92,57 @@ cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size
 }
 
 
+void kmeans_destroy_clusters_int(cluster_int_t** clusters, const size_t nb_clusters) {
+	if (NULL == clusters) return;
+	for (size_t i = 0; i < nb_clusters; ++i) {
+		cluster_destroy_int(clusters[i]);
+	}
+}
+
+void kmeans_destroy_clusters_fpt(cluster_fpt_t** clusters, const size_t nb_clusters) {
+	if (NULL == clusters) return;
+	for (size_t i = 0; i < nb_clusters; ++i) {
+		cluster_destroy_fpt(clusters[i]);
+	}
+}
+
+
 void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters,
 				fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) {
-	//TODO
+	bool changed = true;
+	while (changed) {
+		// reset condition
+		changed = false;
+		// empty all clusters, keeping only their centers (virtual)
+		for (size_t k = 0; k < nb_clusters; ++k) {
+			cluster_reset_int(clusters[k]);
+		}
+		// for each point
+		for (size_t i = 0; i < point_count; ++i) {
+			vector_int_t* point = points[i];
+			// find closest cluster
+			cluster_int_t* cmin = clusters[0];
+			fpt_t dmin = distance_function(point, cmin->center);
+			for (size_t k = 0; k < nb_clusters; ++k) {
+				cluster_int_t* current_cluster = clusters[k];
+				fpt_t dist = distance_function(point, current_cluster->center);
+				if (dist < dmin) {
+					cmin = current_cluster;
+					dmin = dist;
+				}
+			}
+			// add point to closest cluster
+			cluster_add_point_int(cmin, point);
+		}
+		// update all cluster centers
+		for (size_t k = 0; k < nb_clusters; ++k) {
+			assert(clusters[k] != NULL);
+			assert(clusters[k]->points != NULL);
+			if (cluster_update_center_int(clusters[k])) {
+				changed = true;
+			}
+		}
+	}
 }
 
 
@@ -94,7 +179,6 @@ void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t**
 			assert(clusters[k]->points != NULL);
 			if (cluster_update_center_fpt(clusters[k])) {
 				changed = true;
-				printf("%lud  \n<%lf  %lf  %lf>\n\n", nb_clusters, clusters[k]->center->data[0], clusters[k]->center->data[1], clusters[k]->center->data[2]);
 			}
 		}
 	}
diff --git a/src/kmeans.h b/src/kmeans.h
index 3d29bc6..cbfbd20 100644
--- a/src/kmeans.h
+++ b/src/kmeans.h
@@ -10,6 +10,11 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size
 cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters);
 
 
+void kmeans_destroy_clusters_int(cluster_int_t** clusters, const size_t nb_clusters);
+
+void kmeans_destroy_clusters_fpt(cluster_fpt_t** clusters, const size_t nb_clusters);
+
+
 void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*));
 
 void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*));
diff --git a/src/main.c b/src/main.c
index 86e9f2a..30fa4bc 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1,3 +1,4 @@
+#include <assert.h>
 #include <getopt.h>
 #include <stdbool.h>
 #include <stdio.h>
@@ -70,45 +71,59 @@ void parse_args(int argc, char** argv, char** ipath, char** opath, enum Distance
 }
 
 
-int main_int(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) {
-	//TODO
-	return EXIT_FAILURE;
+int main_int(FILE* ifile, FILE* ofile, const size_t dim, const size_t nb_clusters, const enum DistanceFunctionType dist_func_type) {
+	// INIT
+	vector_int_t** points = NULL;
+	cluster_int_t** clusters = NULL;
+	// READ
+	list_points_int_t* list = io_get_vector_list_int(ifile, dim);
+	const size_t point_count = list->size;
+	points = list_points_to_array_int(list);
+	list_points_destroy_int(list, false);
+	list = NULL;
+	// ALGORITHM
+	printf("INIT:   ");
+	clusters = kmeans_init_clusters_int((const vector_int_t**) points, point_count, nb_clusters);
+	printf("DONE\n");
+	printf("KMEANS: begin\n");
+	kmeans_int(points, point_count, clusters, nb_clusters, DIST_FUNC_INT[dist_func_type]);
+	printf("KMEANS: DONE !\n");
+	// WRITE
+	fprintf(ofile, "%lu\n%lu\n", dim, nb_clusters);
+	io_write_clusters_to_file_int(ofile, clusters, nb_clusters);
+	// CLEANUP
+	for (size_t i = 0; i < nb_clusters; ++i) cluster_destroy_int(clusters[i]);
+	for (size_t i = 0; i < point_count; ++i) vector_destroy_int(points[i]);
+	// EXIT
+	return EXIT_SUCCESS;
 }
 
-int main_fpt(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) {
+int main_fpt(FILE* ifile, FILE* ofile, const size_t dim, const size_t nb_clusters, const enum DistanceFunctionType dist_func_type) {
+	// INIT
+	vector_fpt_t** points = NULL;
+	cluster_fpt_t** clusters = NULL;
 	// READ
-	FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin;
-	const size_t dim = io_read_int(ifile);
-	const size_t nb_clusters = io_read_int(ifile);
-	if (0 == dim) {
-		printf("DIMENSION MUST BE STRICTLY POSITIVE !\n");
-		fclose(ifile);
-		return EXIT_FAILURE;
-	}
-	if (0 == nb_clusters) {
-		printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n");
-		fclose(ifile);
-		return EXIT_FAILURE;
-	}
 	list_points_fpt_t* list = io_get_vector_list_fpt(ifile, dim);
-	fclose(ifile);
-	ifile = NULL;
 	const size_t point_count = list->size;
-	vector_fpt_t** points = list_points_to_array_fpt(list);
+	points = list_points_to_array_fpt(list);
 	list_points_destroy_fpt(list, false);
 	list = NULL;
 	// ALGORITHM
-	printf("INIT: ...   ");
-	cluster_fpt_t** clusters = kmeans_init_clusters_fpt((const vector_fpt_t**) points, point_count, nb_clusters);
+	printf("INIT:   ");
+	clusters = kmeans_init_clusters_fpt((const vector_fpt_t**) points, point_count, nb_clusters);
 	printf("DONE\n");
-	printf("STARTING KMEANS ALGORITHM: ...\n");
+	printf("KMEANS: begin\n");
 	kmeans_fpt(points, point_count, clusters, nb_clusters, DIST_FUNC_FPT[dist_func_type]);
-	printf("KMEANS DONE !\n");
+	printf("KMEANS: DONE !\n");
 	// WRITE
-	FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout;
-	fprintf(ofile, "%lud\n%lud\n", dim, nb_clusters);
+	fprintf(ofile, "%lu\n%lu\n", dim, nb_clusters);
 	io_write_clusters_to_file_fpt(ofile, clusters, nb_clusters);
-	fclose(ofile);
+	// CLEANUP
+	for (size_t i = 0; i < nb_clusters; ++i) cluster_destroy_fpt(clusters[i]);
+	free(clusters);
+	for (size_t i = 0; i < point_count; ++i) vector_destroy_fpt(points[i]);
+	free(points);
+	// EXIT
 	return EXIT_SUCCESS;
 }
 
@@ -120,14 +135,34 @@ int main(int argc, char** argv) {
 	enum DataType datatype = FLOAT;
 	// parse args
 	parse_args(argc, argv, &ipath, &opath, &disttype, &datatype);
-	switch (datatype) {
-		case FLOAT:
-			printf("FLOAT\n");
-			return main_fpt(ipath, opath, disttype);
-		case INT:
-			printf("INT\n");
-			return main_int(ipath, opath, disttype);
-		default:
-			abort();
+	// open files
+	FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin;
+	FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout;
+	// read dimension and desired number of clusters from file
+	const size_t dim = io_read_int(ifile);
+	const size_t nb_clusters = io_read_int(ifile);
+	if (0 == dim) {
+		printf("DIMENSION MUST BE STRICTLY POSITIVE !\n");
+		fclose(ifile);
+		fclose(ofile);
+		return EXIT_FAILURE;
 	}
+	if (0 == nb_clusters) {
+		printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n");
+		fclose(ifile);
+		fclose(ofile);
+		return EXIT_FAILURE;
+	}
+	// type specific code
+	int return_value = EXIT_FAILURE;
+	int (* main_routine)(FILE*, FILE*, const size_t, const size_t, const enum DistanceFunctionType);
+	main_routine = INT == datatype ? main_int : main_fpt;
+	printf(INT == datatype ? "TYPE:   INT\n" : "TYPE:   FLOAT\n");
+	assert(ifile != NULL);
+	assert(ofile != NULL);
+	return_value = main_routine(ifile, ofile, dim, nb_clusters, disttype);
+	// cleanup
+	fclose(ifile);
+	fclose(ofile);
+	return return_value;
 }
-- 
GitLab