From 317e27d2287fbabb336d4b7d4d127d557d012811 Mon Sep 17 00:00:00 2001
From: Boris Stefanovic <owldev@bluewin.ch>
Date: Wed, 8 Jun 2022 00:52:07 +0200
Subject: [PATCH] ADD: check random centers are not already in set

---
 src/cluster.c |  4 +--
 src/kmeans.c  | 87 +++++++++++++++++++++++++++++++++++++++++++++------
 src/kmeans.h  | 10 +++---
 3 files changed, 86 insertions(+), 15 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index f1b2c27..bfb7149 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -45,8 +45,8 @@ void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) {
 
 void cluster_update_center_int(cluster_int_t* cluster) {
 	vector_destroy_int(cluster->center);
-	cluster->center = vector_create_int(cluster->points->head->point->dim);
 	list_points_node_int_t* node = cluster->points->head;
+	cluster->center = vector_create_int(node->point->dim);
 	while (node != NULL) {
 		vector_add_inplace_int(cluster->center, *(node->point));
 		node = node->next;
@@ -56,8 +56,8 @@ void cluster_update_center_int(cluster_int_t* cluster) {
 
 void cluster_update_center_fpt(cluster_fpt_t* cluster) {
 	vector_destroy_fpt(cluster->center);
-	cluster->center = vector_create_fpt(cluster->points->head->point->dim);
 	list_points_node_fpt_t* node = cluster->points->head;
+	cluster->center = vector_create_fpt(node->point->dim);
 	while (node != NULL) {
 		vector_add_inplace_fpt(cluster->center, *(node->point));
 		node = node->next;
diff --git a/src/kmeans.c b/src/kmeans.c
index 7ee9705..a8b6df1 100644
--- a/src/kmeans.c
+++ b/src/kmeans.c
@@ -2,13 +2,14 @@
 
 
 cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) {
+	// check args and init
 	if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
 	cluster_int_t** clusters = calloc(nclusters, sizeof(vector_int_t*));
 	if (NULL == clusters) return NULL;
 	// determine range in which we are working
 	vector_int_t* min = vector_copy_int(points[0]);
 	vector_int_t* max = vector_copy_int(points[0]);
-	for (size_t i = 0; i < point_count; ++i) {
+	for (size_t i = 1; i < point_count; ++i) {
 		for (size_t p = 0; p < max->dim; ++p) {
 			const int_t value = points[i]->data[p];
 			if (value < min->data[p]) min->data[p] = value;
@@ -18,21 +19,89 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size
 	// until we have enough centers
 	for (size_t i = 0; i < nclusters; ++i) {
 		vector_int_t* center = vector_create_int(max->dim);
-		for (size_t p = 0; p < center->dim; ++p) {
-			center->data[p] = rand_int_range(min->data[p], max->data[p]);
+		bool valid = false;
+		while (!valid) {
+			// initialise each coordinate of the center randomly, within the per-dimension [min, max] range spanned by our set of points
+			for (size_t p = 0; p < center->dim; ++p) {
+				center->data[p] = rand_int_range(min->data[p], max->data[p] + 1);
+			}
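+			// intended: reject a center equal to an earlier one. NOTE(review): valid is only set inside this loop, so for i == 0 it stays false and the while never exits; for i > 0 a single differing earlier center is enough to accept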
+			for (size_t k = 0; k < i; ++k) {
+				vector_int_t* kth_center = clusters[k]->center;
+				bool neq = false;
+				for (size_t p = 0; p < center->dim; ++p) {
+					if (center->data[p] != kth_center->data[p]) {
+						neq = true;
+						break;
+					}
+				}
+				if (neq) {
+					valid = true;
+				}
+			}
+		}
+		clusters[i]->center = center;
+	}
+	return clusters;
+}
+
+cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) {
+	// Meant to return an array of nclusters clusters with randomly drawn centers; returns NULL on a NULL points array, fewer than 2 points, or fewer than 2 clusters
+	if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
+	cluster_fpt_t** clusters = calloc(nclusters, sizeof(vector_fpt_t*)); // zero-filled pointer array. NOTE(review): sizeof names vector_fpt_t* rather than cluster_fpt_t*, and no clusters[i] object is created in this function
+	if (NULL == clusters) return NULL;
+	// determine the per-dimension range of the points; min and max both start as copies of points[0]
+	vector_fpt_t* min = vector_copy_fpt(points[0]); // NOTE(review): result is not checked, and min is never released in this function
+	vector_fpt_t* max = vector_copy_fpt(points[0]); // NOTE(review): same as min: unchecked and never released here
+	for (size_t i = 1; i < point_count; ++i) { // starts at 1 because points[0] already seeded min and max
+		for (size_t p = 0; p < max->dim; ++p) { // assumes every point has at least max->dim components; TODO confirm
+			const fpt_t value = points[i]->data[p];
+			if (value < min->data[p]) min->data[p] = value;
+			if (value > max->data[p]) max->data[p] = value;
+		}
+	}
+	// one iteration per center to generate
+	for (size_t i = 0; i < nclusters; ++i) {
+		vector_fpt_t* center = vector_create_fpt(max->dim); // NOTE(review): result is not checked before center->dim is read
+		bool valid = false;
+		while (!valid) { // NOTE(review): for i == 0 the k-loop below has no iterations, so valid stays false and this loop never exits
+			// draw each coordinate from the range [min, max] of that dimension (exact bounds depend on rand_double_range; TODO confirm)
+			for (size_t p = 0; p < center->dim; ++p) {
+				center->data[p] = rand_double_range(min->data[p], max->data[p]);
+			}
+			// intended: reject a center equal to an earlier one; as written, one differing earlier center is enough to accept
+			for (size_t k = 0; k < i; ++k) {
+				vector_fpt_t* kth_center = clusters[k]->center; // NOTE(review): clusters[k] is still the NULL left by calloc at this point
+				bool neq = false;
+				for (size_t p = 0; p < center->dim; ++p) {
+					if (center->data[p] != kth_center->data[p]) { // exact comparison; fpt_t is presumably a floating-point type
+						neq = true;
+						break;
+					}
+				}
+				if (neq) {
+					valid = true; // never reset to false, so a later identical center does not invalidate the candidate
+				}
+			}
 		}
-		// TODO: maybe check center is not already in clusters, although probability is extremely low...
 		clusters[i]->center = center;
 	}
 	return clusters;
 }
 
 
-void kmeans_int(
-		vector_int_t** points, const size_t point_count,
-		cluster_int_t** clusters, const size_t nb_clusters,
-		fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) {
+void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) { // stub: no parameter is used yet
 	//TODO
 	bool changed = true;
-	while (changed) {}
+	while (changed) { // placeholder for the k-means iteration
+		changed = false; // cleared unconditionally, so the loop body runs exactly once
+	}
+}
+
+void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)) {
+	// TODO: the k-means iteration is not implemented yet; none of the parameters are used
+	bool changed = true;
+	while (changed) { // placeholder for the k-means iteration
+		changed = false; // cleared unconditionally, so the loop body runs exactly once
+	}
 }
diff --git a/src/kmeans.h b/src/kmeans.h
index 34a83c8..3d29bc6 100644
--- a/src/kmeans.h
+++ b/src/kmeans.h
@@ -7,10 +7,12 @@
 
 cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters);
 
-void kmeans_int(
-		vector_int_t** points, const size_t point_count,
-		cluster_int_t** clusters, const size_t nb_clusters,
-		fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*));
+cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters);
+
+
+void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*));
+
+void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*));
 
 
 #endif //PROG_KMEANS_KMEANS_H
-- 
GitLab