From 317e27d2287fbabb336d4b7d4d127d557d012811 Mon Sep 17 00:00:00 2001 From: Boris Stefanovic <owldev@bluewin.ch> Date: Wed, 8 Jun 2022 00:52:07 +0200 Subject: [PATCH] ADD: check random centers are not already in set --- src/cluster.c | 4 +-- src/kmeans.c | 87 +++++++++++++++++++++++++++++++++++++++++++++------ src/kmeans.h | 10 +++--- 3 files changed, 86 insertions(+), 15 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index f1b2c27..bfb7149 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -45,8 +45,8 @@ void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) { void cluster_update_center_int(cluster_int_t* cluster) { vector_destroy_int(cluster->center); - cluster->center = vector_create_int(cluster->points->head->point->dim); list_points_node_int_t* node = cluster->points->head; + cluster->center = vector_create_int(node->point->dim); while (node != NULL) { vector_add_inplace_int(cluster->center, *(node->point)); node = node->next; @@ -56,8 +56,8 @@ void cluster_update_center_int(cluster_int_t* cluster) { void cluster_update_center_fpt(cluster_fpt_t* cluster) { vector_destroy_fpt(cluster->center); - cluster->center = vector_create_fpt(cluster->points->head->point->dim); list_points_node_fpt_t* node = cluster->points->head; + cluster->center = vector_create_fpt(node->point->dim); while (node != NULL) { vector_add_inplace_fpt(cluster->center, *(node->point)); node = node->next; diff --git a/src/kmeans.c b/src/kmeans.c index 7ee9705..a8b6df1 100644 --- a/src/kmeans.c +++ b/src/kmeans.c @@ -2,13 +2,14 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) { + // check args and init if (NULL == points || point_count < 2 || nclusters < 2) return NULL; cluster_int_t** clusters = calloc(nclusters, sizeof(vector_int_t*)); if (NULL == clusters) return NULL; // determine range in which we are working vector_int_t* min = vector_copy_int(points[0]); vector_int_t* max = vector_copy_int(points[0]); - for (size_t i = 0; i < point_count; ++i) { + for (size_t i = 1; i < point_count; ++i) { for (size_t p = 0; p < max->dim; ++p) { const int_t value = points[i]->data[p]; if (value < min->data[p]) min->data[p] = value; @@ -18,21 +19,89 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size // until we have enough centers for (size_t i = 0; i < nclusters; ++i) { vector_int_t* center = vector_create_int(max->dim); - for (size_t p = 0; p < center->dim; ++p) { - center->data[p] = rand_int_range(min->data[p], max->data[p]); + bool valid = false; + while (!valid) { + // initialise center values randomly, within the "polygon" of our set of points + for (size_t p = 0; p < center->dim; ++p) { + center->data[p] = rand_int_range(min->data[p], max->data[p] + 1); + } + // check center is not already in clusters, although probability is extremely low... + for (size_t k = 0; k < i; ++k) { + vector_int_t* kth_center = clusters[k]->center; + bool neq = false; + for (size_t p = 0; p < center->dim; ++p) { + if (center->data[p] != kth_center->data[p]) { + neq = true; + break; + } + } + if (neq) { + valid = true; + } + } + } + clusters[i]->center = center; + } + return clusters; +} + +cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) { + // check args and init + if (NULL == points || point_count < 2 || nclusters < 2) return NULL; + cluster_fpt_t** clusters = calloc(nclusters, sizeof(vector_fpt_t*)); + if (NULL == clusters) return NULL; + // determine range in which we are working + vector_fpt_t* min = vector_copy_fpt(points[0]); + vector_fpt_t* max = vector_copy_fpt(points[0]); + for (size_t i = 1; i < point_count; ++i) { + for (size_t p = 0; p < max->dim; ++p) { + const fpt_t value = points[i]->data[p]; + if (value < min->data[p]) min->data[p] = value; + if (value > max->data[p]) max->data[p] = value; + } + } + // until we have enough centers + for (size_t i = 0; i < nclusters; ++i) { + vector_fpt_t* center = vector_create_fpt(max->dim); + bool valid = false; + while (!valid) { + // initialise center values randomly, within the "polygon" of our set of points + for (size_t p = 0; p < center->dim; ++p) { + center->data[p] = rand_double_range(min->data[p], max->data[p]); + } + // check center is not already in clusters, although probability is extremely low... + for (size_t k = 0; k < i; ++k) { + vector_fpt_t* kth_center = clusters[k]->center; + bool neq = false; + for (size_t p = 0; p < center->dim; ++p) { + if (center->data[p] != kth_center->data[p]) { + neq = true; + break; + } + } + if (neq) { + valid = true; + } + } } - // TODO: maybe check center is not already in clusters, although probability is extremely low... clusters[i]->center = center; } return clusters; } -void kmeans_int( - vector_int_t** points, const size_t point_count, - cluster_int_t** clusters, const size_t nb_clusters, - fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) { +void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) { //TODO bool changed = true; - while (changed) {} + while (changed) { + changed = false; + } +} + +void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)) { + //TODO + bool changed = true; + while (changed) { + changed = false; + } } diff --git a/src/kmeans.h b/src/kmeans.h index 34a83c8..3d29bc6 100644 --- a/src/kmeans.h +++ b/src/kmeans.h @@ -7,10 +7,12 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters); -void kmeans_int( - vector_int_t** points, const size_t point_count, - cluster_int_t** clusters, const size_t nb_clusters, - fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)); +cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters); + + +void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)); + +void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)); #endif //PROG_KMEANS_KMEANS_H -- GitLab