diff --git a/src/cluster.c b/src/cluster.c index f1b2c276129e5084a3c6026b9aa98b8036c27c53..bfb714912253863941f169aafc1ee9158897d5a6 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -45,8 +45,8 @@ void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) { void cluster_update_center_int(cluster_int_t* cluster) { vector_destroy_int(cluster->center); - cluster->center = vector_create_int(cluster->points->head->point->dim); list_points_node_int_t* node = cluster->points->head; + cluster->center = vector_create_int(node->point->dim); while (node != NULL) { vector_add_inplace_int(cluster->center, *(node->point)); node = node->next; @@ -56,8 +56,8 @@ void cluster_update_center_int(cluster_int_t* cluster) { void cluster_update_center_fpt(cluster_fpt_t* cluster) { vector_destroy_fpt(cluster->center); - cluster->center = vector_create_fpt(cluster->points->head->point->dim); list_points_node_fpt_t* node = cluster->points->head; + cluster->center = vector_create_fpt(node->point->dim); while (node != NULL) { vector_add_inplace_fpt(cluster->center, *(node->point)); node = node->next; diff --git a/src/kmeans.c b/src/kmeans.c index 7ee9705c032a4b98853f2402c75204b2d4347f4f..a8b6df1d0b1af24f185191ba89e35b804a9c9fe9 100644 --- a/src/kmeans.c +++ b/src/kmeans.c @@ -2,13 +2,14 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) { + // check args and init if (NULL == points || point_count < 2 || nclusters < 2) return NULL; cluster_int_t** clusters = calloc(nclusters, sizeof(vector_int_t*)); if (NULL == clusters) return NULL; // determine range in which we are working vector_int_t* min = vector_copy_int(points[0]); vector_int_t* max = vector_copy_int(points[0]); - for (size_t i = 0; i < point_count; ++i) { + for (size_t i = 1; i < point_count; ++i) { for (size_t p = 0; p < max->dim; ++p) { const int_t value = points[i]->data[p]; if (value < min->data[p]) min->data[p] = value; @@ -18,21 +19,89 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size // until we have enough centers for (size_t i = 0; i < nclusters; ++i) { vector_int_t* center = vector_create_int(max->dim); - for (size_t p = 0; p < center->dim; ++p) { - center->data[p] = rand_int_range(min->data[p], max->data[p]); + bool valid = false; + while (!valid) { + // initialise center values randomly, within the "polygon" of our set of points + for (size_t p = 0; p < center->dim; ++p) { + center->data[p] = rand_int_range(min->data[p], max->data[p] + 1); + } + // check center is not already in clusters, although probability is extremely low... + for (size_t k = 0; k < i; ++k) { + vector_int_t* kth_center = clusters[k]->center; + bool neq = false; + for (size_t p = 0; p < center->dim; ++p) { + if (center->data[p] != kth_center->data[p]) { + neq = true; + break; + } + } + if (neq) { + valid = true; + } + } + } + clusters[i]->center = center; + } + return clusters; +} + +cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) { + // check args and init + if (NULL == points || point_count < 2 || nclusters < 2) return NULL; + cluster_fpt_t** clusters = calloc(nclusters, sizeof(vector_fpt_t*)); + if (NULL == clusters) return NULL; + // determine range in which we are working + vector_fpt_t* min = vector_copy_fpt(points[0]); + vector_fpt_t* max = vector_copy_fpt(points[0]); + for (size_t i = 1; i < point_count; ++i) { + for (size_t p = 0; p < max->dim; ++p) { + const fpt_t value = points[i]->data[p]; + if (value < min->data[p]) min->data[p] = value; + if (value > max->data[p]) max->data[p] = value; + } + } + // until we have enough centers + for (size_t i = 0; i < nclusters; ++i) { + vector_fpt_t* center = vector_create_fpt(max->dim); + bool valid = false; + while (!valid) { + // initialise center values randomly, within the "polygon" of our set of points + for (size_t p = 0; p < center->dim; ++p) { + center->data[p] = rand_double_range(min->data[p], max->data[p]); + } + // check center is not already in clusters, although probability is extremely low... + for (size_t k = 0; k < i; ++k) { + vector_fpt_t* kth_center = clusters[k]->center; + bool neq = false; + for (size_t p = 0; p < center->dim; ++p) { + if (center->data[p] != kth_center->data[p]) { + neq = true; + break; + } + } + if (neq) { + valid = true; + } + } } - // TODO: maybe check center is not already in clusters, although probability is extremely low... clusters[i]->center = center; } return clusters; } -void kmeans_int( - vector_int_t** points, const size_t point_count, - cluster_int_t** clusters, const size_t nb_clusters, - fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) { +void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) { //TODO bool changed = true; - while (changed) {} + while (changed) { + changed = false; + } +} + +void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)) { + //TODO + bool changed = true; + while (changed) { + changed = false; + } } diff --git a/src/kmeans.h b/src/kmeans.h index 34a83c8d4df56d52aad598c9e7a201439a90b913..3d29bc6184187d6eddd3f94e4a95ae888b110b8f 100644 --- a/src/kmeans.h +++ b/src/kmeans.h @@ -7,10 +7,12 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters); -void kmeans_int( - vector_int_t** points, const size_t point_count, - cluster_int_t** clusters, const size_t nb_clusters, - fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)); +cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters); + + +void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)); + +void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)); #endif //PROG_KMEANS_KMEANS_H