From 13fc6a107a63aea4740139189256cfc2022b6027 Mon Sep 17 00:00:00 2001
From: Boris Stefanovic <owldev@bluewin.ch>
Date: Sat, 18 Jun 2022 23:22:06 +0200
Subject: [PATCH] cleanup

---
Review notes (this section is ignored by git-am; hunk counts and the diffstat
below are updated to match the revised hunks):
 * src/cluster.c: cluster_update_center_int() compared coordinates with `==`,
   so it reported "changed" only when a coordinate stayed the same; it now
   uses `!=`. The NULL assert on `cluster` runs before the first dereference.
 * src/io.c: the getline() result is checked; on EOF/error the readers return
   0 instead of parsing an unspecified buffer.
 * src/kmeans.c: kmeans_init_clusters_int() no longer creates placeholder
   clusters that were overwritten afterwards (leak), sizes the array from its
   own element type, and destroys the temporary min/max vectors.
 * src/main.c: fopen() failures are reported before the streams are used,
   size_t values are printed with %zu, and main_int() frees the cluster and
   point arrays like main_fpt() does.
 * The blob ids on the "index" lines still describe the original commit.

 Makefile      |   3 ++
 src/cluster.c |  33 +++++++++++++-
 src/io.c      |  21 +++++----
 src/kmeans.c  |  96 ++++++++++++++++++++++++++++++++++++++---
 src/kmeans.h  |   5 +++
 src/main.c    | 116 ++++++++++++++++++++++++++++++++++----------------
 6 files changed, 222 insertions(+), 52 deletions(-)

diff --git a/Makefile b/Makefile
index 62d5a46..37e0985 100644
--- a/Makefile
+++ b/Makefile
@@ -55,5 +55,8 @@ clean:
 debug: ${DEBUG_TARGET}
 	./$< -i test/data.txt
 
+test: ${TARGET}
+	./$< -i test/data.txt -o ~/test_kmeans
+
 exec: ${TARGET}
 	./$<
diff --git a/src/cluster.c b/src/cluster.c
index e9bb226..5a173f6 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -52,8 +52,37 @@ void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) {
 
 
 bool cluster_update_center_int(cluster_int_t* cluster) {
-    //TODO
-    return true;
+    assert(cluster != NULL);
+    // save old center
+    vector_int_t* old_center = cluster->center;
+    assert(old_center != NULL);
+    // create new center
+    list_points_node_int_t* node = cluster->points->head;
+    // if cluster is empty
+    if (NULL == node) {
+        return false; // center has not been changed
+    } else {
+        cluster->center = vector_create_int(node->point->dim);
+        // sum all values in center
+        while (node != NULL) {
+            vector_add_inplace_int(cluster->center, *(node->point));
+            node = node->next;
+        }
+        // divide by number of points
+        vector_div_inplace_int(cluster->center, (int_t) cluster->points->size);
+        // check whether center has changed: one differing coordinate is enough
+        bool changed = false;
+        for (size_t p = 0; p < cluster->center->dim; ++p) {
+            if (cluster->center->data[p] != old_center->data[p]) {
+                changed = true;
+                break;
+            }
+        }
+        // destroy old center
+        vector_destroy_int(old_center);
+        // return true if center has changed
+        return changed;
+    }
 }
 
 bool cluster_update_center_fpt(cluster_fpt_t* cluster) {
diff --git a/src/io.c b/src/io.c
index cc7824d..2b873e4 100644
--- a/src/io.c
+++ b/src/io.c
@@ -1,6 +1,7 @@
 #define _GNU_SOURCE
 
 #include "io.h"
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -9,17 +10,21 @@
 
 
 int_t io_read_int(FILE* file) {
-    char* line;
-    size_t len;
-    getline(&line, &len, file);
-    return strtol(line, NULL, 10);
+    char* line = NULL;
+    size_t len = 0;
+    // getline() leaves the buffer unspecified on EOF/error: fall back to 0
+    const long res = getline(&line, &len, file) != -1 ? strtol(line, NULL, 10) : 0;
+    free(line);
+    return res;
 }
 
 fpt_t io_read_fpt(FILE* file) {
-    char* line;
-    size_t len;
-    getline(&line, &len, file);
-    return strtod(line, NULL);
+    char* line = NULL;
+    size_t len = 0;
+    // getline() leaves the buffer unspecified on EOF/error: fall back to 0
+    const double res = getline(&line, &len, file) != -1 ? strtod(line, NULL) : 0.0;
+    free(line);
+    return res;
 }
 
 
diff --git a/src/kmeans.c b/src/kmeans.c
index 9d596fb..348c08f 100644
--- a/src/kmeans.c
+++ b/src/kmeans.c
@@ -3,11 +3,49 @@
 #include "vector.h"
 
 
-cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) {
-    //TODO
-    return NULL;
+bool is_vector_in_centers_int(const vector_int_t* center, const cluster_int_t** clusters, const size_t i) {
+    for (size_t k = 0; k < i; ++k) {
+        if (vector_equals_int(clusters[k]->center, center)) {
+            return true;
+        }
+    }
+    return false;
 }
 
+cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) {
+    // check args and init
+    if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
+    cluster_int_t** clusters = calloc(nclusters, sizeof *clusters);
+    if (NULL == clusters) return NULL;
+    // determine range in which we are working
+    vector_int_t* min = vector_copy_int(points[0]);
+    vector_int_t* max = vector_copy_int(points[0]);
+    for (size_t i = 1; i < point_count; ++i) {
+        for (size_t p = 0; p < max->dim; ++p) {
+            const int_t value = points[i]->data[p];
+            if (value < min->data[p]) min->data[p] = value;
+            if (value > max->data[p]) max->data[p] = value;
+        }
+    }
+    // until we have enough centers
+    for (size_t i = 0; i < nclusters; ++i) {
+        vector_int_t* center = vector_create_int(max->dim);
+        bool valid = false;
+        while (!valid) {
+            // initialise center values randomly, within the "multidimensional rectangle" of our set of points
+            for (size_t p = 0; p < center->dim; ++p) {
+                center->data[p] = rand_int_range(min->data[p], max->data[p]); // TODO: create a rand_long_range(...) function
+            }
+            // check center is not already in clusters, although probability is extremely low...
+            valid = !is_vector_in_centers_int(center, (const cluster_int_t**) clusters, i);
+        }
+        clusters[i] = cluster_create_int(center);
+    }
+    // the bounding vectors were only needed to draw the centers
+    vector_destroy_int(min);
+    vector_destroy_int(max);
+    return clusters;
+}
 
 bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t** clusters, const size_t i) {
     for (size_t k = 0; k < i; ++k) {
@@ -18,7 +56,6 @@ bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t**
     return false;
 }
 
-
 cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) {
     // check args and init
     if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
@@ -55,9 +92,57 @@ cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size
 }
 
 
+void kmeans_destroy_clusters_int(cluster_int_t** clusters, const size_t nb_clusters) {
+    if (NULL == clusters) return;
+    for (size_t i = 0; i < nb_clusters; ++i) {
+        cluster_destroy_int(clusters[i]);
+    }
+}
+
+void kmeans_destroy_clusters_fpt(cluster_fpt_t** clusters, const size_t nb_clusters) {
+    if (NULL == clusters) return;
+    for (size_t i = 0; i < nb_clusters; ++i) {
+        cluster_destroy_fpt(clusters[i]);
+    }
+}
+
+
 
 void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) {
-    //TODO
+    bool changed = true;
+    while (changed) {
+        // reset condition
+        changed = false;
+        // empty all clusters, keeping only their centers (virtual)
+        for (size_t k = 0; k < nb_clusters; ++k) {
+            cluster_reset_int(clusters[k]);
+        }
+        // for each point
+        for (size_t i = 0; i < point_count; ++i) {
+            vector_int_t* point = points[i];
+            // find closest cluster
+            cluster_int_t* cmin = clusters[0];
+            fpt_t dmin = distance_function(point, cmin->center);
+            for (size_t k = 0; k < nb_clusters; ++k) {
+                cluster_int_t* current_cluster = clusters[k];
+                fpt_t dist = distance_function(point, current_cluster->center);
+                if (dist < dmin) {
+                    cmin = current_cluster;
+                    dmin = dist;
+                }
+            }
+            // add point to closest cluster
+            cluster_add_point_int(cmin, point);
+        }
+        // update all cluster centers
+        for (size_t k = 0; k < nb_clusters; ++k) {
+            assert(clusters[k] != NULL);
+            assert(clusters[k]->points != NULL);
+            if (cluster_update_center_int(clusters[k])) {
+                changed = true;
+            }
+        }
+    }
 }
 
 
@@ -94,7 +179,6 @@ void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t**
             assert(clusters[k]->points != NULL);
             if (cluster_update_center_fpt(clusters[k])) {
                 changed = true;
-                printf("%lud \n<%lf %lf %lf>\n\n", nb_clusters, clusters[k]->center->data[0], clusters[k]->center->data[1], clusters[k]->center->data[2]);
             }
         }
     }
diff --git a/src/kmeans.h b/src/kmeans.h
index 3d29bc6..cbfbd20 100644
--- a/src/kmeans.h
+++ b/src/kmeans.h
@@ -10,6 +10,11 @@ cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size
 cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters);
 
 
+void kmeans_destroy_clusters_int(cluster_int_t** clusters, const size_t nb_clusters);
+
+void kmeans_destroy_clusters_fpt(cluster_fpt_t** clusters, const size_t nb_clusters);
+
+
 void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*));
 
 void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*));
diff --git a/src/main.c b/src/main.c
index 86e9f2a..30fa4bc 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1,3 +1,4 @@
+#include <assert.h>
 #include <getopt.h>
 #include <stdbool.h>
 #include <stdio.h>
@@ -70,45 +71,61 @@ void parse_args(int argc, char** argv, char** ipath, char** opath, enum Distance
 }
 
 
-int main_int(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) {
-    //TODO
-    return EXIT_FAILURE;
+int main_int(FILE* ifile, FILE* ofile, const size_t dim, const size_t nb_clusters, const enum DistanceFunctionType dist_func_type) {
+    // INIT
+    vector_int_t** points = NULL;
+    cluster_int_t** clusters = NULL;
+    // READ
+    list_points_int_t* list = io_get_vector_list_int(ifile, dim);
+    const size_t point_count = list->size;
+    points = list_points_to_array_int(list);
+    list_points_destroy_int(list, false);
+    list = NULL;
+    // ALGORITHM
+    printf("INIT: ");
+    clusters = kmeans_init_clusters_int((const vector_int_t**) points, point_count, nb_clusters);
+    printf("DONE\n");
+    printf("KMEANS: begin\n");
+    kmeans_int(points, point_count, clusters, nb_clusters, DIST_FUNC_INT[dist_func_type]);
+    printf("KMEANS: DONE !\n");
+    // WRITE
+    fprintf(ofile, "%zu\n%zu\n", dim, nb_clusters);
+    io_write_clusters_to_file_int(ofile, clusters, nb_clusters);
+    // CLEANUP
+    for (size_t i = 0; i < nb_clusters; ++i) cluster_destroy_int(clusters[i]);
+    free(clusters);
+    for (size_t i = 0; i < point_count; ++i) vector_destroy_int(points[i]);
+    free(points);
+    // EXIT
+    return EXIT_SUCCESS;
 }
 
-int main_fpt(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) {
+int main_fpt(FILE* ifile, FILE* ofile, const size_t dim, const size_t nb_clusters, const enum DistanceFunctionType dist_func_type) {
+    // INIT
+    vector_fpt_t** points = NULL;
+    cluster_fpt_t** clusters = NULL;
     // READ
-    FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin;
-    const size_t dim = io_read_int(ifile);
-    const size_t nb_clusters = io_read_int(ifile);
-    if (0 == dim) {
-        printf("DIMENSION MUST BE STRICTLY POSITIVE !\n");
-        fclose(ifile);
-        return EXIT_FAILURE;
-    }
-    if (0 == nb_clusters) {
-        printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n");
-        fclose(ifile);
-        return EXIT_FAILURE;
-    }
     list_points_fpt_t* list = io_get_vector_list_fpt(ifile, dim);
-    fclose(ifile);
-    ifile = NULL;
     const size_t point_count = list->size;
-    vector_fpt_t** points = list_points_to_array_fpt(list);
+    points = list_points_to_array_fpt(list);
     list_points_destroy_fpt(list, false);
     list = NULL;
     // ALGORITHM
-    printf("INIT: ... ");
-    cluster_fpt_t** clusters = kmeans_init_clusters_fpt((const vector_fpt_t**) points, point_count, nb_clusters);
+    printf("INIT: ");
+    clusters = kmeans_init_clusters_fpt((const vector_fpt_t**) points, point_count, nb_clusters);
     printf("DONE\n");
-    printf("STARTING KMEANS ALGORITHM: ...\n");
+    printf("KMEANS: begin\n");
     kmeans_fpt(points, point_count, clusters, nb_clusters, DIST_FUNC_FPT[dist_func_type]);
-    printf("KMEANS DONE !\n");
+    printf("KMEANS: DONE !\n");
     // WRITE
-    FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout;
-    fprintf(ofile, "%lud\n%lud\n", dim, nb_clusters);
+    fprintf(ofile, "%zu\n%zu\n", dim, nb_clusters);
     io_write_clusters_to_file_fpt(ofile, clusters, nb_clusters);
-    fclose(ofile);
+    // CLEANUP
+    for (size_t i = 0; i < nb_clusters; ++i) cluster_destroy_fpt(clusters[i]);
+    free(clusters);
+    for (size_t i = 0; i < point_count; ++i) vector_destroy_fpt(points[i]);
+    free(points);
+    // EXIT
     return EXIT_SUCCESS;
 }
 
@@ -120,14 +137,41 @@ int main(int argc, char** argv) {
     enum DataType datatype = FLOAT;
     // parse args
     parse_args(argc, argv, &ipath, &opath, &disttype, &datatype);
-    switch (datatype) {
-        case FLOAT:
-            printf("FLOAT\n");
-            return main_fpt(ipath, opath, disttype);
-        case INT:
-            printf("INT\n");
-            return main_int(ipath, opath, disttype);
-        default:
-            abort();
+    // open files
+    FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin;
+    FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout;
+    // fopen() can fail at runtime (bad path, permissions): check before first use
+    if (NULL == ifile || NULL == ofile) {
+        fprintf(stderr, "CANNOT OPEN INPUT OR OUTPUT FILE !\n");
+        if (ifile != NULL) fclose(ifile);
+        if (ofile != NULL) fclose(ofile);
+        return EXIT_FAILURE;
+    }
+    // read dimension and desired number of clusters from file
+    const size_t dim = io_read_int(ifile);
+    const size_t nb_clusters = io_read_int(ifile);
+    if (0 == dim) {
+        printf("DIMENSION MUST BE STRICTLY POSITIVE !\n");
+        fclose(ifile);
+        fclose(ofile);
+        return EXIT_FAILURE;
     }
+    if (0 == nb_clusters) {
+        printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n");
+        fclose(ifile);
+        fclose(ofile);
+        return EXIT_FAILURE;
+    }
+    // type specific code
+    int return_value = EXIT_FAILURE;
+    int (* main_routine)(FILE*, FILE*, const size_t, const size_t, const enum DistanceFunctionType);
+    main_routine = INT == datatype ? main_int : main_fpt;
+    printf(INT == datatype ? "TYPE: INT\n" : "TYPE: FLOAT\n");
+    assert(ifile != NULL);
+    assert(ofile != NULL);
+    return_value = main_routine(ifile, ofile, dim, nb_clusters, disttype);
+    // cleanup
+    fclose(ifile);
+    fclose(ofile);
+    return return_value;
 }
-- 
GitLab