From 0d07376d81a85d2a23c4dce276e307051280f55a Mon Sep 17 00:00:00 2001 From: Boris Stefanovic <owldev@bluewin.ch> Date: Fri, 10 Jun 2022 09:58:18 +0200 Subject: [PATCH] DEBUG: removed cause of infinite loop in kmeans_init_clusters --- Makefile | 6 +-- src/cluster.c | 71 ++++++++++++----------------- src/io.c | 6 ++- src/kmeans.c | 124 ++++++++++++++++++-------------------------------- src/main.c | 47 +++++++------------ src/vector.c | 23 ++++++++++ src/vector.h | 5 ++ test/data.txt | 7 +++ 8 files changed, 131 insertions(+), 158 deletions(-) create mode 100644 test/data.txt diff --git a/Makefile b/Makefile index 5163eac..62d5a46 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ OBJ := $(patsubst ${SRC_DIR}/%.c,${BUILD_DIR}/%.o,${SRC}) DEBUG_BUILD_DIR := ${BUILD_ROOT}/debug DEBUG_TARGET := ${DEBUG_BUILD_DIR}/debug DEBUG_CFLAGS := ${CFLAGS} -fsanitize=address -fsanitize=leak -g -DDEBUG -DEBUG_LDEXTRA := +DEBUG_LDEXTRA := ${LDEXTRA} DEBUG_LDFLAGS := ${DEBUG_CFLAGS} ${DEBUG_LDEXTRA} DEBUG_OBJ := $(patsubst ${SRC_DIR}/%.c,${DEBUG_BUILD_DIR}/%.o,${SRC}) @@ -22,7 +22,7 @@ DEBUG_OBJ := $(patsubst ${SRC_DIR}/%.c,${DEBUG_BUILD_DIR}/%.o,${SRC}) # TARGETS -all: ${TARGET} ${TARGET_DEBUG} +all: ${TARGET} ${DEBUG_TARGET} ${TARGET}: ${OBJ} ${CC} ${LDFLAGS} -o $@ $^ @@ -53,7 +53,7 @@ clean: rm -rf ${BUILD_ROOT} debug: ${DEBUG_TARGET} - ./$< + ./$< -i test/data.txt exec: ${TARGET} ./$< diff --git a/src/cluster.c b/src/cluster.c index 504c930..446062b 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2,6 +2,10 @@ #include <stdbool.h> #include "vector.h" +#ifdef DEBUG +#include <assert.h> +#endif + cluster_int_t* cluster_create_int(vector_int_t* center) { cluster_int_t* cluster = malloc(sizeof(cluster_int_t)); @@ -47,30 +51,8 @@ void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) { bool cluster_update_center_int(cluster_int_t* cluster) { - // save old center - vector_int_t* old_center = cluster->center; - // create new center - list_points_node_int_t* node = cluster->points->head; - cluster->center = vector_create_int(node->point->dim); - // sum all values in center - while (node != NULL) { - vector_add_inplace_int(cluster->center, *(node->point)); - node = node->next; - } - // divide by number of points - vector_div_inplace_int(cluster->center, (int_t) cluster->points->size); - // check whether center has changed - bool changed = false; - for (size_t p = 0; p < cluster->center->dim; ++p) { - if (cluster->center->data[p] != old_center->data[p]) { - changed = true; - break; - } - } - // destroy old center - vector_destroy_int(old_center); - // return true if center has changed - return changed; + //TODO + return true; } bool cluster_update_center_fpt(cluster_fpt_t* cluster) { @@ -78,26 +60,31 @@ bool cluster_update_center_fpt(cluster_fpt_t* cluster) { vector_fpt_t* old_center = cluster->center; // create new center list_points_node_fpt_t* node = cluster->points->head; - cluster->center = vector_create_fpt(node->point->dim); - // sum all values in center - while (node != NULL) { - vector_add_inplace_fpt(cluster->center, *(node->point)); - node = node->next; - } - // divide by number of points - vector_div_inplace_fpt(cluster->center, (fpt_t) cluster->points->size); - // check whether center has changed - bool changed = false; - for (size_t p = 0; p < cluster->center->dim; ++p) { - if (cluster->center->data[p] != old_center->data[p]) { - changed = true; - break; + // if cluster is empty + if (NULL == node) { + return false; // center has not been changed + } else { + cluster->center = vector_create_fpt(node->point->dim); + // sum all values in center + while (node != NULL) { + vector_add_inplace_fpt(cluster->center, *(node->point)); + node = node->next; + } + // divide by number of points + vector_div_inplace_fpt(cluster->center, (fpt_t) cluster->points->size); + // check whether center has changed + bool changed = false; + for (size_t p = 0; p < cluster->center->dim; ++p) { + if (cluster->center->data[p] != old_center->data[p]) { + changed = true; + break; + } } + // destroy old center + vector_destroy_fpt(old_center); + // return true if center has changed + return changed; } - // destroy old center - vector_destroy_fpt(old_center); - // return true if center has changed - return changed; } diff --git a/src/io.c b/src/io.c index a3674bb..cc7824d 100644 --- a/src/io.c +++ b/src/io.c @@ -56,9 +56,10 @@ list_points_int_t* io_get_vector_list_int(FILE* ifile, const size_t dim) { if (len != 0) { vector_int_t* vector = io_line_to_vector_int(line, dim); list_points_append_int(list, vector); - free(line); + //free(line); } } + free(line); return list; } @@ -70,9 +71,10 @@ list_points_fpt_t* io_get_vector_list_fpt(FILE* ifile, const size_t dim) { if (len != 0) { vector_fpt_t* vector = io_line_to_vector_fpt(line, dim); list_points_append_fpt(list, vector); - free(line); + //free(line); } } + free(line); return list; } diff --git a/src/kmeans.c b/src/kmeans.c index 1179541..cb572cd 100644 --- a/src/kmeans.c +++ b/src/kmeans.c @@ -1,55 +1,43 @@ #include "kmeans.h" +#include "vector.h" + +#ifdef DEBUG +#include <assert.h> +#include "io.h" +#endif + +#define EPSILON 0.001 + + +fpt_t abs_fpt(const fpt_t x) { + return x >= 0 ? x : -x; +} cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) { - // check args and init - if (NULL == points || point_count < 2 || nclusters < 2) return NULL; - cluster_int_t** clusters = calloc(nclusters, sizeof(vector_int_t*)); - if (NULL == clusters) return NULL; - // determine range in which we are working - vector_int_t* min = vector_copy_int(points[0]); - vector_int_t* max = vector_copy_int(points[0]); - for (size_t i = 1; i < point_count; ++i) { - for (size_t p = 0; p < max->dim; ++p) { - const int_t value = points[i]->data[p]; - if (value < min->data[p]) min->data[p] = value; - if (value > max->data[p]) max->data[p] = value; - } - } - // until we have enough centers - for (size_t i = 0; i < nclusters; ++i) { - vector_int_t* center = vector_create_int(max->dim); - bool valid = false; - while (!valid) { - // initialise center values randomly, within the "polygon" of our set of points - for (size_t p = 0; p < center->dim; ++p) { - center->data[p] = rand_int_range(min->data[p], max->data[p] + 1); - } - // check center is not already in clusters, although probability is extremely low... - for (size_t k = 0; k < i; ++k) { - vector_int_t* kth_center = clusters[k]->center; - bool neq = false; - for (size_t p = 0; p < center->dim; ++p) { - if (center->data[p] != kth_center->data[p]) { - neq = true; - break; - } - } - if (neq) { - valid = true; - } - } + //TODO + return NULL; +} + + +bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t** clusters, const size_t i) { + for (size_t k = 0; k < i; ++k) { + if (vector_equals_fpt(clusters[k]->center, center)) { + return true; } - clusters[i]->center = center; } - return clusters; + return false; } + cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) { // check args and init if (NULL == points || point_count < 2 || nclusters < 2) return NULL; cluster_fpt_t** clusters = calloc(nclusters, sizeof(vector_fpt_t*)); if (NULL == clusters) return NULL; + for (size_t k = 0; k < nclusters; ++k) { + clusters[k] = cluster_create_fpt(NULL); + } // determine range in which we are working vector_fpt_t* min = vector_copy_fpt(points[0]); vector_fpt_t* max = vector_copy_fpt(points[0]); @@ -65,26 +53,14 @@ cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size vector_fpt_t* center = vector_create_fpt(max->dim); bool valid = false; while (!valid) { - // initialise center values randomly, within the "polygon" of our set of points + // initialise center values randomly, within the "multidimensional rectangle" of our set of points for (size_t p = 0; p < center->dim; ++p) { center->data[p] = rand_double_range(min->data[p], max->data[p]); } // check center is not already in clusters, although probability is extremely low... - for (size_t k = 0; k < i; ++k) { - vector_fpt_t* kth_center = clusters[k]->center; - bool neq = false; - for (size_t p = 0; p < center->dim; ++p) { - if (center->data[p] != kth_center->data[p]) { - neq = true; - break; - } - } - if (neq) { - valid = true; - } - } + valid = !is_vector_in_centers_fpt(center, (const cluster_fpt_t**) clusters, i); } - clusters[i]->center = center; + clusters[i] = cluster_create_fpt(center); } return clusters; } @@ -92,41 +68,24 @@ cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) { - bool changed = true; - while (changed) { - changed = false; - for (size_t i = 0; i < point_count; ++i) { - vector_int_t* point = points[i]; - // find closest cluster and add point to it - cluster_int_t* cmin = clusters[0]; - int_t dmin = distance_function(point, cmin->center); - for (size_t k = 1; k < nb_clusters; ++k) { - cluster_int_t* current_cluster = clusters[k]; - fpt_t dist = distance_function(point, current_cluster->center); - if (dist < dmin) { - cmin = current_cluster; - dmin = dist; - } - } - cluster_add_point_int(cmin, point); - // update all cluster centers - for (size_t k = 0; k < nb_clusters; ++k) { - if (cluster_update_center_int(clusters[k])) { - changed = true; - } - } - } - } + //TODO } + void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)) { bool changed = true; while (changed) { + // reset condition changed = false; + // empty all clusters, keeping only their centers (virtual) + for (size_t k = 0; k < nb_clusters; ++k) { + cluster_reset_fpt(clusters[k]); + } + // for each point for (size_t i = 0; i < point_count; ++i) { vector_fpt_t* point = points[i]; - // find closest cluster and add point to it + // find closest cluster cluster_fpt_t* cmin = clusters[0]; fpt_t dmin = distance_function(point, cmin->center); for (size_t k = 1; k < nb_clusters; ++k) { @@ -137,9 +96,14 @@ void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** dmin = dist; } } + // add point to closest cluster cluster_add_point_fpt(cmin, point); // update all cluster centers for (size_t k = 0; k < nb_clusters; ++k) { +#ifdef DEBUG + assert(clusters[k] != NULL); + assert(clusters[k]->points != NULL); +#endif if (cluster_update_center_fpt(clusters[k])) { changed = true; } diff --git a/src/main.c b/src/main.c index c12fe2c..5df68ee 100644 --- a/src/main.c +++ b/src/main.c @@ -9,6 +9,10 @@ #include "linkedlist.h" #include "vector.h" +#ifdef DEBUG +#include <assert.h> +#endif + enum DistanceFunctionType { EUCLID = 0, MANHATTAN = 1, CHEBYSHEV = 2 @@ -60,7 +64,7 @@ void parse_args(int argc, char** argv, char** ipath, char** opath, enum Distance else if (strcmp(optarg, "int") == 0) *type = INT; break; case '?': - //TODO: perhaps add an "unknown option" message on stderr + fprintf(stderr, "UNKNOWN OPTION : %c", opt); break; default: // https://www.gnu.org/software/libc/manual/html_node/Example-of-Getopt.html @@ -71,36 +75,8 @@ void parse_args(int argc, char** argv, char** ipath, char** opath, enum Distance int main_int(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) { - // READ - FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin; - const size_t dim = io_read_int(ifile); - const size_t nb_clusters = io_read_int(ifile); - if (0 == dim) { - printf("DIMENSION MUST BE STRICTLY POSITIVE !\n"); - fclose(ifile); - return EXIT_FAILURE; - } - if (0 == nb_clusters) { - printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n"); - fclose(ifile); - return EXIT_FAILURE; - } - list_points_int_t* list = io_get_vector_list_int(ifile, dim); - fclose(ifile); - ifile = NULL; - const size_t point_count = list->size; - vector_int_t** points = list_points_to_array_int(list); - list_points_destroy_int(list, false); - list = NULL; - // ALGORITHM - cluster_int_t** clusters = kmeans_init_clusters_int((const vector_int_t**) points, point_count, nb_clusters); - kmeans_int(points, point_count, clusters, nb_clusters, DIST_FUNC_INT[dist_func_type]); - // WRITE - FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout; - fprintf(ofile, "%lud\n%lud\n", dim, nb_clusters); - io_write_clusters_to_file_int(ofile, clusters, point_count); - fclose(ofile); - return EXIT_SUCCESS; + //TODO + return EXIT_FAILURE; } int main_fpt(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) { @@ -126,8 +102,15 @@ int main_fpt(const char* ipath, const char* opath, const enum DistanceFunctionTy list_points_destroy_fpt(list, false); list = NULL; // ALGORITHM + printf("INIT: ... "); cluster_fpt_t** clusters = kmeans_init_clusters_fpt((const vector_fpt_t**) points, point_count, nb_clusters); + printf("DONE\n"); +#ifdef DEBUG + for(size_t i = 0; i < nb_clusters; ++i) assert(clusters[i] !=NULL); +#endif + printf("STARTING KMEANS ALGORITHM: ...\n"); kmeans_fpt(points, point_count, clusters, nb_clusters, DIST_FUNC_FPT[dist_func_type]); + printf("KMEANS DONE !\n"); // WRITE FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout; fprintf(ofile, "%lud\n%lud\n", dim, nb_clusters); @@ -146,8 +129,10 @@ int main(int argc, char** argv) { parse_args(argc, argv, &ipath, &opath, &disttype, &datatype); switch (datatype) { case FLOAT: + printf("FLOAT\n"); return main_fpt(ipath, opath, disttype); case INT: + printf("INT\n"); return main_int(ipath, opath, disttype); default: abort(); diff --git a/src/vector.c b/src/vector.c index b70c9f1..6d31b50 100644 --- a/src/vector.c +++ b/src/vector.c @@ -111,3 +111,26 @@ void vector_div_inplace_fpt(vector_fpt_t* v, const fpt_t a) { if (NULL == v) return; for (size_t i = 0; i < v->dim; ++i) v->data[i] /= a; } + + +void vector_print_int(const vector_int_t* v) { + if (NULL == v) printf("NULL"); + else { + printf("%ld", v->data[0]); + for (size_t p = 1; p < v->dim; ++p) { + printf(" , %ld", v->data[p]); + } + printf("\n"); + } +} + +void vector_print_fpt(const vector_fpt_t* v) { + if (NULL == v) printf("NULL"); + else { + printf("%lf", v->data[0]); + for (size_t p = 1; p < v->dim; ++p) { + printf(" , %lf", v->data[p]); + } + printf("\n"); + } +} diff --git a/src/vector.h b/src/vector.h index 155fb78..7eed5ce 100644 --- a/src/vector.h +++ b/src/vector.h @@ -54,4 +54,9 @@ void vector_div_inplace_int(vector_int_t* v, const int_t a); void vector_div_inplace_fpt(vector_fpt_t* v, const fpt_t a); +void vector_print_int(const vector_int_t* v); + +void vector_print_fpt(const vector_fpt_t* v); + + #endif //PROG_KMEANS_VECTOR_H diff --git a/test/data.txt b/test/data.txt new file mode 100644 index 0000000..ffea229 --- /dev/null +++ b/test/data.txt @@ -0,0 +1,7 @@ +3 +3 +13, 6, 7 +100.5, 78.32, 1012.34 +-1, -1, -1 +14.2, 5.7, 7.56 +99, 79, 1011 -- GitLab