Skip to content
Snippets Groups Projects
Commit 0d07376d authored by Boris Stefanovic
Browse files

DEBUG: removed cause of infinite loop in kmeans_init_clusters

parent 4a39e944
No related branches found
No related tags found
No related merge requests found
...@@ -14,7 +14,7 @@ OBJ := $(patsubst ${SRC_DIR}/%.c,${BUILD_DIR}/%.o,${SRC}) ...@@ -14,7 +14,7 @@ OBJ := $(patsubst ${SRC_DIR}/%.c,${BUILD_DIR}/%.o,${SRC})
DEBUG_BUILD_DIR := ${BUILD_ROOT}/debug DEBUG_BUILD_DIR := ${BUILD_ROOT}/debug
DEBUG_TARGET := ${DEBUG_BUILD_DIR}/debug DEBUG_TARGET := ${DEBUG_BUILD_DIR}/debug
DEBUG_CFLAGS := ${CFLAGS} -fsanitize=address -fsanitize=leak -g -DDEBUG DEBUG_CFLAGS := ${CFLAGS} -fsanitize=address -fsanitize=leak -g -DDEBUG
DEBUG_LDEXTRA := DEBUG_LDEXTRA := ${LDEXTRA}
DEBUG_LDFLAGS := ${DEBUG_CFLAGS} ${DEBUG_LDEXTRA} DEBUG_LDFLAGS := ${DEBUG_CFLAGS} ${DEBUG_LDEXTRA}
DEBUG_OBJ := $(patsubst ${SRC_DIR}/%.c,${DEBUG_BUILD_DIR}/%.o,${SRC}) DEBUG_OBJ := $(patsubst ${SRC_DIR}/%.c,${DEBUG_BUILD_DIR}/%.o,${SRC})
...@@ -22,7 +22,7 @@ DEBUG_OBJ := $(patsubst ${SRC_DIR}/%.c,${DEBUG_BUILD_DIR}/%.o,${SRC}) ...@@ -22,7 +22,7 @@ DEBUG_OBJ := $(patsubst ${SRC_DIR}/%.c,${DEBUG_BUILD_DIR}/%.o,${SRC})
# TARGETS # TARGETS
all: ${TARGET} ${TARGET_DEBUG} all: ${TARGET} ${DEBUG_TARGET}
${TARGET}: ${OBJ} ${TARGET}: ${OBJ}
${CC} ${LDFLAGS} -o $@ $^ ${CC} ${LDFLAGS} -o $@ $^
...@@ -53,7 +53,7 @@ clean: ...@@ -53,7 +53,7 @@ clean:
rm -rf ${BUILD_ROOT} rm -rf ${BUILD_ROOT}
debug: ${DEBUG_TARGET} debug: ${DEBUG_TARGET}
./$< ./$< -i test/data.txt
exec: ${TARGET} exec: ${TARGET}
./$< ./$<
...@@ -2,6 +2,10 @@ ...@@ -2,6 +2,10 @@
#include <stdbool.h> #include <stdbool.h>
#include "vector.h" #include "vector.h"
#ifdef DEBUG
#include <assert.h>
#endif
cluster_int_t* cluster_create_int(vector_int_t* center) { cluster_int_t* cluster_create_int(vector_int_t* center) {
cluster_int_t* cluster = malloc(sizeof(cluster_int_t)); cluster_int_t* cluster = malloc(sizeof(cluster_int_t));
...@@ -47,30 +51,8 @@ void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) { ...@@ -47,30 +51,8 @@ void cluster_add_point_fpt(cluster_fpt_t* cluster, vector_fpt_t* point) {
bool cluster_update_center_int(cluster_int_t* cluster) { bool cluster_update_center_int(cluster_int_t* cluster) {
// save old center //TODO
vector_int_t* old_center = cluster->center; return true;
// create new center
list_points_node_int_t* node = cluster->points->head;
cluster->center = vector_create_int(node->point->dim);
// sum all values in center
while (node != NULL) {
vector_add_inplace_int(cluster->center, *(node->point));
node = node->next;
}
// divide by number of points
vector_div_inplace_int(cluster->center, (int_t) cluster->points->size);
// check whether center has changed
bool changed = false;
for (size_t p = 0; p < cluster->center->dim; ++p) {
if (cluster->center->data[p] != old_center->data[p]) {
changed = true;
break;
}
}
// destroy old center
vector_destroy_int(old_center);
// return true if center has changed
return changed;
} }
bool cluster_update_center_fpt(cluster_fpt_t* cluster) { bool cluster_update_center_fpt(cluster_fpt_t* cluster) {
...@@ -78,6 +60,10 @@ bool cluster_update_center_fpt(cluster_fpt_t* cluster) { ...@@ -78,6 +60,10 @@ bool cluster_update_center_fpt(cluster_fpt_t* cluster) {
vector_fpt_t* old_center = cluster->center; vector_fpt_t* old_center = cluster->center;
// create new center // create new center
list_points_node_fpt_t* node = cluster->points->head; list_points_node_fpt_t* node = cluster->points->head;
// if cluster is empty
if (NULL == node) {
return false; // center has not been changed
} else {
cluster->center = vector_create_fpt(node->point->dim); cluster->center = vector_create_fpt(node->point->dim);
// sum all values in center // sum all values in center
while (node != NULL) { while (node != NULL) {
...@@ -99,6 +85,7 @@ bool cluster_update_center_fpt(cluster_fpt_t* cluster) { ...@@ -99,6 +85,7 @@ bool cluster_update_center_fpt(cluster_fpt_t* cluster) {
// return true if center has changed // return true if center has changed
return changed; return changed;
} }
}
void cluster_reset_int(cluster_int_t* cluster) { void cluster_reset_int(cluster_int_t* cluster) {
......
...@@ -56,9 +56,10 @@ list_points_int_t* io_get_vector_list_int(FILE* ifile, const size_t dim) { ...@@ -56,9 +56,10 @@ list_points_int_t* io_get_vector_list_int(FILE* ifile, const size_t dim) {
if (len != 0) { if (len != 0) {
vector_int_t* vector = io_line_to_vector_int(line, dim); vector_int_t* vector = io_line_to_vector_int(line, dim);
list_points_append_int(list, vector); list_points_append_int(list, vector);
free(line); //free(line);
} }
} }
free(line);
return list; return list;
} }
...@@ -70,9 +71,10 @@ list_points_fpt_t* io_get_vector_list_fpt(FILE* ifile, const size_t dim) { ...@@ -70,9 +71,10 @@ list_points_fpt_t* io_get_vector_list_fpt(FILE* ifile, const size_t dim) {
if (len != 0) { if (len != 0) {
vector_fpt_t* vector = io_line_to_vector_fpt(line, dim); vector_fpt_t* vector = io_line_to_vector_fpt(line, dim);
list_points_append_fpt(list, vector); list_points_append_fpt(list, vector);
free(line); //free(line);
} }
} }
free(line);
return list; return list;
} }
......
#include "kmeans.h" #include "kmeans.h"
#include "vector.h"
#ifdef DEBUG
#include <assert.h>
#include "io.h"
#endif
cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) { #define EPSILON 0.001
// check args and init
if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
cluster_int_t** clusters = calloc(nclusters, sizeof(vector_int_t*)); fpt_t abs_fpt(const fpt_t x) {
if (NULL == clusters) return NULL; return x >= 0 ? x : -x;
// determine range in which we are working
vector_int_t* min = vector_copy_int(points[0]);
vector_int_t* max = vector_copy_int(points[0]);
for (size_t i = 1; i < point_count; ++i) {
for (size_t p = 0; p < max->dim; ++p) {
const int_t value = points[i]->data[p];
if (value < min->data[p]) min->data[p] = value;
if (value > max->data[p]) max->data[p] = value;
}
} }
// until we have enough centers
for (size_t i = 0; i < nclusters; ++i) {
vector_int_t* center = vector_create_int(max->dim); cluster_int_t** kmeans_init_clusters_int(const vector_int_t** points, const size_t point_count, const size_t nclusters) {
bool valid = false; //TODO
while (!valid) { return NULL;
// initialise center values randomly, within the "polygon" of our set of points
for (size_t p = 0; p < center->dim; ++p) {
center->data[p] = rand_int_range(min->data[p], max->data[p] + 1);
} }
// check center is not already in clusters, although probability is extremely low...
bool is_vector_in_centers_fpt(const vector_fpt_t* center, const cluster_fpt_t** clusters, const size_t i) {
for (size_t k = 0; k < i; ++k) { for (size_t k = 0; k < i; ++k) {
vector_int_t* kth_center = clusters[k]->center; if (vector_equals_fpt(clusters[k]->center, center)) {
bool neq = false; return true;
for (size_t p = 0; p < center->dim; ++p) {
if (center->data[p] != kth_center->data[p]) {
neq = true;
break;
}
}
if (neq) {
valid = true;
} }
} }
} return false;
clusters[i]->center = center;
}
return clusters;
} }
cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) { cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size_t point_count, const size_t nclusters) {
// check args and init // check args and init
if (NULL == points || point_count < 2 || nclusters < 2) return NULL; if (NULL == points || point_count < 2 || nclusters < 2) return NULL;
cluster_fpt_t** clusters = calloc(nclusters, sizeof(vector_fpt_t*)); cluster_fpt_t** clusters = calloc(nclusters, sizeof(vector_fpt_t*));
if (NULL == clusters) return NULL; if (NULL == clusters) return NULL;
for (size_t k = 0; k < nclusters; ++k) {
clusters[k] = cluster_create_fpt(NULL);
}
// determine range in which we are working // determine range in which we are working
vector_fpt_t* min = vector_copy_fpt(points[0]); vector_fpt_t* min = vector_copy_fpt(points[0]);
vector_fpt_t* max = vector_copy_fpt(points[0]); vector_fpt_t* max = vector_copy_fpt(points[0]);
...@@ -65,26 +53,14 @@ cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size ...@@ -65,26 +53,14 @@ cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size
vector_fpt_t* center = vector_create_fpt(max->dim); vector_fpt_t* center = vector_create_fpt(max->dim);
bool valid = false; bool valid = false;
while (!valid) { while (!valid) {
// initialise center values randomly, within the "polygon" of our set of points // initialise center values randomly, within the "multidimensional rectangle" of our set of points
for (size_t p = 0; p < center->dim; ++p) { for (size_t p = 0; p < center->dim; ++p) {
center->data[p] = rand_double_range(min->data[p], max->data[p]); center->data[p] = rand_double_range(min->data[p], max->data[p]);
} }
// check center is not already in clusters, although probability is extremely low... // check center is not already in clusters, although probability is extremely low...
for (size_t k = 0; k < i; ++k) { valid = !is_vector_in_centers_fpt(center, (const cluster_fpt_t**) clusters, i);
vector_fpt_t* kth_center = clusters[k]->center;
bool neq = false;
for (size_t p = 0; p < center->dim; ++p) {
if (center->data[p] != kth_center->data[p]) {
neq = true;
break;
}
}
if (neq) {
valid = true;
}
}
} }
clusters[i]->center = center; clusters[i] = cluster_create_fpt(center);
} }
return clusters; return clusters;
} }
...@@ -92,41 +68,24 @@ cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size ...@@ -92,41 +68,24 @@ cluster_fpt_t** kmeans_init_clusters_fpt(const vector_fpt_t** points, const size
void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters, void kmeans_int(vector_int_t** points, const size_t point_count, cluster_int_t** clusters, const size_t nb_clusters,
fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) { fpt_t (* distance_function)(const vector_int_t*, const vector_int_t*)) {
bool changed = true; //TODO
while (changed) {
changed = false;
for (size_t i = 0; i < point_count; ++i) {
vector_int_t* point = points[i];
// find closest cluster and add point to it
cluster_int_t* cmin = clusters[0];
int_t dmin = distance_function(point, cmin->center);
for (size_t k = 1; k < nb_clusters; ++k) {
cluster_int_t* current_cluster = clusters[k];
fpt_t dist = distance_function(point, current_cluster->center);
if (dist < dmin) {
cmin = current_cluster;
dmin = dist;
}
}
cluster_add_point_int(cmin, point);
// update all cluster centers
for (size_t k = 0; k < nb_clusters; ++k) {
if (cluster_update_center_int(clusters[k])) {
changed = true;
}
}
}
}
} }
void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters, void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** clusters, const size_t nb_clusters,
fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)) { fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)) {
bool changed = true; bool changed = true;
while (changed) { while (changed) {
// reset condition
changed = false; changed = false;
// empty all clusters, keeping only their centers (virtual)
for (size_t k = 0; k < nb_clusters; ++k) {
cluster_reset_fpt(clusters[k]);
}
// for each point
for (size_t i = 0; i < point_count; ++i) { for (size_t i = 0; i < point_count; ++i) {
vector_fpt_t* point = points[i]; vector_fpt_t* point = points[i];
// find closest cluster and add point to it // find closest cluster
cluster_fpt_t* cmin = clusters[0]; cluster_fpt_t* cmin = clusters[0];
fpt_t dmin = distance_function(point, cmin->center); fpt_t dmin = distance_function(point, cmin->center);
for (size_t k = 1; k < nb_clusters; ++k) { for (size_t k = 1; k < nb_clusters; ++k) {
...@@ -137,9 +96,14 @@ void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t** ...@@ -137,9 +96,14 @@ void kmeans_fpt(vector_fpt_t** points, const size_t point_count, cluster_fpt_t**
dmin = dist; dmin = dist;
} }
} }
// add point to closest cluster
cluster_add_point_fpt(cmin, point); cluster_add_point_fpt(cmin, point);
// update all cluster centers // update all cluster centers
for (size_t k = 0; k < nb_clusters; ++k) { for (size_t k = 0; k < nb_clusters; ++k) {
#ifdef DEBUG
assert(clusters[k] != NULL);
assert(clusters[k]->points != NULL);
#endif
if (cluster_update_center_fpt(clusters[k])) { if (cluster_update_center_fpt(clusters[k])) {
changed = true; changed = true;
} }
......
...@@ -9,6 +9,10 @@ ...@@ -9,6 +9,10 @@
#include "linkedlist.h" #include "linkedlist.h"
#include "vector.h" #include "vector.h"
#ifdef DEBUG
#include <assert.h>
#endif
enum DistanceFunctionType { enum DistanceFunctionType {
EUCLID = 0, MANHATTAN = 1, CHEBYSHEV = 2 EUCLID = 0, MANHATTAN = 1, CHEBYSHEV = 2
...@@ -60,7 +64,7 @@ void parse_args(int argc, char** argv, char** ipath, char** opath, enum Distance ...@@ -60,7 +64,7 @@ void parse_args(int argc, char** argv, char** ipath, char** opath, enum Distance
else if (strcmp(optarg, "int") == 0) *type = INT; else if (strcmp(optarg, "int") == 0) *type = INT;
break; break;
case '?': case '?':
//TODO: perhaps add an "unknown option" message on stderr fprintf(stderr, "UNKNOWN OPTION : %c", opt);
break; break;
default: default:
// https://www.gnu.org/software/libc/manual/html_node/Example-of-Getopt.html // https://www.gnu.org/software/libc/manual/html_node/Example-of-Getopt.html
...@@ -71,37 +75,9 @@ void parse_args(int argc, char** argv, char** ipath, char** opath, enum Distance ...@@ -71,37 +75,9 @@ void parse_args(int argc, char** argv, char** ipath, char** opath, enum Distance
int main_int(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) { int main_int(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) {
// READ //TODO
FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin;
const size_t dim = io_read_int(ifile);
const size_t nb_clusters = io_read_int(ifile);
if (0 == dim) {
printf("DIMENSION MUST BE STRICTLY POSITIVE !\n");
fclose(ifile);
return EXIT_FAILURE;
}
if (0 == nb_clusters) {
printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n");
fclose(ifile);
return EXIT_FAILURE; return EXIT_FAILURE;
} }
list_points_int_t* list = io_get_vector_list_int(ifile, dim);
fclose(ifile);
ifile = NULL;
const size_t point_count = list->size;
vector_int_t** points = list_points_to_array_int(list);
list_points_destroy_int(list, false);
list = NULL;
// ALGORITHM
cluster_int_t** clusters = kmeans_init_clusters_int((const vector_int_t**) points, point_count, nb_clusters);
kmeans_int(points, point_count, clusters, nb_clusters, DIST_FUNC_INT[dist_func_type]);
// WRITE
FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout;
fprintf(ofile, "%lud\n%lud\n", dim, nb_clusters);
io_write_clusters_to_file_int(ofile, clusters, point_count);
fclose(ofile);
return EXIT_SUCCESS;
}
int main_fpt(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) { int main_fpt(const char* ipath, const char* opath, const enum DistanceFunctionType dist_func_type) {
// READ // READ
...@@ -126,8 +102,15 @@ int main_fpt(const char* ipath, const char* opath, const enum DistanceFunctionTy ...@@ -126,8 +102,15 @@ int main_fpt(const char* ipath, const char* opath, const enum DistanceFunctionTy
list_points_destroy_fpt(list, false); list_points_destroy_fpt(list, false);
list = NULL; list = NULL;
// ALGORITHM // ALGORITHM
printf("INIT: ... ");
cluster_fpt_t** clusters = kmeans_init_clusters_fpt((const vector_fpt_t**) points, point_count, nb_clusters); cluster_fpt_t** clusters = kmeans_init_clusters_fpt((const vector_fpt_t**) points, point_count, nb_clusters);
printf("DONE\n");
#ifdef DEBUG
for(size_t i = 0; i < nb_clusters; ++i) assert(clusters[i] !=NULL);
#endif
printf("STARTING KMEANS ALGORITHM: ...\n");
kmeans_fpt(points, point_count, clusters, nb_clusters, DIST_FUNC_FPT[dist_func_type]); kmeans_fpt(points, point_count, clusters, nb_clusters, DIST_FUNC_FPT[dist_func_type]);
printf("KMEANS DONE !\n");
// WRITE // WRITE
FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout; FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout;
fprintf(ofile, "%lud\n%lud\n", dim, nb_clusters); fprintf(ofile, "%lud\n%lud\n", dim, nb_clusters);
...@@ -146,8 +129,10 @@ int main(int argc, char** argv) { ...@@ -146,8 +129,10 @@ int main(int argc, char** argv) {
parse_args(argc, argv, &ipath, &opath, &disttype, &datatype); parse_args(argc, argv, &ipath, &opath, &disttype, &datatype);
switch (datatype) { switch (datatype) {
case FLOAT: case FLOAT:
printf("FLOAT\n");
return main_fpt(ipath, opath, disttype); return main_fpt(ipath, opath, disttype);
case INT: case INT:
printf("INT\n");
return main_int(ipath, opath, disttype); return main_int(ipath, opath, disttype);
default: default:
abort(); abort();
......
...@@ -111,3 +111,26 @@ void vector_div_inplace_fpt(vector_fpt_t* v, const fpt_t a) { ...@@ -111,3 +111,26 @@ void vector_div_inplace_fpt(vector_fpt_t* v, const fpt_t a) {
if (NULL == v) return; if (NULL == v) return;
for (size_t i = 0; i < v->dim; ++i) v->data[i] /= a; for (size_t i = 0; i < v->dim; ++i) v->data[i] /= a;
} }
/**
 * Print the components of an integer vector on stdout.
 *
 * Components are separated by " , " and the line is terminated by '\n',
 * e.g. "1 , 2 , 3\n". A NULL vector prints the literal text "NULL" with no
 * trailing newline. A vector whose dim is 0 prints only the newline.
 *
 * @param v vector to print; may be NULL.
 */
void vector_print_int(const vector_int_t* v) {
    if (NULL == v) {
        printf("NULL");
        return;
    }
    // Iterate from index 0 so that an empty vector (dim == 0) never reads data[0].
    for (size_t p = 0; p < v->dim; ++p) {
        if (p > 0) {
            printf(" , ");
        }
        // NOTE(review): "%ld" assumes int_t is long-compatible -- confirm against its typedef.
        printf("%ld", v->data[p]);
    }
    printf("\n");
}
/**
 * Print the components of a floating-point vector on stdout.
 *
 * Components are separated by " , " and the line is terminated by '\n',
 * e.g. "1.000000 , 2.500000\n". A NULL vector prints the literal text "NULL"
 * with no trailing newline. A vector whose dim is 0 prints only the newline.
 *
 * @param v vector to print; may be NULL.
 */
void vector_print_fpt(const vector_fpt_t* v) {
    if (NULL == v) {
        printf("NULL");
        return;
    }
    // Iterate from index 0 so that an empty vector (dim == 0) never reads data[0].
    for (size_t p = 0; p < v->dim; ++p) {
        if (p > 0) {
            printf(" , ");
        }
        // NOTE(review): "%lf" assumes fpt_t is double (or promotes to it) -- confirm against its typedef.
        printf("%lf", v->data[p]);
    }
    printf("\n");
}
...@@ -54,4 +54,9 @@ void vector_div_inplace_int(vector_int_t* v, const int_t a); ...@@ -54,4 +54,9 @@ void vector_div_inplace_int(vector_int_t* v, const int_t a);
void vector_div_inplace_fpt(vector_fpt_t* v, const fpt_t a); void vector_div_inplace_fpt(vector_fpt_t* v, const fpt_t a);
void vector_print_int(const vector_int_t* v);
void vector_print_fpt(const vector_fpt_t* v);
#endif //PROG_KMEANS_VECTOR_H #endif //PROG_KMEANS_VECTOR_H
3
3
13, 6, 7
100.5, 78.32, 1012.34
-1, -1, -1
14.2, 5.7, 7.56
99, 79, 1011
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment