diff --git a/doc/kmeans.md b/doc/kmeans.md index dd06909e5805cd64fcf021f41779b180b98b7edd..7efdcf92730fa4e97103f9159e6bb6c8caf8ddb5 100644 --- a/doc/kmeans.md +++ b/doc/kmeans.md @@ -6,18 +6,82 @@ theme: "Frankfurt" geometry: "margin=40mm" mainfont: DejaVu Sans header-includes: + - \usepackage{float} - \let\origfigure\figure - \let\endorigfigure\endfigure - \renewenvironment{figure}[1][2] {\expandafter\origfigure\expandafter[H]} {\endorigfigure} + --- -\newpage +# Structures de Données + +## Point + +- chaque point est un vecteur +- types entiers et virgule flottante séparés +- "common.h" contient les définitions de `int_t` et `fpt_t` + +```c +typedef struct vector_int_t_ { + size_t dim; + int_t* data; +} vector_int_t; + +typedef struct vector_fpt_t_ { + size_t dim; + fpt_t* data; +} vector_fpt_t; +``` + +## Faciliter l'Association à un Cluster + +- structure vecteur vue précédemment générale +- on veut associer chaque point à un cluster +- le cluster auquel chaque point appartient change au cours de l'algorithme + - stocker les points "dans" des structures "cluster" est peu judicieux + - on stocke un identifiant de cluster (un pointeur) dans une structure "point de cluster" +## Cluster +- un cluster peut être représenté par + - un identifiant: un pointeur, forcémment unique + - son centre: un point virtuel, la valeur derrière le pointeur -# Décisions +```c +typedef vector_int_t* cluster_int_t; +``` + +## Point de Cluster + +```c +typedef struct cluster_point_int { + const vector_int_t* vector; + cluster_int_t* cluster; +} cluster_point_int_t; +``` + +## Ensemble de Points + +- parcours répétés de l'ensemble de tous les points +- pas d'ordre particulier (sauf à l'initialisation des centroïdes) +- une liste simplement chaînée fera l'affaire + +```c +typedef struct ll_point_int_node { + const cluster_point_int_t* point; + struct ll_point_int_node* next; +} ll_point_int_node_t; + +typedef struct ll_point_int { + ll_point_int_node_t* head; + ll_point_int_node_t* tail; + size_t size; +} ll_point_int_t; +``` # Algorithmique # Implémentation + +# Démonstration diff --git a/src/cluster.c b/src/cluster.c index fe560c37ba9b752a0b2cb47ab562fc28659c37b1..42e6abbd187e8586273058a72654d964850f7ba6 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -3,6 +3,7 @@ // #include "cluster.h" +#include <stdlib.h> #include "vector.h" @@ -12,3 +13,10 @@ cluster_point_int_t* cluster_point_int_create(vector_int_t* vector) { elem->vector = vector; elem->cluster = NULL; } + + +void cluster_point_int_destroy(cluster_point_int_t* cp) { + if (NULL == cp) return; + vector_int_destroy(cp->vector); + free(cp); +} diff --git a/src/cluster.h b/src/cluster.h index be463ef4b070dbc1f448576bb830491fe47138cb..20085a7d10687c8fbfc10802e50de22e2cc98796 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -8,14 +8,16 @@ #include "vector.h" -typedef vector_int_t cluster_int_t; // a cluster may be represented by its center +typedef vector_int_t* cluster_int_t; // a cluster may be represented by its center typedef struct cluster_point_int { vector_int_t* vector; - cluster_int_t* cluster; // justified by "many-to-one" relationship and several passes over all points + cluster_int_t cluster; // justified by "many-to-one" relationship and several passes over all points } cluster_point_int_t; cluster_point_int_t* cluster_point_int_create(vector_int_t* vector); +void cluster_point_int_destroy(cluster_point_int_t* cp); + #endif //PROG_KMEANS_CLUSTER_H diff --git a/src/common.c b/src/common.c new file mode 100644 index 0000000000000000000000000000000000000000..25792882d0be61175f613ddf3700ff20ab3d9692 --- /dev/null +++ b/src/common.c @@ -0,0 +1,39 @@ +#include "common.h" +#include <stdbool.h> +#include <stdlib.h> +#include <time.h> + +bool randinit = false; + +inline void init_rand() { + srand(time(NULL)); + randinit = true; +} + +int rand_int(const int max) { + if (!randinit) init_rand(); + return rand() % max; +} + +int rand_int_range(int min, int max) { + if (min > max) { + int swap = min; + min = max; + max = swap; + } + return min + rand_int(max - min); +} + +double rand_double_range_one() { + if (!randinit) init_rand(); + return ((double) rand()) / ((double) RAND_MAX); +} + +double rand_double_range(double min, double max) { + if (min > max) { + double swap = min; + min = max; + max = swap; + } + return min + rand_double_range_one() * (max - min); +} diff --git a/src/common.h b/src/common.h index 5d079229665fe858fc92ea0eebe8d2809a043f7e..15787e99e3aa4c6d7eb8adf9de93b140f8ebbbaa 100644 --- a/src/common.h +++ b/src/common.h @@ -12,4 +12,13 @@ typedef int64_t int_t; typedef double fpt_t; +int rand_int(const int max); + +int rand_int_range(int min, int max); + +double rand_double_range_one(); + +double rand_double_range(double min, double max); + + #endif //PROG_KMEANS_COMMON_H diff --git a/src/distance.c b/src/distance.c index a854e6cb876ebd132d4302f36441f229b1707fb2..6459803128ca9ce94636cf1f4cca9bf40848e982 100644 --- a/src/distance.c +++ b/src/distance.c @@ -27,7 +27,7 @@ fpt_t distance_euclid_int(const vector_int_t* p1, const vector_int_t* p2) { int_t item = diff * diff; acc += item; } - return sqrt((double) acc); + return sqrt((fpt_t) acc); } fpt_t distance_manhattan_int(const vector_int_t* p1, const vector_int_t* p2) { @@ -38,7 +38,7 @@ fpt_t distance_manhattan_int(const vector_int_t* p1, const vector_int_t* p2) { int_t item = diff >= 0 ? diff : -diff; acc += item; } - return (double) acc; + return (fpt_t) acc; } fpt_t distance_chebyshev_int(const vector_int_t* p1, const vector_int_t* p2) { @@ -49,38 +49,38 @@ fpt_t distance_chebyshev_int(const vector_int_t* p1, const vector_int_t* p2) { item = abs_diff_int(p1->data[i], p2->data[i]); if (item > max) max = item; } - return (double) max; + return (fpt_t) max; } -fpt_t distance_euclid_fpt(const vector_int_t* p1, const vector_int_t* p2) { +fpt_t distance_euclid_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2) { if (p1->dim != p2->dim)return ERROR; - int_t acc = 0; + fpt_t acc = 0; for (size_t i = 0; i < p1->dim; ++i) { - int_t diff = p2->data[i] - p1->data[i]; - int_t item = diff * diff; + fpt_t diff = p2->data[i] - p1->data[i]; + fpt_t item = diff * diff; acc += item; } - return sqrt((double) acc); + return sqrt((fpt_t) acc); } -fpt_t distance_manhattan_fpt(const vector_int_t* p1, const vector_int_t* p2) { +fpt_t distance_manhattan_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2) { if (p1->dim != p2->dim)return ERROR; - int_t acc = 0; + fpt_t acc = 0; for (size_t i = 0; i < p1->dim; ++i) { - int_t diff = p2->data[i] - p1->data[i]; - int_t item = diff >= 0 ? diff : -diff; + fpt_t diff = p2->data[i] - p1->data[i]; + fpt_t item = diff >= 0 ? diff : -diff; acc += item; } - return (double) acc; + return (fpt_t) acc; } -fpt_t distance_chebyshev_fpt(const vector_int_t* p1, const vector_int_t* p2) { +fpt_t distance_chebyshev_fpt(const vector_fpt_t* p1, const vector_fpt_t* p2) { if (p1->dim != p2->dim)return ERROR; - int_t max = ERROR; - int_t item; + fpt_t max = ERROR; + fpt_t item; for (size_t i = 0; i < p1->dim; ++i) { - item = abs_diff_int(p1->data[i], p2->data[i]); + item = abs_diff_fpt(p1->data[i], p2->data[i]); if (item > max) max = item; } - return (double) max; + return (fpt_t) max; } diff --git a/src/kmeans.c b/src/kmeans.c index 21a9232b0252b49b559872850bae5867ed9118a8..26f7178839bf335851790f9238ade5617358cd7b 100644 --- a/src/kmeans.c +++ b/src/kmeans.c @@ -4,3 +4,39 @@ #include "kmeans.h" #include "cluster.h" + + +cluster_int_t* kmeans_init_clusters_int(const cluster_point_int_t** points, const size_t point_count, const size_t nclusters) { + if (nclusters < 2) return NULL; + if (NULL == points) return NULL; + cluster_int_t* clusters = calloc(nclusters, sizeof(cluster_int_t)); + if (NULL == clusters) return NULL; + // determine range in which we are working + vector_int_t* min = vector_int_copy(points[0]->vector); + vector_int_t* max = vector_int_copy(points[0]->vector); + for (size_t i = 0; i < point_count; ++i) { + for (size_t p = 0; p < max->dim; ++p) { + const int_t value = points[i]->vector->data[p]; + if (value < min->data[p]) min->data[p] = value; + if (value > max->data[p]) max->data[p] = value; + } + } + // until we have enough centers + for (size_t i = 0; i < nclusters; ++i) { + cluster_int_t center = vector_int_create(max->dim); + for (size_t p = 0; p < center->dim; ++p) { + center->data[p] = rand_int_range(min->data[p], max->data[p]); + } + // TODO: maybe check center is not already in clusters, although probability is extremely low... + clusters[i] = center; + } + return clusters; +} + + +void kmeans_int( + cluster_point_int_t** points, const size_t point_count, + cluster_int_t* clusters, const size_t nb_clusters, + fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)) { + //TODO +} diff --git a/src/kmeans.h b/src/kmeans.h index 100ef805a35e1d85154c99eee9a413e1e0f4bd35..a0d4ca35a2f2ec1e86112632c4ca45770192f77e 100644 --- a/src/kmeans.h +++ b/src/kmeans.h @@ -5,8 +5,16 @@ #ifndef PROG_KMEANS_KMEANS_H #define PROG_KMEANS_KMEANS_H +#include "cluster.h" +#include "linkedlist.h" -// + +cluster_int_t* kmeans_init_clusters_int(const cluster_point_int_t** points, const size_t point_count, const size_t nclusters); + +void kmeans_int( + cluster_point_int_t** points, const size_t point_count, + cluster_int_t* clusters, const size_t nb_clusters, + fpt_t (* distance_function)(const vector_fpt_t*, const vector_fpt_t*)); #endif //PROG_KMEANS_KMEANS_H diff --git a/src/linkedlist.c b/src/linkedlist.c index b85c0d23ab2967baad1465c73755e0de05ded3d6..1cfd9b1d4842906080cb1526dae9ffc5d4b836b6 100644 --- a/src/linkedlist.c +++ b/src/linkedlist.c @@ -3,34 +3,76 @@ // #include "linkedlist.h" +#include <assert.h> +#include <stdbool.h> +#include "cluster.h" -ll_vint_node_t* ll_vint_create_node(const vector_int_t* vec) { - ll_vint_node_t* node = malloc(sizeof(ll_vint_node_t)); +ll_point_int_node_t* ll_point_int_create_node(vector_int_t* vec) { + ll_point_int_node_t* node = malloc(sizeof(ll_point_int_node_t)); if (NULL == node) return NULL; - node->data = vec; + cluster_point_int_t* point = cluster_point_int_create(vec); + if (NULL == point) return NULL; + node->point = point; node->next = NULL; + return node; } -ll_vint_t* ll_vint_create() { - ll_vint_t* ll = NULL; - ll = malloc(sizeof(ll_vint_t)); +void ll_point_int_destroy_node(ll_point_int_node_t* node, const bool full) { + if (NULL == node) return; + if (full) cluster_point_int_destroy(node->point); + free(node); +} + + +ll_point_int_t* ll_point_int_create() { + ll_point_int_t* ll = NULL; + ll = malloc(sizeof(ll_point_int_t)); if (NULL == ll) return NULL; ll->head = NULL; ll->tail = NULL; + ll->size = 0; return ll; } -void ll_vint_append(ll_vint_t* list, const vector_int_t* vector) { +void ll_point_int_destroy(ll_point_int_t* list, const bool full) { + if (NULL == list) return; + ll_point_int_node_t* node; + while ((node = list->head) != NULL) { + list->head = node->next; + ll_point_int_destroy_node(node, full); + } + free(list); +} + + +void ll_point_int_append(ll_point_int_t* list, vector_int_t* vector) { if (NULL == vector) return; - ll_vint_node_t* node = ll_vint_create_node(vector); - if (NULL == list->head) { + ll_point_int_node_t* node = ll_point_int_create_node(vector); + if (NULL == list->head) { // if list is empty list->head = node; list->tail = list->head; - list->head->next = NULL; } else { - // TODO + list->tail->next = node; + list->tail = node; + } + list->size++; +} + + +cluster_point_int_t** ll_point_int_to_array(const ll_point_int_t* list, size_t* size_ptr) { + cluster_point_int_t** a = calloc(list->size, sizeof(cluster_point_int_t*)); + if (NULL == a) return NULL; + ll_point_int_node_t* cur = list->head; + size_t idx = 0; + while (cur != NULL) { + a[idx] = cur->point; + cur = cur->next; + ++idx; } + assert(idx == list->size); + if (size_ptr != NULL) *size_ptr = list->size; + return a; } diff --git a/src/linkedlist.h b/src/linkedlist.h index 4141617368d6932cd8702da323e13532734a9842..8f4250d38e65792ab175fbe107303fa0799ffdb8 100644 --- a/src/linkedlist.h +++ b/src/linkedlist.h @@ -5,25 +5,33 @@ #ifndef PROG_KMEANS_LINKEDLIST_H #define PROG_KMEANS_LINKEDLIST_H +#include <stdbool.h> +#include "cluster.h" #include "vector.h" -typedef struct ll_vector_int_node { - const vector_int_t* data; - struct ll_vector_int_node* next; -} ll_vint_node_t; +typedef struct ll_point_int_node { + cluster_point_int_t* point; + struct ll_point_int_node* next; +} ll_point_int_node_t; -typedef struct ll_vector_int { - ll_vint_node_t* head; - ll_vint_node_t* tail; +typedef struct ll_point_int { + ll_point_int_node_t* head; + ll_point_int_node_t* tail; size_t size; -} ll_vint_t; +} ll_point_int_t; -ll_vint_node_t* ll_vint_create_node(const vector_int_t* vec); +ll_point_int_node_t* ll_point_int_create_node(vector_int_t* vec); -ll_vint_t* ll_vint_create(); +void ll_point_int_destroy_node(ll_point_int_node_t* node, const bool full); -void ll_vint_append(ll_vint_t* list, const vector_int_t* vector); +ll_point_int_t* ll_point_int_create(); + +void ll_point_int_destroy(ll_point_int_t* list, const bool full); + +void ll_point_int_append(ll_point_int_t* list, vector_int_t* vector); + +cluster_point_int_t** ll_point_int_to_array(const ll_point_int_t* list, size_t* size_ptr); #endif //PROG_KMEANS_LINKEDLIST_H diff --git a/src/main.c b/src/main.c index 60dc781341d6417d22b06c9a0c7298a779d2950c..754cdc71911bbcdff0f9c19cfd1609c0398736bc 100644 --- a/src/main.c +++ b/src/main.c @@ -1,18 +1,20 @@ #define _GNU_SOURCE -#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include "common.h" +#include "kmeans.h" #include "linkedlist.h" #include "vector.h" + void help(const char* callname) { fprintf(stderr, "\nUSAGE: %s <INPUT_FILE> <OUTPUT_FILE>\n", callname); } + int_t read_int(FILE* file) { char* line; size_t len; @@ -20,45 +22,28 @@ int_t read_int(FILE* file) { return strtol(line, NULL, 10); } -bool read_vector_int(FILE* file, vector_int_t* vector) { - // procure line - char* line = NULL; - size_t len = 0; - getline(&line, &len, file); - if (len == 0) return false; - // tokenise - char* toktgt = line; - char* token = NULL; - for (size_t i = 0; i < vector->dim; ++i, toktgt = NULL) { - token = strtok(toktgt, ","); - // strtol returns 0 if number not read; desired behaviour: - vector->data[i] = token != NULL ? strtol(token, NULL, 10) : 0; - } - free(line); - return true; -} vector_int_t* line_to_vector_int(char* line, const size_t dim) { - vector_int_t* vector = vector_int_create_zero(dim); + vector_int_t* vector = vector_int_create(dim); char* tgt = line; char* token = NULL; for (size_t i = 0; i < vector->dim; ++i, tgt = NULL) { token = strtok(tgt, ","); - // strtol returns 0 if number not read; desired behaviour: + // strtol returns 0 if number not read, which is the desired behaviour: vector->data[i] = token != NULL ? strtol(token, NULL, 10) : 0; } return vector; } -ll_vint_t* get_vector_list(FILE* ifile, const size_t dim) { - ll_vint_t* list = ll_vint_create(); +ll_point_int_t* get_vector_list_int(FILE* ifile, const size_t dim) { + ll_point_int_t* list = ll_point_int_create(); char* line = NULL; size_t len = 0; while (getline(&line, &len, ifile) != -1) { if (len != 0) { vector_int_t* vector = line_to_vector_int(line, dim); - ll_vint_append(list, vector); + ll_point_int_append(list, vector); free(line); } } @@ -81,18 +66,24 @@ int main(int argc, char** argv) { // READ FILE* ifile = ipath != NULL ? fopen(ipath, "r") : stdin; const size_t dim = read_int(ifile); - const int_t nclusters = read_int(ifile); + const size_t cluster_count = read_int(ifile); // k if (0 <= dim) { printf("DIMENSION MUST BE STRICTLY POSITIVE !\n"); return EXIT_FAILURE; } - if (0 == nclusters) { + if (0 <= cluster_count) { printf("NUMBER OF CLUSTERS MUST BE STRICTLY POSITIVE !\n"); return EXIT_FAILURE; } - ll_vint_t* list = get_vector_list(ifile, dim); + ll_point_int_t* list = get_vector_list_int(ifile, dim); + size_t count; + const cluster_point_int_t** points = ll_point_int_to_array(list, &count); + ll_point_int_destroy(list, false); + list = NULL; // ALGORITHM // TODO + // init clusters + cluster_int_t* clusters = kmeans_init_clusters_int(points, count, cluster_count); // WRITE FILE* ofile = opath != NULL ? fopen(opath, "w") : stdout; // TODO diff --git a/src/vector.c b/src/vector.c index 494530f301baa6be8c0a23f642281f6d304c0ac7..46e57464f4ace8494c63055f830cad90262559c0 100644 --- a/src/vector.c +++ b/src/vector.c @@ -3,37 +3,70 @@ // #include "vector.h" +#include <stdbool.h> #include <stdlib.h> -vector_int_t* vector_int_create(const size_t dim, const int_t* data) { +vector_int_t* vector_int_create(const size_t dim) { vector_int_t* v; - if ((v = calloc(dim, sizeof(int_t))) == NULL) return NULL; + if ((v = malloc(sizeof(vector_int_t))) == NULL) return NULL; v->dim = dim; - for (size_t i = 0; i < dim; ++i) v->data[i] = data[i]; + v->data = calloc(dim, sizeof(int_t)); return v; } -vector_fpt_t* vector_fpt_create(const size_t dim, const fpt_t* data) { + +vector_fpt_t* vector_fpt_create(const size_t dim) { vector_fpt_t* v; - if ((v = calloc(dim, sizeof(fpt_t))) == NULL) return NULL; + if ((v = malloc(sizeof(vector_fpt_t))) == NULL) return NULL; v->dim = dim; - for (size_t i = 0; i < dim; ++i) v->data[i] = data[i]; + v->data = calloc(dim, sizeof(fpt_t)); return v; } -vector_int_t* vector_int_create_zero(const size_t dim) { - vector_int_t* v; - if ((v = calloc(dim, sizeof(int_t))) == NULL) return NULL; - v->dim = dim; - for (size_t i = 0; i < dim; ++i) v->data[i] = 0; - return v; + +void vector_int_destroy(vector_int_t* vp) { + if (NULL == vp) return; + free(vp->data); + free(vp); } -vector_fpt_t* vector_fpt_create_zero(const size_t dim) { - vector_fpt_t* v; - if ((v = calloc(dim, sizeof(fpt_t))) == NULL) return NULL; - v->dim = dim; - for (size_t i = 0; i < dim; ++i) v->data[i] = 0.0; - return v; + +void vector_fpt_destroy(vector_int_t* vp) { + if (NULL == vp) return; + free(vp->data); + free(vp); +} + + +vector_int_t* vector_int_copy(const vector_int_t* v) { + if (NULL == v) return NULL; + vector_int_t* c = vector_int_create(v->dim); + if (NULL == c) return NULL; + for (size_t i = 0; i < v->dim; ++i) { + c->data[i] = v->data[i]; + } + return c; +} + + +bool vector_int_equals(const vector_int_t* v1, const vector_int_t* v2) { + if (v1->dim != v2->dim) return false; + for (size_t i = 0; i < v1->dim; ++i) { + if (v1->data[i] != v2->data[i]) { + return false; + } + } + return true; +} + + +bool vector_fpt_equals(const vector_fpt_t* v1, const vector_fpt_t* v2) { + if (v1->dim != v2->dim) return false; + for (size_t i = 0; i < v1->dim; ++i) { + if (v1->data[i] != v2->data[i]) { + return false; + } + } + return true; } diff --git a/src/vector.h b/src/vector.h index 5ca1d546972900a088e3c651442f7b3cc4bf306d..6b8f665698ea28aa488877e6ba460d6dd99eef1c 100644 --- a/src/vector.h +++ b/src/vector.h @@ -10,6 +10,7 @@ * e.g. scientific measurements (floating point) and image data (integer). */ +#include <stdbool.h> #include <stdlib.h> #include "common.h" @@ -24,13 +25,19 @@ typedef struct vector_fpt_t_ { fpt_t* data; } vector_fpt_t; -vector_int_t* vector_int_create(size_t dim, const int_t* data); +vector_int_t* vector_int_create(const size_t dim); -vector_fpt_t* vector_fpt_create(size_t dim, const fpt_t* data); +vector_fpt_t* vector_fpt_create(const size_t dim); -vector_int_t* vector_int_create_zero(size_t dim); +void vector_int_destroy(vector_int_t* vp); -vector_fpt_t* vector_fpt_create_zero(size_t dim); +void vector_fpt_destroy(vector_int_t* vp); + +vector_int_t* vector_int_copy(const vector_int_t* v); + +bool vector_int_equals(const vector_int_t* v1, const vector_int_t* v2); + +bool vector_fpt_equals(const vector_fpt_t* v1, const vector_fpt_t* v2); #endif //PROG_KMEANS_VECTOR_H