Skip to content
Snippets Groups Projects
Commit 0078dbfb authored by dario.genga's avatar dario.genga
Browse files

Add clustering

parent b0da2dcd
No related branches found
No related tags found
No related merge requests found
......@@ -3,6 +3,17 @@
#include "kmeans.h"
/// Draw a pseudo-random integer from the inclusive range [min, max].
/// \param min The lowest value that may be returned.
/// \param max The highest value that may be returned.
/// \return A value between min and max (both included). When the range is
///         empty or holds a single value (max <= min), min is returned.
int random_with_min_man_value(int min, int max) {
    // With max < min the modulus below becomes zero or negative; a modulo by
    // zero is undefined behavior, so fall back to the lower bound instead.
    if (max <= min) {
        return min;
    }
    return (rand() % (max - min + 1)) + min;
}
/// Exchange the two integers designated by x and y.
/// \param x Address of the first integer.
/// \param y Address of the second integer.
void swap(int *x, int *y)
{
    // Read both values before writing either of them back.
    const int first = *x;
    const int second = *y;
    *x = second;
    *y = first;
}
kmeans* kmeans_create_empty() {
kmeans* universe = malloc(sizeof(kmeans));
universe->points_array = NULL;
......@@ -46,14 +57,14 @@ point* create_point_from_string(char *line, int dimensions) {
int i = 0;
point* p;
char *token;
const char separator[2] = ",";
double* data = malloc(sizeof(double) * dimensions);
const char separator[2] = SEPARATOR;
float* data = malloc(sizeof(float) * dimensions);
// Parse the line
token = strtok(line, separator);
while(token != NULL) {
// Convert the string value to double, then save it
double value = atof(token);
// Convert the string value to float, then save it
float value = atof(token);
data[i] = value;
// Get the next value
......@@ -67,7 +78,7 @@ point* create_point_from_string(char *line, int dimensions) {
}
// Create the point and return it
p = point_create(data);
p = point_create(data, dimensions);
return p;
}
......@@ -82,11 +93,12 @@ kmeans* kmeans_create(int k, point** data, int nb_points) {
return universe;
}
point* point_create(double* value) {
/// Create a point.
/// The coordinates array is stored as-is (the pointer is kept, not copied).
/// \param value The coordinates of the point.
/// \param dimensions The number of dimensions of the point.
/// \return The point initialized with its coordinates, without cluster nor
///         label, or NULL if the allocation failed.
point* point_create(float* value, int dimensions) {
    point* p = malloc(sizeof *p);
    // Do not write through a failed allocation; let the caller see NULL.
    if (p == NULL) {
        return NULL;
    }
    p->value = value;
    p->cluster = NULL;
    p->label = NULL;
    p->dimensions = dimensions;
    return p;
}
......@@ -97,7 +109,7 @@ cluster* cluster_create(point* centroid) {
}
/// Placeholder: the body currently performs no work on the universe.
/// \param universe The universe (unused for now).
void init_from_cmd_arguments(kmeans *universe) {
    // Mark the parameter as intentionally unused.
    (void)universe;
}
void read_data_source(kmeans* universe, char* source_file) {
......@@ -181,24 +193,158 @@ void write_data_output(kmeans *universe, char* output_file) {
fclose(file);
}
/// Choose a random point by using the Fisher-Yates algorithm and build a
/// centroid from a copy of its coordinates.
/// \param universe The universe holding the points and the number of dimensions.
/// \param points_index_possible Table of point indexes; slots 0..max_index hold
///        the points that can still be chosen.
/// \param max_index The last slot of the table that may be drawn.
/// \return A new point (made with point_create) carrying a copy of the chosen
///         point's coordinates, or NULL if the coordinates could not be allocated.
point* choose_random_point_as_centroid(kmeans *universe, int* points_index_possible, int max_index) {
    int random_slot = random_with_min_man_value(0, max_index);
    // The drawn number is a slot of the index table, not a point index: it
    // must be translated through the table, otherwise the swap below has no
    // effect and an already chosen point could be picked again.
    int point_index = points_index_possible[random_slot];

    float* value = malloc(sizeof *value * universe->dimensions);
    if (value == NULL) {
        return NULL;
    }
    for (int d = 0; d < universe->dimensions; d++) {
        value[d] = universe->points_array[point_index]->value[d];
    }

    // Swap the selected point index with the last one,
    // so when max_index is decremented we can't choose again the same point
    swap(&points_index_possible[random_slot], &points_index_possible[max_index]);

    return point_create(value, universe->dimensions);
}
/// Give every cluster of the universe an initial centroid drawn among the points.
/// \param universe The universe whose clusters_array entries receive a centroid.
void init_clusters(kmeans *universe) {
// Create the index of each possible point that can be a centroid
int* random_index_possible = malloc(sizeof(int) * universe->nb_points);
for (int i = 0; i < universe->nb_points; i++) {
random_index_possible[i] = i;
}
// The table still holds 0..nb_points-1 in order here, so this reads nb_points - 1.
int max_index = random_index_possible[universe->nb_points - 1];
// Choose a random centroid for each cluster
// NOTE(review): max_index is decremented k times, so it goes negative when k exceeds nb_points.
for (int i = 0; i < universe->k; i++) {
universe->clusters_array[i]->centroid = choose_random_point_as_centroid(universe, random_index_possible, max_index);
// Shrink the range of slots that can still be drawn.
max_index--;
}
// NOTE(review): the next two lines look like the removed side of the diff
// (the previous compute_distance stub) interleaved with this function -
// confirm against the real file.
double compute_distance(point* p1, point p2) {
return 0;
// Release the temporary index table.
free(random_index_possible);
}
void compute_center_of_gravity(cluster* clstr, kmeans* universe) {
/// Compute the Euclidean distance between two points.
/// The number of coordinates read from both points is p1->dimensions.
/// \param p1 The first point.
/// \param p2 The second point.
/// \return The square root of the sum of the squared coordinate differences.
float compute_euclidean_distance(point* p1, point* p2) {
    float squared_total = 0;
    for (int axis = 0; axis < p1->dimensions; axis++) {
        const float gap = p1->value[axis] - p2->value[axis];
        squared_total += pow(gap, 2);
    }
    return sqrt(squared_total);
}
void assign_points_to_cluster(point* p, kmeans* universe) {
/// Compute the Manhattan distance between two points.
/// The number of coordinates read from both points is p1->dimensions.
/// \param p1 The first point.
/// \param p2 The second point.
/// \return The sum of the absolute coordinate differences.
float compute_manhattan_distance(point* p1, point* p2) {
    float total = 0;
    int axis = 0;
    while (axis < p1->dimensions) {
        total += fabs(p1->value[axis] - p2->value[axis]);
        axis++;
    }
    return total;
}
/// Compute the Chebyshev distance between two points.
/// The number of coordinates read from both points is p1->dimensions.
/// \param p1 The first point.
/// \param p2 The second point.
/// \return The largest absolute coordinate difference (0 when there is no dimension).
float compute_chebyshev_distance(point* p1, point* p2) {
    float result = 0;
    for (int i = 0; i < p1->dimensions; i++) {
        // Keep the difference as a float: an integer would drop the
        // fractional part (e.g. 0.75 would count as 0).
        float abs_diff = fabsf(p1->value[i] - p2->value[i]);
        if (abs_diff > result) {
            result = abs_diff;
        }
    }
    return result;
}
/// Compute the distance between two points.
/// The Euclidean metric is the one in use; compute_manhattan_distance and
/// compute_chebyshev_distance share the same signature and could be
/// substituted here.
/// Prints a message and exits with EXIT_FAILURE when the two points do not
/// have the same number of dimensions.
/// \param p1 The first point.
/// \param p2 The second point.
/// \return The distance between the two points.
float compute_distance(point* p1, point* p2) {
    if (p1->dimensions == p2->dimensions) {
        return compute_euclidean_distance(p1, p2);
    }

    // Points living in different spaces cannot be compared.
    printf("The points don't have the same dimensions!\n");
    exit(EXIT_FAILURE);
}
/// Calculate the position of the center of gravity of the cluster and move
/// its centroid there.
/// A cluster without any point keeps its current centroid.
/// Prints a message and exits with EXIT_FAILURE if the working array cannot
/// be allocated.
/// \param clstr The cluster.
/// \param universe The universe that contains the points of the cluster.
/// \return true if at least one coordinate of the centroid changed, false otherwise.
bool compute_center_of_gravity(cluster* clstr, kmeans* universe) {
    bool new_position = false;
    int nb_points_in_cluster = 0;

    // Create an array to determine the center of gravity
    float* dimensions_average = malloc(sizeof *dimensions_average * universe->dimensions);
    if (dimensions_average == NULL) {
        printf("Unable to allocate memory for the center of gravity!\n");
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < universe->dimensions; i++) {
        dimensions_average[i] = 0;
    }

    // Parse each point in the cluster
    for (int i = 0; i < universe->nb_points; i++) {
        if (universe->points_array[i]->cluster == clstr) {
            // Count the point once, not once per dimension, otherwise the
            // average below would be divided by points * dimensions.
            nb_points_in_cluster += 1;
            // Accumulate its position
            for (int d = 0; d < universe->dimensions; d++) {
                dimensions_average[d] += universe->points_array[i]->value[d];
            }
        }
    }

    // Compute the center of gravity with the average position of the points in the cluster
    if (nb_points_in_cluster > 0) {
        for (int i = 0; i < universe->dimensions; i++) {
            dimensions_average[i] = dimensions_average[i] / nb_points_in_cluster;
            if (clstr->centroid->value[i] != dimensions_average[i]) {
                clstr->centroid->value[i] = dimensions_average[i];
                new_position = true;
            }
        }
    }

    free(dimensions_average);
    return new_position;
}
/// Assign the point to the cluster whose centroid is the closest.
/// On equal distances the first cluster found is kept. When the universe has
/// no cluster (k == 0) the point's cluster is set to NULL.
/// \param p The point to be assigned.
/// \param universe The universe holding the clusters.
void assign_points_to_cluster(point* p, kmeans* universe) {
    float smallest_distance = 0;
    cluster* clst = NULL;
    for (int i = 0; i < universe->k; i++) {
        float distance = compute_distance(p, universe->clusters_array[i]->centroid);
        // The first cluster is always taken as the initial best candidate, so
        // no arbitrary "large enough" starting distance is needed and clst
        // can never be left unset when at least one cluster exists.
        if (clst == NULL || distance < smallest_distance) {
            smallest_distance = distance;
            clst = universe->clusters_array[i];
        }
    }
    p->cluster = clst;
}
/// Run the clustering: initialize the clusters, then repeat the
/// "assign points / recompute centers of gravity" cycle until a full cycle
/// leaves every centroid where it was.
/// The position of each cluster is printed at every cycle.
/// \param universe The universe to cluster.
void start_clustering(kmeans* universe) {
    init_clusters(universe);

    bool centroid_moved = true;
    while (centroid_moved) {
        centroid_moved = false;

        // Attach every point to a cluster
        for (int p = 0; p < universe->nb_points; p++) {
            assign_points_to_cluster(universe->points_array[p], universe);
        }

        // Move each centroid to its new center of gravity and report it
        printf("Clusters positions...\n");
        for (int c = 0; c < universe->k; c++) {
            cluster* current = universe->clusters_array[c];
            if (compute_center_of_gravity(current, universe)) {
                centroid_moved = true;
            }

            printf("Cluster %d position : ", c);
            for (int d = 0; d < universe->dimensions; d++) {
                printf("%0.2f, ", current->centroid->value[d]);
            }
            printf("\n");
        }
    }
}
void destroy_point(point* p) {
......
......@@ -15,7 +15,7 @@
#define LINE_INDEX_CLUSTER 1
#define LINE_INDEX_CONTENT 2
#define CLUSTER_SYMBOL "*"
#define SEPARATOR ";"
#define SEPARATOR ","
/// A group that contains points.
typedef struct _cluster {
......@@ -28,7 +28,9 @@ typedef struct _cluster {
/// A point in the universe, which represents a data.
typedef struct _point {
/// The coordinates of the point.
double* value;
float* value;
/// The number of dimensions of the point.
int dimensions;
/// The cluster that contains the point.
struct _cluster* cluster;
/// The color to use to draw the point.
......@@ -64,8 +66,9 @@ kmeans* kmeans_create(int k, point** data, int nb_points);
/// Create a point.
/// \param value The coordinates of the point.
/// \param dimensions The number of dimensions of the point.
/// \return The point object initialized with its coordinates.
point* point_create(double* value);
point* point_create(float* value, int dimensions);
/// Create a cluster.
/// \param centroid The point representing the center of gravity of the cluster.
......@@ -95,12 +98,12 @@ void init_clusters(kmeans *universe);
/// \param p1 The first point.
/// \param p2 The second point.
/// \return The distance between the two points.
double compute_distance(point* p1, point p2);
float compute_distance(point* p1, point* p2);
/// Calculate the position of the center of gravity of the cluster.
/// \param clstr The cluster.
/// \param universe The universe that contains the points of the cluster.
void compute_center_of_gravity(cluster* clstr, kmeans* universe);
bool compute_center_of_gravity(cluster* clstr, kmeans* universe);
/// Assign the point to the most coherent cluster.
/// \param p The point to be assigned.
......
......@@ -3,31 +3,19 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "kmeans.h"
/// Program entry point: seed the random generator, load the points from the
/// source file, run the clustering, write the result and release the universe.
/// \return EXIT_SUCCESS.
int main(void) {
    // Seed rand() so the centroids drawn differ from one run to the next.
    srand(time(NULL));
    char* path = "./source_data.txt";
    char* output = "./output_data.txt";
    kmeans *universe = kmeans_create_empty();
    read_data_source(universe, path);
    // The former hand-made test assignment was removed: start_clustering
    // reassigns every point anyway, and the test code indexed clusters 1 and 2
    // whatever the value of k.
    start_clustering(universe);
    write_data_output(universe, output);
    destroy_universe(universe);
    return EXIT_SUCCESS;
}
2
3
*
2.30;33.65
-1.00,-5.00
-2.00,-4.00
-3.00,-3.00
-4.00,-2.00
-5.00,-1.00
-1.75,-2.25
*
1.00;24.00
3.00;4.00
-1.00;5.00
0.00,0.00
3.10,-4.90
*
5.00;34.00
1.00,1.00
2.00,2.00
3.00,3.00
4.00,4.00
5.00,5.00
-2.25,4.75
2.20,4.40
4.00,2.00
2
3
1,24
2.3,33.65
3,4
5,34
-1,5
0,0
1,1
2,2
3,3
4,4
5,5
-1,-5
-2,-4
-3,-3
-4,-2
-5,-1
-2.25,4.75
3.1,-4.9
2.2,4.4
-1.75,-2.25
4,2
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment