Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
K
kmeans
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
This is an archived project. Repository and other project resources are read-only.
Show more breadcrumbs
dario.genga
kmeans
Commits
0078dbfb
Commit
0078dbfb
authored
3 years ago
by
dario.genga
Browse files
Options
Downloads
Patches
Plain Diff
Add clustering
parent
b0da2dcd
No related branches found
No related tags found
No related merge requests found
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
kmeans.c
+157
-11
157 additions, 11 deletions
kmeans.c
kmeans.h
+8
-5
8 additions, 5 deletions
kmeans.h
main.c
+3
-15
3 additions, 15 deletions
main.c
output_data.txt
+16
-5
16 additions, 5 deletions
output_data.txt
source_data.txt
+16
-5
16 additions, 5 deletions
source_data.txt
with
200 additions
and
41 deletions
kmeans.c
+
157
−
11
View file @
0078dbfb
...
...
@@ -3,6 +3,17 @@
#include
"kmeans.h"
// Return a pseudo-random integer in the inclusive range [min, max], drawn with rand().
// min: smallest value that can be returned.
// max: largest value that can be returned.
// When the range is empty (max < min) min is returned without calling rand():
// for max == min - 1 the modulo below would be a division by zero.
// Note: rand() % span is slightly biased when span does not divide RAND_MAX + 1.
int random_with_min_man_value(int min, int max) {
    if (max < min) {
        return min;
    }
    return (rand() % (max - min + 1)) + min;
}
// Exchange the two integers pointed to by x and y.
// Both values are read before either is written, so x and y may point to the same int.
void swap(int* x, int* y) {
    const int first = *x;
    const int second = *y;
    *x = second;
    *y = first;
}
kmeans
*
kmeans_create_empty
()
{
kmeans
*
universe
=
malloc
(
sizeof
(
kmeans
));
universe
->
points_array
=
NULL
;
...
...
@@ -46,14 +57,14 @@ point* create_point_from_string(char *line, int dimensions) {
int
i
=
0
;
point
*
p
;
char
*
token
;
const
char
separator
[
2
]
=
","
;
double
*
data
=
malloc
(
sizeof
(
double
)
*
dimensions
);
const
char
separator
[
2
]
=
SEPARATOR
;
float
*
data
=
malloc
(
sizeof
(
float
)
*
dimensions
);
// Parse the line
token
=
strtok
(
line
,
separator
);
while
(
token
!=
NULL
)
{
// Convert the string value to
double
, then save it
double
value
=
atof
(
token
);
// Convert the string value to
float
, then save it
float
value
=
atof
(
token
);
data
[
i
]
=
value
;
// Get the next value
...
...
@@ -67,7 +78,7 @@ point* create_point_from_string(char *line, int dimensions) {
}
// Create the point and return it
p
=
point_create
(
data
);
p
=
point_create
(
data
,
dimensions
);
return
p
;
}
...
...
@@ -82,11 +93,12 @@ kmeans* kmeans_create(int k, point** data, int nb_points) {
return
universe
;
}
point
*
point_create
(
double
*
value
)
{
point
*
point_create
(
float
*
value
,
int
dimensions
)
{
point
*
p
=
malloc
(
sizeof
(
point
));
p
->
value
=
value
;
p
->
cluster
=
NULL
;
p
->
label
=
NULL
;
p
->
dimensions
=
dimensions
;
return
p
;
}
...
...
@@ -97,7 +109,7 @@ cluster* cluster_create(point* centroid) {
}
// Placeholder: the body does not read or modify the universe yet.
// The parameter is only referenced so that it does not count as unused.
void init_from_cmd_arguments(kmeans* universe) {
    (void) universe;
}
void
read_data_source
(
kmeans
*
universe
,
char
*
source_file
)
{
...
...
@@ -181,24 +193,158 @@ void write_data_output(kmeans *universe, char* output_file) {
fclose
(
file
);
}
// Pick a random point that has not been picked yet and return a copy of it as a centroid.
// The selection is one step of a Fisher-Yates shuffle over points_index_possible:
//   - slots [0, max_index] hold the indices of the points that are still available;
//   - a slot is drawn at random and its content is the index of the chosen point;
//   - that slot is swapped with slot max_index, so once the caller decrements
//     max_index the same point cannot be drawn again.
// universe: provides points_array and the number of dimensions.
// points_index_possible: table of candidate point indices (modified by the swap).
// max_index: last usable slot of points_index_possible (inclusive).
// Returns a point created with point_create() that owns its own copy of the coordinates.
point* choose_random_point_as_centroid(kmeans* universe, int* points_index_possible, int max_index) {
    int random_slot = random_with_min_man_value(0, max_index);
    // Read the point index through the table: indexing points_array with the slot
    // number itself ignores the swaps and allows the same point to be chosen twice.
    int point_index = points_index_possible[random_slot];

    float* value = malloc(sizeof(float) * universe->dimensions);
    for (int d = 0; d < universe->dimensions; d++) {
        value[d] = universe->points_array[point_index]->value[d];
    }

    // Move the selected index to the last available slot,
    // so when max_index is decremented we can't choose the same point again
    swap(&points_index_possible[random_slot], &points_index_possible[max_index]);

    point* centroid = point_create(value, universe->dimensions);
    return centroid;
}
void
init_clusters
(
kmeans
*
universe
)
{
// Create the index of each possible points that can be a centroid
int
*
random_index_possible
=
malloc
(
sizeof
(
int
)
*
universe
->
nb_points
);
for
(
int
i
=
0
;
i
<
universe
->
nb_points
;
i
++
)
{
random_index_possible
[
i
]
=
i
;
}
int
max_index
=
random_index_possible
[
universe
->
nb_points
-
1
];
// Choose a random centroid for each cluster
for
(
int
i
=
0
;
i
<
universe
->
k
;
i
++
)
{
universe
->
clusters_array
[
i
]
->
centroid
=
choose_random_point_as_centroid
(
universe
,
random_index_possible
,
max_index
);
max_index
--
;
}
double
compute_distance
(
point
*
p1
,
point
p2
)
{
return
0
;
free
(
random_index_possible
);
}
void
compute_center_of_gravity
(
cluster
*
clstr
,
kmeans
*
universe
)
{
// Euclidean distance between p1 and p2: the square root of the sum of the
// squared differences of their coordinates.
// The number of coordinates visited is p1->dimensions; p2 is read at the same positions.
float compute_euclidean_distance(point* p1, point* p2) {
    float squared_total = 0;
    int d = 0;
    while (d < p1->dimensions) {
        squared_total += pow(p1->value[d] - p2->value[d], 2);
        d++;
    }
    float distance = sqrt(squared_total);
    return distance;
}
void
assign_points_to_cluster
(
point
*
p
,
kmeans
*
universe
)
{
// Manhattan distance between p1 and p2: the sum of the absolute differences
// of their coordinates.
// The number of coordinates visited is p1->dimensions; p2 is read at the same positions.
float compute_manhattan_distance(point* p1, point* p2) {
    float total = 0;
    int d = 0;
    while (d < p1->dimensions) {
        total += fabs(p1->value[d] - p2->value[d]);
        d++;
    }
    return total;
}
// Chebyshev distance between p1 and p2: the largest absolute difference
// found among their coordinates.
// The number of coordinates visited is p1->dimensions; p2 is read at the same positions.
// Returns 0 when p1 has no dimensions.
float compute_chebyshev_distance(point* p1, point* p2) {
    float result = 0;
    for (int i = 0; i < p1->dimensions; i++) {
        // Keep the difference as a float: storing it in an int drops the
        // fractional part (e.g. 0.75 becomes 0) and distorts the distance.
        float abs_diff = fabs(p1->value[i] - p2->value[i]);
        if (abs_diff > result) {
            result = abs_diff;
        }
    }
    return result;
}
// Distance between two points, currently measured with the euclidean metric.
// compute_manhattan_distance() and compute_chebyshev_distance() take the same
// arguments and can be substituted here.
// The program is terminated with EXIT_FAILURE when the two points do not have
// the same number of dimensions.
float compute_distance(point* p1, point* p2) {
    if (p1->dimensions == p2->dimensions) {
        float euclidean = compute_euclidean_distance(p1, p2);
        return euclidean;
    }

    printf("The points don't have the same dimensions!\n");
    exit(EXIT_FAILURE);
}
// Move the centroid of clstr to the average position of the points assigned to it.
// clstr: the cluster whose centroid is updated.
// universe: provides the points (points_array, nb_points) and the number of dimensions.
// Returns true when at least one coordinate of the centroid changed, false otherwise.
// A cluster without any point keeps its current centroid and reports false.
bool compute_center_of_gravity(cluster* clstr, kmeans* universe) {
    bool new_position = false;
    int nb_points_in_cluster = 0;

    // Per-dimension accumulator: holds the sums first, then the averages
    float* dimensions_average = malloc(sizeof(float) * universe->dimensions);
    for (int i = 0; i < universe->dimensions; i++) {
        dimensions_average[i] = 0;
    }

    // Add up the coordinates of every point that belongs to the cluster
    for (int i = 0; i < universe->nb_points; i++) {
        point* candidate = universe->points_array[i];
        if (candidate->cluster != clstr) {
            continue;
        }
        // Count the point once, not once per dimension: otherwise the sums
        // below are divided by (members * dimensions) instead of members.
        nb_points_in_cluster += 1;
        for (int d = 0; d < universe->dimensions; d++) {
            dimensions_average[d] += candidate->value[d];
        }
    }

    // Compute the center of gravity with the average position of the points in the cluster
    if (nb_points_in_cluster > 0) {
        for (int i = 0; i < universe->dimensions; i++) {
            dimensions_average[i] = dimensions_average[i] / nb_points_in_cluster;
            if (clstr->centroid->value[i] != dimensions_average[i]) {
                clstr->centroid->value[i] = dimensions_average[i];
                new_position = true;
            }
        }
    }

    free(dimensions_average);
    return new_position;
}
// Assign the point p to the cluster whose centroid is the closest one,
// as measured by compute_distance().
// p: the point to assign; its cluster field is overwritten.
// universe: provides the candidate clusters (clusters_array, k).
// When several centroids are equally close the first one in clusters_array wins.
// With k == 0 there is no candidate and p->cluster is set to NULL.
void assign_points_to_cluster(point* p, kmeans* universe) {
    // No arbitrary starting distance: the first cluster is always accepted, so
    // the result is defined even when every centroid is very far from p.
    cluster* closest = NULL;
    float smallest_distance = 0;

    for (int i = 0; i < universe->k; i++) {
        cluster* candidate = universe->clusters_array[i];
        float distance = compute_distance(p, candidate->centroid);
        if (closest == NULL || distance < smallest_distance) {
            smallest_distance = distance;
            closest = candidate;
        }
    }

    p->cluster = closest;
}
// Run the clustering loop on the universe.
// After init_clusters() has set the initial centroids, each pass
//   1. calls assign_points_to_cluster() on every point;
//   2. calls compute_center_of_gravity() on every cluster and prints the
//      coordinates of its centroid.
// Passes are repeated until one completes without any cluster reporting a new position.
void start_clustering(kmeans* universe) {
    init_clusters(universe);

    bool centroid_moved = true; // guarantees that the first pass is executed
    while (centroid_moved) {
        centroid_moved = false;

        // Attach every point to a cluster
        for (int p = 0; p < universe->nb_points; p++) {
            assign_points_to_cluster(universe->points_array[p], universe);
        }

        // Update and display the centroid of every cluster
        printf("Clusters positions...\n");
        for (int c = 0; c < universe->k; c++) {
            bool moved = compute_center_of_gravity(universe->clusters_array[c], universe);
            centroid_moved = centroid_moved || moved;

            printf("Cluster %d position : ", c);
            for (int d = 0; d < universe->dimensions; d++) {
                printf("%0.2f, ", universe->clusters_array[c]->centroid->value[d]);
            }
            printf("\n");
        }
    }
}
void
destroy_point
(
point
*
p
)
{
...
...
This diff is collapsed.
Click to expand it.
kmeans.h
+
8
−
5
View file @
0078dbfb
...
...
@@ -15,7 +15,7 @@
#define LINE_INDEX_CLUSTER 1
#define LINE_INDEX_CONTENT 2
#define CLUSTER_SYMBOL "*"
#define SEPARATOR "
;
"
#define SEPARATOR "
,
"
/// A group that contains points.
typedef
struct
_cluster
{
...
...
@@ -28,7 +28,9 @@ typedef struct _cluster {
/// A point in the universe, which represents a data.
typedef
struct
_point
{
/// The coordinates of the point.
double
*
value
;
float
*
value
;
/// The number of dimensions of the point.
int
dimensions
;
/// The cluster that contains the point.
struct
_cluster
*
cluster
;
/// The color to use to draw the point.
...
...
@@ -64,8 +66,9 @@ kmeans* kmeans_create(int k, point** data, int nb_points);
/// Create a point.
/// \param value The coordinates of the point.
/// \param dimensions The number of dimensions of the point.
/// \return The point object initialized with its coordinates
point
*
point_create
(
double
*
value
);
point
*
point_create
(
float
*
value
,
int
dimensions
);
/// Create a cluster.
/// \param centroid The point representing the center of gravity of the cluster.
...
...
@@ -95,12 +98,12 @@ void init_clusters(kmeans *universe);
/// \param p1 The first point.
/// \param p2 The second point.
/// \return The distance between the two points.
double
compute_distance
(
point
*
p1
,
point
p2
);
float
compute_distance
(
point
*
p1
,
point
*
p2
);
/// Calculate the position of the center of gravity of the cluster.
/// \param clstr The cluster.
/// \param universe The universe that contains the points of the cluster.
void
compute_center_of_gravity
(
cluster
*
clstr
,
kmeans
*
universe
);
bool
compute_center_of_gravity
(
cluster
*
clstr
,
kmeans
*
universe
);
/// Assign the point to the most coherent cluster.
/// \param p The point to be assigned.
...
...
This diff is collapsed.
Click to expand it.
main.c
+
3
−
15
View file @
0078dbfb
...
...
@@ -3,31 +3,19 @@
#include
<stdio.h>
#include
<stdlib.h>
#include
<time.h>
#include
"kmeans.h"
// Program entry point: seeds the random generator, loads the points from
// ./source_data.txt, clusters them and writes the result to ./output_data.txt.
// Returns EXIT_SUCCESS.
int main() {
    // Seed rand() so that the initial centroids differ from one run to the next
    srand(time(NULL));

    char* path = "./source_data.txt";
    char* output = "./output_data.txt";

    kmeans* universe = kmeans_create_empty();
    read_data_source(universe, path);

    // start_clustering() assigns every point to a cluster itself, so no manual
    // assignment is done here (a hard-coded one would also index clusters 1 and 2
    // even when the universe holds fewer than three clusters).
    start_clustering(universe);

    write_data_output(universe, output);
    destroy_universe(universe);
    return EXIT_SUCCESS;
}
This diff is collapsed.
Click to expand it.
output_data.txt
+
16
−
5
View file @
0078dbfb
2
3
*
2.30;33.65
-1.00,-5.00
-2.00,-4.00
-3.00,-3.00
-4.00,-2.00
-5.00,-1.00
-1.75,-2.25
*
1.00;24.00
3.00;4.00
-1.00;5.00
0.00,0.00
3.10,-4.90
*
5.00;34.00
1.00,1.00
2.00,2.00
3.00,3.00
4.00,4.00
5.00,5.00
-2.25,4.75
2.20,4.40
4.00,2.00
This diff is collapsed.
Click to expand it.
source_data.txt
+
16
−
5
View file @
0078dbfb
2
3
1,24
2.3,33.65
3,4
5,34
-1,5
0,0
1,1
2,2
3,3
4,4
5,5
-1,-5
-2,-4
-3,-3
-4,-2
-5,-1
-2.25,4.75
3.1,-4.9
2.2,4.4
-1.75,-2.25
4,2
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment