Skip to content
Snippets Groups Projects
Commit de33161a authored by thibault.capt's avatar thibault.capt
Browse files

create algorithm

parent d143a959
No related branches found
No related tags found
No related merge requests found
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CsvFileAttributes">
<option name="attributeMap">
<map>
<entry key="/Data/iris.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/Data/student-data-test.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/Data/student-data-train.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
</map>
</option>
</component>
</project>
\ No newline at end of file
File moved
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def import_csv(filename: str, h: "int | list[int] | None" = None) -> pd.DataFrame:
    """
    Import a CSV file and return it as a pandas DataFrame.

    Args:
        filename (str): The path to the CSV file.
        h (int, list of int or None, default None): Row number(s) to use as
            the column names; forwarded unchanged to ``pandas.read_csv`` as
            its ``header`` argument. With the default ``None`` the file is
            read as having no header row, so the first line is data and the
            columns are numbered 0, 1, 2, ...

    Returns:
        pandas.DataFrame: The imported data as a DataFrame.
    """
    return pd.read_csv(filename, header=h)
def euclidian_distance(x1, x2) -> float:
    """
    Return the Euclidean (L2) distance between two points.

    Args:
        x1: First point, as a NumPy array or any array-like of real numbers
            (list, tuple, scalar).
        x2: Second point, with a shape that broadcasts against ``x1``.

    Returns:
        float: The square root of the sum of squared coordinate differences.
    """
    # Coerce to float arrays so plain lists/tuples are accepted and the
    # subtraction cannot wrap around on unsigned integer inputs.
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    return np.sqrt(np.sum((a - b) ** 2))
def _assign_labels(points, centroids):
    """Return, for every row of ``points``, the index of its nearest centroid."""
    # Broadcast to shape (n_points, n_centroids, n_features) so all pairwise
    # differences are computed in one vectorised expression.
    gaps = points[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    pairwise = np.sqrt(np.sum(gaps ** 2, axis=2))
    return np.argmin(pairwise, axis=1)


def _update_centroids(points, labels, centroids):
    """Return new centroids as the mean of each cluster's points."""
    updated = centroids.copy()
    for i in range(len(centroids)):
        members = points[labels == i]
        # An empty cluster has no mean (it would be NaN, and NaN never
        # compares equal, so convergence could never be detected): keep the
        # previous position of that centroid instead.
        if len(members):
            updated[i] = members.mean(axis=0)
    return updated


def _inertia(points, labels, centroids):
    """Return the sum of squared distances from each point to its centroid."""
    return float(np.sum((points - centroids[labels]) ** 2))


if __name__ == "__main__":
    df = import_csv("Data/iris.csv")

    # Features used for clustering: every column except the last one.
    # Cast to float so that centroid means are never truncated.
    X = df.iloc[:, :-1].values.astype(float)

    k = 3  # Number of clusters

    # Pick k distinct data points as the initial centroids (fixed seed so the
    # run is reproducible).
    np.random.seed(0)
    centroids = X[np.random.choice(X.shape[0], k, replace=False)]

    # Sum of squared within-cluster distances, one entry per iteration.
    distances = []

    iteration = 0
    converged = False
    while not converged:
        iteration += 1

        # Assign every point to its nearest centroid.
        labels = _assign_labels(X, centroids)

        # Recompute the centroids into a NEW array; the previous ones must
        # stay untouched so the convergence test below compares two
        # different generations of centroids.
        new_centroids = _update_centroids(X, labels, centroids)

        # Track the k-means objective (sum of squared distances), which is
        # what the final plot is labelled with.
        distances.append(_inertia(X, labels, new_centroids))

        # Show the clusters at this iteration (first two features only).
        plt.figure(figsize=(8, 6))
        for i in range(k):
            cluster_points = X[labels == i]
            plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {i + 1}")
        plt.scatter(new_centroids[:, 0], new_centroids[:, 1], marker="X", color="black", label="Centroids")
        plt.xlabel("Caractéristique 1")
        plt.ylabel("Caractéristique 2")
        plt.legend()
        plt.title(f"Iteration {iteration}")
        plt.show()

        # Stop once an update leaves every centroid unchanged.
        converged = np.array_equal(centroids, new_centroids)
        centroids = new_centroids

    plt.figure(figsize=(8, 6))
    plt.plot(range(1, iteration + 1), distances, marker='o')
    plt.xlabel("Iteration")
    plt.ylabel("Somme des distances au carré")
    plt.title("Evolution de la somme des distances au carré")
    plt.show()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment