Skip to content
Snippets Groups Projects
Commit de33161a authored by thibault.capt's avatar thibault.capt
Browse files

create algorithm

parent d143a959
Branches
No related tags found
No related merge requests found
<?xml version="1.0" encoding="UTF-8"?>
<!-- IDE project settings: the "CsvFileAttributes" component maps each CSV file
     under /Data to its column separator (a comma for all three files below). -->
<project version="4">
<component name="CsvFileAttributes">
<option name="attributeMap">
<map>
<entry key="/Data/iris.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/Data/student-data-test.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
<entry key="/Data/student-data-train.csv">
<value>
<Attribute>
<option name="separator" value="," />
</Attribute>
</value>
</entry>
</map>
</option>
</component>
</project>
\ No newline at end of file
File moved
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def import_csv(filename: str, h: Optional[int] = None) -> pd.DataFrame:
    """Import a CSV file and return it as a pandas DataFrame.

    Args:
        filename (str): The path to the CSV file.
        h (int, optional): Row number to use as the column names; forwarded
            unchanged to the ``header`` argument of ``pandas.read_csv``.
            Defaults to ``None``, which means the file has no header row:
            every line is read as data and the columns are numbered
            0..n-1.

    Returns:
        pandas.DataFrame: The imported data as a DataFrame.
    """
    return pd.read_csv(filename, header=h)
def euclidian_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x1: First point; must support element-wise subtraction with ``x2``
            (e.g. a NumPy array).
        x2: Second point, same shape as ``x1``.

    Returns:
        The square root of the sum of the squared element-wise differences.
    """
    delta = x1 - x2
    squared_sum = np.sum(delta * delta)
    return np.sqrt(squared_sum)
if __name__ == "__main__":
    df = import_csv("Data/iris.csv")

    # Features used for clustering: every column except the last one
    # (presumably the class label -- confirm against the data file).
    # Cast to float so that centroid means are never truncated to integers.
    X = df.iloc[:, :-1].to_numpy(dtype=float)

    k = 3  # Number of clusters
    max_iterations = 100  # Safety cap so the loop always terminates

    # Pick k distinct rows at random as the initial centroids
    # (fixed seed so that runs are reproducible).
    np.random.seed(0)
    centroids = X[np.random.choice(X.shape[0], k, replace=False)]
    new_centroids = centroids.copy()

    # Sum of squared point-to-centroid distances, one entry per iteration.
    distances = []
    iteration = 0
    converged = False

    while not converged and iteration < max_iterations:
        iteration += 1

        # Snapshot the centroids BEFORE updating them. Taking the copy after
        # the in-place update would make "old" and "new" always equal and
        # stop the algorithm after a single iteration.
        old_centroids = new_centroids.copy()

        distances_to_centroids = np.array(
            [[euclidian_distance(x, centroid) for centroid in new_centroids] for x in X]
        )

        # Assign every point to its closest centroid.
        labels = np.argmin(distances_to_centroids, axis=1)

        # Move each centroid to the mean of the points assigned to it.
        for i in range(k):
            members = X[labels == i]
            # An empty cluster has no mean (it would become NaN and the
            # convergence test could never succeed); keep its old centroid.
            if len(members) > 0:
                new_centroids[i] = members.mean(axis=0)

        # Sum of squared distances between each point and the centroid of
        # its cluster -- the quantity named on the final plot's axis.
        total_distance = np.sum((X - new_centroids[labels]) ** 2)
        distances.append(total_distance)

        # Show the clusters at this iteration (first two features only).
        plt.figure(figsize=(8, 6))
        for i in range(k):
            cluster_points = X[labels == i]
            plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f"Cluster {i + 1}")
        plt.scatter(new_centroids[:, 0], new_centroids[:, 1], marker="X", color="black",
                    label="Centroids")
        plt.xlabel("Caractéristique 1")
        plt.ylabel("Caractéristique 2")
        plt.legend()
        plt.title(f"Iteration {iteration}")
        plt.show()

        # Converged once an update step leaves every centroid unchanged.
        converged = np.array_equal(old_centroids, new_centroids)

    # Evolution of the clustering objective across iterations.
    plt.figure(figsize=(8, 6))
    plt.plot(range(1, iteration + 1), distances, marker='o')
    plt.xlabel("Iteration")
    plt.ylabel("Somme des distances au carré")
    plt.title("Evolution de la somme des distances au carré")
    plt.show()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment