Source code for foucluster.cluster

from sklearn import cluster
from sklearn.preprocessing import minmax_scale
from scipy.spatial.distance import cdist
import pandas as pd
import numpy as np
from itertools import groupby

eps = 10**(-10)

n_cluster_methods = {'AgglomerativeClustering': cluster.AgglomerativeClustering,
                     'SpectralClustering': cluster.SpectralClustering,
                     'KMeans': cluster.KMeans}

non_n_cluster_methods = {'AffinityPropagation': cluster.AffinityPropagation,
                         'MeanShift': cluster.MeanShift}


cluster_methods = n_cluster_methods.copy()
cluster_methods.update(non_n_cluster_methods)


[docs]def determinist_cluster(dist_df, method, n_clusters): """ Clustering of the songs from the dataframe, indicating the number of clusters to use. :param pandas.DataFrame dist_df: :param str method: name of the sklearn.cluster. - cluster.AgglomerativeClustering. - cluster.SpectralClustering. - cluster.KMeans. :param int n_clusters: :return: pandas.DataFrame with a column with clusters. """ if not isinstance(dist_df, pd.DataFrame): dist_df = dist_df.to_df().T df_matrix = minmax_scale(dist_df) y = n_cluster_methods[method](n_clusters=n_clusters).fit_predict(df_matrix) cluster_df = pd.DataFrame(df_matrix, index=dist_df.index, columns=dist_df.columns) cluster_df['Cluster'] = pd.Series(y, index=cluster_df.index) return cluster_df
[docs]def automatic_cluster(dist_df, method): """ :param pd.DataFrame dist_df: :param str method: name of the sklearn.cluster. - cluster.AffinityPropagation. - cluster.MeanShift. - cluster.AgglomerativeClustering. - cluster.SpectralClustering. - cluster.KMeans. :return: pandas.DataFrame with a column with clusters. """ if not isinstance(dist_df, pd.DataFrame): dist_df = dist_df.to_df().T df_matrix = minmax_scale(dist_df) if method in n_cluster_methods.keys(): n_clusters = jump_method(dist_df=df_matrix) clf = n_cluster_methods[method](n_clusters=n_clusters) else: clf = non_n_cluster_methods[method]() y = clf.fit_predict(df_matrix) cluster_df = pd.DataFrame(df_matrix, index=dist_df.index, columns=dist_df.columns) cluster_df['Cluster'] = pd.Series(y, index=cluster_df.index) return cluster_df
[docs]def jump_method(dist_df, n_max=50): """ Method based on information theory to determine best number of clusters. :param pandas.DataFrame dist_df: :param int n_max: maximum number of clusters to test. :return: optimal number of clusters """ dim = dist_df.shape[0] if n_max > dim: n_max = dim Y = dim / 2 distortions = np.empty(n_max + 1) jump_vector = np.empty(n_max) distortions[0] = 0.0 for k in range(1, n_max + 1): kmean_model = cluster.KMeans(n_clusters=k).fit(dist_df) distortion = np.min(cdist(dist_df, kmean_model.cluster_centers_, 'euclidean').ravel()) / dim + eps distortions[k] = distortion**(- Y) jump_vector[k - 1] = distortions[k] - distortions[k - 1] n_cluster = np.argmax(jump_vector) + 1 # Avoiding let an instance alone instance_alone = True while instance_alone is True: y = cluster.KMeans(n_clusters=n_cluster).fit_predict(dist_df) group_member = [len(list(group)) for key, group in groupby(np.sort(y))] if np.min(group_member) > 1 or n_cluster == 2: instance_alone = False else: n_cluster -= 1 return n_cluster
def score_cluster(cluster_df): """ When `automatic_cluster` is used, then the clusters must be grouped into the categories we want into predict, in order to score our method. :param pandas.DataFrame cluster_df: :return: accuracy score. cluster_df have now `Cluster_corrected` column. """ accurate_class = [int(n[0][0]) for n in cluster_df.index.tolist()] accurate_class -= np.unique(accurate_class)[0] # Move to 0, 1, ... notation accurate_class = np.array(accurate_class, dtype=int) cluster_class = np.array(cluster_df['Cluster'].tolist(), dtype=int) # Find correspondences between given classes and cluster classes correspondence_dict = {} for p in np.unique(cluster_class): max_c = 0.0 pos_p = cluster_class == p for e in np.unique(accurate_class): pos_e = accurate_class == e c = (pos_p == pos_e).sum() if c > max_c: correspondence_dict.update({p: e}) max_c = c # Finding the accuracy cluster_class_corrected = [correspondence_dict[p] for p in cluster_class] cluster_df['Cluster_corrected'] = pd.Series(cluster_class_corrected, index=cluster_df.index) score_vector = [e == p_c for e, p_c in zip(accurate_class, cluster_class_corrected)] return np.average(score_vector) def party_list(song_df, song=None): """ A list of song of all the songs from the cluster dataframe sorted, from similarity between them. :param pandas.DataFrame song_df: :param str song: :return: """ song_df_rev = song_df.T if song is None or song not in song_df_rev.index: song = song_df_rev.index[0] # TODO: to implement final_index = list(song_df_rev.sort_values(song, axis='columns')[song].index) return final_index