import numpy as np
import pandas as pd
from scipy import spatial
from sklearn import preprocessing
from sklearn.cluster import KMeans
from . import amath
def pca_contours(contours):
    """
    Return the principal directions and components of the contours.

    Thin wrapper around ``amath.pca`` called with the ``'eig'`` method;
    the third value returned by ``amath.pca`` is discarded.

    Parameters
    ----------
    contours : ndarray
        Object contours, with shape (num_contour, 2*num_points).

    Returns
    -------
    principal_directions : ndarray
        Loadings, weights, principal directions, principal axes,
        eigenvector of covariance matrix of mean-subtracted contours,
        with shape (2*num_points, 2*num_points).
    principal_components : ndarray
        PC score, principal components, coordinates of mean-subtracted
        contours in their principal directions, with shape
        (num_contours, 2*num_points).

    See Also
    --------
    vampire.amath.pca : Implementation of principal component analysis.
    """
    # The third return value of amath.pca is not part of this function's
    # contract, so it is intentionally dropped.
    principal_directions, principal_components, _ = amath.pca(contours, 'eig')
    return principal_directions, principal_components
def cluster_contours(pc, contours, num_clusters=5, num_pc=20, random_state=None):
    """
    K-means clustering of contour principal components.

    The first `num_pc` principal components are scaled with
    ``sklearn.preprocessing.normalize`` and used to fit K-means. Each
    object is then tagged with the id of the centroid closest to its
    *un-normalized* truncated principal components.

    Parameters
    ----------
    pc : ndarray
        Principal components of contours.
    contours : ndarray
        Object contours, with shape (num_contour, 2*num_points).
    num_clusters : int, optional
        Number of clusters.
    num_pc : int, optional
        Number of principal components used for approximation.
    random_state : None or int, optional
        Random state for K-means clustering.

    Returns
    -------
    contours_df : DataFrame
        DataFrame of objects' contour coordinates with cluster id
        stored in the ``'cluster_id'`` column.
    centroids : ndarray
        Coordinates of cluster centers of K-means clusters.

    See Also
    --------
    sklearn.cluster.KMeans : Implementation of K-means clustering.
    """
    pc_truncated = pc[:, :num_pc]
    pc_truncated_normalized = preprocessing.normalize(pc_truncated)

    # k-means clustering of normalized principal coordinates
    k_means = KMeans(
        n_clusters=num_clusters,
        random_state=random_state,
        init='k-means++',
        n_init=3,
        max_iter=300,
    ).fit(pc_truncated_normalized)
    centroids = k_means.cluster_centers_

    # NOTE(review): cluster ids are assigned from the un-normalized
    # truncated PCs, although the centroids were fitted on the normalized
    # ones. assign_clusters_id() uses the same convention, so this is kept
    # as-is for backward compatibility -- confirm whether the normalized
    # coordinates were intended before changing either function.
    distance = spatial.distance.cdist(pc_truncated, centroids)
    cluster_id = np.argmin(distance, axis=1)

    # tag each object with cluster id
    contours_df = pd.DataFrame(contours)
    contours_df['cluster_id'] = cluster_id
    return contours_df, centroids
def assign_clusters_id(pc, contours, centroids, num_pc=20):
    """
    Assign the contours with id of the closest centroid.

    Only the first `num_pc` principal components of each object are
    compared against `centroids`, using the default metric of
    ``scipy.spatial.distance.cdist``.

    Parameters
    ----------
    pc : ndarray
        Principal components of contours.
    contours : ndarray
        Object contours, with shape (num_contour, 2*num_points).
    centroids : ndarray
        Coordinates of cluster centers of K-means clusters. Must have as
        many columns as the truncated principal components, otherwise
        ``cdist`` raises.
    num_pc : int, optional
        Number of principal components used for approximation.

    Returns
    -------
    contours_df : DataFrame
        DataFrame of objects' contour coordinates with cluster id
        stored in the ``'cluster_id'`` column.
    min_distance : ndarray
        Distance of truncated principal components to the closest centroid.
    """
    # find closest centroid and get cluster id
    pc_truncated = pc[:, :num_pc]
    # distance[i, j] is the distance from object i to centroid j
    distance = spatial.distance.cdist(pc_truncated, centroids)
    cluster_id = np.argmin(distance, axis=1)
    min_distance = np.min(distance, axis=1)

    # tag each object with cluster id
    contours_df = pd.DataFrame(contours)
    contours_df['cluster_id'] = cluster_id
    return contours_df, min_distance