import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import cluster, spatial
from sklearn import preprocessing
from sklearn.cluster import KMeans
def get_explained_variance_ratio(explained_variance):
    """
    Return the fraction of total variance explained by each component.

    Parameters
    ----------
    explained_variance : ndarray
        Variance explained by each principal component.

    Returns
    -------
    explained_variance_ratio : ndarray
        ``explained_variance`` divided by its sum.
    """
    total_variance = np.sum(explained_variance)
    return explained_variance / total_variance
def get_cum_explained_variance_ratio(explained_variance_ratio):
    """
    Return the running total of the explained variance ratio.

    Parameters
    ----------
    explained_variance_ratio : ndarray
        Fraction of variance explained by each principal component.

    Returns
    -------
    cum_explained_variance_ratio : ndarray
        Cumulative sum of ``explained_variance_ratio``.
    """
    return np.cumsum(explained_variance_ratio)
def get_optimal_n_pcs(cum_explained_variance_ratio, ratio=0.95):
    """
    Return the number of principal components needed to reach `ratio`.

    Parameters
    ----------
    cum_explained_variance_ratio : ndarray
        Cumulative explained variance ratio of the principal components.
    ratio : float, optional
        Target cumulative explained variance ratio.

    Returns
    -------
    n_pcs : int
        One more than the number of entries strictly below `ratio`,
        capped at the number of available components.
    """
    n_available = len(cum_explained_variance_ratio)
    # entries still below the target, plus the one that reaches it
    n_needed = np.sum(cum_explained_variance_ratio < ratio) + 1
    # never ask for more components than exist
    return min(n_needed, n_available)
def cluster_contours(pc, n_clusters=5, n_pcs=20, random_state=None):
    """
    Run K-means on the leading principal components of contours.

    Parameters
    ----------
    pc : ndarray
        Principal components of contours.
    n_clusters : int, optional
        Number of clusters.
    n_pcs : int, optional
        Number of leading principal components kept for clustering.
    random_state : None or int, optional
        Random state passed to K-means.

    Returns
    -------
    cluster_id_df : DataFrame
        One row per object, with columns ``cluster_id`` (index of the
        nearest centroid) and ``distance_to_centroid`` (distance to it).
    centroids : ndarray
        Coordinates of the K-means cluster centers.
    inertia : float
        Sum of squared distances of samples to their closest cluster center.

    See Also
    --------
    sklearn.cluster.KMeans : Implementation of K-means clustering.
    """
    # keep the first n_pcs columns and normalize them with the
    # defaults of sklearn's preprocessing.normalize
    features = preprocessing.normalize(pc[:, :n_pcs])

    model = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=3,
        max_iter=300,
        random_state=random_state,
    )
    model.fit(features)
    centroids = model.cluster_centers_
    inertia = model.inertia_

    # label every object with its nearest centroid and the distance to it
    distance_to_centroids = spatial.distance.cdist(features, centroids)
    cluster_id_df = pd.DataFrame({
        'cluster_id': np.argmin(distance_to_centroids, axis=1),
        'distance_to_centroid': np.min(distance_to_centroids, axis=1),
    })
    return cluster_id_df, centroids, inertia
def assign_clusters_id(pc, contours, centroids, n_pcs=20):
    """
    Label each contour with the id of its closest centroid.

    Parameters
    ----------
    pc : ndarray
        Principal components of contours.
    contours : ndarray
        Object contours, with shape (n_contours, 2*n_points).
    centroids : ndarray
        Coordinates of cluster centers of K-means clusters.
    n_pcs : int, optional
        Number of leading principal components used for the comparison.

    Returns
    -------
    contours_df : DataFrame
        One row per contour, with columns ``normalized_contour``,
        ``cluster_id``, ``plot_cluster_id`` (``cluster_id + 1``), and
        ``distance_to_centroid``.
    """
    # The original VAMPIRE GUI software skipped normalization at this
    # step. It is applied here so that clustering and assignment see
    # the same kind of input, and data used to build the clusters is
    # assigned back to the same clusters.
    features = preprocessing.normalize(pc[:, :n_pcs])

    # nearest centroid and the distance to it
    distance_to_centroids = spatial.distance.cdist(features, centroids)
    nearest_id = np.argmin(distance_to_centroids, axis=1)
    nearest_distance = np.min(distance_to_centroids, axis=1)

    contours_df = pd.DataFrame({
        'normalized_contour': list(contours),
        'cluster_id': nearest_id,
        'plot_cluster_id': nearest_id + 1,  # one-based ids for plotting
        'distance_to_centroid': nearest_distance,
    })
    return contours_df
def get_labeled_contours_df(contours, cluster_id_df):
    """
    Combine contour coordinates with their cluster labels.

    Parameters
    ----------
    contours : ndarray
        Object contours, with shape (n_contours, 2*n_points).
    cluster_id_df : DataFrame
        DataFrame of objects' cluster id and min distance to centroid.

    Returns
    -------
    labeled_contours_df : DataFrame
        Contour coordinates (one column per coordinate) followed by the
        columns of `cluster_id_df`, joined on the row index.
    """
    contours_df = pd.DataFrame(contours)
    labeled_contours_df = contours_df.join(cluster_id_df)
    return labeled_contours_df
def get_mean_cluster_contours(labeled_contours_df):
    """
    Average the contour coordinates within each cluster.

    Parameters
    ----------
    labeled_contours_df : DataFrame
        DataFrame of contour coordinates, cluster id, and min
        distance to centroid.

    Returns
    -------
    mean_cluster_contours : ndarray
        Mean contour of each cluster, one row per ``cluster_id`` group.
    """
    # the distance column is not a coordinate, so leave it out of the mean
    coordinates_with_id = labeled_contours_df.drop(
        columns=['distance_to_centroid']
    )
    cluster_means = coordinates_with_id.groupby('cluster_id').mean()
    return cluster_means.values
def hierarchical_cluster_contour(labeled_contours_df):
    """
    Compute data structure for rendering dendrogram.

    Nothing is drawn: the dendrogram layout is computed with
    ``no_plot=True``, so the caller's matplotlib figures are left alone.

    Parameters
    ----------
    labeled_contours_df : DataFrame
        DataFrame of contour coordinates, cluster id, and min
        distance to centroid.

    Returns
    -------
    pair_distance : ndarray
        Pairwise distance of mean cluster contour.
        Result of `scipy.spatial.distance.pdist`.
    linkage_matrix : ndarray
        Linkage matrix for dendrogram.
        Result of `scipy.cluster.hierarchy.linkage`.
    branches : dict
        A dictionary of data structures computed to render the dendrogram.
        Result of `scipy.cluster.hierarchy.dendrogram`.

    See Also
    --------
    scipy.spatial.distance.pdist
    scipy.cluster.hierarchy.linkage
    scipy.cluster.hierarchy.dendrogram
    """
    mean_cluster_contours = get_mean_cluster_contours(labeled_contours_df)
    pair_distance = spatial.distance.pdist(mean_cluster_contours, 'euclidean')
    linkage_matrix = cluster.hierarchy.linkage(pair_distance, method='complete')
    # Only the computed layout is needed. Plotting and then closing the
    # current figure would draw onto, and close, whatever figure the
    # caller had active.
    branches = cluster.hierarchy.dendrogram(
        linkage_matrix,
        p=0,
        truncate_mode='lastp',
        orientation='bottom',
        above_threshold_color='k',
        no_plot=True,
    )
    return pair_distance, linkage_matrix, branches
def get_cluster_order(branches):
    """
    Read the dendrogram leaf order as integer cluster ids.

    Parameters
    ----------
    branches : dict
        Output of ``scipy.cluster.hierarchy.dendrogram``.

    Returns
    -------
    object_index : ndarray
        The ``'ivl'`` leaf labels of `branches` converted to integers,
        in dendrogram order.

    See Also
    --------
    scipy.cluster.hierarchy.dendrogram
    """
    leaf_labels = branches['ivl']
    return np.array(leaf_labels, dtype=int)
def get_distribution(properties_df):
    """
    Return the fraction of objects that fall in each cluster.

    Parameters
    ----------
    properties_df : DataFrame
        DataFrame containing column `cluster_id`.

    Returns
    -------
    distribution : ndarray
        Proportion of each cluster id present in `properties_df`,
        ordered by ascending cluster id.
    """
    labels = properties_df['cluster_id'].values
    # np.unique sorts the ids; only the per-id counts are needed here
    _, counts = np.unique(labels, return_counts=True)
    return counts / np.sum(counts)
def get_shannon_entropy(distribution):
    r"""
    Calculate Shannon entropy from distribution (probability)
    of each shape mode.

    Parameters
    ----------
    distribution : array_like
        Distribution of shape modes.

    Returns
    -------
    entropy : float
        Shannon entropy.

    See Also
    --------
    vampire.analysis.get_distribution

    Notes
    -----
    Shannon entropy here is defined as

    .. math::

        S = - \sum p_i \ln (p_i)

    where :math:`p_i` is probability of cells in each
    shape mode. Shape modes with :math:`p_i = 0` contribute nothing
    to the sum (the usual :math:`0 \ln 0 = 0` convention).
    """
    probabilities = np.asarray(distribution, dtype=float)
    # Skip exact zeros: 0 * log(0) would evaluate to nan and poison the
    # whole sum. Other invalid values (e.g. negatives) are kept so they
    # still surface as nan instead of being silently ignored.
    occupied = probabilities[probabilities != 0]
    entropy = -np.sum(occupied * np.log(occupied))
    return entropy
def reorder_clusters(cluster_id, object_index):
    """
    Relabel cluster ids to follow the dendrogram order.

    Parameters
    ----------
    cluster_id : ndarray
        Cluster ids.
    object_index : ndarray
        Original cluster ids listed in their new order; the id at
        position ``i`` is relabeled as ``i``.

    Returns
    -------
    cluster_id_sorted : ndarray
        Reordered cluster id. Ids absent from `object_index` keep the
        initial value of zero.
    """
    cluster_id_sorted = np.zeros_like(cluster_id)
    for new_id, old_id in enumerate(object_index):
        # masks come from the untouched input, so writes never interfere
        cluster_id_sorted[cluster_id == old_id] = new_id
    return cluster_id_sorted
def reorder_centroids(centroids, object_index):
    """
    Arrange centroid rows to follow the dendrogram order.

    Parameters
    ----------
    centroids : ndarray
        Centroids, one row per cluster.
    object_index : ndarray
        Original cluster ids listed in their new order.

    Returns
    -------
    reordered_centroids : ndarray
        Rows of `centroids` selected in the order given by `object_index`.
    """
    reordered_centroids = centroids[object_index, :]
    return reordered_centroids