# Source code for vampire.analysis

import numpy as np
import pandas as pd
from scipy import spatial
from sklearn import preprocessing
from sklearn.cluster import KMeans

from . import amath


def pca_contours(contours):
    """
    Run principal component analysis on the object contours.

    Thin wrapper around ``amath.pca`` called with the ``'eig'`` method
    argument. That call returns three values; only the first two are
    passed on to the caller.

    Parameters
    ----------
    contours : ndarray
        Object contours, with shape (num_contours, 2*num_points).

    Returns
    -------
    principal_directions : ndarray
        Loadings (weights, principal axes): eigenvectors of the covariance
        matrix of the mean-subtracted contours,
        with shape (2*num_points, 2*num_points).
    principal_components : ndarray
        PC scores: coordinates of the mean-subtracted contours in their
        principal directions, with shape (num_contours, 2*num_points).

    See Also
    --------
    vampire.amath.pca : Implementation of principal component analysis.

    """
    # The third return value of amath.pca is not part of this function's
    # contract, so it is discarded.
    directions, scores, _ = amath.pca(contours, 'eig')
    return directions, scores


def pca_transform_contours(contours, mean_contour, principal_directions):
    """
    Project contours onto previously computed principal directions.

    The contours are mean-centered with ``mean_contour`` and then
    matrix-multiplied by ``principal_directions``.

    Parameters
    ----------
    contours : ndarray
        Object contours, with shape (num_contours, 2*num_points).
    mean_contour : ndarray
        Mean contour subtracted from every object contour.
    principal_directions : ndarray
        Loadings (weights, principal axes): eigenvectors of the covariance
        matrix of the mean-subtracted contours,
        with shape (2*num_points, 2*num_points).

    Returns
    -------
    principal_components : ndarray
        PC scores: coordinates of the mean-subtracted contours in the given
        principal directions, with shape (num_contours, 2*num_points).

    """
    # Center first, then change basis; each output row is one contour's score.
    return (contours - mean_contour) @ principal_directions


def cluster_contours(pc, contours, num_clusters=5, num_pc=20, random_state=None):
    """
    Cluster contours by K-means on their leading principal components.

    The first ``num_pc`` columns of ``pc`` are normalized with
    ``sklearn.preprocessing.normalize`` (default settings) and K-means is
    fitted on the normalized rows. Cluster ids are then assigned by nearest
    centroid, measuring distances from the *un-normalized* truncated
    principal components to the fitted centroids.

    Parameters
    ----------
    pc : ndarray
        Principal components of contours.
    contours : ndarray
        Object contours, with shape (num_contours, 2*num_points).
    num_clusters : int, optional
        Number of clusters.
    num_pc : int, optional
        Number of leading principal components kept for clustering.
    random_state : None or int, optional
        Random state for K-means clustering.

    Returns
    -------
    contours_df : DataFrame
        DataFrame of objects' contour coordinates with an added
        ``cluster_id`` column.
    centroids : ndarray
        Coordinates of cluster centers of K-means clusters.

    See Also
    --------
    sklearn.cluster.KMeans : Implementation of K-means clustering.

    """
    leading_pc = pc[:, :num_pc]
    leading_pc_normalized = preprocessing.normalize(leading_pc)

    # Fit K-means on the normalized leading principal components.
    model = KMeans(
        n_clusters=num_clusters,
        init='k-means++',
        n_init=3,
        max_iter=300,
        random_state=random_state,
    )
    model.fit(leading_pc_normalized)
    centroids = model.cluster_centers_

    # NOTE(review): the centroids live in normalized space, yet distances are
    # measured from the un-normalized components. assign_clusters_id uses the
    # same un-normalized convention, so this is kept as-is; confirm the intent
    # before switching to the normalized components.
    distance_to_centroids = spatial.distance.cdist(leading_pc, centroids)
    nearest_centroid = distance_to_centroids.argmin(axis=1)

    # Tag each object with the id of its nearest centroid.
    contours_df = pd.DataFrame(contours)
    contours_df['cluster_id'] = nearest_centroid

    return contours_df, centroids


def assign_clusters_id(pc, contours, centroids, num_pc=20):
    """
    Label each contour with the id of its nearest centroid.

    Distances are computed with ``scipy.spatial.distance.cdist`` between the
    first ``num_pc`` columns of ``pc`` and ``centroids``; the id is the row
    index of the closest centroid.

    Parameters
    ----------
    pc : ndarray
        Principal components of contours.
    contours : ndarray
        Object contours, with shape (num_contours, 2*num_points).
    centroids : ndarray
        Coordinates of cluster centers of K-means clusters.
    num_pc : int, optional
        Number of leading principal components compared against the
        centroids.

    Returns
    -------
    contours_df : DataFrame
        DataFrame of objects' contour coordinates with an added
        ``cluster_id`` column.
    min_distance : ndarray
        Distance of truncated principal components to the closest centroid.

    """
    # One row per object, one column per centroid.
    distance_to_centroids = spatial.distance.cdist(pc[:, :num_pc], centroids)
    nearest_centroid = distance_to_centroids.argmin(axis=1)
    min_distance = distance_to_centroids.min(axis=1)

    # Tag each object with the id of its nearest centroid.
    contours_df = pd.DataFrame(contours)
    contours_df['cluster_id'] = nearest_centroid
    return contours_df, min_distance