# Source code for pml.unsupervised.pca

# Copyright (C) 2012 David Rusk
#
# Permission is hereby granted, free of charge, to any person obtaining a copy 
# of this software and associated documentation files (the "Software"), to 
# deal in the Software without restriction, including without limitation the 
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 
# sell copies of the Software, and to permit persons to whom the Software is 
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in 
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
# IN THE SOFTWARE.
"""
Implements principal component analysis (PCA) and related operations.

@author: drusk
"""

import numpy as np
import numpy.linalg as linalg
import pandas as pd

from pml.data import model
from pml.utils import plotting

class ReducedDataSet(model.DataSet):
    """
    A DataSet which has had dimensionality reduction performed on it.

    Columns are interpreted as features in the data set, and rows are
    observations.

    This dimensionally reduced data set has all of the observations of the
    original, but its features have been adjusted to be linear combinations
    of the originals.

    Those features with little variance may have been dropped during the
    dimensionality reduction process.  Use the percent_variance() method to
    find out how much of the original variance has been retained in the
    reduced features.
    """

    def __init__(self, data, sample_ids, labels, eigenvalues):
        """
        Creates a new ReducedDataSet.

        Args:
          data: numpy.array
            The raw array with the new data.
          sample_ids: list
            The ids for the samples (rows, observations) in the data set.
          labels: pandas.Series
            The labels, if any, provided for the observations.
          eigenvalues: numpy.array (1D)
            The list of eigenvalues produced to determine which components in
            the new feature space were most important.  This includes all of
            the eigenvalues, not just the ones for the components selected.
        """
        # wrap the raw array in a pandas DataFrame that keeps the original
        # row index, then let the parent DataSet take it from there
        dataframe = pd.DataFrame(data, index=sample_ids)
        super(ReducedDataSet, self).__init__(dataframe, labels=labels)

        # kept in full so that percent_variance() can compare the retained
        # components against the total
        self.eigenvalues = eigenvalues

    def percent_variance(self):
        """
        Calculates the percentage of the original DataSet's variance which is
        still present in this dimensionally reduced DataSet.

        Returns:
          A floating point number between 0.0 and 1.0 representing the
          percentage.
        """
        return _percent_variance(self.eigenvalues, self.num_features())
    
    
def _percent_variance(eigenvalues, num_components):
    """
    Calculates the percentage of total variance found in the top principal
    components.

    Args:
      eigenvalues: numpy.array (1D)
        The list of all eigenvalues for a data set.
      num_components: int
        The number of principal components which will be selected.  Zero
        selects nothing; a number larger than the number of eigenvalues
        selects all of them.

    Returns:
      The percentage of total variance for the top number of principal
      components selected.  This will be a floating point number between 0.0
      and 1.0.

    Raises:
      ValueError if num_components is negative.
    """
    if num_components < 0:
        raise ValueError("Number of components must not be negative: %d"
                         % num_components)

    # make sure eigenvalues are a numpy array (allows slicing and np.sort)
    eigenvalues = np.asarray(eigenvalues)

    # sorted smallest to largest
    ascending = np.sort(eigenvalues)

    # Take the largest values using an explicit start offset.  Slicing with
    # [-num_components:] would select *every* eigenvalue when num_components
    # is 0 (because -0 == 0), reporting 100% variance for zero components.
    start = max(len(ascending) - num_components, 0)
    selected = ascending[start:]

    return np.sum(selected) / np.sum(eigenvalues)

def _get_cov_mat_eigen_values_and_vectors(dataset):
    """
    Calculates the eigenvalues and eigenvectors for the covariance matrix of a
    DataSet.

    Args:
      dataset: model.DataSet
        The data whose covariance matrix will be calculated.

    Returns:
      eigenvalues: numpy.array
        A 1D array of the (real) eigenvalues of the covariance matrix.  No
        particular ordering should be relied upon; callers sort as needed.
      eigenvectors: numpy.array
        A 2D array of the eigenvectors of the covariance matrix.  Column i
        is the eigenvector belonging to eigenvalues[i].
    """
    # rowvar=False so that rows are interpreted as observations.  np.cov
    # collapses its result to a 0-d array when there is only one feature, so
    # promote it back to a (1 x 1) matrix for the decomposition below.
    cov_mat = np.atleast_2d(np.cov(dataset.get_data_frame(), rowvar=False))

    # A covariance matrix is symmetric, so use the symmetric solver.  The
    # general-purpose eig() can return complex eigenvalues with tiny
    # imaginary parts due to round-off, which then cannot be sorted.
    eigenvalues, eigenvectors = linalg.eigh(cov_mat)

    return eigenvalues, eigenvectors

def _copy_and_remove_means(dataset):
    """
    Produces a mean-centred copy of a DataSet, leaving the DataSet passed in
    untouched.

    Args:
      dataset: model.DataSet
        The DataSet to copy and remove means from.

    Returns:
      The new, copied DataSet with column means removed.
    """
    # remove_means works in place, so operate on a copy
    centred = dataset.copy()
    remove_means(centred)
    return centred

def _get_descending_cov_mat_eigenvalues(dataset):
    """
    Computes the covariance matrix eigenvalues of a DataSet, ordered from
    largest to smallest.

    Args:
      dataset: model.DataSet
        The data whose covariance matrix will be calculated.

    Returns:
      eigenvalues: list
        The list of eigenvalues in descending order of magnitude.
    """
    # the eigenvectors are not needed here, only the eigenvalues
    values = _get_cov_mat_eigen_values_and_vectors(dataset)[0]

    # plain Python list, largest first
    return sorted(values.tolist(), reverse=True)

def plot_pct_variance_per_principal_component(dataset, plot_type="bar"):
    """
    Generates a plot to visualize the percentage of variance captured
    by each principal component in the data set.

    Args:
      dataset: model.DataSet
        The data set whose principal components will be examined.  Should not
        already be reduced.
      plot_type: string
        The plot type to generate.  Supported plot types are:
          'bar': vertical bar chart
          'barh': horizontal bar chart
          'line': line chart
          Default is 'bar'.

    Returns:
      None, but produces a matplotlib plot.

    Raises:
      UnsupportedPlotTypeError if plot_type is not recognized.
    """
    # Fail early: check plot type here right away even though the plotting
    # module will check it later.  Don't want a user with a large data set to
    # wait for all the processing to occur only to find out they made a typo
    # on the plot type.
    plotting.verify_supported_series_plot_type(plot_type)

    variances = get_pct_variance_per_principal_component(dataset)
    plotting.plot_percent_series(variances, plot_type)

def get_pct_variance_per_principal_component(dataset):
    """
    Determines the percentage of variance captured by each principal component
    in the data set.

    Args:
      dataset: model.DataSet
        The data set whose principal components will be examined.  Should not
        already be reduced.

    Returns:
      variances: pandas.Series
        The percentage of variance (as a float between 0.0 and 1.0) for each
        principal component, ordered from the component with the most
        variance to the one with the least.
    """
    eigenvalues = _get_descending_cov_mat_eigenvalues(dataset)

    # each eigenvalue's share of the total
    return pd.Series(eigenvalues) / np.sum(eigenvalues)

def recommend_num_components(dataset, min_pct_variance=0.9):
    """
    Recommends the number of principal components that should be selected in
    order to keep a minimum specified percentage of the original data's
    variance while also minimizing dimensionality.

    Args:
      dataset: model.DataSet
        The dataset in question.
      min_pct_variance: float
        The minimum percent of variance which should be maintained when
        selecting the recommended number of principal components.  Should be
        between 0.0 and 1.0.
        Defaults to 0.9 (i.e. 90%).

    Returns:
      The integer number of principal components which should be selected for
      Principal Component Analysis.

    Raises:
      ValueError if min_pct_variance is < 0 or > 1, or is not a number (NaN).
    """
    # Written as a negated chained comparison so that NaN is rejected too:
    # both "nan < 0" and "nan > 1" are False and would slip through.
    if not (0 <= min_pct_variance <= 1):
        raise ValueError("Invalid minimum percent variance "
                         "(must be between 0 and 1): %f" %min_pct_variance)

    dataset = _copy_and_remove_means(dataset)
    eigenvalues = _get_descending_cov_mat_eigenvalues(dataset)

    cumulative_pct_variance = np.cumsum(eigenvalues) / np.sum(eigenvalues)

    for num_components, pct_variance in enumerate(cumulative_pct_variance,
                                                  1):
        if pct_variance >= min_pct_variance:
            return num_components

    # Mathematically the last cumulative value is 100%, but cumsum and sum
    # accumulate in different orders, so round-off can leave it a hair below
    # 1.0.  Using every component is then the right answer, rather than
    # silently returning None.
    return len(eigenvalues)

def remove_means(dataset):
    """
    Remove the column mean from each value in the dataset.

    For example, if a certain column has values [1, 2, 3], the column mean is
    2.  When the column means are removed, that column will then have the
    values [-1, 0, 1].

    NOTE: the modifications are made in place in dataset.

    Args:
      dataset: model.DataSet
        The dataset to remove the column means from.
    """
    # all means are computed up front, before any column is modified
    column_means = dataset.reduce_features(np.mean)

    for feature in dataset.feature_list():
        # subtract the scalar mean from the whole column at once instead of
        # mapping a per-element function over it
        centred_column = dataset.get_column(feature) - column_means[feature]
        dataset.set_column(feature, centred_column)

def pca(dataset, num_components):
    """
    Performs Principal Component Analysis (PCA) on a dataset.

    The dataset passed in is not modified; the work is done on a copy with
    the column means removed.

    Args:
      dataset: model.DataSet
        The dataset to be analysed.
      num_components: int
        The number of principal components to select.

    Returns:
      A ReducedDataSet holding the observations projected onto the selected
      principal components, along with the original sample ids and labels and
      the complete list of eigenvalues (not only the selected ones).
    """
    dataset = _copy_and_remove_means(dataset)
    eigenvalues, eigenvectors = _get_cov_mat_eigen_values_and_vectors(dataset)

    # indices of the eigenvalues ordered largest to smallest
    descending_indices = np.argsort(eigenvalues)[::-1]

    # take the top N; each index picks out one eigenvector column
    selected_indices = descending_indices[:num_components]

    # transform the data into the new space created by the top N eigenvectors
    transformed_data = np.dot(dataset.get_data_frame(),
                              eigenvectors[:, selected_indices])

    return ReducedDataSet(transformed_data, dataset.get_sample_ids(),
                          dataset.get_labels(), eigenvalues)
Source code for pml.unsupervised.pca

Project Versions

This Page

Navigation

Source code for pml.unsupervised.pca

Project Versions

RTD Search

This Page

Quick search

Navigation