PCA and Data Whitening

In [1]:
import numpy as np
#from numpy import *
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import FancyArrowPatch
from mpl_toolkits.mplot3d import proj3d
%matplotlib inline

Eigenvectors and eigenvalues

The eigenvalue equation:

$$ W \vec x = \lambda \vec x $$
  • a vector $\vec x$ with this property is called an eigenvector of $W$
  • $\lambda$ is the corresponding eigenvalue
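A minimal sketch that checks this equation numerically with NumPy (the 2x2 matrix W here is just an illustrative example, not taken from the data below):

W = np.array([[2., 1.],
              [1., 2.]])
lam, vecs = np.linalg.eig(W)   # eigenvalues and eigenvectors (columns of vecs)
x = vecs[:, 0]                 # first eigenvector
np.testing.assert_array_almost_equal(W.dot(x), lam[0] * x)   # W x == lambda x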

(Random) data

In [2]:
cov = np.array([[2.0, 0.8], [0.8, 0.6]])
mean = np.array([1., 1.])

rd = np.random.multivariate_normal(mean, cov, 50)

def plot_data():
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.scatter(rd[...,0], rd[...,1], c='b', marker='x', label="data")
    ax.set_xlim(-2.,4.)
    ax.set_ylim(-2.,4.)
    ax.set_xlabel('x_values')
    ax.set_ylabel('y_values')
    
plot_data()
In [3]:
mean_empirical = rd.mean(axis=0)
# here you see that the empirical mean is different from the "true" mean.
mean_empirical
Out[3]:
array([ 0.84778864,  0.92297188])

Covariance matrix

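The cell below implements the unbiased estimator of the covariance matrix:

$$ \hat\Sigma = \frac{1}{n-1} \sum_{i=1}^{n} (\vec x_i - \bar{\vec x})(\vec x_i - \bar{\vec x})^T $$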
In [4]:
n = rd.shape[0]

# definition
cov_mat = 1./(n-1)*(rd - mean_empirical).T.dot((rd - mean_empirical))

cov_mat_np = np.cov(rd, rowvar=0)
In [5]:
np.testing.assert_array_almost_equal(cov_mat, cov_mat_np)

# one multiplication / renormalization step of a unit start vector
x = np.array([1.,-1.])
x = x/np.linalg.norm(x)

x = cov_mat.dot(x)
x = x/np.linalg.norm(x)
In [6]:
x = np.array([1.,-1.])
x = x/np.linalg.norm(x) 

Repeated multiplication of a (renormalized) vector with the covariance matrix:

In [7]:
plt.xlim(-1.1,1.1)
plt.ylim(-1.1,1.1)
for i in range(10):
    plt.arrow(0,0, x[0], x[1], alpha=0.5)
    x = cov_mat.dot(x)
    x = x/np.linalg.norm(x)    

The vector converges to a fixed point. This fixed-point vector is the eigenvector corresponding to the largest eigenvalue (this iterative scheme is known as power iteration).
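A minimal sketch of this procedure as a reusable function (the function name and iteration count are my own choices, not from the notebook):

def power_iteration(A, num_iter=100):
    """Return the dominant eigenvector and eigenvalue of a symmetric matrix A."""
    x = np.random.rand(A.shape[0])
    for _ in range(num_iter):
        x = A.dot(x)
        x = x / np.linalg.norm(x)
    # the Rayleigh quotient x^T A x (with ||x|| = 1) gives the corresponding eigenvalue
    return x, x.dot(A.dot(x))

# vec, val = power_iteration(cov_mat)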

In [8]:
eig_val, eig_vector_matrix = np.linalg.eig(cov_mat)
eig_vector_1 = eig_vector_matrix[:,0]
eig_val_1 = eig_val[0]
eig_vector_2 = eig_vector_matrix[:,1]
eig_val_2 = eig_val[1]
eig_vector_1
Out[8]:
array([ 0.91575757,  0.40173133])

From the eigenvalue equation: $$ \frac{W \vec x}{\lambda} = \vec x $$

In [9]:
x = eig_vector_1
print(x)
# this does not change x:
x = cov_mat.dot(x)
x = x/eig_val[0]
print(x)
[ 0.91575757  0.40173133]
[ 0.91575757  0.40173133]

For the visualization we take the square root of the eigenvalues: in the new coordinate system the covariance matrix is diagonal, and the eigenvalues correspond to the variances along the principal axes. Since the standard deviation is more intuitive for visualization, we scale the eigenvectors by the square root of the variance.
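In matrix form, with $V$ the orthogonal matrix of eigenvectors (as columns) and $\Lambda = \operatorname{diag}(\lambda_1, \lambda_2)$, this follows directly from the eigenvalue equation:

$$ \hat\Sigma V = V \Lambda \quad\Rightarrow\quad V^T \hat\Sigma V = \Lambda $$

so the covariance matrix in the eigenbasis is diagonal with the variances $\lambda_i$ on the diagonal.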

In [10]:
plot_data()
s1 = np.sqrt(eig_val_1) 
s2 = np.sqrt(eig_val_2)
plt.arrow(mean_empirical[0],mean_empirical[1], s1 * eig_vector_1[0], s1 * eig_vector_1[1], alpha=1., color='r')
plt.arrow(mean_empirical[0],mean_empirical[1], s2 * eig_vector_2[0], s2 * eig_vector_2[1], alpha=1., color='g')
Out[10]:
<matplotlib.patches.FancyArrow at 0x10d8b2ad0>

PCA (Principal Component Analysis)

In [11]:
# helper class to draw arrows in 3D axes (based on the common FancyArrowPatch recipe)
class Arrow3D(FancyArrowPatch):
    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))
        FancyArrowPatch.draw(self, renderer)

        
        
# class 0:
# covariance matrix and mean
cov0 = np.array([[5.,  -2., 1.], 
                 [-2., 3., 1.], 
                 [1.,  1., 9.]])
                 
mean0 = np.array([2.,-1.,3])

#cov0 = np.array([[5.,0.,0], [0., 1., 0], [0, 0, 3.]])
#mean0 = np.array([0.,0.,0.])
#number of data points
m0 = 1000

# generate m0 gaussian distributed data points with
# mean0 and cov0.
rd = np.random.multivariate_normal(mean0, cov0, m0)

mean = rd.mean(axis=0)
cov_mat = np.cov(rd, rowvar=0)
In [12]:
# note: if performance is an issue, use the scipy implementation instead
# (e.g. scipy.linalg.eigh for symmetric matrices)
eigenvalues, eigenvectors = np.linalg.eig(cov_mat)


# check that the eigenvalue equation holds: A * x == lambda * x
for eva, eve in zip(eigenvalues, eigenvectors.T):
    np.testing.assert_array_almost_equal(cov_mat.dot(eve), 
                                         eva * eve, 
                                         decimal=6, err_msg='', verbose=True)
# the same, but compact in matrix form
np.testing.assert_array_almost_equal(cov_mat.dot(eigenvectors), eigenvalues * eigenvectors)   
In [13]:
    
#scaled eigenvectors for visualization
sev = eigenvectors * np.sqrt(eigenvalues)    

#from sklearn import decomposition
#pca = decomposition.PCA(n_components=3)
#pca.fit(rd)
#sev = pca.components_.T * 3.
In [14]:
#%matplotlib notebook
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(rd[...,0], rd[...,1], rd[...,2], c='b', marker='x', label="raw data")

for s in sev.T:
    a = Arrow3D([mean[0], s[0]+mean[0]], [mean[1], s[1]+mean[1]], [mean[2], s[2]+mean[2]], mutation_scale=20, lw=3, arrowstyle="-|>", color="r")     
    ax.add_artist(a)
    
ax.set_xlabel('x_values')
ax.set_ylabel('y_values')
ax.set_zlabel('z_values')
ax.set_xlim(-9.,9.)
ax.set_ylim(-9.,9.)
ax.set_zlim(-9.,9.)
plt.title('Eigenvectors')
plt.draw()
plt.show()
In [15]:
#plt.close('all')
# the eigenvectors computed by numpy are normed to length 1
# (they are the columns of the eigenvector matrix)
for ev in eigenvectors.T:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
 
In [16]:
   

# sort the eigenvectors and eigenvalues 
idx = eigenvalues.argsort()[::-1]   
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:,idx]
        
# Now we change the orthonormal basis of the coordinate system
# Note that the scalar product of two vectors can be interpreted as a projection
# of one of the vectors on the other.
rd_new_basis = (rd-mean).dot(eigenvectors)
# rd_new_basis are the data points in the new basis
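As a sanity check (a small sketch using the variables from the cell above), the covariance matrix of the data in the new basis should be approximately diagonal, with the sorted eigenvalues on the diagonal:

# covariance in the eigenbasis: V^T * Sigma * V == diag(eigenvalues)
np.testing.assert_array_almost_equal(np.cov(rd_new_basis, rowvar=0),
                                     np.diag(eigenvalues))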
In [17]:
# Plot the data in the new space:
#plt.close('all') # close all latent plotting windows

fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(rd_new_basis[...,0], rd_new_basis[...,1], rd_new_basis[...,2], c='b', marker='x', label="projected data")

for s in (5. * np.eye(3)):
    print(s)
    a = Arrow3D([0., s[0]], [0., s[1]], [0., s[2]], mutation_scale=20, lw=3, arrowstyle="-|>", color="r")
     
    ax.add_artist(a)
    
ax.set_xlim(-9.,9.)
ax.set_ylim(-9.,9.)
ax.set_zlim(-9.,9.)
ax.set_xlabel('new_x_values')
ax.set_ylabel('new_y_values')
ax.set_zlabel('new_z_values')

plt.title('Data in Eigenspace.')
plt.draw()
plt.show()
[ 5.  0.  0.]  
[ 0.  5.  0.]  
[ 0.  0.  5.]  
In [18]:
%matplotlib inline
###
# Most of the variance is captured by the first two principal components,
# so we can drop the last dimension (the direction with the smallest variance)

rd_new_basis = (rd-mean).dot(eigenvectors[...,:2])

# Whitening
# scale by the inverse of the sqrt of the eigenvalues
rd_new_basis = rd_new_basis / np.sqrt(eigenvalues[:2])


plt.close('all') # close all latent plotting windows
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)
ax.scatter(rd_new_basis[...,0], rd_new_basis[...,1], c='b', marker='x', label="projected data")
ax.set_xlim(-4.,4.)
ax.set_ylim(-4.,4.)
ax.set_xlabel('new x-values')
ax.set_ylabel('new y-values')
plt.draw()
plt.title("Whitened data")
plt.show()
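As a quick check (a sketch using the variables from the whitening cell above), the whitened data should have an approximately identity covariance matrix:

# after whitening, each retained component has unit variance and zero correlation
np.testing.assert_array_almost_equal(np.cov(rd_new_basis, rowvar=0), np.eye(2))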

Using matplotlib's PCA

In [19]:
# the same with matplotlib's mlab PCA class
plt.close('all') # close all latent plotting windows

from matplotlib.mlab import PCA
# note: matplotlib.mlab.PCA has been removed in recent matplotlib versions
results = PCA(rd)
# fracs contains the fraction of variance explained by each component
results.fracs
# Y contains the data projected into PCA space
r = results.Y

fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(r[...,0], r[...,1], r[...,2], c='b', marker='x', label="raw data")
ax.set_xlim(-5.,5.)
ax.set_ylim(-5.,5.)
ax.set_zlim(-5.,5.)
ax.set_xlabel('x_values')
ax.set_ylabel('y_values')
ax.set_zlabel('z_values')
plt.title('Data in Eigenspace.')
plt.draw()
plt.show()
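On newer matplotlib versions, where matplotlib.mlab.PCA is no longer available, the same projection can be obtained with scikit-learn (a sketch, assuming scikit-learn is installed):

from sklearn import decomposition

pca = decomposition.PCA(n_components=3)
r_sk = pca.fit_transform(rd)            # data projected onto the principal components
print(pca.explained_variance_ratio_)    # analogous to results.fracs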

Assumptions of PCA

  1. Linearity
  2. Large variances represent important structure, i.e. the data has a high signal-to-noise ratio (SNR)
  3. Principal components are orthogonal

see [Shl]