In this tutorial we will look into the Numpy library: http://www.numpy.org/
Numpy is a fundamental library for numerical computation and matrix manipulation. It offers much of the functionality of Matlab, and some of the functionality of Pandas.
We will also use the Scipy library for scientific computation: https://docs.scipy.org/doc/scipy/reference/
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import scipy.sparse as sp_sparse
import scipy.spatial.distance as sp_dist
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import scipy.sparse.linalg as linalg
To demonstrate the benefits of using numpy we will perform the addition of two vectors in three different ways: one using lists, one using numpy arrays with a for loop, and one using numpy's built-in addition operator.
import time

def trad_version():
    t1 = time.time()
    X = range(10000000)
    Y = range(10000000)
    Z = [x+y for x,y in zip(X,Y)]
    return time.time() - t1

def naive_numpy_version():
    t1 = time.time()
    X = np.arange(10000000)
    Y = np.arange(10000000)
    Z = np.zeros(10000000)
    for i in range(10000000):
        Z[i] = X[i]+Y[i]
    return time.time() - t1

def numpy_version():
    t1 = time.time()
    X = np.arange(10000000)
    Y = np.arange(10000000)
    Z = X + Y
    return time.time() - t1
traditional_time = trad_version()
naive_numpy_time = naive_numpy_version()
numpy_time = numpy_version()
print ("Traditional time = "+ str(traditional_time))
print ("Naive numpy time = "+ str(naive_numpy_time))
print ("Numpy time = "+ str(numpy_time))
Traditional time = 2.0872244834899902
Naive numpy time = 5.860562324523926
Numpy time = 0.0379793643951416
In Numpy data is organized into arrays. There are many different ways to create a numpy array.
For the following we will use the random library of Numpy: http://docs.scipy.org/doc/numpy-1.10.0/reference/routines.random.html
Creating arrays from lists
#1-dimensional arrays
x = np.array([2,5,18,14,4])
print ("\n Deterministic 1-dimensional array \n")
print (x)
#2-dimensional arrays
x = np.array([[2,5,18,14,4], [12,15,1,2,8]])
print ("\n Deterministic 2-dimensional array \n")
print (x)
Deterministic 1-dimensional array

[ 2  5 18 14  4]

Deterministic 2-dimensional array

[[ 2  5 18 14  4]
 [12 15  1  2  8]]
We can also create Numpy arrays from Pandas DataFrames
d = {'A':[1., 2., 3., 4.],
     'B':[4., 3., 2., 1.]}
df = pd.DataFrame(d)
x = np.array(df)
print(x)
[[1. 4.]
 [2. 3.]
 [3. 2.]
 [4. 1.]]
Creating random arrays
#1-dimensional arrays
x = np.random.rand(5)
print ("\n Random 1-dimensional array \n")
print (x)
#2-dimensional arrays
x = np.random.rand(5,5)
print ("\n Random 5x5 2-dimensional array \n")
print (x)
x = np.random.randint(10,size=(2,3))
print("\n Random 2x3 array with integers")
print(x)
Random 1-dimensional array

[0.3045817  0.68593352 0.29571302 0.26202982 0.97401971]

Random 5x5 2-dimensional array

[[0.97592057 0.44797147 0.74637882 0.21933798 0.50116858]
 [0.0401491  0.1708651  0.24359695 0.96919036 0.18232783]
 [0.94211783 0.49514494 0.54596413 0.05031611 0.12973151]
 [0.75306615 0.0302772  0.42787459 0.03703721 0.86811408]
 [0.12405954 0.71376471 0.15755744 0.0803535  0.84714838]]

Random 2x3 array with integers
[[9 3 1]
 [4 4 2]]
Transpose and get array dimensions
print("\n Matrix Dimensions \n")
print(x.shape)
print ("\n Transpose of the matrix \n")
print (x.T)
print (x.T.shape)
Matrix Dimensions

(2, 3)

Transpose of the matrix

[[9 4]
 [3 4]
 [1 2]]
(3, 2)
Special Arrays
x = np.zeros((4,4))
print ("\n 4x4 array with zeros \n")
print(x)
x = np.ones((4,4))
print ("\n 4x4 array with ones \n")
print (x)
x = np.eye(4)
print ("\n Identity matrix of size 4\n")
print(x)
x = np.diag([1,2,3])
print ("\n Diagonal matrix\n")
print(x)
4x4 array with zeros

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

4x4 array with ones

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]

Identity matrix of size 4

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]

Diagonal matrix

[[1 0 0]
 [0 2 0]
 [0 0 3]]
Diagonal matrices are useful because we can use them to multiply the rows or columns of a matrix by different values. For example, below we multiply the first row of matrix A by 2 and the second row by 3.
A = np.random.randint(10,size=(2,3))
A
array([[2, 7, 9],
       [0, 9, 5]])
v = np.array([2,3])
D = np.diag(v)
print(D@A)
[[ 4 14 18]
 [ 0 27 15]]
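Similarly, multiplying a matrix on the right by a diagonal matrix scales its columns. A minimal sketch (the scaling vector w below is illustrative, with one entry per column of A):
w = np.array([1,10,100]) # one scaling factor per column of A
print(A@np.diag(w)) # column j of A is multiplied by w[j]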
Aggregate operations: we can compute means, standard deviations, medians, sums, and products over the whole array or along one dimension. These are very similar to what we did with Pandas.
x = np.random.randint(10, size = (2,4))
print (x)
print('\n mean value of all elements')
print (np.mean(x))
print('\n vector of mean values for columns')
print (np.mean(x,0)) #0 signifies the dimension meaning columns
print('\n vector of mean values for rows')
print (np.mean(x,1)) #1 signifies the dimension meaning rows
[[8 6 4 8]
 [7 8 5 9]]

 mean value of all elements
6.875

 vector of mean values for columns
[7.5 7.  4.5 8.5]

 vector of mean values for rows
[6.5  7.25]
print('\n standard deviation of all elements')
print (np.std(x))
print('\n vector of std values for rows')
print (np.std(x,1)) #1 signifies the dimension meaning rows
print('\n median value of all elements')
print (np.median(x))
print('\n vector of median values for rows')
print (np.median(x,1))
print('\n sum of all elements')
print (np.sum(x))
print('\n vector of column sums')
print (np.sum(x,0))
print('\n product of all elements')
print (np.prod(x))
print('\n vector of row products')
print (np.prod(x,1))
 standard deviation of all elements
1.6153559979150107

 vector of std values for rows
[1.6583124  1.47901995]

 median value of all elements
7.5

 vector of median values for rows
[7.  7.5]

 sum of all elements
55

 vector of column sums
[15 14  9 17]

 product of all elements
3870720

 vector of row products
[1536 2520]
x = np.random.rand(4,3)
print(x)
print("\n element\n")
print(x[1,2])
print("\n row zero \n")
print(x[0,:])
print("\n column 2 \n")
print(x[:,2])
print("\n submatrix \n")
print(x[1:3,0:2])
print("\n entries > 0.5 \n")
print(x[x>0.5])
[[0.98876655 0.12454666 0.91366392]
 [0.01737282 0.99059361 0.99336315]
 [0.80761691 0.31713682 0.08819911]
 [0.1034435  0.76720037 0.74667172]]

 element

0.9933631497328915

 row zero

[0.98876655 0.12454666 0.91366392]

 column 2

[0.91366392 0.99336315 0.08819911 0.74667172]

 submatrix

[[0.01737282 0.99059361]
 [0.80761691 0.31713682]]

 entries > 0.5

[0.98876655 0.91366392 0.99059361 0.99336315 0.80761691 0.76720037 0.74667172]
x = np.random.rand(4,3)
print(x)
x[1,2] = -5 #change an entry
x[0:2,:] += 1 #change a set of rows: add 1 to all the elements of the first two rows
x[2:4,1:3] = 0.5 #change a block
print(x)
print('\n Set entries > 0.5 to zero')
x[x>0.5] = 0
print(x)
[[0.73651874 0.88698779 0.03882106]
 [0.36525677 0.3854799  0.79835146]
 [0.70155822 0.50776678 0.38097916]
 [0.49055785 0.89628018 0.1100143 ]]
[[ 1.73651874  1.88698779  1.03882106]
 [ 1.36525677  1.3854799  -4.        ]
 [ 0.70155822  0.5         0.5       ]
 [ 0.49055785  0.5         0.5       ]]

 Set entries > 0.5 to zero
[[ 0.          0.          0.        ]
 [ 0.          0.         -4.        ]
 [ 0.          0.5         0.5       ]
 [ 0.49055785  0.5         0.5       ]]
x = np.random.rand(4,4)
print(x)
print('\n Read Diagonal \n')
print(x.diagonal())
print('\n Fill Diagonal with 1s \n')
np.fill_diagonal(x,1)
print(x)
print('\n Fill Diagonal with vector \n')
x[np.diag_indices_from(x)] = [1,2,3,4]
print(x)
[[0.08149231 0.38845034 0.00664335 0.17188248]
 [0.00422611 0.98275776 0.51606564 0.87474304]
 [0.3424854  0.80202498 0.36604999 0.51433298]
 [0.92224115 0.88739759 0.63106565 0.37334718]]

 Read Diagonal

[0.08149231 0.98275776 0.36604999 0.37334718]

 Fill Diagonal with 1s

[[1.         0.38845034 0.00664335 0.17188248]
 [0.00422611 1.         0.51606564 0.87474304]
 [0.3424854  0.80202498 1.         0.51433298]
 [0.92224115 0.88739759 0.63106565 1.        ]]

 Fill Diagonal with vector

[[1.         0.38845034 0.00664335 0.17188248]
 [0.00422611 2.         0.51606564 0.87474304]
 [0.3424854  0.80202498 3.         0.51433298]
 [0.92224115 0.88739759 0.63106565 4.        ]]
We want to create a dataset of 10 users and 5 items, where each user i selects each item j with probability 0.3.
How can we do this with matrix operations?
D = np.random.rand(10,5)
print(D)
D[D>=0.7] = 1
D[D< 0.7] = 0
#D[D <= 0.3] = 1
#D[D != 1] = 0
D
[[0.02897289 0.01862057 0.7411373  0.04384066 0.57667308]
 [0.74959852 0.53840493 0.92343734 0.74636006 0.5222953 ]
 [0.91362662 0.98275774 0.83496817 0.31708787 0.68401841]
 [0.1483231  0.42414062 0.56359105 0.9249087  0.7475312 ]
 [0.41435385 0.63869731 0.53540715 0.63295463 0.79776993]
 [0.88480976 0.11822337 0.11292914 0.05690713 0.71825044]
 [0.16490754 0.34375931 0.97686887 0.30762994 0.51454228]
 [0.63207275 0.27510211 0.14973195 0.35500921 0.56096891]
 [0.9860109  0.68342203 0.45799759 0.4002986  0.67623526]
 [0.19252044 0.46201291 0.57859879 0.82318672 0.04562726]]
array([[0., 0., 1., 0., 0.],
       [1., 0., 1., 1., 0.],
       [1., 1., 1., 0., 0.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])
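The same kind of matrix can also be generated in a single step, by comparing uniform random numbers directly against the selection probability. A minimal sketch:
D2 = (np.random.rand(10,5) < 0.3).astype(float) # each entry is 1 with probability 0.3
print(D2)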
x = np.random.rand(4,3)
print(x)
#multiplication and addition with scalar value
print("\n Matrix 2x+1 \n")
print(2*x+1)
[[0.17332937 0.12585772 0.05650331]
 [0.0450051  0.6894536  0.10691141]
 [0.78271553 0.22487894 0.25747072]
 [0.39224177 0.12789578 0.6429628 ]]

 Matrix 2x+1

[[1.34665873 1.25171545 1.11300663]
 [1.09001021 2.3789072  1.21382283]
 [2.56543107 1.44975789 1.51494145]
 [1.78448355 1.25579156 2.28592559]]
Vector-vector dot product
There are three equivalent ways to compute the dot product of two vectors:
y = np.array([2,-1,3])
z = np.array([-1,2,2])
print('\n y:',y)
print(' z:',z)
print('\n vector-vector dot product')
print(y.dot(z))
print(np.dot(y,z))
print(y@z)
 y: [ 2 -1  3]
 z: [-1  2  2]

 vector-vector dot product
2
2
2
Outer product
The outer product of two vectors x and y of sizes n and m is the n x m matrix M with entries M(i,j) = x(i)*y(j)
print('\n y:',y)
print(' z:',z)
print('\n vector-vector outer product')
print(np.outer(y,z))
 y: [ 2 -1  3]
 z: [-1  2  2]

 vector-vector outer product
[[-2  4  4]
 [ 1 -2 -2]
 [-3  6  6]]
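The outer product can also be written with broadcasting, which gives the same result as np.outer:
print(y[:,None] * z[None,:]) # a (3,1) column times a (1,3) row broadcasts to a (3,3) matrix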
Element-wise operations
print('\n y:',y)
print(' z:',z)
print('\n element-wise addition')
print(y+z)
print('\n element-wise product')
print(y*z)
print('\n element-wise division')
print(y/z)
 y: [ 2 -1  3]
 z: [-1  2  2]

 element-wise addition
[1 1 5]

 element-wise product
[-2 -2  6]

 element-wise division
[-2.  -0.5  1.5]
Matrix-Vector multiplication
Again we can do the multiplication either using the dot method or the '@' operator
X = np.random.randint(10, size = (4,3))
print('Matrix X:\n',X)
y = np.array([1,0,0])
print("\n Matrix-vector right multiplication with",y,"\n")
print(X.dot(y))
print(np.dot(X,y))
print(X@y)
y = np.array([1,0,1,0])
print("\n Matrix-vector left multiplication with",y,"\n")
print(y.dot(X),'\n')
print(np.dot(y,X),'\n')
print(y@X,'\n')
print(y.shape)
Matrix X:
 [[2 8 9]
 [2 6 9]
 [3 3 7]
 [9 7 2]]

 Matrix-vector right multiplication with [1 0 0]

[2 2 3 9]
[2 2 3 9]
[2 2 3 9]

 Matrix-vector left multiplication with [1 0 1 0]

[ 5 11 16]
[ 5 11 16]
[ 5 11 16]
(4,)
Matrix-Matrix multiplication
The same holds for matrix-matrix multiplication
Y = np.random.randint(10, size=(3,2))
print("\n Matrix-matrix multiplication\n")
print('Matrix X:\n',X)
print('Matrix Y:\n',Y)
print('Product:\n',X.dot(Y))
print('Product:\n',X@Y)
 Matrix-matrix multiplication

Matrix X:
 [[2 8 9]
 [2 6 9]
 [3 3 7]
 [9 7 2]]
Matrix Y:
 [[9 9]
 [4 2]
 [6 4]]
Product:
 [[104  70]
 [ 96  66]
 [ 81  61]
 [121 103]]
Product:
 [[104  70]
 [ 96  66]
 [ 81  61]
 [121 103]]
Matrix-Matrix element-wise operations
Z = np.random.randint(10, size=(3,2))+1
print('Matrix Y:\n',Y)
print('Matrix Z:\n',Z)
print("\n Matrix-matrix element-wise addition\n")
print(Y+Z)
print("\n Matrix-matrix element-wise multiplication\n")
print(Y*Z)
print("\n Matrix-matrix element-wise division\n")
print(Y/Z)
Matrix Y:
 [[4 9]
 [0 3]
 [2 7]]
Matrix Z:
 [[10  6]
 [10  6]
 [ 3  4]]

 Matrix-matrix element-wise addition

[[14 15]
 [10  9]
 [ 5 11]]

 Matrix-matrix element-wise multiplication

[[40 54]
 [ 0 18]
 [ 6 28]]

 Matrix-matrix element-wise division

[[0.4        1.5       ]
 [0.         0.5       ]
 [0.66666667 1.75      ]]
For sparse arrays we need to use the scipy.sparse library from SciPy (imported above as sp_sparse): http://docs.scipy.org/doc/scipy/reference/sparse.html
There are several sparse matrix formats; the most important are:
The csr (compressed sparse row) and csc (compressed sparse column) formats are fast for arithmetic operations, but slow for slicing and incremental changes.
The lil (list of lists) format is fast for slicing and incremental construction, but slow for arithmetic operations.
The coo (coordinate) format does not support arithmetic operations and slicing, but it is very fast for constructing a matrix incrementally. You should then transform it to some other format for operations, as in the sketch below.
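For example, a minimal sketch of the coo workflow (with illustrative triplets): construct the matrix from coordinates, then convert it to csr before doing arithmetic.
rows = [0, 1, 2]
cols = [1, 2, 0]
vals = [4.0, 5.0, 7.0]
C = sp_sparse.coo_matrix((vals,(rows,cols)), shape=(3,3)) # fast incremental construction
C = C.tocsr() # convert: csr supports fast arithmetic
print((C+C).toarray())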
Creation of matrix from triplets
Triplets are of the form (row, column, value)
import scipy.sparse as sp_sparse
d = np.array([[0, 0, 12],
              [0, 1, 1],
              [0, 5, 34],
              [1, 3, 12],
              [1, 2, 6],
              [2, 0, 23],
              [3, 4, 14],
              ])
row = d[:,0]
col = d[:,1]
data = d[:,2]
# a matrix M with M[row[i],col[i]] = data[i] will be created
M = sp_sparse.csr_matrix((data,(row,col)), shape=(5,6))
print(M)
print(M.toarray()) #transforms back to full matrix
  (0, 0)	12
  (0, 1)	1
  (0, 5)	34
  (1, 2)	6
  (1, 3)	12
  (2, 0)	23
  (3, 4)	14
[[12  1  0  0  0 34]
 [ 0  0  6 12  0  0]
 [23  0  0  0  0  0]
 [ 0  0  0  0 14  0]
 [ 0  0  0  0  0  0]]
The method toarray() creates a dense matrix. If you need a sparse matrix you should not use it.
Making a full matrix sparse
x = np.random.randint(2,size = (3,4))
print(x)
print('\n make x sparse')
A = sp_sparse.csr_matrix(x)
print(A)
[[0 0 1 1]
 [0 1 1 0]
 [0 1 1 1]]

 make x sparse
  (0, 2)	1
  (0, 3)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	1
Creating a sparse matrix incrementally
# Use lil (list of lists) representation, you can also use coo (coordinates)
A = sp_sparse.lil_matrix((10, 10))
A[0, :5] = np.random.randint(10,size = 5)
A[1, 5:10] = A[0, :5]
A.setdiag(np.random.randint(10,size = 10))
A[9,9] = 99
A[9,0]=1
print(A)
print(A.toarray())
print(A.diagonal())
A = A.tocsr() # convert to compressed sparse row (csr) format: better for dot products
B = A.dot(np.ones(10))
print(B)
  (0, 0)	7.0
  (0, 1)	5.0
  (0, 3)	5.0
  (0, 4)	1.0
  (1, 1)	3.0
  (1, 5)	9.0
  (1, 6)	5.0
  (1, 8)	5.0
  (1, 9)	1.0
  (2, 2)	7.0
  (3, 3)	5.0
  (4, 4)	1.0
  (6, 6)	9.0
  (7, 7)	2.0
  (8, 8)	8.0
  (9, 0)	1.0
  (9, 9)	99.0
[[ 7.  5.  0.  5.  1.  0.  0.  0.  0.  0.]
 [ 0.  3.  0.  0.  0.  9.  5.  0.  5.  1.]
 [ 0.  0.  7.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  5.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  9.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  2.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  8.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0. 99.]]
[ 7.  3.  7.  5.  1.  0.  9.  2.  8. 99.]
[ 18.  23.   7.   5.   1.   0.   9.   2.   8. 100.]
All operations work like before
print(A.dot(A.T))
  (0, 4)	1.0
  (0, 3)	25.0
  (0, 1)	15.0
  (0, 9)	7.0
  (0, 0)	100.0
  (1, 9)	99.0
  (1, 8)	40.0
  (1, 6)	45.0
  (1, 1)	141.0
  (1, 0)	15.0
  (2, 2)	49.0
  (3, 3)	25.0
  (3, 0)	25.0
  (4, 4)	1.0
  (4, 0)	1.0
  (6, 6)	81.0
  (6, 1)	45.0
  (7, 7)	4.0
  (8, 8)	64.0
  (8, 1)	40.0
  (9, 1)	99.0
  (9, 9)	9802.0
  (9, 0)	7.0
A[0].mean()
1.8000000000000003
For the singular value decomposition (SVD) we will use libraries from Numpy, SciPy, and SciKit Learn
We use sklearn to create a low-rank matrix (https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_low_rank_matrix.html). We will create a matrix with effective rank 2.
import sklearn.datasets as sk_data
data = sk_data.make_low_rank_matrix(n_samples=100, n_features=50, effective_rank=2, tail_strength=0.0, random_state=None)
#sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)
We will use the numpy.linalg.svd function to compute the Singular Value Decomposition of the matrix we created (http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html).
U, s, V = np.linalg.svd(data,full_matrices = False)
print (U.shape, s.shape, V.shape)
print(s)
plt.plot(s[0:10])
plt.ylabel('singular value')
plt.xlabel('number of singular values')
(100, 50) (50,) (50, 50)
[1.00000000e+00 7.78800783e-01 3.67879441e-01 1.05399225e-01
 1.83156389e-02 1.93045414e-03 1.23409804e-04 4.78511739e-06
 1.12535175e-07 1.60522805e-09 1.38879351e-11 7.28746716e-14
 2.36633685e-16 1.76907433e-16 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 5.80193185e-17 2.07203841e-17]
We can also use the scipy.sparse.linalg library to compute the SVD of sparse matrices (http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.linalg.svds.html)
We need to specify the number of components k; otherwise the default k = 6 is used. Note that svds returns the singular values in increasing order.
import scipy.sparse.linalg as sp_linalg
data2 = sp_sparse.csc_matrix(data)
print(data2.shape)
U,s,V = sp_linalg.svds(data2, k = 10) #by default returns k=6 singular values
print (U.shape, s.shape, V.shape)
print(s)
plt.plot(s[::-1]) #invert the order of the singular values
plt.ylabel('singular value')
plt.xlabel('number of singular values')
(100, 50)
(100, 10) (10,) (10, 50)
[0.00000000e+00 0.00000000e+00 0.00000000e+00 1.23409804e-04
 1.93045414e-03 1.83156389e-02 1.05399225e-01 3.67879441e-01
 7.78800783e-01 1.00000000e+00]
We can also compute SVD using the library of SciKit Learn (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html)
from sklearn.decomposition import TruncatedSVD
K = 10
svd = TruncatedSVD(n_components=K)
svd.fit(data2)
print(svd.components_.shape) # the V vectors
print(svd.transform(data2).shape) # the U vectors
print(svd.singular_values_)
(10, 50)
(100, 10)
[1.00000000e+00 7.78800783e-01 3.67879441e-01 1.05399225e-01
 1.83156389e-02 1.93045414e-03 1.23409804e-04 4.78511739e-06
 1.12535175e-07 1.60522805e-09]
To obtain a rank-k approximation of the matrix, we multiply the first k columns of U, the diagonal matrix with the k largest singular values, and the first k rows of V transpose.
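As a minimal sketch of this formula, using the dense np.linalg.svd (which, unlike svds, returns the singular values in decreasing order; the names Uf, sf, Vf are illustrative):
Uf, sf, Vf = np.linalg.svd(data, full_matrices=False)
k = 2
data_k2 = Uf[:,:k] @ np.diag(sf[:k]) @ Vf[:k,:] # rank-k approximation of the data matrix
print(np.linalg.norm(data - data_k2, ord='fro')) # reconstruction error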
K = 6
U_k,s_k,V_k = sp_linalg.svds(data2, K, which = 'LM')
print (U_k.shape, s_k.shape, V_k.shape)
print(s_k)
plt.plot(s_k[::-1])
plt.ylabel('singular value')
plt.xlabel('number of singular values')
S_k = np.diag(s_k)
(100, 6) (6,) (6, 50)
[0.00193045 0.01831564 0.10539922 0.36787944 0.77880078 1.        ]
reconstruction_error = []
for k in range(K,0,-1): #iterate from end to start
    data_k = U_k[:,k:].dot(S_k[k:,k:]).dot(V_k[k:,:]) #keep the K-k largest singular values: a rank-(K-k) approximation
    error = np.linalg.norm(data_k-data2,ord='fro')
    reconstruction_error.append(error)
    print(error)
data_k = U_k.dot(S_k).dot(V_k)
print(np.linalg.norm(data_k-data2,ord='fro'))
plt.plot(1+np.array(range(6)),reconstruction_error)
plt.ylabel('rank-k reconstruction error')
plt.xlabel('rank')
1.324127691735753
0.8679367165994608
0.3831233278055767
0.10699626662742329
0.018417506182009297
0.001934400698365925
0.00012350259009393555
We will create a block-structured matrix, where the blocks have different "intensity" of values
import numpy as np
M1 = np.random.randint(1,50,(50,20))
M2 = np.random.randint(1,10,(50,20))
M3 = np.random.randint(1,10,(50,20))
M4 = np.random.randint(1,50,(50,20))
T = np.concatenate((M1,M2),axis=1)
B = np.concatenate((M3,M4),axis=1)
M = np.concatenate((T,B),axis = 0)
plt.imshow(M, cmap='hot')
plt.show()
We observe that the row and column sums are strongly correlated with the first left and right singular vectors, respectively
Note: The values of the vectors are negative. We would get the same result if we make them positive.
import scipy.stats as stats
import matplotlib.pyplot as plt
(U,S,V) = np.linalg.svd(M,full_matrices = False)
#print(S)
c = M.sum(0)
r = M.sum(1)
print(stats.pearsonr(r,U[:,0]))
print(stats.pearsonr(c,V[0]))
plt.scatter(r,U[:,0])
plt.figure()
plt.scatter(c,V[0])
(-0.9827648448466841, 1.1567778643026673e-73)
(-0.9657370833058015, 7.281103085996422e-24)
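We can check the sign indeterminacy directly: flipping the signs of a matched pair of left and right singular vectors leaves their rank-1 term unchanged. A quick sketch:
term = S[0]*np.outer(U[:,0],V[0])
term_flipped = S[0]*np.outer(-U[:,0],-V[0]) # flip the sign of both vectors
print(np.allclose(term, term_flipped)) # True: the product is unchanged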
Using the first two singular vectors we can clearly differentiate the two blocks of rows
plt.scatter(U[:,0],U[:,1])
plt.scatter(x = U[:50,0],y = U[:50,1],color='r')
plt.scatter(x = U[50:,0],y = U[50:,1], color = 'b')
We will now use the PCA package from the SciKit Learn (sklearn) library. PCA is the same as SVD but now the matrix is centered: the mean is removed from the columns of the matrix.
You can read more here: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
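To see the connection, a minimal sketch using the matrix M from above: the principal directions that PCA reports are the right singular vectors of the column-centered matrix (up to sign).
M_centered = M - M.mean(axis=0) # remove the mean from each column
_, _, Vt_c = np.linalg.svd(M_centered, full_matrices=False)
print(Vt_c[:2].shape) # these two rows span the same directions as pca.components_ below, up to sign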
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(M)
pca.components_
array([[-0.16094941, -0.13334638, -0.13914704, -0.16821662, -0.1840363 ,
        -0.18137002, -0.18541204, -0.14320527, -0.15259798, -0.149383  ,
        -0.17935769, -0.1520864 , -0.1689485 , -0.15273942, -0.1599209 ,
        -0.13903709, -0.14112612, -0.15519897, -0.15303926, -0.18059461,
         0.1603817 ,  0.18929262,  0.13919234,  0.15163066,  0.15175376,
         0.15654794,  0.17475247,  0.17971444,  0.15313127,  0.15183825,
         0.14859543,  0.16568566,  0.16568236,  0.14203813,  0.1527631 ,
         0.15061923,  0.1417066 ,  0.14426523,  0.14729464,  0.15073459],
       [-0.40134473, -0.05583601,  0.05972713,  0.21742267, -0.12659435,
        -0.20683708,  0.32515444, -0.19554171, -0.07890482,  0.32146998,
         0.17886798,  0.25577635,  0.12682412,  0.00952398, -0.18375292,
        -0.33014606, -0.08378762, -0.31364891, -0.1091007 ,  0.10372033,
         0.02580287, -0.06069759, -0.0037416 , -0.03347731,  0.05605955,
         0.03965687, -0.06553724, -0.05357751, -0.10207093,  0.01573521,
        -0.01510528, -0.1051866 ,  0.03713903, -0.03832595,  0.06626553,
        -0.18725506,  0.01225572,  0.07721781,  0.00267   , -0.02166871]])
plt.scatter(pca.components_[0],pca.components_[1])
Using the transform operation we can map the data directly to the lower-dimensional space
MPCA = pca.transform(M)
print(MPCA.shape)
(100, 2)
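The transform is simply centering followed by projection onto the components; a quick sketch verifying this:
M_manual = (M - M.mean(axis=0)) @ pca.components_.T # center, then project on the components
print(np.allclose(MPCA, M_manual)) # True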
plt.scatter(MPCA[:,0],MPCA[:,1])
We will now experiment with a well-known dataset of data analysis, the iris dataset:
https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target #contains the labels of the data
pca = PCA(n_components=3)
pca.fit(X)
X = pca.transform(X)
pca.explained_variance_
array([4.22824171, 0.24267075, 0.0782095 ])
plt.scatter(X[:,0],X[:,1])
plt.scatter(X[y==0,0],X[y==0,1], color='b')
plt.scatter(X[y==1,0],X[y==1,1], color='r')
plt.scatter(X[y==2,0],X[y==2,1], color='g')
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[y==0,0],X[y==0,1], X[y==0,2], color='b')
ax.scatter(X[y==1,0],X[y==1,1], X[y==1,2], color='r')
ax.scatter(X[y==2,0],X[y==2,1], X[y==2,2], color='g')
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.os.ms-windows.misc', 'sci.space','rec.sport.baseball']
news_data = fetch_20newsgroups(subset='train', categories=categories)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
dtm = vectorizer.fit_transform(news_data.data)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
stemmed_data = [" ".join(SnowballStemmer("english", ignore_stopwords=True).stem(word)
for sent in sent_tokenize(message)
for word in word_tokenize(sent))
for message in news_data.data]
# stemmed_data = news_data.data
dtm = vectorizer.fit_transform(stemmed_data)
terms = vectorizer.get_feature_names_out() # get_feature_names() is deprecated since sklearn 1.0
print(terms)
dtm_dense = dtm.todense()
centered_dtm = dtm_dense - np.mean(dtm_dense, axis=0)
np.sum(centered_dtm,axis=0)[:,:10]
matrix([[ 9.68843061e-16, -5.26054894e-15,  2.83258660e-15,  5.48389459e-16,
          4.95209342e-15,  2.94935517e-15,  1.01882478e-15, -6.34345007e-15,
          1.35613092e-14,  6.73457600e-15]])
u, s, vt = np.linalg.svd(centered_dtm)
plt.xlim([0,50])
plt.plot(range(1,len(s)+1),s)
k=2
vectorsk = np.array(u[:,:k] @ np.diag(s[:k]))
labels = [news_data.target_names[i] for i in news_data.target]
sns.scatterplot(x=vectorsk[:,0], y=vectorsk[:, 1], hue=labels)
import seaborn as sns
k = 5
Xk = u[:,:k] @ np.diag(s[:k])
X_df = pd.DataFrame(Xk)
g = sns.PairGrid(X_df)
g.map(plt.scatter)
terms = vectorizer.get_feature_names_out()
for i in range(6):
    top = np.argsort(vt[i])
    topterms = [terms[top[0,f]] for f in range(12)]
    print (i, topterms)
0 ['space', 'henry', 'nasa', 'alaska', 'toronto', 'gov', 'moon', 'zoo', 'aurora', 'spencer', 'nsmca', 'pat']
1 ['windows', 'space', 'access', 'nasa', 'file', 'dos', 'gov', 'digex', 'files', 'use', 'henry', 'program']
2 ['access', 'nasa', 'digex', 'gov', 'pat', 'jpl', 'space', 'baalke', '___', 'com', 'kelvin', '__']
3 ['access', 'digex', 'pat', 'com', 'prb', 'net', 'express', 'online', 'communications', 'dseg', 'usa', 'ti']
4 ['ax', 'henry', 'toronto', 'zoo', 'spencer', 'zoology', 'jpl', 'baalke', '___', 'com', 'gov', 'kelvin']
5 ['nasa', 'gov', 'henry', 'jpl', 'baalke', 'toronto', '___', 'windows', 'kelvin', '__', 'ca', 'team']