In this tutorial we will look into the Numpy library: http://www.numpy.org/
Numpy is a fundamental library for numerical computation and matrix manipulation. It offers much of the functionality of Matlab, and some of the functionality of Pandas.
We will also use the Scipy library for scientific computation: https://docs.scipy.org/doc/scipy/reference/
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import scipy.sparse as sp_sparse
import scipy.spatial.distance as sp_dist
import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing
import scipy.sparse.linalg as linalg
To demonstrate the benefits of using numpy we will perform the addition of two vectors in three different ways: one using lists, one using numpy arrays with a for loop, and one using numpy's built-in addition operator.
import time

def trad_version():
    t1 = time.time()
    X = range(10000000)
    Y = range(10000000)
    Z = [x+y for x,y in zip(X,Y)]
    return time.time() - t1

def naive_numpy_version():
    t1 = time.time()
    X = np.arange(10000000)
    Y = np.arange(10000000)
    Z = np.zeros(10000000)
    for i in range(10000000):
        Z[i] = X[i]+Y[i]
    return time.time() - t1

def numpy_version():
    t1 = time.time()
    X = np.arange(10000000)
    Y = np.arange(10000000)
    Z = X + Y
    return time.time() - t1
traditional_time = trad_version()
naive_numpy_time = naive_numpy_version()
numpy_time = numpy_version()
print ("Traditional time = "+ str(traditional_time))
print ("Naive numpy time = "+ str(naive_numpy_time))
print ("Numpy time = "+ str(numpy_time))
Traditional time = 2.0872244834899902
Naive numpy time = 5.860562324523926
Numpy time = 0.0379793643951416
In Numpy data is organized into arrays. There are many different ways to create a numpy array.
For the following we will use the random library of Numpy: http://docs.scipy.org/doc/numpy-1.10.0/reference/routines.random.html
Creating arrays from lists
#1-dimensional arrays
x = np.array([2,5,18,14,4])
print ("\n Deterministic 1-dimensional array \n")
print (x)
#2-dimensional arrays
x = np.array([[2,5,18,14,4], [12,15,1,2,8]])
print ("\n Deterministic 2-dimensional array \n")
print (x)
Deterministic 1-dimensional array

[ 2  5 18 14  4]

Deterministic 2-dimensional array

[[ 2  5 18 14  4]
 [12 15  1  2  8]]
We can also create Numpy arrays from Pandas DataFrames
d = {'A':[1., 2., 3., 4.],
     'B':[4., 3., 2., 1.]}
df = pd.DataFrame(d)
x = np.array(df)
print(x)
[[1. 4.]
 [2. 3.]
 [3. 2.]
 [4. 1.]]
Creating random arrays
#1-dimensional arrays
x = np.random.rand(5)
print ("\n Random 1-dimensional array \n")
print (x)
#2-dimensional arrays
x = np.random.rand(5,5)
print ("\n Random 5x5 2-dimensional array \n")
print (x)
x = np.random.randint(10,size=(2,3))
print("\n Random 2x3 array with integers")
print(x)
Random 1-dimensional array

[0.3045817  0.68593352 0.29571302 0.26202982 0.97401971]

Random 5x5 2-dimensional array

[[0.97592057 0.44797147 0.74637882 0.21933798 0.50116858]
 [0.0401491  0.1708651  0.24359695 0.96919036 0.18232783]
 [0.94211783 0.49514494 0.54596413 0.05031611 0.12973151]
 [0.75306615 0.0302772  0.42787459 0.03703721 0.86811408]
 [0.12405954 0.71376471 0.15755744 0.0803535  0.84714838]]

Random 2x3 array with integers
[[9 3 1]
 [4 4 2]]
Transpose and get array dimensions
print("\n Matrix Dimensions \n")
print(x.shape)
print ("\n Transpose of the matrix \n")
print (x.T)
print (x.T.shape)
Matrix Dimensions

(2, 3)

Transpose of the matrix

[[9 4]
 [3 4]
 [1 2]]
(3, 2)
Special Arrays
x = np.zeros((4,4))
print ("\n 4x4 array with zeros \n")
print(x)
x = np.ones((4,4))
print ("\n 4x4 array with ones \n")
print (x)
x = np.eye(4)
print ("\n Identity matrix of size 4\n")
print(x)
x = np.diag([1,2,3])
print ("\n Diagonal matrix\n")
print(x)
4x4 array with zeros

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]

4x4 array with ones

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]

Identity matrix of size 4

[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]

Diagonal matrix

[[1 0 0]
 [0 2 0]
 [0 0 3]]
Diagonal matrices are useful because we can use them to multiply the rows or columns of a matrix by different values. For example, below we multiply the first row of matrix A by 2 and the second row by 3.
A = np.random.randint(10,size=(2,3))
A
array([[2, 7, 9],
       [0, 9, 5]])
v = np.array([2,3])
D = np.diag(v)
print(D@A)
[[ 4 14 18]
 [ 0 27 15]]
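Similarly, multiplying a matrix on the right by a diagonal matrix scales its columns. A minimal sketch (the scaling vector w below is illustrative, with one entry per column of A):
w = np.array([1,10,100]) # one scaling factor per column of A
print(A@np.diag(w)) # column j of A is multiplied by w[j]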
Aggregate operations: we can compute means, standard deviations, medians, sums, and products over the whole array or along one dimension. These are very similar to what we did with Pandas.
x = np.random.randint(10, size = (2,4))
print (x)
print('\n mean value of all elements')
print (np.mean(x))
print('\n vector of mean values for columns')
print (np.mean(x,0)) #0 signifies the dimension meaning columns
print('\n vector of mean values for rows')
print (np.mean(x,1)) #1 signifies the dimension meaning rows
[[8 6 4 8]
 [7 8 5 9]]

 mean value of all elements
6.875

 vector of mean values for columns
[7.5 7.  4.5 8.5]

 vector of mean values for rows
[6.5  7.25]
print('\n standard deviation of all elements')
print (np.std(x))
print('\n vector of std values for rows')
print (np.std(x,1)) #1 signifies the dimension meaning rows
print('\n median value of all elements')
print (np.median(x))
print('\n vector of median values for rows')
print (np.median(x,1))
print('\n sum of all elements')
print (np.sum(x))
print('\n vector of column sums')
print (np.sum(x,0))
print('\n product of all elements')
print (np.prod(x))
print('\n vector of row products')
print (np.prod(x,1))
 standard deviation of all elements
1.6153559979150107

 vector of std values for rows
[1.6583124  1.47901995]

 median value of all elements
7.5

 vector of median values for rows
[7.  7.5]

 sum of all elements
55

 vector of column sums
[15 14  9 17]

 product of all elements
3870720

 vector of row products
[1536 2520]
x = np.random.rand(4,3)
print(x)
print("\n element\n")
print(x[1,2])
print("\n row zero \n")
print(x[0,:])
print("\n column 2 \n")
print(x[:,2])
print("\n submatrix \n")
print(x[1:3,0:2])
print("\n entries > 0.5 \n")
print(x[x>0.5])
[[0.98876655 0.12454666 0.91366392]
 [0.01737282 0.99059361 0.99336315]
 [0.80761691 0.31713682 0.08819911]
 [0.1034435  0.76720037 0.74667172]]

 element

0.9933631497328915

 row zero

[0.98876655 0.12454666 0.91366392]

 column 2

[0.91366392 0.99336315 0.08819911 0.74667172]

 submatrix

[[0.01737282 0.99059361]
 [0.80761691 0.31713682]]

 entries > 0.5

[0.98876655 0.91366392 0.99059361 0.99336315 0.80761691 0.76720037 0.74667172]
x = np.random.rand(4,3)
print(x)
x[1,2] = -5 #change an entry
x[0:2,:] += 1 #change a set of rows: add 1 to all the elements of the first two rows
x[2:4,1:3] = 0.5 #change a block
print(x)
print('\n Set entries > 0.5 to zero')
x[x>0.5] = 0
print(x)
[[0.73651874 0.88698779 0.03882106]
 [0.36525677 0.3854799  0.79835146]
 [0.70155822 0.50776678 0.38097916]
 [0.49055785 0.89628018 0.1100143 ]]
[[ 1.73651874  1.88698779  1.03882106]
 [ 1.36525677  1.3854799  -4.        ]
 [ 0.70155822  0.5         0.5       ]
 [ 0.49055785  0.5         0.5       ]]

 Set entries > 0.5 to zero
[[ 0.          0.          0.        ]
 [ 0.          0.         -4.        ]
 [ 0.          0.5         0.5       ]
 [ 0.49055785  0.5         0.5       ]]
x = np.random.rand(4,4)
print(x)
print('\n Read Diagonal \n')
print(x.diagonal())
print('\n Fill Diagonal with 1s \n')
np.fill_diagonal(x,1)
print(x)
print('\n Fill Diagonal with vector \n')
x[np.diag_indices_from(x)] = [1,2,3,4]
print(x)
[[0.08149231 0.38845034 0.00664335 0.17188248]
 [0.00422611 0.98275776 0.51606564 0.87474304]
 [0.3424854  0.80202498 0.36604999 0.51433298]
 [0.92224115 0.88739759 0.63106565 0.37334718]]

 Read Diagonal

[0.08149231 0.98275776 0.36604999 0.37334718]

 Fill Diagonal with 1s

[[1.         0.38845034 0.00664335 0.17188248]
 [0.00422611 1.         0.51606564 0.87474304]
 [0.3424854  0.80202498 1.         0.51433298]
 [0.92224115 0.88739759 0.63106565 1.        ]]

 Fill Diagonal with vector

[[1.         0.38845034 0.00664335 0.17188248]
 [0.00422611 2.         0.51606564 0.87474304]
 [0.3424854  0.80202498 3.         0.51433298]
 [0.92224115 0.88739759 0.63106565 4.        ]]
We want to create a dataset of 10 users and 5 items, where each user i selects each item j with probability 0.3.
How can we do this with matrix operations?
D = np.random.rand(10,5)
print(D)
D[D>=0.7] = 1
D[D< 0.7] = 0
#D[D <= 0.3] = 1
#D[D != 1] = 0
D
[[0.02897289 0.01862057 0.7411373  0.04384066 0.57667308]
 [0.74959852 0.53840493 0.92343734 0.74636006 0.5222953 ]
 [0.91362662 0.98275774 0.83496817 0.31708787 0.68401841]
 [0.1483231  0.42414062 0.56359105 0.9249087  0.7475312 ]
 [0.41435385 0.63869731 0.53540715 0.63295463 0.79776993]
 [0.88480976 0.11822337 0.11292914 0.05690713 0.71825044]
 [0.16490754 0.34375931 0.97686887 0.30762994 0.51454228]
 [0.63207275 0.27510211 0.14973195 0.35500921 0.56096891]
 [0.9860109  0.68342203 0.45799759 0.4002986  0.67623526]
 [0.19252044 0.46201291 0.57859879 0.82318672 0.04562726]]
array([[0., 0., 1., 0., 0.],
       [1., 0., 1., 1., 0.],
       [1., 1., 1., 0., 0.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])
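The same kind of matrix can also be generated in a single step, by comparing uniform random numbers directly against the selection probability. A minimal sketch:
D2 = (np.random.rand(10,5) < 0.3).astype(float) # each entry is 1 with probability 0.3
print(D2)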
x = np.random.rand(4,3)
print(x)
#multiplication and addition with scalar value
print("\n Matrix 2x+1 \n")
print(2*x+1)
[[0.17332937 0.12585772 0.05650331]
 [0.0450051  0.6894536  0.10691141]
 [0.78271553 0.22487894 0.25747072]
 [0.39224177 0.12789578 0.6429628 ]]

 Matrix 2x+1

[[1.34665873 1.25171545 1.11300663]
 [1.09001021 2.3789072  1.21382283]
 [2.56543107 1.44975789 1.51494145]
 [1.78448355 1.25579156 2.28592559]]
Vector-vector dot product
There are three equivalent ways to compute the dot product of two vectors:
y = np.array([2,-1,3])
z = np.array([-1,2,2])
print('\n y:',y)
print(' z:',z)
print('\n vector-vector dot product')
print(y.dot(z))
print(np.dot(y,z))
print(y@z)
 y: [ 2 -1  3]
 z: [-1  2  2]

 vector-vector dot product
2
2
2
Outer product
The outer product of two vectors x and y of sizes n and m is the n x m matrix M with entries M(i,j) = x(i)*y(j)
print('\n y:',y)
print(' z:',z)
print('\n vector-vector outer product')
print(np.outer(y,z))
 y: [ 2 -1  3]
 z: [-1  2  2]

 vector-vector outer product
[[-2  4  4]
 [ 1 -2 -2]
 [-3  6  6]]
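The outer product can also be written with broadcasting, which gives the same result as np.outer:
print(y[:,None] * z[None,:]) # a (3,1) column times a (1,3) row broadcasts to a (3,3) matrix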
Element-wise operations
print('\n y:',y)
print(' z:',z)
print('\n element-wise addition')
print(y+z)
print('\n element-wise product')
print(y*z)
print('\n element-wise division')
print(y/z)
 y: [ 2 -1  3]
 z: [-1  2  2]

 element-wise addition
[1 1 5]

 element-wise product
[-2 -2  6]

 element-wise division
[-2.  -0.5  1.5]
Matrix-Vector multiplication
Again we can do the multiplication either using the dot method or the '@' operator
X = np.random.randint(10, size = (4,3))
print('Matrix X:\n',X)
y = np.array([1,0,0])
print("\n Matrix-vector right multiplication with",y,"\n")
print(X.dot(y))
print(np.dot(X,y))
print(X@y)
y = np.array([1,0,1,0])
print("\n Matrix-vector left multiplication with",y,"\n")
print(y.dot(X),'\n')
print(np.dot(y,X),'\n')
print(y@X,'\n')
print(y.shape)
Matrix X:
 [[2 8 9]
 [2 6 9]
 [3 3 7]
 [9 7 2]]

 Matrix-vector right multiplication with [1 0 0]

[2 2 3 9]
[2 2 3 9]
[2 2 3 9]

 Matrix-vector left multiplication with [1 0 1 0]

[ 5 11 16]
[ 5 11 16]
[ 5 11 16]
(4,)
Matrix-Matrix multiplication
The same holds for matrix-matrix multiplication
Y = np.random.randint(10, size=(3,2))
print("\n Matrix-matrix multiplication\n")
print('Matrix X:\n',X)
print('Matrix Y:\n',Y)
print('Product:\n',X.dot(Y))
print('Product:\n',X@Y)
 Matrix-matrix multiplication

Matrix X:
 [[2 8 9]
 [2 6 9]
 [3 3 7]
 [9 7 2]]
Matrix Y:
 [[9 9]
 [4 2]
 [6 4]]
Product:
 [[104  70]
 [ 96  66]
 [ 81  61]
 [121 103]]
Product:
 [[104  70]
 [ 96  66]
 [ 81  61]
 [121 103]]
Matrix-Matrix element-wise operations
Z = np.random.randint(10, size=(3,2))+1
print('Matrix Y:\n',Y)
print('Matrix Z:\n',Z)
print("\n Matrix-matrix element-wise addition\n")
print(Y+Z)
print("\n Matrix-matrix element-wise multiplication\n")
print(Y*Z)
print("\n Matrix-matrix element-wise division\n")
print(Y/Z)
Matrix Y:
 [[4 9]
 [0 3]
 [2 7]]
Matrix Z:
 [[10  6]
 [10  6]
 [ 3  4]]

 Matrix-matrix element-wise addition

[[14 15]
 [10  9]
 [ 5 11]]

 Matrix-matrix element-wise multiplication

[[40 54]
 [ 0 18]
 [ 6 28]]

 Matrix-matrix element-wise division

[[0.4        1.5       ]
 [0.         0.5       ]
 [0.66666667 1.75      ]]
For sparse arrays we need to use the scipy.sparse library from SciPy (imported above as sp_sparse): http://docs.scipy.org/doc/scipy/reference/sparse.html
There are several sparse matrix formats; the most important are:
The csr (compressed sparse row) and csc (compressed sparse column) formats are fast for arithmetic operations, but slow for slicing and incremental changes.
The lil (list of lists) format is fast for slicing and incremental construction, but slow for arithmetic operations.
The coo (coordinate) format does not support arithmetic operations and slicing, but it is very fast for constructing a matrix incrementally. You should then transform it to some other format for operations, as in the sketch below.
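For example, a minimal sketch of the coo workflow (with illustrative triplets): construct the matrix from coordinates, then convert it to csr before doing arithmetic.
rows = [0, 1, 2]
cols = [1, 2, 0]
vals = [4.0, 5.0, 7.0]
C = sp_sparse.coo_matrix((vals,(rows,cols)), shape=(3,3)) # fast incremental construction
C = C.tocsr() # convert: csr supports fast arithmetic
print((C+C).toarray())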
Creation of matrix from triplets
Triplets are of the form (row, column, value)
import scipy.sparse as sp_sparse
d = np.array([[0, 0, 12],
              [0, 1, 1],
              [0, 5, 34],
              [1, 3, 12],
              [1, 2, 6],
              [2, 0, 23],
              [3, 4, 14],
              ])
row = d[:,0]
col = d[:,1]
data = d[:,2]
# a matrix M with M[row[i],col[i]] = data[i] will be created
M = sp_sparse.csr_matrix((data,(row,col)), shape=(5,6))
print(M)
print(M.toarray()) #transforms back to full matrix
  (0, 0)	12
  (0, 1)	1
  (0, 5)	34
  (1, 2)	6
  (1, 3)	12
  (2, 0)	23
  (3, 4)	14
[[12  1  0  0  0 34]
 [ 0  0  6 12  0  0]
 [23  0  0  0  0  0]
 [ 0  0  0  0 14  0]
 [ 0  0  0  0  0  0]]
The method toarray() creates a dense matrix. If you need a sparse matrix you should not use it.
Making a full matrix sparse
x = np.random.randint(2,size = (3,4))
print(x)
print('\n make x sparse')
A = sp_sparse.csr_matrix(x)
print(A)
[[0 0 1 1]
 [0 1 1 0]
 [0 1 1 1]]

 make x sparse
  (0, 2)	1
  (0, 3)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	1
Creating a sparse matrix incrementally
# Use lil (list of lists) representation, you can also use coo (coordinates)
A = sp_sparse.lil_matrix((10, 10))
A[0, :5] = np.random.randint(10,size = 5)
A[1, 5:10] = A[0, :5]
A.setdiag(np.random.randint(10,size = 10))
A[9,9] = 99
A[9,0]=1
print(A)
print(A.toarray())
print(A.diagonal())
A = A.tocsr() # convert to compressed sparse row (csr) format: better for dot products
B = A.dot(np.ones(10))
print(B)
  (0, 0)	7.0
  (0, 1)	5.0
  (0, 3)	5.0
  (0, 4)	1.0
  (1, 1)	3.0
  (1, 5)	9.0
  (1, 6)	5.0
  (1, 8)	5.0
  (1, 9)	1.0
  (2, 2)	7.0
  (3, 3)	5.0
  (4, 4)	1.0
  (6, 6)	9.0
  (7, 7)	2.0
  (8, 8)	8.0
  (9, 0)	1.0
  (9, 9)	99.0
[[ 7.  5.  0.  5.  1.  0.  0.  0.  0.  0.]
 [ 0.  3.  0.  0.  0.  9.  5.  0.  5.  1.]
 [ 0.  0.  7.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  5.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  9.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  2.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  8.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0. 99.]]
[ 7.  3.  7.  5.  1.  0.  9.  2.  8. 99.]
[ 18.  23.   7.   5.   1.   0.   9.   2.   8. 100.]
All operations work like before
print(A.dot(A.T))
  (0, 4)	1.0
  (0, 3)	25.0
  (0, 1)	15.0
  (0, 9)	7.0
  (0, 0)	100.0
  (1, 9)	99.0
  (1, 8)	40.0
  (1, 6)	45.0
  (1, 1)	141.0
  (1, 0)	15.0
  (2, 2)	49.0
  (3, 3)	25.0
  (3, 0)	25.0
  (4, 4)	1.0
  (4, 0)	1.0
  (6, 6)	81.0
  (6, 1)	45.0
  (7, 7)	4.0
  (8, 8)	64.0
  (8, 1)	40.0
  (9, 1)	99.0
  (9, 9)	9802.0
  (9, 0)	7.0
A[0].mean()
1.8000000000000003
For the singular value decomposition (SVD) we will use libraries from Numpy, SciPy, and SciKit Learn
We use sklearn to create a low-rank matrix (https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_low_rank_matrix.html). We will create a matrix with effective rank 2.
import sklearn.datasets as sk_data
data = sk_data.make_low_rank_matrix(n_samples=100, n_features=50, effective_rank=2, tail_strength=0.0, random_state=None)
#sns.heatmap(data, xticklabels=False, yticklabels=False, linewidths=0)
We will use the numpy.linalg.svd function to compute the Singular Value Decomposition of the matrix we created (http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html).
U, s, V = np.linalg.svd(data,full_matrices = False)
print (U.shape, s.shape, V.shape)
print(s)
plt.plot(s[0:10])
plt.ylabel('singular value')
plt.xlabel('number of singular values')
(100, 50) (50,) (50, 50)
[1.00000000e+00 7.78800783e-01 3.67879441e-01 1.05399225e-01
 1.83156389e-02 1.93045414e-03 1.23409804e-04 4.78511739e-06
 1.12535175e-07 1.60522805e-09 1.38879351e-11 7.28746716e-14
 2.36633685e-16 1.76907433e-16 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 9.38603842e-17 9.38603842e-17 9.38603842e-17 9.38603842e-17
 5.80193185e-17 2.07203841e-17]
We can also use the scipy.sparse.linalg library to compute the SVD of sparse matrices (http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.linalg.svds.html)
We need to specify the number of components k; otherwise the default k = 6 is used. Note that svds returns the singular values in increasing order.
import scipy.sparse.linalg as sp_linalg
data2 = sp_sparse.csc_matrix(data)
print(data2.shape)
U,s,V = sp_linalg.svds(data2, k = 10) #by default returns k=6 singular values
print (U.shape, s.shape, V.shape)
print(s)
plt.plot(s[::-1]) #invert the order of the singular values
plt.ylabel('singular value')
plt.xlabel('number of singular values')
(100, 50)
(100, 10) (10,) (10, 50)
[0.00000000e+00 0.00000000e+00 0.00000000e+00 1.23409804e-04
 1.93045414e-03 1.83156389e-02 1.05399225e-01 3.67879441e-01
 7.78800783e-01 1.00000000e+00]
We can also compute SVD using the library of SciKit Learn (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html)
from sklearn.decomposition import TruncatedSVD
K = 10
svd = TruncatedSVD(n_components=K)
svd.fit(data2)
print(svd.components_.shape) # the V vectors
print(svd.transform(data2).shape) # the U vectors
print(svd.singular_values_)
(10, 50)
(100, 10)
[1.00000000e+00 7.78800783e-01 3.67879441e-01 1.05399225e-01
 1.83156389e-02 1.93045414e-03 1.23409804e-04 4.78511739e-06
 1.12535175e-07 1.60522805e-09]
To obtain a rank-k approximation of the matrix, we multiply the first k columns of U, the diagonal matrix with the k largest singular values, and the first k rows of V transpose.
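As a minimal sketch of this formula, using the dense np.linalg.svd (which, unlike svds, returns the singular values in decreasing order; the names Uf, sf, Vf are illustrative):
Uf, sf, Vf = np.linalg.svd(data, full_matrices=False)
k = 2
data_k2 = Uf[:,:k] @ np.diag(sf[:k]) @ Vf[:k,:] # rank-k approximation of the data matrix
print(np.linalg.norm(data - data_k2, ord='fro')) # reconstruction error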
K = 6
U_k,s_k,V_k = sp_linalg.svds(data2, K, which = 'LM')
print (U_k.shape, s_k.shape, V_k.shape)
print(s_k)
plt.plot(s_k[::-1])
plt.ylabel('singular value')
plt.xlabel('number of singular values')
S_k = np.diag(s_k)
(100, 6) (6,) (6, 50)
[0.00193045 0.01831564 0.10539922 0.36787944 0.77880078 1.        ]
reconstruction_error = []
for k in range(K,0,-1): #iterate from end to start
    data_k = U_k[:,k:].dot(S_k[k:,k:]).dot(V_k[k:,:]) #keep the K-k largest singular values: a rank-(K-k) approximation
    error = np.linalg.norm(data_k-data2,ord='fro')
    reconstruction_error.append(error)
    print(error)
data_k = U_k.dot(S_k).dot(V_k)
print(np.linalg.norm(data_k-data2,ord='fro'))
plt.plot(1+np.array(range(6)),reconstruction_error)
plt.ylabel('rank-k reconstruction error')
plt.xlabel('rank')
1.324127691735753
0.8679367165994608
0.3831233278055767
0.10699626662742329
0.018417506182009297
0.001934400698365925
0.00012350259009393555
We will create a block-structured matrix, where the blocks have different "intensity" of values
import numpy as np
M1 = np.random.randint(1,50,(50,20))
M2 = np.random.randint(1,10,(50,20))
M3 = np.random.randint(1,10,(50,20))
M4 = np.random.randint(1,50,(50,20))
T = np.concatenate((M1,M2),axis=1)
B = np.concatenate((M3,M4),axis=1)
M = np.concatenate((T,B),axis = 0)
plt.imshow(M, cmap='hot')
plt.show()
We observe that the row and column sums are strongly correlated with the first left and right singular vectors, respectively
Note: The values of the vectors are negative. We would get the same result if we make them positive.
import scipy.stats as stats
import matplotlib.pyplot as plt
(U,S,V) = np.linalg.svd(M,full_matrices = False)
#print(S)
c = M.sum(0)
r = M.sum(1)
print(stats.pearsonr(r,U[:,0]))
print(stats.pearsonr(c,V[0]))
plt.scatter(r,U[:,0])
plt.figure()
plt.scatter(c,V[0])
(-0.9827648448466841, 1.1567778643026673e-73)
(-0.9657370833058015, 7.281103085996422e-24)
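We can check the sign indeterminacy directly: flipping the signs of a matched pair of left and right singular vectors leaves their rank-1 term unchanged. A quick sketch:
term = S[0]*np.outer(U[:,0],V[0])
term_flipped = S[0]*np.outer(-U[:,0],-V[0]) # flip the sign of both vectors
print(np.allclose(term, term_flipped)) # True: the product is unchanged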
Using the first two singular vectors we can clearly differentiate the two blocks of rows
plt.scatter(U[:,0],U[:,1])
plt.scatter(x = U[:50,0],y = U[:50,1],color='r')
plt.scatter(x = U[50:,0],y = U[50:,1], color = 'b')
We will now use the PCA package from the SciKit Learn (sklearn) library. PCA is the same as SVD but now the matrix is centered: the mean is removed from the columns of the matrix.
You can read more here: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
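To see the connection, a minimal sketch using the matrix M from above: the principal directions that PCA reports are the right singular vectors of the column-centered matrix (up to sign).
M_centered = M - M.mean(axis=0) # remove the mean from each column
_, _, Vt_c = np.linalg.svd(M_centered, full_matrices=False)
print(Vt_c[:2].shape) # these two rows span the same directions as pca.components_ below, up to sign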
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(M)
pca.components_
array([[-0.16094941, -0.13334638, -0.13914704, -0.16821662, -0.1840363 ,
        -0.18137002, -0.18541204, -0.14320527, -0.15259798, -0.149383  ,
        -0.17935769, -0.1520864 , -0.1689485 , -0.15273942, -0.1599209 ,
        -0.13903709, -0.14112612, -0.15519897, -0.15303926, -0.18059461,
         0.1603817 ,  0.18929262,  0.13919234,  0.15163066,  0.15175376,
         0.15654794,  0.17475247,  0.17971444,  0.15313127,  0.15183825,
         0.14859543,  0.16568566,  0.16568236,  0.14203813,  0.1527631 ,
         0.15061923,  0.1417066 ,  0.14426523,  0.14729464,  0.15073459],
       [-0.40134473, -0.05583601,  0.05972713,  0.21742267, -0.12659435,
        -0.20683708,  0.32515444, -0.19554171, -0.07890482,  0.32146998,
         0.17886798,  0.25577635,  0.12682412,  0.00952398, -0.18375292,
        -0.33014606, -0.08378762, -0.31364891, -0.1091007 ,  0.10372033,
         0.02580287, -0.06069759, -0.0037416 , -0.03347731,  0.05605955,
         0.03965687, -0.06553724, -0.05357751, -0.10207093,  0.01573521,
        -0.01510528, -0.1051866 ,  0.03713903, -0.03832595,  0.06626553,
        -0.18725506,  0.01225572,  0.07721781,  0.00267   , -0.02166871]])
plt.scatter(pca.components_[0],pca.components_[1])
Using the transform operation we can map the data directly to the lower-dimensional space
MPCA = pca.transform(M)
print(MPCA.shape)
(100, 2)
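The transform is simply centering followed by projection onto the components; a quick sketch verifying this:
M_manual = (M - M.mean(axis=0)) @ pca.components_.T # center, then project on the components
print(np.allclose(MPCA, M_manual)) # True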
plt.scatter(MPCA[:,0],MPCA[:,1])
We will now experiment with a well-known dataset of data analysis, the iris dataset:
https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target #contains the labels of the data
pca = PCA(n_components=3)
pca.fit(X)
X = pca.transform(X)
pca.explained_variance_
array([4.22824171, 0.24267075, 0.0782095 ])
plt.scatter(X[:,0],X[:,1])
plt.scatter(X[y==0,0],X[y==0,1], color='b')
plt.scatter(X[y==1,0],X[y==1,1], color='r')
plt.scatter(X[y==2,0],X[y==2,1], color='g')
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[y==0,0],X[y==0,1], X[y==0,2], color='b')
ax.scatter(X[y==1,0],X[y==1,1], X[y==1,2], color='r')
ax.scatter(X[y==2,0],X[y==2,1], X[y==2,2], color='g')
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.os.ms-windows.misc', 'sci.space','rec.sport.baseball']
news_data = fetch_20newsgroups(subset='train', categories=categories)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=4,max_df=0.8)
dtm = vectorizer.fit_transform(news_data.data)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
stemmed_data = [" ".join(SnowballStemmer("english", ignore_stopwords=True).stem(word)
for sent in sent_tokenize(message)
for word in word_tokenize(sent))
for message in news_data.data]
# stemmed_data = news_data.data
dtm = vectorizer.fit_transform(stemmed_data)
terms = vectorizer.get_feature_names_out() # get_feature_names() is deprecated since sklearn 1.0
print(terms)
dtm_dense = dtm.todense()
centered_dtm = dtm_dense - np.mean(dtm_dense, axis=0)
np.sum(centered_dtm,axis=0)[:,:10]
matrix([[ 9.68843061e-16, -5.26054894e-15,  2.83258660e-15,  5.48389459e-16,
          4.95209342e-15,  2.94935517e-15,  1.01882478e-15, -6.34345007e-15,
          1.35613092e-14,  6.73457600e-15]])
u, s, vt = np.linalg.svd(centered_dtm)
plt.xlim([0,50])
plt.plot(range(1,len(s)+1),s)
k=2
vectorsk = np.array(u[:,:k] @ np.diag(s[:k]))
labels = [news_data.target_names[i] for i in news_data.target]
sns.scatterplot(x=vectorsk[:,0], y=vectorsk[:, 1], hue=labels)
import seaborn as sns
k = 5
Xk = u[:,:k] @ np.diag(s[:k])
X_df = pd.DataFrame(Xk)
g = sns.PairGrid(X_df)
g.map(plt.scatter)
terms = vectorizer.get_feature_names_out()
for i in range(6):
    top = np.argsort(vt[i])
    topterms = [terms[top[0,f]] for f in range(12)]
    print (i, topterms)
0 ['space', 'henry', 'nasa', 'alaska', 'toronto', 'gov', 'moon', 'zoo', 'aurora', 'spencer', 'nsmca', 'pat']
1 ['windows', 'space', 'access', 'nasa', 'file', 'dos', 'gov', 'digex', 'files', 'use', 'henry', 'program']
2 ['access', 'nasa', 'digex', 'gov', 'pat', 'jpl', 'space', 'baalke', '___', 'com', 'kelvin', '__']
3 ['access', 'digex', 'pat', 'com', 'prb', 'net', 'express', 'online', 'communications', 'dseg', 'usa', 'ti']
4 ['ax', 'henry', 'toronto', 'zoo', 'spencer', 'zoology', 'jpl', 'baalke', '___', 'com', 'gov', 'kelvin']
5 ['nasa', 'gov', 'henry', 'jpl', 'baalke', 'toronto', '___', 'windows', 'kelvin', '__', 'ca', 'team']