# Introduction to Classification

The goal of this tutorial is to introduce you to the scikit-learn library for classification. We will also cover feature normalization, feature selection, and evaluation.

In [None]:
import numpy as np
import scipy.sparse as sp_sparse

import matplotlib.pyplot as plt

import sklearn as sk
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn import preprocessing

import seaborn as sns

%matplotlib inline

## Feature Normalization and Selection

scikit-learn provides functionality for normalizing and standardizing data. Be careful, though: some operations work only with dense data.

http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing

In [None]:
# Standardize every column (feature) of X with preprocessing.scale, then
# check the per-column mean and variance of the result.
X = np.array([[ 1., -1.,  2.],
              [ 2.,  0.,  0.],
              [ 0.,  1., -1.]])
X_scaled = preprocessing.scale(X)

# A notebook cell only renders its last bare expression, so each result is
# printed explicitly; otherwise the scaled matrix and the means are never shown.
print(X_scaled)
print(X_scaled.mean(axis=0))  # per-column means
print(X_scaled.var(axis=0))   # per-column variances

In [None]:
# Fit a MinMaxScaler (default settings) on X, then apply it to the same data.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X)
X_minmax = min_max_scaler.transform(X)
X_minmax

In [None]:
# works with sparse data
# Fit a MaxAbsScaler on X, then apply it to the same data.
max_abs_scaler = preprocessing.MaxAbsScaler()
max_abs_scaler.fit(X)
X_maxabs = max_abs_scaler.transform(X)
X_maxabs

In [None]:
# works with sparse data
# Rescale each row (sample) of X to unit L2 norm; axis=1 is the default,
# spelled out here to make the direction explicit.
X_normalized = preprocessing.normalize(X, norm='l2', axis=1)
X_normalized

In [None]:
# One-hot encode every column: the encoder learns the values seen in each
# column of X during fit and maps each of them to its own indicator column.
X = [[0, 1, 2],
     [1, 2, 3],
     [0, 1, 4]]
enc = preprocessing.OneHotEncoder().fit(X)
enc.transform([[0, 2, 4], [1, 1, 2]]).toarray()

In [None]:
# One-hot encode only the third column and pass the first two through as-is.
#
# OneHotEncoder(categorical_features=...) was deprecated in scikit-learn 0.20
# and removed in 0.22, where it raises a TypeError. ColumnTransformer is the
# supported way to apply an encoder to selected columns.
from sklearn.compose import ColumnTransformer

X = [[0, 10, 45100],
     [1, 20, 45221],
     [0, 20, 45212]]
enc = ColumnTransformer(
    [('onehot', preprocessing.OneHotEncoder(), [2])],
    remainder='passthrough',  # keep columns 0 and 1 unchanged, after the encoded ones
    sparse_threshold=0,       # always return a dense array
)
enc.fit(np.asarray(X))
# Only column 2 is encoded, so unseen values in columns 0 and 1 are fine;
# the values in column 2 must have been seen during fit.
enc.transform(np.asarray([[5, 12, 45212], [4, 13, 45221]]))

Feature selection is about finding the best features for your classifier. This may be important if you do not have enough training data. The idea is to find metrics that characterize the features either by themselves, with respect to the class we want to predict, or with respect to other features.

http://scikit-learn.org/stable/modules/feature_selection.html

https://en.wikipedia.org/wiki/Chi-squared_test

In [None]:
from sklearn.feature_selection import VarianceThreshold

X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
print(np.array(X))

# A 0/1 feature that takes one value with probability p has variance p(1 - p),
# so a threshold of 0.8 * (1 - 0.8) drops features that have the same value
# in more than 80% of the samples.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit(X)
sel.transform(X)

In [None]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the two iris features with the highest chi-squared score with
# respect to the class label.
iris = load_iris()
X, y = iris.data, iris.target
print(X.shape)  # printed explicitly: a bare mid-cell expression is not rendered

sel = SelectKBest(chi2, k=2)
X_new = sel.fit_transform(X, y)
print(X_new.shape)

# sel.scores_ holds the per-feature chi2 statistics; chi2(X, y) returns the
# same statistics together with their p-values.
X_new[:5]  # show the first rows only instead of dumping the whole array

## Classification models

http://scikit-learn.org/stable/supervised_learning.html#supervised-learning


### Preparing the data

In [None]:
from sklearn.datasets import load_iris
from sklearn import utils

# Load the iris data set and take a first look at it.
iris = load_iris()
print(iris.data[:5])      # first five examples
print(iris.target)        # class label of every example
print(iris.target_names)  # class names

# Shuffle features and labels together (fixed seed for reproducibility).
X, y = utils.shuffle(iris.data, iris.target, random_state=1)
print(X.shape)
print(y.shape)
print(y)

In [None]:
# Hold-out split of the shuffled data: the first `train_set_size` rows
# (examples) form the train set, the remaining rows form the test set.
train_set_size = 100
X_train, X_test = X[:train_set_size], X[train_set_size:]
y_train, y_test = y[:train_set_size], y[train_set_size:]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

### Decision Trees
http://scikit-learn.org/stable/modules/tree.html

In [None]:
from sklearn import tree

# Fix the seed: scikit-learn randomly permutes the features at each split
# while building the tree, so without a random_state the fitted tree (and
# the scores printed below) can change from run to run.
dtree = tree.DecisionTreeClassifier(random_state=0)
dtree = dtree.fit(X_train, y_train)
print(dtree.score(X_test, y_test))  # accuracy on the test set

y_pred = dtree.predict(X_test)        # predicted class labels
y_prob = dtree.predict_proba(X_test)  # predicted class probabilities
print(metrics.accuracy_score(y_test, y_pred))  # same accuracy, computed from the predictions
print(metrics.confusion_matrix(y_test, y_pred))



### k-NN Classification

http://scikit-learn.org/stable/modules/neighbors.html

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a 3-nearest-neighbours classifier and report its accuracy and
# confusion matrix on the test set.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print(knn.score(X_test, y_test))

y_pred = knn.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))

### SVM Classification

http://scikit-learn.org/stable/modules/svm.html

http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

In [None]:
from sklearn import svm

# Alternative classifiers to try in place of SVC with default parameters:
#svm_clf = svm.LinearSVC()
#svm_clf = svm.SVC(kernel = 'poly')
svm_clf = svm.SVC().fit(X_train, y_train)
print(svm_clf.score(X_test, y_test))

y_pred = svm_clf.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))

### Logistic Regression

http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression

In [None]:
from sklearn import linear_model

# Train a logistic regression classifier and evaluate it on the test set.
lr_clf = linear_model.LogisticRegression().fit(X_train, y_train)
print(lr_clf.score(X_test, y_test))

y_pred = lr_clf.predict(X_test)
print(metrics.confusion_matrix(y_test, y_pred))

# Per-class probabilities for every test example, followed by the index of
# the most probable class and the probability assigned to it.
probs = lr_clf.predict_proba(X_test)
print(probs)
print(probs.argmax(axis=1))
print(probs.max(axis=1))

## Evaluation

http://scikit-learn.org/stable/model_selection.html#model-selection

### Computing Scores

In [None]:
# Per-class precision, recall, F-score and support for the most recent
# predictions (y_pred comes from the classifier evaluated in the previous cell).
p, r, f, s = metrics.precision_recall_fscore_support(y_test, y_pred)
print(p, r, f, sep='\n')

# The same metrics formatted as a text table.
report = metrics.classification_report(y_test, y_pred)
print(report)

In [None]:
# Toy binary problem: true labels and the scores a classifier assigned to them.
y_true = np.array([0, 0, 1, 1])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])

# Precision/recall pairs obtained by varying the decision threshold.
precision, recall, thresholds = metrics.precision_recall_curve(y_true, y_scores)
plt.scatter(recall, precision)
print(recall, precision, sep='\n')

# ROC curve points and the area under the ROC curve.
fpr, tpr, ths = metrics.roc_curve(y_true, y_scores)
print(metrics.roc_auc_score(y_true, y_scores))

### k-fold cross validation

http://scikit-learn.org/stable/modules/cross_validation.html

http://scikit-learn.org/stable/modules/cross_validation.html#k-fold

In [None]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection.
from sklearn import model_selection

# 5-fold cross-validated accuracy on the full (shuffled) data set.
scores = model_selection.cross_val_score(
    dtree,  # swap in lr_clf, svm_clf or knn to compare classifiers
    X,
    y,
    scoring='accuracy',
    cv=5,
)
print(scores)         # accuracy on each of the 5 folds
print(scores.mean())  # average accuracy over the folds