You are on page 1of 11

2/22/2019 Joshua

In [ ]: #Joshua.E
#1517105060

In [1]: import pandas as pd


# Since it is a data file with no header, we will supply the column names whic
h have been obtained from the above URL
# Create a python list of column names called "names"
#Input='preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age'
#Output='class'

colnames = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'cl
ass']

#Load the file from local directory using pd.read_csv which is a special form
of read_table
#while reading the data, supply the "colnames" list

df = pd.read_csv("pima-indians-diabetes.data", names= colnames)

In [2]: df.head()

Out[2]:
preg plas pres skin test mass pedi age class

0 6 148 72 35 0 33.6 0.627 50 1

1 1 85 66 29 0 26.6 0.351 31 0

2 8 183 64 0 0 23.3 0.672 32 1

3 1 89 66 23 94 28.1 0.167 21 0

4 0 137 40 35 168 43.1 2.288 33 1

In [3]: desc=df.describe().transpose()
desc

Out[3]:
count mean std min 25% 50% 75% max

preg 768.0 3.845052 3.369578 0.000 1.00000 3.0000 6.00000 17.00

plas 768.0 120.894531 31.972618 0.000 99.00000 117.0000 140.25000 199.00

pres 768.0 69.105469 19.355807 0.000 62.00000 72.0000 80.00000 122.00

skin 768.0 20.536458 15.952218 0.000 0.00000 23.0000 32.00000 99.00

test 768.0 79.799479 115.244002 0.000 0.00000 30.5000 127.25000 846.00

mass 768.0 31.992578 7.884160 0.000 27.30000 32.0000 36.60000 67.10

pedi 768.0 0.471876 0.331329 0.078 0.24375 0.3725 0.62625 2.42

age 768.0 33.240885 11.760232 21.000 24.00000 29.0000 41.00000 81.00

class 768.0 0.348958 0.476951 0.000 0.00000 0.0000 1.00000 1.00

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 1/11
2/22/2019 Joshua

In [4]: %matplotlib inline


import matplotlib.pyplot as plt
import matplotlib.style as sty
import seaborn as sns
df.shape

Out[4]: (768, 9)

In [5]: plt.boxplot(df['preg'])

Out[5]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137cedba20>,


<matplotlib.lines.Line2D at 0x2137cedbe48>],
'caps': [<matplotlib.lines.Line2D at 0x2137cedbf28>,
<matplotlib.lines.Line2D at 0x2137ceef6d8>],
'boxes': [<matplotlib.lines.Line2D at 0x2137cedb4a8>],
'medians': [<matplotlib.lines.Line2D at 0x2137ceefb00>],
'fliers': [<matplotlib.lines.Line2D at 0x2137ceeff28>],
'means': []}

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 2/11
2/22/2019 Joshua

In [6]: plt.boxplot(df.plas)

Out[6]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137cf70cf8>,


<matplotlib.lines.Line2D at 0x2137cf79550>],
'caps': [<matplotlib.lines.Line2D at 0x2137cf79978>,
<matplotlib.lines.Line2D at 0x2137cf79da0>],
'boxes': [<matplotlib.lines.Line2D at 0x2137cf70ba8>],
'medians': [<matplotlib.lines.Line2D at 0x2137cf79e80>],
'fliers': [<matplotlib.lines.Line2D at 0x2137cf81630>],
'means': []}

In [7]: plt.boxplot(df.pres)

Out[7]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137cfddc18>,


<matplotlib.lines.Line2D at 0x2137cfddcf8>],
'caps': [<matplotlib.lines.Line2D at 0x2137cfe64a8>,
<matplotlib.lines.Line2D at 0x2137cfe68d0>],
'boxes': [<matplotlib.lines.Line2D at 0x2137cfdd6d8>],
'medians': [<matplotlib.lines.Line2D at 0x2137cfe6cf8>],
'fliers': [<matplotlib.lines.Line2D at 0x2137cfe6dd8>],
'means': []}

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 3/11
2/22/2019 Joshua

In [8]: plt.boxplot(df.skin)

Out[8]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e012a58>,


<matplotlib.lines.Line2D at 0x2137e012e80>],
'caps': [<matplotlib.lines.Line2D at 0x2137e012f60>,
<matplotlib.lines.Line2D at 0x2137e01a710>],
'boxes': [<matplotlib.lines.Line2D at 0x2137e012518>],
'medians': [<matplotlib.lines.Line2D at 0x2137e01ab38>],
'fliers': [<matplotlib.lines.Line2D at 0x2137e01af60>],
'means': []}

In [9]: plt.boxplot(df.test)

Out[9]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e06ae10>,


<matplotlib.lines.Line2D at 0x2137e06aef0>],
'caps': [<matplotlib.lines.Line2D at 0x2137e0726a0>,
<matplotlib.lines.Line2D at 0x2137e072ac8>],
'boxes': [<matplotlib.lines.Line2D at 0x2137e06a8d0>],
'medians': [<matplotlib.lines.Line2D at 0x2137e072ef0>],
'fliers': [<matplotlib.lines.Line2D at 0x2137e072fd0>],
'means': []}

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 4/11
2/22/2019 Joshua

In [10]: plt.boxplot(df.mass)

Out[10]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e0bfda0>,


<matplotlib.lines.Line2D at 0x2137e0bfe80>],
'caps': [<matplotlib.lines.Line2D at 0x2137cf52630>,
<matplotlib.lines.Line2D at 0x2137cf52a58>],
'boxes': [<matplotlib.lines.Line2D at 0x2137e0bf860>],
'medians': [<matplotlib.lines.Line2D at 0x2137cf52e80>],
'fliers': [<matplotlib.lines.Line2D at 0x2137cf52f60>],
'means': []}

In [11]: plt.boxplot(df.pedi)

Out[11]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e11ebe0>,


<matplotlib.lines.Line2D at 0x2137e11ecc0>],
'caps': [<matplotlib.lines.Line2D at 0x2137e127470>,
<matplotlib.lines.Line2D at 0x2137e127898>],
'boxes': [<matplotlib.lines.Line2D at 0x2137e11e6a0>],
'medians': [<matplotlib.lines.Line2D at 0x2137e127cc0>],
'fliers': [<matplotlib.lines.Line2D at 0x2137e127da0>],
'means': []}

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 5/11
2/22/2019 Joshua

In [12]: desc['iqr']=desc['75%']-desc['25%']
desc['upwhisk']=desc['75%']+(1.5*desc.iqr)
desc["lowwhisk"]=desc['25%']+(1.5*desc.iqr)
desc

Out[12]:
count mean std min 25% 50% 75% max iqr u

preg 768.0 3.845052 3.369578 0.000 1.00000 3.0000 6.00000 17.00 5.0000

plas 768.0 120.894531 31.972618 0.000 99.00000 117.0000 140.25000 199.00 41.2500

pres 768.0 69.105469 19.355807 0.000 62.00000 72.0000 80.00000 122.00 18.0000

skin 768.0 20.536458 15.952218 0.000 0.00000 23.0000 32.00000 99.00 32.0000

test 768.0 79.799479 115.244002 0.000 0.00000 30.5000 127.25000 846.00 127.2500

mass 768.0 31.992578 7.884160 0.000 27.30000 32.0000 36.60000 67.10 9.3000

pedi 768.0 0.471876 0.331329 0.078 0.24375 0.3725 0.62625 2.42 0.3825

age 768.0 33.240885 11.760232 21.000 24.00000 29.0000 41.00000 81.00 17.0000

class 768.0 0.348958 0.476951 0.000 0.00000 0.0000 1.00000 1.00 1.0000

In [13]: df.drop(df[(df['test']<desc.loc['test'].lowwhisk)|(df['test']>desc.loc['test']
.upwhisk)].index,inplace=True)

In [ ]:

In [14]: df.describe().transpose()

Out[14]:
count mean std min 25% 50% 75% max

preg 63.0 3.507937 3.192092 0.00 1.000 3.000 5.5000 12.0

plas 63.0 142.476190 28.776591 91.00 122.500 139.000 166.0000 198.0

pres 63.0 74.857143 11.726531 48.00 67.000 76.000 82.0000 110.0

skin 63.0 32.825397 10.073183 7.00 26.500 33.000 40.0000 51.0

test 63.0 238.222222 36.447242 191.00 207.000 230.000 271.5000 318.0

mass 63.0 36.465079 6.767407 22.10 32.850 36.500 40.2500 57.3

pedi 63.0 0.599175 0.327291 0.15 0.354 0.527 0.8235 1.6

age 63.0 33.444444 10.912818 21.00 25.000 30.000 39.5000 58.0

class 63.0 0.507937 0.503953 0.00 0.000 1.000 1.0000 1.0

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 6/11
2/22/2019 Joshua

In [15]: from scipy.stats.stats import pearsonr as pr


print('preg',pr(df['preg'],df['class']))
print('plas',pr(df['plas'],df['class']))
print('pres',pr(df['skin'],df['class']))
print('test',pr(df['test'],df['class']))
print('mass',pr(df['mass'],df['class']))
print('pedi',pr(df['pedi'],df['class']))
print('age',pr(df['age'],df['class']))
df.describe()

preg (0.4085342730658121, 0.000887264032651394)


plas (0.43793832509680725, 0.00033126262783352104)
pres (0.24016001290460404, 0.05797479449677408)
test (0.029758515040939633, 0.8169091264560253)
mass (0.228980407341545, 0.07105163807205235)
pedi (0.06546044046649355, 0.6102484032414921)
age (0.42753629639395974, 0.00047422238612432723)

Out[15]:
preg plas pres skin test mass pedi ag

count 63.000000 63.000000 63.000000 63.000000 63.000000 63.000000 63.000000 63.00000

mean 3.507937 142.476190 74.857143 32.825397 238.222222 36.465079 0.599175 33.44444

std 3.192092 28.776591 11.726531 10.073183 36.447242 6.767407 0.327291 10.91281

min 0.000000 91.000000 48.000000 7.000000 191.000000 22.100000 0.150000 21.00000

25% 1.000000 122.500000 67.000000 26.500000 207.000000 32.850000 0.354000 25.00000

50% 3.000000 139.000000 76.000000 33.000000 230.000000 36.500000 0.527000 30.00000

75% 5.500000 166.000000 82.000000 40.000000 271.500000 40.250000 0.823500 39.50000

max 12.000000 198.000000 110.000000 51.000000 318.000000 57.300000 1.600000 58.00000

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 7/11
2/22/2019 Joshua

In [16]: sns.pairplot(df)

Out[16]: <seaborn.axisgrid.PairGrid at 0x2137e143908>

In [17]: dfn=df.copy()
Y=dfn['class']
Y.shape

Out[17]: (63,)

In [18]: X=dfn[["preg","plas","pres","skin","test","mass","pedi","age"]]

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 8/11
2/22/2019 Joshua

In [19]: from sklearn.model_selection import train_test_split


xtr,xte,ytr,yte=train_test_split(X,Y,train_size=.80,random_state=1)
print(xtr.shape)
print(xte.shape)
print(ytr.shape)
print(yte.shape)

(50, 8)
(13, 8)
(50,)
(13,)

C:\Users\Student\Anaconda3\lib\site-packages\sklearn\model_selection\_split.p
y:2026: FutureWarning: From version 0.21, test_size will always complement tr
ain_size unless both are specified.
FutureWarning)

In [20]: from sklearn.linear_model.logistic import LogisticRegression


from sklearn.metrics import accuracy_score
logreg = LogisticRegression()
logreg.fit(xtr,ytr)
y_pred= logreg.predict(xte)
print("LOGISTIC REGRESSION ACCURACY:",accuracy_score(yte,y_pred))
print('LOGISTIC CLASSIFIER CONFUSTION MATRIX:',pd.crosstab(yte,y_pred))
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(yte, y_pred, average='macro')

LOGISTIC REGRESSION ACCURACY: 0.5384615384615384


LOGISTIC CLASSIFIER CONFUSTION MATRIX: col_0 0 1
class
0 2 3
1 3 5

Out[20]: (0.5125, 0.5125, 0.5125000000000001, None)

In [21]: from sklearn import preprocessing


X2 = preprocessing.scale(xtr)
X3 = preprocessing.scale(xte)

In [22]: from sklearn.linear_model.logistic import LogisticRegression


logreg = LogisticRegression()
logreg.fit(X2,ytr)
y_pred_class_logreg = logreg.predict(X3)
print("LOGISTIC ACCURACY:",accuracy_score(yte,y_pred))
print('LOGISTIC CLASSIFIER CONFUSTION MATRIX:',pd.crosstab(yte,y_pred))
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(yte, y_pred, average='macro')

LOGISTIC ACCURACY: 0.5384615384615384


LOGISTIC CLASSIFIER CONFUSTION MATRIX: col_0 0 1
class
0 2 3
1 3 5

Out[22]: (0.5125, 0.5125, 0.5125000000000001, None)

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 9/11
2/22/2019 Joshua

In [23]: from sklearn.neighbors import KNeighborsClassifier


knn_5 = KNeighborsClassifier(n_neighbors=5)
knn_7 = KNeighborsClassifier(n_neighbors=7)
knn_10 = KNeighborsClassifier(n_neighbors=10)
knn_5.fit(xtr,ytr)
pred_knn_5 = knn_5.predict(xte)
knn_7.fit(xtr,ytr)
pred_knn_7 = knn_7.predict(xte)
knn_10.fit(xtr,ytr)
pred_knn_10 = knn_7.predict(xte)

In [24]: from sklearn import metrics


print("KNN ACCURACY for 5 Neighbors:",metrics.accuracy_score(yte,pred_knn_5))
print('KNN CLASSIFIER CONFUSTION MATRIX - 5 Neighbors:',pd.crosstab(yte,pred_k
nn_5))
print('=======================================================================
=============')
print("KNN ACCURACY for 7 Neighbors:",metrics.accuracy_score(yte,pred_knn_7))
print('KNN CLASSIFIER CONFUSTION MATRIX - 13 Neighbors:',pd.crosstab(yte,pred_
knn_7))
print('=======================================================================
=============')
print("KNN ACCURACY for 10 Neighbors:",metrics.accuracy_score(yte,pred_knn_10
))
print('KNN CLASSIFIER CONFUSTION MATRIX - 21 Neighbors:',pd.crosstab(yte,pred_
knn_10))

KNN ACCURACY for 5 Neighbors: 0.46153846153846156


KNN CLASSIFIER CONFUSTION MATRIX - 5 Neighbors: col_0 0 1
class
0 4 1
1 6 2
=============================================================================
=======
KNN ACCURACY for 7 Neighbors: 0.5384615384615384
KNN CLASSIFIER CONFUSTION MATRIX - 13 Neighbors: col_0 0 1
class
0 4 1
1 5 3
=============================================================================
=======
KNN ACCURACY for 10 Neighbors: 0.5384615384615384
KNN CLASSIFIER CONFUSTION MATRIX - 21 Neighbors: col_0 0 1
class
0 4 1
1 5 3

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 10/11
2/22/2019 Joshua

In [26]: import numpy as np


error = []
for i in range(1,49):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(xtr, ytr)
pred_i = knn.predict(xte)
error.append(np.mean(pred_i != yte))
plt.figure(figsize=(12, 6))
plt.plot(range(1,49), error, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

Out[26]: Text(0,0.5,'Mean Error')

In [ ]:

http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 11/11

You might also like