Joshua PDF

2/22/2019 Joshua
In [ ]: #Joshua.E
#1517105060
In [1]: import pandas as pd

# Since it is a data file with no header, we will supply the column names whic
h have been obtained from the above URL
# Create a python list of column names called "names"
#Input='preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age'
#Output='class'
colnames = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'cl
ass']
#Load the file from local directory using pd.read_csv which is a special form
of read_table
#while reading the data, supply the "colnames" list
df = pd.read_csv("pima-indians-diabetes.data", names= colnames)
In [2]: df.head()
Out[2]:
preg plas pres skin test mass pedi age class
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [3]: desc=df.describe().transpose()
desc
Out[3]:
count mean std min 25% 50% 75% max
preg 768.0 3.845052 3.369578 0.000 1.00000 3.0000 6.00000 17.00
plas 768.0 120.894531 31.972618 0.000 99.00000 117.0000 140.25000 199.00
pres 768.0 69.105469 19.355807 0.000 62.00000 72.0000 80.00000 122.00
skin 768.0 20.536458 15.952218 0.000 0.00000 23.0000 32.00000 99.00
test 768.0 79.799479 115.244002 0.000 0.00000 30.5000 127.25000 846.00
mass 768.0 31.992578 7.884160 0.000 27.30000 32.0000 36.60000 67.10
pedi 768.0 0.471876 0.331329 0.078 0.24375 0.3725 0.62625 2.42
age 768.0 33.240885 11.760232 21.000 24.00000 29.0000 41.00000 81.00
class 768.0 0.348958 0.476951 0.000 0.00000 0.0000 1.00000 1.00
http://localhost:8889/nbconvert/html/Joshua.ipynb?download=false 1/11
2/22/2019 Joshua
In [4]: %matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.style as sty
import seaborn as sns
df.shape
Out[4]: (768, 9)
In [5]: plt.boxplot(df['preg'])
Out[5]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137cedba20>,

<matplotlib.lines.Line2D at 0x2137cedbe48>],
'caps': [<matplotlib.lines.Line2D at 0x2137cedbf28>,
<matplotlib.lines.Line2D at 0x2137ceef6d8>],
'boxes': [<matplotlib.lines.Line2D at 0x2137cedb4a8>],
'medians': [<matplotlib.lines.Line2D at 0x2137ceefb00>],
'fliers': [<matplotlib.lines.Line2D at 0x2137ceeff28>],
'means': []}
2/22/2019 Joshua
In [6]: plt.boxplot(df.plas)
Out[6]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137cf70cf8>,

<matplotlib.lines.Line2D at 0x2137cf79550>],
'caps': [<matplotlib.lines.Line2D at 0x2137cf79978>,
<matplotlib.lines.Line2D at 0x2137cf79da0>],
'boxes': [<matplotlib.lines.Line2D at 0x2137cf70ba8>],
'medians': [<matplotlib.lines.Line2D at 0x2137cf79e80>],
'fliers': [<matplotlib.lines.Line2D at 0x2137cf81630>],
'means': []}
In [7]: plt.boxplot(df.pres)
Out[7]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137cfddc18>,

<matplotlib.lines.Line2D at 0x2137cfddcf8>],
'caps': [<matplotlib.lines.Line2D at 0x2137cfe64a8>,
<matplotlib.lines.Line2D at 0x2137cfe68d0>],
'boxes': [<matplotlib.lines.Line2D at 0x2137cfdd6d8>],
'medians': [<matplotlib.lines.Line2D at 0x2137cfe6cf8>],
'fliers': [<matplotlib.lines.Line2D at 0x2137cfe6dd8>],
'means': []}
2/22/2019 Joshua
In [8]: plt.boxplot(df.skin)
Out[8]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e012a58>,

<matplotlib.lines.Line2D at 0x2137e012e80>],
'caps': [<matplotlib.lines.Line2D at 0x2137e012f60>,
<matplotlib.lines.Line2D at 0x2137e01a710>],
'boxes': [<matplotlib.lines.Line2D at 0x2137e012518>],
'medians': [<matplotlib.lines.Line2D at 0x2137e01ab38>],
'fliers': [<matplotlib.lines.Line2D at 0x2137e01af60>],
'means': []}
In [9]: plt.boxplot(df.test)
Out[9]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e06ae10>,

<matplotlib.lines.Line2D at 0x2137e06aef0>],
'caps': [<matplotlib.lines.Line2D at 0x2137e0726a0>,
<matplotlib.lines.Line2D at 0x2137e072ac8>],
'boxes': [<matplotlib.lines.Line2D at 0x2137e06a8d0>],
'medians': [<matplotlib.lines.Line2D at 0x2137e072ef0>],
'fliers': [<matplotlib.lines.Line2D at 0x2137e072fd0>],
'means': []}
2/22/2019 Joshua
In [10]: plt.boxplot(df.mass)
Out[10]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e0bfda0>,

<matplotlib.lines.Line2D at 0x2137e0bfe80>],
'caps': [<matplotlib.lines.Line2D at 0x2137cf52630>,
<matplotlib.lines.Line2D at 0x2137cf52a58>],
'boxes': [<matplotlib.lines.Line2D at 0x2137e0bf860>],
'medians': [<matplotlib.lines.Line2D at 0x2137cf52e80>],
'fliers': [<matplotlib.lines.Line2D at 0x2137cf52f60>],
'means': []}
In [11]: plt.boxplot(df.pedi)
Out[11]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e11ebe0>,

<matplotlib.lines.Line2D at 0x2137e11ecc0>],
'caps': [<matplotlib.lines.Line2D at 0x2137e127470>,
<matplotlib.lines.Line2D at 0x2137e127898>],
'boxes': [<matplotlib.lines.Line2D at 0x2137e11e6a0>],
'medians': [<matplotlib.lines.Line2D at 0x2137e127cc0>],
'fliers': [<matplotlib.lines.Line2D at 0x2137e127da0>],
'means': []}
2/22/2019 Joshua
In [12]: desc['iqr']=desc['75%']-desc['25%']
desc['upwhisk']=desc['75%']+(1.5*desc.iqr)
desc["lowwhisk"]=desc['25%']+(1.5*desc.iqr)
desc
Out[12]:
count mean std min 25% 50% 75% max iqr u
preg 768.0 3.845052 3.369578 0.000 1.00000 3.0000 6.00000 17.00 5.0000
plas 768.0 120.894531 31.972618 0.000 99.00000 117.0000 140.25000 199.00 41.2500
pres 768.0 69.105469 19.355807 0.000 62.00000 72.0000 80.00000 122.00 18.0000
skin 768.0 20.536458 15.952218 0.000 0.00000 23.0000 32.00000 99.00 32.0000
test 768.0 79.799479 115.244002 0.000 0.00000 30.5000 127.25000 846.00 127.2500
mass 768.0 31.992578 7.884160 0.000 27.30000 32.0000 36.60000 67.10 9.3000
pedi 768.0 0.471876 0.331329 0.078 0.24375 0.3725 0.62625 2.42 0.3825
age 768.0 33.240885 11.760232 21.000 24.00000 29.0000 41.00000 81.00 17.0000
class 768.0 0.348958 0.476951 0.000 0.00000 0.0000 1.00000 1.00 1.0000
In [13]: df.drop(df[(df['test']<desc.loc['test'].lowwhisk)|(df['test']>desc.loc['test']
.upwhisk)].index,inplace=True)
In [ ]:
In [14]: df.describe().transpose()
Out[14]:
count mean std min 25% 50% 75% max
preg 63.0 3.507937 3.192092 0.00 1.000 3.000 5.5000 12.0
plas 63.0 142.476190 28.776591 91.00 122.500 139.000 166.0000 198.0
pres 63.0 74.857143 11.726531 48.00 67.000 76.000 82.0000 110.0
skin 63.0 32.825397 10.073183 7.00 26.500 33.000 40.0000 51.0
test 63.0 238.222222 36.447242 191.00 207.000 230.000 271.5000 318.0
mass 63.0 36.465079 6.767407 22.10 32.850 36.500 40.2500 57.3
pedi 63.0 0.599175 0.327291 0.15 0.354 0.527 0.8235 1.6
age 63.0 33.444444 10.912818 21.00 25.000 30.000 39.5000 58.0
class 63.0 0.507937 0.503953 0.00 0.000 1.000 1.0000 1.0
2/22/2019 Joshua
In [15]: from scipy.stats.stats import pearsonr as pr

print('preg',pr(df['preg'],df['class']))
print('plas',pr(df['plas'],df['class']))
print('pres',pr(df['skin'],df['class']))
print('test',pr(df['test'],df['class']))
print('mass',pr(df['mass'],df['class']))
print('pedi',pr(df['pedi'],df['class']))
print('age',pr(df['age'],df['class']))
df.describe()
preg (0.4085342730658121, 0.000887264032651394)

plas (0.43793832509680725, 0.00033126262783352104)
pres (0.24016001290460404, 0.05797479449677408)
test (0.029758515040939633, 0.8169091264560253)
mass (0.228980407341545, 0.07105163807205235)
pedi (0.06546044046649355, 0.6102484032414921)
age (0.42753629639395974, 0.00047422238612432723)
Out[15]:
preg plas pres skin test mass pedi ag
count 63.000000 63.000000 63.000000 63.000000 63.000000 63.000000 63.000000 63.00000
mean 3.507937 142.476190 74.857143 32.825397 238.222222 36.465079 0.599175 33.44444
std 3.192092 28.776591 11.726531 10.073183 36.447242 6.767407 0.327291 10.91281
min 0.000000 91.000000 48.000000 7.000000 191.000000 22.100000 0.150000 21.00000
25% 1.000000 122.500000 67.000000 26.500000 207.000000 32.850000 0.354000 25.00000
50% 3.000000 139.000000 76.000000 33.000000 230.000000 36.500000 0.527000 30.00000
75% 5.500000 166.000000 82.000000 40.000000 271.500000 40.250000 0.823500 39.50000
max 12.000000 198.000000 110.000000 51.000000 318.000000 57.300000 1.600000 58.00000
2/22/2019 Joshua
In [16]: sns.pairplot(df)
Out[16]: <seaborn.axisgrid.PairGrid at 0x2137e143908>
In [17]: dfn=df.copy()
Y=dfn['class']
Y.shape
Out[17]: (63,)
In [18]: X=dfn[["preg","plas","pres","skin","test","mass","pedi","age"]]
2/22/2019 Joshua
In [19]: from sklearn.model_selection import train_test_split

xtr,xte,ytr,yte=train_test_split(X,Y,train_size=.80,random_state=1)
print(xtr.shape)
print(xte.shape)
print(ytr.shape)
print(yte.shape)
(50, 8)
(13, 8)
(50,)
(13,)
C:\Users\Student\Anaconda3\lib\site-packages\sklearn\model_selection\_split.p
y:2026: FutureWarning: From version 0.21, test_size will always complement tr
ain_size unless both are specified.
FutureWarning)
In [20]: from sklearn.linear_model.logistic import LogisticRegression

from sklearn.metrics import accuracy_score
logreg = LogisticRegression()
logreg.fit(xtr,ytr)
y_pred= logreg.predict(xte)
print("LOGISTIC REGRESSION ACCURACY:",accuracy_score(yte,y_pred))
print('LOGISTIC CLASSIFIER CONFUSTION MATRIX:',pd.crosstab(yte,y_pred))
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(yte, y_pred, average='macro')
LOGISTIC REGRESSION ACCURACY: 0.5384615384615384

LOGISTIC CLASSIFIER CONFUSTION MATRIX: col_0 0 1
class
0 2 3
1 3 5
Out[20]: (0.5125, 0.5125, 0.5125000000000001, None)
In [21]: from sklearn import preprocessing

X2 = preprocessing.scale(xtr)
X3 = preprocessing.scale(xte)
In [22]: from sklearn.linear_model.logistic import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X2,ytr)
y_pred_class_logreg = logreg.predict(X3)
print("LOGISTIC ACCURACY:",accuracy_score(yte,y_pred))
print('LOGISTIC CLASSIFIER CONFUSTION MATRIX:',pd.crosstab(yte,y_pred))
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(yte, y_pred, average='macro')
LOGISTIC ACCURACY: 0.5384615384615384

LOGISTIC CLASSIFIER CONFUSTION MATRIX: col_0 0 1
class
0 2 3
1 3 5
Out[22]: (0.5125, 0.5125, 0.5125000000000001, None)
2/22/2019 Joshua
In [23]: from sklearn.neighbors import KNeighborsClassifier

knn_5 = KNeighborsClassifier(n_neighbors=5)
knn_5.fit(xtr,ytr)
pred_knn_5 = knn_5.predict(xte)
knn_7.fit(xtr,ytr)
knn_10.fit(xtr,ytr)
In [24]: from sklearn import metrics

print("KNN ACCURACY for 5 Neighbors:",metrics.accuracy_score(yte,pred_knn_5))
print('KNN CLASSIFIER CONFUSTION MATRIX - 5 Neighbors:',pd.crosstab(yte,pred_k
nn_5))
print('=======================================================================
=============')
print("KNN ACCURACY for 7 Neighbors:",metrics.accuracy_score(yte,pred_knn_7))
print('KNN CLASSIFIER CONFUSTION MATRIX - 13 Neighbors:',pd.crosstab(yte,pred_
knn_7))
print('=======================================================================
=============')
print("KNN ACCURACY for 10 Neighbors:",metrics.accuracy_score(yte,pred_knn_10
))
print('KNN CLASSIFIER CONFUSTION MATRIX - 21 Neighbors:',pd.crosstab(yte,pred_
knn_10))
KNN ACCURACY for 5 Neighbors: 0.46153846153846156

KNN CLASSIFIER CONFUSTION MATRIX - 5 Neighbors: col_0 0 1
class
0 4 1
1 6 2
=============================================================================
=======
class
0 4 1
1 5 3
=============================================================================
=======
class
0 4 1
1 5 3
2/22/2019 Joshua
In [26]: import numpy as np

error = []
for i in range(1,49):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(xtr, ytr)
pred_i = knn.predict(xte)
error.append(np.mean(pred_i != yte))
plt.figure(figsize=(12, 6))
plt.plot(range(1,49), error, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
Out[26]: Text(0,0.5,'Mean Error')
In [ ]:

Joshua PDF

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Joshua PDF

Uploaded by

Copyright:

Available Formats

2/22/2019 Joshua

In [1]: import pandas as pd

df = pd.read_csv("pima-indians-diabetes.data", names= colnames)

0 6 148 72 35 0 33.6 0.627 50 1

2 8 183 64 0 0 23.3 0.672 32 1

4 0 137 40 35 168 43.1 2.288 33 1

preg 768.0 3.845052 3.369578 0.000 1.00000 3.0000 6.00000 17.00

plas 768.0 120.894531 31.972618 0.000 99.00000 117.0000 140.25000 199.00

pres 768.0 69.105469 19.355807 0.000 62.00000 72.0000 80.00000 122.00

skin 768.0 20.536458 15.952218 0.000 0.00000 23.0000 32.00000 99.00

test 768.0 79.799479 115.244002 0.000 0.00000 30.5000 127.25000 846.00

mass 768.0 31.992578 7.884160 0.000 27.30000 32.0000 36.60000 67.10

pedi 768.0 0.471876 0.331329 0.078 0.24375 0.3725 0.62625 2.42

age 768.0 33.240885 11.760232 21.000 24.00000 29.0000 41.00000 81.00

class 768.0 0.348958 0.476951 0.000 0.00000 0.0000 1.00000 1.00

In [4]: %matplotlib inline

Out[5]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137cedba20>,

Out[6]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137cf70cf8>,

Out[7]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137cfddc18>,

Out[8]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e012a58>,

Out[9]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e06ae10>,

Out[10]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e0bfda0>,

Out[11]: {'whiskers': [<matplotlib.lines.Line2D at 0x2137e11ebe0>,

preg 63.0 3.507937 3.192092 0.00 1.000 3.000 5.5000 12.0

plas 63.0 142.476190 28.776591 91.00 122.500 139.000 166.0000 198.0

pres 63.0 74.857143 11.726531 48.00 67.000 76.000 82.0000 110.0

skin 63.0 32.825397 10.073183 7.00 26.500 33.000 40.0000 51.0

test 63.0 238.222222 36.447242 191.00 207.000 230.000 271.5000 318.0

mass 63.0 36.465079 6.767407 22.10 32.850 36.500 40.2500 57.3

pedi 63.0 0.599175 0.327291 0.15 0.354 0.527 0.8235 1.6

age 63.0 33.444444 10.912818 21.00 25.000 30.000 39.5000 58.0

class 63.0 0.507937 0.503953 0.00 0.000 1.000 1.0000 1.0

In [15]: from scipy.stats.stats import pearsonr as pr

preg (0.4085342730658121, 0.000887264032651394)

count 63.000000 63.000000 63.000000 63.000000 63.000000 63.000000 63.000000 63.00000

mean 3.507937 142.476190 74.857143 32.825397 238.222222 36.465079 0.599175 33.44444

std 3.192092 28.776591 11.726531 10.073183 36.447242 6.767407 0.327291 10.91281

min 0.000000 91.000000 48.000000 7.000000 191.000000 22.100000 0.150000 21.00000

25% 1.000000 122.500000 67.000000 26.500000 207.000000 32.850000 0.354000 25.00000

50% 3.000000 139.000000 76.000000 33.000000 230.000000 36.500000 0.527000 30.00000

75% 5.500000 166.000000 82.000000 40.000000 271.500000 40.250000 0.823500 39.50000

max 12.000000 198.000000 110.000000 51.000000 318.000000 57.300000 1.600000 58.00000

Out[16]: <seaborn.axisgrid.PairGrid at 0x2137e143908>

In [19]: from sklearn.model_selection import train_test_split

In [20]: from sklearn.linear_model.logistic import LogisticRegression

LOGISTIC REGRESSION ACCURACY: 0.5384615384615384

Out[20]: (0.5125, 0.5125, 0.5125000000000001, None)

In [21]: from sklearn import preprocessing

In [22]: from sklearn.linear_model.logistic import LogisticRegression

LOGISTIC ACCURACY: 0.5384615384615384

Out[22]: (0.5125, 0.5125, 0.5125000000000001, None)

In [23]: from sklearn.neighbors import KNeighborsClassifier

In [24]: from sklearn import metrics

KNN ACCURACY for 5 Neighbors: 0.46153846153846156

In [26]: import numpy as np

Out[26]: Text(0,0.5,'Mean Error')

You might also like