import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
%matplotlib inline
heart_df = pd.read_csv("framingham.csv")
heart_df.drop(['education'], axis=1, inplace=True)  # drop a column not used in this analysis
heart_df.head()
heart_df.isnull().sum()  # count missing values per column
heart_df.dropna(axis=0, inplace=True)  # drop rows with any missing value
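Dropping rows silently shrinks the dataset; a quick sanity check (a small sketch, not in the original run) shows how many rows survive:
print(heart_df.shape)  # rows remaining after dropna; the class counts quoted below imply 3751 rows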
'''def draw_histograms(dataframe, features, rows, cols):
    fig = plt.figure(figsize=(20, 20))
    for i, feature in enumerate(features):
        ax = fig.add_subplot(rows, cols, i + 1)
        dataframe[feature].hist(bins=20, ax=ax, facecolor='midnightblue')
        ax.set_title(feature + " Distribution", color='DarkRed')
    fig.tight_layout()
    plt.show()

draw_histograms(heart_df, heart_df.columns, 6, 3)'''
sns.countplot(x='TenYearCHD', data=heart_df)
There are 3179 patients with no heart disease and 572 patients at risk of heart disease, so the classes are heavily imbalanced.
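The imbalance can also be read off numerically; a minimal sketch (not in the original notebook):

# Sketch: absolute counts and proportions of each TenYearCHD class
counts = heart_df['TenYearCHD'].value_counts()
print(counts)
print(counts / counts.sum())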
heart_df.describe()
When all the selected features are plugged in:
$$\operatorname{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1\,\text{Sexmale} + \beta_2\,\text{age} + \beta_3\,\text{cigsPerDay} + \beta_4\,\text{totChol} + \beta_5\,\text{sysBP} + \beta_6\,\text{glucose}$$
This fitted model shows that, holding all other features constant, the odds of being diagnosed with heart disease for males (Sexmale = 1) relative to females (Sexmale = 0) are exp(0.5815) = 1.7887. In terms of percent change, the odds for males are about 78.9% higher than the odds for females.
The coefficient for age says that, holding all else constant, the odds of being diagnosed with CHD rise by about 6.8% for each one-year increase in age, since exp(0.0655) = 1.0676.
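The coefficients quoted above come from a model fit not shown in this section. A minimal sketch of how they could be reproduced with statsmodels' Logit API, assuming the sex column in this CSV is named male:

import statsmodels.api as sm

# Hypothetical reconstruction of the fit behind the quoted coefficients.
# Column names are assumptions about the Framingham CSV schema.
cols = ['male', 'age', 'cigsPerDay', 'totChol', 'sysBP', 'glucose']
X = sm.add_constant(heart_df[cols])  # add the intercept term beta_0
y = heart_df['TenYearCHD']
result = sm.Logit(y, X).fit()
print(result.summary())
print(np.exp(result.params))  # odds ratios, e.g. exp(coef of male)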
from sklearn.model_selection import train_test_split

outcomes = heart_df['TenYearCHD']
features = heart_df.drop('TenYearCHD', axis=1)
x_train, x_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.20, random_state=5)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
logreg=LogisticRegression(max_iter=1200,C=0.01,random_state=4)
logreg.fit(x_train,y_train)
y_train_pred=logreg.predict(x_train)
y_test_pred=logreg.predict(x_test)
# f1_score expects (y_true, y_pred) in that order
print('The training F1 Score is', f1_score(y_train, y_train_pred))
print('The testing F1 Score is', f1_score(y_test, y_test_pred))
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)
logreg
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
decTree=DecisionTreeClassifier(max_depth=10,min_samples_leaf=9, min_samples_split=3)
decTree.fit(x_train,y_train)
y_train_pred=decTree.predict(x_train)
y_test_pred=decTree.predict(x_test)
print('The training F1 Score is', f1_score(y_train, y_train_pred))
print('The testing F1 Score is', f1_score(y_test, y_test_pred))
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)
decTree
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
svm=SVC()
svm.fit(x_train,y_train)
y_train_pred=svm.predict(x_train)
y_test_pred=svm.predict(x_test)
print('The training F1 Score is', f1_score(y_train, y_train_pred))
print('The testing F1 Score is', f1_score(y_test, y_test_pred))
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)
svm
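One caveat on the model above: SVC with its default RBF kernel is sensitive to feature scale, and these features span very different ranges (cigsPerDay vs. sysBP, for example). A minimal sketch of a scaled variant using sklearn's standard Pipeline and StandardScaler, not part of the original run:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Sketch: standardize features before fitting the SVM
svm_scaled = make_pipeline(StandardScaler(), SVC())
svm_scaled.fit(x_train, y_train)
print('Scaled-SVM test accuracy:', svm_scaled.score(x_test, y_test))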
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
nn = MLPClassifier(activation='relu', solver='adam', learning_rate_init=.01, learning_rate='constant',
                   max_iter=1200, early_stopping=False, n_iter_no_change=10, shuffle=True,
                   validation_fraction=.1, random_state=42, batch_size=32, hidden_layer_sizes=(100,))
nn.fit(x_train,y_train)
y_train_pred = nn.predict(x_train)
y_test_pred = nn.predict(x_test)
print('The training F1 Score is', f1_score(y_train, y_train_pred))
print('The testing F1 Score is', f1_score(y_test, y_test_pred))
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)
nn
from sklearn.metrics import confusion_matrix

# y_test_pred still holds the predictions of the last model fit above (the MLP)
cm = confusion_matrix(y_test, y_test_pred)
conf_matrix = pd.DataFrame(data=cm, columns=['Predicted:0', 'Predicted:1'], index=['Actual:0', 'Actual:1'])
plt.figure(figsize=(8, 5))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="YlGnBu")
The confusion matrix shows 658 + 4 = 662 correct predictions and 88 + 1 = 89 incorrect ones.
True Positives: 4
True Negatives: 658
False Positives: 1 (Type I error)
False Negatives: 88 (Type II error)
Nearly all of the errors are false negatives: the model misses most of the truly at-risk patients, which shows up as the very low sensitivity computed below.
TN=cm[0,0]
TP=cm[1,1]
FN=cm[1,0]
FP=cm[0,1]
sensitivity=TP/float(TP+FN)
specificity=TN/float(TN+FP)
print('The accuracy of the model = (TP+TN)/(TP+TN+FP+FN) = ', (TP+TN)/float(TP+TN+FP+FN), '\n',
      'The misclassification rate = 1-Accuracy = ', 1-((TP+TN)/float(TP+TN+FP+FN)), '\n',
      'Sensitivity or True Positive Rate = TP/(TP+FN) = ', TP/float(TP+FN), '\n',
      'Specificity or True Negative Rate = TN/(TN+FP) = ', TN/float(TN+FP), '\n',
      'Positive Predictive Value = TP/(TP+FP) = ', TP/float(TP+FP), '\n',
      'Negative Predictive Value = TN/(TN+FN) = ', TN/float(TN+FN), '\n',
      'Positive Likelihood Ratio = Sensitivity/(1-Specificity) = ', sensitivity/(1-specificity), '\n',
      'Negative Likelihood Ratio = (1-Sensitivity)/Specificity = ', (1-sensitivity)/specificity)
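As a cross-check, sklearn can report several of these quantities directly; a minimal sketch using the standard classification_report (not part of the original notebook):

from sklearn.metrics import classification_report

# Recall for class 1 is the sensitivity computed by hand above;
# precision for class 1 is the positive predictive value.
print(classification_report(y_test, y_test_pred, digits=4))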
'''import pickle
filename = 'LR_model.sav'
pickle.dump(logreg, open(filename, 'wb'))'''
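If the commented-out save step above is run, the matching load call would look like this; a minimal sketch assuming the same filename, kept commented out in the same style:

'''import pickle
# Sketch: reload the saved logistic regression model
with open('LR_model.sav', 'rb') as f:
    loaded_model = pickle.load(f)
print(loaded_model.predict(x_test[:5]))  # sanity check on a few rows'''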