# low level
from enum import Enum
import os
import random
from time import time
import datetime
import math
# middle level
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
import sklearn
# frameworks
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import transforms
data = pd.read_csv("framingham.csv")
data = data.dropna()
data
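# sanity check (an addition, not in the original flow): dropna() should leave
# no missing values behind
assert not data.isna().any().any(), "unexpected NaNs after dropna()"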
to_dummy_data = data.copy()
for d in ["age", "education", "cigsPerDay", "totChol", "sysBP", "diaBP", "BMI", "heartRate", "glucose"]:
to_dummy_data.pop(d)
to_dummy_data
to_normalize_data = data.copy()
for d in ["male", "currentSmoker", "BPMeds", "prevalentStroke", "prevalentHyp", "diabetes", "TenYearCHD"]:
to_normalize_data.pop(d)
to_normalize_data
to_normalize_data_stats = to_normalize_data.describe()
to_normalize_data_stats = to_normalize_data_stats.transpose()
to_normalize_data_stats
def norm(x):
    # z-score normalization using the per-column mean and std computed above
    return (x - to_normalize_data_stats['mean']) / to_normalize_data_stats['std']
normalize_data = norm(to_normalize_data)
normalize_data
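# quick sanity check (an addition): after z-scoring, every column should have
# mean ≈ 0 and std ≈ 1
print(normalize_data.describe().transpose()[['mean', 'std']])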
dummy_data = {}
# one-hot encode each binary column; get_dummies yields two columns (0 and 1)
for d in to_dummy_data:
    dummy_data[d] = pd.get_dummies(to_dummy_data[d])
for d in dummy_data:
    print(d, '\n', dummy_data[d][:10])
# reset indices so positional lookups in the Dataset still work after dropna()
# left gaps in the original index
normalize_data = normalize_data.reset_index()
for d in dummy_data:
    dummy_data[d] = dummy_data[d].reset_index()
class DataClass(Dataset):
    def __init__(self, normalize_data, dummy_data, targeted_features):
        self.normalize_data = normalize_data
        self.dummy_data = dummy_data
        self.targeted_features = targeted_features

    def __len__(self):
        return len(self.normalize_data)

    def __getitem__(self, indx):
        inputs = []
        inputs_labels = []
        # continuous features come straight from the normalized frame
        for d in self.normalize_data:
            if d not in self.targeted_features:
                continue
            inputs.append(self.normalize_data[d][indx])
            inputs_labels.append(d)
        # each binary feature contributes both of its one-hot columns (0 and 1)
        for d in self.dummy_data:
            if d not in self.targeted_features:
                continue
            inputs.append(self.dummy_data[d][0][indx])
            inputs.append(self.dummy_data[d][1][indx])
            inputs_labels.append("not" + d)
            inputs_labels.append(d)
        # the target is the one-hot encoding of TenYearCHD
        outputs = [self.dummy_data["TenYearCHD"][0][indx], self.dummy_data["TenYearCHD"][1][indx]]
        return [inputs, outputs, inputs_labels]
targeted_features = ['age', 'education', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI',
                     'heartRate', 'glucose', 'male', 'currentSmoker', 'BPMeds',
                     'prevalentStroke', 'prevalentHyp', 'diabetes']
d = DataClass(normalize_data,dummy_data, targeted_features)
d[14]  # spot-check one sample
len(d) == len(dummy_data['male']) == len(normalize_data)
data = DataClass(normalize_data, dummy_data, targeted_features)
inputs = []
outputs = []
# materialize the whole dataset as plain Python lists, with a progress readout
for i in range(len(data)):
    sample_inputs, sample_outputs, input_labels = data[i]
    inputs.append(sample_inputs)
    outputs.append(sample_outputs)
    print(f"\r{int(((i + 1) / len(data)) * 100)} % {i}", end='')
inputs = np.asarray(inputs)
outputs = np.asarray(outputs)
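# shape check (an addition): expect (n_samples, 21) inputs and (n_samples, 2)
# one-hot targets
print(inputs.shape, outputs.shape)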
# linear regression feature importance
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot
# define dataset
X, y = np.copy(inputs), np.copy(outputs)
print("done")
# define the model
model = LinearRegression()
# fit the model
model.fit(X, y)
# get importance; with a two-column target, coef_ has shape (2, n_features)
importance = model.coef_
# summarize feature importance
importance_list = []
# coefficients for the first output column (class 0 of the one-hot target)
for i, v in enumerate(importance[0]):
    importance_list.append([i, v])
    print('Feature: %0d, Score: %.5f' % (i, v))
# plot feature importance
pyplot.bar(range(len(importance[0])), importance[0])
pyplot.show()
importance_list.sort(key=lambda x: abs(x[1]))
scores = []
# walk features from most to least important, keeping one bar per feature
# (skip the redundant "not<feature>" one-hot column)
for indx, score in importance_list[::-1]:
    if input_labels[indx][:3] != "not":
        scores.append(score)
        print(indx, input_labels[indx], score)
pyplot.bar(range(len(scores)), scores)
pyplot.show()
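# optional variant (an addition): label the bars with the feature names so the
# chart reads without cross-referencing the printout above
names = [input_labels[indx] for indx, _ in importance_list[::-1] if input_labels[indx][:3] != "not"]
pyplot.bar(names, scores)
pyplot.xticks(rotation=90)
pyplot.show()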
def create_datasets(normalize_data=normalize_data, dummy_data=dummy_data, targeted_features=targeted_features, batch_size=32, valid_size=0.25):
    train_dataset = DataClass(normalize_data, dummy_data, targeted_features)
    valid_dataset = DataClass(normalize_data, dummy_data, targeted_features)
    train_size = len(train_dataset)
    # shuffle the indices, then carve off valid_size of them for validation
    indices = list(range(train_size))
    np.random.shuffle(indices)
    valid_split_size = int(valid_size * train_size)
    train_indices, valid_indices = indices[valid_split_size:], indices[:valid_split_size]
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(valid_indices)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, sampler=valid_sampler)
    print(f"\nTrain set: {len(train_indices)} samples")
    print(f"Valid set: {len(valid_indices)} samples")
    return train_loader, valid_loader
train_loader, valid_loader = create_datasets()
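# illustrative peek at one batch (an addition): the default collate turns each
# sample's feature list into a list of per-feature tensors of length batch_size
batch_inputs, batch_outputs, batch_labels = next(iter(train_loader))
print(len(batch_inputs), batch_inputs[0].shape)  # 21 features, each a (32,) tensor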
class Model(nn.Module):
    def __init__(self, input_size=20, n=15):
        super(Model, self).__init__()
        # simple MLP: three hidden layers of width n, two-way output
        self.fc1 = nn.Linear(input_size, n)
        self.fc2 = nn.Linear(n, n)
        self.fc3 = nn.Linear(n, n)
        self.fc4 = nn.Linear(n, 2)
        self.ReLU = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.ReLU(x)
        x = self.fc2(x)
        x = self.ReLU(x)
        x = self.fc3(x)
        x = self.ReLU(x)
        x = self.fc4(x)
        return x
model = Model(21)  # 9 continuous features + 6 binary features x 2 one-hot columns = 21 inputs
model(torch.rand(1, 21))  # smoke test on a random batch of one
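# parameter count for reference (an addition): this MLP is tiny
print(sum(p.numel() for p in model.parameters()), "trainable parameters")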
class EarlyStopping:
    def __init__(self, patience=15, path='/content'):
        self.patience = patience
        self.path = path
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.inf

    def __call__(self, val_loss, model, epoch):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
        if score < self.best_score:
            # no improvement: count toward the patience budget
            self.counter += 1
            print(f'\rEpoch {epoch} EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            print("\rEpoch {} loss reduced from {} to {}".format(epoch, -self.best_score, val_loss))
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        # keep the weights of the best model seen so far
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss
def train(train_loader, model, optimizer, criterion):
    model.train()
    t = time()
    train_loss = 0
    for i, data in enumerate(train_loader):
        inputs, outputs, labels = data
        # the loader yields lists of per-feature tensors; stack them into
        # (batch, features) float tensors
        inputs = torch.stack([inp.double() for inp in inputs], 1).float()
        outputs = torch.stack([out.double() for out in outputs], 1).float()
        optimizer.zero_grad()
        _output = model(inputs)
        loss = criterion(_output, outputs)
        loss.backward()
        optimizer.step()
        # running mean of the batch losses
        train_loss += 1 / (i + 1) * (loss.item() - train_loss)
        torch.cuda.empty_cache()
        delay = time() - t
        hms = str(datetime.timedelta(seconds=int(delay * (len(train_loader) - (i + 1)))))
        print(f"\rTraining finished batches {i+1}/{len(train_loader)} {int(((i+1)/len(train_loader))*100)}% delay {int(delay)}s time left {hms} loss {loss.item()}", end='')
        t = time()
    return train_loss
def validation(valid_loader, model, criterion):
    model.eval()
    loss, running_loss = 0.0, 0.0
    t = time()
    with torch.no_grad():
        for i, data in enumerate(valid_loader):
            inputs, outputs, labels = data
            inputs = torch.stack([inp.double() for inp in inputs], 1).float()
            outputs = torch.stack([out.double() for out in outputs], 1).float()
            _output = model(inputs)
            loss = criterion(_output, outputs)
            running_loss += loss.item()
            delay = time() - t
            hms = str(datetime.timedelta(seconds=int(delay * (len(valid_loader) - (i + 1)))))
            print(f"\rValidation finished batches {i+1}/{len(valid_loader)} {int(((i+1)/len(valid_loader))*100)}% delay {int(delay)}s time left {hms} loss {loss.item()}", end='')
            t = time()
    avg_loss = running_loss / len(valid_loader)
    model.train()
    return avg_loss
def worker(n_epochs, train_loader, valid_loader, model, optimizer, criterion, early_stop):
    all_train_loss, all_val_loss = [], []
    for epoch in range(n_epochs):
        train_loss = train(train_loader, model, optimizer, criterion)
        val_loss = validation(valid_loader, model, criterion)
        early_stop(val_loss, model, epoch)
        all_train_loss.append(train_loss)
        all_val_loss.append(val_loss)
        if early_stop.early_stop:
            break
    return all_train_loss, all_val_loss
optimizer = optim.SGD(model.parameters(), 0.01)
criterion = nn.MSELoss()
validation(valid_loader, model, criterion);  # baseline loss with untrained weights
es = EarlyStopping(10, "model.pt")
all_train_loss, all_val_loss = worker(10000, train_loader, valid_loader, model, optimizer, criterion, es)
def display_graph(train_losses, valid_losses):
    plt.plot(train_losses, label='Training loss')
    plt.plot(valid_losses, label='Validation loss')
    plt.legend(frameon=False)
display_graph(all_train_loss, all_val_loss)
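# illustrative evaluation sketch (an addition, not in the original notebook):
# treat the two-way regression output as class scores and measure accuracy on
# the validation loader with the current in-memory weights
correct, total = 0, 0
model.eval()
with torch.no_grad():
    for inputs, outputs, labels in valid_loader:
        inputs = torch.stack([inp.double() for inp in inputs], 1).float()
        outputs = torch.stack([out.double() for out in outputs], 1).float()
        preds = model(inputs).argmax(dim=1)
        correct += (preds == outputs.argmax(dim=1)).sum().item()
        total += len(preds)
print(f"validation accuracy: {correct / total:.3f}")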
# disabled export cells: reload the best checkpoint, then trace the model to
# TorchScript for mobile deployment
'''model.load_state_dict(torch.load("/content/model.pt"))'''
'''model.eval()
example = torch.rand(1, 21)
traced_script_module = torch.jit.trace(model, example)
traced_script_module.save("/content/andoirdmodel.pt")'''