In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/My Drive/Tohoku_Univ/Interpolation

/content/drive/My Drive/Tohoku_Univ/Interpolation


In [3]:
import sys
import glob
import math
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchvision.datasets import FashionMNIST
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline

class MedicalDataset(torch.utils.data.Dataset):
    def __init__(self, dataset_name, init_size, resize_num, interpolation=""):
        if dataset_name == "brain_tumor":
            self.dataset = "brain-mri-images-for-brain-tumor-detection/"
        elif dataset_name == "covid":
            self.dataset = "CT_COVID/"
        elif dataset_name == "gender":
            self.dataset = "Gender/"
        elif dataset_name == "hemorrhage":
            self.dataset = "head-ct-hemorrhage/"
        else:
            print('Error: unvalid dataset', file=sys.stderr)
        
        if interpolation == "nearest":
            self.method = Image.NEAREST
        elif interpolation == "box":
            self.method = Image.BOX
        elif interpolation == "bilinear":
            self.method = Image.BILINEAR
        elif interpolation == "hamming":
            self.method = Image.HAMMING
        elif interpolation == "bicubic":
            self.method = Image.BICUBIC
        elif interpolation == "lanczos":
            self.method = Image.LANCZOS
        else:
            self.method = -1
            
        self.patients_0, self.patients_1, self.N = self._check_patients(self.dataset)
        self.data, self.targets = self._dataloader(self.dataset, init_size=init_size, resize_num=resize_num)
    
    def __len__(self):
        return self.N

    def __getitem__(self, idx):
        out_data = self.data[idx]
        out_label =  self.targets[idx]

        return out_data, out_label
        
    def _check_patients(self, dataset_name):
        # Count the number of data contained in the dataset
        patients_0 = glob.glob(dataset_name + "0/*")
        patients_1 = glob.glob(dataset_name + "1/*")
        n = len(patients_0) + len(patients_1)
        return patients_0, patients_1, n
        
    def _dataloader(self, dataset_name, init_size, resize_num):
        w = init_size
        wr = resize_num
        data_0 = np.zeros((len(self.patients_0), 1, wr, wr))
        data_1 = np.zeros((len(self.patients_1), 1, wr, wr))
        targets_0 = np.zeros(len(self.patients_0))
        targets_1 = np.zeros((len(self.patients_1)))
        for i, patient in enumerate(self.patients_0):
            img = Image.open(patient).convert("L").resize((w, w))
            if self.method != -1:
                img = img.resize((wr, wr), self.method)
            data_0[i][0] = img
            targets_0[i] = 0
        for i, patient in enumerate(self.patients_1):
            img = Image.open(patient).convert("L").resize((w, w))
            if self.method != -1:
                img = img.resize((wr, wr), self.method)
            data_1[i][0] = img
            targets_1[i] = 1
        data = np.concatenate([data_0, data_1], axis=0)
        targets = np.concatenate([targets_0, targets_1])
        p = np.random.permutation(data.shape[0])
        data = data[p]
        targets = targets[p]
        #data = torch.from_numpy(data[p])
        #targets = torch.from_numpy(targets[p])
        return data, targets

class NFoldCrossValidation:
    # Inputs are dataset, size before resizing, size after resizing, interpolation method, and number of folds
    # "brain_tumor", "covid", "gender", "hemorrhage"
    def __init__(self, dataset, init_size=28, resize_num=56, interpolation="", n_fold=10, epoch=200, batchsize=10):
        self.dataset = MedicalDataset(dataset, init_size=init_size, resize_num=resize_num, interpolation=interpolation)
        self.resize_num = resize_num
        self.n_fold = n_fold
        self.epochs = epoch
        self.result_list = []
        self.batchsize = batchsize
        self.n_per_fold = int(self.dataset.N / n_fold)
        
    def _train_and_val(self, train_x, train_t, val_x, val_t): 
        
        net = simple_CNN(self.resize_num)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        net = net.to(device)
        
        criterion = nn.CrossEntropyLoss()
        
        optimizer = optim.Adam(net.parameters(), lr=0.0001)
        
        train_loss_list = []
        train_acc_list = []
        
        for epoch in range(self.epochs):
            train_x = np.array(train_x)
            train_t = np.array(train_t)
            p = np.random.permutation(len(train_x))
            train_x = torch.from_numpy(train_x[p])
            train_t = torch.from_numpy(train_t[p])
            train_loss = 0
            train_acc = 0
            
            net.train()
            for i in range(int(len(train_x) / self.batchsize)):
                batch_init = i * self.batchsize
                batch_end = batch_init + self.batchsize
                images = train_x[batch_init:batch_end]
                labels = train_t[batch_init:batch_end]
                images, labels = images.to(device), labels.to(device)
                images = images / 255
                optimizer.zero_grad()
                outputs = net.forward(images.float())
                loss = criterion(outputs, labels.long())
                train_loss += loss.item()
                train_acc += (outputs.max(1)[1] == labels.long()).sum().item()
                loss.backward()
                optimizer.step()
                
            avg_train_loss = train_loss / len(train_x)
            avg_train_acc = train_acc / len(train_x)
            #print("train_loss:" + str(avg_train_loss))
            #print("train_acc:" + str(avg_train_acc))
        
        net.eval()
        with torch.no_grad():
            val_loss = 0
            val_acc = 0
            val_x = torch.from_numpy(val_x)
            val_t = torch.from_numpy(val_t)
            for i in range(int(len(val_x))):
                images = val_x[i:i+1]
                labels = val_t[i:i+1]
                images, labels = images.to(device), labels.to(device)
                images = images / 255
                outputs = net.forward(images.float())
                loss = criterion(outputs, labels.long())
                val_loss += loss.item()
                val_acc += (outputs.max(1)[1] == labels.long()).sum().item()
        avg_test_loss = val_loss / len(val_x)
        avg_test_acc = val_acc / len(val_x)
        print("val_loss:" + str(avg_test_loss))
        print("val_acc:" + str(avg_test_acc))
        
        self.result_list.append(avg_test_acc)
    
    def kfold_cross_validation(self):
        for i in range(self.n_fold):
            print("fold:" + str(i+1))
            fold_init = i * self.n_per_fold
            fold_end = fold_init + self.n_per_fold
            val_x = self.dataset.data.copy()[fold_init:fold_end]
            val_t = self.dataset.targets.copy()[fold_init:fold_end]
            train_x = np.delete(self.dataset.data.copy(), range(fold_init, fold_end), axis=0)
            train_t = np.delete(self.dataset.targets.copy(), range(fold_init, fold_end), axis=0)
            self._train_and_val(train_x, train_t, val_x, val_t)
            
    def result_statistics(self):
        df = pd.Series(self.result_list)
        # Print statistics of result for test data
        print(df.describe())
        
    

class simple_CNN(nn.Module):
    def __init__(self, resize_num):
        super(simple_CNN,self).__init__()
        #Convolutional layer
        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels = 1, out_channels = 16, kernel_size = 5, stride=1, padding=0),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels = 16, out_channels = 32, kernel_size = 5, stride=1, padding=0),
            nn.Conv2d(in_channels = 32, out_channels = 64, kernel_size = 5, stride=1, padding=0),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # Fully connected layer
        if resize_num == 28:
            units = 256
        if resize_num == 56:
            units = 5184
        if resize_num == 112:
            units = 33856
        if resize_num == 224:
            units = 166464
        self.dense = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(units, 128), # The variable 'units' depends on the input size
            nn.ReLU(inplace=True),
            nn.Linear(128, 128),
            nn.Dropout(p=0.5),
            nn.ReLU(inplace=True),
            nn.Linear(128, 2),
        )
         
    # forward function
    def forward(self,x):
         
        out = self.conv_layers(x)
        out = out.view(out.size(0), -1) # Flatten
        out = self.dense(out) # Fully connected
         
        return out
     
    # Function to check the size of output in convolutional layers
    def check_cnn_size(self, size_check):
        out = self.conv_layers(size_check)
         
        return out

In [4]:
# NFoldCrossValidation(dataset, init_size=28, resize_num=56, interpolation="", n_fold=10, epoch=200, batchsize=10)
# Dataset options："brain_tumor", "covid", "gender", "hemorrhage"
# Interpolation options："nearest", "box", "bilinear", "hamming", "bicubic", "lanczos"
cv = NFoldCrossValidation(dataset="gender", init_size=28, resize_num=28, interpolation="nearest", n_fold=10, epoch=100, batchsize=10)
cv.kfold_cross_validation()
cv.result_statistics()

fold:1
val_loss:0.26228101823411026
val_acc:0.9166666666666666
fold:2
val_loss:0.33810894182533957
val_acc:0.9166666666666666
fold:3
val_loss:0.24889325371865803
val_acc:0.9166666666666666
fold:4
val_loss:0.2761439841609293
val_acc:0.875
fold:5
val_loss:0.4857444621108395
val_acc:0.9166666666666666
fold:6
val_loss:0.4499736113551383
val_acc:0.8333333333333334
fold:7
val_loss:0.16517930342282247
val_acc:0.9583333333333334
fold:8
val_loss:0.6446417045190174
val_acc:0.7916666666666666
fold:9
val_loss:0.2706649388904528
val_acc:0.875
fold:10
val_loss:0.5546287509623653
val_acc:0.7916666666666666
count    10.000000
mean      0.879167
std       0.057097
min       0.791667
25%       0.843750
50%       0.895833
75%       0.916667
max       0.958333
dtype: float64
