'''VGG11/13/16/19 in PyTorch, in plain and mask-pruned convolution variants.'''
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pdb
import math
from torch.nn import init

# Layer plans keyed by architecture name.  An integer entry is the number of
# output channels of a 3x3 convolution; "M" stands for a 2x2 max-pool.
# Each row below is one pooling stage.
cfg = {
    "VGG11": [
        64, "M",
        128, "M",
        256, 256, "M",
        512, 512, "M",
        512, 512, "M",
    ],
    "VGG13": [
        64, 64, "M",
        128, 128, "M",
        256, 256, "M",
        512, 512, "M",
        512, 512, "M",
    ],
    "VGG16": [
        64, 64, "M",
        128, 128, "M",
        256, 256, 256, "M",
        512, 512, 512, "M",
        512, 512, 512, "M",
    ],
    # NOTE: this is a narrowed VGG19.  The full-width plan it replaces was
    #   [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M',
    #    512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M']
    "VGG19": [
        16, 16, "M",
        32, 32, "M",
        64, 64, 64, 64, "M",
        128, 128, 128, 128, "M",
        128, 128, 128, 128, "M",
    ],
}
# cfg_mask = {
#     'vgg16': [64, 64, 128, 128, 256, 256, 256, 512, 512, 512, 512, 512, 512]
# }


class GetMask(autograd.Function):
    """Hard threshold with a pass-through gradient.

    The forward pass turns ``mask`` into a 0/1 float tensor (1 where the
    entry is strictly greater than the threshold).  The backward pass hands
    the incoming gradient to ``mask`` unchanged and gives the threshold no
    gradient.
    """

    @staticmethod
    def forward(ctx, mask, epislon):
        # Boolean comparison first, then cast so the result can scale weights.
        keep = mask > epislon
        return keep.to(torch.float32)

    @staticmethod
    def backward(ctx, g):
        # One entry per forward input: (mask, epislon).
        grad_for_mask = g
        grad_for_threshold = None
        return grad_for_mask, grad_for_threshold
    
class VGG(nn.Module):
    """VGG-style classifier built from mask-pruned ``PretrainConv`` layers.

    Args:
        vgg_name: Key into the module-level ``cfg`` table selecting the
            layer plan.
        layerwise: Width multipliers, one per convolution in plan order.
            Entry ``i`` scales the ``i``-th convolution's channel count
            (the product is truncated with ``int``).  It needs at least as
            many entries as the plan has convolutions; extra entries are
            ignored.
        num_classes: Number of outputs of the final linear layer.

    The classifier is sized from the channel count of the last convolution,
    so the flattened feature map must be 1x1 spatially (for example a 32x32
    input, since every plan in ``cfg`` has five 2x2 max-pools).
    """

    def __init__(self, vgg_name, layerwise=(1,) * 17, num_classes=10):
        super(VGG, self).__init__()
        self.features = self._make_layers(cfg[vgg_name], layerwise)

        # Take the classifier width from the layers that were really built
        # instead of a hard-coded 512: the plans in ``cfg`` do not all end
        # at 512 channels, and ``layerwise[-1]`` is not necessarily the
        # multiplier that was applied to the last convolution.
        feature_channels = 3
        for module in self.features:
            if isinstance(module, nn.Conv2d):
                feature_channels = module.out_channels
        self.classifier = nn.Linear(feature_channels, num_classes)

    def forward(self, x):
        """Run the feature extractor, flatten per sample, and classify."""
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg, layer_wise):
        """Translate a layer plan into an ``nn.Sequential``.

        ``'M'`` becomes a 2x2 max-pool; every other entry becomes a
        ``PretrainConv`` (3x3, padding 1) + BatchNorm + ReLU group whose
        width is the entry scaled by the matching ``layer_wise`` value.
        """
        layers = []
        in_channels = 3
        conv_index = 0  # counts convolutions only; pooling entries are skipped
        for entry in cfg:
            if entry == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            out_channels = int(entry * layer_wise[conv_index])
            conv_index += 1
            layers.extend([
                PretrainConv(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ])
            in_channels = out_channels
        # Kernel-1/stride-1 pooling is an identity op; kept so the layer
        # sequence matches the original.
        layers.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*layers)


class PretrainConv(nn.Conv2d):
    """``nn.Conv2d`` whose kernels are gated by a learnable pruning mask.

    One mask value exists per (output channel, input channel) pair.  While
    ``flag`` is True the mask is learned: ``GetMask`` thresholds it at
    ``epsilon`` and the resulting 0/1 mask multiplies the weights.  Once the
    fraction of zeroed entries exceeds ``max_prune_rate`` the layer prints a
    notice, stops the mask from receiving gradients, stores the current
    binary mask in ``fixed_mask`` and uses that from then on.

    Constructor arguments are forwarded unchanged to ``nn.Conv2d``.
    """

    # Fraction of pruned kernels above which the mask is frozen.  Kept on the
    # class so instances created before this attribute existed still find it.
    max_prune_rate = 0.4

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        mask_shape = (self.weight.shape[0], self.weight.shape[1])
        self.mask = nn.Parameter(torch.ones(mask_shape))
        # A buffer follows the module across .to()/.cuda()/.half(), so the
        # layer no longer needs CUDA just to be constructed.  It is
        # non-persistent so state_dict keys stay exactly as they were.
        self.register_buffer('fixed_mask', torch.ones(mask_shape), persistent=False)
        self.epsilon = 0.1
        self.flag = True  # True while the mask is still being learned

    def forward(self, x):
        """Convolve ``x`` with the weights multiplied by the active mask."""
        if self.flag:
            mask = GetMask.apply(self.mask, self.epsilon)
            # Only evaluated while learning: turning a tensor into a Python
            # bool forces a host sync, which the frozen path never needs.
            # The explicit float cast avoids integer tensor division.
            prune_rate = (mask == 0).float().mean()
            if prune_rate > self.max_prune_rate:
                print('layer {} prune_rate {} stop learning'.format(self.weight.shape, prune_rate))
                self.mask.requires_grad_(False)
                self.fixed_mask *= (self.mask > self.epsilon).float()
                self.flag = False
        else:
            mask = self.fixed_mask

        # Broadcast the (out, in) mask over the two kernel dimensions.
        sparseWeight = self.weight * mask.unsqueeze(-1).unsqueeze(-1)
        x = F.conv2d(
            x, sparseWeight, self.bias, self.stride, self.padding, self.dilation, self.groups
        )

        return x


class VGG_vanilla(nn.Module):
    """VGG-style classifier built from ordinary ``nn.Conv2d`` layers.

    Args:
        vgg_name: Key into the module-level ``cfg`` table selecting the
            layer plan.
        layerwise: Width multipliers, one per convolution in plan order.
            Entry ``i`` scales the ``i``-th convolution's channel count
            (the product is truncated with ``int``).  It needs at least as
            many entries as the plan has convolutions; extra entries are
            ignored.
        num_classes: Number of outputs of the final linear layer.

    The classifier is sized from the channel count of the last convolution,
    so the flattened feature map must be 1x1 spatially (for example a 32x32
    input, since every plan in ``cfg`` has five 2x2 max-pools).
    """

    def __init__(self, vgg_name, layerwise=(1,) * 17, num_classes=10):
        super(VGG_vanilla, self).__init__()
        self.features = self._make_layers(cfg[vgg_name], layerwise)

        # Take the classifier width from the layers that were really built
        # instead of a hard-coded 128: only the narrowed VGG19 plan ends at
        # 128 channels, and ``layerwise[-1]`` is not necessarily the
        # multiplier that was applied to the last convolution.
        feature_channels = 3
        for module in self.features:
            if isinstance(module, nn.Conv2d):
                feature_channels = module.out_channels
        self.classifier = nn.Linear(feature_channels, num_classes)

    def forward(self, x):
        """Run the feature extractor, flatten per sample, and classify."""
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg, layer_wise):
        """Translate a layer plan into an ``nn.Sequential``.

        ``'M'`` becomes a 2x2 max-pool; every other entry becomes a
        Conv2d (3x3, padding 1) + BatchNorm + ReLU group whose width is the
        entry scaled by the matching ``layer_wise`` value.
        """
        layers = []
        in_channels = 3
        conv_index = 0  # counts convolutions only; pooling entries are skipped
        for entry in cfg:
            if entry == 'M':
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
                continue
            out_channels = int(entry * layer_wise[conv_index])
            conv_index += 1
            layers.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ])
            in_channels = out_channels
        # Kernel-1/stride-1 pooling is an identity op; kept so the layer
        # sequence matches the original.
        layers.append(nn.AvgPool2d(kernel_size=1, stride=1))
        return nn.Sequential(*layers)
    

def test():
    """Benchmark ``VGG_vanilla('VGG19')`` and report its cost.

    Times 100 forward passes on a random batch of 256 32x32 RGB images,
    printing each duration in seconds, then prints the FLOP and parameter
    counts reported by ``thop`` and finally the output size.  Runs on CUDA
    when available, otherwise on CPU.
    """
    import time

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = VGG_vanilla('VGG19').to(device)
    x = torch.randn(256, 3, 32, 32, device=device)

    def _wait_for_device():
        # CUDA kernels are launched asynchronously; without this the timer
        # would measure launch overhead rather than the forward pass.
        if device.type == 'cuda':
            torch.cuda.synchronize()

    for _ in range(100):
        _wait_for_device()
        start_time = time.perf_counter()
        y = net(x)
        _wait_for_device()
        end_time = time.perf_counter()
        print(end_time - start_time)

    # thop is an optional third-party profiler, so it is imported only here.
    from thop import profile, clever_format
    flops, params = profile(net, inputs=(x,))
    flops, params = clever_format([flops, params], "%.3f")
    print("FLOPs: %s" % (flops))
    print("params: %s" % (params))

    print(y.size())

# Run the benchmark only when executed as a script, so that importing this
# module for its model classes has no side effects.
if __name__ == "__main__":
    test()
