# ========================================================
#             Media and Cognition
#             Homework 2 Convolutional Neural Network
#             networks.py - Network definition
#             Student ID: 2022010639
#             Name: Gao Yixuan
#             Tsinghua University
#             (C) Copyright 2024
# ========================================================

import torch
import torch.nn as nn
import torch.nn.functional as F


class ConvBlock(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        use_batch_norm=False,
        use_residual=False,
    ):
        """
        Basic building block: conv -> (optional) batchnorm -> ReLU, with an
        optional skip connection that adds the block input to its output.
        ----------------------
        :param in_channels: channel number of the input feature map
        :param out_channels: channel number of the output feature map
        :param kernel_size: size of the convolutional kernel
        :param stride: stride of the convolution
        :param padding: zero-padding applied by the convolution
        :param use_batch_norm: if True, normalize the convolution output with
            nn.BatchNorm2d; otherwise nn.Identity is used in its place
        :param use_residual: if True, `forward` returns `relu(bn(conv(x))) + x`
        """
        super().__init__()

        self.use_residual = use_residual

        # nn.Identity accepts (and ignores) constructor arguments, so both
        # choices can be instantiated with the same call below.
        norm_layer = nn.BatchNorm2d if use_batch_norm else nn.Identity

        # Network structure: conv -> batchnorm -> relu
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
        )
        self.bn = norm_layer(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Run the block on `x`.
        :param x: input tensor fed to the convolution
        :return: activated features, plus `x` itself when `use_residual` is set
        """
        activated = self.relu(self.bn(self.conv(x)))
        # The skip connection adds the untouched input to the activated
        # features, so the two tensors must be shape-compatible.
        return activated + x if self.use_residual else activated


class Classifier(nn.Module):
    def __init__(
        self,
        in_channels,
        num_classes,
        use_batch_norm=False,
        use_stn=False,
        dropout_prob=0,
    ):
        """
        Convolutional Neural Networks
        ----------------------
        :param in_channels: channel number of input image
        :param num_classes: number of classes for the classification task
        :param use_batch_norm: whether to use batch normalization in convolutional layers and linear layers
        :param use_stn: whether to use spatial transformer network
        :param dropout_prob: dropout ratio of dropout layer which ranges from 0 to 1
        """
        super().__init__()

        if use_batch_norm:
            bn1d = nn.BatchNorm1d
        else:
            # use identity function to replace batch normalization
            bn1d = nn.Identity

        if use_stn:
            self.stn = STN(in_channels)
        else:
            # use identity function to replace spatial transformer network
            self.stn = nn.Identity()

        # >>> TODO 3.1: complete a multilayer convolutional neural network with nn.Sequential function.
        # input image with size [batch_size, in_channels, img_h, img_w]
        # Network structure:
        #            kernel_size  stride  padding  out_channels  use_residual
        # ConvBlock       5          1        2          32         False
        # ConvBlock       5          2        2          64         False
        # maxpool         2          2        0
        # ConvBlock       3          1        1          64         True
        # ConvBlock       3          1        1          128        False
        # maxpool         2          2        0
        # ConvBlock       3          1        1          128        True
        # dropout(p), where p is input parameter of dropout ratio
        #
        # `use_batch_norm` is forwarded to every ConvBlock so that the flag
        # really controls the convolutional layers, as documented above.
        self.conv_net = nn.Sequential(
            ConvBlock(
                in_channels=in_channels,
                out_channels=32,
                kernel_size=5,
                stride=1,
                padding=2,
                use_batch_norm=use_batch_norm,
            ),
            ConvBlock(
                in_channels=32,
                out_channels=64,
                kernel_size=5,
                stride=2,
                padding=2,
                use_batch_norm=use_batch_norm,
            ),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ConvBlock(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=1,
                use_batch_norm=use_batch_norm,
                use_residual=True,
            ),
            ConvBlock(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=1,
                use_batch_norm=use_batch_norm,
            ),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ConvBlock(
                in_channels=128,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=1,
                use_batch_norm=use_batch_norm,
                use_residual=True,
            ),
            # Channel-wise dropout on the 4-D feature map.
            nn.Dropout2d(p=dropout_prob),
        )
        # <<< TODO 3.1

        # >>> TODO 3.2: complete a sub-network with two linear layers by using nn.Sequential function
        # Hint:
        #   (1) Note that the size of input images is (3, 32, 32) by default, what is the size of
        #       the output of the convolution layers?
        #   (2) Use the `bn1d` defined above for batch normalization to adapt to the input parameter `use_batch_norm`
        # Network structure:
        #            out_channels
        # linear          256
        # activation
        # batchnorm
        # dropout(p), where p is input parameter of dropout ratio
        # linear       num_classes
        #
        # For a 32x32 input the spatial size goes 32 -> 32 -> 16 (stride-2 conv)
        # -> 8 (maxpool) -> 8 -> 8 -> 4 (maxpool) -> 4, with 128 channels at the
        # end, hence 128 * 4 * 4 = 2048 flattened features.
        self.fc_net = nn.Sequential(
            nn.Linear(2048, 256),
            nn.ReLU(),
            bn1d(256),
            # Element-wise dropout: the input here is 2-D (batch, features).
            # nn.Dropout1d would interpret such a tensor as an unbatched
            # (channels, length) signal and zero out entire rows, i.e. all
            # features of a sample at once.
            nn.Dropout(p=dropout_prob),
            nn.Linear(256, num_classes),
        )
        # <<< TODO 3.2

    def forward(self, x):
        """
        Define the forward function
        :param x: input features with size [batch_size, in_channels, img_h, img_w]
        :return: output features with size [batch_size, num_classes]
        """
        # Step 1: apply spatial transformer network if applicable
        x = self.stn(x)

        # >>> TODO 3.3: forward process
        # Step 2: forward process for the convolutional network
        x = self.conv_net(x)

        # Step 3: use `Tensor.view()` to flatten the tensor to match the size of the input of the
        # fully connected layers (keeps the batch dimension, merges the rest).
        x = x.view(x.shape[0], -1)

        # Step 4: forward process for the fully connected network
        out = self.fc_net(x)
        # <<< TODO 3.3

        return out


class STN(nn.Module):
    def __init__(self, in_channels):
        """
        The spatial transformer network (STN) learns how to perform spatial transformations on the
        input image in order to enhance the geometric invariance of the model. For example, it can
        crop a region of interest, scale and correct the orientation of an image. It can be a useful
        mechanism because CNNs are not invariant to rotation and scale and more general affine
        transformations.

        The spatial transformer network boils down to three main components:

        - The localization network is a regular CNN which regresses the transformation parameters.
          The transformation is never learned explicitly from this dataset, instead the network
          learns automatically the spatial transformations that enhances the global accuracy.
        - The grid generator generates a grid of coordinates in the input image corresponding
          to each pixel from the output image.
        - The sampler uses the parameters of the transformation and applies it to the input image.

        Here, we are going to implement an STN that performs affine transformations on the input images.
        For more information, please refer to the slides and
        https://pytorch.org/tutorials/intermediate/spatial_transformer_tutorial.html .

        Note: the fully connected head expects 32 * 4 * 4 flattened features, which is what the
        three stride-2 convolutions below produce for 32x32 input images.

        ----------------------
        :param in_channels: channel number of input image
        """
        super().__init__()

        # >>> TODO 4.1: Build your localization net
        # Step 1: Build a convolutional network to extract features from input images.
        # Hint: Combine convolutional layers, batch normalization layers and ReLU activation functions to build
        # this network.
        # Suggested structure: 3 down-sampling convolutional layers with doubling output channels, using BN and ReLU.
        # Shape comments below assume a 32x32 input.
        self.localization_conv = nn.Sequential(
            ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=9, stride=2, padding=4, use_batch_norm=True),
            # 8 * 16 * 16
            ConvBlock(in_channels=8, out_channels=16, kernel_size=5, stride=2, padding=2, use_batch_norm=True),
            # 16 * 8 * 8
            ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
            # 32 * 4 * 4
        )

        # Step 2: Build a fully connected network to predict the parameters of affine transformation from
        # the extracted features.
        # Hint: Combine linear layers and ReLU activation functions to build this network.
        # Suggested structure: 2 linear layers with one BN and ReLU.
        self.localization_fc = nn.Sequential(
            nn.Linear(32 * 4 * 4, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 6)
        )
        # <<< TODO 4.1

        # >>> TODO 4.2: Initialize the weight/bias of the last linear layer of the fully connected network
        # Hint: The STN should generate the identity transformation by default before training.
        # How to initialize the weight/bias of the last linear layer of the fully connected network to
        # achieve this goal?
        # With zero weights the predicted theta equals the bias for every input,
        # so the bias must hold the flattened 2x3 identity affine matrix
        # [[1, 0, 0], [0, 1, 0]]. Leaving the bias at its random default would
        # make the initial transform a near-zero matrix instead of identity.
        last_fc = self.localization_fc[3]
        nn.init.zeros_(last_fc.weight)
        with torch.no_grad():
            last_fc.bias.copy_(torch.tensor([1.0, 0.0, 0.0, 0.0, 1.0, 0.0]))
        # <<< TODO 4.2

    def forward(self, x):
        """
        Warp the input images with the predicted affine transformation.
        :param x: input images with size [batch_size, in_channels, img_h, img_w]
        :return: transformed images with the same size as `x`
        """
        # Extract the features from input images and flatten them
        features = self.localization_conv(x)
        features = features.view(features.shape[0], -1)

        # Predict the parameters of affine transformation from the extracted features
        theta = self.localization_fc(features)
        theta = theta.view(-1, 2, 3)

        # Apply affine transformation to input images
        grid = F.affine_grid(theta, x.shape, align_corners=False)
        x = F.grid_sample(x, grid, align_corners=False)

        return x