feat(hw3): Copy file from hw2

2024-05-16 17:41:27 +08:00
parent b741c9d08e
commit 81de7b1d58
1 changed files with 271 additions and 0 deletions
--- a/hw3/code/networks.py
+++ b/hw3/code/networks.py
@@ -0,0 +1,271 @@
+# ========================================================
+#             Media and Cognition
+#             Homework 2 Convolutional Neural Network
+#             networks.py - Network definition
+#             Student ID: 2022010639
+#             Name: Gao Yixuan
+#             Tsinghua University
+#             (C) Copyright 2024
+# ========================================================
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ConvBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        use_batch_norm=False,
+        use_residual=False,
+    ):
+        """
+        Convolutional block with batch normalization and ReLU activation
+        ----------------------
+        :param in_channels: channel number of input image
+        :param out_channels: channel number of output image
+        :param kernel_size: size of convolutional kernel
+        :param stride: stride of convolutional operation
+        :param padding: padding of convolutional operation
+        :param use_batch_norm: whether to use batch normalization in convolutional layers
+        :param use_residual: whether to use residual connection
+        """
+        super().__init__()
+
+        if use_batch_norm:
+            bn2d = nn.BatchNorm2d
+        else:
+            # use identity function to replace batch normalization
+            bn2d = nn.Identity
+
+        self.use_residual = use_residual
+
+        # >>> TODO 2.1: complete a convolutional block with batch normalization and ReLU activation
+        # Hint: use the `bn2d` defined above for batch normalization to adapt to the input parameter `use_batch_norm`
+        # Network structure:
+        # conv -> batchnorm -> relu
+        self.conv = nn.Conv2d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+        self.bn = bn2d(out_channels)
+        self.relu = nn.ReLU()
+        # <<< TODO 2.1
+
+    def forward(self, x):
+        # >>> TODO 2.2: forward process
+        # Hint: apply residual connection if `self.use_residual` is True
+        fx = self.relu(self.bn(self.conv(x)))
+        # out = self.relu(self.bn(self.conv(x)))
+        if self.use_residual:
+            out = fx + x
+        else:
+            out = fx
+
+        # <<< TODO 2.2
+        return out
+
+
+class Classifier(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        num_classes,
+        use_batch_norm=False,
+        use_stn=False,
+        dropout_prob=0,
+    ):
+        """
+        Convolutional Neural Networks
+        ----------------------
+        :param in_channels: channel number of input image
+        :param num_classes: number of classes for the classification task
+        :param use_batch_norm: whether to use batch normalization in convolutional layers and linear layers
+        :param use_stn: whether to use spatial transformer network
+        :param dropout_prob: dropout ratio of dropout layer which ranges from 0 to 1
+        """
+        super().__init__()
+
+        if use_batch_norm:
+            bn1d = nn.BatchNorm1d
+        else:
+            # use identity function to replace batch normalization
+            bn1d = nn.Identity
+
+        if use_stn:
+            self.stn = STN(in_channels)
+        else:
+            # use identity function to replace spatial transformer network
+            self.stn = nn.Identity(in_channels)
+
+        # >>> TODO 3.1: complete a multilayer convolutional neural network with nn.Sequential function.
+        # input image with size [batch_size, in_channels, img_h, img_w]
+        # Network structure:
+        #            kernel_size  stride  padding  out_channels  use_residual
+        # ConvBlock       5          1        2          32         False
+        # ConvBlock       5          2        2          64         False
+        # maxpool         2          2        0
+        # ConvBlock       3          1        1          64         True
+        # ConvBlock       3          1        1          128        False
+        # maxpool         2          2        0
+        # ConvBlock       3          1        1          128        True
+        # dropout(p), where p is input parameter of dropout ratio
+
+        self.conv_net = nn.Sequential(
+            ConvBlock(
+                in_channels=in_channels,
+                out_channels=32,
+                kernel_size=5,
+                stride=1,
+                padding=2,
+            ),
+            ConvBlock(
+                in_channels=32, out_channels=64, kernel_size=5, stride=2, padding=2
+            ),
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
+            ConvBlock(
+                in_channels=64,
+                out_channels=64,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                use_residual=True,
+            ),
+            ConvBlock(
+                in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1
+            ),
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
+            ConvBlock(
+                in_channels=128,
+                out_channels=128,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                use_residual=True,
+            ),
+            nn.Dropout2d(p=dropout_prob),
+        )
+        # <<< TODO 3.1
+
+        # >>> TODO 3.2: complete a sub-network with two linear layers by using nn.Sequential function
+        # Hint:
+        #   (1) Note that the size of input images is (3, 32, 32) by default, what is the size of
+        #       the output of the convolution layers?
+        #   (2) Use the `bn1d` defined above for batch normalization to adapt to the input parameter `use_batch_norm`
+        # Network structure:
+        #            out_channels
+        # linear          256
+        # activation
+        # batchnorm
+        # dropout(p), where p is input parameter of dropout ratio
+        # linear       num_classes
+        self.fc_net = nn.Sequential(
+            nn.Linear(2048, 256),
+            nn.ReLU(),
+            bn1d(256),
+            nn.Dropout1d(dropout_prob),
+            nn.Linear(256, num_classes),
+        )
+        # <<< TODO 3.2
+
+    def forward(self, x):
+        """
+        Define the forward function
+        :param x: input features with size [batch_size, in_channels, img_h, img_w]
+        :return: output features with size [batch_size, num_classes]
+        """
+        # Step 1: apply spatial transformer network if applicable
+        x = self.stn(x)
+
+        # >>> TODO 3.3: forward process
+        # Step 2: forward process for the convolutional network
+        x = self.conv_net(x)
+
+        # Step 3: use `Tensor.view()` to flatten the tensor to match the size of the input of the
+        # fully connected layers.
+        x = x.view(x.shape[0], -1)
+
+        # Step 4: forward process for the fully connected network
+        out = self.fc_net(x)
+        # <<< TODO 3.3
+
+        return out
+
+
+class STN(nn.Module):
+    def __init__(self, in_channels):
+        """
+        The spatial transformer network (STN) learns how to perform spatial transformations on the
+        input image in order to enhance the geometric invariance of the model. For example, it can
+        crop a region of interest, scale and correct the orientation of an image. It can be a useful
+        mechanism because CNNs are not invariant to rotation and scale and more general affine
+        transformations.
+
+        The spatial transformer network boils down to three main components:
+
+        - The localization network is a regular CNN which regresses the transformation parameters.
+          The transformation is never learned explicitly from this dataset, instead the network
+          learns automatically the spatial transformations that enhances the global accuracy.
+        - The grid generator generates a grid of coordinates in the input image corresponding
+          to each pixel from the output image.
+        - The sampler uses the parameters of the transformation and applies it to the input image.
+
+        Here, we are going to implement an STN that performs affine transformations on the input images.
+        For more information, please refer to the slides and
+        https://pytorch.org/tutorials/intermediate/spatial_transformer_tutorial.html .
+
+        ----------------------
+        :param in_channels: channel number of input image
+        """
+        super().__init__()
+
+        # >>> TODO 4.1: Build your localization net
+        # Step 1: Build a convolutional network to extract features from input images.
+        # Hint: Combine convolutional layers, batch normalization layers and ReLU activation functions to build
+        # this network.
+        # Suggested structure: 3 down-sampling convolutional layers with doubling output channels, using BN and ReLU.
+        self.localization_conv = nn.Sequential(
+            ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=9, stride=2, padding=4, use_batch_norm=True),
+            # 8 * 13 * 13
+            ConvBlock(in_channels=8, out_channels=16, kernel_size=5, stride=2, padding=2, use_batch_norm=True),
+            ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
+            # 32 * 4 * 4
+        )
+
+        # Step 2: Build a fully connected network to predict the parameters of affine transformation from
+        # the extracted features.
+        # Hint: Combine linear layers and ReLU activation functions to build this network.
+        # Suggested structure: 2 linear layers with one BN and ReLU.
+        self.localization_fc = nn.Sequential(
+            nn.Linear(32 * 4 * 4, 256),
+            nn.ReLU(),
+            nn.BatchNorm1d(256),
+            nn.Linear(256, 6)
+        )
+        # <<< TODO 4.1
+
+        # >>> TODO 4.2: Initialize the weight/bias of the last linear layer of the fully connected network
+        # Hint: The STN should generate the identity transformation by default before training.
+        # How to initialize the weight/bias of the last linear layer of the fully connected network to
+        # achieve this goal?
+        nn.init.zeros_(self.localization_fc[3].weight)
+        # <<< TODO 4.2
+
+    def forward(self, x):
+        # Extract the features from input images and flatten them
+        features = self.localization_conv(x)
+        features = features.view(features.shape[0], -1)
+
+        # Predict the parameters of affine transformation from the extracted features
+        theta = self.localization_fc(features)
+        theta = theta.view(-1, 2, 3)
+
+        # Apply affine transformation to input images
+        grid = F.affine_grid(theta, x.shape, align_corners=False)
+        x = F.grid_sample(x, grid, align_corners=False)
+
+        return x