# ========================================================
#             Media and Cognition
#             Homework 2 Convolutional Neural Network
#             networks.py - Network definition
#             Student ID: 2022010639
#             Name: Gao Yixuan
#             Tsinghua University
#             (C) Copyright 2024
# ========================================================
|
|
|
|
import torch
|
|
import torch.nn as nn
|
|
import torch.nn.functional as F
|
|
|
|
|
|
class ConvBlock(nn.Module):
    """Single convolution stage: Conv2d -> normalization -> ReLU, plus an optional skip add."""

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        use_batch_norm=False,
        use_residual=False,
    ):
        """
        Build the convolution, normalization and activation layers of the block.
        ----------------------
        :param in_channels: channel number of the input feature map
        :param out_channels: channel number of the output feature map
        :param kernel_size: size of the convolutional kernel
        :param stride: stride of the convolution
        :param padding: padding of the convolution
        :param use_batch_norm: if True, normalize with BatchNorm2d; otherwise the
            normalization slot is an identity layer
        :param use_residual: if True, `forward` adds the block input to the activated output
        """
        super().__init__()

        # Remembered for `forward`, which decides whether to add the skip path.
        self.use_residual = use_residual

        # nn.Identity accepts (and ignores) constructor arguments, so either class
        # can be instantiated with the same `out_channels` argument below.
        norm_layer = nn.BatchNorm2d if use_batch_norm else nn.Identity

        # Layer order: conv -> batchnorm (or identity) -> relu
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
        )
        self.bn = norm_layer(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Run the block on `x`.

        :param x: input tensor fed to the convolution
        :return: relu(bn(conv(x))), with `x` added on top when `use_residual` is set
        """
        activated = self.relu(self.bn(self.conv(x)))

        if not self.use_residual:
            return activated

        # Skip connection: the addition requires `x` to be shape-compatible with
        # the activated output.
        return activated + x
|
|
|
|
|
|
class Classifier(nn.Module):
    """Image classifier: optional STN -> convolutional stack -> two-layer linear head."""

    def __init__(
        self,
        in_channels,
        num_classes,
        use_batch_norm=False,
        use_stn=False,
        dropout_prob=0,
    ):
        """
        Convolutional Neural Networks
        ----------------------
        :param in_channels: channel number of input image
        :param num_classes: number of classes for the classification task
        :param use_batch_norm: whether to use batch normalization in convolutional layers and linear layers
        :param use_stn: whether to use spatial transformer network
        :param dropout_prob: dropout ratio of dropout layer which ranges from 0 to 1
        """
        super().__init__()

        if use_batch_norm:
            bn1d = nn.BatchNorm1d
        else:
            # use identity function to replace batch normalization
            bn1d = nn.Identity

        if use_stn:
            self.stn = STN(in_channels)
        else:
            # use identity function to replace spatial transformer network
            self.stn = nn.Identity()

        # Convolutional feature extractor.
        # input image with size [batch_size, in_channels, img_h, img_w]
        # Network structure:
        #            kernel_size  stride  padding  out_channels  use_residual
        # ConvBlock       5          1        2          32          False
        # ConvBlock       5          2        2          64          False
        # maxpool         2          2        0
        # ConvBlock       3          1        1          64          True
        # ConvBlock       3          1        1          128         False
        # maxpool         2          2        0
        # ConvBlock       3          1        1          128         True
        # dropout(p), where p is input parameter of dropout ratio
        #
        # `use_batch_norm` is forwarded to every ConvBlock so that the flag really
        # controls batch normalization in the convolutional layers, as documented.
        self.conv_net = nn.Sequential(
            ConvBlock(
                in_channels=in_channels,
                out_channels=32,
                kernel_size=5,
                stride=1,
                padding=2,
                use_batch_norm=use_batch_norm,
            ),
            ConvBlock(
                in_channels=32,
                out_channels=64,
                kernel_size=5,
                stride=2,
                padding=2,
                use_batch_norm=use_batch_norm,
            ),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ConvBlock(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=1,
                use_batch_norm=use_batch_norm,
                use_residual=True,
            ),
            ConvBlock(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=1,
                use_batch_norm=use_batch_norm,
            ),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ConvBlock(
                in_channels=128,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=1,
                use_batch_norm=use_batch_norm,
                use_residual=True,
            ),
            nn.Dropout2d(p=dropout_prob),
        )

        # Fully connected head.
        # Network structure:
        #            out_channels
        # linear          256
        # activation
        # batchnorm
        # dropout(p), where p is input parameter of dropout ratio
        # linear       num_classes
        #
        # For the default 32x32 input the spatial size evolves as
        # 32 -> 32 (conv, stride 1) -> 16 (conv, stride 2) -> 8 (pool) -> 8 -> 8 -> 4 (pool) -> 4,
        # so the flattened feature vector has 128 * 4 * 4 = 2048 entries.
        # Other input resolutions will not match the first linear layer.
        flat_features = 128 * 4 * 4
        self.fc_net = nn.Sequential(
            nn.Linear(flat_features, 256),
            nn.ReLU(),
            bn1d(256),
            # Element-wise dropout. nn.Dropout1d must not be used here: it reads a
            # 2-D [batch_size, features] input as an unbatched (C, L) tensor and
            # would zero complete rows, i.e. wipe out whole samples.
            nn.Dropout(dropout_prob),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """
        Define the forward function
        :param x: input features with size [batch_size, in_channels, img_h, img_w]
        :return: output features with size [batch_size, num_classes]
        """
        # Step 1: apply spatial transformer network if applicable
        x = self.stn(x)

        # Step 2: forward process for the convolutional network
        x = self.conv_net(x)

        # Step 3: flatten everything except the batch dimension to match the
        # input size of the fully connected layers.
        x = x.view(x.shape[0], -1)

        # Step 4: forward process for the fully connected network
        out = self.fc_net(x)

        return out
|
|
|
|
|
|
class STN(nn.Module):
    """Spatial transformer that predicts one affine transform per image and resamples the input with it."""

    def __init__(self, in_channels):
        """
        The spatial transformer network (STN) learns how to perform spatial transformations on the
        input image in order to enhance the geometric invariance of the model. For example, it can
        crop a region of interest, scale and correct the orientation of an image. It can be a useful
        mechanism because CNNs are not invariant to rotation and scale and more general affine
        transformations.

        The spatial transformer network boils down to three main components:

        - The localization network is a regular CNN which regresses the transformation parameters.
          The transformation is never learned explicitly from this dataset, instead the network
          learns automatically the spatial transformations that enhance the global accuracy.
        - The grid generator generates a grid of coordinates in the input image corresponding
          to each pixel from the output image.
        - The sampler uses the parameters of the transformation and applies it to the input image.

        Here, we implement an STN that performs affine transformations on the input images.
        For more information, please refer to the slides and
        https://pytorch.org/tutorials/intermediate/spatial_transformer_tutorial.html .

        ----------------------
        :param in_channels: channel number of input image
        """
        super().__init__()

        # Localization net, part 1: three stride-2 convolutions with doubling
        # channel counts, each followed by BN and ReLU.
        # The shapes below are for a 32x32 input image.
        self.localization_conv = nn.Sequential(
            ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=9, stride=2, padding=4, use_batch_norm=True),
            # 8 x 16 x 16
            ConvBlock(in_channels=8, out_channels=16, kernel_size=5, stride=2, padding=2, use_batch_norm=True),
            # 16 x 8 x 8
            ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
            # 32 x 4 x 4
        )

        # Localization net, part 2: two linear layers that regress the six
        # parameters of a 2x3 affine matrix from the flattened features.
        # The first layer is sized for the 32 x 4 x 4 feature map above.
        self.localization_fc = nn.Sequential(
            nn.Linear(32 * 4 * 4, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 6),
        )

        # Make the STN output the identity transformation before training:
        # with zero weights the prediction equals the bias, so the bias must be
        # the flattened identity matrix [[1, 0, 0], [0, 1, 0]]. Zeroing the
        # weights alone would leave a random bias and hence a random warp.
        last_linear = self.localization_fc[-1]
        nn.init.zeros_(last_linear.weight)
        with torch.no_grad():
            last_linear.bias.copy_(torch.tensor([1.0, 0.0, 0.0, 0.0, 1.0, 0.0]))

    def forward(self, x):
        """
        Warp `x` with the affine transform predicted from `x` itself.

        :param x: input images with size [batch_size, in_channels, img_h, img_w]
        :return: resampled images with the same size as `x`
        """
        # Extract the features from input images and flatten them
        features = self.localization_conv(x)
        features = features.view(features.shape[0], -1)

        # Predict the parameters of affine transformation from the extracted features
        theta = self.localization_fc(features)
        theta = theta.view(-1, 2, 3)

        # Apply affine transformation to input images; the sampling grid has the
        # same spatial size as the input, so the output shape equals the input shape.
        grid = F.affine_grid(theta, x.shape, align_corners=False)
        x = F.grid_sample(x, grid, align_corners=False)

        return x
|