feat(hw3): Copy file from hw2
This commit is contained in:
271
hw3/code/networks.py
Normal file
271
hw3/code/networks.py
Normal file
@@ -0,0 +1,271 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 2 Convolutional Neural Network
|
||||
# networks.py - Network definition
|
||||
# Student ID: 2022010639
|
||||
# Name: Gao Yixuan
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class ConvBlock(nn.Module):
    """Single convolution unit: conv -> (optional) batch norm -> ReLU, with an optional skip connection."""

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        use_batch_norm=False,
        use_residual=False,
    ):
        """
        Build the layers of one convolutional block
        ----------------------
        :param in_channels: channel number of the input feature map
        :param out_channels: channel number of the output feature map
        :param kernel_size: size of the convolutional kernel
        :param stride: stride of the convolution
        :param padding: padding of the convolution
        :param use_batch_norm: if True, normalize the convolution output with BatchNorm2d;
            otherwise the normalization slot is a pass-through nn.Identity
        :param use_residual: if True, `forward` adds the block input to the activated output
        """
        super().__init__()

        self.use_residual = use_residual

        # nn.Identity ignores its constructor arguments, so both candidates can be
        # instantiated with the same call below.
        norm_layer = nn.BatchNorm2d if use_batch_norm else nn.Identity

        # Network structure: conv -> batchnorm -> relu
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
        )
        self.bn = norm_layer(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        Run the block on a batch of feature maps
        :param x: input tensor fed to the convolution
        :return: relu(bn(conv(x))), plus `x` itself when `use_residual` is set
        """
        activated = self.conv(x)
        activated = self.bn(activated)
        activated = self.relu(activated)

        if not self.use_residual:
            return activated

        # The skip connection is added after the activation.
        return activated + x
|
||||
|
||||
|
||||
class Classifier(nn.Module):
    """Image classifier: optional STN front-end, five ConvBlocks, and a two-layer fully connected head."""

    def __init__(
        self,
        in_channels,
        num_classes,
        use_batch_norm=False,
        use_stn=False,
        dropout_prob=0,
    ):
        """
        Convolutional Neural Networks
        ----------------------
        :param in_channels: channel number of input image
        :param num_classes: number of classes for the classification task
        :param use_batch_norm: whether to use batch normalization in convolutional layers and linear layers
        :param use_stn: whether to use spatial transformer network
        :param dropout_prob: dropout ratio of dropout layer which ranges from 0 to 1
        """
        super().__init__()

        if use_batch_norm:
            bn1d = nn.BatchNorm1d
        else:
            # use identity function to replace batch normalization
            bn1d = nn.Identity

        if use_stn:
            self.stn = STN(in_channels)
        else:
            # use identity function to replace spatial transformer network
            self.stn = nn.Identity()

        # input image with size [batch_size, in_channels, img_h, img_w]
        # Network structure:
        #              kernel_size  stride  padding  out_channels  use_residual
        #   ConvBlock       5          1        2          32         False
        #   ConvBlock       5          2        2          64         False
        #   maxpool         2          2        0
        #   ConvBlock       3          1        1          64         True
        #   ConvBlock       3          1        1          128        False
        #   maxpool         2          2        0
        #   ConvBlock       3          1        1          128        True
        #   dropout(p), where p is input parameter of dropout ratio
        #
        # `use_batch_norm` is forwarded to every ConvBlock so that the flag really
        # controls the convolutional layers, as the docstring promises.
        self.conv_net = nn.Sequential(
            ConvBlock(
                in_channels=in_channels,
                out_channels=32,
                kernel_size=5,
                stride=1,
                padding=2,
                use_batch_norm=use_batch_norm,
            ),
            ConvBlock(
                in_channels=32,
                out_channels=64,
                kernel_size=5,
                stride=2,
                padding=2,
                use_batch_norm=use_batch_norm,
            ),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ConvBlock(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=1,
                use_batch_norm=use_batch_norm,
                use_residual=True,
            ),
            ConvBlock(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=1,
                use_batch_norm=use_batch_norm,
            ),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ConvBlock(
                in_channels=128,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=1,
                use_batch_norm=use_batch_norm,
                use_residual=True,
            ),
            # channel-wise dropout on the [batch, 128, h, w] feature maps
            nn.Dropout2d(p=dropout_prob),
        )

        # Network structure:
        #   linear      256
        #   activation
        #   batchnorm
        #   dropout(p), where p is input parameter of dropout ratio
        #   linear      num_classes
        #
        # The input width 2048 = 128 * 4 * 4 assumes 32x32 input images: the
        # stride-2 convolution and the two 2x2 max-pools each halve the spatial
        # size (32 -> 16 -> 8 -> 4).
        self.fc_net = nn.Sequential(
            nn.Linear(2048, 256),
            nn.ReLU(),
            bn1d(256),
            # Element-wise dropout. nn.Dropout1d must not be used here: it would
            # interpret the 2-D [batch, 256] input as an unbatched (C, L) tensor
            # and zero out whole rows, i.e. entire samples.
            nn.Dropout(dropout_prob),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """
        Define the forward function
        :param x: input features with size [batch_size, in_channels, img_h, img_w]
        :return: output features with size [batch_size, num_classes]
        """
        # Step 1: apply spatial transformer network if applicable
        x = self.stn(x)

        # Step 2: forward process for the convolutional network
        x = self.conv_net(x)

        # Step 3: flatten each sample to a vector to match the input of the
        # fully connected layers
        x = x.view(x.shape[0], -1)

        # Step 4: forward process for the fully connected network
        out = self.fc_net(x)

        return out
|
||||
|
||||
|
||||
class STN(nn.Module):
    """Spatial transformer that predicts a per-sample affine transform and resamples the input with it."""

    def __init__(self, in_channels):
        """
        The spatial transformer network (STN) learns how to perform spatial transformations on the
        input image in order to enhance the geometric invariance of the model. For example, it can
        crop a region of interest, scale and correct the orientation of an image. It can be a useful
        mechanism because CNNs are not invariant to rotation and scale and more general affine
        transformations.

        The spatial transformer network boils down to three main components:

        - The localization network is a regular CNN which regresses the transformation parameters.
          The transformation is never learned explicitly from this dataset, instead the network
          learns automatically the spatial transformations that enhance the global accuracy.
        - The grid generator generates a grid of coordinates in the input image corresponding
          to each pixel from the output image.
        - The sampler uses the parameters of the transformation and applies it to the input image.

        Here, we implement an STN that performs affine transformations on the input images.
        For more information, please refer to the slides and
        https://pytorch.org/tutorials/intermediate/spatial_transformer_tutorial.html .

        ----------------------
        :param in_channels: channel number of input image
        """
        super().__init__()

        # Localization network, part 1: three down-sampling (stride 2) convolutional
        # blocks with doubling output channels, each using BN and ReLU.
        # The shape comments assume 32x32 input images.
        self.localization_conv = nn.Sequential(
            ConvBlock(
                in_channels=in_channels,
                out_channels=8,
                kernel_size=9,
                stride=2,
                padding=4,
                use_batch_norm=True,
            ),
            # 8 * 16 * 16
            ConvBlock(
                in_channels=8,
                out_channels=16,
                kernel_size=5,
                stride=2,
                padding=2,
                use_batch_norm=True,
            ),
            # 16 * 8 * 8
            ConvBlock(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=2,
                padding=1,
                use_batch_norm=True,
            ),
            # 32 * 4 * 4
        )

        # Localization network, part 2: two linear layers with one BN and ReLU that
        # regress the 6 parameters of the affine transformation.
        self.localization_fc = nn.Sequential(
            nn.Linear(32 * 4 * 4, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 6),
        )

        # The STN should generate the identity transformation before training.
        # Zeroing the weight alone is not enough: the default bias is random, which
        # would give a near-zero (degenerate) theta. With zero weight the output
        # equals the bias, so set the bias to the flattened 2x3 identity matrix
        # [[1, 0, 0], [0, 1, 0]].
        last_linear = self.localization_fc[-1]
        nn.init.zeros_(last_linear.weight)
        with torch.no_grad():
            last_linear.bias.copy_(torch.tensor([1.0, 0.0, 0.0, 0.0, 1.0, 0.0]))

    def forward(self, x):
        """
        Apply the predicted affine transformation to the input images
        :param x: input images with size [batch_size, in_channels, img_h, img_w]
        :return: resampled images with the same size as `x`
        """
        # Extract the features from input images and flatten them
        features = self.localization_conv(x)
        features = features.view(features.shape[0], -1)

        # Predict the parameters of affine transformation from the extracted features
        theta = self.localization_fc(features)
        theta = theta.view(-1, 2, 3)

        # Apply affine transformation to input images
        grid = F.affine_grid(theta, x.shape, align_corners=False)
        x = F.grid_sample(x, grid, align_corners=False)

        return x
|
||||
Reference in New Issue
Block a user