First working version.

This commit is contained in:
unlockable
2024-04-09 18:57:33 +08:00
parent bdb985ddb3
commit 251a7e599a
3 changed files with 79 additions and 19 deletions

3
.gitignore vendored
View File

@@ -6,4 +6,5 @@ __pycache__/
*.synctex.gz *.synctex.gz
*.synctex.gz(buzy) *.synctex.gz(buzy)
*.out *.out
*.pdf *.pdf
.DS_Store

View File

@@ -2,8 +2,8 @@
# Media and Cognition # Media and Cognition
# Homework 2 Convolutional Neural Network # Homework 2 Convolutional Neural Network
# datasets.py - Define the data loader for the traffic sign classification dataset # datasets.py - Define the data loader for the traffic sign classification dataset
# Student ID: # Student ID: 2022010639
# Name: # Name: Gao Yixuan
# Tsinghua University # Tsinghua University
# (C) Copyright 2024 # (C) Copyright 2024
# ======================================================== # ========================================================
@@ -11,7 +11,8 @@
import os import os
import numpy as np import numpy as np
import torchvision.transforms as transforms import torchvision.transforms.v2 as transforms
import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder from torchvision.datasets import ImageFolder
@@ -39,31 +40,38 @@ def get_data_loader(
# (2) convert the images to PyTorch tensors # (2) convert the images to PyTorch tensors
# (3) normalize the pixel values to [-1, 1] # (3) normalize the pixel values to [-1, 1]
data_transforms = [ data_transforms = [
# transforms.ToImage(),
transforms.Resize(image_size),
transforms.ToImage(),
transforms.ToDtype(torch.float32, scale=True),
transforms.Normalize(mean=[-127.0, -127.0, -127.0], std=[128.0, 128.0, 128.0])
] ]
# You should insert some data augmentation techniques to `data_transforms` when `augment` is True # You should insert some data augmentation techniques to `data_transforms` when `augment` is True
# for the training dataset. # for the training dataset.
# Consider what is an appropriate data augmentation technique for traffic sign classification. # Consider what is an appropriate data augmentation technique for traffic sign classification.
if mode == "train" and augment: if mode == "train" and augment:
pass # TODO # pass # TODO
data_transforms.append(transforms.AutoAugment())
# Else, the `data_transforms` should be left unchanged # Else, the `data_transforms` should be left unchanged
# <<< TODO 1.1 # <<< TODO 1.1
# Use `transforms.Compose` to compose the list of transforms into a single transform # Use `transforms.Compose` to compose the list of transforms into a single transform
data_transforms = transforms.Compose(data_transforms) data_transforms = transforms.Compose(data_transforms)
print(type(data_transforms))
# >>> TODO 1.2: Define the dataset. # >>> TODO 1.2: Define the dataset.
# You should build the path to the selected dataset according to the `mode` parameter, # You should build the path to the selected dataset according to the `mode` parameter,
# and use the `ImageFolder` class from `torchvision.datasets` to load the datasets. # and use the `ImageFolder` class from `torchvision.datasets` to load the datasets.
# Docs: https://pytorch.org/vision/stable/generated/torchvision.datasets.ImageFolder.html # Docs: https://pytorch.org/vision/stable/generated/torchvision.datasets.ImageFolder.html
# The `ImageFolder` class takes in the path to the dataset and the transform to apply to the images. # The `ImageFolder` class takes in the path to the dataset and the transform to apply to the images.
# The `ImageFolder` class will automatically load the images and labels for you. # The `ImageFolder` class will automatically load the images and labels for you.
dataset = ? dataset = ImageFolder(root=data_root + "/" + mode, transform=data_transforms)
# <<< TODO 1.2 # <<< TODO 1.2
# >>> TODO 1.3: Define the data loader. # >>> TODO 1.3: Define the data loader.
# You should set the `shuffle` parameter to `True` when `mode=='train'`, and `False` otherwise. # You should set the `shuffle` parameter to `True` when `mode=='train'`, and `False` otherwise.
loader = ? loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=(mode=='train'))
# <<< TODO 1.3 # <<< TODO 1.3
return loader return loader

View File

@@ -2,8 +2,8 @@
# Media and Cognition # Media and Cognition
# Homework 2 Convolutional Neural Network # Homework 2 Convolutional Neural Network
# networks.py - Network definition # networks.py - Network definition
# Student ID: # Student ID: 2022010639
# Name: # Name: Gao Yixuan
# Tsinghua University # Tsinghua University
# (C) Copyright 2024 # (C) Copyright 2024
# ======================================================== # ========================================================
@@ -49,14 +49,22 @@ class ConvBlock(nn.Module):
# Hint: use the `bn2d` defined above for batch normalization to adapt to the input parameter `use_batch_norm` # Hint: use the `bn2d` defined above for batch normalization to adapt to the input parameter `use_batch_norm`
# Network structure: # Network structure:
# conv -> batchnorm -> relu # conv -> batchnorm -> relu
self.conv = ? self.conv = nn.Conv2d(
self.bn = ? in_channels, out_channels, kernel_size, stride=stride, padding=padding
self.relu = ? )
self.bn = bn2d(out_channels)
self.relu = nn.ReLU()
# <<< TODO 2.1 # <<< TODO 2.1
def forward(self, x): def forward(self, x):
# >>> TODO 2.2: forward process # >>> TODO 2.2: forward process
# Hint: apply residual connection if `self.use_residual` is True # Hint: apply residual connection if `self.use_residual` is True
fx = self.relu(self.bn(self.conv(x)))
# out = self.relu(self.bn(self.conv(x)))
if self.use_residual:
out = fx + x
else:
out = fx
# <<< TODO 2.2 # <<< TODO 2.2
return out return out
@@ -108,7 +116,38 @@ class Classifier(nn.Module):
# dropout(p), where p is input parameter of dropout ratio # dropout(p), where p is input parameter of dropout ratio
self.conv_net = nn.Sequential( self.conv_net = nn.Sequential(
ConvBlock(
in_channels=in_channels,
out_channels=32,
kernel_size=5,
stride=1,
padding=2,
),
ConvBlock(
in_channels=32, out_channels=64, kernel_size=5, stride=2, padding=2
),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
ConvBlock(
in_channels=64,
out_channels=64,
kernel_size=3,
stride=1,
padding=1,
use_residual=True,
),
ConvBlock(
in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1
),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
ConvBlock(
in_channels=128,
out_channels=128,
kernel_size=3,
stride=1,
padding=1,
use_residual=True,
),
nn.Dropout2d(p=dropout_prob),
) )
# <<< TODO 3.1 # <<< TODO 3.1
@@ -125,7 +164,11 @@ class Classifier(nn.Module):
# dropout(p), where p is input parameter of dropout ratio # dropout(p), where p is input parameter of dropout ratio
# linear num_classes # linear num_classes
self.fc_net = nn.Sequential( self.fc_net = nn.Sequential(
nn.Linear(2048, 256),
nn.ReLU(),
bn1d(256),
nn.Dropout1d(dropout_prob),
nn.Linear(256, num_classes),
) )
# <<< TODO 3.2 # <<< TODO 3.2
@@ -140,12 +183,14 @@ class Classifier(nn.Module):
# >>> TODO 3.3: forward process # >>> TODO 3.3: forward process
# Step 2: forward process for the convolutional network # Step 2: forward process for the convolutional network
x = self.conv_net(x)
# Step 3: use `Tensor.view()` to flatten the tensor to match the size of the input of the # Step 3: use `Tensor.view()` to flatten the tensor to match the size of the input of the
# fully connected layers. # fully connected layers.
x = x.view(-1, 2048)
# Step 4: forward process for the fully connected network # Step 4: forward process for the fully connected network
out = self.fc_net(x)
# <<< TODO 3.3 # <<< TODO 3.3
return out return out
@@ -184,7 +229,10 @@ class STN(nn.Module):
# this network. # this network.
# Suggested structure: 3 down-sampling convolutional layers with doubling output channels, using BN and ReLU. # Suggested structure: 3 down-sampling convolutional layers with doubling output channels, using BN and ReLU.
self.localization_conv = nn.Sequential( self.localization_conv = nn.Sequential(
ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
ConvBlock(in_channels=8, out_channels=16, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
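# Output feature map: 32 channels x 4 x 4, i.e. 512 features once flattened (assumes a 32x32 input — TODO confirm)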
) )
# Step 2: Build a fully connected network to predict the parameters of affine transformation from # Step 2: Build a fully connected network to predict the parameters of affine transformation from
@@ -192,7 +240,10 @@ class STN(nn.Module):
# Hint: Combine linear layers and ReLU activation functions to build this network. # Hint: Combine linear layers and ReLU activation functions to build this network.
# Suggested structure: 2 linear layers with one BN and ReLU. # Suggested structure: 2 linear layers with one BN and ReLU.
self.localization_fc = nn.Sequential( self.localization_fc = nn.Sequential(
nn.Linear(16, 256),
nn.Linear(256, 360),
nn.BatchNorm1d(360),
nn.ReLU()
) )
# <<< TODO 4.1 # <<< TODO 4.1
@@ -200,7 +251,7 @@ class STN(nn.Module):
# Hint: The STN should generate the identity transformation by default before training. # Hint: The STN should generate the identity transformation by default before training.
# How to initialize the weight/bias of the last linear layer of the fully connected network to # How to initialize the weight/bias of the last linear layer of the fully connected network to
# achieve this goal? # achieve this goal?
nn.init.zeros_(self.localization_fc[1].weight)
# <<< TODO 4.2 # <<< TODO 4.2
def forward(self, x): def forward(self, x):