From 251a7e599a63eef3647f27f22311c5507aaa3deb Mon Sep 17 00:00:00 2001 From: unlockable Date: Tue, 9 Apr 2024 18:57:33 +0800 Subject: [PATCH] First working status. --- .gitignore | 3 +- hw2/code/datasets.py | 22 ++++++++----- hw2/code/networks.py | 73 +++++++++++++++++++++++++++++++++++++------- 3 files changed, 79 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index f49f468..b14374f 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ __pycache__/ *.synctex.gz *.synctex.gz(buzy) *.out -*.pdf \ No newline at end of file +*.pdf +.DS_Store \ No newline at end of file diff --git a/hw2/code/datasets.py b/hw2/code/datasets.py index 00b3893..cd25bcc 100644 --- a/hw2/code/datasets.py +++ b/hw2/code/datasets.py @@ -2,8 +2,8 @@ # Media and Cognition # Homework 2 Convolutional Neural Network # datasets.py - Define the data loader for the traffic sign classification dataset -# Student ID: -# Name: +# Student ID: 2022010639 +# Name: Gao Yixuan # Tsinghua University # (C) Copyright 2024 # ======================================================== @@ -11,7 +11,8 @@ import os import numpy as np -import torchvision.transforms as transforms +import torchvision.transforms.v2 as transforms +import torch from torch.utils.data import DataLoader from torchvision.datasets import ImageFolder @@ -39,31 +40,38 @@ def get_data_loader( # (2) convert the images to PyTorch tensors # (3) normalize the pixel values to [-1, 1] data_transforms = [ - + # transforms.ToImage(), + transforms.Resize(image_size), + transforms.ToImage(), + transforms.ToDtype(torch.float32, scale=True), + transforms.Normalize(mean=[-127.0, -127.0, -127.0], std=[128.0, 128.0, 128.0]) ] # You should insert some data augmentation techniques to `data_transforms` when `augment` is True # for the training dataset. # Consider what is an appropriate data augmentation technique for traffic sign classification. if mode == "train" and augment: - pass # TODO + # pass # TODO + data_transforms.append(transforms.AutoAugment()) # Else, the `data_transforms` should be left unchanged # <<< TODO 1.1 # Use `transforms.Compose` to compose the list of transforms into a single transform data_transforms = transforms.Compose(data_transforms) + print(type(data_transforms)) + # >>> TODO 1.2: Define the dataset. # You should build the path to the selected dataset according to the `mode` parameter, # and use the `ImageFolder` class from `torchvision.datasets` to load the datasets. # Docs: https://pytorch.org/vision/stable/generated/torchvision.datasets.ImageFolder.html # The `ImageFolder` class takes in the path to the dataset and the transform to apply to the images. # The `ImageFolder` class will automatically load the images and labels for you. - dataset = ? + dataset = ImageFolder(root=data_root + "/" + mode, transform=data_transforms) # <<< TODO 1.2 # >>> TODO 1.3: Define the data loader. # You should set the `shuffle` parameter to `True` when `mode=='train'`, and `False` otherwise. - loader = ? + loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=(mode=='train')) # <<< TODO 1.3 return loader diff --git a/hw2/code/networks.py b/hw2/code/networks.py index 6588902..a4b2099 100644 --- a/hw2/code/networks.py +++ b/hw2/code/networks.py @@ -2,8 +2,8 @@ # Media and Cognition # Homework 2 Convolutional Neural Network # networks.py - Network definition -# Student ID: -# Name: +# Student ID: 2022010639 +# Name: Gao Yixuan # Tsinghua University # (C) Copyright 2024 # ======================================================== @@ -49,14 +49,22 @@ class ConvBlock(nn.Module): # Hint: use the `bn2d` defined above for batch normalization to adapt to the input parameter `use_batch_norm` # Network structure: # conv -> batchnorm -> relu - self.conv = ? - self.bn = ? - self.relu = ? + self.conv = nn.Conv2d( + in_channels, out_channels, kernel_size, stride=stride, padding=padding + ) + self.bn = bn2d(out_channels) + self.relu = nn.ReLU() # <<< TODO 2.1 def forward(self, x): # >>> TODO 2.2: forward process # Hint: apply residual connection if `self.use_residual` is True + fx = self.relu(self.bn(self.conv(x))) + # out = self.relu(self.bn(self.conv(x))) + if self.use_residual: + out = fx + x + else: + out = fx # <<< TODO 2.2 return out @@ -108,7 +116,38 @@ class Classifier(nn.Module): # dropout(p), where p is input parameter of dropout ratio self.conv_net = nn.Sequential( - + ConvBlock( + in_channels=in_channels, + out_channels=32, + kernel_size=5, + stride=1, + padding=2, + ), + ConvBlock( + in_channels=32, out_channels=64, kernel_size=5, stride=2, padding=2 + ), + nn.MaxPool2d(kernel_size=2, stride=2, padding=0), + ConvBlock( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + use_residual=True, + ), + ConvBlock( + in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1 + ), + nn.MaxPool2d(kernel_size=2, stride=2, padding=0), + ConvBlock( + in_channels=128, + out_channels=128, + kernel_size=3, + stride=1, + padding=1, + use_residual=True, + ), + nn.Dropout2d(p=dropout_prob), ) # <<< TODO 3.1 @@ -125,7 +164,11 @@ class Classifier(nn.Module): # dropout(p), where p is input parameter of dropout ratio # linear num_classes self.fc_net = nn.Sequential( - + nn.Linear(2048, 256), + nn.ReLU(), + bn1d(256), + nn.Dropout1d(dropout_prob), + nn.Linear(256, num_classes), ) # <<< TODO 3.2 @@ -140,12 +183,14 @@ class Classifier(nn.Module): # >>> TODO 3.3: forward process # Step 2: forward process for the convolutional network + x = self.conv_net(x) # Step 3: use `Tensor.view()` to flatten the tensor to match the size of the input of the # fully connected layers. + x = x.view(-1, 2048) # Step 4: forward process for the fully connected network - + out = self.fc_net(x) # <<< TODO 3.3 return out @@ -184,7 +229,10 @@ class STN(nn.Module): # this network. # Suggested structure: 3 down-sampling convolutional layers with doubling output channels, using BN and ReLU. self.localization_conv = nn.Sequential( - + ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=3, stride=2, padding=1, use_batch_norm=True), + ConvBlock(in_channels=8, out_channels=16, kernel_size=3, stride=2, padding=1, use_batch_norm=True), + ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True), + # 32 * 4 * 4 ) # Step 2: Build a fully connected network to predict the parameters of affine transformation from @@ -192,7 +240,10 @@ class STN(nn.Module): # Hint: Combine linear layers and ReLU activation functions to build this network. # Suggested structure: 2 linear layers with one BN and ReLU. self.localization_fc = nn.Sequential( - + nn.Linear(16, 256), + nn.Linear(256, 360), + nn.BatchNorm1d(360), + nn.ReLU() ) # <<< TODO 4.1 @@ -200,7 +251,7 @@ class STN(nn.Module): # Hint: The STN should generate the identity transformation by default before training. # How to initialize the weight/bias of the last linear layer of the fully connected network to # achieve this goal? - + nn.init.zeros_(self.localization_fc[1].weight) # <<< TODO 4.2 def forward(self, x):