From 251a7e599a63eef3647f27f22311c5507aaa3deb Mon Sep 17 00:00:00 2001
From: unlockable <g@unlockableworld.com>
Date: Tue, 9 Apr 2024 18:57:33 +0800
Subject: [PATCH] First working status.

---
 .gitignore           |  3 +-
 hw2/code/datasets.py | 22 ++++++++-----
 hw2/code/networks.py | 73 +++++++++++++++++++++++++++++++++++++-------
 3 files changed, 79 insertions(+), 19 deletions(-)

diff --git a/.gitignore b/.gitignore
index f49f468..b14374f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ __pycache__/
 *.synctex.gz
 *.synctex.gz(buzy)
 *.out
-*.pdf
\ No newline at end of file
+*.pdf
+.DS_Store
\ No newline at end of file
diff --git a/hw2/code/datasets.py b/hw2/code/datasets.py
index 00b3893..cd25bcc 100644
--- a/hw2/code/datasets.py
+++ b/hw2/code/datasets.py
@@ -2,8 +2,8 @@
 #             Media and Cognition
 #             Homework 2 Convolutional Neural Network
 #             datasets.py - Define the data loader for the traffic sign classification dataset
-#             Student ID:
-#             Name:
+#             Student ID: 2022010639
+#             Name: Gao Yixuan
 #             Tsinghua University
 #             (C) Copyright 2024
 # ========================================================
@@ -11,7 +11,8 @@
 import os
 
 import numpy as np
-import torchvision.transforms as transforms
+import torchvision.transforms.v2 as transforms
+import torch
 from torch.utils.data import DataLoader
 from torchvision.datasets import ImageFolder
 
@@ -39,31 +40,38 @@ def get_data_loader(
     #   (2) convert the images to PyTorch tensors
     #   (3) normalize the pixel values to [-1, 1]
     data_transforms = [
-
+        # Resize the PIL image first, then convert it to a float32 tensor scaled to [0, 1].
+        transforms.Resize(image_size),
+        transforms.ToImage(),
+        transforms.ToDtype(torch.float32, scale=True),
+        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),  # (x - 0.5) / 0.5 maps [0, 1] onto [-1, 1]
     ]
 
     # You should insert some data augmentation techniques to `data_transforms` when `augment` is True
     # for the training dataset.
     # Consider what is an appropriate data augmentation technique for traffic sign classification.
     if mode == "train" and augment:
-        pass  # TODO
+        # Augment right after Resize (index 0), i.e. on the image before tensor conversion and normalization.
+        data_transforms.insert(1, transforms.AutoAugment())
     # Else, the `data_transforms` should be left unchanged
     # <<< TODO 1.1
     # Use `transforms.Compose` to compose the list of transforms into a single transform
     data_transforms = transforms.Compose(data_transforms)
 
+    # The composed transform is handed to ImageFolder below, which applies it to each loaded image.
+
     # >>> TODO 1.2: Define the dataset.
     # You should build the path to the selected dataset according to the `mode` parameter,
     # and use the `ImageFolder` class from `torchvision.datasets` to load the datasets.
     # Docs: https://pytorch.org/vision/stable/generated/torchvision.datasets.ImageFolder.html
     # The `ImageFolder` class takes in the path to the dataset and the transform to apply to the images.
     # The `ImageFolder` class will automatically load the images and labels for you.
-    dataset = ?
+    dataset = ImageFolder(root=os.path.join(data_root, mode), transform=data_transforms)  # one sub-folder of data_root per mode
     # <<< TODO 1.2
 
     # >>> TODO 1.3: Define the data loader.
     # You should set the `shuffle` parameter to `True` when `mode=='train'`, and `False` otherwise.
-    loader = ?
+    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=(mode == "train"), num_workers=num_workers)
     # <<< TODO 1.3
 
     return loader
diff --git a/hw2/code/networks.py b/hw2/code/networks.py
index 6588902..a4b2099 100644
--- a/hw2/code/networks.py
+++ b/hw2/code/networks.py
@@ -2,8 +2,8 @@
 #             Media and Cognition
 #             Homework 2 Convolutional Neural Network
 #             networks.py - Network definition
-#             Student ID:
-#             Name:
+#             Student ID: 2022010639
+#             Name: Gao Yixuan
 #             Tsinghua University
 #             (C) Copyright 2024
 # ========================================================
@@ -49,14 +49,22 @@ class ConvBlock(nn.Module):
         # Hint: use the `bn2d` defined above for batch normalization to adapt to the input parameter `use_batch_norm`
         # Network structure:
         # conv -> batchnorm -> relu
-        self.conv = ?
-        self.bn = ?
-        self.relu = ?
+        # `bn2d` (defined above) provides the normalization layer chosen via `use_batch_norm`.
+        conv_kwargs = dict(kernel_size=kernel_size, stride=stride, padding=padding)
+        self.conv = nn.Conv2d(in_channels, out_channels, **conv_kwargs)
+        self.bn = bn2d(out_channels)
+        self.relu = nn.ReLU()
         # <<< TODO 2.1
 
     def forward(self, x):
         # >>> TODO 2.2: forward process
         # Hint: apply residual connection if `self.use_residual` is True
+        # conv -> batchnorm -> relu
+        activated = self.relu(self.bn(self.conv(x)))
+        # With `use_residual`, add the block input back onto the activated features;
+        # this needs the input and output shapes to match.
+        # Otherwise the activated features are the output as-is.
+        out = activated + x if self.use_residual else activated
 
         # <<< TODO 2.2
         return out
@@ -108,7 +116,38 @@ class Classifier(nn.Module):
         # dropout(p), where p is input parameter of dropout ratio
 
         self.conv_net = nn.Sequential(
-
+            # Layers run in the order listed (nn.Sequential).
+            # Stem: 5x5 conv to 32 channels, then a stride-2 5x5 conv to 64 channels
+            ConvBlock(
+                in_channels=in_channels, out_channels=32,
+                kernel_size=5, stride=1, padding=2,
+            ),
+            ConvBlock(
+                in_channels=32, out_channels=64,
+                kernel_size=5, stride=2, padding=2,
+            ),
+            # 2x2 max pooling halves the spatial resolution
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
+            # Middle: shape-preserving residual 3x3 block, then widen 64 -> 128
+            ConvBlock(
+                in_channels=64, out_channels=64,
+                kernel_size=3, stride=1, padding=1,
+                use_residual=True,
+            ),
+            ConvBlock(
+                in_channels=64, out_channels=128,
+                kernel_size=3, stride=1, padding=1,
+            ),
+            # second 2x2 max pooling
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
+            # Head: shape-preserving residual 3x3 block on 128 channels
+            ConvBlock(
+                in_channels=128, out_channels=128,
+                kernel_size=3, stride=1, padding=1,
+                use_residual=True,
+            ),
+            # Dropout2d zeroes whole feature maps (channels) with probability p
+            nn.Dropout2d(p=dropout_prob),
         )
         # <<< TODO 3.1
 
@@ -125,7 +164,11 @@ class Classifier(nn.Module):
         # dropout(p), where p is input parameter of dropout ratio
         # linear       num_classes
         self.fc_net = nn.Sequential(
-
+            nn.Linear(2048, 256),
+            nn.ReLU(),
+            bn1d(256),
+            nn.Dropout(dropout_prob),  # element-wise dropout; Dropout1d would read the 2-D (N, 256) input as unbatched (C, L)
+            nn.Linear(256, num_classes),
         )
         # <<< TODO 3.2
 
@@ -140,12 +183,14 @@ class Classifier(nn.Module):
 
         # >>> TODO 3.3: forward process
         # Step 2: forward process for the convolutional network
+        x = self.conv_net(x)
 
         # Step 3: use `Tensor.view()` to flatten the tensor to match the size of the input of the
         # fully connected layers.
+        x = x.view(x.size(0), -1)  # keep the batch dimension; fc_net's first Linear expects 2048 features per sample
 
         # Step 4: forward process for the fully connected network
-
+        out = self.fc_net(x)
         # <<< TODO 3.3
 
         return out
@@ -184,7 +229,10 @@ class STN(nn.Module):
         # this network.
         # Suggested structure: 3 down-sampling convolutional layers with doubling output channels, using BN and ReLU.
         self.localization_conv = nn.Sequential(
-
+            # Three stride-2 ConvBlocks with batch norm: in_channels -> 8 -> 16 -> 32 channels.
+            # Output noted by the author as 32 * 4 * 4 (presumably for 32x32 inputs -- TODO confirm).
+            *(ConvBlock(in_channels=c_in, out_channels=c_out, kernel_size=3, stride=2, padding=1, use_batch_norm=True)
+              for c_in, c_out in ((in_channels, 8), (8, 16), (16, 32))),
         )
 
         # Step 2: Build a fully connected network to predict the parameters of affine transformation from
@@ -192,7 +240,10 @@ class STN(nn.Module):
         # Hint: Combine linear layers and ReLU activation functions to build this network.
         # Suggested structure: 2 linear layers with one BN and ReLU.
         self.localization_fc = nn.Sequential(
-
+            nn.Linear(16, 256),  # NOTE(review): the conv stack above is annotated 32 * 4 * 4 (= 512) features, not 16 -- confirm against forward()
+            nn.Linear(256, 360),  # NOTE(review): a 2x3 affine matrix has 6 parameters -- confirm 360 outputs is intended
+            nn.BatchNorm1d(360),  # NOTE(review): BN + ReLU follow the last linear, so outputs are >= 0 -- confirm vs. the suggested Linear/BN/ReLU/Linear order
+            nn.ReLU()
         )
         # <<< TODO 4.1
 
@@ -200,7 +251,7 @@ class STN(nn.Module):
         # Hint: The STN should generate the identity transformation by default before training.
         # How to initialize the weight/bias of the last linear layer of the fully connected network to
         # achieve this goal?
-
+        nn.init.zeros_(self.localization_fc[1].weight)  # NOTE(review): only the weight of the second Linear is zeroed; the hint above also mentions the bias -- confirm it is set to the identity transform
         # <<< TODO 4.2
 
     def forward(self, x):