Complete.

2024-04-11 14:20:28 +08:00
parent 3747678e61
commit 8fc38ca6c5
25 changed files with 388 additions and 49 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,5 @@ __pycache__/
 *.out
 *.pdf
 .DS_Store
-hw2/code/checkpoints/
+hw2/code/checkpoints/
+hw2/code/visualized/
--- a/hw2/code/datasets.py
+++ b/hw2/code/datasets.py
@@ -52,7 +52,9 @@ def get_data_loader(
    # Consider what is an appropriate data augmentation technique for traffic sign classification.
    if mode == "train" and augment:
        # pass  # TODO
-        data_transforms.append(transforms.AutoAugment())
+        # data_transforms.append(transforms.AutoAugment())
+        data_transforms.append(transforms.RandomAffine(degrees=30,shear=10))
+        data_transforms.append(transforms.RandomAutocontrast())
    # Else, the `data_transforms` should be left unchanged
    # <<< TODO 1.1
    # Use `transforms.Compose` to compose the list of transforms into a single transform
--- a/hw2/code/networks.py
+++ b/hw2/code/networks.py
@@ -229,8 +229,9 @@ class STN(nn.Module):
        # this network.
        # Suggested structure: 3 down-sampling convolutional layers with doubling output channels, using BN and ReLU.
        self.localization_conv = nn.Sequential(
-            ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
-            ConvBlock(in_channels=8, out_channels=16, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
+            ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=9, stride=2, padding=4, use_batch_norm=True),
+            # 8 * 16 * 16
+            ConvBlock(in_channels=8, out_channels=16, kernel_size=5, stride=2, padding=2, use_batch_norm=True),
            ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
            # 32 * 4 * 4
        )
@@ -240,10 +241,10 @@ class STN(nn.Module):
        # Hint: Combine linear layers and ReLU activation functions to build this network.
        # Suggested structure: 2 linear layers with one BN and ReLU.
        self.localization_fc = nn.Sequential(
-            nn.Linear(16, 256),
-            nn.Linear(256, 6),
-            nn.BatchNorm1d(6),
-            nn.ReLU()
+            nn.Linear(32 * 4 * 4, 256),
+            nn.ReLU(),
+            nn.BatchNorm1d(256),
+            nn.Linear(256, 6)
        )
        # <<< TODO 4.1

@@ -251,7 +252,13 @@ class STN(nn.Module):
        # Hint: The STN should generate the identity transformation by default before training.
        # How to initialize the weight/bias of the last linear layer of the fully connected network to
        # achieve this goal?
-        nn.init.zeros_(self.localization_fc[1].weight)
+        # Zero weights make the predicted affine parameters independent of the input at
+        # initialization, so the bias alone decides the starting transformation.
+        last_fc = self.localization_fc[3]
+        nn.init.zeros_(last_fc.weight)
+        # Bias = identity affine matrix [[1, 0, 0], [0, 1, 0]] flattened row-major
+        # (assumes forward() reshapes the 6 outputs to 2x3 -- confirm against forward()).
+        last_fc.bias.data.copy_(last_fc.bias.new_tensor([1.0, 0.0, 0.0, 0.0, 1.0, 0.0]))
        # <<< TODO 4.2

    def forward(self, x):
--- a/hw2/report/img/augmentation.jpg
+++ b/hw2/report/img/augmentation.jpg
--- a/hw2/report/img/feature/image.jpg
+++ b/hw2/report/img/feature/image.jpg
--- a/hw2/report/img/feature/layer_0/feature_map.jpg
+++ b/hw2/report/img/feature/layer_0/feature_map.jpg
--- a/hw2/report/img/feature/layer_1/feature_map.jpg
+++ b/hw2/report/img/feature/layer_1/feature_map.jpg
--- a/hw2/report/img/feature/layer_2/feature_map.jpg
+++ b/hw2/report/img/feature/layer_2/feature_map.jpg
--- a/hw2/report/img/feature/layer_3/feature_map.jpg
+++ b/hw2/report/img/feature/layer_3/feature_map.jpg
--- a/hw2/report/img/feature/layer_4/feature_map.jpg
+++ b/hw2/report/img/feature/layer_4/feature_map.jpg
--- a/hw2/report/img/filter/filter_layer_0.jpg
+++ b/hw2/report/img/filter/filter_layer_0.jpg
--- a/hw2/report/img/filter/filter_layer_1.jpg
+++ b/hw2/report/img/filter/filter_layer_1.jpg
--- a/hw2/report/img/filter/filter_layer_2.jpg
+++ b/hw2/report/img/filter/filter_layer_2.jpg
--- a/hw2/report/img/filter/filter_layer_3.jpg
+++ b/hw2/report/img/filter/filter_layer_3.jpg
--- a/hw2/report/img/filter/filter_layer_4.jpg
+++ b/hw2/report/img/filter/filter_layer_4.jpg
--- a/hw2/report/img/models/bn/loss_and_acc.jpg
+++ b/hw2/report/img/models/bn/loss_and_acc.jpg
--- a/hw2/report/img/models/bn_aug/loss_and_acc.jpg
+++ b/hw2/report/img/models/bn_aug/loss_and_acc.jpg
--- a/hw2/report/img/models/default/loss_and_acc.jpg
+++ b/hw2/report/img/models/default/loss_and_acc.jpg
--- a/hw2/report/img/models/dropout/loss_and_acc.jpg
+++ b/hw2/report/img/models/dropout/loss_and_acc.jpg
--- a/hw2/report/img/models/stn/loss_and_acc.jpg
+++ b/hw2/report/img/models/stn/loss_and_acc.jpg
--- a/hw2/report/img/stn/stn.jpg
+++ b/hw2/report/img/stn/stn.jpg
--- a/hw2/report/img/tsne/tsne.jpg
+++ b/hw2/report/img/tsne/tsne.jpg
--- a/hw2/report/main.tex
+++ b/hw2/report/main.tex
@@ -10,39 +10,48 @@
 \usepackage{enumitem}
 \usepackage{graphicx}
 \usepackage{listings}
-\usepackage{color}
+\usepackage{fontspec}
+\usepackage{xcolor}
+\usepackage{float}
+% \usepackage{color}
+
+\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
+\newfontfamily\cascadia{Cascadia Code}

 \lstset{
-    basicstyle          =   \sffamily,          % 基本代码风格
-    keywordstyle        =   \bfseries,          % 关键字风格
-    commentstyle        =   \rmfamily\itshape,  % 注释的风格，斜体
-    stringstyle         =   \ttfamily,  % 字符串风格
-    flexiblecolumns,                % 别问为什么，加上这个
-    numbers             =   left,   % 行号的位置在左边
-    showspaces          =   false,  % 是否显示空格，显示了有点乱，所以不现实了
-    numberstyle         =   \zihao{-5}\ttfamily,    % 行号的样式，小五号，tt等宽字体
+    basicstyle          =   \small\codefont,
+    % ---
+    tabsize             =   4,
    showstringspaces    =   false,
-    captionpos          =   t,      % 这段代码的名字所呈现的位置，t指的是top上面
-    frame               =   lrtb,   % 显示边框
+    numbers             =   left,
+    numberstyle         =   \codefont,
+    % ---
+    breaklines          =   true,
+    captionpos          =   t,      
+    % ---
+    frame               =   l,
+    flexiblecolumns,
 }

 \lstdefinestyle{Python}{
    language        =   Python, % 语言选Python
-    basicstyle      =   \zihao{-5}\ttfamily,
-    numberstyle     =   \zihao{-5}\ttfamily,
    keywordstyle    =   \color{blue},
    keywordstyle    =   [2] \color{teal},
-    stringstyle     =   \color{magenta},
-    commentstyle    =   \color{red}\ttfamily,
-    breaklines      =   true,   % 自动换行，建议不要写太长的行
-    columns         =   fixed,  % 如果不加这一句，字间距就不固定，很丑，必须加
-    basewidth       =   0.5em,
+    stringstyle     =   \color{orange!80!black},
+    commentstyle    =   \color{red},
+    identifierstyle =   \color{blue!80!white},
+}
+
+\lstdefinestyle{Bash}{
+    language        =   bash
 }
 \usepackage{subcaption}
 \usepackage{booktabs} % toprule
 \usepackage[mathcal]{eucal}
 \usepackage[thehwcnt = 2]{iidef}

+\allowdisplaybreaks
+
 \thecourseinstitute{清华大学电子工程系}
 \thecoursename{\textbf{媒体与认知} \space 课堂2}
 \theterm{2023-2024学年春季学期}
@@ -54,13 +63,13 @@
 \centerline{\textbf{\Large{理论部分}}}

 \section{单选题（15分）}
-\subsection{\underline{A}}
+\subsection{\underline{C}}

 \subsection{\underline{D}}

 \subsection{\underline{D}}

-\subsection{\underline{D}}
+\subsection{\underline{C}}

 \subsection{\underline{B}}

@@ -118,57 +127,58 @@ W=\left[ \begin{array}{cc}
    \begin{align*}
        \frac{\partial L}{\partial X} & =
        \begin{bmatrix}
-            0.3 & 0.1 & 0\\
-            -0.4 & 0.2 & 0\\
+            0.1 & -0.2 & 0\\
+            -0.3 & 0.4 & 0\\
            0 & 0 & 0
        \end{bmatrix} \frac{\partial L}{\partial Y_{11}}
        +
        \begin{bmatrix}
-            0 & 0.3 & 0.1\\
-            0 & -0.4 & 0.2\\
+            0 & 0.1 & -0.2\\
+            0 & -0.3 & 0.4\\
            0 & 0 & 0
        \end{bmatrix} \frac{\partial L}{\partial Y_{12}}\\
        & \quad +
        \begin{bmatrix}
            0 & 0 & 0\\
-            0.3 & 0.1 & 0\\
-            -0.4 & 0.2 & 0
+            0.1 & -0.2 & 0\\
+            -0.3 & 0.4 & 0
        \end{bmatrix} \frac{\partial L}{\partial Y_{21}}
        +
        \begin{bmatrix}
            0 & 0 & 0\\
-            0 & 0.3 & 0.1\\
-            0 & -0.4 & 0.2
+            0 & 0.1 & -0.2\\
+            0 & -0.3 & 0.4
        \end{bmatrix} \frac{\partial L}{\partial Y_{22}}\\
+        & = \mathrm{zeropad}(W) \ast \frac{\partial L}{\partial Y}\\
        & = 
        \begin{bmatrix}
-            0.09 & 0.03 & 0\\
-            -0.12 & 0.06 & 0\\
+            0.03 & -0.06 & 0\\
+            -0.09 & 0.12 & 0\\
            0 & 0 & 0
        \end{bmatrix}
        +
        \begin{bmatrix}
-            0 & 0.03 & 0.01\\
-            0 & -0.04 & 0.02\\
+            0 & 0.01 & -0.02\\
+            0 & -0.03 & 0.04\\
            0 & 0 & 0
        \end{bmatrix}\\
        & \quad +
        \begin{bmatrix}
            0 & 0 & 0\\
-            -0.12 & -0.04 & 0\\
-            0.16 & -0.08 & 0
+            -0.04 & 0.08 & 0\\
+            0.12 & -0.16 & 0
        \end{bmatrix}
        +
        \begin{bmatrix}
            0 & 0 & 0\\
-            0 & 0.06 & 0.02\\
-            0 & -0.08 & 0.04
+            0 & 0.02 & -0.04\\
+            0 & -0.06 & 0.08
        \end{bmatrix}\\
        & = 
        \begin{bmatrix}
-            0.09 & 0.06 & 0.01\\
-            -0.24 & 0.04 & 0.04\\
-            0.16 & -0.16 & 0.04
+            0.03 & -0.05 & -0.02\\
+            -0.13 & 0.19 & 0\\
+            0.12 & -0.22 & 0.08
        \end{bmatrix} \qedhere
    \end{align*}
 \end{proof}
@@ -178,7 +188,153 @@ W=\left[ \begin{array}{cc}

 % 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
 \section{编程作业报告}
-\section{自选课题工作进度汇报}
+\subsection{探究batch normalization和dropout的作用}
+\begin{enumerate}
+    \item 使用默认配置训练模型：
+    \begin{lstlisting}[style=Bash]
+python train.py --ckpt_path checkpoints/default
+    \end{lstlisting}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/models/default/loss_and_acc.jpg}
+    \end{figure}
+    之后测试得到的正确率为90.8\%。
+    \item 启用batch normalization：
+    \begin{lstlisting}[style=Bash]
+python train.py --ckpt_path checkpoints/bn --bn
+    \end{lstlisting}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/models/bn/loss_and_acc.jpg}
+    \end{figure}
+    测试得到的正确率为95.9\%。
+    \item 启用dropout并设置概率为0.3：
+    \begin{lstlisting}[style=Bash]
+python train.py --ckpt_path checkpoints/dropout --dropout 0.3
+    \end{lstlisting}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/models/dropout/loss_and_acc.jpg}
+    \end{figure}
+    测试后得到的正确率为94.1\%。
+\end{enumerate}
+
+\subsection{探究数据增广的作用}
+考虑到在不同的视角下，交通标志可能有旋转或者变形，因此使用
+\begin{lstlisting}[style=Python]
+transforms.RandomAffine(degrees=30,shear=10)
+\end{lstlisting}
+来对数据进行随机的形变与旋转；另外，考虑到可能在不同的光线条件下导致对比度变化，因此使用
+\begin{lstlisting}[style=Python]
+transforms.RandomAutocontrast()
+\end{lstlisting}
+来对数据进行随机的对比度调整。
+
+执行
+\begin{lstlisting}[style=Bash]
+python unit_test.py data_loader
+\end{lstlisting}
+得到
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=\linewidth]{img/augmentation.jpg}
+    \caption{数据增广后的结果}
+\end{figure}
+
+训练最优模型使用的命令为
+\begin{lstlisting}[style=Bash]
+python train.py --ckpt_path checkpoints/bn_aug --bn --augment --epoch 20
+\end{lstlisting}
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=\linewidth]{img/models/bn_aug/loss_and_acc.jpg}
+\end{figure}
+测试得到的正确率为96.0\%，略微高于不使用数据增广时的结果。
+
+\subsection{探究空间变换网络（STN）的作用}
+运行
+\begin{lstlisting}[style=Bash]
+python train.py --ckpt_path checkpoints/stn --bn --stn
+\end{lstlisting}
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=\linewidth]{img/models/stn/loss_and_acc.jpg}
+\end{figure}
+测试得到的正确率为94.6\%。正确率比不使用STN时反而有所降低，可能是设计的网络结构不够理想导致的。
+
+\subsection{可视化}
+\begin{enumerate}
+    \item 可视化各层卷积核：
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/filter/filter_layer_0.jpg}
+        \caption{第0层的卷积核}
+    \end{figure}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/filter/filter_layer_1.jpg}
+        \caption{第1层的卷积核}
+    \end{figure}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/filter/filter_layer_2.jpg}
+        \caption{第2层的卷积核}
+    \end{figure}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/filter/filter_layer_3.jpg}
+        \caption{第3层的卷积核}
+    \end{figure}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/filter/filter_layer_4.jpg}
+        \caption{第4层的卷积核}
+    \end{figure}
+    \item 可视化各层卷积层的输出特征图
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/feature/layer_0/feature_map.jpg}
+        \caption{第0层卷积层的输出特征图}
+    \end{figure}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/feature/layer_1/feature_map.jpg}
+        \caption{第1层卷积层的输出特征图}
+    \end{figure}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/feature/layer_2/feature_map.jpg}
+        \caption{第2层卷积层的输出特征图}
+    \end{figure}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/feature/layer_3/feature_map.jpg}
+        \caption{第3层卷积层的输出特征图}
+    \end{figure}
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/feature/layer_4/feature_map.jpg}
+        \caption{第4层卷积层的输出特征图}
+    \end{figure}
+    \item t-SNE可视化最后一层隐藏层的输出特征
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/tsne/tsne.jpg}
+    \end{figure}
+    对最后一层隐藏层输出特征的t-SNE可视化结果表明，不同类别的输入经过网络的非线性变换后，已经被映射到了不同的聚类中。
+    \item STN学习到的变换
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=\linewidth]{img/stn/stn.jpg}
+    \end{figure}
+    网络尽可能将所有的路牌都变换到了同样的倾斜角度。
+\end{enumerate}
+
+\section{遇到的问题与解决办法}
+在自定义STN网络的时候，我最开始使用了比较小的卷积核，STN的效果很差，启用后正确率只有80\%；之后，我分析认为STN主要需要感知整张图片的倾斜以及旋转情况，需要较大的感受野，因此改用了较大的卷积核，最终得到了比较理想的效果。
+
+完成作业没有使用大模型。
+% \section{自选课题工作进度汇报}

 \end{document}

--- a/j.ps1
+++ b/j.ps1
@@ -0,0 +1 @@
+cd ./hw2/code
--- a/testtorch.ipynb
+++ b/testtorch.ipynb
@@ -0,0 +1,178 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "import torchvision.transforms as transforms"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ConvBlock(nn.Module):\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        in_channels,\n",
+    "        out_channels,\n",
+    "        kernel_size,\n",
+    "        stride,\n",
+    "        padding,\n",
+    "        use_batch_norm=False,\n",
+    "        use_residual=False,\n",
+    "    ):\n",
+    "        \"\"\"\n",
+    "        Convolutional block with batch normalization and ReLU activation\n",
+    "        ----------------------\n",
+    "        :param in_channels: channel number of input image\n",
+    "        :param out_channels: channel number of output image\n",
+    "        :param kernel_size: size of convolutional kernel\n",
+    "        :param stride: stride of convolutional operation\n",
+    "        :param padding: padding of convolutional operation\n",
+    "        :param use_batch_norm: whether to use batch normalization in convolutional layers\n",
+    "        :param use_residual: whether to use residual connection\n",
+    "        \"\"\"\n",
+    "        super().__init__()\n",
+    "\n",
+    "        if use_batch_norm:\n",
+    "            bn2d = nn.BatchNorm2d\n",
+    "        else:\n",
+    "            # use identity function to replace batch normalization\n",
+    "            bn2d = nn.Identity\n",
+    "\n",
+    "        self.use_residual = use_residual\n",
+    "\n",
+    "        # >>> TODO 2.1: complete a convolutional block with batch normalization and ReLU activation\n",
+    "        # Hint: use the `bn2d` defined above for batch normalization to adapt to the input parameter `use_batch_norm`\n",
+    "        # Network structure:\n",
+    "        # conv -> batchnorm -> relu\n",
+    "        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding)\n",
+    "        self.bn = bn2d(out_channels)\n",
+    "        self.relu = nn.ReLU()\n",
+    "        # <<< TODO 2.1\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        # >>> TODO 2.2: forward process\n",
+    "        # Hint: apply residual connection if `self.use_residual` is True\n",
+    "        out = self.relu(self.bn(self.conv(x)))\n",
+    "        if self.use_residual:\n",
+    "            out = out + x  # out-of-place: ReLU's output is saved for backward, `+=` would break autograd\n",
+    "\n",
+    "        # <<< TODO 2.2\n",
+    "        return out\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "in_channels = 3\n",
+    "dropout_prob = 0.5\n",
+    "conv_net = nn.Sequential(\n",
+    "    ConvBlock(\n",
+    "        in_channels=in_channels, out_channels=32, kernel_size=5, stride=1, padding=2\n",
+    "    ),\n",
+    "    ConvBlock(in_channels=32, out_channels=64, kernel_size=5, stride=2, padding=2),\n",
+    "    nn.MaxPool2d(kernel_size=2, stride=2, padding=0),\n",
+    "    ConvBlock(\n",
+    "        in_channels=64,\n",
+    "        out_channels=64,\n",
+    "        kernel_size=3,\n",
+    "        stride=1,\n",
+    "        padding=1,\n",
+    "        use_residual=True,\n",
+    "    ),\n",
+    "    ConvBlock(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),\n",
+    "    nn.MaxPool2d(kernel_size=2, stride=2, padding=0),\n",
+    "    ConvBlock(\n",
+    "        in_channels=128,\n",
+    "        out_channels=128,\n",
+    "        kernel_size=3,\n",
+    "        stride=1,\n",
+    "        padding=1,\n",
+    "        use_residual=True,\n",
+    "    ),\n",
+    "    nn.Dropout2d(p=dropout_prob),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([10, 128, 4, 4])\n",
+      "ConvBlock(\n",
+      "  (conv): Conv2d(32, 64, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))\n",
+      "  (bn): Identity()\n",
+      "  (relu): ReLU()\n",
+      ")\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = torch.randn(10, 3, 32, 32)\n",
+    "print(conv_net(a).size())\n",
+    "print(conv_net[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([10, 8, 16, 16])\n",
+      "torch.Size([10, 16, 8, 8])\n"
+     ]
+    }
+   ],
+   "source": [
+    "conv_1 = ConvBlock(in_channels=3, out_channels=8, kernel_size=9, stride=2, padding=4, use_batch_norm=True)\n",
+    "conv_2 = ConvBlock(in_channels=8, out_channels=16, kernel_size=5, stride=2, padding=2, use_batch_norm=True)\n",
+    "\n",
+    "print(conv_1(a).size())\n",
+    "print(conv_2(conv_1(a)).size())\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "media_cognition",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}