Complete.
1
.gitignore
vendored
@@ -9,3 +9,4 @@ __pycache__/
|
|||||||
*.pdf
|
*.pdf
|
||||||
.DS_Store
|
.DS_Store
|
||||||
hw2/code/checkpoints/
|
hw2/code/checkpoints/
|
||||||
|
hw2/code/visualized/
|
||||||
@@ -52,7 +52,9 @@ def get_data_loader(
|
|||||||
# Consider what is an appropriate data augmentation technique for traffic sign classification.
|
# Consider what is an appropriate data augmentation technique for traffic sign classification.
|
||||||
if mode == "train" and augment:
|
if mode == "train" and augment:
|
||||||
# pass # TODO
|
# pass # TODO
|
||||||
data_transforms.append(transforms.AutoAugment())
|
# data_transforms.append(transforms.AutoAugment())
|
||||||
|
data_transforms.append(transforms.RandomAffine(degrees=30,shear=10))
|
||||||
|
data_transforms.append(transforms.RandomAutocontrast())
|
||||||
# Else, the `data_transforms` should be left unchanged
|
# Else, the `data_transforms` should be left unchanged
|
||||||
# <<< TODO 1.1
|
# <<< TODO 1.1
|
||||||
# Use `transforms.Compose` to compose the list of transforms into a single transform
|
# Use `transforms.Compose` to compose the list of transforms into a single transform
|
||||||
|
|||||||
@@ -229,8 +229,9 @@ class STN(nn.Module):
|
|||||||
# this network.
|
# this network.
|
||||||
# Suggested structure: 3 down-sampling convolutional layers with doubling output channels, using BN and ReLU.
|
# Suggested structure: 3 down-sampling convolutional layers with doubling output channels, using BN and ReLU.
|
||||||
self.localization_conv = nn.Sequential(
|
self.localization_conv = nn.Sequential(
|
||||||
ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
|
ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=9, stride=2, padding=4, use_batch_norm=True),
|
||||||
ConvBlock(in_channels=8, out_channels=16, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
|
# 8 * 13 * 13
|
||||||
|
ConvBlock(in_channels=8, out_channels=16, kernel_size=5, stride=2, padding=2, use_batch_norm=True),
|
||||||
ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
|
ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
|
||||||
# 32 * 4 * 4
|
# 32 * 4 * 4
|
||||||
)
|
)
|
||||||
@@ -240,10 +241,10 @@ class STN(nn.Module):
|
|||||||
# Hint: Combine linear layers and ReLU activation functions to build this network.
|
# Hint: Combine linear layers and ReLU activation functions to build this network.
|
||||||
# Suggested structure: 2 linear layers with one BN and ReLU.
|
# Suggested structure: 2 linear layers with one BN and ReLU.
|
||||||
self.localization_fc = nn.Sequential(
|
self.localization_fc = nn.Sequential(
|
||||||
nn.Linear(16, 256),
|
nn.Linear(32 * 4 * 4, 256),
|
||||||
nn.Linear(256, 6),
|
nn.ReLU(),
|
||||||
nn.BatchNorm1d(6),
|
nn.BatchNorm1d(256),
|
||||||
nn.ReLU()
|
nn.Linear(256, 6)
|
||||||
)
|
)
|
||||||
# <<< TODO 4.1
|
# <<< TODO 4.1
|
||||||
|
|
||||||
@@ -251,7 +252,7 @@ class STN(nn.Module):
|
|||||||
# Hint: The STN should generate the identity transformation by default before training.
|
# Hint: The STN should generate the identity transformation by default before training.
|
||||||
# How to initialize the weight/bias of the last linear layer of the fully connected network to
|
# How to initialize the weight/bias of the last linear layer of the fully connected network to
|
||||||
# achieve this goal?
|
# achieve this goal?
|
||||||
nn.init.zeros_(self.localization_fc[1].weight)
|
nn.init.zeros_(self.localization_fc[3].weight)
|
||||||
# <<< TODO 4.2
|
# <<< TODO 4.2
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
|
|||||||
BIN
hw2/report/img/augmentation.jpg
Normal file
|
After Width: | Height: | Size: 96 KiB |
BIN
hw2/report/img/feature/image.jpg
Normal file
|
After Width: | Height: | Size: 1.4 KiB |
BIN
hw2/report/img/feature/layer_0/feature_map.jpg
Normal file
|
After Width: | Height: | Size: 108 KiB |
BIN
hw2/report/img/feature/layer_1/feature_map.jpg
Normal file
|
After Width: | Height: | Size: 146 KiB |
BIN
hw2/report/img/feature/layer_2/feature_map.jpg
Normal file
|
After Width: | Height: | Size: 155 KiB |
BIN
hw2/report/img/feature/layer_3/feature_map.jpg
Normal file
|
After Width: | Height: | Size: 289 KiB |
BIN
hw2/report/img/feature/layer_4/feature_map.jpg
Normal file
|
After Width: | Height: | Size: 265 KiB |
BIN
hw2/report/img/filter/filter_layer_0.jpg
Normal file
|
After Width: | Height: | Size: 124 KiB |
BIN
hw2/report/img/filter/filter_layer_1.jpg
Normal file
|
After Width: | Height: | Size: 231 KiB |
BIN
hw2/report/img/filter/filter_layer_2.jpg
Normal file
|
After Width: | Height: | Size: 330 KiB |
BIN
hw2/report/img/filter/filter_layer_3.jpg
Normal file
|
After Width: | Height: | Size: 616 KiB |
BIN
hw2/report/img/filter/filter_layer_4.jpg
Normal file
|
After Width: | Height: | Size: 613 KiB |
BIN
hw2/report/img/models/bn/loss_and_acc.jpg
Normal file
|
After Width: | Height: | Size: 97 KiB |
BIN
hw2/report/img/models/bn_aug/loss_and_acc.jpg
Normal file
|
After Width: | Height: | Size: 107 KiB |
BIN
hw2/report/img/models/default/loss_and_acc.jpg
Normal file
|
After Width: | Height: | Size: 103 KiB |
BIN
hw2/report/img/models/dropout/loss_and_acc.jpg
Normal file
|
After Width: | Height: | Size: 108 KiB |
BIN
hw2/report/img/models/stn/loss_and_acc.jpg
Normal file
|
After Width: | Height: | Size: 106 KiB |
BIN
hw2/report/img/stn/stn.jpg
Normal file
|
After Width: | Height: | Size: 159 KiB |
BIN
hw2/report/img/tsne/tsne.jpg
Normal file
|
After Width: | Height: | Size: 83 KiB |
@@ -10,39 +10,48 @@
|
|||||||
\usepackage{enumitem}
|
\usepackage{enumitem}
|
||||||
\usepackage{graphicx}
|
\usepackage{graphicx}
|
||||||
\usepackage{listings}
|
\usepackage{listings}
|
||||||
\usepackage{color}
|
\usepackage{fontspec}
|
||||||
|
\usepackage{xcolor}
|
||||||
|
\usepackage{float}
|
||||||
|
% \usepackage{color}
|
||||||
|
|
||||||
|
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
|
||||||
|
\newfontfamily\cascadia{Cascadia Code}
|
||||||
|
|
||||||
\lstset{
|
\lstset{
|
||||||
basicstyle = \sffamily, % 基本代码风格
|
basicstyle = \small\codefont,
|
||||||
keywordstyle = \bfseries, % 关键字风格
|
% ---
|
||||||
commentstyle = \rmfamily\itshape, % 注释的风格,斜体
|
tabsize = 4,
|
||||||
stringstyle = \ttfamily, % 字符串风格
|
|
||||||
flexiblecolumns, % 别问为什么,加上这个
|
|
||||||
numbers = left, % 行号的位置在左边
|
|
||||||
showspaces = false, % 是否显示空格,显示了有点乱,所以不显示了
|
|
||||||
numberstyle = \zihao{-5}\ttfamily, % 行号的样式,小五号,tt等宽字体
|
|
||||||
showstringspaces = false,
|
showstringspaces = false,
|
||||||
captionpos = t, % 这段代码的名字所呈现的位置,t指的是top上面
|
numbers = left,
|
||||||
frame = lrtb, % 显示边框
|
numberstyle = \codefont,
|
||||||
|
% ---
|
||||||
|
breaklines = true,
|
||||||
|
captionpos = t,
|
||||||
|
% ---
|
||||||
|
frame = l,
|
||||||
|
flexiblecolumns,
|
||||||
}
|
}
|
||||||
|
|
||||||
\lstdefinestyle{Python}{
|
\lstdefinestyle{Python}{
|
||||||
language = Python, % 语言选Python
|
language = Python, % 语言选Python
|
||||||
basicstyle = \zihao{-5}\ttfamily,
|
|
||||||
numberstyle = \zihao{-5}\ttfamily,
|
|
||||||
keywordstyle = \color{blue},
|
keywordstyle = \color{blue},
|
||||||
keywordstyle = [2] \color{teal},
|
keywordstyle = [2] \color{teal},
|
||||||
stringstyle = \color{magenta},
|
stringstyle = \color{orange!80!black},
|
||||||
commentstyle = \color{red}\ttfamily,
|
commentstyle = \color{red},
|
||||||
breaklines = true, % 自动换行,建议不要写太长的行
|
identifierstyle = \color{blue!80!white},
|
||||||
columns = fixed, % 如果不加这一句,字间距就不固定,很丑,必须加
|
}
|
||||||
basewidth = 0.5em,
|
|
||||||
|
\lstdefinestyle{Bash}{
|
||||||
|
language = bash
|
||||||
}
|
}
|
||||||
\usepackage{subcaption}
|
\usepackage{subcaption}
|
||||||
\usepackage{booktabs} % toprule
|
\usepackage{booktabs} % toprule
|
||||||
\usepackage[mathcal]{eucal}
|
\usepackage[mathcal]{eucal}
|
||||||
\usepackage[thehwcnt = 2]{iidef}
|
\usepackage[thehwcnt = 2]{iidef}
|
||||||
|
|
||||||
|
\allowdisplaybreaks
|
||||||
|
|
||||||
\thecourseinstitute{清华大学电子工程系}
|
\thecourseinstitute{清华大学电子工程系}
|
||||||
\thecoursename{\textbf{媒体与认知} \space 课堂2}
|
\thecoursename{\textbf{媒体与认知} \space 课堂2}
|
||||||
\theterm{2023-2024学年春季学期}
|
\theterm{2023-2024学年春季学期}
|
||||||
@@ -54,13 +63,13 @@
|
|||||||
\centerline{\textbf{\Large{理论部分}}}
|
\centerline{\textbf{\Large{理论部分}}}
|
||||||
|
|
||||||
\section{单选题(15分)}
|
\section{单选题(15分)}
|
||||||
\subsection{\underline{A}}
|
\subsection{\underline{C}}
|
||||||
|
|
||||||
\subsection{\underline{D}}
|
\subsection{\underline{D}}
|
||||||
|
|
||||||
\subsection{\underline{D}}
|
\subsection{\underline{D}}
|
||||||
|
|
||||||
\subsection{\underline{D}}
|
\subsection{\underline{C}}
|
||||||
|
|
||||||
\subsection{\underline{B}}
|
\subsection{\underline{B}}
|
||||||
|
|
||||||
@@ -118,57 +127,58 @@ W=\left[ \begin{array}{cc}
|
|||||||
\begin{align*}
|
\begin{align*}
|
||||||
\frac{\partial L}{\partial X} & =
|
\frac{\partial L}{\partial X} & =
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
0.3 & 0.1 & 0\\
|
0.1 & -0.2 & 0\\
|
||||||
-0.4 & 0.2 & 0\\
|
-0.3 & 0.4 & 0\\
|
||||||
0 & 0 & 0
|
0 & 0 & 0
|
||||||
\end{bmatrix} \frac{\partial L}{\partial Y_{11}}
|
\end{bmatrix} \frac{\partial L}{\partial Y_{11}}
|
||||||
+
|
+
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
0 & 0.3 & 0.1\\
|
0 & 0.1 & -0.2\\
|
||||||
0 & -0.4 & 0.2\\
|
0 & -0.3 & 0.4\\
|
||||||
0 & 0 & 0
|
0 & 0 & 0
|
||||||
\end{bmatrix} \frac{\partial L}{\partial Y_{12}}\\
|
\end{bmatrix} \frac{\partial L}{\partial Y_{12}}\\
|
||||||
& \quad +
|
& \quad +
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
0 & 0 & 0\\
|
0 & 0 & 0\\
|
||||||
0.3 & 0.1 & 0\\
|
0.1 & -0.2 & 0\\
|
||||||
-0.4 & 0.2 & 0
|
-0.3 & 0.4 & 0
|
||||||
\end{bmatrix} \frac{\partial L}{\partial Y_{21}}
|
\end{bmatrix} \frac{\partial L}{\partial Y_{21}}
|
||||||
+
|
+
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
0 & 0 & 0\\
|
0 & 0 & 0\\
|
||||||
0 & 0.3 & 0.1\\
|
0 & 0.1 & -0.2\\
|
||||||
0 & -0.4 & 0.2
|
0 & -0.3 & 0.4
|
||||||
\end{bmatrix} \frac{\partial L}{\partial Y_{22}}\\
|
\end{bmatrix} \frac{\partial L}{\partial Y_{22}}\\
|
||||||
|
& = \mathrm{zeropad}(W) \ast \frac{\partial L}{\partial Y}\\
|
||||||
& =
|
& =
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
0.09 & 0.03 & 0\\
|
0.03 & -0.06 & 0\\
|
||||||
-0.12 & 0.06 & 0\\
|
-0.09 & 0.12 & 0\\
|
||||||
0 & 0 & 0
|
0 & 0 & 0
|
||||||
\end{bmatrix}
|
\end{bmatrix}
|
||||||
+
|
+
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
0 & 0.03 & 0.01\\
|
0 & 0.01 & -0.02\\
|
||||||
0 & -0.04 & 0.02\\
|
0 & -0.03 & 0.04\\
|
||||||
0 & 0 & 0
|
0 & 0 & 0
|
||||||
\end{bmatrix}\\
|
\end{bmatrix}\\
|
||||||
& \quad +
|
& \quad +
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
0 & 0 & 0\\
|
0 & 0 & 0\\
|
||||||
-0.12 & -0.04 & 0\\
|
-0.04 & 0.08 & 0\\
|
||||||
0.16 & -0.08 & 0
|
0.12 & -0.16 & 0
|
||||||
\end{bmatrix}
|
\end{bmatrix}
|
||||||
+
|
+
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
0 & 0 & 0\\
|
0 & 0 & 0\\
|
||||||
0 & 0.06 & 0.02\\
|
0 & 0.02 & -0.04\\
|
||||||
0 & -0.08 & 0.04
|
0 & -0.06 & 0.08
|
||||||
\end{bmatrix}\\
|
\end{bmatrix}\\
|
||||||
& =
|
& =
|
||||||
\begin{bmatrix}
|
\begin{bmatrix}
|
||||||
0.09 & 0.06 & 0.01\\
|
0.03 & -0.05 & -0.02\\
|
||||||
-0.24 & 0.04 & 0.04\\
|
-0.13 & 0.19 & 0\\
|
||||||
0.16 & -0.16 & 0.04
|
0.12 & -0.22 & 0.08
|
||||||
\end{bmatrix} \qedhere
|
\end{bmatrix} \qedhere
|
||||||
\end{align*}
|
\end{align*}
|
||||||
\end{proof}
|
\end{proof}
|
||||||
@@ -178,7 +188,153 @@ W=\left[ \begin{array}{cc}
|
|||||||
|
|
||||||
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
|
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
|
||||||
\section{编程作业报告}
|
\section{编程作业报告}
|
||||||
\section{自选课题工作进度汇报}
|
\subsection{探究batch normalization和dropout的作用}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item 使用默认配置训练模型:
|
||||||
|
\begin{lstlisting}[style=Bash]
|
||||||
|
python train.py --ckpt_path checkpoints/default
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/models/default/loss_and_acc.jpg}
|
||||||
|
\end{figure}
|
||||||
|
之后测试得到的正确率为90.8\%。
|
||||||
|
\item 启用batch normalization:
|
||||||
|
\begin{lstlisting}[style=Bash]
|
||||||
|
python train.py --ckpt_path checkpoints/bn --bn
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/models/bn/loss_and_acc.jpg}
|
||||||
|
\end{figure}
|
||||||
|
测试得到的正确率为95.9\%。
|
||||||
|
\item 启用dropout并设置概率为0.3:
|
||||||
|
\begin{lstlisting}[style=Bash]
|
||||||
|
python train.py --ckpt_path checkpoints/dropout --dropout 0.3
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/models/dropout/loss_and_acc.jpg}
|
||||||
|
\end{figure}
|
||||||
|
测试后得到的正确率为94.1\%。
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\subsection{探究数据增广的作用}
|
||||||
|
考虑到在不同的视角下,交通标志可能有旋转或者变形,因此使用
|
||||||
|
\begin{lstlisting}[style=Python]
|
||||||
|
transforms.RandomAffine(degrees=30,shear=10)
|
||||||
|
\end{lstlisting}
|
||||||
|
来对数据进行随机的形变与旋转;另外,考虑到可能在不同的光线条件下导致对比度变化,因此使用
|
||||||
|
\begin{lstlisting}[style=Python]
|
||||||
|
transforms.RandomAutocontrast()
|
||||||
|
\end{lstlisting}
|
||||||
|
来对数据进行随机的对比度调整。
|
||||||
|
|
||||||
|
执行
|
||||||
|
\begin{lstlisting}[style=Bash]
|
||||||
|
python unit_test.py data_loader
|
||||||
|
\end{lstlisting}
|
||||||
|
得到
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/augmentation.jpg}
|
||||||
|
\caption{数据增广后的结果}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
训练最优模型使用的命令为
|
||||||
|
\begin{lstlisting}[style=Bash]
|
||||||
|
python train.py --ckpt_path checkpoints/bn_aug --bn --augment --epoch 20
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/models/bn_aug/loss_and_acc.jpg}
|
||||||
|
\end{figure}
|
||||||
|
测试得到的正确率为96.0\%,略高于仅启用batch normalization、不使用数据增广时的95.9\%。
|
||||||
|
|
||||||
|
\subsection{探究空间变换网络(STN)的作用}
|
||||||
|
运行
|
||||||
|
\begin{lstlisting}[style=Bash]
|
||||||
|
python train.py --ckpt_path checkpoints/stn --bn --stn
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/models/stn/loss_and_acc.jpg}
|
||||||
|
\end{figure}
|
||||||
|
测试得到的正确率为94.6\%。与仅启用batch normalization、不使用STN时的95.9\%相比,正确率反而有所降低,可能是所设计的STN网络结构不够理想导致的。
|
||||||
|
|
||||||
|
\subsection{可视化}
|
||||||
|
\begin{enumerate}
|
||||||
|
\item 可视化各层卷积核:
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/filter/filter_layer_0.jpg}
|
||||||
|
\caption{第0层的卷积核}
|
||||||
|
\end{figure}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/filter/filter_layer_1.jpg}
|
||||||
|
\caption{第1层的卷积核}
|
||||||
|
\end{figure}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/filter/filter_layer_2.jpg}
|
||||||
|
\caption{第2层的卷积核}
|
||||||
|
\end{figure}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/filter/filter_layer_3.jpg}
|
||||||
|
\caption{第3层的卷积核}
|
||||||
|
\end{figure}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/filter/filter_layer_4.jpg}
|
||||||
|
\caption{第4层的卷积核}
|
||||||
|
\end{figure}
|
||||||
|
\item 可视化各层卷积层的输出特征图
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/feature/layer_0/feature_map.jpg}
|
||||||
|
\caption{第0层的卷积核特征图}
|
||||||
|
\end{figure}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/feature/layer_1/feature_map.jpg}
|
||||||
|
\caption{第1层的卷积核特征图}
|
||||||
|
\end{figure}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/feature/layer_2/feature_map.jpg}
|
||||||
|
\caption{第2层的卷积核特征图}
|
||||||
|
\end{figure}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/feature/layer_3/feature_map.jpg}
|
||||||
|
\caption{第3层的卷积核特征图}
|
||||||
|
\end{figure}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/feature/layer_4/feature_map.jpg}
|
||||||
|
\caption{第4层的卷积核特征图}
|
||||||
|
\end{figure}
|
||||||
|
\item t-SNE可视化最后一层隐藏层的输出特征
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/tsne/tsne.jpg}
|
||||||
|
\end{figure}
|
||||||
|
对最后一层隐藏层输出特征的t-SNE可视化结果表明,不同类别的输入经过网络的非线性变换后,已经被映射到了彼此分离的不同聚类中。
|
||||||
|
\item STN学习到的变换
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/stn/stn.jpg}
|
||||||
|
\end{figure}
|
||||||
|
网络尽可能将所有的路牌都变换到了同样的倾斜角度。
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\section{遇到的问题与解决办法}
|
||||||
|
在自定义STN网络时,我最初使用了比较小的卷积核,导致STN的效果很差,启用STN后正确率只有80\%;经过分析,我认为STN主要需要感知整张图片的倾斜以及旋转情况,因此需要较大的感受野,于是改用了较大的卷积核,最终得到了比较理想的效果。
|
||||||
|
|
||||||
|
完成作业没有使用大模型。
|
||||||
|
% \section{自选课题工作进度汇报}
|
||||||
|
|
||||||
\end{document}
|
\end{document}
|
||||||
|
|
||||||
|
|||||||
178
testtorch.ipynb
Normal file
@@ -0,0 +1,178 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import torch\n",
|
||||||
|
"import torch.nn as nn\n",
|
||||||
|
"import torch.nn.functional as F\n",
|
||||||
|
"\n",
|
||||||
|
"import torchvision.transforms as transforms"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"class ConvBlock(nn.Module):\n",
|
||||||
|
" def __init__(\n",
|
||||||
|
" self,\n",
|
||||||
|
" in_channels,\n",
|
||||||
|
" out_channels,\n",
|
||||||
|
" kernel_size,\n",
|
||||||
|
" stride,\n",
|
||||||
|
" padding,\n",
|
||||||
|
" use_batch_norm=False,\n",
|
||||||
|
" use_residual=False,\n",
|
||||||
|
" ):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Convolutional block with batch normalization and ReLU activation\n",
|
||||||
|
" ----------------------\n",
|
||||||
|
" :param in_channels: channel number of input image\n",
|
||||||
|
" :param out_channels: channel number of output image\n",
|
||||||
|
" :param kernel_size: size of convolutional kernel\n",
|
||||||
|
" :param stride: stride of convolutional operation\n",
|
||||||
|
" :param padding: padding of convolutional operation\n",
|
||||||
|
" :param use_batch_norm: whether to use batch normalization in convolutional layers\n",
|
||||||
|
" :param use_residual: whether to use residual connection\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" super().__init__()\n",
|
||||||
|
"\n",
|
||||||
|
" if use_batch_norm:\n",
|
||||||
|
" bn2d = nn.BatchNorm2d\n",
|
||||||
|
" else:\n",
|
||||||
|
" # use identity function to replace batch normalization\n",
|
||||||
|
" bn2d = nn.Identity\n",
|
||||||
|
"\n",
|
||||||
|
" self.use_residual = use_residual\n",
|
||||||
|
"\n",
|
||||||
|
" # >>> TODO 2.1: complete a convolutional block with batch normalization and ReLU activation\n",
|
||||||
|
" # Hint: use the `bn2d` defined above for batch normalization to adapt to the input parameter `use_batch_norm`\n",
|
||||||
|
" # Network structure:\n",
|
||||||
|
" # conv -> batchnorm -> relu\n",
|
||||||
|
" self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride=stride, padding=padding)\n",
|
||||||
|
" self.bn = bn2d(out_channels)\n",
|
||||||
|
" self.relu = nn.ReLU()\n",
|
||||||
|
" # <<< TODO 2.1\n",
|
||||||
|
"\n",
|
||||||
|
" def forward(self, x):\n",
|
||||||
|
" # >>> TODO 2.2: forward process\n",
|
||||||
|
" # Hint: apply residual connection if `self.use_residual` is True\n",
|
||||||
|
" out = self.relu(self.bn(self.conv(x)))\n",
|
||||||
|
" if self.use_residual:\n",
|
||||||
|
" out += x\n",
|
||||||
|
"\n",
|
||||||
|
" # <<< TODO 2.2\n",
|
||||||
|
" return out\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"in_channels = 3\n",
|
||||||
|
"dropout_prob = 0.5\n",
|
||||||
|
"conv_net = nn.Sequential(\n",
|
||||||
|
" ConvBlock(\n",
|
||||||
|
" in_channels=in_channels, out_channels=32, kernel_size=5, stride=1, padding=2\n",
|
||||||
|
" ),\n",
|
||||||
|
" ConvBlock(in_channels=32, out_channels=64, kernel_size=5, stride=2, padding=2),\n",
|
||||||
|
" nn.MaxPool2d(kernel_size=2, stride=2, padding=0),\n",
|
||||||
|
" ConvBlock(\n",
|
||||||
|
" in_channels=64,\n",
|
||||||
|
" out_channels=64,\n",
|
||||||
|
" kernel_size=3,\n",
|
||||||
|
" stride=1,\n",
|
||||||
|
" padding=1,\n",
|
||||||
|
" use_residual=True,\n",
|
||||||
|
" ),\n",
|
||||||
|
" ConvBlock(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),\n",
|
||||||
|
" nn.MaxPool2d(kernel_size=2, stride=2, padding=0),\n",
|
||||||
|
" ConvBlock(\n",
|
||||||
|
" in_channels=128,\n",
|
||||||
|
" out_channels=128,\n",
|
||||||
|
" kernel_size=3,\n",
|
||||||
|
" stride=1,\n",
|
||||||
|
" padding=1,\n",
|
||||||
|
" use_residual=True,\n",
|
||||||
|
" ),\n",
|
||||||
|
" nn.Dropout2d(p=dropout_prob),\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"torch.Size([10, 128, 4, 4])\n",
|
||||||
|
"ConvBlock(\n",
|
||||||
|
" (conv): Conv2d(32, 64, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))\n",
|
||||||
|
" (bn): Identity()\n",
|
||||||
|
" (relu): ReLU()\n",
|
||||||
|
")\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"a = torch.randn(10, 3, 32, 32)\n",
|
||||||
|
"print(conv_net(a).size())\n",
|
||||||
|
"print(conv_net[1])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"torch.Size([10, 8, 16, 16])\n",
|
||||||
|
"torch.Size([10, 16, 8, 8])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"conv_1 = ConvBlock(in_channels=3, out_channels=8, kernel_size=9, stride=2, padding=4, use_batch_norm=True)\n",
|
||||||
|
"conv_2 = ConvBlock(in_channels=8, out_channels=16, kernel_size=5, stride=2, padding=2, use_batch_norm=True)\n",
|
||||||
|
"\n",
|
||||||
|
"print(conv_1(a).size())\n",
|
||||||
|
"print(conv_2(conv_1(a)).size())\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "media_cognition",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||