Merge pull request 'Submit homework 3.' (#4) from homework3 into main

Reviewed-on: #4
2024-05-22 20:24:30 +08:00
parent 121ca13130 c850f38778
commit f1459069da
44 changed files with 3117 additions and 1318 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,14 @@
-*.zip
-__pycache__/
-*.pth
-*.log
-*.aux
-*.synctex.gz
-*.synctex.gz(buzy)
-*.out
-*.pdf
-.DS_Store
-hw2/code/checkpoints/
-hw2/code/visualized/
+*.zip
+__pycache__/
+*.pth
+*.log
+*.aux
+*.synctex.gz
+*.synctex.gz(busy)
+*.out
+*.pdf
+.DS_Store
+hw2/code/checkpoints/
+hw2/code/visualized/
+hw3/code/data/
+hw3/code/checkpoints/
--- a/hw1/.vscode/settings.json
+++ b/hw1/.vscode/settings.json
@@ -1,4 +1,4 @@
-{
-    "python.analysis.typeCheckingMode": "basic",
-    "python.analysis.autoImportCompletions": true
+{
+    "python.analysis.typeCheckingMode": "basic",
+    "python.analysis.autoImportCompletions": true
 }
--- a/hw1/HW1-Report/codes/1.1.out.txt
+++ b/hw1/HW1-Report/codes/1.1.out.txt
@@ -1,56 +1,56 @@
-Epoch 01: loss = inf
-Epoch 02: loss = inf
-Epoch 03: loss = 6.678
-Epoch 04: loss = 4.361
-Epoch 05: loss = 3.110
-Epoch 06: loss = 2.099
-Epoch 07: loss = 1.698
-Epoch 08: loss = 1.320
-Epoch 09: loss = 0.970
-Epoch 10: loss = 0.891
-Epoch 10: validation accuracy = 66.0%
-Epoch 11: loss = 0.817
-Epoch 12: loss = 0.723
-Epoch 13: loss = 0.512
-Epoch 14: loss = 0.353
-Epoch 15: loss = 0.202
-Epoch 16: loss = 0.182
-Epoch 17: loss = 0.184
-Epoch 18: loss = 0.191
-Epoch 19: loss = 0.175
-Epoch 20: loss = 0.166
-Epoch 20: validation accuracy = 68.0%
-Epoch 21: loss = 0.146
-Epoch 22: loss = 0.105
-Epoch 23: loss = 0.109
-Epoch 24: loss = 0.074
-Epoch 25: loss = 0.097
-Epoch 26: loss = 0.047
-Epoch 27: loss = 0.038
-Epoch 28: loss = 0.037
-Epoch 29: loss = 0.024
-Epoch 30: loss = 0.021
-Epoch 30: validation accuracy = 68.8%
-Epoch 31: loss = 0.019
-Epoch 32: loss = 0.024
-Epoch 33: loss = 0.023
-Epoch 34: loss = 0.014
-Epoch 35: loss = 0.013
-Epoch 36: loss = 0.012
-Epoch 37: loss = 0.011
-Epoch 38: loss = 0.013
-Epoch 39: loss = 0.013
-Epoch 40: loss = 0.016
-Epoch 40: validation accuracy = 70.5%
-Epoch 41: loss = 0.015
-Epoch 42: loss = 0.009
-Epoch 43: loss = 0.011
-Epoch 44: loss = 0.008
-Epoch 45: loss = 0.008
-Epoch 46: loss = 0.010
-Epoch 47: loss = 0.009
-Epoch 48: loss = 0.007
-Epoch 49: loss = 0.007
-Epoch 50: loss = 0.010
-Epoch 50: validation accuracy = 70.5%
+Epoch 01: loss = inf
+Epoch 02: loss = inf
+Epoch 03: loss = 6.678
+Epoch 04: loss = 4.361
+Epoch 05: loss = 3.110
+Epoch 06: loss = 2.099
+Epoch 07: loss = 1.698
+Epoch 08: loss = 1.320
+Epoch 09: loss = 0.970
+Epoch 10: loss = 0.891
+Epoch 10: validation accuracy = 66.0%
+Epoch 11: loss = 0.817
+Epoch 12: loss = 0.723
+Epoch 13: loss = 0.512
+Epoch 14: loss = 0.353
+Epoch 15: loss = 0.202
+Epoch 16: loss = 0.182
+Epoch 17: loss = 0.184
+Epoch 18: loss = 0.191
+Epoch 19: loss = 0.175
+Epoch 20: loss = 0.166
+Epoch 20: validation accuracy = 68.0%
+Epoch 21: loss = 0.146
+Epoch 22: loss = 0.105
+Epoch 23: loss = 0.109
+Epoch 24: loss = 0.074
+Epoch 25: loss = 0.097
+Epoch 26: loss = 0.047
+Epoch 27: loss = 0.038
+Epoch 28: loss = 0.037
+Epoch 29: loss = 0.024
+Epoch 30: loss = 0.021
+Epoch 30: validation accuracy = 68.8%
+Epoch 31: loss = 0.019
+Epoch 32: loss = 0.024
+Epoch 33: loss = 0.023
+Epoch 34: loss = 0.014
+Epoch 35: loss = 0.013
+Epoch 36: loss = 0.012
+Epoch 37: loss = 0.011
+Epoch 38: loss = 0.013
+Epoch 39: loss = 0.013
+Epoch 40: loss = 0.016
+Epoch 40: validation accuracy = 70.5%
+Epoch 41: loss = 0.015
+Epoch 42: loss = 0.009
+Epoch 43: loss = 0.011
+Epoch 44: loss = 0.008
+Epoch 45: loss = 0.008
+Epoch 46: loss = 0.010
+Epoch 47: loss = 0.009
+Epoch 48: loss = 0.007
+Epoch 49: loss = 0.007
+Epoch 50: loss = 0.010
+Epoch 50: validation accuracy = 70.5%
 Model saved in ./saved_models/default.pth
--- a/hw1/HW1-Report/codes/1.2.out.txt
+++ b/hw1/HW1-Report/codes/1.2.out.txt
@@ -1,2 +1,2 @@
-[Info] Load model from .\saved_models\default.pth
+[Info] Load model from .\saved_models\default.pth
 [Info] Test accuracy = 72.0%
--- a/hw1/HW1-Report/codes/2.2.out.txt
+++ b/hw1/HW1-Report/codes/2.2.out.txt
@@ -1,2 +1,2 @@
-[Info] Load model from .\saved_models\adam_optim.pth
+[Info] Load model from .\saved_models\adam_optim.pth
 [Info] Test accuracy = 85.0%
--- a/hw1/HW1-Report/codes/adam_optim_cuda.out.txt
+++ b/hw1/HW1-Report/codes/adam_optim_cuda.out.txt
@@ -1,56 +1,56 @@
-Epoch 01: loss = inf
-Epoch 02: loss = inf
-Epoch 03: loss = inf
-Epoch 04: loss = inf
-Epoch 05: loss = inf
-Epoch 06: loss = inf
-Epoch 07: loss = inf
-Epoch 08: loss = inf
-Epoch 09: loss = 3.250
-Epoch 10: loss = 2.567
-Epoch 10: validation accuracy = 59.0%
-Epoch 11: loss = 1.963
-Epoch 12: loss = 1.558
-Epoch 13: loss = 1.320
-Epoch 14: loss = 0.911
-Epoch 15: loss = 0.808
-Epoch 16: loss = 0.932
-Epoch 17: loss = 0.861
-Epoch 18: loss = 0.748
-Epoch 19: loss = 0.783
-Epoch 20: loss = 0.809
-Epoch 20: validation accuracy = 65.5%
-Epoch 21: loss = 0.678
-Epoch 22: loss = 0.757
-Epoch 23: loss = 0.747
-Epoch 24: loss = 0.660
-Epoch 25: loss = 0.536
-Epoch 26: loss = 0.506
-Epoch 27: loss = 0.577
-Epoch 28: loss = 0.600
-Epoch 29: loss = 0.681
-Epoch 30: loss = 0.604
-Epoch 30: validation accuracy = 68.0%
-Epoch 31: loss = 0.552
-Epoch 32: loss = 0.671
-Epoch 33: loss = 0.604
-Epoch 34: loss = 0.600
-Epoch 35: loss = 0.818
-Epoch 36: loss = 0.659
-Epoch 37: loss = 0.375
-Epoch 38: loss = 0.380
-Epoch 39: loss = 0.418
-Epoch 40: loss = 0.431
-Epoch 40: validation accuracy = 73.5%
-Epoch 41: loss = 0.551
-Epoch 42: loss = 0.488
-Epoch 43: loss = 0.350
-Epoch 44: loss = 0.287
-Epoch 45: loss = 0.294
-Epoch 46: loss = 0.463
-Epoch 47: loss = 0.438
-Epoch 48: loss = 0.392
-Epoch 49: loss = 0.325
-Epoch 50: loss = 0.332
-Epoch 50: validation accuracy = 80.8%
+Epoch 01: loss = inf
+Epoch 02: loss = inf
+Epoch 03: loss = inf
+Epoch 04: loss = inf
+Epoch 05: loss = inf
+Epoch 06: loss = inf
+Epoch 07: loss = inf
+Epoch 08: loss = inf
+Epoch 09: loss = 3.250
+Epoch 10: loss = 2.567
+Epoch 10: validation accuracy = 59.0%
+Epoch 11: loss = 1.963
+Epoch 12: loss = 1.558
+Epoch 13: loss = 1.320
+Epoch 14: loss = 0.911
+Epoch 15: loss = 0.808
+Epoch 16: loss = 0.932
+Epoch 17: loss = 0.861
+Epoch 18: loss = 0.748
+Epoch 19: loss = 0.783
+Epoch 20: loss = 0.809
+Epoch 20: validation accuracy = 65.5%
+Epoch 21: loss = 0.678
+Epoch 22: loss = 0.757
+Epoch 23: loss = 0.747
+Epoch 24: loss = 0.660
+Epoch 25: loss = 0.536
+Epoch 26: loss = 0.506
+Epoch 27: loss = 0.577
+Epoch 28: loss = 0.600
+Epoch 29: loss = 0.681
+Epoch 30: loss = 0.604
+Epoch 30: validation accuracy = 68.0%
+Epoch 31: loss = 0.552
+Epoch 32: loss = 0.671
+Epoch 33: loss = 0.604
+Epoch 34: loss = 0.600
+Epoch 35: loss = 0.818
+Epoch 36: loss = 0.659
+Epoch 37: loss = 0.375
+Epoch 38: loss = 0.380
+Epoch 39: loss = 0.418
+Epoch 40: loss = 0.431
+Epoch 40: validation accuracy = 73.5%
+Epoch 41: loss = 0.551
+Epoch 42: loss = 0.488
+Epoch 43: loss = 0.350
+Epoch 44: loss = 0.287
+Epoch 45: loss = 0.294
+Epoch 46: loss = 0.463
+Epoch 47: loss = 0.438
+Epoch 48: loss = 0.392
+Epoch 49: loss = 0.325
+Epoch 50: loss = 0.332
+Epoch 50: validation accuracy = 80.8%
 Model saved in .\saved_models\adam_optim_cuda.pth
--- a/hw1/HW1-Report/codes/self_test.out.txt
+++ b/hw1/HW1-Report/codes/self_test.out.txt
@@ -1,2 +1,2 @@
-[Info] Load model from .\saved_models\adam_optim_lr1e-3_epoch100_momentum10.pth
+[Info] Load model from .\saved_models\adam_optim_lr1e-3_epoch100_momentum10.pth
 [Info] Test accuracy = 88.8%
--- a/hw1/HW1-Report/codes/self_train.out.txt
+++ b/hw1/HW1-Report/codes/self_train.out.txt
@@ -1,111 +1,111 @@
-Epoch 01: loss = inf
-Epoch 02: loss = inf
-Epoch 03: loss = inf
-Epoch 04: loss = inf
-Epoch 05: loss = inf
-Epoch 06: loss = inf
-Epoch 07: loss = inf
-Epoch 08: loss = inf
-Epoch 09: loss = inf
-Epoch 10: loss = inf
-Epoch 10: validation accuracy = 40.2%
-Epoch 11: loss = inf
-Epoch 12: loss = inf
-Epoch 13: loss = inf
-Epoch 14: loss = inf
-Epoch 15: loss = inf
-Epoch 16: loss = inf
-Epoch 17: loss = 2.360
-Epoch 18: loss = 2.086
-Epoch 19: loss = 1.684
-Epoch 20: loss = 1.453
-Epoch 20: validation accuracy = 53.0%
-Epoch 21: loss = 1.174
-Epoch 22: loss = 1.046
-Epoch 23: loss = 0.859
-Epoch 24: loss = 0.740
-Epoch 25: loss = 0.663
-Epoch 26: loss = 0.495
-Epoch 27: loss = 0.566
-Epoch 28: loss = 0.521
-Epoch 29: loss = 0.470
-Epoch 30: loss = 0.363
-Epoch 30: validation accuracy = 59.0%
-Epoch 31: loss = 0.365
-Epoch 32: loss = 0.305
-Epoch 33: loss = 0.333
-Epoch 34: loss = 0.293
-Epoch 35: loss = 0.191
-Epoch 36: loss = 0.295
-Epoch 37: loss = 0.275
-Epoch 38: loss = 0.461
-Epoch 39: loss = 0.509
-Epoch 40: loss = 0.298
-Epoch 40: validation accuracy = 65.2%
-Epoch 41: loss = 0.186
-Epoch 42: loss = 0.395
-Epoch 43: loss = 0.323
-Epoch 44: loss = 0.309
-Epoch 45: loss = 0.199
-Epoch 46: loss = 0.285
-Epoch 47: loss = 0.290
-Epoch 48: loss = 0.302
-Epoch 49: loss = 0.235
-Epoch 50: loss = 0.190
-Epoch 50: validation accuracy = 71.2%
-Epoch 51: loss = 0.294
-Epoch 52: loss = 0.311
-Epoch 53: loss = 0.254
-Epoch 54: loss = 0.289
-Epoch 55: loss = 0.264
-Epoch 56: loss = 0.213
-Epoch 57: loss = 0.166
-Epoch 58: loss = 0.218
-Epoch 59: loss = 0.231
-Epoch 60: loss = 0.283
-Epoch 60: validation accuracy = 74.8%
-Epoch 61: loss = 0.324
-Epoch 62: loss = 0.245
-Epoch 63: loss = 0.277
-Epoch 64: loss = 0.286
-Epoch 65: loss = 0.255
-Epoch 66: loss = 0.263
-Epoch 67: loss = 0.272
-Epoch 68: loss = 0.272
-Epoch 69: loss = 0.260
-Epoch 70: loss = 0.271
-Epoch 70: validation accuracy = 79.0%
-Epoch 71: loss = 0.310
-Epoch 72: loss = 0.301
-Epoch 73: loss = 0.305
-Epoch 74: loss = 0.311
-Epoch 75: loss = 0.329
-Epoch 76: loss = 0.295
-Epoch 77: loss = 0.300
-Epoch 78: loss = 0.316
-Epoch 79: loss = 0.326
-Epoch 80: loss = 0.352
-Epoch 80: validation accuracy = 77.5%
-Epoch 81: loss = 0.344
-Epoch 82: loss = 0.326
-Epoch 83: loss = 0.326
-Epoch 84: loss = 0.335
-Epoch 85: loss = 0.342
-Epoch 86: loss = 0.361
-Epoch 87: loss = 0.337
-Epoch 88: loss = 0.339
-Epoch 89: loss = 0.339
-Epoch 90: loss = 0.341
-Epoch 90: validation accuracy = 82.8%
-Epoch 91: loss = 0.350
-Epoch 92: loss = 0.359
-Epoch 93: loss = 0.352
-Epoch 94: loss = 0.363
-Epoch 95: loss = 0.347
-Epoch 96: loss = 0.341
-Epoch 97: loss = 0.336
-Epoch 98: loss = 0.348
-Epoch 99: loss = 0.365
-Epoch 100: loss = 0.350
-Epoch 100: validation accuracy = 85.2%
+Epoch 01: loss = inf
+Epoch 02: loss = inf
+Epoch 03: loss = inf
+Epoch 04: loss = inf
+Epoch 05: loss = inf
+Epoch 06: loss = inf
+Epoch 07: loss = inf
+Epoch 08: loss = inf
+Epoch 09: loss = inf
+Epoch 10: loss = inf
+Epoch 10: validation accuracy = 40.2%
+Epoch 11: loss = inf
+Epoch 12: loss = inf
+Epoch 13: loss = inf
+Epoch 14: loss = inf
+Epoch 15: loss = inf
+Epoch 16: loss = inf
+Epoch 17: loss = 2.360
+Epoch 18: loss = 2.086
+Epoch 19: loss = 1.684
+Epoch 20: loss = 1.453
+Epoch 20: validation accuracy = 53.0%
+Epoch 21: loss = 1.174
+Epoch 22: loss = 1.046
+Epoch 23: loss = 0.859
+Epoch 24: loss = 0.740
+Epoch 25: loss = 0.663
+Epoch 26: loss = 0.495
+Epoch 27: loss = 0.566
+Epoch 28: loss = 0.521
+Epoch 29: loss = 0.470
+Epoch 30: loss = 0.363
+Epoch 30: validation accuracy = 59.0%
+Epoch 31: loss = 0.365
+Epoch 32: loss = 0.305
+Epoch 33: loss = 0.333
+Epoch 34: loss = 0.293
+Epoch 35: loss = 0.191
+Epoch 36: loss = 0.295
+Epoch 37: loss = 0.275
+Epoch 38: loss = 0.461
+Epoch 39: loss = 0.509
+Epoch 40: loss = 0.298
+Epoch 40: validation accuracy = 65.2%
+Epoch 41: loss = 0.186
+Epoch 42: loss = 0.395
+Epoch 43: loss = 0.323
+Epoch 44: loss = 0.309
+Epoch 45: loss = 0.199
+Epoch 46: loss = 0.285
+Epoch 47: loss = 0.290
+Epoch 48: loss = 0.302
+Epoch 49: loss = 0.235
+Epoch 50: loss = 0.190
+Epoch 50: validation accuracy = 71.2%
+Epoch 51: loss = 0.294
+Epoch 52: loss = 0.311
+Epoch 53: loss = 0.254
+Epoch 54: loss = 0.289
+Epoch 55: loss = 0.264
+Epoch 56: loss = 0.213
+Epoch 57: loss = 0.166
+Epoch 58: loss = 0.218
+Epoch 59: loss = 0.231
+Epoch 60: loss = 0.283
+Epoch 60: validation accuracy = 74.8%
+Epoch 61: loss = 0.324
+Epoch 62: loss = 0.245
+Epoch 63: loss = 0.277
+Epoch 64: loss = 0.286
+Epoch 65: loss = 0.255
+Epoch 66: loss = 0.263
+Epoch 67: loss = 0.272
+Epoch 68: loss = 0.272
+Epoch 69: loss = 0.260
+Epoch 70: loss = 0.271
+Epoch 70: validation accuracy = 79.0%
+Epoch 71: loss = 0.310
+Epoch 72: loss = 0.301
+Epoch 73: loss = 0.305
+Epoch 74: loss = 0.311
+Epoch 75: loss = 0.329
+Epoch 76: loss = 0.295
+Epoch 77: loss = 0.300
+Epoch 78: loss = 0.316
+Epoch 79: loss = 0.326
+Epoch 80: loss = 0.352
+Epoch 80: validation accuracy = 77.5%
+Epoch 81: loss = 0.344
+Epoch 82: loss = 0.326
+Epoch 83: loss = 0.326
+Epoch 84: loss = 0.335
+Epoch 85: loss = 0.342
+Epoch 86: loss = 0.361
+Epoch 87: loss = 0.337
+Epoch 88: loss = 0.339
+Epoch 89: loss = 0.339
+Epoch 90: loss = 0.341
+Epoch 90: validation accuracy = 82.8%
+Epoch 91: loss = 0.350
+Epoch 92: loss = 0.359
+Epoch 93: loss = 0.352
+Epoch 94: loss = 0.363
+Epoch 95: loss = 0.347
+Epoch 96: loss = 0.341
+Epoch 97: loss = 0.336
+Epoch 98: loss = 0.348
+Epoch 99: loss = 0.365
+Epoch 100: loss = 0.350
+Epoch 100: validation accuracy = 85.2%
 Model saved in .\saved_models\adam_optim_lr1e-3_epoch100_momentum10.pth
--- a/hw1/HW1-Report/main.tex
+++ b/hw1/HW1-Report/main.tex
@@ -1,244 +1,244 @@
-% Homework Template
-\documentclass[a4paper]{article}
-\usepackage{ctex}
-\usepackage{amsmath, amssymb, amsthm}
-\usepackage{moreenum}
-\usepackage{mathtools}
-\usepackage{url}
-\usepackage{bm}
-\usepackage{enumitem}
-\usepackage{graphicx}
-\usepackage{subcaption}
-\usepackage{booktabs} % toprule
-\usepackage[mathcal]{eucal}
-\usepackage[thehwcnt = 1]{iidef}
-\usepackage{listings}
-\usepackage[x11names]{xcolor}
-\usepackage{float}
-\usepackage[colorlinks, linkcolor=black, anchorcolor=green, citecolor=blue]{hyperref}
-
-\DeclareMathOperator{\arctanh}{arctanh}
-% \DeclareMathOperator{\diag}{diag}
-
-\setenumerate[1]{label=(\arabic{*})}
-\setenumerate[2]{label=\arabic{*})}
-
-\definecolor{codekeyword}{RGB}{171, 0, 216}
-\definecolor{codetypename}{RGB}{29, 37, 251}
-\definecolor{codevariable}{RGB}{10, 23, 126}
-\definecolor{codestring}{RGB}{157, 0, 25}
-\definecolor{codecomment}{RGB}{31, 129, 19}
-
-\newfontfamily\cascadia[Ligatures=ResetAll]{Cascadia Code}
-% \newfontfamily\codefont[Ligatures=ResetAll]{Cascadia Code}
-\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
-% To enable ligature in listing, go check lstfiracode's github page and copy firacodestyle's settings.
-
-\lstset{
-    basicstyle          =   \small\codefont,
-    % ---
-    tabsize             =   4,
-    showstringspaces    =   false,
-    numbers             =   left,
-    numberstyle         =   \cascadia,
-    % ---
-    breaklines          =   true,
-    captionpos          =   t,      
-    % ---
-    frame               =   l,
-    flexiblecolumns,
-    columns = fixed,
-}
-
-\thecourseinstitute{清华大学电子工程系}
-\thecoursename{\textbf{媒体与认知} \space 课堂2}
-\theterm{2023-2024学年春季学期}
-\hwname{作业}
-\begin{document}
-\courseheader
-% 请在YOUR NAME处填写自己的姓名
-\name{高艺轩}
-\vspace{3mm}
-\centerline{\textbf{\Large{理论部分}}}
-
-\section{单选题（15分）}
-% 请在？处填写答案
-\subsection{\underline{B}}
-
-\subsection{\underline{A}}
-
-\subsection{\underline{B}}
-
-\subsection{\underline{A}}
-
-\subsection{\underline{B}}
-
-\section{计算题（15 分）}
-\subsection{设隐含层为$\mathbf{z}=\mathbf{W}^T\mathbf{x}+\mathbf{b}$，其中$\mathbf{x}\in R^{(m \times 1)}$，$\mathbf{z}\in R^{(n\times 1)}$，$\mathbf{W}\in R^{(m\times n)}$，$\mathbf{b} \in R^{(n\times 1)}$均为已知，其激活函数如下：
-$$\mathbf{y}=\delta(\mathbf{z})=tanh(\mathbf{z})$$
-tanh表示双曲正切函数。若训练过程中的目标函数为L，且已知L对$\mathbf{y}$的导数 $\frac{\partial L}{\partial \mathbf{y}}=[\frac{\partial L}{\partial y_1},\frac{\partial L}{\partial y_2},...,\frac{\partial L}{\partial y_n}]^T$和$\mathbf{y}=[y_1,y_2,...,y_n]^T$的值。
-}
-\subsubsection{请使用$\mathbf{y}$表示出$\frac{\partial \mathbf{y}^T}{\partial \mathbf{z}}$, 这里的$\mathbf{y}^T$ 为行向量。
-}
-
-\begin{proof}[解]
-    首先，对$i \neq j$，$\dfrac{\partial y_i}{\partial z_j} = 0$。
-    
-    同时$y_i = \tanh(z_i) = \tanh(\arctanh(y_i))$，因此
-    \[\frac{\partial y_i}{\partial z_i} = 1 - \tanh^2(z_i) = 1 - y_i^2\]
-    因此
-    \[\dfrac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \qedhere\]
-\end{proof}
-
-\subsubsection{请使用$\mathbf{y}$和$\frac{\partial L}{\partial \mathbf{y}}$表示$\frac{\partial L}{\partial \mathbf{x}}$，$\frac{\partial L}{\partial \mathbf{W}}$，$\frac{\partial L}{\partial \mathbf{b}}$。
-}
-提示:$\frac{\partial L}{\partial \mathbf{x}}$，$\frac{\partial L}{\partial \mathbf{W}}$，$\frac{\partial L}{\partial \mathbf{b}}$与x,W,b具有相同维度。
-
-\begin{proof}[解]
-    由链式法则
-    \[\frac{\partial L}{\partial \boldsymbol{x}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{x}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = W \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}\]
-
-    对于$\dfrac{\partial L}{\partial W}$，
-    \[\frac{\partial \boldsymbol{z}^T}{\partial W} = \begin{bmatrix}
-        \boldsymbol{x} & \boldsymbol{x} & \cdots & \boldsymbol{x}
-    \end{bmatrix}_{m \times n}\]
-
-    \begin{align*}
-        \frac{\partial L}{\partial W} & = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial W} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}}\\
-        & = \begin{bmatrix}
-        \boldsymbol{x} & \boldsymbol{x} & \cdots & \boldsymbol{x}
-    \end{bmatrix}_{m \times n} \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}
-    \end{align*}
-
-    对于$\dfrac{\partial L}{\partial \boldsymbol{b}}$，由链式法则
-    \[\frac{\partial L}{\partial \boldsymbol{b}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{b}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = I_n \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}} \qedhere\]
-\end{proof}
-\vspace{6mm}
-\centerline{\textbf{\Large{编程部分}}}
-
-
-\vspace{3mm}
-% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
-\section{编程作业报告}
-% 请在此处完成编程作业报告
-完成后的代码也可以在 \href{https://git.unlockableworld.com/unlockable/MediaNCognition}{\url{https://git.unlockableworld.com/unlockable/MediaNCognition}}中找到。
-\begin{enumerate}
-    \item 使用默认配置进行训练和测试。
-    \begin{enumerate}
-        \item 训练模型。
-        
-        输入：
-        \lstinputlisting{codes/1.1.in.txt}
-        
-        输出：
-        \lstinputlisting{codes/1.1.out.txt}
-        \begin{figure}[H]
-            \centering
-            \includegraphics[width=0.9\linewidth]{img/1default_train.png}
-        \end{figure}
-
-        \item 测试模型。
-        
-        输入：
-        \lstinputlisting{codes/1.2.in.txt}
-
-        输出：
-        \lstinputlisting{codes/1.2.out.txt}
-    \end{enumerate}
-    \item 调整参数、使用Adam优化器训练并测试。
-    \begin{enumerate}
-        \item 训练模型。
-        
-        输入：
-        \lstinputlisting{codes/2.1.in.txt}
-
-        输出：
-        \lstinputlisting{codes/2.1.out.txt}
-        \begin{figure}[H]
-            \centering
-            \includegraphics[width=0.9\linewidth]{img/2adam_optim.png}
-        \end{figure}
-        \item 测试性能。
-        
-        输入：
-        \lstinputlisting{codes/2.2.in.txt}
-
-        输出：
-        \lstinputlisting{codes/2.2.out.txt}
-    \end{enumerate}
-
-    \item 使用效果最佳的模型测试。
-    经过简单的尝试，发现使用
-    \lstinputlisting{codes/self_train.in.txt}
-    可以使测试集准确率达到88.8\%，有略微的提升。训练的loss曲线：
-    \begin{figure}[H]
-        \centering
-        \includegraphics[width=.9\linewidth]{img/3found_best.png}
-    \end{figure}
-    使用它进行预测：
-    \begin{figure}[H]
-        \centering
-        \begin{subfigure}[b]{.3\linewidth}
-            \includegraphics[width=\linewidth]{img/predict/predict01.png}
-            \subcaption{预测：A}
-        \end{subfigure}
-        \hfill
-        \begin{subfigure}[b]{.3\linewidth}
-            \includegraphics[width=\linewidth]{img/predict/predict02.png}
-            \subcaption{预测：B}
-        \end{subfigure}
-        \hfill
-        \begin{subfigure}[b]{.3\linewidth}
-            \includegraphics[width=\linewidth]{img/predict/predict03.png}
-            \subcaption{预测：M}
-        \end{subfigure}
-
-        \begin{subfigure}[b]{.3\linewidth}
-            \includegraphics[width=\linewidth]{img/predict/predict04.png}
-            \subcaption{预测：R}
-        \end{subfigure}
-        \hfill
-        \begin{subfigure}[b]{.3\linewidth}
-            \includegraphics[width=\linewidth]{img/predict/predict05.png}
-            \subcaption{预测：M}
-        \end{subfigure}
-        \hfill
-        \begin{subfigure}[b]{.3\linewidth}
-            \includegraphics[width=\linewidth]{img/predict/predict06.png}
-            \subcaption{预测：O}
-        \end{subfigure}
-
-        \hfill
-        \begin{subfigure}[b]{.3\linewidth}
-            \includegraphics[width=\linewidth]{img/predict/predict07.png}
-            \subcaption{预测：B}
-        \end{subfigure}
-        \hfill
-        \begin{subfigure}[b]{.3\linewidth}
-            \includegraphics[width=\linewidth]{img/predict/predict08.png}
-            \subcaption{预测：W}
-        \end{subfigure}
-        \hfill
-    \end{figure}
-    \item 遇到的问题及解决方法
-    \begin{enumerate}
-        \item 代码中对灰度图像的矩阵进行标准化时，\lstinline{numpy}显示不能对\lstinline{NumpyGenericArray}进行对\lstinline{float}的\lstinline{/}操作。改用\lstinline{np.div()}解决了这个问题。
-        \item 在利用训练好的模型进行预测时，发现自己找到的大部分模型都预测错误；最后与训练集的图片进行了对比，发现主要问题是裁切字母时留下了过大的边距，导致模型不能正确理解输入。重新裁剪边框后，得到正确的结果。
-    \end{enumerate}
-    \item 建议：希望下次发布作业代码可以利用清华的git。
-\end{enumerate}
-
-
-
-
-% \section{自选课题开题报告}
-% 请在此处介绍自选课题
-
-\end{document}
-
-
-
-%%% Local Variables:
-%%% mode: late\rvx
-%%% TeX-master: t
-%%% End:
+% Homework Template
+\documentclass[a4paper]{article}
+\usepackage{ctex}
+\usepackage{amsmath, amssymb, amsthm}
+\usepackage{moreenum}
+\usepackage{mathtools}
+\usepackage{url}
+\usepackage{bm}
+\usepackage{enumitem}
+\usepackage{graphicx}
+\usepackage{subcaption}
+\usepackage{booktabs} % toprule
+\usepackage[mathcal]{eucal}
+\usepackage[thehwcnt = 1]{iidef}
+\usepackage{listings}
+\usepackage[x11names]{xcolor}
+\usepackage{float}
+\usepackage[colorlinks, linkcolor=black, anchorcolor=green, citecolor=blue]{hyperref}
+
+\DeclareMathOperator{\arctanh}{arctanh}
+% \DeclareMathOperator{\diag}{diag}
+
+\setenumerate[1]{label=(\arabic{*})}
+\setenumerate[2]{label=\arabic{*})}
+
+\definecolor{codekeyword}{RGB}{171, 0, 216}
+\definecolor{codetypename}{RGB}{29, 37, 251}
+\definecolor{codevariable}{RGB}{10, 23, 126}
+\definecolor{codestring}{RGB}{157, 0, 25}
+\definecolor{codecomment}{RGB}{31, 129, 19}
+
+\newfontfamily\cascadia[Ligatures=ResetAll]{Cascadia Code}
+% \newfontfamily\codefont[Ligatures=ResetAll]{Cascadia Code}
+\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
+% To enable ligature in listing, go check lstfiracode's github page and copy firacodestyle's settings.
+
+\lstset{
+    basicstyle          =   \small\codefont,
+    % ---
+    tabsize             =   4,
+    showstringspaces    =   false,
+    numbers             =   left,
+    numberstyle         =   \cascadia,
+    % ---
+    breaklines          =   true,
+    captionpos          =   t,      
+    % ---
+    frame               =   l,
+    flexiblecolumns,
+    columns = fixed,
+}
+
+\thecourseinstitute{清华大学电子工程系}
+\thecoursename{\textbf{媒体与认知} \space 课堂2}
+\theterm{2023-2024学年春季学期}
+\hwname{作业}
+\begin{document}
+\courseheader
+% 请在YOUR NAME处填写自己的姓名
+\name{高艺轩}
+\vspace{3mm}
+\centerline{\textbf{\Large{理论部分}}}
+
+\section{单选题（15分）}
+% 请在？处填写答案
+\subsection{\underline{B}}
+
+\subsection{\underline{A}}
+
+\subsection{\underline{B}}
+
+\subsection{\underline{A}}
+
+\subsection{\underline{B}}
+
+\section{计算题（15 分）}
+\subsection{设隐含层为$\mathbf{z}=\mathbf{W}^T\mathbf{x}+\mathbf{b}$，其中$\mathbf{x}\in R^{(m \times 1)}$，$\mathbf{z}\in R^{(n\times 1)}$，$\mathbf{W}\in R^{(m\times n)}$，$\mathbf{b} \in R^{(n\times 1)}$均为已知，其激活函数如下：
+$$\mathbf{y}=\delta(\mathbf{z})=tanh(\mathbf{z})$$
+tanh表示双曲正切函数。若训练过程中的目标函数为L，且已知L对$\mathbf{y}$的导数 $\frac{\partial L}{\partial \mathbf{y}}=[\frac{\partial L}{\partial y_1},\frac{\partial L}{\partial y_2},...,\frac{\partial L}{\partial y_n}]^T$和$\mathbf{y}=[y_1,y_2,...,y_n]^T$的值。
+}
+\subsubsection{请使用$\mathbf{y}$表示出$\frac{\partial \mathbf{y}^T}{\partial \mathbf{z}}$, 这里的$\mathbf{y}^T$ 为行向量。
+}
+
+\begin{proof}[解]
+    首先，对$i \neq j$，$\dfrac{\partial y_i}{\partial z_j} = 0$。
+    
+    同时$y_i = \tanh(z_i) = \tanh(\arctanh(y_i))$，因此
+    \[\frac{\partial y_i}{\partial z_i} = 1 - \tanh^2(z_i) = 1 - y_i^2\]
+    因此
+    \[\dfrac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \qedhere\]
+\end{proof}
+
+\subsubsection{请使用$\mathbf{y}$和$\frac{\partial L}{\partial \mathbf{y}}$表示$\frac{\partial L}{\partial \mathbf{x}}$，$\frac{\partial L}{\partial \mathbf{W}}$，$\frac{\partial L}{\partial \mathbf{b}}$。
+}
+提示:$\frac{\partial L}{\partial \mathbf{x}}$，$\frac{\partial L}{\partial \mathbf{W}}$，$\frac{\partial L}{\partial \mathbf{b}}$与x,W,b具有相同维度。
+
+\begin{proof}[解]
+    由链式法则
+    \[\frac{\partial L}{\partial \boldsymbol{x}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{x}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = W \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}\]
+
+    对于$\dfrac{\partial L}{\partial W}$，由$z_j = \sum_{i} W_{ij} x_i + b_j$可得$\dfrac{\partial z_j}{\partial W_{ij}} = x_i$，逐元素应用链式法则：
+    \[\frac{\partial L}{\partial W_{ij}} = \frac{\partial L}{\partial z_j} \frac{\partial z_j}{\partial W_{ij}}
+        = x_i \left(1 - y_j^2\right) \frac{\partial L}{\partial y_j},
+        \quad 1 \le i \le m,\ 1 \le j \le n\]
+
+    \begin{align*}
+        \frac{\partial L}{\partial W} & = \boldsymbol{x} \left(\frac{\partial L}{\partial \boldsymbol{z}}\right)^\mathrm{T}\\
+        & = \boldsymbol{x} \left(
+        \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}
+    \right)^\mathrm{T} \in R^{m \times n}
+    \end{align*}
+
+    对于$\dfrac{\partial L}{\partial \boldsymbol{b}}$，由链式法则
+    \[\frac{\partial L}{\partial \boldsymbol{b}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{b}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = I_n \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}} \qedhere\]
+\end{proof}
+\vspace{6mm}
+\centerline{\textbf{\Large{编程部分}}}
+
+
+\vspace{3mm}
+% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
+\section{编程作业报告}
+% 请在此处完成编程作业报告
+完成后的代码也可以在 \href{https://git.unlockableworld.com/unlockable/MediaNCognition}{\url{https://git.unlockableworld.com/unlockable/MediaNCognition}}中找到。
+\begin{enumerate}
+    \item 使用默认配置进行训练和测试。
+    \begin{enumerate}
+        \item 训练模型。
+        
+        输入：
+        \lstinputlisting{codes/1.1.in.txt}
+        
+        输出：
+        \lstinputlisting{codes/1.1.out.txt}
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.9\linewidth]{img/1default_train.png}
+        \end{figure}
+
+        \item 测试模型。
+        
+        输入：
+        \lstinputlisting{codes/1.2.in.txt}
+
+        输出：
+        \lstinputlisting{codes/1.2.out.txt}
+    \end{enumerate}
+    \item 调整参数、使用Adam优化器训练并测试。
+    \begin{enumerate}
+        \item 训练模型。
+        
+        输入：
+        \lstinputlisting{codes/2.1.in.txt}
+
+        输出：
+        \lstinputlisting{codes/2.1.out.txt}
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.9\linewidth]{img/2adam_optim.png}
+        \end{figure}
+        \item 测试性能。
+        
+        输入：
+        \lstinputlisting{codes/2.2.in.txt}
+
+        输出：
+        \lstinputlisting{codes/2.2.out.txt}
+    \end{enumerate}
+
+    \item 使用效果最佳的模型测试。
+    经过简单的尝试，发现使用
+    \lstinputlisting{codes/self_train.in.txt}
+    可以使测试集准确率达到88.8\%，有略微的提升。训练的loss曲线：
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=.9\linewidth]{img/3found_best.png}
+    \end{figure}
+    使用它进行预测：
+    \begin{figure}[H]
+        \centering
+        \begin{subfigure}[b]{.3\linewidth}
+            \includegraphics[width=\linewidth]{img/predict/predict01.png}
+            \subcaption{预测：A}
+        \end{subfigure}
+        \hfill
+        \begin{subfigure}[b]{.3\linewidth}
+            \includegraphics[width=\linewidth]{img/predict/predict02.png}
+            \subcaption{预测：B}
+        \end{subfigure}
+        \hfill
+        \begin{subfigure}[b]{.3\linewidth}
+            \includegraphics[width=\linewidth]{img/predict/predict03.png}
+            \subcaption{预测：M}
+        \end{subfigure}
+
+        \begin{subfigure}[b]{.3\linewidth}
+            \includegraphics[width=\linewidth]{img/predict/predict04.png}
+            \subcaption{预测：R}
+        \end{subfigure}
+        \hfill
+        \begin{subfigure}[b]{.3\linewidth}
+            \includegraphics[width=\linewidth]{img/predict/predict05.png}
+            \subcaption{预测：M}
+        \end{subfigure}
+        \hfill
+        \begin{subfigure}[b]{.3\linewidth}
+            \includegraphics[width=\linewidth]{img/predict/predict06.png}
+            \subcaption{预测：O}
+        \end{subfigure}
+
+        \hfill
+        \begin{subfigure}[b]{.3\linewidth}
+            \includegraphics[width=\linewidth]{img/predict/predict07.png}
+            \subcaption{预测：B}
+        \end{subfigure}
+        \hfill
+        \begin{subfigure}[b]{.3\linewidth}
+            \includegraphics[width=\linewidth]{img/predict/predict08.png}
+            \subcaption{预测：W}
+        \end{subfigure}
+        \hfill
+    \end{figure}
+    \item 遇到的问题及解决方法
+    \begin{enumerate}
+        \item 代码中对灰度图像的矩阵进行标准化时，\lstinline{numpy}提示不能对\lstinline{NumpyGenericArray}与\lstinline{float}进行\lstinline{/}运算。改用\lstinline{np.divide()}解决了这个问题。
+        \item 在利用训练好的模型进行预测时，发现自己找到的大部分模型都预测错误；最后与训练集的图片进行了对比，发现主要问题是裁切字母时留下了过大的边距，导致模型不能正确理解输入。重新裁剪边框后，得到正确的结果。
+    \end{enumerate}
+    \item 建议：希望下次发布作业代码可以利用清华的git。
+\end{enumerate}
+
+
+
+
+% \section{自选课题开题报告}
+% 请在此处介绍自选课题
+
+\end{document}
+
+
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% End:
--- a/hw1/HW1-code/activations.py
+++ b/hw1/HW1-code/activations.py
@@ -1,164 +1,164 @@
-#========================================================
-#             Media and Cognition
-#             Homework 1 Neural network basics
-#             activations.py - activation functions
-#             Student ID: 2022010639
-#             Name: Gao Yixuan
-#             Tsinghua University
-#             (C) Copyright 2024
-#========================================================
-import torch
-import torch.nn as nn
-
-'''
-In this script we will implement three activation functions, including both forward and backward processes.
-More details about customizing a backward process in PyTorch can be found in:
-https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
-'''
-
-## Here, Tanh is given as an example to show how to construct the activation function. Please finish the activation functions of Sigmoid and ReLU later.
-class Tanh(torch.autograd.Function):
-    '''
-    Tanh activation function
-    y = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-    '''
-    # static method of a python class means that we can call the function without initializing an instance of the class
-    @staticmethod
-    def forward(ctx, x):
-        '''
-        In the forward pass we receive a Tensor containing the input x and return
-        a Tensor containing the output. 
-        
-        ctx: it is a context object that can be used to save information for backward computation. You can save 
-        objects by using ctx.save_for_backward, and get objects by using ctx.saved_tensors
-
-        x: input with arbitrary shape
-        '''
-        # Please think if we use "y = (exp(x) - exp(-x)) / (exp(x) + exp(-x))", what might happen when x has a large absolute value
-        # y = (torch.exp(x) - torch.exp(-x)) / (torch.exp(x) + torch.exp(-x))
-
-        # here we directly use torch.tanh(x) to avoid the problem above
-        y = torch.tanh(x)
-
-        # save an variable in ctx
-        ctx.save_for_backward(y)
-
-        return y
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        """
-        In the backward pass we receive a Tensor containing the gradient of the loss
-        with respect to the output, and we need to compute the gradient of the loss
-        with respect to the input.
-
-        grad_output: dL/dy
-        grad_input: dL/dx = dL/dy * dy/dx, where y = forward(x)
-        """
-        # get an variable from ctx
-        y, = ctx.saved_tensors
-
-        # chain rule: dL/dx = dL/dy * dy/dx
-        # where dL/dy = grad_output, and the dy/dx of tanh function is (1-y^2)!
-        grad_input = grad_output * (1 - y ** 2)
-
-        return grad_input
-
-#TODO 1: complete the forward and backward functions of the Sigmoid activation function.
-#Note: You can refer to the activation function Tanh
-class Sigmoid(torch.autograd.Function):
-    '''
-    Sigmoid activation function
-    y = 1 / (1 + exp(-x))
-    '''
-
-    @staticmethod
-    def forward(ctx, x):
-
-        # hint: you can use torch.exp(x) to calculate exp(x)
-        y = 1 - (1 + torch.exp(-x))
-
-        # here we save y in ctx, in this way we can use y to calculate gradients in backward process
-        ctx.save_for_backward(y)
-
-        return y
-
-    @staticmethod
-    def backward(ctx, grad_output):
-
-        # get y from ctx
-        y, = ctx.saved_tensors
-
-        # implement gradient of x (grad_input), grad_input refers to dL/dx
-        # chain rule: dL/dx = dL/dy * dy/dx
-        # where dL/dy = grad_output, and dy/dx of Sigmoid function is y * (1 - y)
-        grad_input = grad_output * y * (1 - y)
-
-        return grad_input
-
-#TODO 2: complete the forward and backward functions of the ReLU activation function.
-#Note: You can refer to the activation function Tanh
-class ReLU(torch.autograd.Function):
-    '''
-    ReLU activation function
-    y = max{x, 0}
-    '''
-
-    @staticmethod
-    def forward(ctx, x):
-
-        # set elements less than 0 in x to 0
-        # this operation is inplace
-        x = torch.max(x, torch.tensor([0.]).to(x.device))
-
-        # save x in ctx, in this way we can use x to calculate gradients in backward process
-        ctx.save_for_backward(x)
-
-        # return the output
-        return x
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        """
-        In the backward pass we receive a Tensor containing the gradient of the loss
-        with respect to the output, and we need to compute the gradient of the loss
-        with respect to the input.
-        """
-
-        # get x from ctx
-        x, = ctx.saved_tensors
-        # print("Before heaviside")
-        # print(x, x.size())
-        x = torch.heaviside(x, torch.tensor([0.]).to(x.device))
-        # print("After heaviside")
-        # print(x, x.size())
-        # print(grad_output, grad_output.size())
-        # print(grad_output * x)
-
-        # chain rule: dL/dx = dL/dy * dy/dx
-        # where dL/dy = grad_output, and dy/dx of ReLU function is 1 if x > 0, and 0 if x <= 0
-        grad_input = grad_output * x
-        
-        return grad_input
-
-
-# activate function class according to the type
-class Activation(nn.Module):
-    def __init__(self, type):
-        '''
-        :param type:  'sigmoid', 'tanh', or 'relu'
-        '''
-        super().__init__()
-
-        if type == 'sigmoid':
-            self.act = Sigmoid.apply
-        elif type == 'tanh':
-            self.act = Tanh.apply
-        elif type == 'relu':
-            self.act = ReLU.apply
-        else:
-            print('activation type should be one of [sigmoid, tanh, relu]')
-            raise NotImplementedError
-
-    def forward(self, x):
-        return self.act(x)
+#========================================================
+#             Media and Cognition
+#             Homework 1 Neural network basics
+#             activations.py - activation functions
+#             Student ID: 2022010639
+#             Name: Gao Yixuan
+#             Tsinghua University
+#             (C) Copyright 2024
+#========================================================
+import torch
+import torch.nn as nn
+
+'''
+In this script we will implement three activation functions, including both forward and backward processes.
+More details about customizing a backward process in PyTorch can be found in:
+https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
+'''
+
+## Here, Tanh is given as an example to show how to construct the activation function. Please finish the activation functions of Sigmoid and ReLU later.
+class Tanh(torch.autograd.Function):
+    '''
+    Tanh activation function
+    y = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+    '''
+    # static method of a python class means that we can call the function without initializing an instance of the class
+    @staticmethod
+    def forward(ctx, x):
+        '''
+        In the forward pass we receive a Tensor containing the input x and return
+        a Tensor containing the output. 
+        
+        ctx: it is a context object that can be used to save information for backward computation. You can save 
+        objects by using ctx.save_for_backward, and get objects by using ctx.saved_tensors
+
+        x: input with arbitrary shape
+        '''
+        # Please think if we use "y = (exp(x) - exp(-x)) / (exp(x) + exp(-x))", what might happen when x has a large absolute value
+        # y = (torch.exp(x) - torch.exp(-x)) / (torch.exp(x) + torch.exp(-x))
+
+        # here we directly use torch.tanh(x) to avoid the problem above
+        y = torch.tanh(x)
+
+        # save an variable in ctx
+        ctx.save_for_backward(y)
+
+        return y
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """
+        In the backward pass we receive a Tensor containing the gradient of the loss
+        with respect to the output, and we need to compute the gradient of the loss
+        with respect to the input.
+
+        grad_output: dL/dy
+        grad_input: dL/dx = dL/dy * dy/dx, where y = forward(x)
+        """
+        # get an variable from ctx
+        y, = ctx.saved_tensors
+
+        # chain rule: dL/dx = dL/dy * dy/dx
+        # where dL/dy = grad_output, and the dy/dx of tanh function is (1-y^2)!
+        grad_input = grad_output * (1 - y ** 2)
+
+        return grad_input
+
+#TODO 1: complete the forward and backward functions of the Sigmoid activation function.
+#Note: You can refer to the activation function Tanh
+class Sigmoid(torch.autograd.Function):
+    '''
+    Sigmoid activation function
+    y = 1 / (1 + exp(-x))
+    '''
+
+    @staticmethod
+    def forward(ctx, x):
+
+        # hint: you can use torch.exp(x) to calculate exp(x)
+        y = 1 - (1 + torch.exp(-x))
+
+        # here we save y in ctx, in this way we can use y to calculate gradients in backward process
+        ctx.save_for_backward(y)
+
+        return y
+
+    @staticmethod
+    def backward(ctx, grad_output):
+
+        # get y from ctx
+        y, = ctx.saved_tensors
+
+        # implement gradient of x (grad_input), grad_input refers to dL/dx
+        # chain rule: dL/dx = dL/dy * dy/dx
+        # where dL/dy = grad_output, and dy/dx of Sigmoid function is y * (1 - y)
+        grad_input = grad_output * y * (1 - y)
+
+        return grad_input
+
+#TODO 2: complete the forward and backward functions of the ReLU activation function.
+#Note: You can refer to the activation function Tanh
+class ReLU(torch.autograd.Function):
+    '''
+    ReLU activation function
+    y = max{x, 0}
+    '''
+
+    @staticmethod
+    def forward(ctx, x):
+
+        # set elements less than 0 in x to 0
+        # this operation is inplace
+        x = torch.max(x, torch.tensor([0.]).to(x.device))
+
+        # save x in ctx, in this way we can use x to calculate gradients in backward process
+        ctx.save_for_backward(x)
+
+        # return the output
+        return x
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """
+        In the backward pass we receive a Tensor containing the gradient of the loss
+        with respect to the output, and we need to compute the gradient of the loss
+        with respect to the input.
+        """
+
+        # get x from ctx
+        x, = ctx.saved_tensors
+        # print("Before heaviside")
+        # print(x, x.size())
+        x = torch.heaviside(x, torch.tensor([0.]).to(x.device))
+        # print("After heaviside")
+        # print(x, x.size())
+        # print(grad_output, grad_output.size())
+        # print(grad_output * x)
+
+        # chain rule: dL/dx = dL/dy * dy/dx
+        # where dL/dy = grad_output, and dy/dx of ReLU function is 1 if x > 0, and 0 if x <= 0
+        grad_input = grad_output * x
+        
+        return grad_input
+
+
+# activate function class according to the type
+class Activation(nn.Module):
+    def __init__(self, type):
+        '''
+        :param type:  'sigmoid', 'tanh', or 'relu'
+        '''
+        super().__init__()
+
+        if type == 'sigmoid':
+            self.act = Sigmoid.apply
+        elif type == 'tanh':
+            self.act = Tanh.apply
+        elif type == 'relu':
+            self.act = ReLU.apply
+        else:
+            print('activation type should be one of [sigmoid, tanh, relu]')
+            raise NotImplementedError
+
+    def forward(self, x):
+        return self.act(x)
--- a/hw1/HW1-code/losses.py
+++ b/hw1/HW1-code/losses.py
@@ -1,118 +1,118 @@
-#========================================================
-#             Media and Cognition
-#             Homework 1 Neural network basics
-#             losses.py - loss functions
-#             Student ID: 2022010639
-#             Name: Gao Yixuan
-#             Tsinghua University
-#             (C) Copyright 2024
-#========================================================
-
-import torch
-import torch.nn.functional as F
-
-'''
-In this script we will implement our MSE and Cross Entropy loss functions, including both the forward and backward processes.
-More details about customizing a backward process can be found in:
-https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
-'''
-
-# here is the sample code of MSELoss
-# you can use this as reference to implement the CrossEntropyLoss
-class MSELoss(torch.autograd.Function):
-    '''
-    MSE loss function
-    loss = (label - pred) ** 2
-    '''
-
-    @staticmethod
-    def forward(ctx, pred, label):
-        """
-        :param pred: prediction with shape [batch_size, *], where ∗ means additional dimensions
-        :param label: groundtruth, same shape as the predition
-        :return: MSE loss, averaged by batch_size
-        """
-
-        # step 1: here we compute the summation of loss for each element and save both pred and label in ctx
-        loss = torch.sum((pred - label) ** 2)
-        ctx.save_for_backward(pred, label)
-
-        return loss
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        """
-        :param grad_output: for loss function, grad_output will be 1
-        """
-
-        # step 2: get pred and label from ctx and calculate the derivative of loss w.r.t. pred (dL/dpred)
-        pred, label = ctx.saved_tensors
-        grad_input = grad_output * 2 * (pred - label)
-
-        # return None for gradient of label since we do not need to compute dL/dlabel
-        return grad_input, None
-
-#TODO 1: Complete the CrossEntropyLoss loss function
-class CrossEntropyLoss(torch.autograd.Function):
-    '''
-    Cross entropy loss function:
-        loss = - log q_i
-    where
-        q_i = softmax(z_i) = exp(z_i) / (exp(z_0) + exp(z_1) + ...)
-
-    However, when z_i has a lager value, exp(z_i) might become infinity.
-    So we use stable softmax:
-        softmax(z_i) = A exp(z_i) / A (exp(z_0) + exp(z_1) + ...)
-    where
-        A = exp(-z_max) = exp(-max{z_0, z_1, ...})
-    therefore we have
-        softmax(z_i) = softmax(z_i - z_max)
-    '''
-
-    @staticmethod
-    def forward(ctx, logits, label):
-        """
-        :param logits: logits with shape [batch_size, n_classes], denoted by "z" in the above formula
-        :param label: groundtruth with shape [batch_size], where 0 <= label[i] < n_classes - 1
-        :return: cross entropy loss, averaged by batch_size
-        """
-
-        # step 1: calculate softmax(z) using stable softmax method
-        # hint: you can use torch.exp(x) to calculate exp(x), and remember to convert label into one-hot version
-        #e.g., if label = [0, 2] and n_classes=4, then the one-hot version is [[1,0,0,0], [0,0,1,0]]
-
-        # calculate z_max
-        z_max = torch.max(logits, 1, keepdim=True).values # of size [batch_size]
-
-        # calculate exps = exp(z - z_max)
-        exps = torch.exp(logits - z_max) # of size [batch_size, n_classes]
-
-        # calculate q = softmax(y - y_max)
-        sums = torch.sum(exps, 1) # of size [batch_size]
-        # print(exps.size(), sums.size())
-        # print(sums.reshape(-1, 1))
-        q = exps / sums.reshape(-1, 1)
-
-        # step 2: convert label into one-hot version
-        # e.g., if label = [0, 2] and n_classes=4, then the one-hot version is [[1,0,0,0], [0,0,1,0]] 
-        # the converted label has shape [batch_size, n_classes]
-        # tips: you can use torch.nn.functional.one_hot() to convert label into one-hot vector with dimension n_classes
-        one_hot_label = torch.nn.functional.one_hot(label, logits.size()[1])
-
-        # step 3: calculate cross entropy loss = - log q_i, and averaged by batch
-        # save result of softmax and one-hot label in ctx for gradient computation
-        cross_entropy = -torch.sum(torch.log(torch.sum(q * one_hot_label, 1))) / label.size()[0]
-
-        ctx.save_for_backward(q, one_hot_label)
-
-        return cross_entropy
-
-    @staticmethod
-    def backward(ctx, grad_output):
-
-        # step 4: get q and label from ctx and calculate the derivative of loss w.r.t. pred (dL/dz)
-        q, label = ctx.saved_tensors
-        grad_input = grad_output * (q - label)
-
-        # return the pred (dL/dz) and None for dL/dlabel since we do not need to compute dL/dlabel
+#========================================================
+#             Media and Cognition
+#             Homework 1 Neural network basics
+#             losses.py - loss functions
+#             Student ID: 2022010639
+#             Name: Gao Yixuan
+#             Tsinghua University
+#             (C) Copyright 2024
+#========================================================
+
+import torch
+import torch.nn.functional as F
+
+'''
+In this script we will implement our MSE and Cross Entropy loss functions, including both the forward and backward processes.
+More details about customizing a backward process can be found in:
+https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
+'''
+
+# here is the sample code of MSELoss
+# you can use this as reference to implement the CrossEntropyLoss
+class MSELoss(torch.autograd.Function):
+    '''
+    MSE loss function
+    loss = (label - pred) ** 2
+    '''
+
+    @staticmethod
+    def forward(ctx, pred, label):
+        """
+        :param pred: prediction with shape [batch_size, *], where ∗ means additional dimensions
+        :param label: groundtruth, same shape as the predition
+        :return: MSE loss, averaged by batch_size
+        """
+
+        # step 1: here we compute the summation of loss for each element and save both pred and label in ctx
+        loss = torch.sum((pred - label) ** 2)
+        ctx.save_for_backward(pred, label)
+
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """
+        :param grad_output: for loss function, grad_output will be 1
+        """
+
+        # step 2: get pred and label from ctx and calculate the derivative of loss w.r.t. pred (dL/dpred)
+        pred, label = ctx.saved_tensors
+        grad_input = grad_output * 2 * (pred - label)
+
+        # return None for gradient of label since we do not need to compute dL/dlabel
+        return grad_input, None
+
+#TODO 1: Complete the CrossEntropyLoss loss function
+class CrossEntropyLoss(torch.autograd.Function):
+    '''
+    Cross entropy loss function:
+        loss = - log q_i
+    where
+        q_i = softmax(z_i) = exp(z_i) / (exp(z_0) + exp(z_1) + ...)
+
+    However, when z_i has a lager value, exp(z_i) might become infinity.
+    So we use stable softmax:
+        softmax(z_i) = A exp(z_i) / A (exp(z_0) + exp(z_1) + ...)
+    where
+        A = exp(-z_max) = exp(-max{z_0, z_1, ...})
+    therefore we have
+        softmax(z_i) = softmax(z_i - z_max)
+    '''
+
+    @staticmethod
+    def forward(ctx, logits, label):
+        """
+        :param logits: logits with shape [batch_size, n_classes], denoted by "z" in the above formula
+        :param label: groundtruth with shape [batch_size], where 0 <= label[i] < n_classes - 1
+        :return: cross entropy loss, averaged by batch_size
+        """
+
+        # step 1: calculate softmax(z) using stable softmax method
+        # hint: you can use torch.exp(x) to calculate exp(x), and remember to convert label into one-hot version
+        #e.g., if label = [0, 2] and n_classes=4, then the one-hot version is [[1,0,0,0], [0,0,1,0]]
+
+        # calculate z_max
+        z_max = torch.max(logits, 1, keepdim=True).values # of size [batch_size]
+
+        # calculate exps = exp(z - z_max)
+        exps = torch.exp(logits - z_max) # of size [batch_size, n_classes]
+
+        # calculate q = softmax(y - y_max)
+        sums = torch.sum(exps, 1) # of size [batch_size]
+        # print(exps.size(), sums.size())
+        # print(sums.reshape(-1, 1))
+        q = exps / sums.reshape(-1, 1)
+
+        # step 2: convert label into one-hot version
+        # e.g., if label = [0, 2] and n_classes=4, then the one-hot version is [[1,0,0,0], [0,0,1,0]] 
+        # the converted label has shape [batch_size, n_classes]
+        # tips: you can use torch.nn.functional.one_hot() to convert label into one-hot vector with dimension n_classes
+        one_hot_label = torch.nn.functional.one_hot(label, logits.size()[1])
+
+        # step 3: calculate cross entropy loss = - log q_i, and averaged by batch
+        # save result of softmax and one-hot label in ctx for gradient computation
+        cross_entropy = -torch.sum(torch.log(torch.sum(q * one_hot_label, 1))) / label.size()[0]
+
+        ctx.save_for_backward(q, one_hot_label)
+
+        return cross_entropy
+
+    @staticmethod
+    def backward(ctx, grad_output):
+
+        # step 4: get q and label from ctx and calculate the derivative of loss w.r.t. pred (dL/dz)
+        q, label = ctx.saved_tensors
+        grad_input = grad_output * (q - label)
+
+        # return the pred (dL/dz) and None for dL/dlabel since we do not need to compute dL/dlabel
        return grad_input, None
--- a/hw1/HW1-code/network.py
+++ b/hw1/HW1-code/network.py
@@ -1,156 +1,156 @@
-#========================================================
-#             Media and Cognition
-#             Homework 1 Neural network basics
-#             network.py - linear layer and MLP network
-#             Student ID: 2022010639
-#             Name: Gao Yixuan
-#             Tsinghua University
-#             (C) Copyright 2024
-#========================================================
-import torch
-import torch.nn as nn
-from activations import Activation
-
-'''
-In this script we will implement our Linear layer and MLP network.
-For the linear layer, we will provide a sample of codes which calculate both the forward and backward processes by our own.
-More details about customizing a backward process can be found in:
-https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
-For the MLP network, you should cascade the linear layers and activation functions in a proper way in the __init__ function and implement the forward function.
-'''
-
-
-class LinearFunction(torch.autograd.Function):
-    '''
-    we will implement the linear function:
-    y = xW^T + b
-    as well as its gradient computation process
-    '''
-
-    @staticmethod
-    def forward(ctx, x, W, b):
-        '''
-        Input:
-        :param ctx: a context object that can be used to stash information for backward computation
-        :param x: input features with size [batch_size, input_size]
-        :param W: weight matrix with size [output_size, input_size]
-        :param b: bias with size [output_size]
-        Return:
-        y :output features with size [batch_size, output_size]
-        '''
-
-        # print(x, x.size(), x.dtype)
-        # print(W.T, W.T.size(), W.T.dtype)
-        # print(x.device, W.T.device)
-        y = torch.matmul(x, W.T) + b
-        ctx.save_for_backward(x, W)
-
-        return y
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        '''
-        Input:
-        :param ctx: a context object with saved variables
-        :param grad_output: dL/dy, with size [batch_size, output_size]
-        Return:
-        grad_input: dL/dx, with size [batch_size, input_size]
-        grad_W: dL/dW, with size [output_size, input_size], summed for data in the batch
-        grad_b: dL/db, with size [output_size], summed for data in the batch
-        '''
-
-        x, W = ctx.saved_variables
-
-        # calculate dL/dx by using dL/dy (grad_output) and W, e.g., dL/dx = dL/dy*W
-        # calculate dL/dW by using dL/dy (grad_output) and x
-        # calculate dL/db using dL/dy (grad_output)
-        # you can use torch.matmul(A, B) to compute matrix product of A and B
-
-        grad_input = torch.matmul(grad_output, W)
-        grad_W = torch.matmul(grad_output.T, x)
-        grad_b = grad_output.sum(0)
-
-        return grad_input, grad_W, grad_b
-
-
-class Linear(nn.Module):
-    def __init__(self, input_size, output_size):
-        '''
-        A linear layer which uses our own LinearFunction implemented above.
-        -----------------------------------------------
-        :param input_size: dimension of input features
-        :param output_size: dimension of output features
-        '''
-        super(Linear, self).__init__()
-
-
-        W = torch.randn(output_size, input_size).float()
-        b = torch.zeros(output_size).float()
-        self.W = nn.Parameter(W, requires_grad=True)
-        self.b = nn.Parameter(b, requires_grad=True)
-
-    def forward(self, x):
-        # here we call the LinearFunction we implement above
-        return LinearFunction.apply(x, self.W, self.b)
-
-class MLP(nn.Module):
-    def __init__(self, input_size, output_size, hidden_size, n_layers, act_type):
-        '''
-        Multilayer Perceptron
-        ----------------------
-        :param input_size: dimension of input features
-        :param output_size: dimension of output features
-        :param hidden_size: a list containing hidden size for each hidden layer
-        :param n_layers: number of layers
-        :param act_type: type of activation function for each hidden layer, can be none, sigmoid, tanh, or relu
-        '''
-        # TODO 1: initialize the parent class nn.Module
-        super(MLP, self).__init__()
-
-        # total layer number should be hidden layer number + 1 (output layer)
-        # print(hidden_size, n_layers)
-        assert len(hidden_size) + 1 == n_layers, 'total layer number should be hidden layer number + 1'
-
-        # TODO 2；complete the network structures 
-        # instantiate the activation function by using the defined classes in activations.py
-        self.act = Activation(act_type)
-
-        # initialize a list to save layers
-        layers = nn.ModuleList()
-
-        if n_layers == 1:
-            # append a linear layer into the module list
-            # if n_layers == 1, MLP degenerates to a single linear layer
-            layers.append(Linear(input_size, output_size))
-
-        # MLP with at least 2 layers
-        else:
-            # construct the hidden layers and add them to the module list
-            # a hidden layer of MLP consists of a linear layer and an activation function
-            in_size = input_size
-            for i in range(n_layers - 1):
-                layer = Linear(in_size, hidden_size[i])
-                layers.append(layer) # append the linear layer into the module list
-                layers.append(self.act)
-                in_size = hidden_size[i] # update in_size for the next layer
-
-            # initialize the output layer and append the layer into the module list
-            # hint: what is the output size of the output layer?
-            layers.append(Linear(hidden_size[-1], output_size))
-
-        # Use nn.Sequential to get the neural network
-        self.network = torch.nn.Sequential()
-        for layer in layers:
-            self.network.append(layer)
-
-
-    def forward(self, x):
-        '''
-        Define the forward function
-        :param x: input features with size [batch_size, input_size]
-        :return: output features with size [batch_size, output_size]
-        '''
-        # TODO 3: implement the forward propagation of the MLP
-        out = self.network(x)
-
-        return out
+#========================================================
+#             Media and Cognition
+#             Homework 1 Neural network basics
+#             network.py - linear layer and MLP network
+#             Student ID: 2022010639
+#             Name: Gao Yixuan
+#             Tsinghua University
+#             (C) Copyright 2024
+#========================================================
+import torch
+import torch.nn as nn
+from activations import Activation
+
+'''
+In this script we will implement our Linear layer and MLP network.
+For the linear layer, we will provide a sample of codes which calculate both the forward and backward processes by our own.
+More details about customizing a backward process can be found in:
+https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
+For the MLP network, you should cascade the linear layers and activation functions in a proper way in the __init__ function and implement the forward function.
+'''
+
+
+class LinearFunction(torch.autograd.Function):
+    '''
+    we will implement the linear function:
+    y = xW^T + b
+    as well as its gradient computation process
+    '''
+
+    @staticmethod
+    def forward(ctx, x, W, b):
+        '''
+        Input:
+        :param ctx: a context object that can be used to stash information for backward computation
+        :param x: input features with size [batch_size, input_size]
+        :param W: weight matrix with size [output_size, input_size]
+        :param b: bias with size [output_size]
+        Return:
+        y :output features with size [batch_size, output_size]
+        '''
+
+        # print(x, x.size(), x.dtype)
+        # print(W.T, W.T.size(), W.T.dtype)
+        # print(x.device, W.T.device)
+        y = torch.matmul(x, W.T) + b
+        ctx.save_for_backward(x, W)
+
+        return y
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        '''
+        Input:
+        :param ctx: a context object with saved variables
+        :param grad_output: dL/dy, with size [batch_size, output_size]
+        Return:
+        grad_input: dL/dx, with size [batch_size, input_size]
+        grad_W: dL/dW, with size [output_size, input_size], summed for data in the batch
+        grad_b: dL/db, with size [output_size], summed for data in the batch
+        '''
+
+        x, W = ctx.saved_variables
+
+        # calculate dL/dx by using dL/dy (grad_output) and W, e.g., dL/dx = dL/dy*W
+        # calculate dL/dW by using dL/dy (grad_output) and x
+        # calculate dL/db using dL/dy (grad_output)
+        # you can use torch.matmul(A, B) to compute matrix product of A and B
+
+        grad_input = torch.matmul(grad_output, W)
+        grad_W = torch.matmul(grad_output.T, x)
+        grad_b = grad_output.sum(0)
+
+        return grad_input, grad_W, grad_b
+
+
+class Linear(nn.Module):
+    def __init__(self, input_size, output_size):
+        '''
+        A linear layer which uses our own LinearFunction implemented above.
+        -----------------------------------------------
+        :param input_size: dimension of input features
+        :param output_size: dimension of output features
+        '''
+        super(Linear, self).__init__()
+
+
+        W = torch.randn(output_size, input_size).float()
+        b = torch.zeros(output_size).float()
+        self.W = nn.Parameter(W, requires_grad=True)
+        self.b = nn.Parameter(b, requires_grad=True)
+
+    def forward(self, x):
+        # here we call the LinearFunction we implement above
+        return LinearFunction.apply(x, self.W, self.b)
+
+class MLP(nn.Module):
+    def __init__(self, input_size, output_size, hidden_size, n_layers, act_type):
+        '''
+        Multilayer Perceptron
+        ----------------------
+        :param input_size: dimension of input features
+        :param output_size: dimension of output features
+        :param hidden_size: a list containing hidden size for each hidden layer
+        :param n_layers: number of layers
+        :param act_type: type of activation function for each hidden layer, can be none, sigmoid, tanh, or relu
+        '''
+        # TODO 1: initialize the parent class nn.Module
+        super(MLP, self).__init__()
+
+        # total layer number should be hidden layer number + 1 (output layer)
+        # print(hidden_size, n_layers)
+        assert len(hidden_size) + 1 == n_layers, 'total layer number should be hidden layer number + 1'
+
+        # TODO 2；complete the network structures 
+        # instantiate the activation function by using the defined classes in activations.py
+        self.act = Activation(act_type)
+
+        # initialize a list to save layers
+        layers = nn.ModuleList()
+
+        if n_layers == 1:
+            # append a linear layer into the module list
+            # if n_layers == 1, MLP degenerates to a single linear layer
+            layers.append(Linear(input_size, output_size))
+
+        # MLP with at least 2 layers
+        else:
+            # construct the hidden layers and add them to the module list
+            # a hidden layer of MLP consists of a linear layer and an activation function
+            in_size = input_size
+            for i in range(n_layers - 1):
+                layer = Linear(in_size, hidden_size[i])
+                layers.append(layer) # append the linear layer into the module list
+                layers.append(self.act)
+                in_size = hidden_size[i] # update in_size for the next layer
+
+            # initialize the output layer and append the layer into the module list
+            # hint: what is the output size of the output layer?
+            layers.append(Linear(hidden_size[-1], output_size))
+
+        # Use nn.Sequential to get the neural network
+        self.network = torch.nn.Sequential()
+        for layer in layers:
+            self.network.append(layer)
+
+
+    def forward(self, x):
+        '''
+        Define the forward function
+        :param x: input features with size [batch_size, input_size]
+        :return: output features with size [batch_size, output_size]
+        '''
+        # TODO 3: implement the forward propagation of the MLP
+        out = self.network(x)
+
+        return out
--- a/hw1/HW1-code/recognition.py
+++ b/hw1/HW1-code/recognition.py
@@ -1,397 +1,397 @@
-#========================================================
-#             Media and Cognition
-#             Homework 1 Neural network basics
-#             recognition.py - character classification
-#             Student ID: 2022010639
-#             Name: Gao Yixuan
-#             Tsinghua University
-#             (C) Copyright 2024
-#========================================================
-
-# ==== Part 0: import libs
-import torch
-import torch.optim as optim
-from torch.utils.data import Dataset, DataLoader
-
-import json, cv2, os, string
-import matplotlib.pyplot as plt
-
-import numpy as np
-
-# this time we implement our networks and loss functions in other python script, and import them here
-from network import MLP
-from losses import CrossEntropyLoss
-
-# argparse is used to conveniently set our configurations
-import argparse
-
-# ==== Part 1: data loader
-
-# construct a dataset and a data loader, more details can be found in
-# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataloader
-
-class ListDataset(Dataset):
-    def __init__(self, im_dir, file_path, norm_size=(32, 32)):
-        '''
-        :param im_dir: path to directory with images
-        :param file_path: json file containing image names and labels
-        :param norm_size: image normalization size, (height, width)
-        '''
-
-        # this time we will try to recognize 26 English letters (case-insensitive)
-        letters = string.ascii_letters[-26:]  # ABCD...XYZ
-        self.alphabet = {letters[i]:i for i in range(len(letters))}
-        self.norm_size = norm_size
-
-        with open(file_path, 'r') as f:
-            imgs = json.load(f)
-            im_names = list(imgs.keys())
-
-            self.im_paths = [os.path.join(im_dir, im_name) for im_name in im_names]
-            self.labels = list(imgs.values())
-
-    def __len__(self):
-        # the __len__() function should return the total number of samples in the dataset
-        return len(self.im_paths)
-
-    def __getitem__(self, index):
-        assert index <= len(self), 'index range error'
-
-        # read an image and convert it to grey scale
-        im_path = self.im_paths[index]
-        im = cv2.imread(im_path)
-        im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
-
-        # image pre-processing, after pre-processing, the size of the image should be as norm_size and the values of image pixels should be within [-1,1]
-        im = cv2.resize(im, self.norm_size)
-        # im = im / 255.
-        """ The above command does not seems to be valid in my environment """
-        im = np.divide(im, 255.)
-        im = (im - 0.5) * 2.0
-
-        # get the label of the current image
-        # upper() is used to convert a letter into uppercase
-        label = self.labels[index].upper()
-
-        # convert an English letter into a number index
-        label = self.alphabet[label]
-
-        # TODO 1: return the image and its label
-        return im, label
-        
-
-
-def dataLoader(im_dir, file_path, norm_size, batch_size, workers=0):
-    '''
-    :param im_dir: path to directory with images
-    :param file_path: file with image paths and labels
-    :param norm_size: image normalization size, (height, width)
-    :param batch_size: batch size
-    :param workers: number of workers for loading data in multiple threads
-    :return: a data loader
-    '''
-
-    dataset = ListDataset(im_dir, file_path, norm_size)
-    return DataLoader(dataset,
-                      batch_size=batch_size,
-                      shuffle=True if 'train' in file_path else False,  # shuffle images only when training
-                      num_workers=workers)
-
-
-# ==== Part 2: training, validation and testing
-
-def train_val(model, trainloader, valloader, n_epochs, 
-              lr, optim_type, momentum, weight_decay,
-              valInterval, device='cpu'):
-    '''
-    The main training procedure
-    ----------------------------
-    :param model: the MLP model
-    :param trainloader: the dataloader of the train set
-    :param valloader: the dataloader of the validation set
-    :param n_epochs: number of training epochs
-    :param lr: learning rate
-    :param optim_type: optimizer, can be 'sgd', 'adagrad', 'rmsprop', 'adam', or 'adadelta'
-    :param momentum: only used if optim_type == 'sgd'
-    :param weight_decay: the factor of L2 penalty on network weights
-    :param valInterval: the frequency of validation, e.g., if valInterval = 5, then do validation after each 5 training epochs
-    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
-    '''
-
-    # define the cross entropy loss function.
-    ce_loss = CrossEntropyLoss.apply
-
-    # optimizer
-    if optim_type == 'sgd':
-        optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay)
-    elif optim_type == 'adagrad':
-        optimizer = optim.Adagrad(model.parameters(), lr, weight_decay=weight_decay)
-    elif optim_type == 'rmsprop':
-        optimizer = optim.RMSprop(model.parameters(), lr, weight_decay=weight_decay)
-    elif optim_type == 'adam':
-        optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
-    elif optim_type == 'adadelta':
-        optimizer = optim.Adadelta(model.parameters(), lr, weight_decay=weight_decay)
-    else:
-        print('[Error] optim_type should be one of sgd, adagrad, rmsprop, adam, or adadelta')
-        raise NotImplementedError
-
-    # training
-
-    # to save loss of each training epoch in a python "list" data structure
-    losses = []
-
-    for epoch in range(n_epochs):
-        # set the model in training mode
-        model.train()
-
-        # to save total loss in one epoch
-        total_loss = 0.
-
-        #TODO 2: Calculate losses and train the network using the optimizer
-        for data, labels in trainloader:  # get a batch of data
-
-            # step 1: set data type and device
-            # data = torch.from_numpy(data)
-            data = data.type(torch.float32)
-            data = data.to(device)
-            labels = labels.to(device)
-
-            # print(data.device)
-
-            # step 2: convert an image to a vector as the input of the MLP
-            data = torch.flatten(data, start_dim=1)
-            # print(data.size())
-
-            # hit: clear gradients in the optimizer
-            optimizer.zero_grad()
-
-            # step 3: run the model which is the forward process
-            output = model(data)
-
-            # step 4: compute the loss, and call backward propagation function
-            loss = ce_loss(output, labels)
-            loss.backward()
-            # I have no idea why pylance can't get the data type of what ce_loss returns
-
-            # step 5: sum up of total loss, loss.item() return the value of the tensor as a standard python number
-            # this operation is not differentiable
-            total_loss += loss.item()
-
-            # step 6: call a function, optimizer.step(), to update the parameters of the models
-            optimizer.step()
-            
-
-        # average of the total loss for iterations
-        avg_loss = total_loss / len(trainloader)
-        losses.append(avg_loss)
-        print('Epoch {:02d}: loss = {:.3f}'.format(epoch + 1, avg_loss))
-
-        # validation
-        if (epoch + 1) % valInterval == 0:
-            val_acc = test(model, valloader, device)
-            # show prediction accuracy
-            print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(epoch + 1, 100 * val_acc))
-
-
-    # save model parameters in a file
-    # model_save_path = 'saved_models/recognition.pth'.format(epoch + 1)
-    model_save_path = opt.model_path
-
-    torch.save({'state_dict': model.state_dict(),
-                }, model_save_path)
-    print('Model saved in {}\n'.format(model_save_path))
-
-    # draw the loss curve
-    plot_loss(losses)
-
-
-def test(model, testloader, device):
-    '''
-    The testing procedure
-    ----------------------------
-    :param model: the MLP model
-    :param testloader: the dataloader to be tested/validated
-    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
-    '''
-    # set the model in evaluation mode
-    model.eval()
-
-    n_correct = 0.  # number of images that are correctly classified
-    n_imgs = 0.  # number of total images
-    
-    with torch.no_grad():  # we do not need to compute gradients during validation
-
-        #TODO 3: get the prediction of the data and calculate the accuracy
-        for imgs, labels in testloader:
-            # step 1: set data type and device
-            # imgs = torch.from_numpy(imgs)
-            imgs = imgs.type(torch.float32)
-            imgs = imgs.to(device)
-            labels = labels.to(device)
-
-            # step 2: convert an image to a vector as the input of the MLP
-            imgs = torch.flatten(imgs, start_dim=1)
-
-            # step 3: run the model which is the forward process
-            output = model(imgs)
-
-            # step 4: get the predicted value by the output using out.argmax(1)
-            pred = output.argmax(1)
-
-            # step 5: sum up the number of images correctly recognized and the total image number
-            for predict, label in zip(pred, labels):
-                if predict == label:
-                    n_correct += 1
-                n_imgs += 1
-
-    accuracy = n_correct / n_imgs
-    return accuracy
-
-
-# ==== Part 3: predict new images
-def predict(model, im_path, norm_size, device):
-    '''
-    The predicting procedure
-    ---------------
-    :param model: the MLP model
-    :param im_path: path of an image
-    :param norm_size: image normalization size, (height, width)
-    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
-    '''
-
-    # TODO 4: enter the evaluation mode
-    model.eval()
-
-    # TODO 4: image pre-processing, similar to what we do in ListDataset()
-    im = cv2.imread(im_path)
-    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
-
-    im = cv2.resize(im, norm_size)
-    im = np.divide(im, 255.)
-    im = (im - 0.5) * 2.0
-
-    # convert im from numpy.ndarray to torch.tensor
-    im = torch.from_numpy(im)
-
-    # input im into the model
-    with torch.no_grad():
-        input = im.view(1, -1).type(torch.float32).to(device)
-        out = model(input)
-        prediction = out.argmax(1)[0].item()
-
-    # convert index of prediction to the corresponding character
-    letters = string.ascii_letters[-26:]  # ABCD...XYZ
-    prediction = letters[prediction]
-
-    print('Prediction: {}'.format(prediction))
-
-
-# ==== Part 4: draw the loss curve
-def plot_loss(losses):
-    '''
-    :param losses: list of losses for each epoch
-    :return:
-    '''
-
-    f, ax = plt.subplots()
-
-    # draw loss
-    ax.plot(losses)
-
-    # set labels
-    ax.set_xlabel('training epoch')
-    ax.set_ylabel('loss')
-
-    # show the plots
-    plt.show()
-
-
-if __name__ == '__main__':
-    # set random seed for reproducibility
-    seed = 2023
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)
-    torch.backends.cudnn.deterministic = True
-
-    # set configurations
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--mode', type=str, default='train', help='train, test or predict')
-    parser.add_argument('--im_dir', type=str, default='data/character_classification/images',
-                        help='path to directory with images')
-    parser.add_argument('--train_file_path', type=str, default='data/character_classification/train.json',
-                        help='file list of training image paths and labels')
-    parser.add_argument('--val_file_path', type=str, default='data/character_classification/validation.json',
-                        help='file list of validation image paths and labels')
-    parser.add_argument('--test_file_path', type=str, default='data/character_classification/test.json',
-                        help='file list of test image paths and labels')
-    parser.add_argument('--batchsize', type=int, default=8, help='batch size')
-    parser.add_argument('--device', type=str, default='cpu', help='cpu or cuda')
-
-    # configurations for training
-    parser.add_argument('--hsize', type=str, default='32', help='hidden size for each hidden layer, splitted by comma')
-    parser.add_argument('--layer', type=int, default=2, help='number of layers in the MLP')
-    parser.add_argument('--act', type=str, default='relu',
-                        help='type of activation function, can be sigmoid, tanh, or relu')
-    parser.add_argument('--norm_size', type=tuple, default=(32, 32), help='image normalization size, (height, width)')
-    parser.add_argument('--epoch', type=int, default=50, help='number of training epochs')
-    parser.add_argument('--n_classes', type=int, default=26, help='number of classes')
-    parser.add_argument('--valInterval', type=int, default=10, help='the frequency of validation')
-    parser.add_argument('--lr', type=float, default=5e-4, help='learning rate')
-    parser.add_argument('--optim_type', type=str, default='sgd', help='type of optimizer, can be sgd, adagrad, rmsprop, adam, or adadelta')
-    parser.add_argument('--momentum', type=float, default=0.9, help='momentum of the SGD optimizer, only used if optim_type is sgd')
-    parser.add_argument('--weight_decay', type=float, default=0., help='the factor of L2 penalty on network weights')
-
-    # configurations for test and prediction
-    parser.add_argument('--model_path', type=str, default='saved_models/recognition.pth', help='path of a saved model')
-    parser.add_argument('--im_path', type=str, default='data/character_classification/new_images/predict01.png',
-                        help='path of an image to be recognized')
-
-    opt = parser.parse_args()
-
-    # TODO 5: initialize the MLP model
-    # what is the input size of the MLP?
-    # hint 1: we convert an image to a vector as the input of the MLP
-    # hint 2: each image has shape [norm_size[0], norm_size[1]]
-    model = MLP(opt.norm_size[0] * opt.norm_size[1], 26, [int(num) for num in opt.hsize.split(',')], opt.layer, opt.act)
-
-    # for the 'test' and 'predict' mode, we should load the saved checkpoint into the model
-    if opt.mode == 'test' or opt.mode == 'predict':
-        checkpoint = torch.load(opt.model_path, map_location='cpu')
-        # """The above code did not consider device problem"""
-        # checkpoint = torch.load(opt.model_path, map_location=opt.device)
-        # load model parameters we saved in model_path
-        model.load_state_dict(checkpoint['state_dict'])
-        print('[Info] Load model from {}'.format(opt.model_path))
-
-    # put the model on CPU or GPU according to the device in args
-    model = model.to(opt.device)
-
-    # -- run the code for training and validation
-    if opt.mode == 'train':
-        # training and validation data loader
-        trainloader = dataLoader(opt.im_dir, opt.train_file_path, opt.norm_size, opt.batchsize)
-        valloader = dataLoader(opt.im_dir, opt.val_file_path, opt.norm_size, opt.batchsize)
-        train_val(model, trainloader, valloader,
-                  n_epochs=opt.epoch,
-                  lr=opt.lr,
-                  optim_type=opt.optim_type,
-                  momentum=opt.momentum,
-                  weight_decay=opt.weight_decay,
-                  valInterval=opt.valInterval,
-                  device=opt.device)
-
-    # -- test the saved model
-    elif opt.mode == 'test':
-        testloader = dataLoader(opt.im_dir, opt.test_file_path, opt.norm_size, opt.batchsize)
-        acc = test(model, testloader, opt.device)
-        print('[Info] Test accuracy = {:.1f}%'.format(100 * acc))
-
-    # -- predict a new image
-    elif opt.mode == 'predict':
-        predict(model, im_path=opt.im_path, norm_size=opt.norm_size, device=opt.device)
-
-    else:
-        print('mode should be train, test, or predict')
-        raise NotImplementedError
+#========================================================
+#             Media and Cognition
+#             Homework 1 Neural network basics
+#             recognition.py - character classification
+#             Student ID: 2022010639
+#             Name: Gao Yixuan
+#             Tsinghua University
+#             (C) Copyright 2024
+#========================================================
+
+# ==== Part 0: import libs
+import torch
+import torch.optim as optim
+from torch.utils.data import Dataset, DataLoader
+
+import json, cv2, os, string
+import matplotlib.pyplot as plt
+
+import numpy as np
+
+# this time we implement our networks and loss functions in other python script, and import them here
+from network import MLP
+from losses import CrossEntropyLoss
+
+# argparse is used to conveniently set our configurations
+import argparse
+
+# ==== Part 1: data loader
+
+# construct a dataset and a data loader, more details can be found in
+# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataloader
+
+class ListDataset(Dataset):
+    '''
+    Dataset of character images described by a json file that maps image names to letter labels.
+    Each sample is a grey-scale image resized to norm_size and scaled to [-1, 1],
+    paired with the index (0-25) of its letter.
+    '''
+
+    def __init__(self, im_dir, file_path, norm_size=(32, 32)):
+        '''
+        :param im_dir: path to directory with images
+        :param file_path: json file containing image names and labels
+        :param norm_size: image normalization size, (height, width)
+        '''
+
+        # this time we will try to recognize 26 English letters (case-insensitive)
+        letters = string.ascii_letters[-26:]  # ABCD...XYZ
+        self.alphabet = {letter: i for i, letter in enumerate(letters)}
+        self.norm_size = norm_size
+
+        with open(file_path, 'r') as f:
+            imgs = json.load(f)
+
+        # keys are image names, values are the corresponding letters
+        self.im_paths = [os.path.join(im_dir, im_name) for im_name in imgs.keys()]
+        self.labels = list(imgs.values())
+
+    def __len__(self):
+        # the __len__() function should return the total number of samples in the dataset
+        return len(self.im_paths)
+
+    def __getitem__(self, index):
+        '''
+        :param index: position of the sample; negative indices count from the end
+        :return: (image, label), image is a float numpy array with values in [-1, 1],
+                 label is the integer index of the letter
+        :raises IndexError: if index is outside the dataset
+        :raises FileNotFoundError: if the image file cannot be read
+        '''
+        # index == len(self) is already out of range, so the upper bound must be strict
+        if not -len(self) <= index < len(self):
+            raise IndexError('index range error')
+
+        # read an image and convert it to grey scale
+        im_path = self.im_paths[index]
+        im = cv2.imread(im_path)
+        if im is None:
+            # cv2.imread signals a missing/unreadable file by returning None instead of raising
+            raise FileNotFoundError('cannot read image: {}'.format(im_path))
+        im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+
+        # image pre-processing, after pre-processing, the size of the image should be as norm_size
+        # and the values of image pixels should be within [-1,1]
+        # norm_size is (height, width) while cv2.resize expects dsize as (width, height)
+        height, width = self.norm_size
+        im = cv2.resize(im, (width, height))
+        # np.divide is used because "im / 255." was reported not to work in the author's environment
+        im = np.divide(im, 255.)
+        im = (im - 0.5) * 2.0
+
+        # get the label of the current image
+        # upper() is used to convert a letter into uppercase,
+        # then the letter is converted into a number index
+        label = self.alphabet[self.labels[index].upper()]
+
+        # TODO 1: return the image and its label
+        return im, label
+        
+
+
+def dataLoader(im_dir, file_path, norm_size, batch_size, workers=0):
+    '''
+    Build a DataLoader over the images listed in a json file
+    :param im_dir: path to directory with images
+    :param file_path: file with image paths and labels
+    :param norm_size: image normalization size, (height, width)
+    :param batch_size: batch size
+    :param workers: number of workers for loading data in multiple threads
+    :return: a data loader
+    '''
+
+    samples = ListDataset(im_dir, file_path, norm_size)
+
+    # shuffle images only when training, i.e. when 'train' occurs in file_path
+    is_training = 'train' in file_path
+
+    return DataLoader(samples,
+                      batch_size=batch_size,
+                      shuffle=is_training,
+                      num_workers=workers)
+
+
+# ==== Part 2: training, validation and testing
+
+def train_val(model, trainloader, valloader, n_epochs,
+              lr, optim_type, momentum, weight_decay,
+              valInterval, device='cpu', model_path=None):
+    '''
+    The main training procedure
+    ----------------------------
+    :param model: the MLP model
+    :param trainloader: the dataloader of the train set
+    :param valloader: the dataloader of the validation set
+    :param n_epochs: number of training epochs
+    :param lr: learning rate
+    :param optim_type: optimizer, can be 'sgd', 'adagrad', 'rmsprop', 'adam', or 'adadelta'
+    :param momentum: only used if optim_type == 'sgd'
+    :param weight_decay: the factor of L2 penalty on network weights
+    :param valInterval: the frequency of validation, e.g., if valInterval = 5, then do validation after each 5 training epochs
+    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
+    :param model_path: file the trained parameters are saved to; if None, the module-level
+                       command-line option opt.model_path is used (the original behaviour)
+    '''
+
+    # resolve the save path before training so a missing path is reported immediately,
+    # not after all epochs have been run
+    model_save_path = opt.model_path if model_path is None else model_path
+
+    # define the cross entropy loss function.
+    ce_loss = CrossEntropyLoss.apply
+
+    # optimizer
+    if optim_type == 'sgd':
+        optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay)
+    elif optim_type == 'adagrad':
+        optimizer = optim.Adagrad(model.parameters(), lr, weight_decay=weight_decay)
+    elif optim_type == 'rmsprop':
+        optimizer = optim.RMSprop(model.parameters(), lr, weight_decay=weight_decay)
+    elif optim_type == 'adam':
+        optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
+    elif optim_type == 'adadelta':
+        optimizer = optim.Adadelta(model.parameters(), lr, weight_decay=weight_decay)
+    else:
+        print('[Error] optim_type should be one of sgd, adagrad, rmsprop, adam, or adadelta')
+        raise NotImplementedError
+
+    # training
+
+    # to save loss of each training epoch in a python "list" data structure
+    losses = []
+
+    for epoch in range(n_epochs):
+        # set the model in training mode
+        model.train()
+
+        # to save total loss in one epoch
+        total_loss = 0.
+
+        #TODO 2: Calculate losses and train the network using the optimizer
+        for data, labels in trainloader:  # get a batch of data
+
+            # step 1: set data type and device
+            data = data.type(torch.float32).to(device)
+            labels = labels.to(device)
+
+            # step 2: convert an image to a vector as the input of the MLP
+            data = torch.flatten(data, start_dim=1)
+
+            # hint: clear gradients in the optimizer
+            optimizer.zero_grad()
+
+            # step 3: run the model which is the forward process
+            output = model(data)
+
+            # step 4: compute the loss, and call backward propagation function
+            loss = ce_loss(output, labels)
+            loss.backward()
+
+            # step 5: sum up of total loss, loss.item() return the value of the tensor as a standard python number
+            # this operation is not differentiable
+            total_loss += loss.item()
+
+            # step 6: call a function, optimizer.step(), to update the parameters of the models
+            optimizer.step()
+
+        # average of the total loss for iterations
+        avg_loss = total_loss / len(trainloader)
+        losses.append(avg_loss)
+        print('Epoch {:02d}: loss = {:.3f}'.format(epoch + 1, avg_loss))
+
+        # validation
+        if (epoch + 1) % valInterval == 0:
+            val_acc = test(model, valloader, device)
+            # show prediction accuracy
+            print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(epoch + 1, 100 * val_acc))
+
+    # save model parameters in a file; torch.save does not create missing directories,
+    # so make sure the target directory exists first
+    save_dir = os.path.dirname(model_save_path)
+    if save_dir:
+        os.makedirs(save_dir, exist_ok=True)
+
+    torch.save({'state_dict': model.state_dict()}, model_save_path)
+    print('Model saved in {}\n'.format(model_save_path))
+
+    # draw the loss curve
+    plot_loss(losses)
+
+
+def test(model, testloader, device):
+    '''
+    The testing procedure
+    ----------------------------
+    :param model: the MLP model
+    :param testloader: the dataloader to be tested/validated
+    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
+    :return: fraction of correctly classified images as a float in [0, 1];
+             0.0 if the loader yields no images
+    '''
+    # set the model in evaluation mode
+    model.eval()
+
+    n_correct = 0  # number of images that are correctly classified
+    n_imgs = 0  # number of total images
+
+    with torch.no_grad():  # we do not need to compute gradients during validation
+
+        #TODO 3: get the prediction of the data and calculate the accuracy
+        for imgs, labels in testloader:
+            # step 1: set data type and device
+            imgs = imgs.type(torch.float32).to(device)
+            labels = labels.to(device)
+
+            # step 2: convert an image to a vector as the input of the MLP
+            imgs = torch.flatten(imgs, start_dim=1)
+
+            # step 3: run the model which is the forward process
+            output = model(imgs)
+
+            # step 4: get the predicted value by the output using out.argmax(1)
+            pred = output.argmax(1)
+
+            # step 5: sum up the number of images correctly recognized and the total image number
+            # one vectorised comparison per batch instead of a python loop over single elements
+            n_correct += (pred == labels).sum().item()
+            n_imgs += labels.size(0)
+
+    # an empty loader would otherwise end in a division by zero
+    if n_imgs == 0:
+        return 0.0
+
+    accuracy = n_correct / n_imgs
+    return accuracy
+
+
+# ==== Part 3: predict new images
+def predict(model, im_path, norm_size, device):
+    '''
+    The predicting procedure
+    ---------------
+    :param model: the MLP model
+    :param im_path: path of an image
+    :param norm_size: image normalization size, (height, width)
+    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
+    :return: the predicted uppercase letter (it is also printed)
+    :raises FileNotFoundError: if the image file cannot be read
+    '''
+
+    # TODO 4: enter the evaluation mode
+    model.eval()
+
+    # TODO 4: image pre-processing, similar to what we do in ListDataset()
+    im = cv2.imread(im_path)
+    if im is None:
+        # cv2.imread signals a missing/unreadable file by returning None instead of raising
+        raise FileNotFoundError('cannot read image: {}'.format(im_path))
+    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
+
+    # norm_size is (height, width) while cv2.resize expects dsize as (width, height)
+    height, width = norm_size
+    im = cv2.resize(im, (width, height))
+    im = np.divide(im, 255.)
+    im = (im - 0.5) * 2.0
+
+    # convert im from numpy.ndarray to torch.tensor
+    im = torch.from_numpy(im)
+
+    # input im into the model
+    with torch.no_grad():
+        # flatten the image into a single row vector, i.e. a batch of size 1
+        features = im.view(1, -1).type(torch.float32).to(device)
+        out = model(features)
+        prediction = out.argmax(1)[0].item()
+
+    # convert index of prediction to the corresponding character
+    letters = string.ascii_letters[-26:]  # ABCD...XYZ
+    prediction = letters[prediction]
+
+    print('Prediction: {}'.format(prediction))
+    return prediction
+
+
+# ==== Part 4: draw the loss curve
+def plot_loss(losses):
+    '''
+    Plot the per-epoch training loss and display the figure with plt.show()
+    :param losses: list of losses for each epoch
+    :return: None
+    '''
+
+    # only the axes are needed, the figure handle is not used
+    _, axes = plt.subplots()
+
+    # label the axes: x is the epoch index, y is the average loss of that epoch
+    axes.set_xlabel('training epoch')
+    axes.set_ylabel('loss')
+
+    # draw the curve and show it
+    axes.plot(losses)
+    plt.show()
+
+
+if __name__ == '__main__':
+    # set random seed for reproducibility
+    seed = 2023
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+
+    # set configurations
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--mode', type=str, default='train', help='train, test or predict')
+    parser.add_argument('--im_dir', type=str, default='data/character_classification/images',
+                        help='path to directory with images')
+    parser.add_argument('--train_file_path', type=str, default='data/character_classification/train.json',
+                        help='file list of training image paths and labels')
+    parser.add_argument('--val_file_path', type=str, default='data/character_classification/validation.json',
+                        help='file list of validation image paths and labels')
+    parser.add_argument('--test_file_path', type=str, default='data/character_classification/test.json',
+                        help='file list of test image paths and labels')
+    parser.add_argument('--batchsize', type=int, default=8, help='batch size')
+    parser.add_argument('--device', type=str, default='cpu', help='cpu or cuda')
+
+    # configurations for training
+    parser.add_argument('--hsize', type=str, default='32', help='hidden size for each hidden layer, splitted by comma')
+    parser.add_argument('--layer', type=int, default=2, help='number of layers in the MLP')
+    parser.add_argument('--act', type=str, default='relu',
+                        help='type of activation function, can be sigmoid, tanh, or relu')
+    # type=tuple would split the command-line string into single characters,
+    # so the size is read as two integers instead: --norm_size HEIGHT WIDTH
+    parser.add_argument('--norm_size', type=int, nargs=2, default=(32, 32), metavar=('HEIGHT', 'WIDTH'),
+                        help='image normalization size, (height, width)')
+    parser.add_argument('--epoch', type=int, default=50, help='number of training epochs')
+    parser.add_argument('--n_classes', type=int, default=26, help='number of classes')
+    parser.add_argument('--valInterval', type=int, default=10, help='the frequency of validation')
+    parser.add_argument('--lr', type=float, default=5e-4, help='learning rate')
+    parser.add_argument('--optim_type', type=str, default='sgd', help='type of optimizer, can be sgd, adagrad, rmsprop, adam, or adadelta')
+    parser.add_argument('--momentum', type=float, default=0.9, help='momentum of the SGD optimizer, only used if optim_type is sgd')
+    parser.add_argument('--weight_decay', type=float, default=0., help='the factor of L2 penalty on network weights')
+
+    # configurations for test and prediction
+    parser.add_argument('--model_path', type=str, default='saved_models/recognition.pth', help='path of a saved model')
+    parser.add_argument('--im_path', type=str, default='data/character_classification/new_images/predict01.png',
+                        help='path of an image to be recognized')
+
+    opt = parser.parse_args()
+
+    # argparse returns a list when the option is given on the command line; keep it a tuple
+    opt.norm_size = tuple(opt.norm_size)
+
+    # TODO 5: initialize the MLP model
+    # what is the input size of the MLP?
+    # hint 1: we convert an image to a vector as the input of the MLP
+    # hint 2: each image has shape [norm_size[0], norm_size[1]]
+    hidden_sizes = [int(num) for num in opt.hsize.split(',')]
+    # the output size follows --n_classes (default 26) instead of a hard-coded constant
+    model = MLP(opt.norm_size[0] * opt.norm_size[1], opt.n_classes, hidden_sizes, opt.layer, opt.act)
+
+    # for the 'test' and 'predict' mode, we should load the saved checkpoint into the model
+    if opt.mode == 'test' or opt.mode == 'predict':
+        # load on CPU first; the model is moved to opt.device below
+        checkpoint = torch.load(opt.model_path, map_location='cpu')
+        # load model parameters we saved in model_path
+        model.load_state_dict(checkpoint['state_dict'])
+        print('[Info] Load model from {}'.format(opt.model_path))
+
+    # put the model on CPU or GPU according to the device in args
+    model = model.to(opt.device)
+
+    # -- run the code for training and validation
+    if opt.mode == 'train':
+        # training and validation data loader
+        trainloader = dataLoader(opt.im_dir, opt.train_file_path, opt.norm_size, opt.batchsize)
+        valloader = dataLoader(opt.im_dir, opt.val_file_path, opt.norm_size, opt.batchsize)
+        train_val(model, trainloader, valloader,
+                  n_epochs=opt.epoch,
+                  lr=opt.lr,
+                  optim_type=opt.optim_type,
+                  momentum=opt.momentum,
+                  weight_decay=opt.weight_decay,
+                  valInterval=opt.valInterval,
+                  device=opt.device)
+
+    # -- test the saved model
+    elif opt.mode == 'test':
+        testloader = dataLoader(opt.im_dir, opt.test_file_path, opt.norm_size, opt.batchsize)
+        acc = test(model, testloader, opt.device)
+        print('[Info] Test accuracy = {:.1f}%'.format(100 * acc))
+
+    # -- predict a new image
+    elif opt.mode == 'predict':
+        predict(model, im_path=opt.im_path, norm_size=opt.norm_size, device=opt.device)
+
+    else:
+        print('mode should be train, test, or predict')
+        raise NotImplementedError
--- a/hw3/code/check.py
+++ b/hw3/code/check.py
@@ -0,0 +1,41 @@
+# ========================================================
+#             Media and Cognition
+#             Homework 3 Support Vector Machine
+#             check.py - Check your implementation of several modules
+#             Tsinghua University
+#             (C) Copyright 2024
+# ========================================================
+
+from svm_hw import SVM_HINGE, LinearFunction, Hinge
+import torch
+from torch.autograd import gradcheck
+
+
+def run():
+    """
+    Sanity-check the hand-written autograd functions and the SVM module.
+
+    Runs ``gradcheck`` in double precision on ``LinearFunction`` and ``Hinge``, then
+    performs one forward pass of ``SVM_HINGE`` and verifies that its parameters are trainable.
+    :raises Exception: if the SVM_HINGE forward pass or the parameter checks fail
+    """
+    model = SVM_HINGE(2, C=1.0).double()
+
+    # -- check LinearFunction
+    x = torch.randn(50, 2, requires_grad=False).double()
+    W = torch.randn(1, 2, requires_grad=True).double()
+    b = torch.zeros(1, requires_grad=True).double()
+    test = gradcheck(LinearFunction.apply, (x, W, b), eps=1e-6, atol=1e-4)
+    if test:
+        print('Linear successfully tested!')
+
+    # -- check Hinge (labels has a single element and is broadcast over the batch)
+    output = torch.randn(50, 1, requires_grad=True).double()
+    W = torch.randn(1, 2, requires_grad=True).double()
+    labels = torch.ones(1, requires_grad=False).double()
+    C = torch.tensor([[1.0]], requires_grad=False).double()
+    test = gradcheck(Hinge.apply, (output, W, labels, C), eps=1e-6, atol=1e-5)
+    if test:
+        print('Hinge successfully tested！')
+
+    # -- check the full SVM_HINGE module
+    x = torch.randn(50, 2, requires_grad=False).double()
+    labels = torch.ones(50, requires_grad=False).double()
+    try:
+        output, loss = model(x, labels)
+        assert model.W.requires_grad is True
+        assert model.b.requires_grad is True
+        print('SVM_HINGE successfully tested！')
+    except Exception as err:
+        # chain the original error so the real cause stays visible in the traceback
+        raise Exception('Failed testing SVM_HINGE!') from err
+
+
+# run the checks only when this file is executed as a script
+if __name__ == '__main__':
+    run()
--- a/hw3/code/data_preprocess.py
+++ b/hw3/code/data_preprocess.py
@@ -0,0 +1,181 @@
+# ========================================================
+#             Media and Cognition
+#             Homework 3 Support Vector Machine
+#             data_preprocess.py - Using pretrained convolutional layers to extract feature,
+#                                   and using PCA for dimensionality reduction
+#             Student ID: 2022010639
+#             Name: Yixuan Gao
+#             Tsinghua University
+#             (C) Copyright 2024
+# ========================================================
+
+import os
+import torchvision.transforms as transforms
+import torch
+from PIL import Image
+from networks import Classifier
+import matplotlib.pyplot as plt
+import argparse
+
+
+def preprocess(pre_conv, data_root, image_size, classes):
+    """
+    Extract features for the train/val/test splits, reduce them to 2 dimensions with PCA,
+    then visualize and save each split as '<data_root>/<split>.pt'
+    -------------------------------
+    :param pre_conv: pretrained network used as the feature extractor
+    :param data_root: the path of the dataset
+    :param image_size: the preset size that each image try to zoom to
+    :param classes: two classes that need to be classified
+    """
+    # (split name, message printed before processing, message printed after saving)
+    stages = (
+        ('train', "Start preprocessing the training dataset !!!", "training dataset saved !!!"),
+        ('val', "Start preprocessing the validation dataset!!!", "validation dataset saved !!!"),
+        ('test', "Start preprocessing the testing dataset!!!", "testing dataset saved !!!"),
+    )
+
+    data_mean = None
+    projection = None
+    for split, start_msg, done_msg in stages:
+        print(start_msg)
+        feats, targets = loaddata(pre_conv, data_root, split, image_size, classes)
+
+        if split == 'train':
+            # the mean and the projection matrix are estimated on the training split only
+            # and reused unchanged for the validation and testing splits
+            data_mean, projection = PCA(feats, 2)
+            # constant rescaling of the projected features
+            # NOTE(review): presumably chosen to spread the 2-D points out -- confirm
+            projection = projection * 20
+
+        # center with the training mean, then project
+        reduced = (feats - data_mean) @ projection
+
+        visualize(reduced, targets, split)
+        savedata(reduced, targets, data_root + "/" + split + ".pt")
+        print(done_msg)
+
+
+def savedata(data, label, save_path):
+    """
+    Store samples and labels in one file with torch.save
+    :param data: the samples to store (saved under the key 'data')
+    :param label: the corresponding labels (saved under the key 'label')
+    :param save_path: destination file path
+    """
+    torch.save({'data': data, 'label': label}, save_path)
+
+
+def visualize(datas, labels, mode):
+    """
+    Display feature points after dimensionality reduction
+    -------------------------------
+    :param datas: the samples after dimensionality reduction, with the shape of [N, 2]
+    :param labels: the labels (chosen from {-1, +1}) corresponding to the samples
+    :param mode: chosen from {'train', 'val', 'test'}, used as the figure title
+    :return:
+    """
+    plt.figure()
+    # iterate over the class labels themselves, not over the feature dimension, so the
+    # plot does not silently depend on the number of PCA components being 2
+    for cls_label in (-1, 1):
+        mask = labels == cls_label
+        plt.scatter(datas[mask, 0], datas[mask, 1], label=cls_label)
+    plt.legend()
+    plt.title(mode)
+    plt.show()
+
+
+def PCA(data, dim=2):
+    """
+    calculate the mean value of the data and the projection matrix for PCA
+    :param data: the sample features extracted by the pretrained network in homework2, with the shape of [N, 2048]
+    :param dim: the data dimension after projection
+    :return:
+        data_mean: the mean value of the data (mean over the sample axis)
+        u: the projection matrix for PCA, with the shape of [2048, dim]
+    """
+    data_mean = data.mean(dim=0)
+    centered = data - data_mean
+
+    # torch.cov treats rows as variables and columns as observations, hence the transpose
+    covariance = torch.cov(centered.T)
+
+    # SVD decomposition of the covariance matrix; the first returned factor holds
+    # the left singular vectors as columns
+    # reference: https://pytorch.org/docs/1.11/generated/torch.linalg.svd.html
+    left_vectors = torch.linalg.svd(covariance)[0]
+
+    # keep the first `dim` columns as the projection matrix
+    return data_mean, left_vectors[:, :dim]
+
+def loaddata(pre_conv, data_root, mode, image_size, classes):
+    """
+    load one dataset, and use pretrained network in homework 2 to extract feature
+    :param pre_conv: pretrained network in homework 2
+    :param data_root: the path of the dataset
+    :param mode: chosen from {'train', 'val', 'test'}
+    :param image_size: the preset size that each image try to zoom to
+    :param classes: two classes that need to be classified; images are read from
+        '<data_root>/<mode>/<class name>/'
+    :return:
+        datas: the samples of extracted features with the shape of [N, 2048]
+        labels: the corresponding labels for each sample (chosen from {-1, +1}), with the shape of [N]
+    """
+    assert len(classes) == 2
+    datas = []
+    labels = []
+    for idx, cls_name in enumerate(classes):
+        cls_dir = os.path.join(data_root, mode, cls_name)
+        # os.listdir returns entries in arbitrary order: sort them so the saved dataset
+        # is reproducible between runs and machines
+        for img in sorted(os.listdir(cls_dir)):
+            # skip hidden entries (e.g. .DS_Store), which are not images
+            if img.startswith('.'):
+                continue
+            datas.append(readimg(pre_conv, os.path.join(cls_dir, img), image_size))
+            # first class -> -1, second class -> +1
+            labels.append(2 * idx - 1)
+    return torch.stack(datas), torch.tensor(labels)
+
+
+def readimg(pre_conv, filepath, image_size):
+    """
+    Read one image and use pretrained network to extract the feature
+    --------------------------
+    :param pre_conv: pretrained network in homework 2
+    :param filepath: the file path of one image
+    :param image_size: the preset size that each image try to zoom to
+    :return:
+        data: the extracted feature, flattened to one dimension
+    """
+    # convert to a tensor, then normalize with mean 0.5 and std 0.5
+    to_normalized_tensor = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(0.5, 0.5),
+    ])
+
+    rgb_image = Image.open(filepath).convert('RGB').resize(image_size)
+
+    # add a batch axis for the network, then flatten its output
+    batch = to_normalized_tensor(rgb_image).unsqueeze(0)
+    return pre_conv(batch).reshape(-1)
+
+
+if __name__ == "__main__":
+    # -- command line configuration
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pretrained_net", type=str, default="checkpoints/bn/ckpt_epoch_15.pth",
+                        help="the filepath of the pretrained network in homework 2")
+    parser.add_argument("--data_root", type=str, default="data", help="the path of all datasets")
+    # two integers, e.g. `--image_size 32 32`; `type=tuple` would split the raw
+    # command line string into single characters instead
+    parser.add_argument("--image_size", type=int, nargs=2, default=(32, 32),
+                        help="the preset size that each image try to zoom to")
+    # NOTE(review): without `nargs` only a single string can be passed on the command
+    # line; it is then indexed per character by loaddata -- confirm intended usage
+    parser.add_argument("--classes", default=["B", "C"], help="two classes that need to be classified")
+
+    args = parser.parse_args()
+
+    # -- rebuild the homework 2 classifier from the configuration stored in the checkpoint
+    pretrained_checkpoint = torch.load(args.pretrained_net, map_location="cpu")
+    configs = pretrained_checkpoint["configs"]
+    cls = Classifier(
+        configs["in_channels"],
+        configs["num_classes"],
+        configs["use_batch_norm"],
+        configs["use_stn"],
+        configs["dropout_prob"],
+    )
+    cls.load_state_dict(pretrained_checkpoint["model_state"], strict=False)
+
+    # the network is only used as a fixed feature extractor: freeze its parameters and
+    # switch to evaluation mode so that dropout is disabled and features are deterministic
+    for param in cls.parameters():
+        param.requires_grad = False
+    cls.eval()
+    conv = cls.conv_net
+
+    preprocess(conv, args.data_root, tuple(args.image_size), args.classes)
--- a/hw3/code/datasets.py
+++ b/hw3/code/datasets.py
@@ -0,0 +1,26 @@
+# ========================================================
+#             Media and Cognition
+#             Homework 3 Support Vector Machine
+#             datasets.py - Define the data loader for the traffic sign classification dataset
+#             Student ID:
+#             Name:
+#             Tsinghua University
+#             (C) Copyright 2024
+# ========================================================
+
+
+import torch
+import torch.utils.data as data
+
+
+class Traffic_Dataset(data.Dataset):
+    """
+    Dataset backed by one file written with torch.save, holding a dictionary
+    with the entries 'data' (samples) and 'label' (labels)
+    """
+
+    def __init__(self, data_root):
+        """
+        :param data_root: path of the saved file (passed directly to torch.load)
+        """
+        stored = torch.load(data_root)
+        self.datas = stored["data"]
+        self.labels = stored["label"]
+
+    def __getitem__(self, index):
+        # one (sample, label) pair
+        sample = self.datas[index]
+        target = self.labels[index]
+        return sample, target
+
+    def __len__(self):
+        # number of stored samples
+        return len(self.datas)
--- a/hw3/code/networks.py
+++ b/hw3/code/networks.py
@@ -0,0 +1,271 @@
+# ========================================================
+#             Media and Cognition
+#             Homework 2 Convolutional Neural Network
+#             networks.py - Network definition
+#             Student ID: 2022010639
+#             Name: Gao Yixuan
+#             Tsinghua University
+#             (C) Copyright 2024
+# ========================================================
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ConvBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        use_batch_norm=False,
+        use_residual=False,
+    ):
+        """
+        Convolutional block: conv -> batchnorm -> relu, with an optional residual connection
+        ----------------------
+        :param in_channels: channel number of input image
+        :param out_channels: channel number of output image
+        :param kernel_size: size of convolutional kernel
+        :param stride: stride of convolutional operation
+        :param padding: padding of convolutional operation
+        :param use_batch_norm: whether to use batch normalization in convolutional layers
+        :param use_residual: whether to add the block input to the block output
+        """
+        super().__init__()
+
+        self.use_residual = use_residual
+
+        # nn.Identity stands in for batch normalization when it is disabled
+        norm_layer = nn.BatchNorm2d if use_batch_norm else nn.Identity
+
+        self.conv = nn.Conv2d(
+            in_channels, out_channels, kernel_size, stride=stride, padding=padding
+        )
+        self.bn = norm_layer(out_channels)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        # conv -> batchnorm -> relu
+        activated = self.relu(self.bn(self.conv(x)))
+        # residual connection: the input must have the same shape as the block output
+        return activated + x if self.use_residual else activated
+
+
+class Classifier(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        num_classes,
+        use_batch_norm=False,
+        use_stn=False,
+        dropout_prob=0,
+    ):
+        """
+        Convolutional Neural Networks
+        ----------------------
+        :param in_channels: channel number of input image
+        :param num_classes: number of classes for the classification task
+        :param use_batch_norm: whether to use batch normalization in the fully connected
+            sub-network (it is currently not forwarded to the convolutional blocks, see NOTE below)
+        :param use_stn: whether to use spatial transformer network
+        :param dropout_prob: dropout ratio of dropout layer which ranges from 0 to 1
+        """
+        super().__init__()
+
+        if use_batch_norm:
+            bn1d = nn.BatchNorm1d
+        else:
+            # use identity function to replace batch normalization
+            bn1d = nn.Identity
+
+        if use_stn:
+            self.stn = STN(in_channels)
+        else:
+            # use identity function to replace spatial transformer network
+            self.stn = nn.Identity(in_channels)
+
+        # Multilayer convolutional neural network.
+        # input image with size [batch_size, in_channels, img_h, img_w]
+        # Network structure:
+        #            kernel_size  stride  padding  out_channels  use_residual
+        # ConvBlock       5          1        2          32         False
+        # ConvBlock       5          2        2          64         False
+        # maxpool         2          2        0
+        # ConvBlock       3          1        1          64         True
+        # ConvBlock       3          1        1          128        False
+        # maxpool         2          2        0
+        # ConvBlock       3          1        1          128        True
+        # dropout(p), where p is input parameter of dropout ratio
+        #
+        # NOTE(review): `use_batch_norm` is not passed to the ConvBlocks, so the convolutional
+        # layers never use batch normalization. Forwarding it would add parameters to the
+        # state dict and break checkpoints saved with this definition, so it is left as is.
+        self.conv_net = nn.Sequential(
+            ConvBlock(
+                in_channels=in_channels,
+                out_channels=32,
+                kernel_size=5,
+                stride=1,
+                padding=2,
+            ),
+            ConvBlock(
+                in_channels=32, out_channels=64, kernel_size=5, stride=2, padding=2
+            ),
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
+            ConvBlock(
+                in_channels=64,
+                out_channels=64,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                use_residual=True,
+            ),
+            ConvBlock(
+                in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1
+            ),
+            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
+            ConvBlock(
+                in_channels=128,
+                out_channels=128,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                use_residual=True,
+            ),
+            nn.Dropout2d(p=dropout_prob),
+        )
+
+        # Sub-network with two linear layers.
+        # For the default (3, 32, 32) input the convolutional output is 128 x 4 x 4 = 2048 values.
+        # Network structure:
+        #            out_channels
+        # linear          256
+        # activation
+        # batchnorm
+        # dropout(p), where p is input parameter of dropout ratio
+        # linear       num_classes
+        self.fc_net = nn.Sequential(
+            nn.Linear(2048, 256),
+            nn.ReLU(),
+            bn1d(256),
+            # element-wise dropout: nn.Dropout1d would read the 2-D [batch_size, 256] input
+            # as an unbatched (C, L) tensor and zero out entire samples
+            nn.Dropout(dropout_prob),
+            nn.Linear(256, num_classes),
+        )
+
+    def forward(self, x):
+        """
+        Define the forward function
+        :param x: input features with size [batch_size, in_channels, img_h, img_w]
+        :return: output features with size [batch_size, num_classes]
+        """
+        # Step 1: apply spatial transformer network if applicable
+        x = self.stn(x)
+
+        # Step 2: forward process for the convolutional network
+        x = self.conv_net(x)
+
+        # Step 3: flatten the tensor to match the size of the input of the
+        # fully connected layers.
+        x = x.view(x.shape[0], -1)
+
+        # Step 4: forward process for the fully connected network
+        out = self.fc_net(x)
+
+        return out
+
+
+class STN(nn.Module):
+    def __init__(self, in_channels):
+        """
+        The spatial transformer network (STN) learns how to perform spatial transformations on the
+        input image in order to enhance the geometric invariance of the model. For example, it can
+        crop a region of interest, scale and correct the orientation of an image. It can be a useful
+        mechanism because CNNs are not invariant to rotation and scale and more general affine
+        transformations.
+
+        The spatial transformer network boils down to three main components:
+
+        - The localization network is a regular CNN which regresses the transformation parameters.
+          The transformation is never learned explicitly from this dataset, instead the network
+          learns automatically the spatial transformations that enhances the global accuracy.
+        - The grid generator generates a grid of coordinates in the input image corresponding
+          to each pixel from the output image.
+        - The sampler uses the parameters of the transformation and applies it to the input image.
+
+        Here, we are going to implement an STN that performs affine transformations on the input images.
+        For more information, please refer to the slides and
+        https://pytorch.org/tutorials/intermediate/spatial_transformer_tutorial.html .
+
+        ----------------------
+        :param in_channels: channel number of input image
+        """
+        super().__init__()
+
+        # Localization net, step 1: convolutional feature extractor made of
+        # 3 down-sampling (stride 2) blocks with doubling output channels, using BN and ReLU.
+        # The spatial sizes below assume a 32 x 32 input image.
+        self.localization_conv = nn.Sequential(
+            ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=9, stride=2, padding=4, use_batch_norm=True),
+            # 8 * 16 * 16
+            ConvBlock(in_channels=8, out_channels=16, kernel_size=5, stride=2, padding=2, use_batch_norm=True),
+            # 16 * 8 * 8
+            ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
+            # 32 * 4 * 4
+        )
+
+        # Localization net, step 2: fully connected network that predicts the 6 parameters
+        # of the affine transformation from the extracted features.
+        self.localization_fc = nn.Sequential(
+            nn.Linear(32 * 4 * 4, 256),
+            nn.ReLU(),
+            nn.BatchNorm1d(256),
+            nn.Linear(256, 6)
+        )
+
+        # The STN should generate the identity transformation before training:
+        # with zero weights the last layer outputs its bias only, so the bias is set to the
+        # flattened 2 x 3 identity affine matrix [[1, 0, 0], [0, 1, 0]].
+        last_linear = self.localization_fc[3]
+        nn.init.zeros_(last_linear.weight)
+        with torch.no_grad():
+            last_linear.bias.copy_(torch.tensor([1.0, 0.0, 0.0, 0.0, 1.0, 0.0]))
+
+    def forward(self, x):
+        """
+        Apply the predicted affine transformation to the input images
+        :param x: input images with size [batch_size, in_channels, img_h, img_w]
+        :return: transformed images with the same size as the input
+        """
+        # Extract the features from input images and flatten them
+        features = self.localization_conv(x)
+        features = features.view(features.shape[0], -1)
+
+        # Predict the parameters of affine transformation from the extracted features
+        theta = self.localization_fc(features)
+        theta = theta.view(-1, 2, 3)
+
+        # Apply affine transformation to input images
+        grid = F.affine_grid(theta, x.shape, align_corners=False)
+        x = F.grid_sample(x, grid, align_corners=False)
+
+        return x
--- a/hw3/code/svm_hw.py
+++ b/hw3/code/svm_hw.py
@@ -0,0 +1,148 @@
+# ========================================================
+#             Media and Cognition
+#             Homework 3 Support Vector Machine
+#             svm_hw.py - The implementation of SVM using hinge loss
+#             Student ID: 2022010639
+#             Name: Yixuan Gao
+#             Tsinghua University
+#             (C) Copyright 2024
+# ========================================================
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+# TODO 1: complete the forward and backward propagation processes of the linear layer
+class LinearFunction(torch.autograd.Function):
+    '''
+    we will implement the linear function:
+    y = xW^T + b
+    as well as its gradient computation process
+    '''
+
+    @staticmethod
+    def forward(ctx, x, W, b):
+        '''
+        Input:
+        :param ctx: a context object that can be used to stash information for backward computation
+        :param x: input features with size [batch_size, input_size]
+        :param W: weight matrix with size [output_size, input_size]
+        :param b: bias with size [output_size]
+        Return:
+        y :output features with size [batch_size, output_size]
+        '''
+
+        y = torch.matmul(x, W.T) + b
+        # x and W are needed again to compute the gradients in backward()
+        ctx.save_for_backward(x, W)
+
+        return y
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        '''
+        Input:
+        :param ctx: a context object with saved variables
+        :param grad_output: dL/dy, with size [batch_size, output_size]
+        Return:
+        grad_input: dL/dx, with size [batch_size, input_size]
+        grad_W: dL/dW, with size [output_size, input_size], summed for data in the batch
+        grad_b: dL/db, with size [output_size], summed for data in the batch
+        '''
+
+        # `saved_tensors` is the supported accessor (`saved_variables` is deprecated)
+        x, W = ctx.saved_tensors
+
+        # dL/dx = dL/dy * W
+        grad_input = torch.matmul(grad_output, W)
+        # dL/dW = (dL/dy)^T * x, which sums the contributions of the whole batch
+        grad_W = torch.matmul(grad_output.T, x)
+        # dL/db = sum of dL/dy over the batch
+        grad_b = grad_output.sum(0)
+
+        return grad_input, grad_W, grad_b
+
+
+# TODO 2: complete the forward and backward propagation processes of the hinge loss
+class Hinge(torch.autograd.Function):
+    """
+    Hinge loss with L2 regularization for SVM:
+    loss = 0.5*||w||^2 + C*\sum_i{max(0, 1 - y_i*output_i)}
+    together with its hand-written gradient computation
+    """
+
+    @staticmethod
+    def forward(ctx, output, W, label, C):
+        """
+        Compute the hinge loss
+        --------------------------------------
+        :param ctx: a context object that can be used to stash information for backward computation
+        :param output: the output of the linear layer with size [batch_size, 1], i.e. output = W^T*x + b
+        :param W: weight matrix with size [1, input_size]
+        :param label: the ground truth y in the equation for loss calculation, with size [batch_size]
+        :param C: the regularization coefficient of hinge loss with size [1, 1]
+        :return: the hinge loss with size [1, 1]
+        """
+        C = C.type_as(W)
+
+        # 1 - y_i*output_i with size [batch_size, 1]; the transposes align the
+        # [batch_size, 1] output with the [batch_size] label
+        margin = 1 - (output.T * label).T
+        # F.relu implements max(0, .)
+        loss = 1/2 * (W @ W.T) + C * F.relu(margin).sum()
+        ctx.save_for_backward(output, W, label, C)
+
+        return loss
+
+    @staticmethod
+    def backward(ctx, grad_loss):
+        """
+        Compute the gradient of hinge loss
+        :param ctx: a context object with saved variables
+        :param grad_loss: dL/dloss, with size [1, 1], the gradient of the final target loss with respect to the output (variable 'loss') of the forward function
+        :return:
+            grad_output: dL/doutput, with size [batch_size, 1]
+            grad_W: dL/dW, with size [1, channels]
+            None, None: no gradient is returned for label and C
+        """
+        output, W, label, C = ctx.saved_tensors
+
+        margin = 1 - (output.T * label).T
+        # step function of the margin: 1 where margin >= 0 (the value 1 is used at exactly 0), else 0
+        active = torch.heaviside(margin, torch.tensor(1).type_as(output))
+        # d loss / d output_i = -C * y_i for every active sample
+        grad_output = grad_loss * C * (active.T * (-label)).T
+        # gradient of the regularization term 0.5*||w||^2
+        grad_W = grad_loss * W
+        return grad_output, grad_W, None, None
+
+
+# TODO 3: complete the structure of SVM model
+class SVM_HINGE(nn.Module):
+    """
+    Linear SVM trained with the hinge loss: a linear layer (LinearFunction) followed by
+    the regularized hinge loss (Hinge)
+    """
+
+    def __init__(self, in_channels, C):
+        """
+        :param in_channels: number of feature channels for SVM input
+        :param C: regularization coefficient of hinge loss (a number, stored as a [1, 1] tensor)
+        """
+        super().__init__()
+
+        # W has the shape [1, channels] and b has the shape [1, ]; both are trainable
+        # parameters initialized from a standard normal distribution (torch.randn)
+        self.W = nn.Parameter(torch.randn(1, in_channels), requires_grad=True)
+        self.b = nn.Parameter(torch.randn(1, ), requires_grad=True)
+        # plain tensor (not a parameter or buffer), so it is not part of the state dict
+        self.C = torch.tensor([[C]], requires_grad=False)
+
+    def forward(self, x, label=None):
+        """
+        :param x: input features with size [batch_size, in_channels]
+        :param label: optional ground truth labels; when given, the hinge loss is computed
+        :return:
+            output: predicted class for each sample, +1 where the linear score is > 0, otherwise -1
+            loss: the hinge loss, or None when no label is given
+        """
+        # SVM calculation
+        output = LinearFunction.apply(x, self.W, self.b)
+        if label is not None:
+            loss = Hinge.apply(output, self.W, label, self.C)
+        else:
+            loss = None
+        # map the sign of the score to {-1, +1}
+        output = (output > 0.0).type_as(x) * 2.0 - 1.0
+        return output, loss
--- a/hw3/code/test_svm.py
+++ b/hw3/code/test_svm.py
@@ -0,0 +1,110 @@
+# ========================================================
+#             Media and Cognition
+#             Homework 3 Support Vector Machine
+#             test_svm.py - Test svm model for traffic sign
+#             Student ID: 2022010639
+#             Name: Yixuan Gao
+#             Tsinghua University
+#             (C) Copyright 2024
+# ========================================================
+
+# ==== Part 1: import libs
+import argparse
+import torch
+from datasets import Traffic_Dataset
+from svm_hw import SVM_HINGE
+from torch.utils.data import DataLoader
+import os.path
+
+
+# ==== Part 2: testing
+def test(
+    data_root,
+    model_save_path,
+    device,
+):
+    """
+    The main testing procedure of SVM model
+    ----------------------------
+    :param data_root: path to the root directory of dataset (must contain 'test.pt')
+    :param model_save_path: path to pretrained SVM model
+    :param device: device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
+    """
+
+    # =================== load the pretrained SVM model ==================================
+
+    # testing data loader: one sample per batch, in the stored order
+    test_data = Traffic_Dataset(os.path.join(data_root, 'test.pt'))
+    test_loader = DataLoader(test_data, batch_size=1, shuffle=False)
+
+    # load the checkpoint; map_location lets a checkpoint saved on GPU be restored
+    # on a machine without CUDA
+    model_svm = torch.load(model_save_path, map_location=device)
+
+    # rebuild the SVM model from the configuration stored in the checkpoint
+    svm = SVM_HINGE(model_svm["configs"]["feature_channel"], model_svm["configs"]["C"])
+
+    # restore the trained parameters
+    svm.load_state_dict(model_svm["state_dict"])
+
+    # put the model on CPU or GPU
+    svm.to(device)
+
+    # ================================ testing ==============================================
+
+    # set the model in evaluation mode
+    svm.eval()
+
+    # to calculate and save the testing accuracy
+    n_correct = 0.  # number of images that are correctly classified
+    n_feas = 0.  # number of total images
+
+    with torch.no_grad():  # we do not need to compute gradients during testing
+        for feas, label in test_loader:
+            # set data type and device
+            feas = feas.type(torch.float).to(device)
+            label = label.type(torch.float).to(device)
+
+            # without labels the model returns no loss, so the second value is ignored
+            out, _ = svm(feas)
+
+            # 'out' and 'label' have different shapes, so reshape before comparing
+            n_correct += (out.reshape_as(label) == label).sum().item()
+
+            # sum up the total image number
+            n_feas += label.numel()
+
+    # show prediction accuracy
+    acc = 100 * n_correct / n_feas
+    print('Test accuracy = {:.1f}%'.format(acc))
+
+
+if __name__ == "__main__":
+    # set configurations of the testing process
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_root", type=str, default="data",
+                        help="root directory of the preprocessed dataset (must contain test.pt)")
+    parser.add_argument("--device", type=str, help="cpu or cuda")
+    parser.add_argument("--model_save_path", type=str, default="checkpoints/svm.pth",
+                        help="path of the saved SVM model to load")
+
+    args = parser.parse_args()
+    # default to the GPU when one is available
+    if args.device is None:
+        args.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # run the testing procedure
+    test(
+        data_root=args.data_root,
+        model_save_path=args.model_save_path,
+        device=args.device,
+    )
+
+
+
+
+
+
+
+
+
--- a/hw3/code/train_svm.py
+++ b/hw3/code/train_svm.py
@@ -0,0 +1,296 @@
+# ========================================================
+#             Media and Cognition
+#             Homework 3 Support Vector Machine
+#             train_svm.py - Train svm model for traffic sign
+#             Student ID: 2022010639
+#             Name: Yixuan Gao
+#             Tsinghua University
+#             (C) Copyright 2024
+# ========================================================
+
+# ==== Part 1: import libs
+import argparse
+import matplotlib.pyplot as plt
+import torch
+import numpy as np
+import random
+from datasets import Traffic_Dataset
+from svm_hw import SVM_HINGE
+from torch.utils.data import DataLoader
+import os.path
+
+
+# ==== Part 2: training and validation
+def train(
+    data_root,
+    feature_channel,
+    batch_size,
+    n_epoch,
+    lr,
+    C,
+    model_save_path,
+    device,
+):
+    """
+    The main training procedure of SVM model
+    ----------------------------
+    :param data_root: path to the root directory of dataset
+    :param feature_channel: number of feature channels for SVM input
+    :param batch_size: batch size of training
+    :param n_epoch: number of training epochs
+    :param lr: learning rate
+    :param C: regularization coefficient in hinge loss
+    :param model_save_path: path to save SVM model
+    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
+    """
+
+    # TODO 1: construct training and validation data loader with 'Traffic_Dataset' and DataLoader, and set proper values for 'batch_size' and 'shuffle'
+    train_data = Traffic_Dataset(os.path.join(data_root, 'train.pt'))
+    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
+    val_data = Traffic_Dataset(os.path.join(data_root, 'val.pt'))
+    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)
+
+    # scale the regularization coefficient
+    C = C * len(train_loader)
+
+    # TODO: initialize the SVM model
+    svm = SVM_HINGE(feature_channel, C)
+
+    # TODO: put the model on CPU or GPU
+    svm.to(device)
+
+    # TODO: define the Adam optimizer
+    optimizer = torch.optim.Adam(svm.parameters(), lr)
+
+    # to save the training loss, training accuracy, validation accuracy, and the epoch index of each training epoch
+    train_loss = []
+    train_acc = []
+    val_acc = []
+    epochs = []
+
+    for epoch in range(n_epoch):
+        # TODO: save the index of current epoch in the array 'epochs'
+        epochs.append(epoch + 1)
+
+        # TODO 2: ========================= training =======================
+        # TODO: set the model in training mode›
+        svm.train()
+
+        # to calculate and save the training loss and training accuracy
+        total_loss = 0.  # to save total training loss in one epoch
+        n_correct = 0.  # number of images that are correctly classified
+        n_feas = 0.  # number of total images
+
+        # TODO: get a batch of data; you may need enumerate() to iteratively get data from 'train_loader'.
+        # you can refer to previous homework, for example hw2
+        for step, (input, label) in enumerate(train_loader):
+            # TODO: set data type (.float()) and device (.to())
+            input, label = (
+                input.type(torch.float).to(device),
+                label.type(torch.float).to(device)
+            )
+
+            # TODO: clear gradients in the optimizer
+            optimizer.zero_grad()
+
+            # TODO: run the model with hinge loss; the model needs two inputs: feas and labels
+            out, loss = svm(input, label)
+
+            # TODO: back-propagation on the computation graph
+            loss.backward()
+
+            # TODO: sum up of total loss, loss.item() return the value of the tensor as a standard python number
+            total_loss += loss.item()
+
+            # TODO: call a function to update the parameters of the models
+            optimizer.step()
+
+            # TODO: sum up the number of images correctly recognized. note the shapes of 'out' and 'labels' are different
+            n_correct += (out.reshape_as(label) == label).sum().item()
+
+            # TODO: sum up the total image number
+            n_feas += label.numel()
+
+        # average of the total loss for iterations
+        acc = 100 * n_correct / n_feas
+        avg_loss = total_loss / len(train_loader)
+        train_acc.append(acc)
+        train_loss.append(avg_loss)
+        print('Epoch {:02d}: loss = {:.3f}, training accuracy = {:.1f}%'.format(epoch + 1, avg_loss, acc))
+
+        # TODO 3: ========================== Validation ======================================
+
+        # TODO: set the model in evaluation mode
+        svm.eval()
+
+        # to calculate and save the validation accuracy
+        n_correct = 0.  # number of images that are correctly classified
+        n_feas = 0.  # number of total images
+
+        with torch.no_grad():  # we do not need to compute gradients during validation
+            # TODO: inference on the validation dataset, similar to the training stage but use 'val_loader'.
+            for input, label in val_loader:
+                # TODO: set data type (.float()) and device (.to())
+                input, label = (
+                    input.type(torch.float).to(device),
+                    label.type(torch.float).to(device)
+                )
+
+                # TODO: run the model; at the validation step, the model only needs one input: feas
+                # _ refers to a placeholder, which means we do not need the second returned value during validating
+                out, _ = svm(input)
+
+                # TODO: sum up the number of images correctly recognized. note the shapes of 'out' and 'labels' are different
+                n_correct += (out.reshape_as(label) == label).sum().item()
+
+                # TODO: sum up the total image number
+                n_feas += label.numel()
+
+        # show prediction accuracy
+        acc = 100 * n_correct / n_feas
+        print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(epoch + 1, acc))
+        val_acc.append(acc)
+
+    # save model parameters in a file
+    torch.save({'state_dict': svm.state_dict(),
+                'configs': {
+                    'feature_channel': feature_channel,
+                    'C': C}
+                }, model_save_path)
+    print('Model saved in {}\n'.format(model_save_path))
+
+    W = svm.W.data.cpu()
+    b = svm.b.data.cpu()
+
+    # TODO 4: calculate the index of support vectors in training samples using 'train_data.datas' and 'train_data.labels'
+    # 'sv' should be a list in python structure with the shape of [K], where K is the number of support vectors.
+    sv = [idx for idx, (data, label) in enumerate(zip(train_data.datas, train_data.labels)) if label * ((W @ data) + b) <= 1]
+
+    plot(train_loss, train_acc, val_acc, epochs)
+    plot_feature(train_features=train_data.datas, val_features=val_data.datas, train_labels=train_data.labels,
+                 val_labels=val_data.labels, sv=sv, W=W, b=b)
+
+
+def plot_feature(train_features, val_features, train_labels, val_labels, sv, W, b):
+    """
+    Draw the samples,SVM decision boundary, and support vectors
+    ---------------------
+    :param train_features: training samples with the shape of [B, 2]
+    :param val_features: validation samples with the shape of [B, 2]
+    :param train_labels: the labels (chosen from{-1, +1}) corresponding to training samples, with the shape of [B, 1]
+    :param val_labels: the labels (chosen from{-1, +1}) corresponding to validation samples, with the shape of [B, 1]
+    :param sv: a list with the index of support vectors in training samples, with the shape of [K] (K is the number of support vectors)
+    :param W: the weight vector of SVM decision boundary (W^Tx + b), with the shape of [1, feature_channel]
+    :param b: the bias of SVM decision boundary (W^Tx + b), with the shape of [1,]
+    """
+    train_labels = (train_labels > 0.0).int()
+    val_labels = (val_labels > 0.0).int()
+    train_labels[sv] = 2
+    foreground = list(set([i for i in range(train_labels.shape[0] // 2)]) - set(sv))
+    foreground_sv = list(set([i for i in range(train_labels.shape[0] // 2)]) - set(foreground))
+    background = list(set([i + train_labels.shape[0] // 2 for i in range(train_labels.shape[0] // 2)]) - set(sv))
+    background_sv = list(set([i + train_labels.shape[0] // 2 for i in range(train_labels.shape[0] // 2)]) - set(background))
+    f, ax = plt.subplots()
+    plt.title("training dataset")
+    ax.scatter(train_features[foreground, 0], train_features[foreground, 1], marker='.', c='r', label="-1")
+    ax.scatter(train_features[foreground_sv, 0], train_features[foreground_sv, 1], marker='.', c='darkorange',
+               label="-1 (support vector)")
+    ax.scatter(train_features[background, 0], train_features[background, 1], marker='x', c='b', label="+1")
+    ax.scatter(train_features[background_sv, 0], train_features[background_sv, 1], marker='x', c='c',
+               label="+1 (support vector)")
+    x = np.linspace(-20, 20, 100)
+    ax.plot(x, -W[0, 0] / W[0, 1] * x - b / W[0, 1], c='y')
+    ax.legend(loc="best")
+    plt.ylim([-30, 30])
+    plt.show()
+    f, ax = plt.subplots()
+    plt.title("validation dataset")
+    foreground_val = [i for i in range(val_labels.shape[0] // 2)]
+    background_val = [i + val_labels.shape[0] // 2 for i in range(val_labels.shape[0] // 2)]
+    ax.scatter(val_features[foreground_val, 0], val_features[foreground_val, 1], marker='.', c='r', label="-1")
+    ax.scatter(val_features[background_val, 0], val_features[background_val, 1], marker='x', c='b', label="+1")
+    x = np.linspace(-20, 20, 100)
+    ax.plot(x, -W[0, 0] / W[0, 1] * x - b / W[0, 1], c='y')
+    ax.legend(loc="best")
+    plt.ylim([-30, 30])
+    plt.show()
+
+
+def plot(train_loss, train_acc, val_acc, epochs):
+    """
+    Draw loss and accuracy curve
+    ------------------
+    :param train_loss: a list with loss of each training epoch
+    :param train_acc: a list with accuracy on training dataset of each training epoch
+    :param val_acc: a list with accuracy on validation dataset of each training epoch
+    :param epochs: a list with the index of all training epochs
+    """
+
+    # draw the training loss curve
+    f, ax = plt.subplots()
+    plt.title("Training Loss")
+    ax.plot(epochs, train_loss, color="tab:blue")
+    ax.set_xlabel("Training epoch")
+    ax.set_ylabel("Loss")
+    ax.legend(["training loss"], loc="best")
+    plt.show()
+
+    # draw the accuracy curve
+    f, ax = plt.subplots()
+    plt.title("Training and Validation Accuracy")
+    ax.plot(epochs, train_acc, color="tab:orange")
+    ax.plot(epochs, val_acc, color="tab:green")
+    ax.legend(["training accuracy","validation accuracy"], loc="best")
+    ax.set_xlabel("Training epoch")
+    ax.set_ylabel("Accuracy")
+    ax.set_ylim(0, 101)
+    plt.show()
+
+
+if __name__ == "__main__":
+    # set random seed for reproducibility
+    seed = 2024
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+
+    # set configurations of the model and training process
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_root", type=str, default="data", help="file list of training image paths and labels",)
+    parser.add_argument("--n_epoch", type=int, default=50, help="number of training epochs")
+    parser.add_argument("--batch_size", type=int, default=20, help="training batch size")
+    parser.add_argument("--lr", type=float, default=1e-2, help="learning rate")
+    parser.add_argument("--C", type=float, default=1e-3, help="regularization coefficient in hinge loss")
+    parser.add_argument("--device", type=str, help="cpu or cuda")
+    parser.add_argument("--feature_channel", type=int, default=2, help="number of pre-extracted feature channel by pretrained network")
+    parser.add_argument("--model_save_path", type=str, default="checkpoints/svm.pth", help="path to save SVM model")
+
+    args = parser.parse_args()
+    if args.device is None:
+        args.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # run the training procedure
+    train(
+        data_root=args.data_root,
+        feature_channel=args.feature_channel,
+        batch_size=args.batch_size,
+        n_epoch=args.n_epoch,
+        lr=args.lr,
+        C=args.C,
+        model_save_path=args.model_save_path,
+        device=args.device,
+    )
+
+
+
+
+
+
+
+
+
+
+
--- a/hw3/report/dtx-style.sty
+++ b/hw3/report/dtx-style.sty
@@ -0,0 +1,132 @@
+%%
+%% This is file `dtx-style.sty',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% thucoursework.dtx  (with options: `dtx-style')
+%% 
+%% This is a generated file.
+%% 
+%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
+%% 
+%% This work may be distributed and/or modified under the
+%% conditions of the LaTeX Project Public License, either version 1.3
+%% of this license or (at your option) any later version.
+%% The latest version of this license is in
+%%   http://www.latex-project.org/lppl.txt
+%% and version 1.3 or later is part of all distributions of LaTeX
+%% version 2005/12/01 or later.
+%% 
+%% To produce the documentation run the original source files ending with `.dtx'
+%% through LaTeX.
+%% 
+
+\ProvidesPackage{dtx-style}
+\RequirePackage{hypdoc}
+\RequirePackage[UTF8,scheme=chinese]{ctex}
+\RequirePackage{newpxtext}
+\RequirePackage{newpxmath}
+\RequirePackage[
+  top=2.5cm, bottom=2.5cm,
+  left=4cm, right=2cm,
+  headsep=3mm]{geometry}
+\RequirePackage{array,longtable,booktabs}
+\RequirePackage{listings}
+\RequirePackage{fancyhdr}
+\RequirePackage{xcolor}
+\RequirePackage{enumitem}
+\RequirePackage{etoolbox}
+\RequirePackage{metalogo}
+
+\colorlet{thu@macro}{blue!60!black}
+\colorlet{thu@env}{blue!70!black}
+\colorlet{thu@option}{purple}
+\patchcmd{\PrintMacroName}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
+\patchcmd{\PrintDescribeMacro}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
+\patchcmd{\PrintDescribeEnv}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
+\patchcmd{\PrintEnvName}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
+
+\def\DescribeOption{%
+  \leavevmode\@bsphack\begingroup\MakePrivateLetters%
+  \Describe@Option}
+\def\Describe@Option#1{\endgroup
+  \marginpar{\raggedleft\PrintDescribeOption{#1}}%
+  \thu@special@index{option}{#1}\@esphack\ignorespaces}
+\def\PrintDescribeOption#1{\strut \MacroFont\bfseries\sffamily\color{thu@option} #1\ }
+\def\thu@special@index#1#2{\@bsphack
+  \begingroup
+    \HD@target
+    \let\HDorg@encapchar\encapchar
+    \edef\encapchar usage{%
+      \HDorg@encapchar hdclindex{\the\c@HD@hypercount}{usage}%
+    }%
+    \index{#2\actualchar{\string\ttfamily\space#2}
+           (#1)\encapchar usage}%
+    \index{#1:\levelchar#2\actualchar
+           {\string\ttfamily\space#2}\encapchar usage}%
+  \endgroup
+  \@esphack}
+
+\lstdefinestyle{lstStyleBase}{%
+   basicstyle=\small\ttfamily,
+   aboveskip=\medskipamount,
+   belowskip=\medskipamount,
+   lineskip=0pt,
+   boxpos=c,
+   showlines=false,
+   extendedchars=true,
+   upquote=true,
+   tabsize=2,
+   showtabs=false,
+   showspaces=false,
+   showstringspaces=false,
+   numbers=none,
+   linewidth=\linewidth,
+   xleftmargin=4pt,
+   xrightmargin=0pt,
+   resetmargins=false,
+   breaklines=true,
+   breakatwhitespace=false,
+   breakindent=0pt,
+   breakautoindent=true,
+   columns=flexible,
+   keepspaces=true,
+   gobble=2,
+   framesep=3pt,
+   rulesep=1pt,
+   framerule=1pt,
+   backgroundcolor=\color{gray!5},
+   stringstyle=\color{green!40!black!100},
+   keywordstyle=\bfseries\color{blue!50!black},
+   commentstyle=\slshape\color{black!60}}
+
+\lstdefinestyle{lstStyleShell}{%
+   style=lstStyleBase,
+   frame=l,
+   rulecolor=\color{purple},
+   language=bash}
+
+\lstdefinestyle{lstStyleLaTeX}{%
+   style=lstStyleBase,
+   frame=l,
+   rulecolor=\color{violet},
+   language=[LaTeX]TeX}
+
+\lstnewenvironment{latex}{\lstset{style=lstStyleLaTeX}}{}
+\lstnewenvironment{shell}{\lstset{style=lstStyleShell}}{}
+
+\setlist{nosep}
+
+\DeclareDocumentCommand{\option}{m}{\textsf{#1}}
+\DeclareDocumentCommand{\env}{m}{\texttt{#1}}
+\DeclareDocumentCommand{\pkg}{s m}{%
+  \texttt{#2}\IfBooleanF#1{\thu@special@index{package}{#2}}}
+\DeclareDocumentCommand{\file}{s m}{%
+  \texttt{#2}\IfBooleanF#1{\thu@special@index{file}{#2}}}
+\newcommand{\myentry}[1]{%
+  \marginpar{\raggedleft\color{purple}\bfseries\strut #1}}
+\newcommand{\note}[2][Note]{{%
+  \color{magenta}{\bfseries #1}\emph{#2}}}
+
+\def\thucoursework{\textsc{Thu}\-\textsc{Coursework}}
--- a/hw3/report/iidef.sty
+++ b/hw3/report/iidef.sty
@@ -0,0 +1,153 @@
+%%
+%% This is file `iidef.sty',
+%% generated with the docstrip utility.
+%%
+%% The original source files were:
+%%
+%% thucoursework.dtx  (with options: `sty')
+%% 
+%% This is a generated file.
+%% 
+%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
+%% 
+%% This work may be distributed and/or modified under the
+%% conditions of the LaTeX Project Public License, either version 1.3
+%% of this license or (at your option) any later version.
+%% The latest version of this license is in
+%%   http://www.latex-project.org/lppl.txt
+%% and version 1.3 or later is part of all distributions of LaTeX
+%% version 2005/12/01 or later.
+%% 
+%% To produce the documentation run the original source files ending with `.dtx'
+%% through LaTeX.
+%% 
+
+\NeedsTeXFormat{LaTeX2e}[1999/12/01]
+\ProvidesClass{iidef}
+[2020/09/09 2.6 Tsinghua University Coursework Template]
+%% configuration of nested enumerate env
+\RequirePackage{enumitem}
+%% set hwcount key-value option
+\RequirePackage{kvoptions}
+%% required by macro DeclareMathOperator
+\RequirePackage{amsmath}
+%% Set up page headers using with fancyhdr
+\@ifundefined{lhead}{\RequirePackage{fancyhdr}}
+{\def\@thulhead{thulhead}}
+\RequirePackage{amsthm}
+%% semester
+\def\@term{term}
+\newcommand{\theterm}[1]{\renewcommand\@term{#1}}
+%% institute
+\newcommand{\@courseinstitute}[1]{institute}
+\newcommand{\thecourseinstitute}[1]{\renewcommand\@courseinstitute{#1}}
+%% coursename
+\newcommand{\@coursename}[1]{coursename}
+\newcommand{\thecoursename}[1]{\renewcommand\@coursename{\textsc{#1}}}
+%% user can rewrite homework name
+\def\@hwname{Homework}
+\def\hwname#1{\renewcommand\@hwname{#1}}
+%% \iidef@thehwcnt = 1
+\DeclareStringOption[1]{thehwcnt}
+\ProcessKeyvalOptions*
+\def\thehwcnt{\iidef@thehwcnt}
+%% page header setup, distinguish between first page(plain style)
+%% and second page on (runningpage style)
+%%***************************************************************************
+\newcommand{\courseheader}{
+\thispagestyle{plain}%first page use native plain style to suppress header
+\vspace*{-1in}
+\begin{center}
+\@courseinstitute\\
+\@coursename\\
+\@term
+\vspace*{0.1in}
+\hrule
+\end{center}
+\begin{center}
+  \underline{\bf \@hwname\;\thehwcnt} \\
+\end{center}
+}
+\@ifundefined{@thulhead}{
+\fancypagestyle{runningpage}
+{
+  \fancyhead[L]{\small\@coursename}
+  \fancyhead[R]{\small\@courseinstitute}
+}
+%% use runningpage style from second page on
+\pagestyle{runningpage}
+}{}
+%% *********************************************************************************************
+%%name command macro
+%%*************************
+\newcommand{\name}[1]{
+\begin{flushleft}
+  #1\hfill
+  \today
+\end{flushleft}
+\hrule
+
+\vspace{2em}
+
+\flushleft
+}
+%%*************************
+%% enumitem related configuration
+\setlist[enumerate,1]{label=\thehwcnt.\arabic*.}
+\setlist[enumerate,2]{label=(\alph*)}
+\setlist[enumerate,3]{label=\roman*.}
+\setlist[enumerate,4]{label=\greek*}
+%%******************************
+\def\@slname{Solution}
+\def\slname#1{\renewcommand\@slname{#1}}
+
+\@ifundefined{solution}{
+\newenvironment{solution}
+{
+\proof[\@slname]
+}
+{
+%% no qed symbol in solution env
+\renewcommand{\qedsymbol}{}
+\endproof
+}
+}{}
+%%******************************
+%%common math symbols go here
+%%*************************************************
+\def\v#1{\underline{#1}}
+\newcommand{\uc}{\underline{c}}    % c, vec
+\newcommand{\uv}{\underline{v}}    % v, vec
+\newcommand{\uw}{\underline{w}}    % w, vec
+\newcommand{\ux}{\underline{x}}    % x, vec
+\newcommand{\uy}{\underline{y}}    % y, vec
+\newcommand{\uz}{\underline{z}}    % z, vec
+\newcommand{\um}{\underline{m}}    % m, vec
+\newcommand{\rvx}{\mathsf{x}}    % x, r.v.
+\newcommand{\rvy}{\mathsf{y}}    % y, r.v.
+\newcommand{\rvz}{\mathsf{z}}    % z, r.v.
+\newcommand{\rvw}{\mathsf{w}}    % w, r.v.
+\newcommand{\rvH}{\mathsf{H}}    % H, r.v.
+\newcommand{\urvx}{\underline{\mathsf{x}}}    % x, r.v. vec
+\newcommand{\urvy}{\underline{\mathsf{y}}}    % y, r.v. vec
+\newcommand{\urvz}{\underline{\mathsf{z}}}    % z, r.v. vec
+\newcommand{\urvw}{\underline{\mathsf{w}}}    % w, r.v. vec
+
+\newcommand{\defas}{\triangleq} %\coloneqq
+\newcommand{\reals}{\mathbb{R}}
+\newcommand{\TT}{\mathrm{T}}    % transpose
+\DeclareMathOperator*{\argmax}{arg\,max}
+\DeclareMathOperator*{\argmin}{arg\,min}
+\DeclareMathOperator*{\argsup}{arg\,sup}
+\DeclareMathOperator*{\arginf}{arg\,inf}
+\DeclareMathOperator{\diag}{diag}
+\DeclareMathOperator{\Var}{Var}
+\DeclareMathOperator{\Cov}{Cov}
+\DeclareMathOperator{\MSE}{MSE}
+\DeclareMathOperator{\1}{\mathds{1}}
+\DeclareMathOperator{\In}{\mathbb{I}}
+\DeclareMathOperator{\E}{\mathbb{E}}
+\DeclareMathOperator{\Prob}{\mathbb{P}}
+\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
+\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
+%%************************************************************************************
--- a/hw3/report/img/check/check.png
+++ b/hw3/report/img/check/check.png
--- a/hw3/report/img/preprocess/preprocess_test.png
+++ b/hw3/report/img/preprocess/preprocess_test.png
--- a/hw3/report/img/preprocess/preprocess_train.png
+++ b/hw3/report/img/preprocess/preprocess_train.png
--- a/hw3/report/img/preprocess/preprocess_val.png
+++ b/hw3/report/img/preprocess/preprocess_val.png
--- a/hw3/report/img/train/1/accu.png
+++ b/hw3/report/img/train/1/accu.png
--- a/hw3/report/img/train/1/loss.png
+++ b/hw3/report/img/train/1/loss.png
--- a/hw3/report/img/train/1/sv.png
+++ b/hw3/report/img/train/1/sv.png
--- a/hw3/report/img/train/1/test.png
+++ b/hw3/report/img/train/1/test.png
--- a/hw3/report/img/train/1/val.png
+++ b/hw3/report/img/train/1/val.png
--- a/hw3/report/img/train/1e-6/accu.png
+++ b/hw3/report/img/train/1e-6/accu.png
--- a/hw3/report/img/train/1e-6/loss.png
+++ b/hw3/report/img/train/1e-6/loss.png
--- a/hw3/report/img/train/1e-6/sv.png
+++ b/hw3/report/img/train/1e-6/sv.png
--- a/hw3/report/img/train/1e-6/test.png
+++ b/hw3/report/img/train/1e-6/test.png
--- a/hw3/report/img/train/1e-6/val.png
+++ b/hw3/report/img/train/1e-6/val.png
--- a/hw3/report/img/train/default/loss.png
+++ b/hw3/report/img/train/default/loss.png
--- a/hw3/report/img/train/default/sv.png
+++ b/hw3/report/img/train/default/sv.png
--- a/hw3/report/img/train/default/test.png
+++ b/hw3/report/img/train/default/test.png
--- a/hw3/report/img/train/default/train_accu.png
+++ b/hw3/report/img/train/default/train_accu.png
--- a/hw3/report/img/train/default/val.png
+++ b/hw3/report/img/train/default/val.png
--- a/hw3/report/main.tex
+++ b/hw3/report/main.tex
@@ -0,0 +1,379 @@
+% Homework Template
+\documentclass[a4paper]{article}
+\usepackage{ctex}
+\usepackage{amsmath, amssymb, amsthm}
+\usepackage{moreenum}
+\usepackage{mathtools}
+\usepackage{url}
+\usepackage{bm}
+\usepackage{enumitem}
+\usepackage{graphicx}
+\usepackage{subcaption}
+\usepackage{booktabs} % toprule
+\usepackage[mathcal]{eucal}
+\usepackage[thehwcnt = 3]{iidef}
+\usepackage{listings}
+\usepackage{fontspec}
+\usepackage{xcolor}
+\usepackage{float}
+\usepackage{siunitx}
+
+\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
+\newfontfamily\cascadia{Cascadia Code}
+
+\lstset{
+    basicstyle          =   \small\codefont,
+    % ---
+    tabsize             =   4,
+    showstringspaces    =   false,
+    numbers             =   left,
+    numberstyle         =   \codefont,
+    % ---
+    breaklines          =   true,
+    captionpos          =   t,      
+    % ---
+    frame               =   l,
+    flexiblecolumns,
+}
+
+\lstdefinestyle{Python}{
+    language        =   Python, % 语言选Python
+    keywordstyle    =   \color{blue},
+    keywordstyle    =   [2] \color{teal},
+    stringstyle     =   \color{orange!80!black},
+    commentstyle    =   \color{red},
+    identifierstyle =   \color{blue!80!white},
+}
+
+\lstdefinestyle{Bash}{
+    language        =   bash
+}
+
+\thecourseinstitute{清华大学电子工程系}
+\thecoursename{\textbf{媒体与认知}}
+\theterm{2023-2024学年春季学期}
+\hwname{作业}
+\begin{document}
+\courseheader
+% 请在YOUR NAME处填写自己的姓名
+\name{高艺轩}
+\vspace{3mm}
+\centerline{\textbf{\Large{理论部分}}}
+
+\section{单选题（15分）}
+% 请在？处填写答案
+\subsection{\underline{D}}
+
+\subsection{\underline{C}}
+
+\subsection{\underline{D}}
+
+\subsection{\underline{D}}
+
+\subsection{\underline{B}}
+
+\section{计算题（15 分）}
+
+
+\subsection{给定两个类别的样本分别为:
+\begin{align*}
+     &\omega_1:\{(3,1),(2,2),(4,3),(3,2)\} \\
+   &\omega_2:\{(1,3),(1,2),(-1,1),(-1,2)\}
+\end{align*}
+试利用LDA，将样本特征维数压缩为一维。
+}
+
+\begin{proof}[解]
+    首先计算$\mu_1 = (3, 2), \mu_2 = (0, 2), \mu = (1.5, 2)$。因此
+    \[S_1 = \frac{1}{4}
+    \left(
+        \begin{bmatrix}
+            0 & 0\\
+            0 & 1
+        \end{bmatrix}
+        +
+        \begin{bmatrix}
+            1 & 0\\
+            0 & 0
+        \end{bmatrix}
+        +
+        \begin{bmatrix}
+            1 & 1\\
+            1 & 1
+        \end{bmatrix}
+        +
+        \begin{bmatrix}
+            0 & 0\\
+            0 & 0
+        \end{bmatrix}
+    \right)
+    =
+    \begin{bmatrix}
+        0.5 & 0.25\\
+        0.25 & 0.5
+    \end{bmatrix}\]
+    \[S_2 = \frac{1}{4}
+    \left(
+        \begin{bmatrix}
+            1 & 1\\
+            1 & 1
+        \end{bmatrix}
+        +
+        \begin{bmatrix}
+            1 & 0\\
+            0 & 0
+        \end{bmatrix}
+        +
+        \begin{bmatrix}
+            1 & 1\\
+            1 & 1
+        \end{bmatrix}
+        +
+        \begin{bmatrix}
+            1 & 0\\
+            0 & 0
+        \end{bmatrix}
+    \right)
+    =
+    \begin{bmatrix}
+        1 & 0.5\\
+        0.5 & 0.5
+    \end{bmatrix}\]
+    进一步地，
+    \[S_w = \frac{1}{2} (S_1 + S_2) = 
+    \begin{bmatrix}
+        0.75 & 0.375\\
+        0.375 & 0.5
+    \end{bmatrix}\]
+    \[S_b = \frac{1}{2} \left(
+        \begin{bmatrix}
+            2.25 & 0\\
+            0 & 0
+        \end{bmatrix}
+        +
+        \begin{bmatrix}
+            2.25 & 0\\
+            0 & 0
+        \end{bmatrix}
+    \right)
+    =
+    \begin{bmatrix}
+        2.25 & 0\\
+        0 & 0
+    \end{bmatrix}\]
+    对$S_w^{-1} S_b$进行特征值分解得到$\lambda = 4.8$，$v = (0.8, -0.6)$。投影后的样本为
+    \[\omega_1: \left\{1.8, 0.4, 1.4, 1.2\right\}\]
+    \[\omega_2: \left\{-1.0, -0.4, -1.4, -2.0\right\}\]
+\end{proof}
+
+
+
+\vspace{3mm}
+\subsection{模型训练通常需要大量的数据，假设某采集的数据集包含80\%的有效数据和20\%的无效数据。采用一种算法判断数据是否有效，其中无效数据被成功判别为无效数据的概率为90\%，而有效数据被误判为无效数据的概率为5\%。如果某条数据经过该算法被判别为无效数据，则根据贝叶斯定理，这条数据是无效数据的概率是多少？(提示：全概率公式$P(Y)=\sum^{N}_{i=1}P(Y|X_i)P(X_i)$)\\}
+
+\begin{proof}[解]
+    \begin{align*}
+        & P(\text{无效数据} \mid \text{判定无效})\\
+        = & \frac{p(\text{判定无效} \mid \text{无效数据})p(\text{无效数据})}{p(\text{判定无效} \mid \text{无效数据})p(\text{无效数据}) + p(\text{判定无效} \mid \text{有效数据})p(\text{有效数据})}\\
+        = & \frac{0.9 \times 0.2}{0.9 \times 0.2 + 0.05 \times 0.8}\\
+        = & \frac{0.18}{0.18 + 0.04}\\
+        = & \frac{9}{11} 
+    \end{align*}
+\end{proof}
+
+\vspace{3mm}
+\subsection{设有两类正态分布的样本集，第一类均值为$\mu_1=[2,-1]^T$，第二类均值为$\mu_2=[1,1]^T$。两类样本集的协方差矩阵和出现的先验概率都相等：$\Sigma_1=\Sigma_2=\Sigma=\left[ \begin{array}{cc}
+    4 & 2 \\
+    2 & \frac{4}{3}
+\end{array} \right]$，$p(\omega_1)=p(\omega_2)$。试计算分类界面，并对特征向量$x=[6,2]^T$分类。}
+
+\begin{proof}[解]
+    \[\Sigma^{-1} = \begin{bmatrix}
+        1 & -1.5\\
+        -1.5 & 3
+    \end{bmatrix}\]
+    决策方程
+    \[g_{LDF1} = (\Sigma^{-1} \mu_1)^T \boldsymbol{x} - \frac{1}{2} \mu_1^T \Sigma^{-1} \mu_1 = (3.5, -6) \boldsymbol{x} - 6.5\]
+    类似地可以得到
+    \[g_{LDF2} = (-0.5, 1.5) \boldsymbol{x} - 0.5\]
+    因此分类界面为
+    \begin{align*}
+        (3.5, -6) \boldsymbol{x} - 6.5 & = (-0.5, 1.5) \boldsymbol{x} - 0.5\\
+        (4, -7.5) \boldsymbol{x} & = 6
+    \end{align*}
+    对于$(6, 2)$，计算$g_{LDF1}((6, 2)) = 2.5$，$g_{LDF2}((6, 2)) = -0.5$，因此属于第一类。
+\end{proof}
+
+\vspace{3mm}
+\subsection{给定异或的样本集$D=\left\{\left((0,0)^T,-1\right),\left((0,1)^T,1\right),\left((1,0)^T,1\right),\left((1,1)^T,-1\right)\right\}$该样本集是线性不可分的，可采用如下所示的多项式函数$\phi(\mathbf{x})$将样本$D=\left\{(\mathbf{x}_n,y_n)\right\}$映射为$D_\phi=\left\{(\phi(\mathbf{x}_n),y_n)\right\}$，其中$\phi(\mathbf{x})$满足
+\begin{equation*}
+\begin{aligned}
+    \phi_1(\mathbf{x})&=2(x_1-0.5) \\
+    \phi_2(\mathbf{x})&=4(x_1-0.5)(x_2-0.5)
+\end{aligned}
+\end{equation*}
+\\
+\qquad(1) 给出映射后的样本集；\\
+\qquad(2) 在映射后的样本集中，设计一个线性SVM分类器，给出支持向量及分类界面。
+}
+
+\begin{proof}[解]
+    映射后的样本集
+    \[D_{\phi} = \left\lbrace\left((-1, 1)^T, -1\right), \left((-1, -1)^T, 1\right), \left((1, -1)^T, 1\right), \left((1, 1)^T, -1\right)\right\rbrace\]
+
+    记映射后的样本为$\boldsymbol{x}_i = \phi(\mathbf{x}_i)$，待优化的对偶问题为
+    \[L(\boldsymbol{\alpha}) = \sum_{i = 1}^4 \alpha_i - \frac{1}{2} \sum_{i = 1}^4 \sum_{j = 1}^4 \alpha_i \alpha_j y_i y_j \boldsymbol{x}_i^T \boldsymbol{x}_j\]
+    约束条件为$\sum_{i = 1}^4 \alpha_i y_i = 0$，$\alpha_i \geq 0$。因此
+    \begin{align*}
+        \frac{\partial L}{\partial \alpha_1} & = 1 - \sum_{i = 1}^4 \alpha_i y_1 y_i \boldsymbol{x}_1^T \boldsymbol{x}_i\\
+        & = 1 - 2 \alpha_1 - 2 \alpha_3\\
+        \frac{\partial L}{\partial \alpha_2} & = 1 - 2 \alpha_2 - 2 \alpha_4\\
+        \frac{\partial L}{\partial \alpha_3} & = 1 - 2 \alpha_1 - 2 \alpha_3\\
+        \frac{\partial L}{\partial \alpha_4} & = 1 - 2 \alpha_2 - 2 \alpha_4
+    \end{align*}
+    令四个偏导数均为0，得到$\alpha_1 + \alpha_3 = \alpha_2 + \alpha_4 = \frac{1}{2}$；结合约束$\sum_{i} \alpha_i y_i = 0$，可取对称解$\alpha_1 = \alpha_2 = \alpha_3 = \alpha_4 = \frac{1}{4}$。全部的点均为支持向量。因此
+    \[\boldsymbol{w} = \sum_{i = 1}^4 \alpha_i y_i \boldsymbol{x}_i = \left(0, -1\right)\]
+
+    为求偏置量，代入$\boldsymbol{x}_1$：
+    \[(-1) (\boldsymbol{w}^T \boldsymbol{x}_1 + b) = 1\]
+    得到$b = 0$。
+
+    分类界面$\boldsymbol{w}^T \boldsymbol{x} + b = 0$，即
+    \[\begin{bmatrix}
+        0\\-1
+    \end{bmatrix}^T \boldsymbol{x} = 0\]
+    得到映射空间中$\phi_2(\mathbf{x}) = 0$，因此在原空间中，
+    \[4(x_1 - 0.5)(x_2 - 0.5) = 0\]
+
+\end{proof}
+
+
+
+\vspace{3mm}
+\subsection{使用KMeans算法对2维空间中的6个点$(0,2)$,$(2,0)$,$(2,3)$,$(3,2)$,$(4,0)$,$(5,4)$进行聚类，距离函数选择欧氏距离$d=\sqrt{(x_1-x_2)^2+(y_1-y_2)^2}$。\\
+\qquad (1)起始聚类中心选择(0,0)和(4,3)，计算聚类中心；\\
+\qquad (2)起始聚类中心选择(1,4)和(3,1)，计算聚类中心。\\
+}
+
+\begin{proof}[解]
+    中心选择$(0, 0), (4, 3)$，第一次分为$(0, 2), (2,0)$与$(2, 3), (3, 2), (4, 0), (5, 4)$，更新后的中心为$(1, 1)$与$\left(\frac{7}{2}, \frac{9}{4}\right)$。收敛。
+
+    中心选择$(1, 4)$与$(3, 1)$，第一次分为$(0, 2), (2, 3)$与$(2, 0), (4, 0), (3, 2), (5, 4)$，更新后中心为$(1, \frac{5}{2})$与$(\frac{7}{2}, \frac{3}{2})$，收敛。
+\end{proof}
+
+\vspace{3mm}
+\centerline{\textbf{\Large{编程部分}}}
+
+
+\vspace{3mm}
+% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题进度汇报”中的一项完成
+\section{编程作业报告}
+\subsection{程序验证}
+与助教给出的图片相比，我写出的程序PCA得到的结果的xy坐标都在$[-1, 1]$之间，不利于之后的分类。我将所有的PCA之后的坐标都扩大了20倍。
+
+运行\lstinline{check.py}进行检查：
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=\linewidth]{img/check/check.png}
+\end{figure}
+
+\subsection{数据预处理}
+运行
+\begin{lstlisting}[style=Bash]
+python data_preprocess.py
+\end{lstlisting}
+得到的输出为
+\begin{figure}[H]
+    \centering
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/preprocess/preprocess_train.png}
+        \caption{训练集preprocess结果}
+    \end{subfigure}
+    \hspace{0.5cm}
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/preprocess/preprocess_val.png}
+        \caption{验证集preprocess结果}
+    \end{subfigure}\\[2ex]
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/preprocess/preprocess_test.png}
+        \caption{测试集preprocess结果}
+    \end{subfigure}
+\end{figure}
+
+\subsection{训练、验证及测试}
+\begin{figure}[H]
+    \centering
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/default/loss.png}
+    \end{subfigure}
+    \hspace{0.5cm}
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/default/train_accu.png}
+    \end{subfigure}\\[2ex]
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/default/sv.png}
+    \end{subfigure}
+    \hspace{0.5cm}
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/default/val.png}
+    \end{subfigure}\\[2ex]
+    \begin{subfigure}[t]{.8\linewidth}
+        \includegraphics[width=\textwidth]{img/train/default/test.png}
+    \end{subfigure}
+\end{figure}
+
+\subsection{调整正则化系数}
+\subsubsection{C = \num{1e-6}}
+\begin{figure}[H]
+    \centering
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/1e-6/loss.png}
+    \end{subfigure}
+    \hspace{0.5cm}
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/1e-6/accu.png}
+    \end{subfigure}\\[2ex]
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/1e-6/sv.png}
+    \end{subfigure}
+    \hspace{0.5cm}
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/1e-6/val.png}
+    \end{subfigure}\\[2ex]
+    \begin{subfigure}[t]{.8\linewidth}
+        \includegraphics[width=\textwidth]{img/train/1e-6/test.png}
+    \end{subfigure}
+\end{figure}
+可以看到出现了严重的欠拟合，分类界面超出了绘图的范围。这是因为C过小，导致不能正确地分辨合适的分类界面。
+
+\subsubsection{C = 1}
+\begin{figure}[H]
+    \centering
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/1/loss.png}
+    \end{subfigure}
+    \hspace{0.5cm}
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/1/accu.png}
+    \end{subfigure}\\[2ex]
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/1/sv.png}
+    \end{subfigure}
+    \hspace{0.5cm}
+    \begin{subfigure}[t]{.45\linewidth}
+        \includegraphics[width=\textwidth]{img/train/1/val.png}
+    \end{subfigure}\\[2ex]
+    \begin{subfigure}[t]{.8\linewidth}
+        \includegraphics[width=\textwidth]{img/train/1/test.png}
+    \end{subfigure}
+\end{figure}
+发生了过拟合，直线被交界面的点限制，斜率不是最优。
+
+\end{document}
+
+
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% End:
--- a/j.ps1
+++ b/j.ps1
@@ -1 +1 @@
-cd ./hw2/code
+cd ./hw3/code
--- a/testtorch.ipynb
+++ b/testtorch.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -152,6 +152,66 @@
    "print(conv_1(a).size())\n",
    "print(conv_2(conv_1(a)).size())\n"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([0., 1.])\n",
+      "1\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = torch.Tensor([1.0, 2.0])\n",
+    "b = torch.Tensor([1.0, 1.0])\n",
+    "print((a > b).type_as(a))\n",
+    "print((a == b).sum().item())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor(2.5000)\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = torch.Tensor([[1.0, 2.0], [3.0, 4.0]])\n",
+    "mu = a.mean(dim=0)\n",
+    "print(mu, a - mu)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[5.],\n",
+      "        [4.]])\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = torch.Tensor([[5], [4]])\n",
+    "b = torch.Tensor([1])\n",
+    "print((a.T * b).T)"
+   ]
  }
 ],
 "metadata": {