11 Commits

Author SHA1 Message Date
69e52e0e50 Merge pull request 'Homework4 Submit' (#5) from homework4 into main
Reviewed-on: #5
2024-05-27 00:04:04 +08:00
unlockable
76a643ebc4 Homework4 Submit. 2024-05-27 00:01:48 +08:00
f1459069da Merge pull request 'Submit homework 3.' (#4) from homework3 into main
Reviewed-on: #4
2024-05-22 20:24:30 +08:00
unlockable
c6b2420b85 TA Release homework4. 2024-05-22 20:22:47 +08:00
unlockable
c850f38778 Homework3 Submit 2024-05-18 16:23:40 +08:00
unlockable
820f679067 SVM and PCA not working 2024-05-18 00:12:06 +08:00
unlockable
81de7b1d58 feat(hw3): Copy file from hw2 2024-05-16 17:41:27 +08:00
unlockable
b741c9d08e feat(hw3): Non program part of the homework 2024-05-16 17:38:56 +08:00
unlockable
8b657be441 Mac Sync 2024-05-15 20:05:18 +08:00
unlockable
4bc3f77879 TA release homework3. 2024-05-01 17:13:51 +08:00
121ca13130 Merge pull request 'Submit Homework 2' (#3) from homework2 into main
Reviewed-on: #3
2024-04-15 21:57:18 +08:00
67 changed files with 52780 additions and 1319 deletions

27
.gitignore vendored
View File

@@ -1,12 +1,15 @@
*.zip
__pycache__/
*.pth
*.log
*.aux
*.synctex.gz
*.synctex.gz(busy)
*.out
*.pdf
.DS_Store
hw2/code/checkpoints/
hw2/code/visualized/
*.zip
__pycache__/
*.pth
*.log
*.aux
*.synctex.gz
*.synctex.gz(busy)
*.out
*.pdf
.DS_Store
hw2/code/checkpoints/
hw2/code/visualized/
hw3/code/data/
hw3/code/checkpoints/
hw4/code/workdirs/

View File

@@ -1,4 +1,4 @@
{
"python.analysis.typeCheckingMode": "basic",
"python.analysis.autoImportCompletions": true
{
"python.analysis.typeCheckingMode": "basic",
"python.analysis.autoImportCompletions": true
}

View File

@@ -1,56 +1,56 @@
Epoch 01: loss = inf
Epoch 02: loss = inf
Epoch 03: loss = 6.678
Epoch 04: loss = 4.361
Epoch 05: loss = 3.110
Epoch 06: loss = 2.099
Epoch 07: loss = 1.698
Epoch 08: loss = 1.320
Epoch 09: loss = 0.970
Epoch 10: loss = 0.891
Epoch 10: validation accuracy = 66.0%
Epoch 11: loss = 0.817
Epoch 12: loss = 0.723
Epoch 13: loss = 0.512
Epoch 14: loss = 0.353
Epoch 15: loss = 0.202
Epoch 16: loss = 0.182
Epoch 17: loss = 0.184
Epoch 18: loss = 0.191
Epoch 19: loss = 0.175
Epoch 20: loss = 0.166
Epoch 20: validation accuracy = 68.0%
Epoch 21: loss = 0.146
Epoch 22: loss = 0.105
Epoch 23: loss = 0.109
Epoch 24: loss = 0.074
Epoch 25: loss = 0.097
Epoch 26: loss = 0.047
Epoch 27: loss = 0.038
Epoch 28: loss = 0.037
Epoch 29: loss = 0.024
Epoch 30: loss = 0.021
Epoch 30: validation accuracy = 68.8%
Epoch 31: loss = 0.019
Epoch 32: loss = 0.024
Epoch 33: loss = 0.023
Epoch 34: loss = 0.014
Epoch 35: loss = 0.013
Epoch 36: loss = 0.012
Epoch 37: loss = 0.011
Epoch 38: loss = 0.013
Epoch 39: loss = 0.013
Epoch 40: loss = 0.016
Epoch 40: validation accuracy = 70.5%
Epoch 41: loss = 0.015
Epoch 42: loss = 0.009
Epoch 43: loss = 0.011
Epoch 44: loss = 0.008
Epoch 45: loss = 0.008
Epoch 46: loss = 0.010
Epoch 47: loss = 0.009
Epoch 48: loss = 0.007
Epoch 49: loss = 0.007
Epoch 50: loss = 0.010
Epoch 50: validation accuracy = 70.5%
Epoch 01: loss = inf
Epoch 02: loss = inf
Epoch 03: loss = 6.678
Epoch 04: loss = 4.361
Epoch 05: loss = 3.110
Epoch 06: loss = 2.099
Epoch 07: loss = 1.698
Epoch 08: loss = 1.320
Epoch 09: loss = 0.970
Epoch 10: loss = 0.891
Epoch 10: validation accuracy = 66.0%
Epoch 11: loss = 0.817
Epoch 12: loss = 0.723
Epoch 13: loss = 0.512
Epoch 14: loss = 0.353
Epoch 15: loss = 0.202
Epoch 16: loss = 0.182
Epoch 17: loss = 0.184
Epoch 18: loss = 0.191
Epoch 19: loss = 0.175
Epoch 20: loss = 0.166
Epoch 20: validation accuracy = 68.0%
Epoch 21: loss = 0.146
Epoch 22: loss = 0.105
Epoch 23: loss = 0.109
Epoch 24: loss = 0.074
Epoch 25: loss = 0.097
Epoch 26: loss = 0.047
Epoch 27: loss = 0.038
Epoch 28: loss = 0.037
Epoch 29: loss = 0.024
Epoch 30: loss = 0.021
Epoch 30: validation accuracy = 68.8%
Epoch 31: loss = 0.019
Epoch 32: loss = 0.024
Epoch 33: loss = 0.023
Epoch 34: loss = 0.014
Epoch 35: loss = 0.013
Epoch 36: loss = 0.012
Epoch 37: loss = 0.011
Epoch 38: loss = 0.013
Epoch 39: loss = 0.013
Epoch 40: loss = 0.016
Epoch 40: validation accuracy = 70.5%
Epoch 41: loss = 0.015
Epoch 42: loss = 0.009
Epoch 43: loss = 0.011
Epoch 44: loss = 0.008
Epoch 45: loss = 0.008
Epoch 46: loss = 0.010
Epoch 47: loss = 0.009
Epoch 48: loss = 0.007
Epoch 49: loss = 0.007
Epoch 50: loss = 0.010
Epoch 50: validation accuracy = 70.5%
Model saved in ./saved_models/default.pth

View File

@@ -1,2 +1,2 @@
[Info] Load model from .\saved_models\default.pth
[Info] Load model from .\saved_models\default.pth
[Info] Test accuracy = 72.0%

View File

@@ -1,2 +1,2 @@
[Info] Load model from .\saved_models\adam_optim.pth
[Info] Load model from .\saved_models\adam_optim.pth
[Info] Test accuracy = 85.0%

View File

@@ -1,56 +1,56 @@
Epoch 01: loss = inf
Epoch 02: loss = inf
Epoch 03: loss = inf
Epoch 04: loss = inf
Epoch 05: loss = inf
Epoch 06: loss = inf
Epoch 07: loss = inf
Epoch 08: loss = inf
Epoch 09: loss = 3.250
Epoch 10: loss = 2.567
Epoch 10: validation accuracy = 59.0%
Epoch 11: loss = 1.963
Epoch 12: loss = 1.558
Epoch 13: loss = 1.320
Epoch 14: loss = 0.911
Epoch 15: loss = 0.808
Epoch 16: loss = 0.932
Epoch 17: loss = 0.861
Epoch 18: loss = 0.748
Epoch 19: loss = 0.783
Epoch 20: loss = 0.809
Epoch 20: validation accuracy = 65.5%
Epoch 21: loss = 0.678
Epoch 22: loss = 0.757
Epoch 23: loss = 0.747
Epoch 24: loss = 0.660
Epoch 25: loss = 0.536
Epoch 26: loss = 0.506
Epoch 27: loss = 0.577
Epoch 28: loss = 0.600
Epoch 29: loss = 0.681
Epoch 30: loss = 0.604
Epoch 30: validation accuracy = 68.0%
Epoch 31: loss = 0.552
Epoch 32: loss = 0.671
Epoch 33: loss = 0.604
Epoch 34: loss = 0.600
Epoch 35: loss = 0.818
Epoch 36: loss = 0.659
Epoch 37: loss = 0.375
Epoch 38: loss = 0.380
Epoch 39: loss = 0.418
Epoch 40: loss = 0.431
Epoch 40: validation accuracy = 73.5%
Epoch 41: loss = 0.551
Epoch 42: loss = 0.488
Epoch 43: loss = 0.350
Epoch 44: loss = 0.287
Epoch 45: loss = 0.294
Epoch 46: loss = 0.463
Epoch 47: loss = 0.438
Epoch 48: loss = 0.392
Epoch 49: loss = 0.325
Epoch 50: loss = 0.332
Epoch 50: validation accuracy = 80.8%
Epoch 01: loss = inf
Epoch 02: loss = inf
Epoch 03: loss = inf
Epoch 04: loss = inf
Epoch 05: loss = inf
Epoch 06: loss = inf
Epoch 07: loss = inf
Epoch 08: loss = inf
Epoch 09: loss = 3.250
Epoch 10: loss = 2.567
Epoch 10: validation accuracy = 59.0%
Epoch 11: loss = 1.963
Epoch 12: loss = 1.558
Epoch 13: loss = 1.320
Epoch 14: loss = 0.911
Epoch 15: loss = 0.808
Epoch 16: loss = 0.932
Epoch 17: loss = 0.861
Epoch 18: loss = 0.748
Epoch 19: loss = 0.783
Epoch 20: loss = 0.809
Epoch 20: validation accuracy = 65.5%
Epoch 21: loss = 0.678
Epoch 22: loss = 0.757
Epoch 23: loss = 0.747
Epoch 24: loss = 0.660
Epoch 25: loss = 0.536
Epoch 26: loss = 0.506
Epoch 27: loss = 0.577
Epoch 28: loss = 0.600
Epoch 29: loss = 0.681
Epoch 30: loss = 0.604
Epoch 30: validation accuracy = 68.0%
Epoch 31: loss = 0.552
Epoch 32: loss = 0.671
Epoch 33: loss = 0.604
Epoch 34: loss = 0.600
Epoch 35: loss = 0.818
Epoch 36: loss = 0.659
Epoch 37: loss = 0.375
Epoch 38: loss = 0.380
Epoch 39: loss = 0.418
Epoch 40: loss = 0.431
Epoch 40: validation accuracy = 73.5%
Epoch 41: loss = 0.551
Epoch 42: loss = 0.488
Epoch 43: loss = 0.350
Epoch 44: loss = 0.287
Epoch 45: loss = 0.294
Epoch 46: loss = 0.463
Epoch 47: loss = 0.438
Epoch 48: loss = 0.392
Epoch 49: loss = 0.325
Epoch 50: loss = 0.332
Epoch 50: validation accuracy = 80.8%
Model saved in .\saved_models\adam_optim_cuda.pth

View File

@@ -1,2 +1,2 @@
[Info] Load model from .\saved_models\adam_optim_lr1e-3_epoch100_momentum10.pth
[Info] Load model from .\saved_models\adam_optim_lr1e-3_epoch100_momentum10.pth
[Info] Test accuracy = 88.8%

View File

@@ -1,111 +1,111 @@
Epoch 01: loss = inf
Epoch 02: loss = inf
Epoch 03: loss = inf
Epoch 04: loss = inf
Epoch 05: loss = inf
Epoch 06: loss = inf
Epoch 07: loss = inf
Epoch 08: loss = inf
Epoch 09: loss = inf
Epoch 10: loss = inf
Epoch 10: validation accuracy = 40.2%
Epoch 11: loss = inf
Epoch 12: loss = inf
Epoch 13: loss = inf
Epoch 14: loss = inf
Epoch 15: loss = inf
Epoch 16: loss = inf
Epoch 17: loss = 2.360
Epoch 18: loss = 2.086
Epoch 19: loss = 1.684
Epoch 20: loss = 1.453
Epoch 20: validation accuracy = 53.0%
Epoch 21: loss = 1.174
Epoch 22: loss = 1.046
Epoch 23: loss = 0.859
Epoch 24: loss = 0.740
Epoch 25: loss = 0.663
Epoch 26: loss = 0.495
Epoch 27: loss = 0.566
Epoch 28: loss = 0.521
Epoch 29: loss = 0.470
Epoch 30: loss = 0.363
Epoch 30: validation accuracy = 59.0%
Epoch 31: loss = 0.365
Epoch 32: loss = 0.305
Epoch 33: loss = 0.333
Epoch 34: loss = 0.293
Epoch 35: loss = 0.191
Epoch 36: loss = 0.295
Epoch 37: loss = 0.275
Epoch 38: loss = 0.461
Epoch 39: loss = 0.509
Epoch 40: loss = 0.298
Epoch 40: validation accuracy = 65.2%
Epoch 41: loss = 0.186
Epoch 42: loss = 0.395
Epoch 43: loss = 0.323
Epoch 44: loss = 0.309
Epoch 45: loss = 0.199
Epoch 46: loss = 0.285
Epoch 47: loss = 0.290
Epoch 48: loss = 0.302
Epoch 49: loss = 0.235
Epoch 50: loss = 0.190
Epoch 50: validation accuracy = 71.2%
Epoch 51: loss = 0.294
Epoch 52: loss = 0.311
Epoch 53: loss = 0.254
Epoch 54: loss = 0.289
Epoch 55: loss = 0.264
Epoch 56: loss = 0.213
Epoch 57: loss = 0.166
Epoch 58: loss = 0.218
Epoch 59: loss = 0.231
Epoch 60: loss = 0.283
Epoch 60: validation accuracy = 74.8%
Epoch 61: loss = 0.324
Epoch 62: loss = 0.245
Epoch 63: loss = 0.277
Epoch 64: loss = 0.286
Epoch 65: loss = 0.255
Epoch 66: loss = 0.263
Epoch 67: loss = 0.272
Epoch 68: loss = 0.272
Epoch 69: loss = 0.260
Epoch 70: loss = 0.271
Epoch 70: validation accuracy = 79.0%
Epoch 71: loss = 0.310
Epoch 72: loss = 0.301
Epoch 73: loss = 0.305
Epoch 74: loss = 0.311
Epoch 75: loss = 0.329
Epoch 76: loss = 0.295
Epoch 77: loss = 0.300
Epoch 78: loss = 0.316
Epoch 79: loss = 0.326
Epoch 80: loss = 0.352
Epoch 80: validation accuracy = 77.5%
Epoch 81: loss = 0.344
Epoch 82: loss = 0.326
Epoch 83: loss = 0.326
Epoch 84: loss = 0.335
Epoch 85: loss = 0.342
Epoch 86: loss = 0.361
Epoch 87: loss = 0.337
Epoch 88: loss = 0.339
Epoch 89: loss = 0.339
Epoch 90: loss = 0.341
Epoch 90: validation accuracy = 82.8%
Epoch 91: loss = 0.350
Epoch 92: loss = 0.359
Epoch 93: loss = 0.352
Epoch 94: loss = 0.363
Epoch 95: loss = 0.347
Epoch 96: loss = 0.341
Epoch 97: loss = 0.336
Epoch 98: loss = 0.348
Epoch 99: loss = 0.365
Epoch 100: loss = 0.350
Epoch 100: validation accuracy = 85.2%
Epoch 01: loss = inf
Epoch 02: loss = inf
Epoch 03: loss = inf
Epoch 04: loss = inf
Epoch 05: loss = inf
Epoch 06: loss = inf
Epoch 07: loss = inf
Epoch 08: loss = inf
Epoch 09: loss = inf
Epoch 10: loss = inf
Epoch 10: validation accuracy = 40.2%
Epoch 11: loss = inf
Epoch 12: loss = inf
Epoch 13: loss = inf
Epoch 14: loss = inf
Epoch 15: loss = inf
Epoch 16: loss = inf
Epoch 17: loss = 2.360
Epoch 18: loss = 2.086
Epoch 19: loss = 1.684
Epoch 20: loss = 1.453
Epoch 20: validation accuracy = 53.0%
Epoch 21: loss = 1.174
Epoch 22: loss = 1.046
Epoch 23: loss = 0.859
Epoch 24: loss = 0.740
Epoch 25: loss = 0.663
Epoch 26: loss = 0.495
Epoch 27: loss = 0.566
Epoch 28: loss = 0.521
Epoch 29: loss = 0.470
Epoch 30: loss = 0.363
Epoch 30: validation accuracy = 59.0%
Epoch 31: loss = 0.365
Epoch 32: loss = 0.305
Epoch 33: loss = 0.333
Epoch 34: loss = 0.293
Epoch 35: loss = 0.191
Epoch 36: loss = 0.295
Epoch 37: loss = 0.275
Epoch 38: loss = 0.461
Epoch 39: loss = 0.509
Epoch 40: loss = 0.298
Epoch 40: validation accuracy = 65.2%
Epoch 41: loss = 0.186
Epoch 42: loss = 0.395
Epoch 43: loss = 0.323
Epoch 44: loss = 0.309
Epoch 45: loss = 0.199
Epoch 46: loss = 0.285
Epoch 47: loss = 0.290
Epoch 48: loss = 0.302
Epoch 49: loss = 0.235
Epoch 50: loss = 0.190
Epoch 50: validation accuracy = 71.2%
Epoch 51: loss = 0.294
Epoch 52: loss = 0.311
Epoch 53: loss = 0.254
Epoch 54: loss = 0.289
Epoch 55: loss = 0.264
Epoch 56: loss = 0.213
Epoch 57: loss = 0.166
Epoch 58: loss = 0.218
Epoch 59: loss = 0.231
Epoch 60: loss = 0.283
Epoch 60: validation accuracy = 74.8%
Epoch 61: loss = 0.324
Epoch 62: loss = 0.245
Epoch 63: loss = 0.277
Epoch 64: loss = 0.286
Epoch 65: loss = 0.255
Epoch 66: loss = 0.263
Epoch 67: loss = 0.272
Epoch 68: loss = 0.272
Epoch 69: loss = 0.260
Epoch 70: loss = 0.271
Epoch 70: validation accuracy = 79.0%
Epoch 71: loss = 0.310
Epoch 72: loss = 0.301
Epoch 73: loss = 0.305
Epoch 74: loss = 0.311
Epoch 75: loss = 0.329
Epoch 76: loss = 0.295
Epoch 77: loss = 0.300
Epoch 78: loss = 0.316
Epoch 79: loss = 0.326
Epoch 80: loss = 0.352
Epoch 80: validation accuracy = 77.5%
Epoch 81: loss = 0.344
Epoch 82: loss = 0.326
Epoch 83: loss = 0.326
Epoch 84: loss = 0.335
Epoch 85: loss = 0.342
Epoch 86: loss = 0.361
Epoch 87: loss = 0.337
Epoch 88: loss = 0.339
Epoch 89: loss = 0.339
Epoch 90: loss = 0.341
Epoch 90: validation accuracy = 82.8%
Epoch 91: loss = 0.350
Epoch 92: loss = 0.359
Epoch 93: loss = 0.352
Epoch 94: loss = 0.363
Epoch 95: loss = 0.347
Epoch 96: loss = 0.341
Epoch 97: loss = 0.336
Epoch 98: loss = 0.348
Epoch 99: loss = 0.365
Epoch 100: loss = 0.350
Epoch 100: validation accuracy = 85.2%
Model saved in .\saved_models\adam_optim_lr1e-3_epoch100_momentum10.pth

View File

@@ -1,244 +1,244 @@
% Homework Template
\documentclass[a4paper]{article}
\usepackage{ctex}
\usepackage{amsmath, amssymb, amsthm}
\usepackage{moreenum}
\usepackage{mathtools}
\usepackage{url}
\usepackage{bm}
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{subcaption}
\usepackage{booktabs} % toprule
\usepackage[mathcal]{eucal}
\usepackage[thehwcnt = 1]{iidef}
\usepackage{listings}
\usepackage[x11names]{xcolor}
\usepackage{float}
\usepackage[colorlinks, linkcolor=black, anchorcolor=green, citecolor=blue]{hyperref}
\DeclareMathOperator{\arctanh}{arctanh}
% \DeclareMathOperator{\diag}{diag}
\setenumerate[1]{label=(\arabic{*})}
\setenumerate[2]{label=\arabic{*})}
\definecolor{codekeyword}{RGB}{171, 0, 216}
\definecolor{codetypename}{RGB}{29, 37, 251}
\definecolor{codevariable}{RGB}{10, 23, 126}
\definecolor{codestring}{RGB}{157, 0, 25}
\definecolor{codecomment}{RGB}{31, 129, 19}
\newfontfamily\cascadia[Ligatures=ResetAll]{Cascadia Code}
% \newfontfamily\codefont[Ligatures=ResetAll]{Cascadia Code}
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
% To enable ligature in listing, go check lstfiracode's github page and copy firacodestyle's settings.
\lstset{
basicstyle = \small\codefont,
% ---
tabsize = 4,
showstringspaces = false,
numbers = left,
numberstyle = \cascadia,
% ---
breaklines = true,
captionpos = t,
% ---
frame = l,
flexiblecolumns,
columns = fixed,
}
\thecourseinstitute{清华大学电子工程系}
\thecoursename{\textbf{媒体与认知} \space 课堂2}
\theterm{2023-2024学年春季学期}
\hwname{作业}
\begin{document}
\courseheader
% 请在YOUR NAME处填写自己的姓名
\name{高艺轩}
\vspace{3mm}
\centerline{\textbf{\Large{理论部分}}}
\section{单选题（15分）}
% 请在?处填写答案
\subsection{\underline{B}}
\subsection{\underline{A}}
\subsection{\underline{B}}
\subsection{\underline{A}}
\subsection{\underline{B}}
\section{计算题（15分）}
\subsection{设隐含层为$\mathbf{z}=\mathbf{W}^T\mathbf{x}+\mathbf{b}$，其中$\mathbf{x}\in R^{(m \times 1)}$、$\mathbf{z}\in R^{(n\times 1)}$、$\mathbf{W}\in R^{(m\times n)}$、$\mathbf{b} \in R^{(n\times 1)}$均为已知，其激活函数如下：
$$\mathbf{y}=\delta(\mathbf{z})=\tanh(\mathbf{z})$$
$\tanh$表示双曲正切函数。若训练过程中的目标函数为$L$，且已知$L$对$\mathbf{y}$的导数$\frac{\partial L}{\partial \mathbf{y}}=[\frac{\partial L}{\partial y_1},\frac{\partial L}{\partial y_2},...,\frac{\partial L}{\partial y_n}]^T$和$\mathbf{y}=[y_1,y_2,...,y_n]^T$的值。
}
\subsubsection{请使用$\mathbf{y}$表示出$\frac{\partial \mathbf{y}^T}{\partial \mathbf{z}}$, 这里的$\mathbf{y}^T$ 为行向量。
}
\begin{proof}[解]
首先，对$i \neq j$，有$\dfrac{\partial y_i}{\partial z_j} = 0$。
同时$y_i = \tanh(z_i) = \tanh(\arctanh(y_i))$,因此
\[\frac{\partial y_i}{\partial z_i} = 1 - \tanh^2(z_i) = 1 - y_i^2\]
因此
\[\dfrac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \qedhere\]
\end{proof}
\subsubsection{请使用$\mathbf{y}$和$\frac{\partial L}{\partial \mathbf{y}}$表示$\frac{\partial L}{\partial \mathbf{x}}$、$\frac{\partial L}{\partial \mathbf{W}}$、$\frac{\partial L}{\partial \mathbf{b}}$。
}
提示：$\frac{\partial L}{\partial \mathbf{x}}$、$\frac{\partial L}{\partial \mathbf{W}}$、$\frac{\partial L}{\partial \mathbf{b}}$分别与$\mathbf{x}$、$\mathbf{W}$、$\mathbf{b}$具有相同维度。
\begin{proof}[解]
由链式法则
\[\frac{\partial L}{\partial \boldsymbol{x}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{x}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = W \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}\]
对于$\dfrac{\partial L}{\partial W}$
\[\frac{\partial \boldsymbol{z}^T}{\partial W} = \begin{bmatrix}
\boldsymbol{x} & \boldsymbol{x} & \cdots & \boldsymbol{x}
\end{bmatrix}_{m \times n}\]
\begin{align*}
\frac{\partial L}{\partial W_{ij}} & = \frac{\partial z_j}{\partial W_{ij}} \frac{\partial L}{\partial z_j} = x_i \frac{\partial L}{\partial z_j}\\
\frac{\partial L}{\partial W} & = \boldsymbol{x} \left(\frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}}\right)^\mathrm{T}
= \boldsymbol{x} \left(\diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}\right)^\mathrm{T} \in R^{m \times n}
\end{align*}
对于$\dfrac{\partial L}{\partial \boldsymbol{b}}$,由链式法则
\[\frac{\partial L}{\partial \boldsymbol{b}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{b}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = I_n \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}} \qedhere\]
\end{proof}
\vspace{6mm}
\centerline{\textbf{\Large{编程部分}}}
\vspace{3mm}
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
\section{编程作业报告}
% 请在此处完成编程作业报告
完成后的代码也可以在 \href{https://git.unlockableworld.com/unlockable/MediaNCognition}{\url{https://git.unlockableworld.com/unlockable/MediaNCognition}}中找到。
\begin{enumerate}
\item 使用默认配置进行训练和测试。
\begin{enumerate}
\item 训练模型。
输入:
\lstinputlisting{codes/1.1.in.txt}
输出:
\lstinputlisting{codes/1.1.out.txt}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{img/1default_train.png}
\end{figure}
\item 测试模型。
输入:
\lstinputlisting{codes/1.2.in.txt}
输出:
\lstinputlisting{codes/1.2.out.txt}
\end{enumerate}
\item 调整参数、使用Adam优化器训练并测试。
\begin{enumerate}
\item 训练模型。
输入:
\lstinputlisting{codes/2.1.in.txt}
输出:
\lstinputlisting{codes/2.1.out.txt}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{img/2adam_optim.png}
\end{figure}
\item 测试性能。
输入:
\lstinputlisting{codes/2.2.in.txt}
输出:
\lstinputlisting{codes/2.2.out.txt}
\end{enumerate}
\item 使用效果最佳的模型测试。
经过简单的尝试,发现使用
\lstinputlisting{codes/self_train.in.txt}
可以使测试集准确率达到88.8\%，有略微的提升。训练的loss曲线如下：
\begin{figure}[H]
\centering
\includegraphics[width=.9\linewidth]{img/3found_best.png}
\end{figure}
使用它进行预测:
\begin{figure}[H]
\centering
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict01.png}
\subcaption{预测A}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict02.png}
\subcaption{预测B}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict03.png}
\subcaption{预测M}
\end{subfigure}
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict04.png}
\subcaption{预测R}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict05.png}
\subcaption{预测M}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict06.png}
\subcaption{预测O}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict07.png}
\subcaption{预测B}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict08.png}
\subcaption{预测W}
\end{subfigure}
\hfill
\end{figure}
\item 遇到的问题及解决方法
\begin{enumerate}
\item 代码中对灰度图像的矩阵进行标准化时，\lstinline{numpy}显示不能对\lstinline{NumpyGenericArray}进行对\lstinline{float}的\lstinline{/}操作。改用\lstinline{np.divide()}解决了这个问题。
\item 在利用训练好的模型进行预测时,发现自己找到的大部分模型都预测错误;最后与训练集的图片进行了对比,发现主要问题是裁切字母时留下了过大的边距,导致模型不能正确理解输入。重新裁剪边框后,得到正确的结果。
\end{enumerate}
\item 建议：希望下次发布作业代码可以利用清华的git。
\end{enumerate}
% \section{自选课题开题报告}
% 请在此处介绍自选课题
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:
% Homework Template
\documentclass[a4paper]{article}
\usepackage{ctex}
\usepackage{amsmath, amssymb, amsthm}
\usepackage{moreenum}
\usepackage{mathtools}
\usepackage{url}
\usepackage{bm}
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{subcaption}
\usepackage{booktabs} % toprule
\usepackage[mathcal]{eucal}
\usepackage[thehwcnt = 1]{iidef}
\usepackage{listings}
\usepackage[x11names]{xcolor}
\usepackage{float}
\usepackage[colorlinks, linkcolor=black, anchorcolor=green, citecolor=blue]{hyperref}
\DeclareMathOperator{\arctanh}{arctanh}
% \DeclareMathOperator{\diag}{diag}
\setenumerate[1]{label=(\arabic{*})}
\setenumerate[2]{label=\arabic{*})}
\definecolor{codekeyword}{RGB}{171, 0, 216}
\definecolor{codetypename}{RGB}{29, 37, 251}
\definecolor{codevariable}{RGB}{10, 23, 126}
\definecolor{codestring}{RGB}{157, 0, 25}
\definecolor{codecomment}{RGB}{31, 129, 19}
\newfontfamily\cascadia[Ligatures=ResetAll]{Cascadia Code}
% \newfontfamily\codefont[Ligatures=ResetAll]{Cascadia Code}
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
% To enable ligature in listing, go check lstfiracode's github page and copy firacodestyle's settings.
\lstset{
basicstyle = \small\codefont,
% ---
tabsize = 4,
showstringspaces = false,
numbers = left,
numberstyle = \cascadia,
% ---
breaklines = true,
captionpos = t,
% ---
frame = l,
flexiblecolumns,
columns = fixed,
}
\thecourseinstitute{清华大学电子工程系}
\thecoursename{\textbf{媒体与认知} \space 课堂2}
\theterm{2023-2024学年春季学期}
\hwname{作业}
\begin{document}
\courseheader
% 请在YOUR NAME处填写自己的姓名
\name{高艺轩}
\vspace{3mm}
\centerline{\textbf{\Large{理论部分}}}
\section{单选题（15分）}
% 请在?处填写答案
\subsection{\underline{B}}
\subsection{\underline{A}}
\subsection{\underline{B}}
\subsection{\underline{A}}
\subsection{\underline{B}}
\section{计算题（15分）}
\subsection{设隐含层为$\mathbf{z}=\mathbf{W}^T\mathbf{x}+\mathbf{b}$，其中$\mathbf{x}\in R^{(m \times 1)}$、$\mathbf{z}\in R^{(n\times 1)}$、$\mathbf{W}\in R^{(m\times n)}$、$\mathbf{b} \in R^{(n\times 1)}$均为已知，其激活函数如下：
$$\mathbf{y}=\delta(\mathbf{z})=\tanh(\mathbf{z})$$
$\tanh$表示双曲正切函数。若训练过程中的目标函数为$L$，且已知$L$对$\mathbf{y}$的导数$\frac{\partial L}{\partial \mathbf{y}}=[\frac{\partial L}{\partial y_1},\frac{\partial L}{\partial y_2},...,\frac{\partial L}{\partial y_n}]^T$和$\mathbf{y}=[y_1,y_2,...,y_n]^T$的值。
}
\subsubsection{请使用$\mathbf{y}$表示出$\frac{\partial \mathbf{y}^T}{\partial \mathbf{z}}$, 这里的$\mathbf{y}^T$ 为行向量。
}
\begin{proof}[解]
首先，对$i \neq j$，有$\dfrac{\partial y_i}{\partial z_j} = 0$。
同时$y_i = \tanh(z_i) = \tanh(\arctanh(y_i))$,因此
\[\frac{\partial y_i}{\partial z_i} = 1 - \tanh^2(z_i) = 1 - y_i^2\]
因此
\[\dfrac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \qedhere\]
\end{proof}
\subsubsection{请使用$\mathbf{y}$和$\frac{\partial L}{\partial \mathbf{y}}$表示$\frac{\partial L}{\partial \mathbf{x}}$、$\frac{\partial L}{\partial \mathbf{W}}$、$\frac{\partial L}{\partial \mathbf{b}}$。
}
提示：$\frac{\partial L}{\partial \mathbf{x}}$、$\frac{\partial L}{\partial \mathbf{W}}$、$\frac{\partial L}{\partial \mathbf{b}}$分别与$\mathbf{x}$、$\mathbf{W}$、$\mathbf{b}$具有相同维度。
\begin{proof}[解]
由链式法则
\[\frac{\partial L}{\partial \boldsymbol{x}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{x}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = W \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}\]
对于$\dfrac{\partial L}{\partial W}$
\[\frac{\partial \boldsymbol{z}^T}{\partial W} = \begin{bmatrix}
\boldsymbol{x} & \boldsymbol{x} & \cdots & \boldsymbol{x}
\end{bmatrix}_{m \times n}\]
\begin{align*}
\frac{\partial L}{\partial W_{ij}} & = \frac{\partial z_j}{\partial W_{ij}} \frac{\partial L}{\partial z_j} = x_i \frac{\partial L}{\partial z_j}\\
\frac{\partial L}{\partial W} & = \boldsymbol{x} \left(\frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}}\right)^\mathrm{T}
= \boldsymbol{x} \left(\diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}\right)^\mathrm{T} \in R^{m \times n}
\end{align*}
对于$\dfrac{\partial L}{\partial \boldsymbol{b}}$,由链式法则
\[\frac{\partial L}{\partial \boldsymbol{b}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{b}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = I_n \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}} \qedhere\]
\end{proof}
\vspace{6mm}
\centerline{\textbf{\Large{编程部分}}}
\vspace{3mm}
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
\section{编程作业报告}
% 请在此处完成编程作业报告
完成后的代码也可以在 \href{https://git.unlockableworld.com/unlockable/MediaNCognition}{\url{https://git.unlockableworld.com/unlockable/MediaNCognition}}中找到。
\begin{enumerate}
\item 使用默认配置进行训练和测试。
\begin{enumerate}
\item 训练模型。
输入:
\lstinputlisting{codes/1.1.in.txt}
输出:
\lstinputlisting{codes/1.1.out.txt}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{img/1default_train.png}
\end{figure}
\item 测试模型。
输入:
\lstinputlisting{codes/1.2.in.txt}
输出:
\lstinputlisting{codes/1.2.out.txt}
\end{enumerate}
\item 调整参数、使用Adam优化器训练并测试。
\begin{enumerate}
\item 训练模型。
输入:
\lstinputlisting{codes/2.1.in.txt}
输出:
\lstinputlisting{codes/2.1.out.txt}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{img/2adam_optim.png}
\end{figure}
\item 测试性能。
输入:
\lstinputlisting{codes/2.2.in.txt}
输出:
\lstinputlisting{codes/2.2.out.txt}
\end{enumerate}
\item 使用效果最佳的模型测试。
经过简单的尝试,发现使用
\lstinputlisting{codes/self_train.in.txt}
可以使测试集准确率达到88.8\%，有略微的提升。训练的loss曲线如下：
\begin{figure}[H]
\centering
\includegraphics[width=.9\linewidth]{img/3found_best.png}
\end{figure}
使用它进行预测:
\begin{figure}[H]
\centering
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict01.png}
\subcaption{预测A}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict02.png}
\subcaption{预测B}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict03.png}
\subcaption{预测M}
\end{subfigure}
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict04.png}
\subcaption{预测R}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict05.png}
\subcaption{预测M}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict06.png}
\subcaption{预测O}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict07.png}
\subcaption{预测B}
\end{subfigure}
\hfill
\begin{subfigure}[b]{.3\linewidth}
\includegraphics[width=\linewidth]{img/predict/predict08.png}
\subcaption{预测W}
\end{subfigure}
\hfill
\end{figure}
\item 遇到的问题及解决方法
\begin{enumerate}
\item 代码中对灰度图像的矩阵进行标准化时，\lstinline{numpy}显示不能对\lstinline{NumpyGenericArray}进行对\lstinline{float}的\lstinline{/}操作。改用\lstinline{np.divide()}解决了这个问题。
\item 在利用训练好的模型进行预测时,发现自己找到的大部分模型都预测错误;最后与训练集的图片进行了对比,发现主要问题是裁切字母时留下了过大的边距,导致模型不能正确理解输入。重新裁剪边框后,得到正确的结果。
\end{enumerate}
\item 建议：希望下次发布作业代码可以利用清华的git。
\end{enumerate}
% \section{自选课题开题报告}
% 请在此处介绍自选课题
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:

View File

@@ -1,164 +1,164 @@
#========================================================
# Media and Cognition
# Homework 1 Neural network basics
# activations.py - activation functions
# Student ID: 2022010639
# Name: Gao Yixuan
# Tsinghua University
# (C) Copyright 2024
#========================================================
import torch
import torch.nn as nn
'''
In this script we will implement three activation functions, including both forward and backward processes.
More details about customizing a backward process in PyTorch can be found in:
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
'''
## Here, Tanh is given as an example to show how to construct the activation function. Please finish the activation functions of Sigmoid and ReLU later.
class Tanh(torch.autograd.Function):
    """
    Hyperbolic-tangent activation with a hand-written backward pass.

        y = (exp(x) - exp(-x)) / (exp(x) + exp(-x))

    Provided as the reference example for the other activations in this file.
    """

    # Both methods are static: the Function is used through `Tanh.apply`,
    # never through an instance.
    @staticmethod
    def forward(ctx, x):
        """
        Compute tanh element-wise.

        ctx: context object; tensors stored with ctx.save_for_backward are
             available in backward through ctx.saved_tensors
        x:   input tensor of arbitrary shape
        Returns a tensor with the same shape as x.
        """
        # The textbook formula
        #   (torch.exp(x) - torch.exp(-x)) / (torch.exp(x) + torch.exp(-x))
        # is deliberately avoided (see the question about large |x| in the
        # assignment); the built-in tanh is used instead.
        out = torch.tanh(x)

        # The derivative only needs the output, so that is what gets stored.
        ctx.save_for_backward(out)
        return out

    @staticmethod
    def backward(ctx, grad_output):
        """
        Propagate the gradient through tanh.

        grad_output: dL/dy
        Returns dL/dx = dL/dy * dy/dx, where dy/dx = 1 - y^2.
        """
        (out,) = ctx.saved_tensors
        local_grad = 1 - out ** 2
        return grad_output * local_grad
#TODO 1: complete the forward and backward functions of the Sigmoid activation function.
#Note: You can refer to the activation function Tanh
class Sigmoid(torch.autograd.Function):
    '''
    Sigmoid activation function
        y = 1 / (1 + exp(-x))
    '''

    @staticmethod
    def forward(ctx, x):
        '''
        Compute the sigmoid element-wise.

        ctx: context object used to pass tensors to backward
        x:   input tensor of arbitrary shape
        Returns a tensor with the same shape as x.
        '''
        # The reciprocal (not a subtraction) is what makes this a sigmoid:
        # `1 - (1 + exp(-x))` would evaluate to `-exp(-x)`.
        # When exp(-x) overflows to inf for very negative x, 1 / inf is 0,
        # which is the correct limit, so no NaN is produced.
        y = 1 / (1 + torch.exp(-x))

        # The derivative is expressed in terms of the output, so save y.
        ctx.save_for_backward(y)
        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Propagate the gradient through the sigmoid.

        grad_output: dL/dy
        Returns dL/dx = dL/dy * dy/dx, where dy/dx = y * (1 - y).
        '''
        y, = ctx.saved_tensors
        grad_input = grad_output * y * (1 - y)
        return grad_input
#TODO 2: complete the forward and backward functions of the ReLU activation function.
#Note: You can refer to the activation function Tanh
class ReLU(torch.autograd.Function):
    '''
    ReLU activation function
        y = max{x, 0}
    '''

    @staticmethod
    def forward(ctx, x):
        '''
        Compute ReLU element-wise.

        ctx: context object used to pass tensors to backward
        x:   input tensor of arbitrary shape and floating dtype
        Returns a new tensor with the same shape and dtype as x
        (the input is not modified).
        '''
        # clamp keeps x's dtype, device and shape. Comparing against a
        # separately built float32 tensor of shape (1,) would tie the
        # function to float32 and broadcast 0-dim inputs to shape (1,).
        y = torch.clamp(x, min=0)

        # y > 0 exactly where x > 0, so the output is enough for backward.
        ctx.save_for_backward(y)
        return y

    @staticmethod
    def backward(ctx, grad_output):
        """
        Propagate the gradient through ReLU.

        grad_output: dL/dy
        Returns dL/dx = dL/dy * dy/dx, where dy/dx is 1 if x > 0 and 0 if x <= 0.
        """
        y, = ctx.saved_tensors

        # Boolean mask cast to the gradient's dtype; unlike torch.heaviside
        # with a float32 helper tensor, this works for any floating dtype.
        mask = (y > 0).to(grad_output.dtype)
        grad_input = grad_output * mask
        return grad_input
# activate function class according to the type
class Activation(nn.Module):
    '''
    Thin nn.Module wrapper that applies one of the custom autograd activation functions.
    '''

    def __init__(self, type):
        '''
        :param type: 'sigmoid', 'tanh', or 'relu'; any other value raises NotImplementedError
        '''
        super().__init__()
        # reject unknown names first so the lookup below can never fail
        if type not in ('sigmoid', 'tanh', 'relu'):
            print('activation type should be one of [sigmoid, tanh, relu]')
            raise NotImplementedError
        functions = {'sigmoid': Sigmoid, 'tanh': Tanh, 'relu': ReLU}
        # store the bound .apply of the chosen autograd Function
        self.act = functions[type].apply

    def forward(self, x):
        '''
        :param x: input tensor
        :return: the selected activation applied to x
        '''
        activated = self.act(x)
        return activated
#========================================================
# Media and Cognition
# Homework 1 Neural network basics
# activations.py - activation functions
# Student ID: 2022010639
# Name: Gao Yixuan
# Tsinghua University
# (C) Copyright 2024
#========================================================
import torch
import torch.nn as nn
'''
In this script we will implement three activation functions, including both forward and backward processes.
More details about customizing a backward process in PyTorch can be found in:
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
'''
## Here, Tanh is given as an example to show how to construct the activation function. Please finish the activation functions of Sigmoid and ReLU later.
class Tanh(torch.autograd.Function):
    '''
    Tanh activation function
        y = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
    '''

    # forward/backward are static so they can be used through Tanh.apply without an instance
    @staticmethod
    def forward(ctx, x):
        '''
        Forward pass: map the input tensor x to tanh(x).
        ctx: context object; tensors stored with ctx.save_for_backward are
             available in backward through ctx.saved_tensors
        x: input with arbitrary shape
        '''
        # The explicit exp-based formula is avoided on purpose: for inputs with a
        # large absolute value exp() overflows, so the built-in tanh is used instead.
        out = x.tanh()
        # keep the output, the derivative is expressed in terms of it
        ctx.save_for_backward(out)
        return out

    @staticmethod
    def backward(ctx, grad_output):
        """
        Backward pass.
        grad_output: dL/dy
        returns dL/dx = dL/dy * dy/dx, where y = forward(x)
        """
        (out,) = ctx.saved_tensors
        # d tanh(x) / dx = 1 - tanh(x)^2
        local_grad = 1 - out ** 2
        return grad_output * local_grad
#TODO 1: complete the forward and backward functions of the Sigmoid activation function.
#Note: You can refer to the activation function Tanh
class Sigmoid(torch.autograd.Function):
    '''
    Sigmoid activation function
        y = 1 / (1 + exp(-x))
    '''

    @staticmethod
    def forward(ctx, x):
        '''
        Compute the element-wise sigmoid of x.
        ctx: context object used to stash tensors for the backward pass
        x: input tensor
        return: tensor y = 1 / (1 + exp(-x))
        '''
        # The previous implementation computed 1 - (1 + exp(-x)), which is -exp(-x),
        # not the sigmoid. As in the Tanh example, we rely on the built-in kernel.
        y = torch.sigmoid(x)
        # y is all that is needed to evaluate dy/dx in backward
        ctx.save_for_backward(y)
        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        grad_output: dL/dy
        return: dL/dx = dL/dy * dy/dx, where dy/dx = y * (1 - y)
        '''
        y, = ctx.saved_tensors
        grad_input = grad_output * y * (1 - y)
        return grad_input
#TODO 2: complete the forward and backward functions of the ReLU activation function.
#Note: You can refer to the activation function Tanh
class ReLU(torch.autograd.Function):
    '''
    ReLU activation function
        y = max{x, 0}
    '''

    @staticmethod
    def forward(ctx, x):
        '''
        Compute the element-wise ReLU of x.
        ctx: context object used to stash tensors for the backward pass
        x: input tensor
        return: a new tensor with negative entries replaced by 0 (x itself is not modified)
        '''
        # clamp keeps the dtype and device of x; the previous float32 helper tensor
        # changed the result dtype for half inputs.
        y = torch.clamp(x, min=0)
        # y > 0 exactly where x > 0, so the output is enough for backward
        ctx.save_for_backward(y)
        return y

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        grad_output: dL/dy
        return: dL/dx = dL/dy * dy/dx, where dy/dx is 1 if x > 0 and 0 if x <= 0
        """
        y, = ctx.saved_tensors
        # A comparison mask works for every dtype, whereas torch.heaviside requires
        # both arguments to share a dtype and failed for non-float32 inputs.
        mask = (y > 0).to(grad_output.dtype)
        grad_input = grad_output * mask
        return grad_input
# activate function class according to the type
class Activation(nn.Module):
    '''
    Thin nn.Module wrapper that applies one of the custom autograd activation functions.
    '''

    def __init__(self, type):
        '''
        :param type: 'sigmoid', 'tanh', or 'relu'; any other value raises NotImplementedError
        '''
        super().__init__()
        # reject unknown names first so the lookup below can never fail
        if type not in ('sigmoid', 'tanh', 'relu'):
            print('activation type should be one of [sigmoid, tanh, relu]')
            raise NotImplementedError
        functions = {'sigmoid': Sigmoid, 'tanh': Tanh, 'relu': ReLU}
        # store the bound .apply of the chosen autograd Function
        self.act = functions[type].apply

    def forward(self, x):
        '''
        :param x: input tensor
        :return: the selected activation applied to x
        '''
        activated = self.act(x)
        return activated

View File

@@ -1,118 +1,118 @@
#========================================================
# Media and Cognition
# Homework 1 Neural network basics
# losses.py - loss functions
# Student ID: 2022010639
# Name: Gao Yixuan
# Tsinghua University
# (C) Copyright 2024
#========================================================
import torch
import torch.nn.functional as F
'''
In this script we will implement our MSE and Cross Entropy loss functions, including both the forward and backward processes.
More details about customizing a backward process can be found in:
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
'''
# here is the sample code of MSELoss
# you can use this as reference to implement the CrossEntropyLoss
class MSELoss(torch.autograd.Function):
    '''
    Squared-error loss function
        loss = sum over all elements of (label - pred) ** 2
    '''

    @staticmethod
    def forward(ctx, pred, label):
        """
        :param pred: prediction tensor
        :param label: groundtruth, same shape as the prediction
        :return: scalar tensor holding the squared error summed over every element
                 (the sum is not divided by the batch size)
        """
        # both tensors are needed again to form the gradient
        ctx.save_for_backward(pred, label)
        squared_error = (pred - label) ** 2
        return torch.sum(squared_error)

    @staticmethod
    def backward(ctx, grad_output):
        """
        :param grad_output: upstream gradient dL/dloss
        :return: (dL/dpred, None); no gradient is produced for the label
        """
        pred, label = ctx.saved_tensors
        # d/dpred of sum((pred - label)^2) is 2 * (pred - label)
        grad_pred = grad_output * 2 * (pred - label)
        return grad_pred, None
#TODO 1: Complete the CrossEntropyLoss loss function
class CrossEntropyLoss(torch.autograd.Function):
    '''
    Cross entropy loss function:
        loss = - log q_i
    where
        q_i = softmax(z_i) = exp(z_i) / (exp(z_0) + exp(z_1) + ...)
    When z_i has a large value, exp(z_i) might become infinity, so the stable
    softmax is used:
        softmax(z_i) = softmax(z_i - z_max),  z_max = max{z_0, z_1, ...}
    and log q_i is evaluated as
        log q_i = (z_i - z_max) - log(sum_j exp(z_j - z_max))
    so that a softmax probability that underflows to 0 does not produce log(0) = -inf.
    '''

    @staticmethod
    def forward(ctx, logits, label):
        """
        :param logits: logits with shape [batch_size, n_classes], denoted by "z" in the above formula
        :param label: groundtruth class indices with shape [batch_size], where 0 <= label[i] <= n_classes - 1
        :return: scalar cross entropy loss, averaged over the batch
        """
        batch_size, n_classes = logits.shape

        # step 1: stable softmax; z_max has shape [batch_size, 1]
        z_max = torch.max(logits, 1, keepdim=True).values
        shifted = logits - z_max
        exps = torch.exp(shifted)
        sums = torch.sum(exps, 1, keepdim=True)
        q = exps / sums

        # step 2: one-hot labels with shape [batch_size, n_classes], in the dtype of
        # the logits so the backward pass can subtract them from q directly
        one_hot_label = torch.nn.functional.one_hot(label, n_classes).to(logits.dtype)

        # step 3: log-softmax of the target class, computed without taking log(q)
        log_q = shifted - torch.log(sums)
        target_log_q = log_q.gather(1, label.reshape(-1, 1))
        cross_entropy = -torch.sum(target_log_q) / batch_size

        ctx.save_for_backward(q, one_hot_label)
        return cross_entropy

    @staticmethod
    def backward(ctx, grad_output):
        """
        :param grad_output: upstream gradient dL/dloss
        :return: (dL/dz, None); no gradient is produced for the label
        """
        q, one_hot_label = ctx.saved_tensors
        # The forward loss is a mean over the batch, so the gradient of each sample
        # carries a 1 / batch_size factor: dL/dz = (q - one_hot) / batch_size
        grad_input = grad_output * (q - one_hot_label) / q.size(0)
        return grad_input, None
#========================================================
# Media and Cognition
# Homework 1 Neural network basics
# losses.py - loss functions
# Student ID: 2022010639
# Name: Gao Yixuan
# Tsinghua University
# (C) Copyright 2024
#========================================================
import torch
import torch.nn.functional as F
'''
In this script we will implement our MSE and Cross Entropy loss functions, including both the forward and backward processes.
More details about customizing a backward process can be found in:
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
'''
# here is the sample code of MSELoss
# you can use this as reference to implement the CrossEntropyLoss
class MSELoss(torch.autograd.Function):
    '''
    Squared-error loss function
        loss = sum over all elements of (label - pred) ** 2
    '''

    @staticmethod
    def forward(ctx, pred, label):
        """
        :param pred: prediction tensor
        :param label: groundtruth, same shape as the prediction
        :return: scalar tensor holding the squared error summed over every element
                 (the sum is not divided by the batch size)
        """
        # both tensors are needed again to form the gradient
        ctx.save_for_backward(pred, label)
        squared_error = (pred - label) ** 2
        return torch.sum(squared_error)

    @staticmethod
    def backward(ctx, grad_output):
        """
        :param grad_output: upstream gradient dL/dloss
        :return: (dL/dpred, None); no gradient is produced for the label
        """
        pred, label = ctx.saved_tensors
        # d/dpred of sum((pred - label)^2) is 2 * (pred - label)
        grad_pred = grad_output * 2 * (pred - label)
        return grad_pred, None
#TODO 1: Complete the CrossEntropyLoss loss function
class CrossEntropyLoss(torch.autograd.Function):
    '''
    Cross entropy loss function:
        loss = - log q_i
    where
        q_i = softmax(z_i) = exp(z_i) / (exp(z_0) + exp(z_1) + ...)
    When z_i has a large value, exp(z_i) might become infinity, so the stable
    softmax is used:
        softmax(z_i) = softmax(z_i - z_max),  z_max = max{z_0, z_1, ...}
    and log q_i is evaluated as
        log q_i = (z_i - z_max) - log(sum_j exp(z_j - z_max))
    so that a softmax probability that underflows to 0 does not produce log(0) = -inf.
    '''

    @staticmethod
    def forward(ctx, logits, label):
        """
        :param logits: logits with shape [batch_size, n_classes], denoted by "z" in the above formula
        :param label: groundtruth class indices with shape [batch_size], where 0 <= label[i] <= n_classes - 1
        :return: scalar cross entropy loss, averaged over the batch
        """
        batch_size, n_classes = logits.shape

        # step 1: stable softmax; z_max has shape [batch_size, 1]
        z_max = torch.max(logits, 1, keepdim=True).values
        shifted = logits - z_max
        exps = torch.exp(shifted)
        sums = torch.sum(exps, 1, keepdim=True)
        q = exps / sums

        # step 2: one-hot labels with shape [batch_size, n_classes], in the dtype of
        # the logits so the backward pass can subtract them from q directly
        one_hot_label = torch.nn.functional.one_hot(label, n_classes).to(logits.dtype)

        # step 3: log-softmax of the target class, computed without taking log(q)
        log_q = shifted - torch.log(sums)
        target_log_q = log_q.gather(1, label.reshape(-1, 1))
        cross_entropy = -torch.sum(target_log_q) / batch_size

        ctx.save_for_backward(q, one_hot_label)
        return cross_entropy

    @staticmethod
    def backward(ctx, grad_output):
        """
        :param grad_output: upstream gradient dL/dloss
        :return: (dL/dz, None); no gradient is produced for the label
        """
        q, one_hot_label = ctx.saved_tensors
        # The forward loss is a mean over the batch, so the gradient of each sample
        # carries a 1 / batch_size factor: dL/dz = (q - one_hot) / batch_size
        grad_input = grad_output * (q - one_hot_label) / q.size(0)
        return grad_input, None

View File

@@ -1,156 +1,156 @@
#========================================================
# Media and Cognition
# Homework 1 Neural network basics
# network.py - linear layer and MLP network
# Student ID: 2022010639
# Name: Gao Yixuan
# Tsinghua University
# (C) Copyright 2024
#========================================================
import torch
import torch.nn as nn
from activations import Activation
'''
In this script we will implement our Linear layer and MLP network.
For the linear layer, we will provide a sample of codes which calculate both the forward and backward processes by our own.
More details about customizing a backward process can be found in:
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
For the MLP network, you should cascade the linear layers and activation functions in a proper way in the __init__ function and implement the forward function.
'''
class LinearFunction(torch.autograd.Function):
    '''
    we will implement the linear function:
        y = xW^T + b
    as well as its gradient computation process
    '''

    @staticmethod
    def forward(ctx, x, W, b):
        '''
        Input:
        :param ctx: a context object that can be used to stash information for backward computation
        :param x: input features with size [batch_size, input_size]
        :param W: weight matrix with size [output_size, input_size]
        :param b: bias with size [output_size]
        Return:
        y :output features with size [batch_size, output_size]
        '''
        y = torch.matmul(x, W.T) + b
        # x and W are the only tensors the gradients depend on
        ctx.save_for_backward(x, W)
        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Input:
        :param ctx: a context object with saved variables
        :param grad_output: dL/dy, with size [batch_size, output_size]
        Return:
        grad_input: dL/dx, with size [batch_size, input_size]
        grad_W: dL/dW, with size [output_size, input_size], summed for data in the batch
        grad_b: dL/db, with size [output_size], summed for data in the batch
        '''
        # saved_tensors is the supported accessor; ctx.saved_variables is deprecated
        x, W = ctx.saved_tensors
        # dL/dx = dL/dy * W
        grad_input = torch.matmul(grad_output, W)
        # dL/dW = (dL/dy)^T * x
        grad_W = torch.matmul(grad_output.T, x)
        # dL/db = sum of dL/dy over the batch dimension
        grad_b = grad_output.sum(0)
        return grad_input, grad_W, grad_b
class Linear(nn.Module):
    '''
    Linear layer backed by the hand-written LinearFunction autograd function.
    '''

    def __init__(self, input_size, output_size):
        '''
        :param input_size: dimension of input features
        :param output_size: dimension of output features
        '''
        super().__init__()
        # weights are drawn from a standard normal distribution, the bias starts at zero
        self.W = nn.Parameter(torch.randn(output_size, input_size).float(), requires_grad=True)
        self.b = nn.Parameter(torch.zeros(output_size).float(), requires_grad=True)

    def forward(self, x):
        '''
        :param x: input features
        :return: result of LinearFunction applied to x with this layer's W and b
        '''
        out = LinearFunction.apply(x, self.W, self.b)
        return out
class MLP(nn.Module):
    '''
    Multilayer Perceptron assembled from the custom Linear layers and one shared Activation.
    '''

    def __init__(self, input_size, output_size, hidden_size, n_layers, act_type):
        '''
        :param input_size: dimension of input features
        :param output_size: dimension of output features
        :param hidden_size: a list containing hidden size for each hidden layer
        :param n_layers: number of layers, must equal len(hidden_size) + 1
        :param act_type: activation type, forwarded unchanged to Activation
        '''
        super().__init__()

        # one output layer on top of the hidden layers
        assert len(hidden_size) + 1 == n_layers, 'total layer number should be hidden layer number + 1'

        # a single activation module is created and reused after every hidden layer
        self.act = Activation(act_type)

        stack = []
        in_size = input_size
        # every hidden layer is a linear map followed by the activation;
        # with no hidden sizes the loop is skipped and only the output layer remains
        for width in hidden_size:
            stack.append(Linear(in_size, width))
            stack.append(self.act)
            in_size = width
        # output layer: maps the last feature size to output_size
        stack.append(Linear(in_size, output_size))

        self.network = torch.nn.Sequential(*stack)

    def forward(self, x):
        '''
        :param x: input features with size [batch_size, input_size]
        :return: output features with size [batch_size, output_size]
        '''
        return self.network(x)
#========================================================
# Media and Cognition
# Homework 1 Neural network basics
# network.py - linear layer and MLP network
# Student ID: 2022010639
# Name: Gao Yixuan
# Tsinghua University
# (C) Copyright 2024
#========================================================
import torch
import torch.nn as nn
from activations import Activation
'''
In this script we will implement our Linear layer and MLP network.
For the linear layer, we will provide a sample of codes which calculate both the forward and backward processes by our own.
More details about customizing a backward process can be found in:
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
For the MLP network, you should cascade the linear layers and activation functions in a proper way in the __init__ function and implement the forward function.
'''
class LinearFunction(torch.autograd.Function):
    '''
    we will implement the linear function:
        y = xW^T + b
    as well as its gradient computation process
    '''

    @staticmethod
    def forward(ctx, x, W, b):
        '''
        Input:
        :param ctx: a context object that can be used to stash information for backward computation
        :param x: input features with size [batch_size, input_size]
        :param W: weight matrix with size [output_size, input_size]
        :param b: bias with size [output_size]
        Return:
        y :output features with size [batch_size, output_size]
        '''
        y = torch.matmul(x, W.T) + b
        # x and W are the only tensors the gradients depend on
        ctx.save_for_backward(x, W)
        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Input:
        :param ctx: a context object with saved variables
        :param grad_output: dL/dy, with size [batch_size, output_size]
        Return:
        grad_input: dL/dx, with size [batch_size, input_size]
        grad_W: dL/dW, with size [output_size, input_size], summed for data in the batch
        grad_b: dL/db, with size [output_size], summed for data in the batch
        '''
        # saved_tensors is the supported accessor; ctx.saved_variables is deprecated
        x, W = ctx.saved_tensors
        # dL/dx = dL/dy * W
        grad_input = torch.matmul(grad_output, W)
        # dL/dW = (dL/dy)^T * x
        grad_W = torch.matmul(grad_output.T, x)
        # dL/db = sum of dL/dy over the batch dimension
        grad_b = grad_output.sum(0)
        return grad_input, grad_W, grad_b
class Linear(nn.Module):
    '''
    Linear layer backed by the hand-written LinearFunction autograd function.
    '''

    def __init__(self, input_size, output_size):
        '''
        :param input_size: dimension of input features
        :param output_size: dimension of output features
        '''
        super().__init__()
        # weights are drawn from a standard normal distribution, the bias starts at zero
        self.W = nn.Parameter(torch.randn(output_size, input_size).float(), requires_grad=True)
        self.b = nn.Parameter(torch.zeros(output_size).float(), requires_grad=True)

    def forward(self, x):
        '''
        :param x: input features
        :return: result of LinearFunction applied to x with this layer's W and b
        '''
        out = LinearFunction.apply(x, self.W, self.b)
        return out
class MLP(nn.Module):
    '''
    Multilayer Perceptron assembled from the custom Linear layers and one shared Activation.
    '''

    def __init__(self, input_size, output_size, hidden_size, n_layers, act_type):
        '''
        :param input_size: dimension of input features
        :param output_size: dimension of output features
        :param hidden_size: a list containing hidden size for each hidden layer
        :param n_layers: number of layers, must equal len(hidden_size) + 1
        :param act_type: activation type, forwarded unchanged to Activation
        '''
        super().__init__()

        # one output layer on top of the hidden layers
        assert len(hidden_size) + 1 == n_layers, 'total layer number should be hidden layer number + 1'

        # a single activation module is created and reused after every hidden layer
        self.act = Activation(act_type)

        stack = []
        in_size = input_size
        # every hidden layer is a linear map followed by the activation;
        # with no hidden sizes the loop is skipped and only the output layer remains
        for width in hidden_size:
            stack.append(Linear(in_size, width))
            stack.append(self.act)
            in_size = width
        # output layer: maps the last feature size to output_size
        stack.append(Linear(in_size, output_size))

        self.network = torch.nn.Sequential(*stack)

    def forward(self, x):
        '''
        :param x: input features with size [batch_size, input_size]
        :return: output features with size [batch_size, output_size]
        '''
        return self.network(x)

View File

@@ -1,397 +1,397 @@
#========================================================
# Media and Cognition
# Homework 1 Neural network basics
# recognition.py - character classification
# Student ID: 2022010639
# Name: Gao Yixuan
# Tsinghua University
# (C) Copyright 2024
#========================================================
# ==== Part 0: import libs
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import json, cv2, os, string
import matplotlib.pyplot as plt
import numpy as np
# this time we implement our networks and loss functions in other python script, and import them here
from network import MLP
from losses import CrossEntropyLoss
# argparse is used to conveniently set our configurations
import argparse
# ==== Part 1: data loader
# construct a dataset and a data loader, more details can be found in
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataloader
class ListDataset(Dataset):
    '''
    Dataset of character images listed in a json file that maps image names to letter labels.
    '''

    def __init__(self, im_dir, file_path, norm_size=(32, 32)):
        '''
        :param im_dir: path to directory with images
        :param file_path: json file containing image names and labels
        :param norm_size: image normalization size, (height, width)
        '''
        # this time we will try to recognize 26 English letters (case-insensitive)
        letters = string.ascii_letters[-26:]  # ABCD...XYZ
        self.alphabet = {letter: i for i, letter in enumerate(letters)}
        self.norm_size = norm_size

        with open(file_path, 'r') as f:
            imgs = json.load(f)
        self.im_paths = [os.path.join(im_dir, im_name) for im_name in imgs.keys()]
        self.labels = list(imgs.values())

    def __len__(self):
        # total number of samples in the dataset
        return len(self.im_paths)

    def __getitem__(self, index):
        '''
        :param index: sample index; negative indices count from the end
        :return: (im, label) where im is the grey-scale image scaled to [-1, 1]
                 and label is the 0-based index of the (upper-cased) letter
        :raises IndexError: if index is outside the dataset
        :raises FileNotFoundError: if the image cannot be read
        '''
        # IndexError (not AssertionError) is what the sequence protocol expects for
        # an out-of-range index, and index == len(self) is out of range as well
        if not -len(self) <= index < len(self):
            raise IndexError('index range error')

        # read an image and convert it to grey scale
        im_path = self.im_paths[index]
        im = cv2.imread(im_path)
        if im is None:
            # cv2.imread reports a missing or unreadable file by returning None
            raise FileNotFoundError('cannot read image: {}'.format(im_path))
        im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

        # image pre-processing: resize, then map pixel values from [0, 255] to [-1, 1]
        # NOTE(review): cv2.resize takes dsize as (width, height) while norm_size is
        # documented as (height, width); only matters for non-square sizes - confirm
        im = cv2.resize(im, self.norm_size)
        im = np.divide(im, 255.)
        im = (im - 0.5) * 2.0

        # convert the English letter (case-insensitive) into a number index
        label = self.alphabet[self.labels[index].upper()]
        return im, label
def dataLoader(im_dir, file_path, norm_size, batch_size, workers=0):
    '''
    Build a DataLoader over a ListDataset.
    :param im_dir: path to directory with images
    :param file_path: file with image paths and labels
    :param norm_size: image normalization size, (height, width)
    :param batch_size: batch size
    :param workers: value passed to DataLoader as num_workers
    :return: a data loader
    '''
    # samples are shuffled only when the list file path contains the substring 'train'
    is_training_list = 'train' in file_path
    return DataLoader(ListDataset(im_dir, file_path, norm_size),
                      batch_size=batch_size,
                      shuffle=is_training_list,
                      num_workers=workers)
# ==== Part 2: training, validation and testing
def train_val(model, trainloader, valloader, n_epochs,
              lr, optim_type, momentum, weight_decay,
              valInterval, device='cpu', model_path=None):
    '''
    The main training procedure
    ----------------------------
    :param model: the MLP model
    :param trainloader: the dataloader of the train set
    :param valloader: the dataloader of the validation set
    :param n_epochs: number of training epochs
    :param lr: learning rate
    :param optim_type: optimizer, can be 'sgd', 'adagrad', 'rmsprop', 'adam', or 'adadelta'
    :param momentum: only used if optim_type == 'sgd'
    :param weight_decay: the factor of L2 penalty on network weights
    :param valInterval: the frequency of validation, e.g., if valInterval = 5, then do validation after each 5 training epochs
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    :param model_path: file the model parameters are saved to after each validation;
                       when None, the module-level `opt.model_path` is used (the
                       previous behaviour, which only works when run as a script)
    :raises NotImplementedError: if optim_type is not one of the supported names
    '''
    # define the cross entropy loss function.
    ce_loss = CrossEntropyLoss.apply

    # optimizer
    if optim_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay)
    elif optim_type == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr, weight_decay=weight_decay)
    else:
        print('[Error] optim_type should be one of sgd, adagrad, rmsprop, adam, or adadelta')
        raise NotImplementedError

    # average loss of each training epoch
    losses = []
    for epoch in range(n_epochs):
        # set the model in training mode
        model.train()

        # total loss accumulated over the batches of this epoch
        total_loss = 0.

        for data, labels in trainloader:
            # step 1: set data type and device
            data = data.type(torch.float32)
            data = data.to(device)
            labels = labels.to(device)

            # step 2: convert an image to a vector as the input of the MLP
            data = torch.flatten(data, start_dim=1)

            # clear gradients left over from the previous iteration
            optimizer.zero_grad()

            # step 3: forward pass
            output = model(data)

            # step 4: compute the loss and back-propagate
            loss = ce_loss(output, labels)
            loss.backward()

            # step 5: loss.item() returns a plain python number, detached from the graph
            total_loss += loss.item()

            # step 6: update the parameters of the model
            optimizer.step()

        # average of the total loss over the iterations of this epoch
        avg_loss = total_loss / len(trainloader)
        losses.append(avg_loss)
        print('Epoch {:02d}: loss = {:.3f}'.format(epoch + 1, avg_loss))

        # validation
        if (epoch + 1) % valInterval == 0:
            val_acc = test(model, valloader, device)

            # show prediction accuracy
            print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(epoch + 1, 100 * val_acc))

            # fall back to the script-level options only when no explicit path was given
            model_save_path = opt.model_path if model_path is None else model_path
            # torch.save does not create missing directories
            save_dir = os.path.dirname(model_save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            torch.save({'state_dict': model.state_dict(),
                        }, model_save_path)
            print('Model saved in {}\n'.format(model_save_path))

    # draw the loss curve
    plot_loss(losses)
def test(model, testloader, device):
'''
The testing procedure
----------------------------
:param model: the MLP model
:param testloader: the dataloader to be tested/validated
:param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
'''
# set the model in evaluation mode
model.eval()
n_correct = 0. # number of images that are correctly classified
n_imgs = 0. # number of total images
with torch.no_grad(): # we do not need to compute gradients during validation
#TODO 3: get the prediction of the data and calculate the accuracy
for imgs, labels in testloader:
# step 1: set data type and device
# imgs = torch.from_numpy(imgs)
imgs = imgs.type(torch.float32)
imgs = imgs.to(device)
labels = labels.to(device)
# step 2: convert an image to a vector as the input of the MLP
imgs = torch.flatten(imgs, start_dim=1)
# step 3: run the model which is the forward process
output = model(imgs)
# step 4: get the predicted value by the output using out.argmax(1)
pred = output.argmax(1)
# step 5: sum up the number of images correctly recognized and the total image number
for predict, label in zip(pred, labels):
if predict == label:
n_correct += 1
n_imgs += 1
accuracy = n_correct / n_imgs
return accuracy
# ==== Part 3: predict new images
def predict(model, im_path, norm_size, device):
    '''
    The predicting procedure
    ---------------
    :param model: the MLP model
    :param im_path: path of an image
    :param norm_size: image normalization size, (height, width)
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    :return: the predicted upper-case letter (it is also printed)
    :raises FileNotFoundError: if the image cannot be read
    '''
    # enter the evaluation mode
    model.eval()

    # image pre-processing, same steps as in ListDataset
    im = cv2.imread(im_path)
    if im is None:
        # cv2.imread reports a missing or unreadable file by returning None
        raise FileNotFoundError('cannot read image: {}'.format(im_path))
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    # NOTE(review): cv2.resize takes dsize as (width, height) while norm_size is
    # documented as (height, width); only matters for non-square sizes - confirm
    im = cv2.resize(im, norm_size)
    im = np.divide(im, 255.)
    im = (im - 0.5) * 2.0

    # convert im from numpy.ndarray to torch.tensor
    im = torch.from_numpy(im)

    # flatten the image into a single-row batch and run the model
    with torch.no_grad():
        features = im.view(1, -1).type(torch.float32).to(device)
        out = model(features)
        prediction = out.argmax(1)[0].item()

    # convert index of prediction to the corresponding character
    letters = string.ascii_letters[-26:]  # ABCD...XYZ
    prediction = letters[prediction]

    print('Prediction: {}'.format(prediction))
    return prediction
# ==== Part 4: draw the loss curve
def plot_loss(losses):
    '''
    Draw the loss curve over the training epochs and display it.
    :param losses: list of losses for each epoch
    :return: None
    '''
    # only the axes object is needed, the figure handle is discarded
    _, axis = plt.subplots()
    axis.plot(losses)
    axis.set_xlabel('training epoch')
    axis.set_ylabel('loss')
    plt.show()
if __name__ == '__main__':
    def _size_pair(text):
        '''
        Parse an image size given on the command line, e.g. "32,32" or "32x32".
        argparse reports the ValueError raised here as an invalid argument value.
        '''
        height, width = (int(v) for v in text.lower().replace('x', ',').split(','))
        return height, width

    # set random seed for reproducibility
    seed = 2023
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

    # set configurations
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='train', help='train, test or predict')
    parser.add_argument('--im_dir', type=str, default='data/character_classification/images',
                        help='path to directory with images')
    parser.add_argument('--train_file_path', type=str, default='data/character_classification/train.json',
                        help='file list of training image paths and labels')
    parser.add_argument('--val_file_path', type=str, default='data/character_classification/validation.json',
                        help='file list of validation image paths and labels')
    parser.add_argument('--test_file_path', type=str, default='data/character_classification/test.json',
                        help='file list of test image paths and labels')
    parser.add_argument('--batchsize', type=int, default=8, help='batch size')
    parser.add_argument('--device', type=str, default='cpu', help='cpu or cuda')

    # configurations for training
    parser.add_argument('--hsize', type=str, default='32', help='hidden size for each hidden layer, splitted by comma')
    parser.add_argument('--layer', type=int, default=2, help='number of layers in the MLP')
    parser.add_argument('--act', type=str, default='relu',
                        help='type of activation function, can be sigmoid, tanh, or relu')
    # type=tuple would split the argument string into single characters
    # ("32,32" -> ('3', '2', ',', '3', '2')), so a dedicated parser is used
    parser.add_argument('--norm_size', type=_size_pair, default=(32, 32), help='image normalization size, (height, width)')
    parser.add_argument('--epoch', type=int, default=50, help='number of training epochs')
    parser.add_argument('--n_classes', type=int, default=26, help='number of classes')
    parser.add_argument('--valInterval', type=int, default=10, help='the frequency of validation')
    parser.add_argument('--lr', type=float, default=5e-4, help='learning rate')
    parser.add_argument('--optim_type', type=str, default='sgd', help='type of optimizer, can be sgd, adagrad, rmsprop, adam, or adadelta')
    parser.add_argument('--momentum', type=float, default=0.9, help='momentum of the SGD optimizer, only used if optim_type is sgd')
    parser.add_argument('--weight_decay', type=float, default=0., help='the factor of L2 penalty on network weights')

    # configurations for test and prediction
    parser.add_argument('--model_path', type=str, default='saved_models/recognition.pth', help='path of a saved model')
    parser.add_argument('--im_path', type=str, default='data/character_classification/new_images/predict01.png',
                        help='path of an image to be recognized')
    opt = parser.parse_args()

    # initialize the MLP model
    # the input of the MLP is an image flattened to a vector of norm_size[0] * norm_size[1] values;
    # the output size follows --n_classes (default 26) instead of a hard-coded constant
    hidden_sizes = [int(num) for num in opt.hsize.split(',')]
    model = MLP(opt.norm_size[0] * opt.norm_size[1], opt.n_classes, hidden_sizes, opt.layer, opt.act)

    # for the 'test' and 'predict' mode, we should load the saved checkpoint into the model
    if opt.mode == 'test' or opt.mode == 'predict':
        # load on the CPU first; the model is moved to the requested device below
        checkpoint = torch.load(opt.model_path, map_location='cpu')
        # load model parameters we saved in model_path
        model.load_state_dict(checkpoint['state_dict'])
        print('[Info] Load model from {}'.format(opt.model_path))

    # put the model on CPU or GPU according to the device in args
    model = model.to(opt.device)

    # -- run the code for training and validation
    if opt.mode == 'train':
        # training and validation data loader
        trainloader = dataLoader(opt.im_dir, opt.train_file_path, opt.norm_size, opt.batchsize)
        valloader = dataLoader(opt.im_dir, opt.val_file_path, opt.norm_size, opt.batchsize)
        train_val(model, trainloader, valloader,
                  n_epochs=opt.epoch,
                  lr=opt.lr,
                  optim_type=opt.optim_type,
                  momentum=opt.momentum,
                  weight_decay=opt.weight_decay,
                  valInterval=opt.valInterval,
                  device=opt.device)

    # -- test the saved model
    elif opt.mode == 'test':
        testloader = dataLoader(opt.im_dir, opt.test_file_path, opt.norm_size, opt.batchsize)
        acc = test(model, testloader, opt.device)
        print('[Info] Test accuracy = {:.1f}%'.format(100 * acc))

    # -- predict a new image
    elif opt.mode == 'predict':
        predict(model, im_path=opt.im_path, norm_size=opt.norm_size, device=opt.device)

    else:
        print('mode should be train, test, or predict')
        raise NotImplementedError
#========================================================
# Media and Cognition
# Homework 1 Neural network basics
# recognition.py - character classification
# Student ID: 2022010639
# Name: Gao Yixuan
# Tsinghua University
# (C) Copyright 2024
#========================================================
# ==== Part 0: import libs
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import json, cv2, os, string
import matplotlib.pyplot as plt
import numpy as np
# this time we implement our networks and loss functions in other python script, and import them here
from network import MLP
from losses import CrossEntropyLoss
# argparse is used to conveniently set our configurations
import argparse
# ==== Part 1: data loader
# construct a dataset and a data loader, more details can be found in
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataloader
class ListDataset(Dataset):
    '''
    Dataset of character images described by a json file that maps
    image names to letters. Each sample is a grey-scale image scaled to
    [-1, 1] together with the index (0..25) of its upper-case letter.
    '''

    def __init__(self, im_dir, file_path, norm_size=(32, 32)):
        '''
        :param im_dir: path to directory with images
        :param file_path: json file containing image names and labels
        :param norm_size: image normalization size, (height, width)
        '''
        # this time we will try to recognize 26 English letters (case-insensitive)
        letters = string.ascii_letters[-26:]  # ABCD...XYZ
        self.alphabet = {letter: i for i, letter in enumerate(letters)}
        self.norm_size = norm_size
        with open(file_path, 'r') as f:
            imgs = json.load(f)
        self.im_paths = [os.path.join(im_dir, im_name) for im_name in imgs.keys()]
        self.labels = list(imgs.values())

    def __len__(self):
        # the __len__() function should return the total number of samples in the dataset
        return len(self.im_paths)

    def __getitem__(self, index):
        '''
        :param index: position of the sample
        :return: (image as numpy array with values in [-1, 1], label index)
        :raises IndexError: if index is past the end of the dataset
        :raises FileNotFoundError: if the image file cannot be read
        '''
        # the previous check `index <= len(self)` let index == len(self) through;
        # IndexError is the exception the sequence protocol expects for out-of-range access
        if index >= len(self):
            raise IndexError('index range error')

        # read an image and convert it to grey scale
        im_path = self.im_paths[index]
        im = cv2.imread(im_path)
        # cv2.imread returns None instead of raising when the file cannot be read
        if im is None:
            raise FileNotFoundError('cannot read image: {}'.format(im_path))
        im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

        # image pre-processing: after it the pixel values are within [-1, 1]
        # NOTE(review): cv2.resize expects (width, height); this only matters for non-square norm_size
        im = cv2.resize(im, self.norm_size)
        im = np.divide(im, 255.)
        im = (im - 0.5) * 2.0

        # get the label of the current image
        # upper() is used to convert a letter into uppercase
        label = self.labels[index].upper()
        # convert an English letter into a number index
        label = self.alphabet[label]

        return im, label
def dataLoader(im_dir, file_path, norm_size, batch_size, workers=0):
    '''
    Build a ListDataset and wrap it in a torch DataLoader.
    :param im_dir: path to directory with images
    :param file_path: file with image paths and labels
    :param norm_size: image normalization size, (height, width)
    :param batch_size: batch size
    :param workers: value passed to the DataLoader as num_workers
    :return: a data loader
    '''
    samples = ListDataset(im_dir, file_path, norm_size)
    # images are shuffled only when the list file name contains 'train'
    is_training_list = 'train' in file_path
    return DataLoader(
        samples,
        batch_size=batch_size,
        shuffle=is_training_list,
        num_workers=workers,
    )
# ==== Part 2: training, validation and testing
def train_val(model, trainloader, valloader, n_epochs,
              lr, optim_type, momentum, weight_decay,
              valInterval, device='cpu', model_save_path=None):
    '''
    The main training procedure
    ----------------------------
    :param model: the MLP model
    :param trainloader: the dataloader of the train set
    :param valloader: the dataloader of the validation set
    :param n_epochs: number of training epochs
    :param lr: learning rate
    :param optim_type: optimizer, can be 'sgd', 'adagrad', 'rmsprop', 'adam', or 'adadelta'
    :param momentum: only used if optim_type == 'sgd'
    :param weight_decay: the factor of L2 penalty on network weights
    :param valInterval: the frequency of validation, e.g., if valInterval = 5, then do validation after each 5 training epochs
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    :param model_save_path: where to save the checkpoint after each validation; if None, the
                            `model_path` of the command-line options (global `opt`) is used when
                            present, otherwise 'saved_models/recognition.pth'
    :raises NotImplementedError: if optim_type is not one of the supported optimizers
    '''
    # The function used to read the global `opt` directly, which only exists when this file is
    # run as a script; resolve the path up front so importing and calling train_val also works.
    if model_save_path is None:
        cli_opt = globals().get('opt')
        model_save_path = getattr(cli_opt, 'model_path', 'saved_models/recognition.pth')

    # define the cross entropy loss function.
    ce_loss = CrossEntropyLoss.apply

    # optimizer
    if optim_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay)
    elif optim_type == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr, weight_decay=weight_decay)
    else:
        print('[Error] optim_type should be one of sgd, adagrad, rmsprop, adam, or adadelta')
        raise NotImplementedError

    # training
    # to save loss of each training epoch in a python "list" data structure
    losses = []

    for epoch in range(n_epochs):
        # set the model in training mode
        model.train()

        # to save total loss in one epoch
        total_loss = 0.

        for data, labels in trainloader:  # get a batch of data
            # step 1: set data type and device
            data = data.type(torch.float32).to(device)
            labels = labels.to(device)

            # step 2: convert an image to a vector as the input of the MLP
            data = torch.flatten(data, start_dim=1)

            # hint: clear gradients in the optimizer
            optimizer.zero_grad()

            # step 3: run the model which is the forward process
            output = model(data)

            # step 4: compute the loss, and call backward propagation function
            loss = ce_loss(output, labels)
            loss.backward()

            # step 5: sum up of total loss, loss.item() return the value of the tensor as a standard python number
            # this operation is not differentiable
            total_loss += loss.item()

            # step 6: call a function, optimizer.step(), to update the parameters of the models
            optimizer.step()

        # average of the total loss for iterations
        avg_loss = total_loss / len(trainloader)
        losses.append(avg_loss)
        print('Epoch {:02d}: loss = {:.3f}'.format(epoch + 1, avg_loss))

        # validation
        if (epoch + 1) % valInterval == 0:
            val_acc = test(model, valloader, device)

            # show prediction accuracy
            print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(epoch + 1, 100 * val_acc))

            # save model parameters in a file; create the target directory first,
            # torch.save does not create missing directories
            save_dir = os.path.dirname(model_save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            torch.save({'state_dict': model.state_dict(),
                        }, model_save_path)
            print('Model saved in {}\n'.format(model_save_path))

    # draw the loss curve
    plot_loss(losses)
def test(model, testloader, device):
'''
The testing procedure
----------------------------
:param model: the MLP model
:param testloader: the dataloader to be tested/validated
:param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
'''
# set the model in evaluation mode
model.eval()
n_correct = 0. # number of images that are correctly classified
n_imgs = 0. # number of total images
with torch.no_grad(): # we do not need to compute gradients during validation
#TODO 3: get the prediction of the data and calculate the accuracy
for imgs, labels in testloader:
# step 1: set data type and device
# imgs = torch.from_numpy(imgs)
imgs = imgs.type(torch.float32)
imgs = imgs.to(device)
labels = labels.to(device)
# step 2: convert an image to a vector as the input of the MLP
imgs = torch.flatten(imgs, start_dim=1)
# step 3: run the model which is the forward process
output = model(imgs)
# step 4: get the predicted value by the output using out.argmax(1)
pred = output.argmax(1)
# step 5: sum up the number of images correctly recognized and the total image number
for predict, label in zip(pred, labels):
if predict == label:
n_correct += 1
n_imgs += 1
accuracy = n_correct / n_imgs
return accuracy
# ==== Part 3: predict new images
def predict(model, im_path, norm_size, device):
    '''
    The predicting procedure
    ---------------
    Read one image, pre-process it (grey scale, resize, scale pixel values
    to [-1, 1]), run the model on the flattened image and print the
    predicted upper-case letter.

    :param model: the MLP model
    :param im_path: path of an image
    :param norm_size: image normalization size, (height, width)
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    :return: the predicted character (it is also printed)
    :raises FileNotFoundError: if the image at im_path cannot be read
    '''
    # enter the evaluation mode
    model.eval()

    # image pre-processing, similar to what we do in ListDataset()
    im = cv2.imread(im_path)
    # cv2.imread does not raise on a missing or unreadable file, it returns None;
    # fail here with a clear message instead of a cryptic cvtColor error
    if im is None:
        raise FileNotFoundError('cannot read image: {}'.format(im_path))
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    # NOTE(review): cv2.resize expects (width, height); this only matters for non-square norm_size
    im = cv2.resize(im, norm_size)
    im = np.divide(im, 255.)
    im = (im - 0.5) * 2.0

    # convert im from numpy.ndarray to torch.tensor
    im = torch.from_numpy(im)

    # input im into the model
    with torch.no_grad():
        # a single flattened image as a batch of size 1 (named to avoid shadowing the builtin input())
        inputs = im.view(1, -1).type(torch.float32).to(device)
        out = model(inputs)
        prediction = out.argmax(1)[0].item()

    # convert index of prediction to the corresponding character
    letters = string.ascii_letters[-26:]  # ABCD...XYZ
    prediction = letters[prediction]
    print('Prediction: {}'.format(prediction))
    return prediction
# ==== Part 4: draw the loss curve
def plot_loss(losses):
    '''
    Draw the loss curve over the training epochs and display it.
    :param losses: list of losses for each epoch
    :return: None
    '''
    # only the axes object is needed, the figure handle is discarded
    _, axis = plt.subplots()
    axis.plot(losses)
    axis.set_xlabel('training epoch')
    axis.set_ylabel('loss')
    plt.show()
if __name__ == '__main__':
    def _size_pair(text):
        '''
        Parse an image size given on the command line, e.g. "32,32" or "32x32".
        argparse reports the ValueError raised here as an invalid argument value.
        '''
        height, width = (int(v) for v in text.lower().replace('x', ',').split(','))
        return height, width

    # set random seed for reproducibility
    seed = 2023
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

    # set configurations
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='train', help='train, test or predict')
    parser.add_argument('--im_dir', type=str, default='data/character_classification/images',
                        help='path to directory with images')
    parser.add_argument('--train_file_path', type=str, default='data/character_classification/train.json',
                        help='file list of training image paths and labels')
    parser.add_argument('--val_file_path', type=str, default='data/character_classification/validation.json',
                        help='file list of validation image paths and labels')
    parser.add_argument('--test_file_path', type=str, default='data/character_classification/test.json',
                        help='file list of test image paths and labels')
    parser.add_argument('--batchsize', type=int, default=8, help='batch size')
    parser.add_argument('--device', type=str, default='cpu', help='cpu or cuda')

    # configurations for training
    parser.add_argument('--hsize', type=str, default='32', help='hidden size for each hidden layer, splitted by comma')
    parser.add_argument('--layer', type=int, default=2, help='number of layers in the MLP')
    parser.add_argument('--act', type=str, default='relu',
                        help='type of activation function, can be sigmoid, tanh, or relu')
    # type=tuple would split the argument string into single characters
    # ("32,32" -> ('3', '2', ',', '3', '2')), so a dedicated parser is used
    parser.add_argument('--norm_size', type=_size_pair, default=(32, 32), help='image normalization size, (height, width)')
    parser.add_argument('--epoch', type=int, default=50, help='number of training epochs')
    parser.add_argument('--n_classes', type=int, default=26, help='number of classes')
    parser.add_argument('--valInterval', type=int, default=10, help='the frequency of validation')
    parser.add_argument('--lr', type=float, default=5e-4, help='learning rate')
    parser.add_argument('--optim_type', type=str, default='sgd', help='type of optimizer, can be sgd, adagrad, rmsprop, adam, or adadelta')
    parser.add_argument('--momentum', type=float, default=0.9, help='momentum of the SGD optimizer, only used if optim_type is sgd')
    parser.add_argument('--weight_decay', type=float, default=0., help='the factor of L2 penalty on network weights')

    # configurations for test and prediction
    parser.add_argument('--model_path', type=str, default='saved_models/recognition.pth', help='path of a saved model')
    parser.add_argument('--im_path', type=str, default='data/character_classification/new_images/predict01.png',
                        help='path of an image to be recognized')
    opt = parser.parse_args()

    # initialize the MLP model
    # the input of the MLP is an image flattened to a vector of norm_size[0] * norm_size[1] values;
    # the output size follows --n_classes (default 26) instead of a hard-coded constant
    hidden_sizes = [int(num) for num in opt.hsize.split(',')]
    model = MLP(opt.norm_size[0] * opt.norm_size[1], opt.n_classes, hidden_sizes, opt.layer, opt.act)

    # for the 'test' and 'predict' mode, we should load the saved checkpoint into the model
    if opt.mode == 'test' or opt.mode == 'predict':
        # load on the CPU first; the model is moved to the requested device below
        checkpoint = torch.load(opt.model_path, map_location='cpu')
        # load model parameters we saved in model_path
        model.load_state_dict(checkpoint['state_dict'])
        print('[Info] Load model from {}'.format(opt.model_path))

    # put the model on CPU or GPU according to the device in args
    model = model.to(opt.device)

    # -- run the code for training and validation
    if opt.mode == 'train':
        # training and validation data loader
        trainloader = dataLoader(opt.im_dir, opt.train_file_path, opt.norm_size, opt.batchsize)
        valloader = dataLoader(opt.im_dir, opt.val_file_path, opt.norm_size, opt.batchsize)
        train_val(model, trainloader, valloader,
                  n_epochs=opt.epoch,
                  lr=opt.lr,
                  optim_type=opt.optim_type,
                  momentum=opt.momentum,
                  weight_decay=opt.weight_decay,
                  valInterval=opt.valInterval,
                  device=opt.device)

    # -- test the saved model
    elif opt.mode == 'test':
        testloader = dataLoader(opt.im_dir, opt.test_file_path, opt.norm_size, opt.batchsize)
        acc = test(model, testloader, opt.device)
        print('[Info] Test accuracy = {:.1f}%'.format(100 * acc))

    # -- predict a new image
    elif opt.mode == 'predict':
        predict(model, im_path=opt.im_path, norm_size=opt.norm_size, device=opt.device)

    else:
        print('mode should be train, test, or predict')
        raise NotImplementedError

41
hw3/code/check.py Normal file
View File

@@ -0,0 +1,41 @@
# ========================================================
# Media and Cognition
# Homework 3 Support Vector Machine
# check.py - Check your implementation of several modules
# Tsinghua University
# (C) Copyright 2024
# ========================================================
from svm_hw import SVM_HINGE, LinearFunction, Hinge
import torch
from torch.autograd import gradcheck
def run():
    """
    Check the homework modules: numerical gradient checks for LinearFunction
    and Hinge, followed by a forward pass through SVM_HINGE.
    :raises Exception: if the SVM_HINGE forward pass or its parameter checks fail
    """
    model = SVM_HINGE(2, C=1.0).double()

    # gradient check of the linear function (double precision for the numerical check)
    x = torch.randn(50, 2, requires_grad=False).double()
    W = torch.randn(1, 2, requires_grad=True).double()
    b = torch.zeros(1, requires_grad=True).double()
    passed = gradcheck(LinearFunction.apply, (x, W, b), eps=1e-6, atol=1e-4)
    if passed:
        print('Linear successully tested!')

    # gradient check of the hinge loss
    output = torch.randn(50, 1, requires_grad=True).double()
    W = torch.randn(1, 2, requires_grad=True).double()
    labels = torch.ones(1, requires_grad=False).double()
    C = torch.tensor([[1.0]], requires_grad=False).double()
    passed = gradcheck(Hinge.apply, (output, W, labels, C), eps=1e-6, atol=1e-5)
    if passed:
        print('Hinge successfully tested')

    # forward pass of the whole model; W and b must be trainable
    x = torch.randn(50, 2, requires_grad=False).double()
    labels = torch.ones(50, requires_grad=False).double()
    try:
        output, loss = model(x, labels)
        assert model.W.requires_grad is True
        assert model.b.requires_grad is True
        print('SVM_HINGE successfully tested')
    except Exception as err:
        # a bare `except:` would also intercept KeyboardInterrupt / SystemExit;
        # chain the original error so the real cause stays visible
        raise Exception('Failed testing SVM_HINGE!') from err
# run the checks only when this file is executed as a script
if __name__ == '__main__':
    run()

181
hw3/code/data_preprocess.py Normal file
View File

@@ -0,0 +1,181 @@
# ========================================================
# Media and Cognition
# Homework 3 Support Vector Machine
# data_preprocess.py - Using pretrained convolutional layers to extract feature,
# and using PCA for dimensionality reduction
# Student ID: 2022010639
# Name: Yixuan Gao
# Tsinghua University
# (C) Copyright 2024
# ========================================================
import os
import torchvision.transforms as transforms
import torch
from PIL import Image
from networks import Classifier
import matplotlib.pyplot as plt
import argparse
def preprocess(pre_conv, data_root, image_size, classes):
    """
    Extract features for the train / val / test splits, project them with the
    PCA fitted on the training split, show them and save them as .pt files
    under data_root.
    :param pre_conv: network used by loaddata() to extract features
    :param data_root: the path of all datasets
    :param image_size: the preset size that each image try to zoom to
    :param classes: two classes that need to be classified
    """
    # (split name, message printed before processing, message printed after saving)
    stages = (
        ('train', "Start preprocessing the training dataset !!!", "training dataset saved !!!"),
        ('val', "Start preprocessing the validation dataset!!!", "validation dataset saved !!!"),
        ('test', "Start preprocessing the testing dataset!!!", "testing dataset saved !!!"),
    )

    mean_vec = None
    projection = None
    for split, banner, done in stages:
        print(banner)
        feats, targets = loaddata(pre_conv, data_root, split, image_size, classes)

        if split == 'train':
            # the mean and the PCA projection matrix come from the training split only
            mean_vec, projection = PCA(feats, 2)
            projection = projection * 20

        # compress the dimensionality after subtracting the mean vector
        reduced = (feats - mean_vec) @ projection
        visualize(reduced, targets, split)
        savedata(reduced, targets, data_root + "/" + split + ".pt")
        print(done)
def savedata(data, label, save_path):
    """
    Save samples and labels with torch.save as a dict with the keys
    'data' and 'label'.
    :param data: the samples to store
    :param label: the labels to store
    :param save_path: destination file
    """
    torch.save(dict(data=data, label=label), save_path)
def visualize(datas, labels, mode):
    """
    Display feature points after dimensionality reduction
    -------------------------------
    :param datas: the samples after dimensionality reduction, with the shape of [N, 2]
    :param labels: the labels (chosen from {-1, +1}) corresponding to the samples
    :param mode: chosen from {'train', 'val', 'test'}, used as the plot title
    :return:
    """
    plt.figure()
    # one scatter series per class label; the loop used to be bounded by the feature
    # dimension datas.shape[1], which only matched the two labels because PCA keeps 2 dims
    for class_label in (-1, 1):
        selected = labels == class_label
        plt.scatter(datas[selected, 0], datas[selected, 1], label=class_label)
    plt.legend()
    plt.title(mode)
    plt.show()
def PCA(data, dim=2):
    """
    calculate the mean value of the data and the projection matrix for PCA
    :param data: the sample features, one sample per row ([N, D])
    :param dim: the data dimension after projection
    :return:
        data_mean: the mean value of the data over the samples
        u: the projection matrix for PCA, with the shape of [D, dim]
    """
    # mean over the sample axis
    center = data.mean(dim=0)
    # covariance matrix of the centred samples (torch.cov expects variables in rows)
    covariance = torch.cov((data - center).T)
    # left singular vectors of the covariance matrix; the first `dim` columns are kept
    # reference: https://pytorch.org/docs/1.11/generated/torch.linalg.svd.html
    left_vectors = torch.linalg.svd(covariance)[0]
    return center, left_vectors[:, :dim]
def loaddata(pre_conv, data_root, mode, image_size, classes):
    """
    load one dataset, and use the given network to extract a feature per image
    :param pre_conv: network passed on to readimg() for feature extraction
    :param data_root: the path of the dataset
    :param mode: chosen from {'train', 'val', 'test'}
    :param image_size: the preset size that each image try to zoom to
    :param classes: two classes that need to be classified
    :return:
        datas: the stacked features, one row per image
        labels: the corresponding labels, -1 for classes[0] and +1 for classes[1]
    """
    assert len(classes) == 2
    datas, labels = [], []
    for idx, class_name in enumerate(classes):
        # images of one class live in <data_root>/<mode>/<class name>/
        folder = data_root + '/' + mode + '/' + class_name
        for img in os.listdir(folder):
            datas.append(readimg(pre_conv, folder + '/' + img, image_size))
            # class index 0 / 1 is mapped to label -1 / +1
            labels.append(2 * idx - 1)
    return torch.stack(datas), torch.tensor(labels)
def readimg(pre_conv, filepath, image_size):
    """
    Read one image and use the given network to extract the feature
    --------------------------
    :param pre_conv: network applied to the image (a batch of size 1)
    :param filepath: the file path of one image
    :param image_size: the preset size that each image try to zoom to
    :return:
        data: the network output flattened to one dimension
    """
    # ToTensor followed by Normalize(0.5, 0.5)
    to_normalized_tensor = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(0.5, 0.5),
    ])
    picture = Image.open(filepath).convert('RGB').resize(image_size)
    # add the batch dimension expected by the network
    batch = to_normalized_tensor(picture).unsqueeze(0)
    return pre_conv(batch).reshape(-1)
if __name__ == "__main__":
    def _size_pair(text):
        """
        Parse an image size given on the command line, e.g. "32,32" or "32x32".
        argparse reports the ValueError raised here as an invalid argument value.
        """
        first, second = (int(v) for v in text.lower().replace('x', ',').split(','))
        return first, second

    parser = argparse.ArgumentParser()
    parser.add_argument("--pretrained_net", type=str, default="checkpoints/bn/ckpt_epoch_15.pth",
                        help="the filepath of the pretrained network in homework 2")
    parser.add_argument("--data_root", type=str, default="data", help="the path of all datasets")
    # type=tuple would split the argument string into single characters, so a dedicated parser is used
    parser.add_argument("--image_size", type=_size_pair, default=(32, 32),
                        help="the preset size that each image try to zoom to")
    # NOTE(review): a value given on the command line arrives as one string, so only two
    # single-character class names (e.g. "BC") pass the length check in loaddata()
    parser.add_argument("--classes", default=["B", "C"], help="two classes that need to be classified")
    args = parser.parse_args()

    # rebuild the homework 2 classifier from the configuration stored in the checkpoint
    pretrained_checkpoint = torch.load(args.pretrained_net, map_location="cpu")
    configs = pretrained_checkpoint["configs"]
    cls = Classifier(
        configs["in_channels"],
        configs["num_classes"],
        configs["use_batch_norm"],
        configs["use_stn"],
        configs["dropout_prob"],
    )
    cls.load_state_dict(pretrained_checkpoint["model_state"], strict=False)

    # the network is only used as a fixed feature extractor
    for param in cls.parameters():
        param.requires_grad = False
    # evaluation mode: a freshly built module is in training mode, where the dropout layer
    # of conv_net would randomly zero features during extraction
    cls.eval()
    conv = cls.conv_net

    preprocess(conv, args.data_root, args.image_size, args.classes)

26
hw3/code/datasets.py Normal file
View File

@@ -0,0 +1,26 @@
# ========================================================
# Media and Cognition
# Homework 3 Support Vector Machine
# datasets.py - Define the data loader for the traffic sign classification dataset
# Student ID:
# Name:
# Tsinghua University
# (C) Copyright 2024
# ========================================================
import torch
import torch.utils.data as data
class Traffic_Dataset(data.Dataset):
    """
    Dataset backed by a file written with torch.save that holds a dict
    with the entries "data" and "label".
    """

    def __init__(self, data_root):
        """
        :param data_root: path of the saved file
        """
        payload = torch.load(data_root)
        self.datas = payload["data"]
        self.labels = payload["label"]

    def __getitem__(self, index):
        # one sample together with its label
        sample = self.datas[index]
        target = self.labels[index]
        return sample, target

    def __len__(self):
        # number of stored samples
        return len(self.datas)

271
hw3/code/networks.py Normal file
View File

@@ -0,0 +1,271 @@
# ========================================================
# Media and Cognition
# Homework 2 Convolutional Neural Network
# networks.py - Network definition
# Student ID: 2022010639
# Name: Gao Yixuan
# Tsinghua University
# (C) Copyright 2024
# ========================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
class ConvBlock(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        use_batch_norm=False,
        use_residual=False,
    ):
        """
        Convolutional block: conv -> (optional) batch normalization -> ReLU,
        with an optional residual connection around the whole block
        ----------------------
        :param in_channels: channel number of input image
        :param out_channels: channel number of output image
        :param kernel_size: size of convolutional kernel
        :param stride: stride of convolutional operation
        :param padding: padding of convolutional operation
        :param use_batch_norm: whether to use batch normalization in convolutional layers
        :param use_residual: whether to use residual connection
        """
        super().__init__()

        # the identity module stands in for batch normalization when it is disabled
        norm_layer = nn.BatchNorm2d if use_batch_norm else nn.Identity

        self.use_residual = use_residual

        # network structure: conv -> batchnorm -> relu
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
        )
        self.bn = norm_layer(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        :param x: input feature map
        :return: relu(bn(conv(x))), plus x when the residual connection is enabled
        """
        activated = self.relu(self.bn(self.conv(x)))
        # the residual sum needs the block to keep the shape of its input
        return activated + x if self.use_residual else activated
class Classifier(nn.Module):
    def __init__(
        self,
        in_channels,
        num_classes,
        use_batch_norm=False,
        use_stn=False,
        dropout_prob=0,
    ):
        """
        Convolutional Neural Networks
        ----------------------
        :param in_channels: channel number of input image
        :param num_classes: number of classes for the classification task
        :param use_batch_norm: whether to use batch normalization in convolutional layers and linear layers
        :param use_stn: whether to use spatial transformer network
        :param dropout_prob: dropout ratio of dropout layer which ranges from 0 to 1
        """
        super().__init__()

        if use_batch_norm:
            bn1d = nn.BatchNorm1d
        else:
            # use identity function to replace batch normalization
            bn1d = nn.Identity

        if use_stn:
            self.stn = STN(in_channels)
        else:
            # use identity function to replace spatial transformer network
            self.stn = nn.Identity(in_channels)

        # multilayer convolutional neural network
        # input image with size [batch_size, in_channels, img_h, img_w]
        # Network structure:
        #            kernel_size  stride  padding  out_channels  use_residual
        # ConvBlock      5          1        2          32         False
        # ConvBlock      5          2        2          64         False
        # maxpool        2          2        0
        # ConvBlock      3          1        1          64         True
        # ConvBlock      3          1        1          128        False
        # maxpool        2          2        0
        # ConvBlock      3          1        1          128        True
        # dropout(p), where p is input parameter of dropout ratio
        # NOTE(review): use_batch_norm is not passed to the ConvBlocks, so the convolutional
        # layers never use batch normalization; left unchanged to stay compatible with
        # already saved checkpoints - confirm whether this is intended
        self.conv_net = nn.Sequential(
            ConvBlock(
                in_channels=in_channels,
                out_channels=32,
                kernel_size=5,
                stride=1,
                padding=2,
            ),
            ConvBlock(
                in_channels=32, out_channels=64, kernel_size=5, stride=2, padding=2
            ),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ConvBlock(
                in_channels=64,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=1,
                use_residual=True,
            ),
            ConvBlock(
                in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1
            ),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ConvBlock(
                in_channels=128,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=1,
                use_residual=True,
            ),
            nn.Dropout2d(p=dropout_prob),
        )

        # sub-network with two linear layers
        # for the default (3, 32, 32) input the convolutional part outputs 128 x 4 x 4 = 2048 values
        # Network structure:
        #            out_channels
        # linear          256
        # activation
        # batchnorm
        # dropout(p), where p is input parameter of dropout ratio
        # linear       num_classes
        self.fc_net = nn.Sequential(
            nn.Linear(2048, 256),
            nn.ReLU(),
            bn1d(256),
            # element-wise dropout: nn.Dropout1d would read the 2-D [batch_size, 256] input as an
            # unbatched (channels, length) tensor and zero out whole samples instead of features.
            # Neither layer has parameters, so saved checkpoints still load.
            nn.Dropout(dropout_prob),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """
        Define the forward function
        :param x: input features with size [batch_size, in_channels, img_h, img_w]
        :return: output features with size [batch_size, num_classes]
        """
        # Step 1: apply spatial transformer network if applicable
        x = self.stn(x)

        # Step 2: forward process for the convolutional network
        x = self.conv_net(x)

        # Step 3: flatten the tensor to match the size of the input of the
        # fully connected layers.
        x = x.view(x.shape[0], -1)

        # Step 4: forward process for the fully connected network
        out = self.fc_net(x)

        return out
class STN(nn.Module):
    def __init__(self, in_channels):
        """
        The spatial transformer network (STN) learns how to perform spatial transformations on the
        input image in order to enhance the geometric invariance of the model. For example, it can
        crop a region of interest, scale and correct the orientation of an image. It can be a useful
        mechanism because CNNs are not invariant to rotation and scale and more general affine
        transformations.

        The spatial transformer network boils down to three main components:
        - The localization network is a regular CNN which regresses the transformation parameters.
          The transformation is never learned explicitly from this dataset, instead the network
          learns automatically the spatial transformations that enhance the global accuracy.
        - The grid generator generates a grid of coordinates in the input image corresponding
          to each pixel from the output image.
        - The sampler uses the parameters of the transformation and applies it to the input image.

        Here, we implement an STN that performs affine transformations on the input images.
        For more information, please refer to the slides and
        https://pytorch.org/tutorials/intermediate/spatial_transformer_tutorial.html .
        ----------------------
        :param in_channels: channel number of input image
        """
        super().__init__()

        # >>> TODO 4.1: Build your localization net
        # Step 1: three stride-2 ConvBlocks with doubling output channels (8 -> 16 -> 32).
        # NOTE(review): the shape comments below assume 32x32 inputs and that ConvBlock
        # wraps a standard convolution with these arguments -- confirm against ConvBlock.
        self.localization_conv = nn.Sequential(
            ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=9, stride=2, padding=4, use_batch_norm=True),
            # presumably 8 x 16 x 16
            ConvBlock(in_channels=8, out_channels=16, kernel_size=5, stride=2, padding=2, use_batch_norm=True),
            # presumably 16 x 8 x 8
            ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
            # presumably 32 x 4 x 4, which is what the first linear layer below expects
        )

        # Step 2: two linear layers (with ReLU and BN in between) regress the 6 affine parameters.
        self.localization_fc = nn.Sequential(
            nn.Linear(32 * 4 * 4, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 6),
        )
        # <<< TODO 4.1

        # >>> TODO 4.2: make the STN output the identity transformation before training.
        # Zero weights alone are not enough: the bias would keep its random default and
        # theta would be a random (non-identity) affine map. With zero weights the output
        # equals the bias, so the bias must be the flattened identity [[1, 0, 0], [0, 1, 0]].
        last_linear = self.localization_fc[3]
        nn.init.zeros_(last_linear.weight)
        identity_theta = last_linear.bias.data.new_tensor([1.0, 0.0, 0.0, 0.0, 1.0, 0.0])
        last_linear.bias.data.copy_(identity_theta)
        # <<< TODO 4.2

    def forward(self, x):
        """
        Warp the input batch with the affine transformation predicted from it.

        :param x: input images; the batch dimension is first
        :return: resampled images with the same shape as `x`
        """
        # Extract the features from input images and flatten them
        features = self.localization_conv(x)
        features = features.view(features.shape[0], -1)

        # Predict the parameters of affine transformation from the extracted features
        theta = self.localization_fc(features)
        theta = theta.view(-1, 2, 3)

        # Apply affine transformation to input images
        grid = F.affine_grid(theta, x.shape, align_corners=False)
        x = F.grid_sample(x, grid, align_corners=False)
        return x

148
hw3/code/svm_hw.py Normal file
View File

@@ -0,0 +1,148 @@
# ========================================================
# Media and Cognition
# Homework 3 Support Vector Machine
# svm_hw.py - The implementation of SVM using hinge loss
# Student ID: 2022010639
# Name: Yixuan Gao
# Tsinghua University
# (C) Copyright 2024
# ========================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
# TODO 1: complete the forward and backward propagation processes of the linear layer
class LinearFunction(torch.autograd.Function):
    '''
    Custom autograd function implementing the linear map
        y = xW^T + b
    together with a hand-written gradient computation.
    '''

    @staticmethod
    def forward(ctx, x, W, b):
        '''
        Input:
        :param ctx: a context object that can be used to stash information for backward computation
        :param x: input features with size [batch_size, input_size]
        :param W: weight matrix with size [output_size, input_size]
        :param b: bias with size [output_size]
        Return:
        y :output features with size [batch_size, output_size]
        '''
        y = torch.matmul(x, W.T) + b
        # only x and W are needed in backward; dL/db depends on grad_output alone
        ctx.save_for_backward(x, W)
        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Input:
        :param ctx: a context object with saved variables
        :param grad_output: dL/dy, with size [batch_size, output_size]
        Return:
        grad_input: dL/dx, with size [batch_size, input_size]
        grad_W: dL/dW, with size [output_size, input_size], summed for data in the batch
        grad_b: dL/db, with size [output_size], summed for data in the batch
        '''
        # `saved_tensors` is the supported accessor (the old `saved_variables` is deprecated)
        x, W = ctx.saved_tensors

        # dL/dx = dL/dy * W
        grad_input = torch.matmul(grad_output, W)
        # dL/dW = (dL/dy)^T * x, which also sums over the batch dimension
        grad_W = torch.matmul(grad_output.T, x)
        # dL/db = sum of dL/dy over the batch dimension
        grad_b = grad_output.sum(0)
        return grad_input, grad_W, grad_b
# TODO 2: complete the forward and backward propagation processes of the hinge loss
class Hinge(torch.autograd.Function):
    """Hinge loss with an L2 penalty on W, with a hand-written backward pass."""

    @staticmethod
    def forward(ctx, output, W, label, C):
        """
        Evaluate loss = 0.5*||w||^2 + C*\\sum_i{max(0, 1 - y_i*output_i)}
        --------------------------------------
        :param ctx: a context object that can be used to stash information for backward computation
        :param output: the output of the linear layer with size [batch_size, 1], i.e. output = W^T*x + b
        :param W: weight matrix with size [1, input_size]
        :param label: the ground truth y in the equation for loss calculation, with size [batch_size]
        :param C: the regularization coefficient of hinge loss with size [1, 1]
        :return: the hinge loss with size [1, 1]
        """
        C = C.type_as(W)

        # per-sample margin violation 1 - y_i * output_i, kept as a column of size [batch_size, 1]
        slack = 1 - (output.T * label).T
        # F.relu() plays the role of max(0, .)
        loss = 1/2 * (W @ W.T) + C * F.relu(slack).sum()

        ctx.save_for_backward(output, W, label, C)
        return loss

    @staticmethod
    def backward(ctx, grad_loss):
        """
        Gradient of the hinge loss with respect to `output` and `W`
        :param ctx: a context object with saved variables
        :param grad_loss: dL/dloss, with size [1, 1], the gradient of the final target loss with respect to the output (variable 'loss') of the forward function
        :return:
        grad_output: dL/doutput, with size [batch_size, 1]
        grad_W: dL/dW, with size [1, channels]
        (label and C receive no gradient)
        """
        output, W, label, C = ctx.saved_tensors

        slack = 1 - (output.T * label).T
        # indicator of the active hinge; a slack of exactly 0 counts as active (value 1)
        value_at_zero = torch.tensor(1).type_as(output)
        active = torch.heaviside(slack, value_at_zero)
        # d relu(1 - y*o) / d o = -y wherever the hinge is active
        d_hinge = (active.T * (-label)).T

        grad_output = grad_loss * C * d_hinge
        # the 0.5*||w||^2 term contributes W itself
        grad_W = grad_loss * W
        return grad_output, grad_W, None, None
# TODO 3: complete the structure of SVM model
class SVM_HINGE(nn.Module):
    def __init__(self, in_channels, C):
        """
        Linear SVM trained with the hinge loss.

        :param in_channels: number of feature channels for SVM input
        :param C: regularization coefficient of hinge loss (a Python number; stored as a [1, 1] tensor)
        """
        super().__init__()

        # W has shape [1, channels] and b has shape [1, ]; both are trainable parameters.
        # They are drawn with torch.randn() (standard normal) as the assignment requires;
        # torch.rand() would start every weight in [0, 1), i.e. all positive.
        self.W = nn.Parameter(torch.randn(1, in_channels), requires_grad=True)
        self.b = nn.Parameter(torch.randn(1), requires_grad=True)
        # C is a plain tensor (not a parameter or buffer), so it is absent from state_dict()
        # and is not moved by .to(device); Hinge.forward casts it with type_as(W).
        self.C = torch.tensor([[C]], requires_grad=False)

    def forward(self, x, label=None):
        """
        :param x: input features passed to the linear function
        :param label: optional ground-truth labels; when given, the hinge loss is also computed
        :return: (output, loss) where output holds -1.0 / +1.0 predictions and loss is None if no label was given
        """
        # SVM calculation
        output = LinearFunction.apply(x, self.W, self.b)

        if label is not None:
            loss = Hinge.apply(output, self.W, label, self.C)
        else:
            loss = None

        # threshold the raw score at 0 and map {False, True} to {-1.0, +1.0}
        output = (output > 0.0).type_as(x) * 2.0 - 1.0
        return output, loss

110
hw3/code/test_svm.py Normal file
View File

@@ -0,0 +1,110 @@
# ========================================================
# Media and Cognition
# Homework 3 Support Vector Machine
# test_svm.py - Test svm model for traffic sign
# Student ID: 2022010639
# Name: Yixuan Gao
# Tsinghua University
# (C) Copyright 2024
# ========================================================
# ==== Part 1: import libs
import argparse
import torch
from datasets import Traffic_Dataset
from svm_hw import SVM_HINGE
from torch.utils.data import DataLoader
import os.path
# ==== Part 2: testing
def test(
    data_root,
    model_save_path,
    device,
):
    """
    The main testing procedure of SVM model
    ----------------------------
    :param data_root: path to the root directory of dataset
    :param model_save_path: path to pretrained SVM model
    :param device: device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    """

    # ==================== load the pretrained SVM model ====================
    # testing data loader: one sample per batch, fixed order
    test_data = Traffic_Dataset(os.path.join(data_root, 'test.pt'))
    test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

    # map_location lets a checkpoint that was saved on GPU be loaded on a CPU-only machine
    model_svm = torch.load(model_save_path, map_location=device)

    # rebuild the model from the saved configuration, then restore its parameters
    svm = SVM_HINGE(model_svm["configs"]["feature_channel"], model_svm["configs"]["C"])
    svm.load_state_dict(model_svm["state_dict"])
    svm.to(device)

    # ============================== testing ================================
    svm.eval()

    n_correct = 0.  # number of images that are correctly classified
    n_feas = 0.  # number of total images

    with torch.no_grad():  # we do not need to compute gradients during testing
        for feas, labels in test_loader:
            # set data type and device
            feas = feas.type(torch.float).to(device)
            labels = labels.type(torch.float).to(device)

            # without labels the model returns (predictions, None)
            out, _ = svm(feas)

            # 'out' and 'labels' have different shapes, so align them before comparing
            n_correct += (out.reshape_as(labels) == labels).sum().item()
            n_feas += labels.numel()

    # show prediction accuracy
    acc = 100 * n_correct / n_feas
    print('Test accuracy = {:.1f}%'.format(acc))
if __name__ == "__main__":
    # command-line options of the testing script, registered in display order
    parser = argparse.ArgumentParser()
    option_specs = [
        ("--data_root", dict(type=str, default="data", help="file list of training image paths and labels")),
        ("--device", dict(type=str, help="cpu or cuda")),
        ("--model_save_path", dict(type=str, default="checkpoints/svm.pth", help="path to save SVM model")),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    # no explicit device given: prefer CUDA when it is available
    if args.device is None:
        args.device = "cuda" if torch.cuda.is_available() else "cpu"

    # run the testing procedure
    test(
        data_root=args.data_root,
        model_save_path=args.model_save_path,
        device=args.device,
    )

296
hw3/code/train_svm.py Normal file
View File

@@ -0,0 +1,296 @@
# ========================================================
# Media and Cognition
# Homework 3 Support Vector Machine
# train_svm.py - Train svm model for traffic sign
# Student ID: 2022010639
# Name: Yixuan Gao
# Tsinghua University
# (C) Copyright 2024
# ========================================================
# ==== Part 1: import libs
import argparse
import matplotlib.pyplot as plt
import torch
import numpy as np
import random
from datasets import Traffic_Dataset
from svm_hw import SVM_HINGE
from torch.utils.data import DataLoader
import os.path
# ==== Part 2: training and validation
def train(
    data_root,
    feature_channel,
    batch_size,
    n_epoch,
    lr,
    C,
    model_save_path,
    device,
):
    """
    The main training procedure of SVM model
    ----------------------------
    :param data_root: path to the root directory of dataset
    :param feature_channel: number of feature channels for SVM input
    :param batch_size: batch size of training
    :param n_epoch: number of training epochs
    :param lr: learning rate
    :param C: regularization coefficient in hinge loss
    :param model_save_path: path to save SVM model
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    """

    # training and validation data loaders
    train_data = Traffic_Dataset(os.path.join(data_root, 'train.pt'))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_data = Traffic_Dataset(os.path.join(data_root, 'val.pt'))
    # NOTE(review): shuffling the validation set does not change the accuracy; it is kept
    # because removing it would change how the global RNG is consumed between epochs.
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

    # scale the regularization coefficient by the number of training batches
    C = C * len(train_loader)

    # model on the requested device, optimized with Adam
    svm = SVM_HINGE(feature_channel, C)
    svm.to(device)
    optimizer = torch.optim.Adam(svm.parameters(), lr)

    # per-epoch training loss, training accuracy, validation accuracy, and epoch index
    train_loss = []
    train_acc = []
    val_acc = []
    epochs = []

    for epoch in range(n_epoch):
        epochs.append(epoch + 1)

        # ========================= training =======================
        svm.train()

        total_loss = 0.  # to save total training loss in one epoch
        n_correct = 0.  # number of images that are correctly classified
        n_feas = 0.  # number of total images

        for feas, labels in train_loader:
            # set data type and device
            feas = feas.type(torch.float).to(device)
            labels = labels.type(torch.float).to(device)

            optimizer.zero_grad()

            # with labels the model returns (predictions, hinge loss)
            out, loss = svm(feas, labels)
            loss.backward()
            total_loss += loss.item()
            optimizer.step()

            # 'out' and 'labels' have different shapes, so align them before comparing
            n_correct += (out.reshape_as(labels) == labels).sum().item()
            n_feas += labels.numel()

        # average of the total loss for iterations
        acc = 100 * n_correct / n_feas
        avg_loss = total_loss / len(train_loader)
        train_acc.append(acc)
        train_loss.append(avg_loss)
        print('Epoch {:02d}: loss = {:.3f}, training accuracy = {:.1f}%'.format(epoch + 1, avg_loss, acc))

        # ========================== Validation ====================
        svm.eval()

        n_correct = 0.  # number of images that are correctly classified
        n_feas = 0.  # number of total images

        with torch.no_grad():  # we do not need to compute gradients during validation
            for feas, labels in val_loader:
                feas = feas.type(torch.float).to(device)
                labels = labels.type(torch.float).to(device)

                # without labels the model returns (predictions, None)
                out, _ = svm(feas)

                n_correct += (out.reshape_as(labels) == labels).sum().item()
                n_feas += labels.numel()

        # show prediction accuracy
        acc = 100 * n_correct / n_feas
        print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(epoch + 1, acc))
        val_acc.append(acc)

    # save model parameters in a file; create the target directory first because
    # torch.save() does not create missing parent directories
    save_dir = os.path.dirname(model_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save({'state_dict': svm.state_dict(),
                'configs': {
                    'feature_channel': feature_channel,
                    'C': C}
                }, model_save_path)
    print('Model saved in {}\n'.format(model_save_path))

    W = svm.W.data.cpu()
    b = svm.b.data.cpu()

    # indices of the support vectors among the training samples: every sample whose
    # functional margin y * (W x + b) is at most 1. 'sv' is a plain Python list of length K.
    sv = [
        idx
        for idx, (data, label) in enumerate(zip(train_data.datas, train_data.labels))
        if label * ((W @ data) + b) <= 1
    ]

    plot(train_loss, train_acc, val_acc, epochs)
    plot_feature(train_features=train_data.datas, val_features=val_data.datas, train_labels=train_data.labels,
                 val_labels=val_data.labels, sv=sv, W=W, b=b)
def plot_feature(train_features, val_features, train_labels, val_labels, sv, W, b):
    """
    Show two figures: the training samples (support vectors highlighted) and the
    validation samples, each with the SVM decision boundary drawn on top.
    ---------------------
    :param train_features: training samples with the shape of [B, 2]
    :param val_features: validation samples with the shape of [B, 2]
    :param train_labels: the labels (chosen from{-1, +1}) corresponding to training samples, with the shape of [B, 1]
    :param val_labels: the labels (chosen from{-1, +1}) corresponding to validation samples, with the shape of [B, 1]
    :param sv: a list with the index of support vectors in training samples, with the shape of [K] (K is the number of support vectors)
    :param W: the weight vector of SVM decision boundary (W^Tx + b), with the shape of [1, feature_channel]
    :param b: the bias of SVM decision boundary (W^Tx + b), with the shape of [1,]
    """

    def draw_boundary(axis):
        # the line W[0,0]*x + W[0,1]*y + b = 0, solved for y
        xs = np.linspace(-20, 20, 100)
        axis.plot(xs, -W[0, 0] / W[0, 1] * xs - b / W[0, 1], c='y')

    train_labels = (train_labels > 0.0).int()
    val_labels = (val_labels > 0.0).int()
    train_labels[sv] = 2

    # The groups are chosen by position, not by label value: the first half of the
    # samples is drawn as "-1" and the second half as "+1".
    half_train = train_labels.shape[0] // 2
    first_half = set([i for i in range(half_train)])
    second_half = set([i + half_train for i in range(half_train)])
    foreground = list(first_half - set(sv))
    foreground_sv = list(first_half - set(foreground))
    background = list(second_half - set(sv))
    background_sv = list(second_half - set(background))

    # ---- figure 1: training samples ----
    f, ax = plt.subplots()
    plt.title("training dataset")
    train_groups = [
        (foreground, '.', 'r', "-1"),
        (foreground_sv, '.', 'darkorange', "-1 (support vector)"),
        (background, 'x', 'b', "+1"),
        (background_sv, 'x', 'c', "+1 (support vector)"),
    ]
    for rows, marker, colour, caption in train_groups:
        ax.scatter(train_features[rows, 0], train_features[rows, 1], marker=marker, c=colour, label=caption)
    draw_boundary(ax)
    ax.legend(loc="best")
    plt.ylim([-30, 30])
    plt.show()

    # ---- figure 2: validation samples ----
    f, ax = plt.subplots()
    plt.title("validation dataset")
    half_val = val_labels.shape[0] // 2
    foreground_val = [i for i in range(half_val)]
    background_val = [i + half_val for i in range(half_val)]
    ax.scatter(val_features[foreground_val, 0], val_features[foreground_val, 1], marker='.', c='r', label="-1")
    ax.scatter(val_features[background_val, 0], val_features[background_val, 1], marker='x', c='b', label="+1")
    draw_boundary(ax)
    ax.legend(loc="best")
    plt.ylim([-30, 30])
    plt.show()
def plot(train_loss, train_acc, val_acc, epochs):
    """
    Show the training-loss curve and the accuracy curves in two separate figures
    ------------------
    :param train_loss: a list with loss of each training epoch
    :param train_acc: a list with accuracy on training dataset of each training epoch
    :param val_acc: a list with accuracy on validation dataset of each training epoch
    :param epochs: a list with the index of all training epochs
    """
    # figure 1: training loss against epoch
    _, loss_axis = plt.subplots()
    plt.title("Training Loss")
    loss_axis.plot(epochs, train_loss, color="tab:blue")
    loss_axis.set_xlabel("Training epoch")
    loss_axis.set_ylabel("Loss")
    loss_axis.legend(["training loss"], loc="best")
    plt.show()

    # figure 2: training and validation accuracy against epoch
    _, acc_axis = plt.subplots()
    plt.title("Training and Validation Accuracy")
    for curve, colour in ((train_acc, "tab:orange"), (val_acc, "tab:green")):
        acc_axis.plot(epochs, curve, color=colour)
    acc_axis.legend(["training accuracy","validation accuracy"], loc="best")
    acc_axis.set_xlabel("Training epoch")
    acc_axis.set_ylabel("Accuracy")
    # accuracies are percentages; leave a little headroom above 100
    acc_axis.set_ylim(0, 101)
    plt.show()
if __name__ == "__main__":
    # fix every random number generator in use so that runs are repeatable
    seed = 2024
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True

    # command-line options of the model and training process, registered in display order
    parser = argparse.ArgumentParser()
    option_specs = [
        ("--data_root", dict(type=str, default="data", help="file list of training image paths and labels")),
        ("--n_epoch", dict(type=int, default=50, help="number of training epochs")),
        ("--batch_size", dict(type=int, default=20, help="training batch size")),
        ("--lr", dict(type=float, default=1e-2, help="learning rate")),
        ("--C", dict(type=float, default=1e-3, help="regularization coefficient in hinge loss")),
        ("--device", dict(type=str, help="cpu or cuda")),
        ("--feature_channel", dict(type=int, default=2, help="number of pre-extracted feature channel by pretrained network")),
        ("--model_save_path", dict(type=str, default="checkpoints/svm.pth", help="path to save SVM model")),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    # no explicit device given: prefer CUDA when it is available
    if args.device is None:
        args.device = "cuda" if torch.cuda.is_available() else "cpu"

    # run the training procedure
    train(
        data_root=args.data_root,
        feature_channel=args.feature_channel,
        batch_size=args.batch_size,
        n_epoch=args.n_epoch,
        lr=args.lr,
        C=args.C,
        model_save_path=args.model_save_path,
        device=args.device,
    )

132
hw3/report/dtx-style.sty Normal file
View File

@@ -0,0 +1,132 @@
%%
%% This is file `dtx-style.sty',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% thucoursework.dtx (with options: `dtx-style')
%%
%% This is a generated file.
%%
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
%%
%% This work may be distributed and/or modified under the
%% conditions of the LaTeX Project Public License, either version 1.3
%% of this license or (at your option) any later version.
%% The latest version of this license is in
%% http://www.latex-project.org/lppl.txt
%% and version 1.3 or later is part of all distributions of LaTeX
%% version 2005/12/01 or later.
%%
%% To produce the documentation run the original source files ending with `.dtx'
%% through LaTeX.
%%
\ProvidesPackage{dtx-style}
\RequirePackage{hypdoc}
\RequirePackage[UTF8,scheme=chinese]{ctex}
\RequirePackage{newpxtext}
\RequirePackage{newpxmath}
\RequirePackage[
top=2.5cm, bottom=2.5cm,
left=4cm, right=2cm,
headsep=3mm]{geometry}
\RequirePackage{array,longtable,booktabs}
\RequirePackage{listings}
\RequirePackage{fancyhdr}
\RequirePackage{xcolor}
\RequirePackage{enumitem}
\RequirePackage{etoolbox}
\RequirePackage{metalogo}
%% Named colours used when printing macro names, environment names and options.
\colorlet{thu@macro}{blue!60!black}
\colorlet{thu@env}{blue!70!black}
\colorlet{thu@option}{purple}
%% Patch the four name-printing commands so that every use of \MacroFont inside
%% them is followed by \bfseries and the matching colour defined above.
%% The two trailing empty groups are the (unused) success/failure hooks of \patchcmd.
\patchcmd{\PrintMacroName}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
\patchcmd{\PrintDescribeMacro}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
\patchcmd{\PrintDescribeEnv}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
\patchcmd{\PrintEnvName}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
%% \DescribeOption{<name>}: print <name> in the margin and index it as an "option".
%% The work is split in two so that the argument is read by \Describe@Option
%% only after \MakePrivateLetters has been applied inside the group.
\def\DescribeOption{%
\leavevmode\@bsphack\begingroup\MakePrivateLetters%
\Describe@Option}
%% Second half of \DescribeOption: closes the group, typesets the margin note
%% and calls \thu@special@index with category "option".
\def\Describe@Option#1{\endgroup
\marginpar{\raggedleft\PrintDescribeOption{#1}}%
\thu@special@index{option}{#1}\@esphack\ignorespaces}
%% Formatting of an option name in the margin: bold sans-serif in colour thu@option.
\def\PrintDescribeOption#1{\strut \MacroFont\bfseries\sffamily\color{thu@option} #1\ }
%% \thu@special@index{<category>}{<name>}: write two index entries for <name>,
%% one under the name itself with "(<category>)" appended, and one as a
%% sub-entry of "<category>:". Both use the "usage" encapsulation, which is
%% temporarily redefined here to go through hdclindex.
%% NOTE(review): \HD@target, \HDorg@encapchar and the HD@hypercount counter are not
%% defined in this file; presumably they come from hypdoc -- confirm.
\def\thu@special@index#1#2{\@bsphack
\begingroup
\HD@target
\let\HDorg@encapchar\encapchar
\edef\encapchar usage{%
\HDorg@encapchar hdclindex{\the\c@HD@hypercount}{usage}%
}%
\index{#2\actualchar{\string\ttfamily\space#2}
(#1)\encapchar usage}%
\index{#1:\levelchar#2\actualchar
{\string\ttfamily\space#2}\encapchar usage}%
\endgroup
\@esphack}
%% Base listings style shared by the two styles below: small typewriter font,
%% no line numbers, automatic line breaking, light grey background, and
%% gobble=2 (the first two characters of every listing line are dropped).
\lstdefinestyle{lstStyleBase}{%
basicstyle=\small\ttfamily,
aboveskip=\medskipamount,
belowskip=\medskipamount,
lineskip=0pt,
boxpos=c,
showlines=false,
extendedchars=true,
upquote=true,
tabsize=2,
showtabs=false,
showspaces=false,
showstringspaces=false,
numbers=none,
linewidth=\linewidth,
xleftmargin=4pt,
xrightmargin=0pt,
resetmargins=false,
breaklines=true,
breakatwhitespace=false,
breakindent=0pt,
breakautoindent=true,
columns=flexible,
keepspaces=true,
gobble=2,
framesep=3pt,
rulesep=1pt,
framerule=1pt,
backgroundcolor=\color{gray!5},
stringstyle=\color{green!40!black!100},
keywordstyle=\bfseries\color{blue!50!black},
commentstyle=\slshape\color{black!60}}
%% Shell listings: base style plus a purple rule on the left, bash highlighting.
\lstdefinestyle{lstStyleShell}{%
style=lstStyleBase,
frame=l,
rulecolor=\color{purple},
language=bash}
%% LaTeX listings: base style plus a violet rule on the left, LaTeX highlighting.
\lstdefinestyle{lstStyleLaTeX}{%
style=lstStyleBase,
frame=l,
rulecolor=\color{violet},
language=[LaTeX]TeX}
%% Environments `latex' and `shell' that apply the two styles above.
\lstnewenvironment{latex}{\lstset{style=lstStyleLaTeX}}{}
\lstnewenvironment{shell}{\lstset{style=lstStyleShell}}{}
%% Remove the extra vertical spacing from all enumitem lists.
\setlist{nosep}
%% Inline markup: \option prints its argument in sans-serif, \env in typewriter.
\DeclareDocumentCommand{\option}{m}{\textsf{#1}}
\DeclareDocumentCommand{\env}{m}{\texttt{#1}}
%% \pkg{<name>} and \file{<name>} print <name> in typewriter and index it under
%% "package" / "file"; the starred forms print without creating an index entry.
\DeclareDocumentCommand{\pkg}{s m}{%
\texttt{#2}\IfBooleanF#1{\thu@special@index{package}{#2}}}
\DeclareDocumentCommand{\file}{s m}{%
\texttt{#2}\IfBooleanF#1{\thu@special@index{file}{#2}}}
%% \myentry{<text>}: bold purple margin note.
\newcommand{\myentry}[1]{%
\marginpar{\raggedleft\color{purple}\bfseries\strut #1}}
%% \note[<label>]{<text>}: magenta remark; the bold label defaults to "Note"
%% and is followed by the emphasised text.
\newcommand{\note}[2][Note]{{%
\color{magenta}{\bfseries #1}\emph{#2}}}
%% Logo macro for the package name, with a discretionary hyphen between the parts.
\def\thucoursework{\textsc{Thu}\-\textsc{Coursework}}

153
hw3/report/iidef.sty Normal file
View File

@@ -0,0 +1,153 @@
%%
%% This is file `iidef.sty',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% thucoursework.dtx (with options: `sty')
%%
%% This is a generated file.
%%
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
%%
%% This work may be distributed and/or modified under the
%% conditions of the LaTeX Project Public License, either version 1.3
%% of this license or (at your option) any later version.
%% The latest version of this license is in
%% http://www.latex-project.org/lppl.txt
%% and version 1.3 or later is part of all distributions of LaTeX
%% version 2005/12/01 or later.
%%
%% To produce the documentation run the original source files ending with `.dtx'
%% through LaTeX.
%%
\NeedsTeXFormat{LaTeX2e}[1999/12/01]
\ProvidesClass{iidef}
[2020/09/09 2.6 Tsinghua University Coursework Template]
%% configuration of nested enumerate env
\RequirePackage{enumitem}
%% set hwcount key-value option
\RequirePackage{kvoptions}
%% required by macro DeclareMathOperator
\RequirePackage{amsmath}
%% Set up page headers using with fancyhdr
\@ifundefined{lhead}{\RequirePackage{fancyhdr}}
{\def\@thulhead{thulhead}}
\RequirePackage{amsthm}
%% semester
%% \@term holds the text shown in the header; \theterm{<text>} overwrites it.
\def\@term{term}
\newcommand{\theterm}[1]{\renewcommand\@term{#1}}
%% institute
%% The placeholder is declared with one parameter that it never uses;
%% \thecourseinstitute replaces it with a parameterless macro holding the real text.
\newcommand{\@courseinstitute}[1]{institute}
\newcommand{\thecourseinstitute}[1]{\renewcommand\@courseinstitute{#1}}
%% coursename
%% Same pattern as the institute; the stored name is wrapped in \textsc.
\newcommand{\@coursename}[1]{coursename}
\newcommand{\thecoursename}[1]{\renewcommand\@coursename{\textsc{#1}}}
%% user can rewrite homework name
\def\@hwname{Homework}
\def\hwname#1{\renewcommand\@hwname{#1}}
%% \iidef@thehwcnt = 1
%% Package option thehwcnt=<n> (default 1), exposed to the document as \thehwcnt.
\DeclareStringOption[1]{thehwcnt}
\ProcessKeyvalOptions*
\def\thehwcnt{\iidef@thehwcnt}
%% page header setup, distinguish between first page(plain style)
%% and second page on (runningpage style)
%%***************************************************************************
%% \courseheader: first-page title block. Switches this page to the plain style,
%% then prints institute, course name and term centred above a rule, followed by
%% the underlined homework name and number.
\newcommand{\courseheader}{
\thispagestyle{plain}%first page use native plain style to suppress header
\vspace*{-1in}
\begin{center}
\@courseinstitute\\
\@coursename\\
\@term
\vspace*{0.1in}
\hrule
\end{center}
\begin{center}
\underline{\bf \@hwname\;\thehwcnt} \\
\end{center}
}
%% Running header (course name left, institute right) for the remaining pages.
%% It is installed only when \@thulhead is undefined; that marker is set earlier in
%% this file when \lhead already existed at load time, so in that case the
%% existing header setup is left untouched.
\@ifundefined{@thulhead}{
\fancypagestyle{runningpage}
{
\fancyhead[L]{\small\@coursename}
\fancyhead[R]{\small\@courseinstitute}
}
%% use runningpage style from second page on
\pagestyle{runningpage}
}{}
%% *********************************************************************************************
%%name command macro
%%*************************
%% \name{<author>}: print the author flush left with today's date pushed to the
%% right margin, then a rule and 2em of vertical space. The trailing \flushleft
%% leaves the text that follows left-aligned.
\newcommand{\name}[1]{
\begin{flushleft}
#1\hfill
\today
\end{flushleft}
\hrule
\vspace{2em}
\flushleft
}
%%*************************
%% enumitem related configuration
%% Labels for nested enumerate levels: "<hwcnt>.1.", "(a)", "i.", then Greek letters.
%% NOTE(review): \greek* is not defined by any package this file loads itself;
%% presumably the document must provide it (e.g. via moreenum) -- confirm.
\setlist[enumerate,1]{label=\thehwcnt.\arabic*.}
\setlist[enumerate,2]{label=(\alph*)}
\setlist[enumerate,3]{label=\roman*.}
\setlist[enumerate,4]{label=\greek*}
%%******************************
%% Heading used by the solution environment; \slname{<text>} overrides it.
\def\@slname{Solution}
\def\slname#1{\renewcommand\@slname{#1}}
%% solution environment: a proof environment headed by \@slname and with the
%% end-of-proof symbol blanked. Defined only if no solution environment exists yet.
\@ifundefined{solution}{
\newenvironment{solution}
{
\proof[\@slname]
}
{
%% no qed symbol in solution env
\renewcommand{\qedsymbol}{}
\endproof
}
}{}
%%******************************
%%common math symbols go here
%%*************************************************
%% Vectors are written as underlined letters; \v{<x>} underlines an arbitrary argument.
\def\v#1{\underline{#1}}
\newcommand{\uc}{\underline{c}} % c, vec
\newcommand{\uv}{\underline{v}} % v, vec
\newcommand{\uw}{\underline{w}} % w, vec
\newcommand{\ux}{\underline{x}} % x, vec
\newcommand{\uy}{\underline{y}} % y, vec
\newcommand{\uz}{\underline{z}} % z, vec
\newcommand{\um}{\underline{m}} % m, vec
%% Random variables are set in sans-serif; the \urv* forms are random vectors.
\newcommand{\rvx}{\mathsf{x}} % x, r.v.
\newcommand{\rvy}{\mathsf{y}} % y, r.v.
\newcommand{\rvz}{\mathsf{z}} % z, r.v.
\newcommand{\rvw}{\mathsf{w}} % w, r.v.
\newcommand{\rvH}{\mathsf{H}} % H, r.v.
\newcommand{\urvx}{\underline{\mathsf{x}}} % x, r.v. vec
\newcommand{\urvy}{\underline{\mathsf{y}}} % y, r.v. vec
\newcommand{\urvz}{\underline{\mathsf{z}}} % z, r.v. vec
\newcommand{\urvw}{\underline{\mathsf{w}}} % w, r.v. vec
\newcommand{\defas}{\triangleq} %\coloneqq
\newcommand{\reals}{\mathbb{R}}
\newcommand{\TT}{\mathrm{T}} % transpose
%% Operators; the starred declarations take their limits underneath in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argsup}{arg\,sup}
\DeclareMathOperator*{\arginf}{arg\,inf}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator{\Cov}{Cov}
\DeclareMathOperator{\MSE}{MSE}
%% NOTE(review): \mathds and \mathbb are not defined by the packages this file
%% loads itself; presumably the document supplies them -- confirm.
\DeclareMathOperator{\1}{\mathds{1}}
\DeclareMathOperator{\In}{\mathbb{I}}
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator{\Prob}{\mathbb{P}}
%% \independent: relation symbol built by overprinting two copies of \perp, the
%% second shifted right by 2mu; \mathpalette supplies the current math style.
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
%%************************************************************************************

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

379
hw3/report/main.tex Normal file
View File

@@ -0,0 +1,379 @@
% Homework Template
\documentclass[a4paper]{article}
\usepackage{ctex}
\usepackage{amsmath, amssymb, amsthm}
\usepackage{moreenum}
\usepackage{mathtools}
\usepackage{url}
\usepackage{bm}
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{subcaption}
\usepackage{booktabs} % toprule
\usepackage[mathcal]{eucal}
\usepackage[thehwcnt = 3]{iidef}
\usepackage{listings}
\usepackage{fontspec}
\usepackage{xcolor}
\usepackage{float}
\usepackage{siunitx}
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
\newfontfamily\cascadia{Cascadia Code}
\lstset{
basicstyle = \small\codefont,
% ---
tabsize = 4,
showstringspaces = false,
numbers = left,
numberstyle = \codefont,
% ---
breaklines = true,
captionpos = t,
% ---
frame = l,
flexiblecolumns,
}
\lstdefinestyle{Python}{
language = Python, % 语言选Python
keywordstyle = \color{blue},
keywordstyle = [2] \color{teal},
stringstyle = \color{orange!80!black},
commentstyle = \color{red},
identifierstyle = \color{blue!80!white},
}
\lstdefinestyle{Bash}{
language = bash
}
\thecourseinstitute{清华大学电子工程系}
\thecoursename{\textbf{媒体与认知}}
\theterm{2023-2024学年春季学期}
\hwname{作业}
\begin{document}
\courseheader
% 请在YOUR NAME处填写自己的姓名
\name{高艺轩}
\vspace{3mm}
\centerline{\textbf{\Large{理论部分}}}
\section{单选题(15分)}
% 请在?处填写答案
\subsection{\underline{D}}
\subsection{\underline{C}}
\subsection{\underline{D}}
\subsection{\underline{D}}
\subsection{\underline{B}}
\section{计算题(15 分)}
\subsection{给定两个类别的样本分别为:
\begin{align*}
&\omega_1:\{(3,1),(2,2),(4,3),(3,2)\} \\
&\omega_2:\{(1,3),(1,2),(-1,1),(-1,2)\}
\end{align*}
试利用LDA将样本特征维数压缩为一维。
}
\begin{proof}[解]
首先计算$\mu_1 = (3, 2), \mu_2 = (0, 2), \mu = (1.5, 2)$。因此
\[S_1 = \frac{1}{4}
\left(
\begin{bmatrix}
0 & 0\\
0 & 1
\end{bmatrix}
+
\begin{bmatrix}
1 & 0\\
0 & 0
\end{bmatrix}
+
\begin{bmatrix}
1 & 1\\
1 & 1
\end{bmatrix}
+
\begin{bmatrix}
0 & 0\\
0 & 0
\end{bmatrix}
\right)
=
\begin{bmatrix}
0.5 & 0.25\\
0.25 & 0.5
\end{bmatrix}\]
\[S_2 = \frac{1}{4}
\left(
\begin{bmatrix}
0 & 0\\
0 & 1
\end{bmatrix}
+
\begin{bmatrix}
1 & 0\\
0 & 0
\end{bmatrix}
+
\begin{bmatrix}
1 & 1\\
1 & 1
\end{bmatrix}
+
\begin{bmatrix}
1 & 0\\
0 & 0
\end{bmatrix}
\right)
=
\begin{bmatrix}
0.75 & 0.25\\
0.25 & 0.5
\end{bmatrix}\]
进一步地,
\[S_w = \frac{1}{2} (S_1 + S_2) =
\begin{bmatrix}
0.625 & 0.25\\
0.25 & 0.5
\end{bmatrix}\]
\[S_b = \frac{1}{2} \left(
\begin{bmatrix}
2.25 & 0\\
0 & 0
\end{bmatrix}
+
\begin{bmatrix}
2.25 & 0\\
0 & 0
\end{bmatrix}
\right)
=
\begin{bmatrix}
2.25 & 0\\
0 & 0
\end{bmatrix}\]
广义特征值分解得到$\lambda = 4.5$,$v = (0.8944, -0.4472)$。投影后的样本为
\[\omega_1: \left\{2.2360, 0.8944, 2.2360, 1.7888\right\}\]
\[\omega_2: \left\{-0.4472, 0, -1.3416, -1.7888\right\}\]
\end{proof}
\vspace{3mm}
\subsection{模型训练通常需要大量的数据假设某采集的数据集包含80\%的有效数据和20\%的无效数据。采用一种算法判断数据是否有效其中无效数据被成功判别为无效数据的概率为90\%而有效数据被误判为无效数据的概率为5\%。如果某条数据经过该算法被判别为无效数据,则根据贝叶斯定理,这条数据是无效数据的概率是多少?(提示:全概率公式$P(Y)=\sum^{N}_{i=1}P(Y|X_i)P(X_i)$)\\}
\begin{proof}[解]
\begin{align*}
& P(\text{无效数据} \mid \text{判定无效})\\
= & \frac{p(\text{判定无效} \mid \text{无效数据})p(\text{无效数据})}{p(\text{判定无效} \mid \text{无效数据})p(\text{无效数据}) + p(\text{判定无效} \mid \text{有效数据})p(\text{有效数据})}\\
= & \frac{0.9 \times 0.2}{0.9 \times 0.2 + 0.05 \times 0.8}\\
= & \frac{0.18}{0.18 + 0.04}\\
= & \frac{9}{11}
\end{align*}
\end{proof}
\vspace{3mm}
\subsection{设有两类正态分布的样本集,第一类均值为$\mu_1=[2,-1]^T$,第二类均值为$\mu_2=[1,1]^T$。两类样本集的协方差矩阵和出现的先验概率都相等:$\Sigma_1=\Sigma_2=\Sigma=\left[ \begin{array}{cc}
4 & 2 \\
2 & \frac{4}{3}
\end{array} \right]$,$p(\omega_1)=p(\omega_2)$。试计算分类界面,并对特征向量$x=[6,2]^T$分类。}
\begin{proof}[解]
\[\Sigma^{-1} = \begin{bmatrix}
1 & -1.5\\
-1.5 & 3
\end{bmatrix}\]
决策方程
\[g_{LDF1} = \Sigma^{-1} \mu_1 \boldsymbol{x} + -\frac{1}{2} \mu_1^T \Sigma^{-1} \mu_1 = (3.5, -1) \boldsymbol{x} - 6.5\]
类似地可以得到
\[g_{LDF2} = (-0.5, 1.5) \boldsymbol{x} - 0.5\]
因此分类界面为
\begin{align*}
(3.5, -1) \boldsymbol{x} - 6.5 & = (-0.5, 1.5) \boldsymbol{x} - 0.5\\
(4, -2.5) \boldsymbol{x} & = 6
\end{align*}
对于$(6, 2)$,计算$g_{LDF1}((6, 2)) = 12.5$,$g_{LDF2}((6, 2)) = -0.5$,因此属于第一类。
\end{proof}
\vspace{3mm}
\subsection{给定异或的样本集$D=\left\{\left((0,0)^T,-1\right),\left((0,1)^T,1\right),\left((1,0)^T,1\right),\left((1,1)^T,-1\right)\right\}$该样本集是线性不可分的,可采用如下所示的多项式函数$\phi(\mathbf{x})$将样本$D=\left\{(\mathbf{x}_n,y_n)\right\}$映射为$D_\phi=\left\{(\phi(\mathbf{x}_n),y_n)\right\}$,其中$\phi(\mathbf{x})$满足
\begin{equation*}
\begin{aligned}
\phi_1(\mathbf{x})&=2(x_1-0.5) \\
\phi_2(\mathbf{x})&=4(x_1-0.5)(x_2-0.5)
\end{aligned}
\end{equation*}
\\
\qquad(1) 给出映射后的样本集;\\
\qquad(2) 在映射后的样本集中设计一个线性SVM分类器给出支持向量及分类界面。
}
\begin{proof}[解]
映射后的样本集
\[D_{\phi} = \left\lbrace\left((-1, 1)^T, -1\right), \left((-1, -1)^T, 1\right), \left((1, -1)^T, 1\right), \left((1, 1)^T, -1\right)\right\rbrace\]
待优化的问题为
\[L(\boldsymbol{\alpha}) = \sum_{i = 1}^4 \alpha_i - \frac{1}{2} \sum_{i = 1}^4 \sum_{j = 1}^4 \alpha_i \alpha_j y_i y_j \boldsymbol{x}_i^T \boldsymbol{x}_j\]
因此
\begin{align*}
\frac{\partial L}{\partial \alpha_1} & = 1 - \frac{1}{2}\sum_{i \neq 1}^4 \alpha_i y_1 y_i \boldsymbol{x}_1^T \boldsymbol{x}_i - 2 \alpha_1 y_1 y_1 \boldsymbol{x}_1^T \boldsymbol{x}_1\\
& = 1 - 2 \alpha_3 - 4 \alpha_1\\
\frac{\partial L}{\partial \alpha_2} & = 1 - 2\alpha_4 - 4 \alpha_2\\
\frac{\partial L}{\partial \alpha_3} & = 1 - 2 \alpha_1 - 4 \alpha_3\\
\frac{\partial L}{\partial \alpha_4} & = 1 - 2 \alpha_3 - 4 \alpha_4
\end{align*}
令四个偏导数均为0得到$\alpha_1 = \alpha_2 = \alpha_3 = \alpha_4 = \frac{1}{6}$。全部的点均为支持向量。因此
\[\boldsymbol{w} = \sum_{i = 1}^4 \alpha_i y_i \boldsymbol{x}_i = \left(0, -\frac{2}{3}\right)\]
为求偏置量,带入$\boldsymbol{x}_1$
\[(-1) (\boldsymbol{w}^T \boldsymbol{x}_1 + b) = 1\]
得到$b = -\frac{1}{3}$
分类界面$\boldsymbol{w}^T \boldsymbol{x} + b = 0$,即
\[\begin{bmatrix}
0\\-\frac{2}{3}
\end{bmatrix} \boldsymbol{x} - \frac{1}{3} = 0\]
得到$x_2 = \frac{1}{2}$,因此在原空间中,
\[4(x_1 - 0.5)(x_2 - 0.5) = 0.5\]
\end{proof}
\vspace{3mm}
\subsection{使用KMeans算法对2维空间中的6个点$(0,2)$,$(2,0)$,$(2,3)$,$(3,2)$,$(4,0)$,$(5,4)$进行聚类,距离函数选择欧氏距离$d=\sqrt{(x_1-x_2)^2+(y_1-y_2)^2}$\\
\qquad (1)起始聚类中心选择(0,0)和(4,3),计算聚类中心;\\
\qquad (2)起始聚类中心选择(1,4)和(3,1),计算聚类中心。\\
}
\begin{proof}[解]
中心选择$(0, 0), (4, 3)$,第一次分为$(0, 2), (2,0)$和$(2, 3), (3, 2), (4, 0), (5, 4)$,更新后的中心为$(1, 1)$和$\left(\frac{7}{2}, \frac{9}{4}\right)$。收敛。
中心选择$(1, 4)$和$(3, 1)$,第一次分为$(0, 2), (2, 3)$和$(2, 0), (4, 0), (3, 2), (5, 4)$,更新后中心为$(1, \frac{5}{2})$和$(\frac{7}{2}, \frac{3}{2})$,收敛。
\end{proof}
\vspace{3mm}
\centerline{\textbf{\Large{编程部分}}}
\vspace{3mm}
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题进度汇报”中的一项完成
\section{编程作业报告}
\subsection{程序验证}
与助教给出的图片相比我写出的程序PCA得到的结果的xy坐标都在$[-1, 1]$之间不利于之后的分类。我将所有的PCA之后的坐标都扩大了20倍。
运行\lstinline{check.py}进行检查:
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{img/check/check.png}
\end{figure}
\subsection{数据预处理}
运行
\begin{lstlisting}[style=Bash]
python data_preprocess.py
\end{lstlisting}
得到的输出为
\begin{figure}[H]
\centering
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/preprocess/preprocess_train.png}
\caption{训练集preprocess结果}
\end{subfigure}
\hspace{0.5cm}
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/preprocess/preprocess_val.png}
\caption{验证集preprocess结果}
\end{subfigure}\\[2ex]
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/preprocess/preprocess_test.png}
\caption{测试集preprocess结果}
\end{subfigure}
\end{figure}
\subsection{训练、验证及测试}
\begin{figure}[H]
\centering
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/default/loss.png}
\end{subfigure}
\hspace{0.5cm}
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/default/train_accu.png}
\end{subfigure}\\[2ex]
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/default/sv.png}
\end{subfigure}
\hspace{0.5cm}
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/default/val.png}
\end{subfigure}\\[2ex]
\begin{subfigure}[t]{.8\linewidth}
\includegraphics[width=\textwidth]{img/train/default/test.png}
\end{subfigure}
\end{figure}
\subsection{调整正则化系数}
\subsubsection{C = \num{1e-6}}
\begin{figure}[H]
\centering
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/1e-6/loss.png}
\end{subfigure}
\hspace{0.5cm}
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/1e-6/accu.png}
\end{subfigure}\\[2ex]
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/1e-6/sv.png}
\end{subfigure}
\hspace{0.5cm}
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/1e-6/val.png}
\end{subfigure}\\[2ex]
\begin{subfigure}[t]{.8\linewidth}
\includegraphics[width=\textwidth]{img/train/1e-6/test.png}
\end{subfigure}
\end{figure}
可以看到出现了严重的欠拟合分类界面超出了绘图的范围。这是因为C过小导致不能正确地分辨合适的分类界面。
\subsubsection{C = 1}
\begin{figure}[H]
\centering
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/1/loss.png}
\end{subfigure}
\hspace{0.5cm}
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/1/accu.png}
\end{subfigure}\\[2ex]
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/1/sv.png}
\end{subfigure}
\hspace{0.5cm}
\begin{subfigure}[t]{.45\linewidth}
\includegraphics[width=\textwidth]{img/train/1/val.png}
\end{subfigure}\\[2ex]
\begin{subfigure}[t]{.8\linewidth}
\includegraphics[width=\textwidth]{img/train/1/test.png}
\end{subfigure}
\end{figure}
发生了过拟合,直线被交界面的点限制,斜率不是最优。
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:

163
hw4/code/attnvis.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,2 @@
+++如梦令
昨夜雨疏风骤。浓睡不消残酒。试问卷帘人,却道海棠依旧。知否。知否。应是绿肥红瘦。

View File

@@ -0,0 +1,3 @@
+++鹧鸪天(秋思)
红蓼烟收尽暮鸦。秋风吹叶荻花秋。断肠天色连云散,黄叶荻花秋水流。
楼上角,笛声悠。兴王莫上叹人头。明朝归去无消息,只有当时一望流。

75
hw4/code/dataset.py Normal file
View File

@@ -0,0 +1,75 @@
import torch
from torch.utils.data import Dataset
import numpy as np
import os
import json
class LMDataset(Dataset):
    """Dataset of text samples loaded from ``<data_dir>/<split>.json``.

    The JSON file must provide the keys ``data``, ``stoi``, ``itos`` and
    ``vocab_size``; each value is stored on the instance unchanged.
    Indexing returns the raw sample stored at that position of ``data``.
    """

    def __init__(self, data_dir, split):
        super().__init__()
        json_path = os.path.join(data_dir, '%s.json' % split)
        with open(json_path, 'r', encoding='utf-8') as fp:
            payload = json.load(fp)
        # list of samples
        self.data = payload['data']
        # dict mapping a character to its integer id
        self.stoi = payload['stoi']
        # dict mapping the *string* form of an integer id to its character
        self.itos = payload['itos']
        # vocabulary size recorded in the file
        self.vocab_size = payload['vocab_size']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]
class Converter:
    '''
    This class helps us convert strings to integers and back
    We use "0" to denote the padding character '<pad>', and "1" to denote the end of the sequence '<eos>'
    '''

    def __init__(self, stoi, itos):
        self.stoi = stoi  # a dict that maps character to integer
        self.itos = itos  # a dict that maps string of integer to character

    def single_encode(self, s):
        '''
        encode one string into a 1-D int64 numpy array of token ids
        raises KeyError if a character is missing from `stoi`
        '''
        return np.array([self.stoi[ch] for ch in s], dtype=np.int64)

    def single_decode(self, l):
        '''
        decode one iterable of token ids into a string, stopping at the first <eos> (id 1)
        '''
        chars = []
        for i in l:
            # normalise to a plain int first: str() of a torch scalar is
            # 'tensor(5)', which would never match the string keys of `itos`
            token = int(i)
            # if we meet the end of the sequence (the value of integer is equal to 1), break
            if token == 1:
                break
            # convert string of the integer into a character
            chars.append(self.itos[str(token)])
        return ''.join(chars)

    def encode(self, data):
        '''
        encode a list of strings into integers

        Every string is followed by <eos> (1) and right-padded with <pad> (0)
        to the longest length in the batch plus one. Returns `(x, y)` where
        `y` is `x` shifted left by one position (next-token targets).
        An empty `data` yields two tensors of shape (0, 0).
        '''
        lens = [len(s) for s in data]
        # default=0 keeps an empty batch from raising ValueError in max()
        max_len = max(lens, default=0)
        out = np.zeros((len(data), max_len + 1), dtype=np.int64)
        for row, s in enumerate(data):
            out[row, :len(s)] = self.single_encode(s)
            out[row, len(s)] = 1
        x = torch.from_numpy(out[:, :-1])
        y = torch.from_numpy(out[:, 1:])
        return x, y

    def decode(self, data):
        '''
        decode a list of integers into strings
        `data` must offer `.cpu().numpy()`; one string is returned per row
        '''
        data = data.cpu().numpy().astype(np.int64)
        return [self.single_decode(row) for row in data]

356
hw4/code/model.py Normal file
View File

@@ -0,0 +1,356 @@
# ========================================================
# Media and Cognition
# Homework 4 Sequence Modeling
# model.py - Model definition
# Student ID: 2022010639
# Name: Yixuan Gao
# Tsinghua University
# (C) Copyright 2024
# ========================================================
# Import required libraries
############################################################
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
############################################################
# Define the GELU activation function used in OpenAI GPT
############################################################
def gelu(z):
    """
    Tanh approximation of the Gaussian Error Linear Unit, applied element-wise.

    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    0.5z(1 + tanh[(2/π)^(1/2) * (z + 0.044715 z^3)])
    """
    scale = math.sqrt(2.0 / math.pi)
    inner = scale * (z + 0.044715 * torch.pow(z, 3.0))
    return 0.5 * z * (1.0 + torch.tanh(inner))
############################################################
# Define the Multi-Head SelfAttention module
############################################################
class SelfAttention(nn.Module):
    """Multi-head self-attention with a causal (look-back only) mask.

    Args:
        embed_dim: size of the input and output feature dimension.
        num_head: number of attention heads; must divide `embed_dim`.
        dropout: dropout probability applied to the attention weights and
            to the projected output.

    Raises:
        ValueError: if `embed_dim` is not a multiple of `num_head`.
    """

    def __init__(self, embed_dim, num_head, dropout):
        super().__init__()
        # fail early: otherwise the mismatch only shows up as an opaque
        # reshape error inside forward()
        if embed_dim % num_head != 0:
            raise ValueError(
                "embed_dim (%d) must be divisible by num_head (%d)" % (embed_dim, num_head)
            )

        # define three linear layers for q, k, v generation separately
        self.q_layer = nn.Linear(embed_dim, embed_dim)
        self.k_layer = nn.Linear(embed_dim, embed_dim)
        self.v_layer = nn.Linear(embed_dim, embed_dim)

        # define the projection layer for output
        self.proj_layer = nn.Linear(embed_dim, embed_dim)

        # define the dropout layer for attention and output calculation
        self.attn_drop = nn.Dropout(dropout)
        self.proj_drop = nn.Dropout(dropout)

        self.num_head = num_head
        self.head_dim = embed_dim // num_head

    def forward(self, x):
        """Apply causal multi-head attention.

        Args:
            x: tensor of shape (batch_size, seq_len, embed_dim).

        Returns:
            (result, attn): `result` has shape (batch_size, seq_len, embed_dim);
            `attn` holds the post-dropout attention weights with shape
            (batch_size, num_head, seq_len, seq_len).
        """
        batch_size, seq_len, _ = x.shape

        # >>> TODO 1: complete the forward process of the Multi-Head SelfAttention module.
        # Step 1.1: project the input to q, k, v, each (batch_size, seq_len, num_head * head_dim)
        q = self.q_layer(x)
        k = self.k_layer(x)
        v = self.v_layer(x)

        # Step 1.2: split the feature dimension into heads and move the head axis
        # forward: (batch_size, seq_len, num_head, head_dim) -> (batch_size, num_head, seq_len, head_dim)
        head_shape = (batch_size, seq_len, self.num_head, self.head_dim)
        q = q.reshape(head_shape).transpose(1, 2)
        k = k.reshape(head_shape).transpose(1, 2)
        v = v.reshape(head_shape).transpose(1, 2)

        # Step 1.3.1: scaled dot product, attn = qk^T / sqrt(head_dim)
        # shape: (batch_size, num_head, seq_len, seq_len)
        # a Python float scale avoids building a new tensor on every call
        attn = (q @ k.transpose(2, 3)) / math.sqrt(self.head_dim)

        # Step 1.3.2: causal mask. True strictly above the main diagonal, i.e.
        # position i may not attend to positions j > i.
        attn_mask = torch.ones(seq_len, seq_len, device=attn.device)
        attn_mask = torch.triu(attn_mask, diagonal=1).bool()
        attn = attn.masked_fill(attn_mask, float('-inf'))

        # Step 1.3.3: normalise over the key axis
        attn = torch.softmax(attn, dim=3)

        # Step 1.3.4: dropout on the attention weights
        attn = self.attn_drop(attn)

        # Step 1.3.5: weighted sum of values, (batch_size, num_head, seq_len, head_dim)
        out = attn @ v

        # Step 1.4: merge the heads back to (batch_size, seq_len, num_head * head_dim)
        out = out.transpose(1, 2).reshape(batch_size, seq_len, self.num_head * self.head_dim)

        # Step 1.5: output projection followed by dropout
        result = self.proj_drop(self.proj_layer(out))
        # <<< TODO 1

        # return the final results `result` and attention weights `attn`
        return result, attn
############################################################
# Define the feed forward network (FFN)
############################################################
class FFN(nn.Module):
    """Feed-forward block: Linear -> GELU -> Linear -> Dropout.

    The first linear layer maps `embed_dim` to `feedforward_dim`, the second
    maps back to `embed_dim`.
    """

    def __init__(self, embed_dim, feedforward_dim, dropout):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, feedforward_dim)
        self.fc2 = nn.Linear(feedforward_dim, embed_dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        hidden = gelu(self.fc1(x))
        return self.drop(self.fc2(hidden))
############################################################
# Define the TransformerLayer
############################################################
class TransformerLayer(nn.Module):
    """Pre-LayerNorm transformer block: self-attention followed by a feed-forward network.

    When `no_res` is True both residual (skip) connections are disabled.
    """

    def __init__(self, embed_dim, num_head, feedforward_dim, dropout, no_res):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = SelfAttention(embed_dim, num_head, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, feedforward_dim, dropout)
        # True -> skip connections are switched off
        self.no_res = no_res

    def forward(self, x):
        """Return `(out, attn)`: the block output and the attention weights from `self.attn`."""
        # >>> TODO 2: complete the forward process of the TransformerLayer module.
        # Step 2.1: attention on the normalised input, optionally plus the skip path
        attn_out, attn = self.attn(self.norm1(x))
        if self.no_res:
            x_attn = attn_out
        else:
            x_attn = attn_out + x

        # Step 2.2: feed-forward on the normalised attention output, optionally plus the skip path
        x_ffn = self.ffn(self.norm2(x_attn))
        out = x_ffn if self.no_res else x_attn + x_ffn
        # <<< TODO 2

        return out, attn
############################################################
# Define the GPT module
############################################################
class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token (and optionally position) embeddings feed a stack of
    `TransformerLayer` blocks, followed by a final LayerNorm and a linear
    head whose weight is shared with the token embedding.
    """

    def __init__(self, vocab_size, max_seq_len, num_layer, embed_dim, num_head, feedforward_dim, dropout, no_res=False, no_pos=False):
        '''
        vocab_size: the size of vocabulary
        max_seq_len: the maximum length of input texts
        num_layer: the number of transformer layers
        embed_dim: the embedding dimension
        num_head: the number of heads in Multi-Head Self Attention
        feedforward_dim: the dimension in the feed forward network
        dropout: dropout ratio
        no_res: if True, residual connections in the transformer layers are disabled
        no_pos: if True, position embeddings are not added to the token embeddings
        '''
        super().__init__()
        self.num_layer = num_layer
        self.max_seq_len = max_seq_len
        self.no_pos = no_pos

        # Define Embedding Layer to transfer input text tokens and positions to embeddings
        self.word_token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.word_pos_embedding = nn.Embedding(max_seq_len, embed_dim)
        self.drop = nn.Dropout(dropout)

        # Define the transformer layers
        self.transformer = nn.ModuleList([TransformerLayer(embed_dim, num_head, feedforward_dim, dropout, no_res) for _ in range(num_layer)])

        # Define the head layer to predict output
        self.norm = nn.LayerNorm(embed_dim)
        self.language_model_head = nn.Linear(embed_dim, vocab_size, bias=False)

        """
        Weight tying improves the performance of language models by tying (sharing) the weights of the embedding and softmax layers.
        Reference: https://paperswithcode.com/method/weight-tying
        """
        self.word_token_embedding.weight = self.language_model_head.weight

        self.init_weights()

    def init_weights(self):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero the Linear biases."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)

        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('proj_layer.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * self.num_layer))

    def forward(self, word_idx, targets=None):
        """Run the model on a batch of token ids.

        Args:
            word_idx: LongTensor of shape (batch_size, seq_len).
            targets: optional tensor of target ids with the same shape;
                entries equal to 0 are ignored by the loss.

        Returns:
            `(logits, loss)` when `targets` is given, otherwise
            `(logits, attention_weights)` where `attention_weights` is a list
            holding one attention tensor per transformer layer.
        """
        batch_size, seq_len = word_idx.shape

        # >>> TODO 3: complete the forward process of GPT
        # Step 3.2: token embeddings
        x = self.word_token_embedding(word_idx)

        # Step 3.1 / 3.3: add position embeddings unless they are disabled.
        # The lookup is skipped entirely when no_pos is set, since its result
        # would be discarded.
        if not self.no_pos:
            pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)
            x = x + self.word_pos_embedding(pos)

        # apply dropout to the input embeddings via `self.drop()`
        x = self.drop(x)

        # Step 3.4: run the transformer stack, collecting the attention weights of every layer
        attention_weights = list()
        for i in range(self.num_layer):
            x, attn = self.transformer[i](x)
            attention_weights.append(attn)

        # Step 3.5: final normalisation and projection to vocabulary logits
        # Note: do not add softmax here since it is included in the cross entropy loss function
        x = self.norm(x)
        logits = self.language_model_head(x)
        # <<< TODO 3

        # return logits and loss or attention weights
        if targets is not None:
            loss = torch.nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=0)
            return logits, loss

        assert isinstance(attention_weights, list), "attention_weights must be a list, please check whether to append the attention weights of all transformer layers into it!"
        return logits, attention_weights

    def configure_optimizers(self, weight_decay):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).

        Note that, despite the name, this returns the list of parameter-group
        dicts (each with "params" and "weight_decay"), not an optimizer object.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (nn.Linear, )
        blacklist_weight_modules = (nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # subtle: 'word_token_embedding.weight' and 'language_model_head.weight' are tied, so they
        # will appear in the no_decay and decay sets respectively after the above.
        # In addition, because named_parameters() doesn't return duplicates, it
        # will only return the first occurence, key'd by 'word_token_embedding.weight', below.
        # so let's manually remove 'language_model_head.weight' from decay set. This will include
        # this tensor into optimization via word_token_embedding.weight only, and not decayed.
        decay.remove('language_model_head.weight')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # build the parameter groups handed to the optimizer
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]

        return optim_groups

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        Returns the full sequence (prompt plus sampled tokens) as a squeezed numpy array.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it to the
            # last max_seq_len tokens: the position embedding table has only
            # max_seq_len rows, so a longer input would fail in forward()
            idx_cond = idx if idx.size(1) <= self.max_seq_len else idx[:, -self.max_seq_len:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx.squeeze().cpu().numpy()
############################################################
# Preset hyper-parameters keyed by model name. Each entry supplies only the
# architecture arguments listed here; it does not include vocab_size or max_seq_len.
GPTConfig = {
    'mygpt': {
        'num_layer': 4,
        'embed_dim': 128,
        'num_head': 4,
        'feedforward_dim': 128*4,
        'dropout': 0.0,
    },
    'gpt2-mini': {
        'num_layer': 6,
        'embed_dim': 384,
        'num_head': 6,
        'feedforward_dim': 384*4,
        'dropout': 0.2,
    },
    'gpt2': {
        'num_layer': 12,
        'embed_dim': 768,
        'num_head': 12,
        'feedforward_dim': 768*4,
        'dropout': 0.2,
    },
}

61
hw4/code/prepare.py Normal file
View File

@@ -0,0 +1,61 @@
"""
Prepare the dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
"""
import os
import numpy as np
import argparse
import json
# ---- command line ----
parser = argparse.ArgumentParser()
parser.add_argument('--data_root', type=str, default='data/quansongci', help='data directory')
args = parser.parse_args()

# ---- load the raw corpus: a JSON object whose 'data' entry is a list of strings ----
input_file_path = os.path.join(args.data_root, 'data.json')
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)['data']
print(f"length of dataset: {len(data):,}")

# ---- vocabulary: every distinct character, plus two reserved ids ----
chars = sorted(set(''.join(data)))
vocab_size = len(chars) + 2 # for <pad> and <eos>
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# ids 0 and 1 are reserved, so real characters start at 2; the special
# tokens are inserted last so the key order of the dumped JSON is unchanged
stoi = {ch: idx for idx, ch in enumerate(chars, start=2)}
itos = {idx: ch for idx, ch in enumerate(chars, start=2)}
for special_id, special_token in ((0, '<pad>'), (1, '<eos>')):
    stoi[special_token] = special_id
    itos[special_id] = special_token

# ---- 90% / 10% train / validation split, in file order ----
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]
print(f"train has {len(train_data):,} samples")
print(f"val has {len(val_data):,} samples")

# ---- save each split together with the meta information needed to encode/decode later ----
train_meta = {
    'data': train_data,
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
val_meta = {
    'data': val_data,
    'vocab_size': vocab_size,
    'itos': itos,
    'stoi': stoi,
}
for out_name, meta in (('train.json', train_meta), ('val.json', val_meta)):
    with open(os.path.join(args.data_root, out_name), 'w', encoding='utf-8') as f:
        json.dump(meta, f, ensure_ascii=False, indent=4)

76
hw4/code/sample.py Normal file
View File

@@ -0,0 +1,76 @@
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch
from model import GPTConfig, GPT
import argparse
from dataset import Converter, LMDataset
def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root, device):
    """Load `<ckpt_path>/best.pth` and print `num_samples` generated texts.

    Args:
        start: prompt string; every character must be in the training vocabulary.
        num_samples: how many completions to generate and print.
        max_new_tokens: number of tokens to sample after the prompt.
        model_name: key into `GPTConfig`, used when the checkpoint carries no 'model_args'.
        ckpt_path: directory containing `best.pth`.
        data_root: directory containing `train.json` (used for the vocabulary only).
        device: device string such as 'cpu', 'cuda' or 'cuda:0'.
    """
    dataset = LMDataset(data_root, 'train')
    converter = Converter(dataset.stoi, dataset.itos)

    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability

    # model
    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    # autocast wants the bare device type ('cuda'), not an indexed string such
    # as 'cuda:0', so derive it instead of passing `device` through unchanged
    device_type = torch.device(device).type
    ctx = nullcontext() if device_type in ('cpu', 'mps') else torch.autocast(device_type=device_type, dtype=ptdtype)

    # init from a model saved in a specific directory
    ckpt_path = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_path)
    # NOTE(security): torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(ckpt_path, map_location=device)
    # NOTE(review): the GPTConfig presets carry no vocab_size / max_seq_len, which
    # GPT.__init__ requires, so this fallback only works if the checkpoint provides
    # 'model_args' -- confirm against what train.py saves.
    gptconf = GPTConfig[model_name]
    if 'model_args' in checkpoint:
        gptconf = checkpoint['model_args']
    model = GPT(**gptconf)
    state_dict = checkpoint['state_dict']
    #unwanted_prefix = '_orig_mod.'
    #for k,v in list(state_dict.items()):
    #    if k.startswith(unwanted_prefix):
    #        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)

    model.eval()
    model.to(device)

    # encode the beginning of the prompt and add a batch dimension
    start_ids = converter.single_encode(start)
    x = torch.from_numpy(start_ids)[None, ...].to(device).long()

    # run generation
    with torch.no_grad():
        with ctx:
            for k in range(num_samples):
                y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
                print(converter.single_decode(y))
                print('---------------')
if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

    # set configurations of the model and sampling process
    parser = argparse.ArgumentParser()
    parser.add_argument('--start', type=str, default='+++', help='start of the sample, e.g. "+++" or "+++清平乐"')
    # the default is a real int now; it used to be the string '10' and relied on
    # argparse converting string defaults through `type`
    parser.add_argument('--num_samples', type=int, default=10, help='the number of samples')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to load checkpoints')
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--device', type=str, help='cpu or cuda')

    opt = parser.parse_args()
    # pick CUDA automatically when no device was requested
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # the generation budget is whatever remains of a 128-token window after the prompt
    sample(opt.start, opt.num_samples, 128-len(opt.start), opt.model_name, opt.ckpt_path, opt.data_root, opt.device)

219
hw4/code/train.py Normal file
View File

@@ -0,0 +1,219 @@
import os
import time
import math
import pickle
from contextlib import nullcontext
import argparse
import numpy as np
import torch
from torch.utils.data import DataLoader
from model import GPT, GPTConfig
from dataset import LMDataset, Converter
import matplotlib.pyplot as plt
# learning rate decay scheduler (cosine with warmup)
def get_lr(it, learning_rate, min_lr=1e-4, warmup_iters=100, lr_decay_iters=6000):
    """Return the learning rate to use at iteration ``it``.

    The schedule has three phases:

    1. ``it < warmup_iters``: linear ramp from 0 up to ``learning_rate``.
    2. ``warmup_iters <= it < lr_decay_iters``: cosine decay from
       ``learning_rate`` down to ``min_lr``.
    3. ``it >= lr_decay_iters``: constant ``min_lr``.

    Args:
        it: current iteration index (0-based).
        learning_rate: peak learning rate reached at the end of warmup.
        min_lr: floor of the schedule.
        warmup_iters: number of warmup iterations.
        lr_decay_iters: iteration at which the cosine decay reaches ``min_lr``.

    Returns:
        The learning rate as a float.
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) at or past the end of the decay window, return min learning rate.
    #    `>=` (rather than `>`) yields the same value at it == lr_decay_iters
    #    (the cosine coefficient is exactly 0 there) and avoids a 0/0 division
    #    below when warmup_iters == lr_decay_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)
def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, device='cpu', no_res=False, no_pos=False):
    """Train a GPT model from scratch and write checkpoints plus a loss plot.

    Args:
        data_root: passed to ``LMDataset`` for both the 'train' and 'val' splits.
        model_name: key into ``GPTConfig`` selecting the base model arguments.
        batch_size: training batch size (validation always uses batch size 1).
        n_iters: total number of optimizer steps to run.
        ckpt_path: directory receiving ``latest.pth``, ``best.pth`` and the plot.
        val_interval: run validation and save checkpoints every this many steps.
        device: device string; 'cpu' and 'mps' run without autocast.
        no_res: forwarded to the model as ``no_res``.
        no_pos: forwarded to the model as ``no_pos``.
    """
    train_dataset = LMDataset(data_root, 'train')
    val_dataset = LMDataset(data_root, 'val')
    train_loader = DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    converter = Converter(train_dataset.stoi, train_dataset.itos)

    # adamw optimizer
    learning_rate = 5e-3  # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.99
    grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0

    # system
    # 'float32', 'bfloat16', or 'float16'; float16 enables the GradScaler below
    dtype = 'bfloat16' if device == 'cpu' else 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)

    best_val_loss = 1e9
    iter_num = 0  # number of iterations in the lifetime of this process

    # model init
    # Copy the entry before customising it so the shared GPTConfig table is not
    # mutated for other users of the module.
    # NOTE(review): assumes GPTConfig[model_name] is a mapping of keyword
    # arguments (it is expanded with ** below) -- confirm in model.py.
    model_args = dict(GPTConfig[model_name])
    model_args['vocab_size'] = train_dataset.vocab_size
    model_args['max_seq_len'] = 128
    model_args['no_res'] = no_res
    model_args['no_pos'] = no_pos

    # init a new model from scratch
    print("Initializing a new model from scratch")
    model = GPT(**model_args)
    model.to(device)

    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

    # optimizer
    optim_groups = model.configure_optimizers(weight_decay)
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))

    print('training...')
    # number of passes over the training set needed to reach n_iters steps
    epoch_num = np.ceil(n_iters * int(batch_size) / float(len(train_dataset))).astype(np.int32)
    t0 = time.time()
    model.train()
    train_losses = []
    val_losses = []

    for epoch in range(epoch_num):
        # stop before building another DataLoader iterator once the budget is spent
        if iter_num >= n_iters:
            break
        for step, inputs in enumerate(train_loader):
            if iter_num >= n_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)

            # set the scheduled learning rate for this step
            lr = get_lr(iter_num, learning_rate, lr_decay_iters=n_iters)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # forward pass under autocast (when enabled)
            with ctx:
                logits, loss = model(X, Y)
            # backward pass, with gradient scaling if training in fp16
            scaler.scale(loss).backward()
            # clip the gradient
            if grad_clip != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            # step the optimizer and scaler if training in fp16
            scaler.step(optimizer)
            scaler.update()
            # flush the gradients as soon as we can, no need for this memory anymore
            optimizer.zero_grad(set_to_none=True)

            iter_num += 1
            train_losses.append(loss.item())

            # evaluate the loss on the val set and write checkpoints
            if iter_num % val_interval == 0:
                # timing and logging
                t1 = time.time()
                dt = t1 - t0
                t0 = t1
                lossf = loss.item()
                print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
                losses = estimate_loss(model, val_loader, converter, ctx, device)
                val_losses.append(losses['val'])
                print(f"iter {iter_num}: val loss {losses['val']:.4f}")

                # Update the running best BEFORE building the checkpoint so the
                # stored 'best_val_loss' includes this evaluation.
                is_best = losses['val'] < best_val_loss
                if is_best:
                    best_val_loss = losses['val']

                print(f"saving latest checkpoint to {ckpt_path}")
                checkpoint = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                }
                torch.save(checkpoint, os.path.join(ckpt_path, 'latest.pth'))
                if is_best:
                    print(f"saving best checkpoint to {ckpt_path}")
                    torch.save(checkpoint, os.path.join(ckpt_path, 'best.pth'))

    plot(n_iters, train_losses, val_losses, val_interval, ckpt_path)
def plot(n_iters, train_losses, val_losses, val_interval, ckpt_path):
    """Plot loss and perplexity curves, save them under ``ckpt_path`` and show them.

    Args:
        n_iters: total number of training iterations (kept for interface
            compatibility; the x positions are derived from the data itself).
        train_losses: one training loss per iteration.
        val_losses: one validation loss per validation run.
        val_interval: number of training iterations between validation runs.
        ckpt_path: directory in which 'loss&perplexity.jpg' is written.
    """
    # create a plot
    f, ax = plt.subplots(1, 2, figsize=(18, 6))

    # x positions counted in completed iterations. The k-th validation run
    # happens after k * val_interval iterations, so the x vector always has
    # exactly len(val_losses) entries, even when n_iters is not a multiple of
    # val_interval (the old arange(1, n_iters+1, val_interval) did not).
    train_iters = np.arange(1, len(train_losses) + 1)
    val_iters = np.arange(1, len(val_losses) + 1) * val_interval

    # draw loss
    ax[0].plot(train_iters, train_losses)
    ax[0].plot(val_iters, val_losses, 'r')
    # set labels
    ax[0].set_xlabel('training iters')
    ax[0].legend(['training loss', 'validation loss'])

    # perplexity is exp(loss)
    train_perplexity = [np.exp(x) for x in train_losses]
    val_perplexity = [np.exp(x) for x in val_losses]
    # draw perplexity
    ax[1].plot(train_iters, train_perplexity)
    ax[1].plot(val_iters, val_perplexity, 'r')
    # set labels
    ax[1].set_xlabel('training iters')
    ax[1].legend(['training perplexity', 'validation perplexity'])

    plt.tight_layout()
    # save, then show the image
    plt.savefig(os.path.join(ckpt_path, 'loss&perplexity.jpg'), dpi=300)
    plt.show()
# average the validation loss over at most 100 batches of val_loader
@torch.no_grad()
def estimate_loss(model, val_loader, converter, ctx, device):
    """Return ``{'val': mean_loss}`` over the first batches of ``val_loader``.

    The model is switched to eval mode for the measurement and back to train
    mode afterwards. At most 100 batches are consumed.

    Args:
        model: callable as ``model(X, Y)`` returning ``(logits, loss)``.
        val_loader: iterable of raw batches accepted by ``converter.encode``.
        converter: object whose ``encode(batch)`` returns the ``(X, Y)`` tensors.
        ctx: context manager wrapped around the forward pass (e.g. autocast).
        device: device the tensors are moved to.

    Returns:
        A dict with the single key ``'val'``. The mean is taken over the number
        of batches actually evaluated; if the loader yields no batch the value
        is ``float('inf')`` so it can never be mistaken for a best loss.
    """
    max_iters = 100
    total_loss = 0.0
    n_batches = 0

    model.eval()
    for inputs in val_loader:
        if n_batches >= max_iters:
            break
        n_batches += 1
        X, Y = converter.encode(inputs)
        X, Y = X.to(device), Y.to(device)
        with ctx:
            logits, loss = model(X, Y)
        total_loss += loss.item()
    model.train()

    # divide by the real batch count, not by max_iters: a validation set with
    # fewer than max_iters batches would otherwise report a loss that is too small
    mean_loss = total_loss / n_batches if n_batches else float('inf')
    return {'val': mean_loss}
if __name__ == '__main__':
    # Command-line entry point: seed the RNGs, parse the training options,
    # create the checkpoint directory and call train().

    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn

    # set configurations of the model and training process
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the pretrained model')
    # --iters is passed to train() as n_iters, i.e. optimizer steps, not epochs
    parser.add_argument('--iters', type=int, default=1000, help='number of training iterations')
    parser.add_argument('--batchsize', type=int, default=16, help='training batch size')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to save checkpoints')
    parser.add_argument('--val_interval', type=int, default=20, help='iter intervals of validation')
    # both flags are store_true switches forwarded as no_res / no_pos, so
    # giving them turns the feature OFF
    parser.add_argument('--no_res', action='store_true', help='disable residual connections')
    parser.add_argument('--no_pos', action='store_true', help='disable positional encoding')
    parser.add_argument('--device', type=str, help='cpu or cuda')
    opt = parser.parse_args()

    # fall back to CUDA when available, otherwise CPU
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    os.makedirs(opt.ckpt_path, exist_ok=True)
    train(opt.data_root, opt.model_name, opt.batchsize, opt.iters, opt.ckpt_path, opt.val_interval, opt.device, opt.no_res, opt.no_pos)

132
hw4/report/dtx-style.sty Normal file
View File

@@ -0,0 +1,132 @@
%%
%% This is file `dtx-style.sty',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% thucoursework.dtx (with options: `dtx-style')
%%
%% This is a generated file.
%%
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
%%
%% This work may be distributed and/or modified under the
%% conditions of the LaTeX Project Public License, either version 1.3
%% of this license or (at your option) any later version.
%% The latest version of this license is in
%% http://www.latex-project.org/lppl.txt
%% and version 1.3 or later is part of all distributions of LaTeX
%% version 2005/12/01 or later.
%%
%% To produce the documentation run the original source files ending with `.dtx'
%% through LaTeX.
%%
\ProvidesPackage{dtx-style}
\RequirePackage{hypdoc}
\RequirePackage[UTF8,scheme=chinese]{ctex}
\RequirePackage{newpxtext}
\RequirePackage{newpxmath}
\RequirePackage[
top=2.5cm, bottom=2.5cm,
left=4cm, right=2cm,
headsep=3mm]{geometry}
\RequirePackage{array,longtable,booktabs}
\RequirePackage{listings}
\RequirePackage{fancyhdr}
\RequirePackage{xcolor}
\RequirePackage{enumitem}
\RequirePackage{etoolbox}
\RequirePackage{metalogo}
\colorlet{thu@macro}{blue!60!black}
\colorlet{thu@env}{blue!70!black}
\colorlet{thu@option}{purple}
\patchcmd{\PrintMacroName}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
\patchcmd{\PrintDescribeMacro}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
\patchcmd{\PrintDescribeEnv}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
\patchcmd{\PrintEnvName}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
\def\DescribeOption{%
\leavevmode\@bsphack\begingroup\MakePrivateLetters%
\Describe@Option}
\def\Describe@Option#1{\endgroup
\marginpar{\raggedleft\PrintDescribeOption{#1}}%
\thu@special@index{option}{#1}\@esphack\ignorespaces}
\def\PrintDescribeOption#1{\strut \MacroFont\bfseries\sffamily\color{thu@option} #1\ }
\def\thu@special@index#1#2{\@bsphack
\begingroup
\HD@target
\let\HDorg@encapchar\encapchar
\edef\encapchar usage{%
\HDorg@encapchar hdclindex{\the\c@HD@hypercount}{usage}%
}%
\index{#2\actualchar{\string\ttfamily\space#2}
(#1)\encapchar usage}%
\index{#1:\levelchar#2\actualchar
{\string\ttfamily\space#2}\encapchar usage}%
\endgroup
\@esphack}
\lstdefinestyle{lstStyleBase}{%
basicstyle=\small\ttfamily,
aboveskip=\medskipamount,
belowskip=\medskipamount,
lineskip=0pt,
boxpos=c,
showlines=false,
extendedchars=true,
upquote=true,
tabsize=2,
showtabs=false,
showspaces=false,
showstringspaces=false,
numbers=none,
linewidth=\linewidth,
xleftmargin=4pt,
xrightmargin=0pt,
resetmargins=false,
breaklines=true,
breakatwhitespace=false,
breakindent=0pt,
breakautoindent=true,
columns=flexible,
keepspaces=true,
gobble=2,
framesep=3pt,
rulesep=1pt,
framerule=1pt,
backgroundcolor=\color{gray!5},
stringstyle=\color{green!40!black!100},
keywordstyle=\bfseries\color{blue!50!black},
commentstyle=\slshape\color{black!60}}
\lstdefinestyle{lstStyleShell}{%
style=lstStyleBase,
frame=l,
rulecolor=\color{purple},
language=bash}
\lstdefinestyle{lstStyleLaTeX}{%
style=lstStyleBase,
frame=l,
rulecolor=\color{violet},
language=[LaTeX]TeX}
\lstnewenvironment{latex}{\lstset{style=lstStyleLaTeX}}{}
\lstnewenvironment{shell}{\lstset{style=lstStyleShell}}{}
\setlist{nosep}
\DeclareDocumentCommand{\option}{m}{\textsf{#1}}
\DeclareDocumentCommand{\env}{m}{\texttt{#1}}
\DeclareDocumentCommand{\pkg}{s m}{%
\texttt{#2}\IfBooleanF#1{\thu@special@index{package}{#2}}}
\DeclareDocumentCommand{\file}{s m}{%
\texttt{#2}\IfBooleanF#1{\thu@special@index{file}{#2}}}
\newcommand{\myentry}[1]{%
\marginpar{\raggedleft\color{purple}\bfseries\strut #1}}
\newcommand{\note}[2][Note]{{%
\color{magenta}{\bfseries #1}\emph{#2}}}
\def\thucoursework{\textsc{Thu}\-\textsc{Coursework}}

153
hw4/report/iidef.sty Normal file
View File

@@ -0,0 +1,153 @@
%%
%% This is file `iidef.sty',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% thucoursework.dtx (with options: `sty')
%%
%% This is a generated file.
%%
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
%%
%% This work may be distributed and/or modified under the
%% conditions of the LaTeX Project Public License, either version 1.3
%% of this license or (at your option) any later version.
%% The latest version of this license is in
%% http://www.latex-project.org/lppl.txt
%% and version 1.3 or later is part of all distributions of LaTeX
%% version 2005/12/01 or later.
%%
%% To produce the documentation run the original source files ending with `.dtx'
%% through LaTeX.
%%
\NeedsTeXFormat{LaTeX2e}[1999/12/01]
\ProvidesClass{iidef}
[2020/09/09 2.6 Tsinghua University Coursework Template]
%% configuration of nested enumerate env
\RequirePackage{enumitem}
%% set hwcount key-value option
\RequirePackage{kvoptions}
%% required by macro DeclareMathOperator
\RequirePackage{amsmath}
%% Set up page headers using with fancyhdr
\@ifundefined{lhead}{\RequirePackage{fancyhdr}}
{\def\@thulhead{thulhead}}
\RequirePackage{amsthm}
%% semester
\def\@term{term}
\newcommand{\theterm}[1]{\renewcommand\@term{#1}}
%% institute
%% \@courseinstitute holds the institute line printed by \courseheader; the
%% user sets it with \thecourseinstitute{<text>}. The placeholder takes NO
%% argument: it is expanded bare (followed by \\) in \courseheader, and the
%% \renewcommand issued by the setter is argument-less as well.
\newcommand{\@courseinstitute}{institute}
\newcommand{\thecourseinstitute}[1]{\renewcommand\@courseinstitute{#1}}
%% coursename
%% same scheme: argument-less placeholder, set through \thecoursename{<text>},
%% which stores the name wrapped in \textsc
\newcommand{\@coursename}{coursename}
\newcommand{\thecoursename}[1]{\renewcommand\@coursename{\textsc{#1}}}
%% user can rewrite homework name
\def\@hwname{Homework}
\def\hwname#1{\renewcommand\@hwname{#1}}
%% \iidef@thehwcnt = 1
\DeclareStringOption[1]{thehwcnt}
\ProcessKeyvalOptions*
\def\thehwcnt{\iidef@thehwcnt}
%% page header setup, distinguish between first page(plain style)
%% and second page on (runningpage style)
%%
%% \courseheader typesets the first-page banner: the institute, the course
%% name and the term in a centered block closed by a horizontal rule,
%% followed by a second centered block holding the underlined homework
%% title (\@hwname followed by \thehwcnt).
%% The values come from \thecourseinstitute, \thecoursename, \theterm and
%% \hwname, and from the package option thehwcnt.
%%***************************************************************************
\newcommand{\courseheader}{
\thispagestyle{plain}%first page use native plain style to suppress header
\vspace*{-1in}
\begin{center}
\@courseinstitute\\
\@coursename\\
\@term
\vspace*{0.1in}
\hrule
\end{center}
\begin{center}
\underline{\bf \@hwname\;\thehwcnt} \\
\end{center}
}
\@ifundefined{@thulhead}{
\fancypagestyle{runningpage}
{
\fancyhead[L]{\small\@coursename}
\fancyhead[R]{\small\@courseinstitute}
}
%% use runningpage style from second page on
\pagestyle{runningpage}
}{}
%% *********************************************************************************************
%% \name{<author>}: author line placed under the course header.
%% Prints the argument on the left and \today pushed to the right by \hfill,
%% then a horizontal rule and 2em of vertical space.
%% NOTE(review): the trailing \flushleft is issued without a matching
%% \endflushleft, so it presumably leaves the text that follows \name set
%% flush-left -- confirm this is intended.
%%*************************
\newcommand{\name}[1]{
\begin{flushleft}
#1\hfill
\today
\end{flushleft}
\hrule
\vspace{2em}
\flushleft
}
%%*************************
%% enumitem related configuration
\setlist[enumerate,1]{label=\thehwcnt.\arabic*.}
\setlist[enumerate,2]{label=(\alph*)}
\setlist[enumerate,3]{label=\roman*.}
\setlist[enumerate,4]{label=\greek*}
%%******************************
%% solution environment.
%% \@slname is the heading text (default "Solution"); \slname{<text>} replaces it.
%% The environment is only defined when no `solution' environment exists yet.
%% It is a thin wrapper around the proof environment (amsthm is required
%% earlier in this file): it opens \proof with \@slname as the optional
%% heading and, just before closing it, empties \qedsymbol so that no
%% end-of-proof mark is printed.
\def\@slname{Solution}
\def\slname#1{\renewcommand\@slname{#1}}
\@ifundefined{solution}{
\newenvironment{solution}
{
\proof[\@slname]
}
{
%% no qed symbol in solution env
\renewcommand{\qedsymbol}{}
\endproof
}
}{}
%%******************************
%%common math symbols go here
%%*************************************************
\def\v#1{\underline{#1}}
\newcommand{\uc}{\underline{c}} % c, vec
\newcommand{\uv}{\underline{v}} % x, vec
\newcommand{\uw}{\underline{w}} % w, vec
\newcommand{\ux}{\underline{x}} % x, vec
\newcommand{\uy}{\underline{y}} % y, vec
\newcommand{\uz}{\underline{z}} % z, vec
\newcommand{\um}{\underline{m}} % m, vec
\newcommand{\rvx}{\mathsf{x}} % x, r.v.
\newcommand{\rvy}{\mathsf{y}} % y, r.v.
\newcommand{\rvz}{\mathsf{z}} % z, r.v.
\newcommand{\rvw}{\mathsf{w}} % w, r.v.
\newcommand{\rvH}{\mathsf{H}} % H, r.v.
\newcommand{\urvx}{\underline{\mathsf{x}}} % x, r.v. vec
\newcommand{\urvy}{\underline{\mathsf{y}}} % y, r.v. vec
\newcommand{\urvz}{\underline{\mathsf{z}}} % z, r.v. vec
\newcommand{\urvw}{\underline{\mathsf{w}}} % w, r.v. vec
\newcommand{\defas}{\triangleq} %\coloneqq
\newcommand{\reals}{\mathbb{R}}
\newcommand{\TT}{\mathrm{T}} % transpose
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argsup}{arg\,sup}
\DeclareMathOperator*{\arginf}{arg\,inf}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator{\Cov}{Cov}
\DeclareMathOperator{\MSE}{MSE}
\DeclareMathOperator{\1}{\mathds{1}}
\DeclareMathOperator{\In}{\mathbb{I}}
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator{\Prob}{\mathbb{P}}
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
%%************************************************************************************

Binary file not shown.

After

Width:  |  Height:  |  Size: 186 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

View File

@@ -0,0 +1,49 @@
sample from workdirs/quansongci/best.pth
+++水调歌头
黄花满疏雨,月扫三宫。月明月明人去,绿绵声里,风光残霞。屈指两小天天静,绿满阶外,更相逢。那处得何曾小,泪断肠头。
---------------
+++浣溪沙(五清)
翠雾玉奁烘蝉。轻姿未放花光。青袍有客已暮花。
天人未遇向西楼。小阳春水一线清。玉壶重重重。
---------------
+++菩萨蛮(梅)
江南窗前月远中花。水高远。暗还花色碧。只恨欢事清。
楼上宴琼线。更欢归消息。柳边女碧云。便是天涯时。
---------------
+++菩萨蛮
江上秋移香无度。凉风闹愁风。莺声瘦了归时未。小楼闲愁忆。
豆蔻风前好因缘。送通住。试问三山同。人间无处难。
---------------
+++秦楼月
练雨梳妆。桃叶半枝,冰肌红子春寒。半枝都奈。吹香飞絮,记清凉。
无限夜云春风护。玉阑无数转。碎帽孤情君,小海东风。
---------------
+++浪淘沙
橘上园阳关路早。绿钗风雨散,犹被东湖见楼。
仿佛风前坡上去日,月如流。想取东南风。犹慵尘尽比重归。
---------------
+++诉衷情(高人)
时候又来深。长是红帘前。醉眼风入春期。
应是时时,何处在、应厮续。
---------------
+++浣溪沙(咏梅)
离斟客太白犹如。不知常是西篱中。岂怜旧君些儿以言。
素娥小山小曲,水朝元有长安。一榻了共取大家。
---------------
+++浣溪沙(和怀)
纵图清露歌黛倚,寒题金銮声珊瑚。十年人来懒舞丝。
---------------
+++满江月
风月不如旧,柔条欲到春风。掩花间心,道处难臾、相逢。
陇头情不物里,阿谁向娇几。且看东词,还明红云与,一笑认教梳灯。
---------------

View File

@@ -0,0 +1,49 @@
sample from workdirs/quansongci_no_pos/best.pth
++++++++菩萨蛮(牡丹月近)
江月明月明月桂华开客。金交风枝残月到东风前。天色浸柳前风垂杨花更觉。坐角雪初开花小屏。断断头春风光薄。
春色悄。隔帘前阴转香千里。好破云深岸波波。不恨相思量。羞酌炉香何处。
---------------
++++浣溪沙
清歌灯未无限。佳期时更传人不醉里,可奈有芳菲节懒。
双蛾罗带向西楼。小小槛春寒人都怨,燕子未销眉花。
---------------
++++++++++++++++++++临江仙歌香花天
九月桃源长风留春风投宴琼桃李仙。一曾东风迟丽女
放萧词传天稼时常相逢,还记,酒,占春寒花间风光相住,月劝花往事,占春留思,应春风到上,无人间一线秀船归来,点面皱。□□□□□□□□□□□□。都为谁老还来
---------------
++++鹧鸪天(十二之二)
此见元是一声砧。紫鹤收残梳匀舞、谁家。正是平樵春发,忍因缘凝理通。
试语三岛不下,松径何处。问清将春愁易全窟,且识斗重阳。
---------------
++++浣溪沙(赋木犀)
芙蓉水浮冰雪梅子。东风半枝都奈粉吹。飞落蕊满清凉。
枝开夜忽春风护,玉阑凉痕转新碎香。有君恩多少载酒,且道有春风流。
---------------
+++++++++++++++++++++++++++++++++++++++++++++南歌头香慵尘中柳梢青玉案(西江仙香花宫春令(与梅子
绿碧梧桐梢落后西浣天云隐越山外、宿舟断乱,秋风露满庭芳菲节难过,紫。绿门好,十分飞燕子
红,秋寒庭楼小西西风,春暮
---------------
++++++鹧鸪天(和坡衮侑觞)
薰风须见前衢醉急风入鼎、花生绝团。不问何人公身口厮续厌
春色肃熟燕子,无限是道行气东风吹。看雨起梦三年。想余春事断自愁厌君。
---------------
++++菩萨蛮(用时春)
竹花梅犹道何人时节。西篱上花前红。吹落帽风光深。素娥小金。
暮水朝秋寒。玉堂下梅花共取。小窗堂几举。从教著梅和雨。
---------------
++++++++++最仙歌子(和尉生查子题)
绿阴山淡黄未泛湘神神仙,美酒,长唱玉纤纤纤手。元何穷何处重约,清寒食、酒家流光光渐、寄新春花晓,小院映烟微香,正是十年瑶楼酒,水暖花枝枝黄昏昏不语,乍见月寂寞痴愠痕、落醉,看花梢啼红裳篆拂堕风流。
东风吹泪过,
---------------
+++++++++++++点绛唇头春事近
花艳心头道酒前春风雨,欲春惨,春去,深自有极目娇几粉,看春词,还爱红云归,绿杨花,旧谢去年时节节,十分真时及华明月。
醉眼底莺声中秋光幸有豆皇子
杏花开后黄梅梢仙子,且占客里春风吹乱。
细雨过春风轻椒香闺催春,小离
---------------

Binary file not shown.

After

Width:  |  Height:  |  Size: 71 KiB

View File

@@ -0,0 +1,56 @@
sample from workdirs/quansongci_no_res/best.pth
+++藕上空都未。消
---------------
+++。水。香,清干灯翠无月。佳
---------------
+++烟
莫。。一
真。,。,手)+(。当,。,还花。
。。饱)花清生失楼犹。拂念。。。
+东+柳人。碧放萧似天天饮时
---------------
+++,一+
楼。。移。无度此
+路风砧东
---------------
+++,。常明香天。早。+。色。,大,梅子春上妆半枝。奈。吹。飞、,歌。阑故溪枝开夜忽春花。情,重凉痕转。碎沙相,君有园海。奈。
。会
---------------
+++。。晓宫。。园。+二盈
钗。+。,恁尾。
见楼风
寿到+。尽+。日。。
---------------
+++。看。月。
时衮红。自。意
须去前。醉急风入鼎人花
。团时。丹翁怨在身云厮。厌
秋海花拟燕
,无共宿道行气东。,鸾+雨。梦,
。。余采
---------------
++++俊去莺浮
时重。+功太。犹。头(人一溪+者。斋算。旧
---------------
+++,人花长和寞。。纵图清孔歌幽
---------------
+++髻
。+风与不,干
。头余说。花
。心头道。前,枕相
忘,情+物。自水极初。几晶
看。词光。明红主与,。。认,旧。去
户萨尽玉罢
不时家。亭,行翠厚情青
+中思难梦。底南星
。自马
我来
,中+。花
禁,,也
。花、。风儿。堂莺催旧,+离
---------------

Binary file not shown.

After

Width:  |  Height:  |  Size: 75 KiB

View File

@@ -0,0 +1,51 @@
sample from workdirs/quansongci/best.pth
+++清平乐(上赋)
黄花小。相逢去。三得东风何处。人去去年年。谁与他年道。
屈指两小天。留连心事。最思无意悠悠。无得何曾宽。
---------------
+++清平乐
京梅晚。几日一清声恶。无限作佳穷时。直见横户快愁儿。
客已暮云梦,天人未老。心事有天涯无数。人都不须关,只是秋千千里。
---------------
+++清平乐(春)
红雪动。莫遣梅花开了。不解闲句中花妍。当时未问还近。
一枝上晚妆清明。帘犹有清香样。欢事消息意迟。东郊飞后便好天。
---------------
+++清平乐
银烛斜阳。斜雨初飞。日日楼前草移。无限此情休住。
小莺欲瘦收残梳。更有谁闲愁。却入豆蔻风前。因缘凝理通。
---------------
+++清平乐
江上秋波。一声歌舞。烟雨里常明烟雨。早来不见人归,犹唤梅子春去。
好都奈。吹回飞飞来。清凉不知无限夜,春风护雨晚梁归。
---------------
+++清平乐
春光西去。桂花清扇。天上一声伤春晓。却被园花不尽早。
钗边绿阴阴犹好。无计不知否。到少离愁去。谁知何处魂。
---------------
+++清平乐(即回)
六钱地遍。楼前作花间。春暮云愁。月高斜阳远。困红衣自醉。
好去前时醉,风入泥袖。挼黄团时时问。怨在月明千片春水。
---------------
+++清平乐
晓来争觉。碧云花向楼。我似秋光也。花来日明月边莺怨。
春不语飞花知。玉浆不枉劳和困。坐中岂共旧。
---------------
+++清平乐
残花晚。清闲鬓欲开。金盏一多时。菊花无计绪。娇花开花长。
谁把酒醒清声。幽心到寒题酒。一片香淡得春人。懒捻黄金眉。
---------------
+++清平乐(月明月)
醉来人在。春知何时到花时。似来东风识,时时倍度。
风月不识旧时春宵。万中说枉似、真心头道。前意追相逢。
---------------

BIN
hw4/report/img/train.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 72 KiB

187
hw4/report/main.tex Normal file
View File

@@ -0,0 +1,187 @@
% Homework template for Inference and Information
% UPDATE: September 26, 2017 by Xiangxiang
\documentclass[a4paper]{article}
\usepackage{ctex}
\usepackage{amsmath, amssymb, amsthm}
\usepackage{moreenum}
\usepackage{mathtools}
\usepackage{url}
\usepackage{bm}
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{color}
\usepackage{float}
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
\newfontfamily\cascadia{Cascadia Code}
\lstset{
basicstyle = \small\codefont,
% ---
tabsize = 4,
showstringspaces = false,
numbers = left,
numberstyle = \codefont,
% ---
breaklines = true,
captionpos = t,
% ---
frame = l,
flexiblecolumns,
}
\lstdefinestyle{Python}{
language = Python, % 语言选Python
keywordstyle = \color{blue},
keywordstyle = [2] \color{teal},
stringstyle = \color{orange!80!black},
commentstyle = \color{red},
identifierstyle = \color{blue!80!white},
}
\lstdefinestyle{Bash}{
language = bash
}
\usepackage{subcaption}
\usepackage{booktabs} % toprule
\usepackage[mathcal]{eucal}
\usepackage[thehwcnt = 4]{iidef}
\thecourseinstitute{清华大学电子工程系}
\thecoursename{\textbf{媒体与认知}}
\theterm{2023-2024学年春季学期}
\hwname{作业}
\begin{document}
\courseheader
\name{高艺轩}
\vspace{3mm}
\centerline{\textbf{\Large{理论部分}}}
\section{单选题（15分）}
\subsection{\underline{D}}
\subsection{\underline{A}}
\subsection{\underline{A}}
\subsection{\underline{C}}
\subsection{\underline{B}}
\section{计算题（15 分）}
% 计算题1
\subsection{隐含马尔可夫模型}
\hspace{2em}暑假中，小E每天进行一项体育活动，包括跑步（R）、游泳（S）和打球（B），所选择的体育活动受某种潜在因素（如心情）的影响。小E每天把进行体育活动的照片发至微信朋友圈，我们可以根据观测信息推测该潜在因素的状态。
\hspace{2em}假设该潜在因素分为$S_1$和$S_2$两种状态。在$S_1$状态下，小E选择三种体育活动（跑步、游泳、打球）的概率分别为0.6、0.2、0.2；在$S_2$状态下，小E选择三种体育活动的概率分别为0.1、0.6、0.3。
\hspace{2em}该潜在因素的变化也有一定规律：若某天处于$S_1$的状态，第二天处于$S_1$和$S_2$的状态的概率分别为0.5、0.5；若某天处于$S_2$的状态，第二天处于$S_1$和$S_2$的状态的概率分别为0.6、0.4。
\hspace{2em}暑假第一天处于$S_1$和$S_2$的状态的概率均为0.5。
\vspace{3mm}
(1) 采用隐含马尔可夫模型(HMM)对小E暑假体育活动安排进行建模，{\color{blue}请写出HMM对应的参数$\lambda=\{\pi, A, B\}$}。
\begin{proof}[解]
\[\pi = \begin{bmatrix}
0.5\\0.5
\end{bmatrix}\]
\[A = \begin{bmatrix}
0.5 & 0.5\\
0.6 & 0.4\\
\end{bmatrix}\]
\[B = \begin{bmatrix}
0.6 & 0.2 & 0.2\\
0.1 & 0.6 & 0.3
\end{bmatrix}\]
\end{proof}
\vspace{3mm}
(2) 假设暑假第1、2、3天小E所进行的体育活动依次为跑步（R）、打球（B）和游泳（S），{\color{blue}请计算出现该观测序列的概率}。
\begin{proof}[解]
\begin{align*}
\alpha_1(S_1) & = 0.5 \times 0.6 = 0.3\\
\alpha_1(S_2) & = 0.5 \times 0.1 = 0.05\\
\alpha_2(S_1) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.6) \times 0.2\\
& = 0.036\\
\alpha_2(S_2) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.4) \times 0.3\\
& = 0.051\\
\alpha_3(S_1) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.6) \times 0.2\\
& = 0.00972\\
\alpha_3(S_2) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.4) \times 0.6\\
& = 0.02304\\
P(O \mid \lambda) & = \alpha_3(S_1) + \alpha_3(S_2) = 0.03276\\
\end{align*}
\end{proof}
\vspace{3mm}
(3) 在(2)的条件下，{\color{blue}请利用Viterbi算法推测暑假第1、2、3天最可能的隐含状态序列}。
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{img/20240526_155701910_iOS.png}
\end{figure}
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
\section{编程作业报告}
\subsection{模型的训练与测试}
首先进行数据预处理。预处理后进行模型训练,训练的结果见图\ref{fig:default_train}
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{img/train.png}
\caption{默认测试}
\label{fig:default_train}
\end{figure}
默认配置的生成样本:
\begin{lstlisting}
python sample.py --ckpt_path workdirs/quansongci
\end{lstlisting}
得到的输出为
\lstinputlisting{img/default_sample.txt}
若指定初始文本:
\begin{lstlisting}
python sample.py --ckpt_path workdirs/quansongci --start +++清平乐
\end{lstlisting}
得到的输出为
\lstinputlisting{img/specific_start_sample.txt}
\subsection{探究位置编码和残差连接在模型中的作用}
关闭位置编码的训练:
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{img/no_pos_train.png}
\end{figure}
得到的生成结果:
\lstinputlisting{img/no_pos_sample.txt}
可以看到，去掉位置编码后，模型难以把握序列中的位置与长度关系：开头的起始符“+”被重复生成了任意多次，句子的长度和断句也不再规整。
关闭残差连接的训练:
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{img/no_res_train.png}
\end{figure}
得到的生成结果:
\lstinputlisting{img/no_res_sample.txt}
模型训练遇到了梯度消失的问题,很难有效地训练。
\subsection{可视化}
\begin{figure}[H]
\centering
\includegraphics[width=.8\linewidth]{img/attention_vis.png}
\end{figure}
许多词的注意力权重都集中在题目（词牌名）的几个字上，可见模型主要学到了词牌名与正文内容之间的相关性。
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End:

2
j.ps1
View File

@@ -1 +1 @@
cd ./hw2/code
cd ./hw4/code

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -10,7 +10,9 @@
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"import torchvision.transforms as transforms"
"import torchvision.transforms as transforms\n",
"\n",
"import numpy as np"
]
},
{
@@ -152,6 +154,123 @@
"print(conv_1(a).size())\n",
"print(conv_2(conv_1(a)).size())\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([0., 1.])\n",
"1\n"
]
}
],
"source": [
"a = torch.Tensor([1.0, 2.0])\n",
"b = torch.Tensor([1.0, 1.0])\n",
"print((a > b).type_as(a))\n",
"print((a == b).sum().item())"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor(2.5000)\n"
]
}
],
"source": [
"a = torch.Tensor([[1.0, 2.0], [3.0, 4.0]])\n",
"mu = a.mean(dim=0)\n",
"print(mu, a - mu)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[5.],\n",
" [4.]])\n"
]
}
],
"source": [
"a = torch.Tensor([[5], [4]])\n",
"b = torch.Tensor([1])\n",
"print((a.T * b).T)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[False, True, True, True, True],\n",
" [False, False, True, True, True],\n",
" [False, False, False, True, True],\n",
" [False, False, False, False, True],\n",
" [False, False, False, False, False]])\n",
"tensor([[-0.1170, 0.6130, 0.9644, -1.2733, -0.9671],\n",
" [-0.7806, 0.5082, -0.2731, 0.1660, -0.5451],\n",
" [-2.1527, -0.5059, -0.0079, -0.5796, -1.1107],\n",
" [-1.8357, -0.8010, -0.0424, 0.1491, -1.5009],\n",
" [-1.3666, -0.8209, 0.0483, -1.3165, -0.9222]])\n",
"tensor([[-0.1170, -inf, -inf, -inf, -inf],\n",
" [-0.7806, 0.5082, -inf, -inf, -inf],\n",
" [-2.1527, -0.5059, -0.0079, -inf, -inf],\n",
" [-1.8357, -0.8010, -0.0424, 0.1491, -inf],\n",
" [-1.3666, -0.8209, 0.0483, -1.3165, -0.9222]])\n"
]
}
],
"source": [
"mask = torch.triu(torch.ones(5, 5), diagonal=1).bool()\n",
"print(mask)\n",
"attn = torch.randn(5, 5)\n",
"print(attn)\n",
"print(attn.masked_fill(mask, -np.inf))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([0.1402, 0.2312, 0.6285])\n"
]
}
],
"source": [
"Q = torch.Tensor([1, 0, 1, 1])\n",
"K = torch.Tensor([[0, 0, 0, 2],\n",
" [2, 0, 1, 0],\n",
" [2, 1, 2, 1]])\n",
"\n",
"print(torch.softmax((Q @ K.T) / 2, dim=0))"
]
}
],
"metadata": {