Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
76a643ebc4
|
||
|
|
c6b2420b85
|
||
|
|
c850f38778
|
||
|
|
820f679067
|
||
|
|
81de7b1d58 | ||
|
|
b741c9d08e | ||
|
|
8b657be441
|
||
|
|
4bc3f77879
|
27
.gitignore
vendored
@@ -1,12 +1,15 @@
|
||||
*.zip
|
||||
__pycache__/
|
||||
*.pth
|
||||
*.log
|
||||
*.aux
|
||||
*.synctex.gz
|
||||
*.synctex.gz(busy)
|
||||
*.out
|
||||
*.pdf
|
||||
.DS_Store
|
||||
hw2/code/checkpoints/
|
||||
hw2/code/visualized/
|
||||
*.zip
|
||||
__pycache__/
|
||||
*.pth
|
||||
*.log
|
||||
*.aux
|
||||
*.synctex.gz
|
||||
*.synctex.gz(busy)
|
||||
*.out
|
||||
*.pdf
|
||||
.DS_Store
|
||||
hw2/code/checkpoints/
|
||||
hw2/code/visualized/
|
||||
hw3/code/data/
|
||||
hw3/code/checkpoints/
|
||||
hw4/code/workdirs/
|
||||
6
hw1/.vscode/settings.json
vendored
@@ -1,4 +1,4 @@
|
||||
{
|
||||
"python.analysis.typeCheckingMode": "basic",
|
||||
"python.analysis.autoImportCompletions": true
|
||||
{
|
||||
"python.analysis.typeCheckingMode": "basic",
|
||||
"python.analysis.autoImportCompletions": true
|
||||
}
|
||||
@@ -1,56 +1,56 @@
|
||||
Epoch 01: loss = inf
|
||||
Epoch 02: loss = inf
|
||||
Epoch 03: loss = 6.678
|
||||
Epoch 04: loss = 4.361
|
||||
Epoch 05: loss = 3.110
|
||||
Epoch 06: loss = 2.099
|
||||
Epoch 07: loss = 1.698
|
||||
Epoch 08: loss = 1.320
|
||||
Epoch 09: loss = 0.970
|
||||
Epoch 10: loss = 0.891
|
||||
Epoch 10: validation accuracy = 66.0%
|
||||
Epoch 11: loss = 0.817
|
||||
Epoch 12: loss = 0.723
|
||||
Epoch 13: loss = 0.512
|
||||
Epoch 14: loss = 0.353
|
||||
Epoch 15: loss = 0.202
|
||||
Epoch 16: loss = 0.182
|
||||
Epoch 17: loss = 0.184
|
||||
Epoch 18: loss = 0.191
|
||||
Epoch 19: loss = 0.175
|
||||
Epoch 20: loss = 0.166
|
||||
Epoch 20: validation accuracy = 68.0%
|
||||
Epoch 21: loss = 0.146
|
||||
Epoch 22: loss = 0.105
|
||||
Epoch 23: loss = 0.109
|
||||
Epoch 24: loss = 0.074
|
||||
Epoch 25: loss = 0.097
|
||||
Epoch 26: loss = 0.047
|
||||
Epoch 27: loss = 0.038
|
||||
Epoch 28: loss = 0.037
|
||||
Epoch 29: loss = 0.024
|
||||
Epoch 30: loss = 0.021
|
||||
Epoch 30: validation accuracy = 68.8%
|
||||
Epoch 31: loss = 0.019
|
||||
Epoch 32: loss = 0.024
|
||||
Epoch 33: loss = 0.023
|
||||
Epoch 34: loss = 0.014
|
||||
Epoch 35: loss = 0.013
|
||||
Epoch 36: loss = 0.012
|
||||
Epoch 37: loss = 0.011
|
||||
Epoch 38: loss = 0.013
|
||||
Epoch 39: loss = 0.013
|
||||
Epoch 40: loss = 0.016
|
||||
Epoch 40: validation accuracy = 70.5%
|
||||
Epoch 41: loss = 0.015
|
||||
Epoch 42: loss = 0.009
|
||||
Epoch 43: loss = 0.011
|
||||
Epoch 44: loss = 0.008
|
||||
Epoch 45: loss = 0.008
|
||||
Epoch 46: loss = 0.010
|
||||
Epoch 47: loss = 0.009
|
||||
Epoch 48: loss = 0.007
|
||||
Epoch 49: loss = 0.007
|
||||
Epoch 50: loss = 0.010
|
||||
Epoch 50: validation accuracy = 70.5%
|
||||
Epoch 01: loss = inf
|
||||
Epoch 02: loss = inf
|
||||
Epoch 03: loss = 6.678
|
||||
Epoch 04: loss = 4.361
|
||||
Epoch 05: loss = 3.110
|
||||
Epoch 06: loss = 2.099
|
||||
Epoch 07: loss = 1.698
|
||||
Epoch 08: loss = 1.320
|
||||
Epoch 09: loss = 0.970
|
||||
Epoch 10: loss = 0.891
|
||||
Epoch 10: validation accuracy = 66.0%
|
||||
Epoch 11: loss = 0.817
|
||||
Epoch 12: loss = 0.723
|
||||
Epoch 13: loss = 0.512
|
||||
Epoch 14: loss = 0.353
|
||||
Epoch 15: loss = 0.202
|
||||
Epoch 16: loss = 0.182
|
||||
Epoch 17: loss = 0.184
|
||||
Epoch 18: loss = 0.191
|
||||
Epoch 19: loss = 0.175
|
||||
Epoch 20: loss = 0.166
|
||||
Epoch 20: validation accuracy = 68.0%
|
||||
Epoch 21: loss = 0.146
|
||||
Epoch 22: loss = 0.105
|
||||
Epoch 23: loss = 0.109
|
||||
Epoch 24: loss = 0.074
|
||||
Epoch 25: loss = 0.097
|
||||
Epoch 26: loss = 0.047
|
||||
Epoch 27: loss = 0.038
|
||||
Epoch 28: loss = 0.037
|
||||
Epoch 29: loss = 0.024
|
||||
Epoch 30: loss = 0.021
|
||||
Epoch 30: validation accuracy = 68.8%
|
||||
Epoch 31: loss = 0.019
|
||||
Epoch 32: loss = 0.024
|
||||
Epoch 33: loss = 0.023
|
||||
Epoch 34: loss = 0.014
|
||||
Epoch 35: loss = 0.013
|
||||
Epoch 36: loss = 0.012
|
||||
Epoch 37: loss = 0.011
|
||||
Epoch 38: loss = 0.013
|
||||
Epoch 39: loss = 0.013
|
||||
Epoch 40: loss = 0.016
|
||||
Epoch 40: validation accuracy = 70.5%
|
||||
Epoch 41: loss = 0.015
|
||||
Epoch 42: loss = 0.009
|
||||
Epoch 43: loss = 0.011
|
||||
Epoch 44: loss = 0.008
|
||||
Epoch 45: loss = 0.008
|
||||
Epoch 46: loss = 0.010
|
||||
Epoch 47: loss = 0.009
|
||||
Epoch 48: loss = 0.007
|
||||
Epoch 49: loss = 0.007
|
||||
Epoch 50: loss = 0.010
|
||||
Epoch 50: validation accuracy = 70.5%
|
||||
Model saved in ./saved_models/default.pth
|
||||
@@ -1,2 +1,2 @@
|
||||
[Info] Load model from .\saved_models\default.pth
|
||||
[Info] Load model from .\saved_models\default.pth
|
||||
[Info] Test accuracy = 72.0%
|
||||
@@ -1,2 +1,2 @@
|
||||
[Info] Load model from .\saved_models\adam_optim.pth
|
||||
[Info] Load model from .\saved_models\adam_optim.pth
|
||||
[Info] Test accuracy = 85.0%
|
||||
@@ -1,56 +1,56 @@
|
||||
Epoch 01: loss = inf
|
||||
Epoch 02: loss = inf
|
||||
Epoch 03: loss = inf
|
||||
Epoch 04: loss = inf
|
||||
Epoch 05: loss = inf
|
||||
Epoch 06: loss = inf
|
||||
Epoch 07: loss = inf
|
||||
Epoch 08: loss = inf
|
||||
Epoch 09: loss = 3.250
|
||||
Epoch 10: loss = 2.567
|
||||
Epoch 10: validation accuracy = 59.0%
|
||||
Epoch 11: loss = 1.963
|
||||
Epoch 12: loss = 1.558
|
||||
Epoch 13: loss = 1.320
|
||||
Epoch 14: loss = 0.911
|
||||
Epoch 15: loss = 0.808
|
||||
Epoch 16: loss = 0.932
|
||||
Epoch 17: loss = 0.861
|
||||
Epoch 18: loss = 0.748
|
||||
Epoch 19: loss = 0.783
|
||||
Epoch 20: loss = 0.809
|
||||
Epoch 20: validation accuracy = 65.5%
|
||||
Epoch 21: loss = 0.678
|
||||
Epoch 22: loss = 0.757
|
||||
Epoch 23: loss = 0.747
|
||||
Epoch 24: loss = 0.660
|
||||
Epoch 25: loss = 0.536
|
||||
Epoch 26: loss = 0.506
|
||||
Epoch 27: loss = 0.577
|
||||
Epoch 28: loss = 0.600
|
||||
Epoch 29: loss = 0.681
|
||||
Epoch 30: loss = 0.604
|
||||
Epoch 30: validation accuracy = 68.0%
|
||||
Epoch 31: loss = 0.552
|
||||
Epoch 32: loss = 0.671
|
||||
Epoch 33: loss = 0.604
|
||||
Epoch 34: loss = 0.600
|
||||
Epoch 35: loss = 0.818
|
||||
Epoch 36: loss = 0.659
|
||||
Epoch 37: loss = 0.375
|
||||
Epoch 38: loss = 0.380
|
||||
Epoch 39: loss = 0.418
|
||||
Epoch 40: loss = 0.431
|
||||
Epoch 40: validation accuracy = 73.5%
|
||||
Epoch 41: loss = 0.551
|
||||
Epoch 42: loss = 0.488
|
||||
Epoch 43: loss = 0.350
|
||||
Epoch 44: loss = 0.287
|
||||
Epoch 45: loss = 0.294
|
||||
Epoch 46: loss = 0.463
|
||||
Epoch 47: loss = 0.438
|
||||
Epoch 48: loss = 0.392
|
||||
Epoch 49: loss = 0.325
|
||||
Epoch 50: loss = 0.332
|
||||
Epoch 50: validation accuracy = 80.8%
|
||||
Epoch 01: loss = inf
|
||||
Epoch 02: loss = inf
|
||||
Epoch 03: loss = inf
|
||||
Epoch 04: loss = inf
|
||||
Epoch 05: loss = inf
|
||||
Epoch 06: loss = inf
|
||||
Epoch 07: loss = inf
|
||||
Epoch 08: loss = inf
|
||||
Epoch 09: loss = 3.250
|
||||
Epoch 10: loss = 2.567
|
||||
Epoch 10: validation accuracy = 59.0%
|
||||
Epoch 11: loss = 1.963
|
||||
Epoch 12: loss = 1.558
|
||||
Epoch 13: loss = 1.320
|
||||
Epoch 14: loss = 0.911
|
||||
Epoch 15: loss = 0.808
|
||||
Epoch 16: loss = 0.932
|
||||
Epoch 17: loss = 0.861
|
||||
Epoch 18: loss = 0.748
|
||||
Epoch 19: loss = 0.783
|
||||
Epoch 20: loss = 0.809
|
||||
Epoch 20: validation accuracy = 65.5%
|
||||
Epoch 21: loss = 0.678
|
||||
Epoch 22: loss = 0.757
|
||||
Epoch 23: loss = 0.747
|
||||
Epoch 24: loss = 0.660
|
||||
Epoch 25: loss = 0.536
|
||||
Epoch 26: loss = 0.506
|
||||
Epoch 27: loss = 0.577
|
||||
Epoch 28: loss = 0.600
|
||||
Epoch 29: loss = 0.681
|
||||
Epoch 30: loss = 0.604
|
||||
Epoch 30: validation accuracy = 68.0%
|
||||
Epoch 31: loss = 0.552
|
||||
Epoch 32: loss = 0.671
|
||||
Epoch 33: loss = 0.604
|
||||
Epoch 34: loss = 0.600
|
||||
Epoch 35: loss = 0.818
|
||||
Epoch 36: loss = 0.659
|
||||
Epoch 37: loss = 0.375
|
||||
Epoch 38: loss = 0.380
|
||||
Epoch 39: loss = 0.418
|
||||
Epoch 40: loss = 0.431
|
||||
Epoch 40: validation accuracy = 73.5%
|
||||
Epoch 41: loss = 0.551
|
||||
Epoch 42: loss = 0.488
|
||||
Epoch 43: loss = 0.350
|
||||
Epoch 44: loss = 0.287
|
||||
Epoch 45: loss = 0.294
|
||||
Epoch 46: loss = 0.463
|
||||
Epoch 47: loss = 0.438
|
||||
Epoch 48: loss = 0.392
|
||||
Epoch 49: loss = 0.325
|
||||
Epoch 50: loss = 0.332
|
||||
Epoch 50: validation accuracy = 80.8%
|
||||
Model saved in .\saved_models\adam_optim_cuda.pth
|
||||
@@ -1,2 +1,2 @@
|
||||
[Info] Load model from .\saved_models\adam_optim_lr1e-3_epoch100_momentum10.pth
|
||||
[Info] Load model from .\saved_models\adam_optim_lr1e-3_epoch100_momentum10.pth
|
||||
[Info] Test accuracy = 88.8%
|
||||
@@ -1,111 +1,111 @@
|
||||
Epoch 01: loss = inf
|
||||
Epoch 02: loss = inf
|
||||
Epoch 03: loss = inf
|
||||
Epoch 04: loss = inf
|
||||
Epoch 05: loss = inf
|
||||
Epoch 06: loss = inf
|
||||
Epoch 07: loss = inf
|
||||
Epoch 08: loss = inf
|
||||
Epoch 09: loss = inf
|
||||
Epoch 10: loss = inf
|
||||
Epoch 10: validation accuracy = 40.2%
|
||||
Epoch 11: loss = inf
|
||||
Epoch 12: loss = inf
|
||||
Epoch 13: loss = inf
|
||||
Epoch 14: loss = inf
|
||||
Epoch 15: loss = inf
|
||||
Epoch 16: loss = inf
|
||||
Epoch 17: loss = 2.360
|
||||
Epoch 18: loss = 2.086
|
||||
Epoch 19: loss = 1.684
|
||||
Epoch 20: loss = 1.453
|
||||
Epoch 20: validation accuracy = 53.0%
|
||||
Epoch 21: loss = 1.174
|
||||
Epoch 22: loss = 1.046
|
||||
Epoch 23: loss = 0.859
|
||||
Epoch 24: loss = 0.740
|
||||
Epoch 25: loss = 0.663
|
||||
Epoch 26: loss = 0.495
|
||||
Epoch 27: loss = 0.566
|
||||
Epoch 28: loss = 0.521
|
||||
Epoch 29: loss = 0.470
|
||||
Epoch 30: loss = 0.363
|
||||
Epoch 30: validation accuracy = 59.0%
|
||||
Epoch 31: loss = 0.365
|
||||
Epoch 32: loss = 0.305
|
||||
Epoch 33: loss = 0.333
|
||||
Epoch 34: loss = 0.293
|
||||
Epoch 35: loss = 0.191
|
||||
Epoch 36: loss = 0.295
|
||||
Epoch 37: loss = 0.275
|
||||
Epoch 38: loss = 0.461
|
||||
Epoch 39: loss = 0.509
|
||||
Epoch 40: loss = 0.298
|
||||
Epoch 40: validation accuracy = 65.2%
|
||||
Epoch 41: loss = 0.186
|
||||
Epoch 42: loss = 0.395
|
||||
Epoch 43: loss = 0.323
|
||||
Epoch 44: loss = 0.309
|
||||
Epoch 45: loss = 0.199
|
||||
Epoch 46: loss = 0.285
|
||||
Epoch 47: loss = 0.290
|
||||
Epoch 48: loss = 0.302
|
||||
Epoch 49: loss = 0.235
|
||||
Epoch 50: loss = 0.190
|
||||
Epoch 50: validation accuracy = 71.2%
|
||||
Epoch 51: loss = 0.294
|
||||
Epoch 52: loss = 0.311
|
||||
Epoch 53: loss = 0.254
|
||||
Epoch 54: loss = 0.289
|
||||
Epoch 55: loss = 0.264
|
||||
Epoch 56: loss = 0.213
|
||||
Epoch 57: loss = 0.166
|
||||
Epoch 58: loss = 0.218
|
||||
Epoch 59: loss = 0.231
|
||||
Epoch 60: loss = 0.283
|
||||
Epoch 60: validation accuracy = 74.8%
|
||||
Epoch 61: loss = 0.324
|
||||
Epoch 62: loss = 0.245
|
||||
Epoch 63: loss = 0.277
|
||||
Epoch 64: loss = 0.286
|
||||
Epoch 65: loss = 0.255
|
||||
Epoch 66: loss = 0.263
|
||||
Epoch 67: loss = 0.272
|
||||
Epoch 68: loss = 0.272
|
||||
Epoch 69: loss = 0.260
|
||||
Epoch 70: loss = 0.271
|
||||
Epoch 70: validation accuracy = 79.0%
|
||||
Epoch 71: loss = 0.310
|
||||
Epoch 72: loss = 0.301
|
||||
Epoch 73: loss = 0.305
|
||||
Epoch 74: loss = 0.311
|
||||
Epoch 75: loss = 0.329
|
||||
Epoch 76: loss = 0.295
|
||||
Epoch 77: loss = 0.300
|
||||
Epoch 78: loss = 0.316
|
||||
Epoch 79: loss = 0.326
|
||||
Epoch 80: loss = 0.352
|
||||
Epoch 80: validation accuracy = 77.5%
|
||||
Epoch 81: loss = 0.344
|
||||
Epoch 82: loss = 0.326
|
||||
Epoch 83: loss = 0.326
|
||||
Epoch 84: loss = 0.335
|
||||
Epoch 85: loss = 0.342
|
||||
Epoch 86: loss = 0.361
|
||||
Epoch 87: loss = 0.337
|
||||
Epoch 88: loss = 0.339
|
||||
Epoch 89: loss = 0.339
|
||||
Epoch 90: loss = 0.341
|
||||
Epoch 90: validation accuracy = 82.8%
|
||||
Epoch 91: loss = 0.350
|
||||
Epoch 92: loss = 0.359
|
||||
Epoch 93: loss = 0.352
|
||||
Epoch 94: loss = 0.363
|
||||
Epoch 95: loss = 0.347
|
||||
Epoch 96: loss = 0.341
|
||||
Epoch 97: loss = 0.336
|
||||
Epoch 98: loss = 0.348
|
||||
Epoch 99: loss = 0.365
|
||||
Epoch 100: loss = 0.350
|
||||
Epoch 100: validation accuracy = 85.2%
|
||||
Epoch 01: loss = inf
|
||||
Epoch 02: loss = inf
|
||||
Epoch 03: loss = inf
|
||||
Epoch 04: loss = inf
|
||||
Epoch 05: loss = inf
|
||||
Epoch 06: loss = inf
|
||||
Epoch 07: loss = inf
|
||||
Epoch 08: loss = inf
|
||||
Epoch 09: loss = inf
|
||||
Epoch 10: loss = inf
|
||||
Epoch 10: validation accuracy = 40.2%
|
||||
Epoch 11: loss = inf
|
||||
Epoch 12: loss = inf
|
||||
Epoch 13: loss = inf
|
||||
Epoch 14: loss = inf
|
||||
Epoch 15: loss = inf
|
||||
Epoch 16: loss = inf
|
||||
Epoch 17: loss = 2.360
|
||||
Epoch 18: loss = 2.086
|
||||
Epoch 19: loss = 1.684
|
||||
Epoch 20: loss = 1.453
|
||||
Epoch 20: validation accuracy = 53.0%
|
||||
Epoch 21: loss = 1.174
|
||||
Epoch 22: loss = 1.046
|
||||
Epoch 23: loss = 0.859
|
||||
Epoch 24: loss = 0.740
|
||||
Epoch 25: loss = 0.663
|
||||
Epoch 26: loss = 0.495
|
||||
Epoch 27: loss = 0.566
|
||||
Epoch 28: loss = 0.521
|
||||
Epoch 29: loss = 0.470
|
||||
Epoch 30: loss = 0.363
|
||||
Epoch 30: validation accuracy = 59.0%
|
||||
Epoch 31: loss = 0.365
|
||||
Epoch 32: loss = 0.305
|
||||
Epoch 33: loss = 0.333
|
||||
Epoch 34: loss = 0.293
|
||||
Epoch 35: loss = 0.191
|
||||
Epoch 36: loss = 0.295
|
||||
Epoch 37: loss = 0.275
|
||||
Epoch 38: loss = 0.461
|
||||
Epoch 39: loss = 0.509
|
||||
Epoch 40: loss = 0.298
|
||||
Epoch 40: validation accuracy = 65.2%
|
||||
Epoch 41: loss = 0.186
|
||||
Epoch 42: loss = 0.395
|
||||
Epoch 43: loss = 0.323
|
||||
Epoch 44: loss = 0.309
|
||||
Epoch 45: loss = 0.199
|
||||
Epoch 46: loss = 0.285
|
||||
Epoch 47: loss = 0.290
|
||||
Epoch 48: loss = 0.302
|
||||
Epoch 49: loss = 0.235
|
||||
Epoch 50: loss = 0.190
|
||||
Epoch 50: validation accuracy = 71.2%
|
||||
Epoch 51: loss = 0.294
|
||||
Epoch 52: loss = 0.311
|
||||
Epoch 53: loss = 0.254
|
||||
Epoch 54: loss = 0.289
|
||||
Epoch 55: loss = 0.264
|
||||
Epoch 56: loss = 0.213
|
||||
Epoch 57: loss = 0.166
|
||||
Epoch 58: loss = 0.218
|
||||
Epoch 59: loss = 0.231
|
||||
Epoch 60: loss = 0.283
|
||||
Epoch 60: validation accuracy = 74.8%
|
||||
Epoch 61: loss = 0.324
|
||||
Epoch 62: loss = 0.245
|
||||
Epoch 63: loss = 0.277
|
||||
Epoch 64: loss = 0.286
|
||||
Epoch 65: loss = 0.255
|
||||
Epoch 66: loss = 0.263
|
||||
Epoch 67: loss = 0.272
|
||||
Epoch 68: loss = 0.272
|
||||
Epoch 69: loss = 0.260
|
||||
Epoch 70: loss = 0.271
|
||||
Epoch 70: validation accuracy = 79.0%
|
||||
Epoch 71: loss = 0.310
|
||||
Epoch 72: loss = 0.301
|
||||
Epoch 73: loss = 0.305
|
||||
Epoch 74: loss = 0.311
|
||||
Epoch 75: loss = 0.329
|
||||
Epoch 76: loss = 0.295
|
||||
Epoch 77: loss = 0.300
|
||||
Epoch 78: loss = 0.316
|
||||
Epoch 79: loss = 0.326
|
||||
Epoch 80: loss = 0.352
|
||||
Epoch 80: validation accuracy = 77.5%
|
||||
Epoch 81: loss = 0.344
|
||||
Epoch 82: loss = 0.326
|
||||
Epoch 83: loss = 0.326
|
||||
Epoch 84: loss = 0.335
|
||||
Epoch 85: loss = 0.342
|
||||
Epoch 86: loss = 0.361
|
||||
Epoch 87: loss = 0.337
|
||||
Epoch 88: loss = 0.339
|
||||
Epoch 89: loss = 0.339
|
||||
Epoch 90: loss = 0.341
|
||||
Epoch 90: validation accuracy = 82.8%
|
||||
Epoch 91: loss = 0.350
|
||||
Epoch 92: loss = 0.359
|
||||
Epoch 93: loss = 0.352
|
||||
Epoch 94: loss = 0.363
|
||||
Epoch 95: loss = 0.347
|
||||
Epoch 96: loss = 0.341
|
||||
Epoch 97: loss = 0.336
|
||||
Epoch 98: loss = 0.348
|
||||
Epoch 99: loss = 0.365
|
||||
Epoch 100: loss = 0.350
|
||||
Epoch 100: validation accuracy = 85.2%
|
||||
Model saved in .\saved_models\adam_optim_lr1e-3_epoch100_momentum10.pth
|
||||
@@ -1,244 +1,244 @@
|
||||
% Homework Template
|
||||
\documentclass[a4paper]{article}
|
||||
\usepackage{ctex}
|
||||
\usepackage{amsmath, amssymb, amsthm}
|
||||
\usepackage{moreenum}
|
||||
\usepackage{mathtools}
|
||||
\usepackage{url}
|
||||
\usepackage{bm}
|
||||
\usepackage{enumitem}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{booktabs} % toprule
|
||||
\usepackage[mathcal]{eucal}
|
||||
\usepackage[thehwcnt = 1]{iidef}
|
||||
\usepackage{listings}
|
||||
\usepackage[x11names]{xcolor}
|
||||
\usepackage{float}
|
||||
\usepackage[colorlinks, linkcolor=black, anchorcolor=green, citecolor=blue]{hyperref}
|
||||
|
||||
\DeclareMathOperator{\arctanh}{arctanh}
|
||||
% \DeclareMathOperator{\diag}{diag}
|
||||
|
||||
\setenumerate[1]{label=(\arabic{*})}
|
||||
\setenumerate[2]{label=\arabic{*})}
|
||||
|
||||
\definecolor{codekeyword}{RGB}{171, 0, 216}
|
||||
\definecolor{codetypename}{RGB}{29, 37, 251}
|
||||
\definecolor{codevariable}{RGB}{10, 23, 126}
|
||||
\definecolor{codestring}{RGB}{157, 0, 25}
|
||||
\definecolor{codecomment}{RGB}{31, 129, 19}
|
||||
|
||||
\newfontfamily\cascadia[Ligatures=ResetAll]{Cascadia Code}
|
||||
% \newfontfamily\codefont[Ligatures=ResetAll]{Cascadia Code}
|
||||
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
|
||||
% To enable ligature in listing, go check lstfiracode's github page and copy firacodestyle's settings.
|
||||
|
||||
\lstset{
|
||||
basicstyle = \small\codefont,
|
||||
% ---
|
||||
tabsize = 4,
|
||||
showstringspaces = false,
|
||||
numbers = left,
|
||||
numberstyle = \cascadia,
|
||||
% ---
|
||||
breaklines = true,
|
||||
captionpos = t,
|
||||
% ---
|
||||
frame = l,
|
||||
flexiblecolumns,
|
||||
columns = fixed,
|
||||
}
|
||||
|
||||
\thecourseinstitute{清华大学电子工程系}
|
||||
\thecoursename{\textbf{媒体与认知} \space 课堂2}
|
||||
\theterm{2023-2024学年春季学期}
|
||||
\hwname{作业}
|
||||
\begin{document}
|
||||
\courseheader
|
||||
% 请在YOUR NAME处填写自己的姓名
|
||||
\name{高艺轩}
|
||||
\vspace{3mm}
|
||||
\centerline{\textbf{\Large{理论部分}}}
|
||||
|
||||
\section{单选题(15分)}
|
||||
% 请在?处填写答案
|
||||
\subsection{\underline{B}}
|
||||
|
||||
\subsection{\underline{A}}
|
||||
|
||||
\subsection{\underline{B}}
|
||||
|
||||
\subsection{\underline{A}}
|
||||
|
||||
\subsection{\underline{B}}
|
||||
|
||||
\section{计算题(15 分)}
|
||||
\subsection{设隐含层为$\mathbf{z}=\mathbf{W}^T\mathbf{x}+\mathbf{b}$,其中$\mathbf{x}\in R^{(m \times 1)}$,$\mathbf{z}\in R^{(n\times 1)}$,$\mathbf{W}\in R^{(m\times n)}$,$\mathbf{b} \in R^{(n\times 1)}$均为已知,其激活函数如下:
|
||||
$$\mathbf{y}=\delta(\mathbf{z})=\tanh(\mathbf{z})$$
|
||||
tanh表示双曲正切函数。若训练过程中的目标函数为L,且已知L对$\mathbf{y}$的导数 $\frac{\partial L}{\partial \mathbf{y}}=[\frac{\partial L}{\partial y_1},\frac{\partial L}{\partial y_2},...,\frac{\partial L}{\partial y_n}]^T$和$\mathbf{y}=[y_1,y_2,...,y_n]^T$的值。
|
||||
}
|
||||
\subsubsection{请使用$\mathbf{y}$表示出$\frac{\partial \mathbf{y}^T}{\partial \mathbf{z}}$, 这里的$\mathbf{y}^T$ 为行向量。
|
||||
}
|
||||
|
||||
\begin{proof}[解]
|
||||
首先,对$i \neq j$,$\dfrac{\partial y_i}{\partial z_j} = 0$。
|
||||
|
||||
同时$y_i = \tanh(z_i) = \tanh(\arctanh(y_i))$,因此
|
||||
\[\frac{\partial y_i}{\partial z_i} = 1 - \tanh^2(z_i) = 1 - y_i^2\]
|
||||
因此
|
||||
\[\dfrac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \qedhere\]
|
||||
\end{proof}
|
||||
|
||||
\subsubsection{请使用$\mathbf{y}$和$\frac{\partial L}{\partial \mathbf{y}}$表示$\frac{\partial L}{\partial \mathbf{x}}$,$\frac{\partial L}{\partial \mathbf{W}}$,$\frac{\partial L}{\partial \mathbf{b}}$。
|
||||
}
|
||||
提示:$\frac{\partial L}{\partial \mathbf{x}}$,$\frac{\partial L}{\partial \mathbf{W}}$,$\frac{\partial L}{\partial \mathbf{b}}$与x,W,b具有相同维度。
|
||||
|
||||
\begin{proof}[解]
|
||||
由链式法则
|
||||
\[\frac{\partial L}{\partial \boldsymbol{x}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{x}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = W \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}\]
|
||||
|
||||
对于$\dfrac{\partial L}{\partial W}$,
|
||||
\[\frac{\partial \boldsymbol{z}^T}{\partial W} = \begin{bmatrix}
|
||||
\boldsymbol{x} & \boldsymbol{x} & \cdots & \boldsymbol{x}
|
||||
\end{bmatrix}_{m \times n}\]
|
||||
|
||||
\begin{align*}
|
||||
\frac{\partial L}{\partial W} & = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial W} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}}\\
|
||||
& = \begin{bmatrix}
|
||||
\boldsymbol{x} & \boldsymbol{x} & \cdots & \boldsymbol{x}
|
||||
\end{bmatrix}_{m \times n} \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}
|
||||
\end{align*}
|
||||
|
||||
对于$\dfrac{\partial L}{\partial \boldsymbol{b}}$,由链式法则
|
||||
\[\frac{\partial L}{\partial \boldsymbol{b}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{b}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = I_n \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}} \qedhere\]
|
||||
\end{proof}
|
||||
\vspace{6mm}
|
||||
\centerline{\textbf{\Large{编程部分}}}
|
||||
|
||||
|
||||
\vspace{3mm}
|
||||
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
|
||||
\section{编程作业报告}
|
||||
% 请在此处完成编程作业报告
|
||||
完成后的代码也可以在 \href{https://git.unlockableworld.com/unlockable/MediaNCognition}{\url{https://git.unlockableworld.com/unlockable/MediaNCognition}}中找到。
|
||||
\begin{enumerate}
|
||||
\item 使用默认配置进行训练和测试。
|
||||
\begin{enumerate}
|
||||
\item 训练模型。
|
||||
|
||||
输入:
|
||||
\lstinputlisting{codes/1.1.in.txt}
|
||||
|
||||
输出:
|
||||
\lstinputlisting{codes/1.1.out.txt}
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.9\linewidth]{img/1default_train.png}
|
||||
\end{figure}
|
||||
|
||||
\item 测试模型。
|
||||
|
||||
输入:
|
||||
\lstinputlisting{codes/1.2.in.txt}
|
||||
|
||||
输出:
|
||||
\lstinputlisting{codes/1.2.out.txt}
|
||||
\end{enumerate}
|
||||
\item 调整参数、使用Adam优化器训练并测试。
|
||||
\begin{enumerate}
|
||||
\item 训练模型。
|
||||
|
||||
输入:
|
||||
\lstinputlisting{codes/2.1.in.txt}
|
||||
|
||||
输出:
|
||||
\lstinputlisting{codes/2.1.out.txt}
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.9\linewidth]{img/2adam_optim.png}
|
||||
\end{figure}
|
||||
\item 测试性能。
|
||||
|
||||
输入:
|
||||
\lstinputlisting{codes/2.2.in.txt}
|
||||
|
||||
输出:
|
||||
\lstinputlisting{codes/2.2.out.txt}
|
||||
\end{enumerate}
|
||||
|
||||
\item 使用效果最佳的模型测试。
|
||||
经过简单的尝试,发现使用
|
||||
\lstinputlisting{codes/self_train.in.txt}
|
||||
可以使测试集准确率达到88.8\%,有略微的提升。训练的loss曲线:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=.9\linewidth]{img/3found_best.png}
|
||||
\end{figure}
|
||||
使用它进行预测:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict01.png}
|
||||
\subcaption{预测:A}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict02.png}
|
||||
\subcaption{预测:B}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict03.png}
|
||||
\subcaption{预测:M}
|
||||
\end{subfigure}
|
||||
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict04.png}
|
||||
\subcaption{预测:R}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict05.png}
|
||||
\subcaption{预测:M}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict06.png}
|
||||
\subcaption{预测:O}
|
||||
\end{subfigure}
|
||||
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict07.png}
|
||||
\subcaption{预测:B}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict08.png}
|
||||
\subcaption{预测:W}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\end{figure}
|
||||
\item 遇到的问题及解决方法
|
||||
\begin{enumerate}
|
||||
\item 代码中对灰度图像的矩阵进行标准化时,\lstinline{numpy}显示不能对\lstinline{NumpyGenericArray}进行对\lstinline{float}的\lstinline{/}操作。改用\lstinline{np.divide()}解决了这个问题。
|
||||
\item 在利用训练好的模型进行预测时,发现自己找到的大部分模型都预测错误;最后与训练集的图片进行了对比,发现主要问题是裁切字母时留下了过大的边距,导致模型不能正确理解输入。重新裁剪边框后,得到正确的结果。
|
||||
\end{enumerate}
|
||||
\item 建议:希望下次发布作业代码可以利用清华的git。
|
||||
\end{enumerate}
|
||||
|
||||
|
||||
|
||||
|
||||
% \section{自选课题开题报告}
|
||||
% 请在此处介绍自选课题
|
||||
|
||||
\end{document}
|
||||
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: t
|
||||
%%% End:
|
||||
% Homework Template
|
||||
\documentclass[a4paper]{article}
|
||||
\usepackage{ctex}
|
||||
\usepackage{amsmath, amssymb, amsthm}
|
||||
\usepackage{moreenum}
|
||||
\usepackage{mathtools}
|
||||
\usepackage{url}
|
||||
\usepackage{bm}
|
||||
\usepackage{enumitem}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{booktabs} % toprule
|
||||
\usepackage[mathcal]{eucal}
|
||||
\usepackage[thehwcnt = 1]{iidef}
|
||||
\usepackage{listings}
|
||||
\usepackage[x11names]{xcolor}
|
||||
\usepackage{float}
|
||||
\usepackage[colorlinks, linkcolor=black, anchorcolor=green, citecolor=blue]{hyperref}
|
||||
|
||||
\DeclareMathOperator{\arctanh}{arctanh}
|
||||
% \DeclareMathOperator{\diag}{diag}
|
||||
|
||||
\setenumerate[1]{label=(\arabic{*})}
|
||||
\setenumerate[2]{label=\arabic{*})}
|
||||
|
||||
\definecolor{codekeyword}{RGB}{171, 0, 216}
|
||||
\definecolor{codetypename}{RGB}{29, 37, 251}
|
||||
\definecolor{codevariable}{RGB}{10, 23, 126}
|
||||
\definecolor{codestring}{RGB}{157, 0, 25}
|
||||
\definecolor{codecomment}{RGB}{31, 129, 19}
|
||||
|
||||
\newfontfamily\cascadia[Ligatures=ResetAll]{Cascadia Code}
|
||||
% \newfontfamily\codefont[Ligatures=ResetAll]{Cascadia Code}
|
||||
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
|
||||
% To enable ligature in listing, go check lstfiracode's github page and copy firacodestyle's settings.
|
||||
|
||||
\lstset{
|
||||
basicstyle = \small\codefont,
|
||||
% ---
|
||||
tabsize = 4,
|
||||
showstringspaces = false,
|
||||
numbers = left,
|
||||
numberstyle = \cascadia,
|
||||
% ---
|
||||
breaklines = true,
|
||||
captionpos = t,
|
||||
% ---
|
||||
frame = l,
|
||||
flexiblecolumns,
|
||||
columns = fixed,
|
||||
}
|
||||
|
||||
\thecourseinstitute{清华大学电子工程系}
|
||||
\thecoursename{\textbf{媒体与认知} \space 课堂2}
|
||||
\theterm{2023-2024学年春季学期}
|
||||
\hwname{作业}
|
||||
\begin{document}
|
||||
\courseheader
|
||||
% 请在YOUR NAME处填写自己的姓名
|
||||
\name{高艺轩}
|
||||
\vspace{3mm}
|
||||
\centerline{\textbf{\Large{理论部分}}}
|
||||
|
||||
\section{单选题(15分)}
|
||||
% 请在?处填写答案
|
||||
\subsection{\underline{B}}
|
||||
|
||||
\subsection{\underline{A}}
|
||||
|
||||
\subsection{\underline{B}}
|
||||
|
||||
\subsection{\underline{A}}
|
||||
|
||||
\subsection{\underline{B}}
|
||||
|
||||
\section{计算题(15 分)}
|
||||
\subsection{设隐含层为$\mathbf{z}=\mathbf{W}^T\mathbf{x}+\mathbf{b}$,其中$\mathbf{x}\in R^{(m \times 1)}$,$\mathbf{z}\in R^{(n\times 1)}$,$\mathbf{W}\in R^{(m\times n)}$,$\mathbf{b} \in R^{(n\times 1)}$均为已知,其激活函数如下:
|
||||
$$\mathbf{y}=\delta(\mathbf{z})=\tanh(\mathbf{z})$$
|
||||
tanh表示双曲正切函数。若训练过程中的目标函数为L,且已知L对$\mathbf{y}$的导数 $\frac{\partial L}{\partial \mathbf{y}}=[\frac{\partial L}{\partial y_1},\frac{\partial L}{\partial y_2},...,\frac{\partial L}{\partial y_n}]^T$和$\mathbf{y}=[y_1,y_2,...,y_n]^T$的值。
|
||||
}
|
||||
\subsubsection{请使用$\mathbf{y}$表示出$\frac{\partial \mathbf{y}^T}{\partial \mathbf{z}}$, 这里的$\mathbf{y}^T$ 为行向量。
|
||||
}
|
||||
|
||||
\begin{proof}[解]
|
||||
首先,对$i \neq j$,$\dfrac{\partial y_i}{\partial z_j} = 0$。
|
||||
|
||||
同时$y_i = \tanh(z_i) = \tanh(\arctanh(y_i))$,因此
|
||||
\[\frac{\partial y_i}{\partial z_i} = 1 - \tanh^2(z_i) = 1 - y_i^2\]
|
||||
因此
|
||||
\[\dfrac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \qedhere\]
|
||||
\end{proof}
|
||||
|
||||
\subsubsection{请使用$\mathbf{y}$和$\frac{\partial L}{\partial \mathbf{y}}$表示$\frac{\partial L}{\partial \mathbf{x}}$,$\frac{\partial L}{\partial \mathbf{W}}$,$\frac{\partial L}{\partial \mathbf{b}}$。
|
||||
}
|
||||
提示:$\frac{\partial L}{\partial \mathbf{x}}$,$\frac{\partial L}{\partial \mathbf{W}}$,$\frac{\partial L}{\partial \mathbf{b}}$与x,W,b具有相同维度。
|
||||
|
||||
\begin{proof}[解]
|
||||
由链式法则
|
||||
\[\frac{\partial L}{\partial \boldsymbol{x}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{x}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = W \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}\]
|
||||
|
||||
对于$\dfrac{\partial L}{\partial W}$,
|
||||
\[\frac{\partial \boldsymbol{z}^T}{\partial W} = \begin{bmatrix}
|
||||
\boldsymbol{x} & \boldsymbol{x} & \cdots & \boldsymbol{x}
|
||||
\end{bmatrix}_{m \times n}\]
|
||||
|
||||
\begin{align*}
|
||||
\frac{\partial L}{\partial W} & = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial W} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}}\\
|
||||
& = \begin{bmatrix}
|
||||
\boldsymbol{x} & \boldsymbol{x} & \cdots & \boldsymbol{x}
|
||||
\end{bmatrix}_{m \times n} \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}}
|
||||
\end{align*}
|
||||
|
||||
对于$\dfrac{\partial L}{\partial \boldsymbol{b}}$,由链式法则
|
||||
\[\frac{\partial L}{\partial \boldsymbol{b}} = \frac{\partial \boldsymbol{z}^\mathrm{T}}{\partial \boldsymbol{b}} \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = I_n \frac{\partial \boldsymbol{y}^\mathrm{T}}{\partial \boldsymbol{z}} \frac{\partial L}{\partial \boldsymbol{y}} = \diag\{1 - y_1^2, 1 - y_2^2, \dots, 1 - y_n^2\} \frac{\partial L}{\partial \boldsymbol{y}} \qedhere\]
|
||||
\end{proof}
|
||||
\vspace{6mm}
|
||||
\centerline{\textbf{\Large{编程部分}}}
|
||||
|
||||
|
||||
\vspace{3mm}
|
||||
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
|
||||
\section{编程作业报告}
|
||||
% 请在此处完成编程作业报告
|
||||
完成后的代码也可以在 \href{https://git.unlockableworld.com/unlockable/MediaNCognition}{\url{https://git.unlockableworld.com/unlockable/MediaNCognition}}中找到。
|
||||
\begin{enumerate}
|
||||
\item 使用默认配置进行训练和测试。
|
||||
\begin{enumerate}
|
||||
\item 训练模型。
|
||||
|
||||
输入:
|
||||
\lstinputlisting{codes/1.1.in.txt}
|
||||
|
||||
输出:
|
||||
\lstinputlisting{codes/1.1.out.txt}
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.9\linewidth]{img/1default_train.png}
|
||||
\end{figure}
|
||||
|
||||
\item 测试模型。
|
||||
|
||||
输入:
|
||||
\lstinputlisting{codes/1.2.in.txt}
|
||||
|
||||
输出:
|
||||
\lstinputlisting{codes/1.2.out.txt}
|
||||
\end{enumerate}
|
||||
\item 调整参数、使用Adam优化器训练并测试。
|
||||
\begin{enumerate}
|
||||
\item 训练模型。
|
||||
|
||||
输入:
|
||||
\lstinputlisting{codes/2.1.in.txt}
|
||||
|
||||
输出:
|
||||
\lstinputlisting{codes/2.1.out.txt}
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.9\linewidth]{img/2adam_optim.png}
|
||||
\end{figure}
|
||||
\item 测试性能。
|
||||
|
||||
输入:
|
||||
\lstinputlisting{codes/2.2.in.txt}
|
||||
|
||||
输出:
|
||||
\lstinputlisting{codes/2.2.out.txt}
|
||||
\end{enumerate}
|
||||
|
||||
\item 使用效果最佳的模型测试。
|
||||
经过简单的尝试,发现使用
|
||||
\lstinputlisting{codes/self_train.in.txt}
|
||||
可以使测试集准确率达到88.8\%,有略微的提升。训练的loss曲线:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=.9\linewidth]{img/3found_best.png}
|
||||
\end{figure}
|
||||
使用它进行预测:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict01.png}
|
||||
\subcaption{预测:A}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict02.png}
|
||||
\subcaption{预测:B}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict03.png}
|
||||
\subcaption{预测:M}
|
||||
\end{subfigure}
|
||||
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict04.png}
|
||||
\subcaption{预测:R}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict05.png}
|
||||
\subcaption{预测:M}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict06.png}
|
||||
\subcaption{预测:O}
|
||||
\end{subfigure}
|
||||
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict07.png}
|
||||
\subcaption{预测:B}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}[b]{.3\linewidth}
|
||||
\includegraphics[width=\linewidth]{img/predict/predict08.png}
|
||||
\subcaption{预测:W}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\end{figure}
|
||||
\item 遇到的问题及解决方法
|
||||
\begin{enumerate}
|
||||
\item 代码中对灰度图像的矩阵进行标准化时,\lstinline{numpy}显示不能对\lstinline{NumpyGenericArray}进行对\lstinline{float}的\lstinline{/}操作。改用\lstinline{np.divide()}解决了这个问题。
|
||||
\item 在利用训练好的模型进行预测时,发现自己找到的大部分模型都预测错误;最后与训练集的图片进行了对比,发现主要问题是裁切字母时留下了过大的边距,导致模型不能正确理解输入。重新裁剪边框后,得到正确的结果。
|
||||
\end{enumerate}
|
||||
\item 建议:希望下次发布作业代码可以利用清华的git。
|
||||
\end{enumerate}
|
||||
|
||||
|
||||
|
||||
|
||||
% \section{自选课题开题报告}
|
||||
% 请在此处介绍自选课题
|
||||
|
||||
\end{document}
|
||||
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: t
|
||||
%%% End:
|
||||
|
||||
@@ -1,164 +1,164 @@
|
||||
#========================================================
|
||||
# Media and Cognition
|
||||
# Homework 1 Neural network basics
|
||||
# activations.py - activation functions
|
||||
# Student ID: 2022010639
|
||||
# Name: Gao Yixuan
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
#========================================================
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
'''
|
||||
In this script we will implement three activation functions, including both forward and backward processes.
|
||||
More details about customizing a backward process in PyTorch can be found in:
|
||||
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
|
||||
'''
|
||||
|
||||
## Here, Tanh is given as an example to show how to construct the activation function. Please finish the activation functions of Sigmoid and ReLU later.
|
||||
class Tanh(torch.autograd.Function):
    '''
    Hyperbolic tangent activation with a hand-written backward pass.

        y = (exp(x) - exp(-x)) / (exp(x) + exp(-x))

    This class is the worked example the other activation functions follow.
    Both methods are static, so they are used through Tanh.apply without
    creating an instance.
    '''

    @staticmethod
    def forward(ctx, x):
        '''
        Apply tanh element-wise.

        :param ctx: context object; the output is stashed on it with
                    save_for_backward so backward can reuse it
        :param x: input tensor with arbitrary shape
        :return: tensor holding tanh(x)
        '''
        # torch.tanh is used instead of the explicit exp() quotient from the
        # class docstring: for inputs with a large absolute value exp()
        # overflows and the quotient is no longer usable.
        out = torch.tanh(x)
        ctx.save_for_backward(out)
        return out

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Propagate the gradient through tanh.

        :param ctx: context object carrying the output saved in forward
        :param grad_output: dL/dy
        :return: dL/dx = dL/dy * dy/dx
        '''
        (out,) = ctx.saved_tensors
        # derivative of tanh expressed through its own output: dy/dx = 1 - y^2
        local_grad = 1 - out ** 2
        return grad_output * local_grad
|
||||
|
||||
#TODO 1: complete the forward and backward functions of the Sigmoid activation function.
|
||||
#Note: You can refer to the activation function Tanh
|
||||
class Sigmoid(torch.autograd.Function):
    '''
    Sigmoid activation function
        y = 1 / (1 + exp(-x))
    '''

    @staticmethod
    def forward(ctx, x):
        '''
        Apply the sigmoid element-wise.

        :param ctx: context object; the output y is saved on it for backward
        :param x: input tensor with arbitrary shape
        :return: tensor holding 1 / (1 + exp(-x))
        '''
        # The quotient (not a difference) is what makes this a sigmoid.
        # For very negative x, exp(-x) overflows to inf and 1 / inf evaluates
        # to 0, which is the correct limit, so no NaN is produced.
        y = 1 / (1 + torch.exp(-x))

        # y is all backward needs, because dy/dx can be written in terms of y
        ctx.save_for_backward(y)

        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Propagate the gradient through the sigmoid.

        :param ctx: context object carrying y saved in forward
        :param grad_output: dL/dy
        :return: dL/dx = dL/dy * y * (1 - y)
        '''
        y, = ctx.saved_tensors

        # chain rule: dL/dx = dL/dy * dy/dx, with dy/dx = y * (1 - y)
        grad_input = grad_output * y * (1 - y)

        return grad_input
|
||||
|
||||
#TODO 2: complete the forward and backward functions of the ReLU activation function.
|
||||
#Note: You can refer to the activation function Tanh
|
||||
class ReLU(torch.autograd.Function):
    '''
    ReLU activation function
        y = max{x, 0}
    '''

    @staticmethod
    def forward(ctx, x):
        '''
        Clamp negative entries of x to zero.

        :param ctx: context object; the output is saved on it for backward
        :param x: input tensor with arbitrary shape and floating dtype
        :return: new tensor with the same shape and dtype as x
        '''
        # torch.clamp builds a new tensor (x itself is not modified) and,
        # unlike comparing against a separately created [0.] tensor, keeps the
        # shape and dtype of x, including for 0-dim and float64 inputs.
        y = torch.clamp(x, min=0)

        # y > 0 exactly where x > 0, so the output is enough for backward
        ctx.save_for_backward(y)

        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Propagate the gradient through ReLU.

        :param ctx: context object carrying the output saved in forward
        :param grad_output: dL/dy
        :return: dL/dx, equal to dL/dy where x > 0 and 0 where x <= 0
        '''
        y, = ctx.saved_tensors

        # dy/dx is 1 if x > 0 and 0 if x <= 0. A boolean mask cast to the
        # gradient dtype avoids torch.heaviside, which refuses operands whose
        # dtypes differ.
        mask = (y > 0).to(grad_output.dtype)

        # chain rule: dL/dx = dL/dy * dy/dx
        grad_input = grad_output * mask

        return grad_input
|
||||
|
||||
|
||||
# activate function class according to the type
|
||||
class Activation(nn.Module):
    '''
    Module wrapper that selects one of the custom activation functions by name.
    '''

    def __init__(self, type):
        '''
        :param type: 'sigmoid', 'tanh', or 'relu'
        :raises NotImplementedError: for any other value (a hint is printed first)
        '''
        super().__init__()

        # reject unknown names up front
        if type not in ('sigmoid', 'tanh', 'relu'):
            print('activation type should be one of [sigmoid, tanh, relu]')
            raise NotImplementedError

        # store the autograd Function's apply callable, not an instance
        if type == 'sigmoid':
            chosen = Sigmoid.apply
        elif type == 'tanh':
            chosen = Tanh.apply
        else:
            chosen = ReLU.apply
        self.act = chosen

    def forward(self, x):
        '''
        :param x: input tensor
        :return: the selected activation applied to x
        '''
        return self.act(x)
|
||||
#========================================================
|
||||
# Media and Cognition
|
||||
# Homework 1 Neural network basics
|
||||
# activations.py - activation functions
|
||||
# Student ID: 2022010639
|
||||
# Name: Gao Yixuan
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
#========================================================
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
'''
|
||||
In this script we will implement three activation functions, including both forward and backward processes.
|
||||
More details about customizing a backward process in PyTorch can be found in:
|
||||
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
|
||||
'''
|
||||
|
||||
## Here, Tanh is given as an example to show how to construct the activation function. Please finish the activation functions of Sigmoid and ReLU later.
|
||||
class Tanh(torch.autograd.Function):
    '''
    Hyperbolic tangent activation with a hand-written backward pass.

        y = (exp(x) - exp(-x)) / (exp(x) + exp(-x))

    This class is the worked example the other activation functions follow.
    Both methods are static, so they are used through Tanh.apply without
    creating an instance.
    '''

    @staticmethod
    def forward(ctx, x):
        '''
        Apply tanh element-wise.

        :param ctx: context object; the output is stashed on it with
                    save_for_backward so backward can reuse it
        :param x: input tensor with arbitrary shape
        :return: tensor holding tanh(x)
        '''
        # torch.tanh is used instead of the explicit exp() quotient from the
        # class docstring: for inputs with a large absolute value exp()
        # overflows and the quotient is no longer usable.
        out = torch.tanh(x)
        ctx.save_for_backward(out)
        return out

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Propagate the gradient through tanh.

        :param ctx: context object carrying the output saved in forward
        :param grad_output: dL/dy
        :return: dL/dx = dL/dy * dy/dx
        '''
        (out,) = ctx.saved_tensors
        # derivative of tanh expressed through its own output: dy/dx = 1 - y^2
        local_grad = 1 - out ** 2
        return grad_output * local_grad
|
||||
|
||||
#TODO 1: complete the forward and backward functions of the Sigmoid activation function.
|
||||
#Note: You can refer to the activation function Tanh
|
||||
class Sigmoid(torch.autograd.Function):
    '''
    Sigmoid activation function
        y = 1 / (1 + exp(-x))
    '''

    @staticmethod
    def forward(ctx, x):
        '''
        Apply the sigmoid element-wise.

        :param ctx: context object; the output y is saved on it for backward
        :param x: input tensor with arbitrary shape
        :return: tensor holding 1 / (1 + exp(-x))
        '''
        # The quotient (not a difference) is what makes this a sigmoid.
        # For very negative x, exp(-x) overflows to inf and 1 / inf evaluates
        # to 0, which is the correct limit, so no NaN is produced.
        y = 1 / (1 + torch.exp(-x))

        # y is all backward needs, because dy/dx can be written in terms of y
        ctx.save_for_backward(y)

        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Propagate the gradient through the sigmoid.

        :param ctx: context object carrying y saved in forward
        :param grad_output: dL/dy
        :return: dL/dx = dL/dy * y * (1 - y)
        '''
        y, = ctx.saved_tensors

        # chain rule: dL/dx = dL/dy * dy/dx, with dy/dx = y * (1 - y)
        grad_input = grad_output * y * (1 - y)

        return grad_input
|
||||
|
||||
#TODO 2: complete the forward and backward functions of the ReLU activation function.
|
||||
#Note: You can refer to the activation function Tanh
|
||||
class ReLU(torch.autograd.Function):
    '''
    ReLU activation function
        y = max{x, 0}
    '''

    @staticmethod
    def forward(ctx, x):
        '''
        Clamp negative entries of x to zero.

        :param ctx: context object; the output is saved on it for backward
        :param x: input tensor with arbitrary shape and floating dtype
        :return: new tensor with the same shape and dtype as x
        '''
        # torch.clamp builds a new tensor (x itself is not modified) and,
        # unlike comparing against a separately created [0.] tensor, keeps the
        # shape and dtype of x, including for 0-dim and float64 inputs.
        y = torch.clamp(x, min=0)

        # y > 0 exactly where x > 0, so the output is enough for backward
        ctx.save_for_backward(y)

        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Propagate the gradient through ReLU.

        :param ctx: context object carrying the output saved in forward
        :param grad_output: dL/dy
        :return: dL/dx, equal to dL/dy where x > 0 and 0 where x <= 0
        '''
        y, = ctx.saved_tensors

        # dy/dx is 1 if x > 0 and 0 if x <= 0. A boolean mask cast to the
        # gradient dtype avoids torch.heaviside, which refuses operands whose
        # dtypes differ.
        mask = (y > 0).to(grad_output.dtype)

        # chain rule: dL/dx = dL/dy * dy/dx
        grad_input = grad_output * mask

        return grad_input
|
||||
|
||||
|
||||
# activate function class according to the type
|
||||
class Activation(nn.Module):
    '''
    Module wrapper that selects one of the custom activation functions by name.
    '''

    def __init__(self, type):
        '''
        :param type: 'sigmoid', 'tanh', or 'relu'
        :raises NotImplementedError: for any other value (a hint is printed first)
        '''
        super().__init__()

        # reject unknown names up front
        if type not in ('sigmoid', 'tanh', 'relu'):
            print('activation type should be one of [sigmoid, tanh, relu]')
            raise NotImplementedError

        # store the autograd Function's apply callable, not an instance
        if type == 'sigmoid':
            chosen = Sigmoid.apply
        elif type == 'tanh':
            chosen = Tanh.apply
        else:
            chosen = ReLU.apply
        self.act = chosen

    def forward(self, x):
        '''
        :param x: input tensor
        :return: the selected activation applied to x
        '''
        return self.act(x)
|
||||
|
||||
@@ -1,118 +1,118 @@
|
||||
#========================================================
|
||||
# Media and Cognition
|
||||
# Homework 1 Neural network basics
|
||||
# losses.py - loss functions
|
||||
# Student ID: 2022010639
|
||||
# Name: Gao Yixuan
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
#========================================================
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
'''
|
||||
In this script we will implement our MSE and Cross Entropy loss functions, including both the forward and backward processes.
|
||||
More details about customizing a backward process can be found in:
|
||||
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
|
||||
'''
|
||||
|
||||
# here is the sample code of MSELoss
|
||||
# you can use this as reference to implement the CrossEntropyLoss
|
||||
class MSELoss(torch.autograd.Function):
    '''
    Squared-error loss with a hand-written backward pass.
        loss = sum over all elements of (pred - label) ** 2

    Sample implementation used as the reference for CrossEntropyLoss.
    '''

    @staticmethod
    def forward(ctx, pred, label):
        '''
        :param ctx: context object; pred and label are saved on it for backward
        :param pred: prediction with shape [batch_size, *], where * means additional dimensions
        :param label: groundtruth, same shape as the prediction
        :return: scalar tensor, the squared error summed over every element
                 (no division by batch_size is performed here)
        '''
        ctx.save_for_backward(pred, label)
        residual = pred - label
        return torch.sum(residual ** 2)

    @staticmethod
    def backward(ctx, grad_output):
        '''
        :param ctx: context object carrying pred and label
        :param grad_output: gradient flowing into the loss; 1 when the loss is
                            the value backward() is called on
        :return: (dL/dpred, None); no gradient is computed for the label
        '''
        pred, label = ctx.saved_tensors
        # d/dpred of (pred - label)^2 is 2 * (pred - label)
        return grad_output * 2 * (pred - label), None
|
||||
|
||||
#TODO 1: Complete the CrossEntropyLoss loss function
|
||||
class CrossEntropyLoss(torch.autograd.Function):
    '''
    Cross entropy loss function:
        loss = - log q_i
    where
        q_i = softmax(z_i) = exp(z_i) / (exp(z_0) + exp(z_1) + ...)

    However, when z_i has a larger value, exp(z_i) might become infinity.
    So we use stable softmax:
        softmax(z_i) = A exp(z_i) / A (exp(z_0) + exp(z_1) + ...)
    where
        A = exp(-z_max) = exp(-max{z_0, z_1, ...})
    therefore we have
        softmax(z_i) = softmax(z_i - z_max)

    The loss itself is evaluated in the log domain,
        log q_i = (z_i - z_max) - log(sum_j exp(z_j - z_max)),
    so that a probability which underflows to 0 does not turn into log(0) = -inf.
    '''

    @staticmethod
    def forward(ctx, logits, label):
        """
        :param ctx: context object; softmax output and one-hot label are saved for backward
        :param logits: logits with shape [batch_size, n_classes], denoted by "z" in the above formula
        :param label: groundtruth class indices (int64) with shape [batch_size],
                      where 0 <= label[i] <= n_classes - 1
        :return: cross entropy loss, averaged by batch_size
        """

        # step 1: stable log-softmax
        z_max = torch.max(logits, 1, keepdim=True).values  # [batch_size, 1]
        shifted = logits - z_max                            # [batch_size, n_classes]
        log_sums = torch.log(torch.sum(torch.exp(shifted), 1, keepdim=True))  # [batch_size, 1]
        log_q = shifted - log_sums                          # log softmax(z)
        q = torch.exp(log_q)                                # softmax(z)

        # step 2: convert label into one-hot version
        # e.g., if label = [0, 2] and n_classes=4, then the one-hot version is [[1,0,0,0], [0,0,1,0]]
        one_hot_label = torch.nn.functional.one_hot(label, logits.size()[1])

        # step 3: pick log q of the true class for every sample and average over the batch
        batch_size = label.size()[0]
        picked = log_q.gather(1, label.reshape(-1, 1))      # [batch_size, 1]
        cross_entropy = -torch.sum(picked) / batch_size

        ctx.save_for_backward(q, one_hot_label)

        return cross_entropy

    @staticmethod
    def backward(ctx, grad_output):
        """
        :param ctx: context object carrying q (softmax output) and the one-hot label
        :param grad_output: gradient flowing into the loss; 1 when the loss is
                            the value backward() is called on
        :return: (dL/dz, None); no gradient is computed for the label
        """

        # step 4: dL/dz = (q - one_hot) / batch_size; the division matches the
        # batch averaging done in forward
        q, one_hot_label = ctx.saved_tensors
        grad_input = grad_output * (q - one_hot_label) / q.size()[0]

        # return dL/dz and None for dL/dlabel since we do not need to compute dL/dlabel
        return grad_input, None
|
||||
#========================================================
|
||||
# Media and Cognition
|
||||
# Homework 1 Neural network basics
|
||||
# losses.py - loss functions
|
||||
# Student ID: 2022010639
|
||||
# Name: Gao Yixuan
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
#========================================================
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
'''
|
||||
In this script we will implement our MSE and Cross Entropy loss functions, including both the forward and backward processes.
|
||||
More details about customizing a backward process can be found in:
|
||||
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
|
||||
'''
|
||||
|
||||
# here is the sample code of MSELoss
|
||||
# you can use this as reference to implement the CrossEntropyLoss
|
||||
class MSELoss(torch.autograd.Function):
    '''
    Squared-error loss with a hand-written backward pass.
        loss = sum over all elements of (pred - label) ** 2

    Sample implementation used as the reference for CrossEntropyLoss.
    '''

    @staticmethod
    def forward(ctx, pred, label):
        '''
        :param ctx: context object; pred and label are saved on it for backward
        :param pred: prediction with shape [batch_size, *], where * means additional dimensions
        :param label: groundtruth, same shape as the prediction
        :return: scalar tensor, the squared error summed over every element
                 (no division by batch_size is performed here)
        '''
        ctx.save_for_backward(pred, label)
        residual = pred - label
        return torch.sum(residual ** 2)

    @staticmethod
    def backward(ctx, grad_output):
        '''
        :param ctx: context object carrying pred and label
        :param grad_output: gradient flowing into the loss; 1 when the loss is
                            the value backward() is called on
        :return: (dL/dpred, None); no gradient is computed for the label
        '''
        pred, label = ctx.saved_tensors
        # d/dpred of (pred - label)^2 is 2 * (pred - label)
        return grad_output * 2 * (pred - label), None
|
||||
|
||||
#TODO 1: Complete the CrossEntropyLoss loss function
|
||||
class CrossEntropyLoss(torch.autograd.Function):
    '''
    Cross entropy loss function:
        loss = - log q_i
    where
        q_i = softmax(z_i) = exp(z_i) / (exp(z_0) + exp(z_1) + ...)

    However, when z_i has a larger value, exp(z_i) might become infinity.
    So we use stable softmax:
        softmax(z_i) = A exp(z_i) / A (exp(z_0) + exp(z_1) + ...)
    where
        A = exp(-z_max) = exp(-max{z_0, z_1, ...})
    therefore we have
        softmax(z_i) = softmax(z_i - z_max)

    The loss itself is evaluated in the log domain,
        log q_i = (z_i - z_max) - log(sum_j exp(z_j - z_max)),
    so that a probability which underflows to 0 does not turn into log(0) = -inf.
    '''

    @staticmethod
    def forward(ctx, logits, label):
        """
        :param ctx: context object; softmax output and one-hot label are saved for backward
        :param logits: logits with shape [batch_size, n_classes], denoted by "z" in the above formula
        :param label: groundtruth class indices (int64) with shape [batch_size],
                      where 0 <= label[i] <= n_classes - 1
        :return: cross entropy loss, averaged by batch_size
        """

        # step 1: stable log-softmax
        z_max = torch.max(logits, 1, keepdim=True).values  # [batch_size, 1]
        shifted = logits - z_max                            # [batch_size, n_classes]
        log_sums = torch.log(torch.sum(torch.exp(shifted), 1, keepdim=True))  # [batch_size, 1]
        log_q = shifted - log_sums                          # log softmax(z)
        q = torch.exp(log_q)                                # softmax(z)

        # step 2: convert label into one-hot version
        # e.g., if label = [0, 2] and n_classes=4, then the one-hot version is [[1,0,0,0], [0,0,1,0]]
        one_hot_label = torch.nn.functional.one_hot(label, logits.size()[1])

        # step 3: pick log q of the true class for every sample and average over the batch
        batch_size = label.size()[0]
        picked = log_q.gather(1, label.reshape(-1, 1))      # [batch_size, 1]
        cross_entropy = -torch.sum(picked) / batch_size

        ctx.save_for_backward(q, one_hot_label)

        return cross_entropy

    @staticmethod
    def backward(ctx, grad_output):
        """
        :param ctx: context object carrying q (softmax output) and the one-hot label
        :param grad_output: gradient flowing into the loss; 1 when the loss is
                            the value backward() is called on
        :return: (dL/dz, None); no gradient is computed for the label
        """

        # step 4: dL/dz = (q - one_hot) / batch_size; the division matches the
        # batch averaging done in forward
        q, one_hot_label = ctx.saved_tensors
        grad_input = grad_output * (q - one_hot_label) / q.size()[0]

        # return dL/dz and None for dL/dlabel since we do not need to compute dL/dlabel
        return grad_input, None
|
||||
@@ -1,156 +1,156 @@
|
||||
#========================================================
|
||||
# Media and Cognition
|
||||
# Homework 1 Neural network basics
|
||||
# network.py - linear layer and MLP network
|
||||
# Student ID: 2022010639
|
||||
# Name: Gao Yixuan
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
#========================================================
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from activations import Activation
|
||||
|
||||
'''
|
||||
In this script we will implement our Linear layer and MLP network.
|
||||
For the linear layer, we will provide a sample of codes which calculate both the forward and backward processes by our own.
|
||||
More details about customizing a backward process can be found in:
|
||||
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
|
||||
For the MLP network, you should cascade the linear layers and activation functions in a proper way in the __init__ function and implement the forward function.
|
||||
'''
|
||||
|
||||
|
||||
class LinearFunction(torch.autograd.Function):
    '''
    we will implement the linear function:
        y = xW^T + b
    as well as its gradient computation process
    '''

    @staticmethod
    def forward(ctx, x, W, b):
        '''
        Input:
        :param ctx: a context object that can be used to stash information for backward computation
        :param x: input features with size [batch_size, input_size]
        :param W: weight matrix with size [output_size, input_size]
        :param b: bias with size [output_size]
        Return:
        y :output features with size [batch_size, output_size]
        '''
        y = torch.matmul(x, W.T) + b

        # b is not needed in backward: dL/db depends on grad_output only
        ctx.save_for_backward(x, W)

        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Input:
        :param ctx: a context object with saved tensors
        :param grad_output: dL/dy, with size [batch_size, output_size]
        Return:
        grad_input: dL/dx, with size [batch_size, input_size]
        grad_W: dL/dW, with size [output_size, input_size], summed for data in the batch
        grad_b: dL/db, with size [output_size], summed for data in the batch
        '''
        # saved_tensors is the supported accessor; saved_variables is deprecated
        x, W = ctx.saved_tensors

        # dL/dx = dL/dy * W
        grad_input = torch.matmul(grad_output, W)
        # dL/dW = (dL/dy)^T * x, which sums the per-sample outer products
        grad_W = torch.matmul(grad_output.T, x)
        # dL/db = dL/dy summed over the batch dimension
        grad_b = grad_output.sum(0)

        return grad_input, grad_W, grad_b
|
||||
|
||||
|
||||
class Linear(nn.Module):
    '''
    Linear layer whose forward/backward computation is delegated to LinearFunction.
    '''

    def __init__(self, input_size, output_size):
        '''
        :param input_size: dimension of input features
        :param output_size: dimension of output features
        '''
        super().__init__()

        # weight: standard-normal samples with shape [output_size, input_size]
        self.W = nn.Parameter(torch.randn(output_size, input_size).float(), requires_grad=True)
        # bias: zeros with shape [output_size]
        self.b = nn.Parameter(torch.zeros(output_size).float(), requires_grad=True)

    def forward(self, x):
        '''
        :param x: input features passed to LinearFunction together with W and b
        :return: result of LinearFunction.apply(x, W, b)
        '''
        weight, bias = self.W, self.b
        return LinearFunction.apply(x, weight, bias)
|
||||
|
||||
class MLP(nn.Module):
    '''
    Multilayer perceptron assembled from the custom Linear and Activation modules.
    '''

    def __init__(self, input_size, output_size, hidden_size, n_layers, act_type):
        '''
        :param input_size: dimension of input features
        :param output_size: dimension of output features
        :param hidden_size: a list containing hidden size for each hidden layer
        :param n_layers: number of layers (hidden layers + 1 output layer)
        :param act_type: activation type for the hidden layers, forwarded to Activation
        '''
        super().__init__()

        # total layer number should be hidden layer number + 1 (output layer)
        assert len(hidden_size) + 1 == n_layers, 'total layer number should be hidden layer number + 1'

        # one activation module, shared by every hidden layer
        self.act = Activation(act_type)

        stack = []
        if n_layers == 1:
            # no hidden layer: the MLP degenerates to a single linear layer
            stack.append(Linear(input_size, output_size))
        else:
            # hidden layers: Linear followed by the activation, chained so each
            # layer's input width equals the previous layer's output width
            widths = [input_size] + list(hidden_size)
            for fan_in, fan_out in zip(widths[:-1], widths[1:]):
                stack.append(Linear(fan_in, fan_out))
                stack.append(self.act)
            # output layer maps the last hidden width to output_size
            stack.append(Linear(hidden_size[-1], output_size))

        # wrap the layers, in order, into a sequential network
        self.network = torch.nn.Sequential(*stack)

    def forward(self, x):
        '''
        :param x: input features with size [batch_size, input_size]
        :return: output features with size [batch_size, output_size]
        '''
        return self.network(x)
|
||||
#========================================================
|
||||
# Media and Cognition
|
||||
# Homework 1 Neural network basics
|
||||
# network.py - linear layer and MLP network
|
||||
# Student ID: 2022010639
|
||||
# Name: Gao Yixuan
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
#========================================================
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from activations import Activation
|
||||
|
||||
'''
|
||||
In this script we will implement our Linear layer and MLP network.
|
||||
For the linear layer, we will provide a sample of codes which calculate both the forward and backward processes by our own.
|
||||
More details about customizing a backward process can be found in:
|
||||
https://pytorch.org/tutorials/beginner/examples_autograd/two_layer_net_custom_function.html
|
||||
For the MLP network, you should cascade the linear layers and activation functions in a proper way in the __init__ function and implement the forward function.
|
||||
'''
|
||||
|
||||
|
||||
class LinearFunction(torch.autograd.Function):
    '''
    we will implement the linear function:
        y = xW^T + b
    as well as its gradient computation process
    '''

    @staticmethod
    def forward(ctx, x, W, b):
        '''
        Input:
        :param ctx: a context object that can be used to stash information for backward computation
        :param x: input features with size [batch_size, input_size]
        :param W: weight matrix with size [output_size, input_size]
        :param b: bias with size [output_size]
        Return:
        y :output features with size [batch_size, output_size]
        '''
        y = torch.matmul(x, W.T) + b

        # b is not needed in backward: dL/db depends on grad_output only
        ctx.save_for_backward(x, W)

        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Input:
        :param ctx: a context object with saved tensors
        :param grad_output: dL/dy, with size [batch_size, output_size]
        Return:
        grad_input: dL/dx, with size [batch_size, input_size]
        grad_W: dL/dW, with size [output_size, input_size], summed for data in the batch
        grad_b: dL/db, with size [output_size], summed for data in the batch
        '''
        # saved_tensors is the supported accessor; saved_variables is deprecated
        x, W = ctx.saved_tensors

        # dL/dx = dL/dy * W
        grad_input = torch.matmul(grad_output, W)
        # dL/dW = (dL/dy)^T * x, which sums the per-sample outer products
        grad_W = torch.matmul(grad_output.T, x)
        # dL/db = dL/dy summed over the batch dimension
        grad_b = grad_output.sum(0)

        return grad_input, grad_W, grad_b
|
||||
|
||||
|
||||
class Linear(nn.Module):
    '''
    Linear layer whose forward/backward computation is delegated to LinearFunction.
    '''

    def __init__(self, input_size, output_size):
        '''
        :param input_size: dimension of input features
        :param output_size: dimension of output features
        '''
        super().__init__()

        # weight: standard-normal samples with shape [output_size, input_size]
        self.W = nn.Parameter(torch.randn(output_size, input_size).float(), requires_grad=True)
        # bias: zeros with shape [output_size]
        self.b = nn.Parameter(torch.zeros(output_size).float(), requires_grad=True)

    def forward(self, x):
        '''
        :param x: input features passed to LinearFunction together with W and b
        :return: result of LinearFunction.apply(x, W, b)
        '''
        weight, bias = self.W, self.b
        return LinearFunction.apply(x, weight, bias)
|
||||
|
||||
class MLP(nn.Module):
    '''
    Multilayer perceptron assembled from the custom Linear layers above, with
    one shared activation module applied after every hidden layer.
    '''

    def __init__(self, input_size, output_size, hidden_size, n_layers, act_type):
        '''
        :param input_size: dimension of input features
        :param output_size: dimension of output features
        :param hidden_size: a list containing hidden size for each hidden layer
        :param n_layers: number of layers (hidden layers + the output layer)
        :param act_type: type of activation function for each hidden layer, can be none, sigmoid, tanh, or relu
        '''
        super().__init__()

        # every hidden layer needs a size, and the output layer adds one more layer
        assert len(hidden_size) + 1 == n_layers, 'total layer number should be hidden layer number + 1'

        # one activation instance from activations.py, reused after each hidden layer
        self.act = Activation(act_type)

        stack = nn.ModuleList()
        if n_layers == 1:
            # no hidden layer: the MLP degenerates to a single linear map
            stack.append(Linear(input_size, output_size))
        else:
            fan_in = input_size
            for fan_out in (hidden_size[i] for i in range(n_layers - 1)):
                # a hidden layer = linear layer followed by the activation
                stack.append(Linear(fan_in, fan_out))
                stack.append(self.act)
                fan_in = fan_out
            # the output layer maps the last hidden width to output_size
            stack.append(Linear(hidden_size[-1], output_size))

        # chain the layers in order; sub-modules are named "0", "1", ...
        self.network = torch.nn.Sequential(*stack)

    def forward(self, x):
        '''
        :param x: input features with size [batch_size, input_size]
        :return: output features with size [batch_size, output_size]
        '''
        return self.network(x)
|
||||
|
||||
@@ -1,397 +1,397 @@
|
||||
#========================================================
|
||||
# Media and Cognition
|
||||
# Homework 1 Neural network basics
|
||||
# recognition.py - character classification
|
||||
# Student ID: 2022010639
|
||||
# Name: Gao Yixuan
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
#========================================================
|
||||
|
||||
# ==== Part 0: import libs
|
||||
import torch
|
||||
import torch.optim as optim
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
import json, cv2, os, string
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import numpy as np
|
||||
|
||||
# this time we implement our networks and loss functions in other python script, and import them here
|
||||
from network import MLP
|
||||
from losses import CrossEntropyLoss
|
||||
|
||||
# argparse is used to conveniently set our configurations
|
||||
import argparse
|
||||
|
||||
# ==== Part 1: data loader
|
||||
|
||||
# construct a dataset and a data loader, more details can be found in
|
||||
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataloader
|
||||
|
||||
class ListDataset(Dataset):
    '''
    Dataset of single-letter images described by a json file that maps
    image file names to their letter labels.
    '''

    def __init__(self, im_dir, file_path, norm_size=(32, 32)):
        '''
        :param im_dir: path to directory with images
        :param file_path: json file containing image names and labels
        :param norm_size: image normalization size, (height, width)
        '''
        # this time we will try to recognize 26 English letters (case-insensitive)
        letters = string.ascii_letters[-26:]  # ABCD...XYZ
        self.alphabet = {letter: idx for idx, letter in enumerate(letters)}
        self.norm_size = norm_size

        with open(file_path, 'r') as f:
            imgs = json.load(f)

        self.im_paths = [os.path.join(im_dir, im_name) for im_name in imgs.keys()]
        self.labels = list(imgs.values())

    def __len__(self):
        # total number of samples in the dataset
        return len(self.im_paths)

    def __getitem__(self, index):
        '''
        :param index: position of the sample in the json listing
        :return: (image, label); image is a numpy array with values in [-1, 1],
                 label is the letter index 0..25
        '''
        # valid positions are 0 .. len - 1 (the original `<=` let index == len through)
        assert index < len(self), 'index range error'

        # read an image and convert it to grey scale
        im_path = self.im_paths[index]
        im = cv2.imread(im_path)
        if im is None:
            # cv2.imread signals a missing/unreadable file by returning None
            raise FileNotFoundError('cannot read image: {}'.format(im_path))
        im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

        # cv2.resize expects (width, height) while norm_size is (height, width)
        im = cv2.resize(im, (self.norm_size[1], self.norm_size[0]))
        # scale pixel values from [0, 255] to [-1, 1]
        im = np.divide(im, 255.)
        im = (im - 0.5) * 2.0

        # labels are case-insensitive: normalise to uppercase, then map to an index
        label = self.alphabet[self.labels[index].upper()]

        return im, label
|
||||
|
||||
|
||||
|
||||
def dataLoader(im_dir, file_path, norm_size, batch_size, workers=0):
    '''
    Build a torch DataLoader over the images listed in a json file.

    :param im_dir: path to directory with images
    :param file_path: file with image paths and labels
    :param norm_size: image normalization size, (height, width)
    :param batch_size: batch size
    :param workers: number of workers for loading data in multiple threads
    :return: a data loader
    '''
    samples = ListDataset(im_dir, file_path, norm_size)

    # shuffle images only when training, i.e. when the list file name contains 'train'
    is_training_list = 'train' in file_path

    return DataLoader(samples,
                      batch_size=batch_size,
                      shuffle=is_training_list,
                      num_workers=workers)
|
||||
|
||||
|
||||
# ==== Part 2: training, validation and testing
|
||||
|
||||
def train_val(model, trainloader, valloader, n_epochs,
              lr, optim_type, momentum, weight_decay,
              valInterval, device='cpu', model_path=None):
    '''
    The main training procedure
    ----------------------------
    :param model: the MLP model
    :param trainloader: the dataloader of the train set
    :param valloader: the dataloader of the validation set
    :param n_epochs: number of training epochs
    :param lr: learning rate
    :param optim_type: optimizer, can be 'sgd', 'adagrad', 'rmsprop', 'adam', or 'adadelta'
    :param momentum: only used if optim_type == 'sgd'
    :param weight_decay: the factor of L2 penalty on network weights
    :param valInterval: the frequency of validation, e.g., if valInterval = 5, then do validation after each 5 training epochs
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    :param model_path: where to save the trained parameters; when None, the
                       module-level `opt.model_path` is used (original behaviour)
    :raises NotImplementedError: if optim_type is not one of the supported names
    '''
    # resolve the save path before training so a missing setting fails early
    if model_path is None:
        model_path = opt.model_path

    # define the cross entropy loss function.
    ce_loss = CrossEntropyLoss.apply

    # optimizer
    if optim_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay)
    elif optim_type == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr, weight_decay=weight_decay)
    else:
        print('[Error] optim_type should be one of sgd, adagrad, rmsprop, adam, or adadelta')
        raise NotImplementedError

    # average training loss of every epoch, used for the loss curve
    losses = []

    for epoch in range(n_epochs):
        # set the model in training mode
        model.train()

        # accumulated loss over the batches of this epoch
        total_loss = 0.

        for data, labels in trainloader:  # get a batch of data
            # step 1: set data type and device
            data = data.type(torch.float32).to(device)
            labels = labels.to(device)

            # step 2: convert each image to a vector as the input of the MLP
            data = torch.flatten(data, start_dim=1)

            # hint: clear gradients left over from the previous iteration
            optimizer.zero_grad()

            # step 3: forward pass
            output = model(data)

            # step 4: compute the loss and back-propagate
            loss = ce_loss(output, labels)
            loss.backward()

            # step 5: loss.item() returns a plain python number, detached from the graph
            total_loss += loss.item()

            # step 6: update the parameters of the model
            optimizer.step()

        # average of the total loss over the iterations of this epoch
        avg_loss = total_loss / len(trainloader)
        losses.append(avg_loss)
        print('Epoch {:02d}: loss = {:.3f}'.format(epoch + 1, avg_loss))

        # validation
        if (epoch + 1) % valInterval == 0:
            val_acc = test(model, valloader, device)
            # show prediction accuracy
            print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(epoch + 1, 100 * val_acc))

    # make sure the target directory exists, otherwise torch.save fails after training
    save_dir = os.path.dirname(model_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # save model parameters in a file
    torch.save({'state_dict': model.state_dict()}, model_path)
    print('Model saved in {}\n'.format(model_path))

    # draw the loss curve
    plot_loss(losses)
|
||||
|
||||
|
||||
def test(model, testloader, device):
|
||||
'''
|
||||
The testing procedure
|
||||
----------------------------
|
||||
:param model: the MLP model
|
||||
:param testloader: the dataloader to be tested/validated
|
||||
:param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
|
||||
'''
|
||||
# set the model in evaluation mode
|
||||
model.eval()
|
||||
|
||||
n_correct = 0. # number of images that are correctly classified
|
||||
n_imgs = 0. # number of total images
|
||||
|
||||
with torch.no_grad(): # we do not need to compute gradients during validation
|
||||
|
||||
#TODO 3: get the prediction of the data and calculate the accuracy
|
||||
for imgs, labels in testloader:
|
||||
# step 1: set data type and device
|
||||
# imgs = torch.from_numpy(imgs)
|
||||
imgs = imgs.type(torch.float32)
|
||||
imgs = imgs.to(device)
|
||||
labels = labels.to(device)
|
||||
|
||||
# step 2: convert an image to a vector as the input of the MLP
|
||||
imgs = torch.flatten(imgs, start_dim=1)
|
||||
|
||||
# step 3: run the model which is the forward process
|
||||
output = model(imgs)
|
||||
|
||||
# step 4: get the predicted value by the output using out.argmax(1)
|
||||
pred = output.argmax(1)
|
||||
|
||||
# step 5: sum up the number of images correctly recognized and the total image number
|
||||
for predict, label in zip(pred, labels):
|
||||
if predict == label:
|
||||
n_correct += 1
|
||||
n_imgs += 1
|
||||
|
||||
accuracy = n_correct / n_imgs
|
||||
return accuracy
|
||||
|
||||
|
||||
# ==== Part 3: predict new images
|
||||
def predict(model, im_path, norm_size, device):
    '''
    The predicting procedure
    ---------------
    :param model: the MLP model
    :param im_path: path of an image
    :param norm_size: image normalization size, (height, width)
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    :return: the predicted uppercase letter (it is also printed)
    :raises FileNotFoundError: if the image cannot be read
    '''
    # enter the evaluation mode
    model.eval()

    # image pre-processing, the same steps as in ListDataset
    im = cv2.imread(im_path)
    if im is None:
        # cv2.imread signals a missing/unreadable file by returning None
        raise FileNotFoundError('cannot read image: {}'.format(im_path))
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

    # cv2.resize expects (width, height) while norm_size is (height, width)
    im = cv2.resize(im, (norm_size[1], norm_size[0]))
    # scale pixel values from [0, 255] to [-1, 1]
    im = np.divide(im, 255.)
    im = (im - 0.5) * 2.0

    # convert im from numpy.ndarray to torch.tensor
    im = torch.from_numpy(im)

    # feed the flattened image to the model as a batch of one
    with torch.no_grad():
        inputs = im.view(1, -1).type(torch.float32).to(device)
        out = model(inputs)
        prediction = out.argmax(1)[0].item()

    # convert index of prediction to the corresponding character
    letters = string.ascii_letters[-26:]  # ABCD...XYZ
    prediction = letters[prediction]

    print('Prediction: {}'.format(prediction))
    return prediction
|
||||
|
||||
|
||||
# ==== Part 4: draw the loss curve
|
||||
def plot_loss(losses):
    '''
    Show the training loss curve in a matplotlib window.

    :param losses: list of losses for each epoch
    :return:
    '''
    _, axes = plt.subplots()

    # one point per epoch, x axis is the position in the list
    axes.plot(losses)

    # axis captions
    axes.set_xlabel('training epoch')
    axes.set_ylabel('loss')

    # display the figure
    plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    def _parse_norm_size(text):
        # argparse hands over the raw command-line string; `type=tuple` would split
        # it into single characters, so parse an explicit "H,W" pair instead
        try:
            height, width = (int(part) for part in text.split(','))
        except ValueError:
            raise argparse.ArgumentTypeError("norm_size should look like 'H,W', e.g. 32,32")
        return (height, width)

    # set random seed for reproducibility
    seed = 2023
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

    # set configurations
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='train', help='train, test or predict')
    parser.add_argument('--im_dir', type=str, default='data/character_classification/images',
                        help='path to directory with images')
    parser.add_argument('--train_file_path', type=str, default='data/character_classification/train.json',
                        help='file list of training image paths and labels')
    parser.add_argument('--val_file_path', type=str, default='data/character_classification/validation.json',
                        help='file list of validation image paths and labels')
    parser.add_argument('--test_file_path', type=str, default='data/character_classification/test.json',
                        help='file list of test image paths and labels')
    parser.add_argument('--batchsize', type=int, default=8, help='batch size')
    parser.add_argument('--device', type=str, default='cpu', help='cpu or cuda')

    # configurations for training
    parser.add_argument('--hsize', type=str, default='32', help='hidden size for each hidden layer, splitted by comma')
    parser.add_argument('--layer', type=int, default=2, help='number of layers in the MLP')
    parser.add_argument('--act', type=str, default='relu',
                        help='type of activation function, can be sigmoid, tanh, or relu')
    parser.add_argument('--norm_size', type=_parse_norm_size, default=(32, 32), help='image normalization size, (height, width)')
    parser.add_argument('--epoch', type=int, default=50, help='number of training epochs')
    parser.add_argument('--n_classes', type=int, default=26, help='number of classes')
    parser.add_argument('--valInterval', type=int, default=10, help='the frequency of validation')
    parser.add_argument('--lr', type=float, default=5e-4, help='learning rate')
    parser.add_argument('--optim_type', type=str, default='sgd', help='type of optimizer, can be sgd, adagrad, rmsprop, adam, or adadelta')
    parser.add_argument('--momentum', type=float, default=0.9, help='momentum of the SGD optimizer, only used if optim_type is sgd')
    parser.add_argument('--weight_decay', type=float, default=0., help='the factor of L2 penalty on network weights')

    # configurations for test and prediction
    parser.add_argument('--model_path', type=str, default='saved_models/recognition.pth', help='path of a saved model')
    parser.add_argument('--im_path', type=str, default='data/character_classification/new_images/predict01.png',
                        help='path of an image to be recognized')

    opt = parser.parse_args()

    # initialize the MLP model: each image of shape [norm_size[0], norm_size[1]] is
    # flattened to a vector, and the output size follows --n_classes (default 26)
    hidden_sizes = [int(num) for num in opt.hsize.split(',')]
    model = MLP(opt.norm_size[0] * opt.norm_size[1], opt.n_classes, hidden_sizes, opt.layer, opt.act)

    # for the 'test' and 'predict' mode, we should load the saved checkpoint into the model
    if opt.mode == 'test' or opt.mode == 'predict':
        # load onto the CPU first; the model is moved to opt.device just below
        checkpoint = torch.load(opt.model_path, map_location='cpu')
        # load model parameters we saved in model_path
        model.load_state_dict(checkpoint['state_dict'])
        print('[Info] Load model from {}'.format(opt.model_path))

    # put the model on CPU or GPU according to the device in args
    model = model.to(opt.device)

    # -- run the code for training and validation
    if opt.mode == 'train':
        # training and validation data loader
        trainloader = dataLoader(opt.im_dir, opt.train_file_path, opt.norm_size, opt.batchsize)
        valloader = dataLoader(opt.im_dir, opt.val_file_path, opt.norm_size, opt.batchsize)
        train_val(model, trainloader, valloader,
                  n_epochs=opt.epoch,
                  lr=opt.lr,
                  optim_type=opt.optim_type,
                  momentum=opt.momentum,
                  weight_decay=opt.weight_decay,
                  valInterval=opt.valInterval,
                  device=opt.device)

    # -- test the saved model
    elif opt.mode == 'test':
        testloader = dataLoader(opt.im_dir, opt.test_file_path, opt.norm_size, opt.batchsize)
        acc = test(model, testloader, opt.device)
        print('[Info] Test accuracy = {:.1f}%'.format(100 * acc))

    # -- predict a new image
    elif opt.mode == 'predict':
        predict(model, im_path=opt.im_path, norm_size=opt.norm_size, device=opt.device)

    else:
        print('mode should be train, test, or predict')
        raise NotImplementedError
|
||||
#========================================================
|
||||
# Media and Cognition
|
||||
# Homework 1 Neural network basics
|
||||
# recognition.py - character classification
|
||||
# Student ID: 2022010639
|
||||
# Name: Gao Yixuan
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
#========================================================
|
||||
|
||||
# ==== Part 0: import libs
|
||||
import torch
|
||||
import torch.optim as optim
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
import json, cv2, os, string
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import numpy as np
|
||||
|
||||
# this time we implement our networks and loss functions in other python script, and import them here
|
||||
from network import MLP
|
||||
from losses import CrossEntropyLoss
|
||||
|
||||
# argparse is used to conveniently set our configurations
|
||||
import argparse
|
||||
|
||||
# ==== Part 1: data loader
|
||||
|
||||
# construct a dataset and a data loader, more details can be found in
|
||||
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataloader
|
||||
|
||||
class ListDataset(Dataset):
    '''
    Dataset of single-letter images described by a json file that maps
    image file names to their letter labels.
    '''

    def __init__(self, im_dir, file_path, norm_size=(32, 32)):
        '''
        :param im_dir: path to directory with images
        :param file_path: json file containing image names and labels
        :param norm_size: image normalization size, (height, width)
        '''
        # this time we will try to recognize 26 English letters (case-insensitive)
        letters = string.ascii_letters[-26:]  # ABCD...XYZ
        self.alphabet = {letter: idx for idx, letter in enumerate(letters)}
        self.norm_size = norm_size

        with open(file_path, 'r') as f:
            imgs = json.load(f)

        self.im_paths = [os.path.join(im_dir, im_name) for im_name in imgs.keys()]
        self.labels = list(imgs.values())

    def __len__(self):
        # total number of samples in the dataset
        return len(self.im_paths)

    def __getitem__(self, index):
        '''
        :param index: position of the sample in the json listing
        :return: (image, label); image is a numpy array with values in [-1, 1],
                 label is the letter index 0..25
        '''
        # valid positions are 0 .. len - 1 (the original `<=` let index == len through)
        assert index < len(self), 'index range error'

        # read an image and convert it to grey scale
        im_path = self.im_paths[index]
        im = cv2.imread(im_path)
        if im is None:
            # cv2.imread signals a missing/unreadable file by returning None
            raise FileNotFoundError('cannot read image: {}'.format(im_path))
        im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

        # cv2.resize expects (width, height) while norm_size is (height, width)
        im = cv2.resize(im, (self.norm_size[1], self.norm_size[0]))
        # scale pixel values from [0, 255] to [-1, 1]
        im = np.divide(im, 255.)
        im = (im - 0.5) * 2.0

        # labels are case-insensitive: normalise to uppercase, then map to an index
        label = self.alphabet[self.labels[index].upper()]

        return im, label
|
||||
|
||||
|
||||
|
||||
def dataLoader(im_dir, file_path, norm_size, batch_size, workers=0):
    '''
    Build a torch DataLoader over the images listed in a json file.

    :param im_dir: path to directory with images
    :param file_path: file with image paths and labels
    :param norm_size: image normalization size, (height, width)
    :param batch_size: batch size
    :param workers: number of workers for loading data in multiple threads
    :return: a data loader
    '''
    samples = ListDataset(im_dir, file_path, norm_size)

    # shuffle images only when training, i.e. when the list file name contains 'train'
    is_training_list = 'train' in file_path

    return DataLoader(samples,
                      batch_size=batch_size,
                      shuffle=is_training_list,
                      num_workers=workers)
|
||||
|
||||
|
||||
# ==== Part 2: training, validation and testing
|
||||
|
||||
def train_val(model, trainloader, valloader, n_epochs,
              lr, optim_type, momentum, weight_decay,
              valInterval, device='cpu', model_path=None):
    '''
    The main training procedure
    ----------------------------
    :param model: the MLP model
    :param trainloader: the dataloader of the train set
    :param valloader: the dataloader of the validation set
    :param n_epochs: number of training epochs
    :param lr: learning rate
    :param optim_type: optimizer, can be 'sgd', 'adagrad', 'rmsprop', 'adam', or 'adadelta'
    :param momentum: only used if optim_type == 'sgd'
    :param weight_decay: the factor of L2 penalty on network weights
    :param valInterval: the frequency of validation, e.g., if valInterval = 5, then do validation after each 5 training epochs
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    :param model_path: where to save the trained parameters; when None, the
                       module-level `opt.model_path` is used (original behaviour)
    :raises NotImplementedError: if optim_type is not one of the supported names
    '''
    # resolve the save path before training so a missing setting fails early
    if model_path is None:
        model_path = opt.model_path

    # define the cross entropy loss function.
    ce_loss = CrossEntropyLoss.apply

    # optimizer
    if optim_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay)
    elif optim_type == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
    elif optim_type == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), lr, weight_decay=weight_decay)
    else:
        print('[Error] optim_type should be one of sgd, adagrad, rmsprop, adam, or adadelta')
        raise NotImplementedError

    # average training loss of every epoch, used for the loss curve
    losses = []

    for epoch in range(n_epochs):
        # set the model in training mode
        model.train()

        # accumulated loss over the batches of this epoch
        total_loss = 0.

        for data, labels in trainloader:  # get a batch of data
            # step 1: set data type and device
            data = data.type(torch.float32).to(device)
            labels = labels.to(device)

            # step 2: convert each image to a vector as the input of the MLP
            data = torch.flatten(data, start_dim=1)

            # hint: clear gradients left over from the previous iteration
            optimizer.zero_grad()

            # step 3: forward pass
            output = model(data)

            # step 4: compute the loss and back-propagate
            loss = ce_loss(output, labels)
            loss.backward()

            # step 5: loss.item() returns a plain python number, detached from the graph
            total_loss += loss.item()

            # step 6: update the parameters of the model
            optimizer.step()

        # average of the total loss over the iterations of this epoch
        avg_loss = total_loss / len(trainloader)
        losses.append(avg_loss)
        print('Epoch {:02d}: loss = {:.3f}'.format(epoch + 1, avg_loss))

        # validation
        if (epoch + 1) % valInterval == 0:
            val_acc = test(model, valloader, device)
            # show prediction accuracy
            print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(epoch + 1, 100 * val_acc))

    # make sure the target directory exists, otherwise torch.save fails after training
    save_dir = os.path.dirname(model_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # save model parameters in a file
    torch.save({'state_dict': model.state_dict()}, model_path)
    print('Model saved in {}\n'.format(model_path))

    # draw the loss curve
    plot_loss(losses)
|
||||
|
||||
|
||||
def test(model, testloader, device):
|
||||
'''
|
||||
The testing procedure
|
||||
----------------------------
|
||||
:param model: the MLP model
|
||||
:param testloader: the dataloader to be tested/validated
|
||||
:param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
|
||||
'''
|
||||
# set the model in evaluation mode
|
||||
model.eval()
|
||||
|
||||
n_correct = 0. # number of images that are correctly classified
|
||||
n_imgs = 0. # number of total images
|
||||
|
||||
with torch.no_grad(): # we do not need to compute gradients during validation
|
||||
|
||||
#TODO 3: get the prediction of the data and calculate the accuracy
|
||||
for imgs, labels in testloader:
|
||||
# step 1: set data type and device
|
||||
# imgs = torch.from_numpy(imgs)
|
||||
imgs = imgs.type(torch.float32)
|
||||
imgs = imgs.to(device)
|
||||
labels = labels.to(device)
|
||||
|
||||
# step 2: convert an image to a vector as the input of the MLP
|
||||
imgs = torch.flatten(imgs, start_dim=1)
|
||||
|
||||
# step 3: run the model which is the forward process
|
||||
output = model(imgs)
|
||||
|
||||
# step 4: get the predicted value by the output using out.argmax(1)
|
||||
pred = output.argmax(1)
|
||||
|
||||
# step 5: sum up the number of images correctly recognized and the total image number
|
||||
for predict, label in zip(pred, labels):
|
||||
if predict == label:
|
||||
n_correct += 1
|
||||
n_imgs += 1
|
||||
|
||||
accuracy = n_correct / n_imgs
|
||||
return accuracy
|
||||
|
||||
|
||||
# ==== Part 3: predict new images
|
||||
def predict(model, im_path, norm_size, device):
    '''
    The predicting procedure
    ---------------
    :param model: the MLP model
    :param im_path: path of an image
    :param norm_size: image normalization size, (height, width)
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    :return: the predicted uppercase letter (it is also printed)
    :raises FileNotFoundError: if the image cannot be read
    '''
    # enter the evaluation mode
    model.eval()

    # image pre-processing, the same steps as in ListDataset
    im = cv2.imread(im_path)
    if im is None:
        # cv2.imread signals a missing/unreadable file by returning None
        raise FileNotFoundError('cannot read image: {}'.format(im_path))
    im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)

    # cv2.resize expects (width, height) while norm_size is (height, width)
    im = cv2.resize(im, (norm_size[1], norm_size[0]))
    # scale pixel values from [0, 255] to [-1, 1]
    im = np.divide(im, 255.)
    im = (im - 0.5) * 2.0

    # convert im from numpy.ndarray to torch.tensor
    im = torch.from_numpy(im)

    # feed the flattened image to the model as a batch of one
    with torch.no_grad():
        inputs = im.view(1, -1).type(torch.float32).to(device)
        out = model(inputs)
        prediction = out.argmax(1)[0].item()

    # convert index of prediction to the corresponding character
    letters = string.ascii_letters[-26:]  # ABCD...XYZ
    prediction = letters[prediction]

    print('Prediction: {}'.format(prediction))
    return prediction
|
||||
|
||||
|
||||
# ==== Part 4: draw the loss curve
|
||||
def plot_loss(losses):
    '''
    Draw the loss curve in a new figure and display it
    :param losses: list of losses for each epoch
    :return:
    '''

    # only the axes object is needed, the figure handle is discarded
    _, axes = plt.subplots()

    # one point per epoch, the x coordinate is the position in the list
    axes.plot(losses)

    # axis captions
    axes.set_xlabel('training epoch')
    axes.set_ylabel('loss')

    # display the figure
    plt.show()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2023
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

    # set configurations
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='train', help='train, test or predict')
    parser.add_argument('--im_dir', type=str, default='data/character_classification/images',
                        help='path to directory with images')
    parser.add_argument('--train_file_path', type=str, default='data/character_classification/train.json',
                        help='file list of training image paths and labels')
    parser.add_argument('--val_file_path', type=str, default='data/character_classification/validation.json',
                        help='file list of validation image paths and labels')
    parser.add_argument('--test_file_path', type=str, default='data/character_classification/test.json',
                        help='file list of test image paths and labels')
    parser.add_argument('--batchsize', type=int, default=8, help='batch size')
    parser.add_argument('--device', type=str, default='cpu', help='cpu or cuda')

    # configurations for training
    parser.add_argument('--hsize', type=str, default='32', help='hidden size for each hidden layer, splitted by comma')
    parser.add_argument('--layer', type=int, default=2, help='number of layers in the MLP')
    parser.add_argument('--act', type=str, default='relu',
                        help='type of activation function, can be sigmoid, tanh, or relu')
    # two integers, e.g. `--norm_size 32 32`; argparse's `type=tuple` would split the raw
    # command-line string into single characters instead of parsing numbers
    parser.add_argument('--norm_size', type=int, nargs=2, default=(32, 32), metavar=('HEIGHT', 'WIDTH'),
                        help='image normalization size, (height, width)')
    parser.add_argument('--epoch', type=int, default=50, help='number of training epochs')
    parser.add_argument('--n_classes', type=int, default=26, help='number of classes')
    parser.add_argument('--valInterval', type=int, default=10, help='the frequency of validation')
    parser.add_argument('--lr', type=float, default=5e-4, help='learning rate')
    parser.add_argument('--optim_type', type=str, default='sgd', help='type of optimizer, can be sgd, adagrad, rmsprop, adam, or adadelta')
    parser.add_argument('--momentum', type=float, default=0.9, help='momentum of the SGD optimizer, only used if optim_type is sgd')
    parser.add_argument('--weight_decay', type=float, default=0., help='the factor of L2 penalty on network weights')

    # configurations for test and prediction
    parser.add_argument('--model_path', type=str, default='saved_models/recognition.pth', help='path of a saved model')
    parser.add_argument('--im_path', type=str, default='data/character_classification/new_images/predict01.png',
                        help='path of an image to be recognized')

    opt = parser.parse_args()
    # nargs=2 yields a list when the option is given; keep the tuple type used by the default
    opt.norm_size = tuple(opt.norm_size)

    # initialize the MLP model
    # each image has shape [norm_size[0], norm_size[1]] and is flattened to a vector,
    # so the input size of the MLP is the number of pixels
    hidden_sizes = [int(num) for num in opt.hsize.split(',')]
    model = MLP(opt.norm_size[0] * opt.norm_size[1], opt.n_classes, hidden_sizes, opt.layer, opt.act)

    # for the 'test' and 'predict' mode, we should load the saved checkpoint into the model
    if opt.mode == 'test' or opt.mode == 'predict':
        # always deserialize onto the CPU; the model is moved to opt.device below
        checkpoint = torch.load(opt.model_path, map_location='cpu')
        # load model parameters we saved in model_path
        model.load_state_dict(checkpoint['state_dict'])
        print('[Info] Load model from {}'.format(opt.model_path))

    # put the model on CPU or GPU according to the device in args
    model = model.to(opt.device)

    # -- run the code for training and validation
    if opt.mode == 'train':
        # training and validation data loader
        trainloader = dataLoader(opt.im_dir, opt.train_file_path, opt.norm_size, opt.batchsize)
        valloader = dataLoader(opt.im_dir, opt.val_file_path, opt.norm_size, opt.batchsize)
        train_val(model, trainloader, valloader,
                  n_epochs=opt.epoch,
                  lr=opt.lr,
                  optim_type=opt.optim_type,
                  momentum=opt.momentum,
                  weight_decay=opt.weight_decay,
                  valInterval=opt.valInterval,
                  device=opt.device)

    # -- test the saved model
    elif opt.mode == 'test':
        testloader = dataLoader(opt.im_dir, opt.test_file_path, opt.norm_size, opt.batchsize)
        acc = test(model, testloader, opt.device)
        print('[Info] Test accuracy = {:.1f}%'.format(100 * acc))

    # -- predict a new image
    elif opt.mode == 'predict':
        predict(model, im_path=opt.im_path, norm_size=opt.norm_size, device=opt.device)

    else:
        print('mode should be train, test, or predict')
        raise NotImplementedError
|
||||
|
||||
41
hw3/code/check.py
Normal file
@@ -0,0 +1,41 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 3 Support Vector Machine
|
||||
# check.py - Check your implementation of several modules
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
from svm_hw import SVM_HINGE, LinearFunction, Hinge
|
||||
import torch
|
||||
from torch.autograd import gradcheck
|
||||
|
||||
|
||||
def run():
    """
    Check the homework modules imported from svm_hw:
    LinearFunction and Hinge are verified numerically with torch.autograd.gradcheck,
    SVM_HINGE is run once and its parameters are checked to be trainable.
    :raises Exception: if running or checking SVM_HINGE fails (the original error is chained)
    """
    model = SVM_HINGE(2, C=1.0).double()

    # gradcheck compares analytical and numerical gradients, so every tensor is created
    # directly in double precision; the ones that need gradients are leaf tensors
    x = torch.randn(50, 2, dtype=torch.double)
    W = torch.randn(1, 2, dtype=torch.double, requires_grad=True)
    b = torch.zeros(1, dtype=torch.double, requires_grad=True)
    if gradcheck(LinearFunction.apply, (x, W, b), eps=1e-6, atol=1e-4):
        print('Linear successfully tested!')

    output = torch.randn(50, 1, dtype=torch.double, requires_grad=True)
    W = torch.randn(1, 2, dtype=torch.double, requires_grad=True)
    # a single label that is broadcast over the 50 outputs
    labels = torch.ones(1, dtype=torch.double)
    C = torch.tensor([[1.0]], dtype=torch.double)
    if gradcheck(Hinge.apply, (output, W, labels, C), eps=1e-6, atol=1e-5):
        print('Hinge successfully tested!')

    x = torch.randn(50, 2, dtype=torch.double)
    labels = torch.ones(50, dtype=torch.double)
    try:
        output, loss = model(x, labels)
        assert model.W.requires_grad is True
        assert model.b.requires_grad is True
    except Exception as exc:
        # keep the original failure attached so the real cause stays visible
        raise Exception('Failed testing SVM_HINGE!') from exc
    print('SVM_HINGE successfully tested!')


if __name__ == '__main__':
    run()
|
||||
181
hw3/code/data_preprocess.py
Normal file
@@ -0,0 +1,181 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 3 Support Vector Machine
|
||||
# data_preprocess.py - Using pretrained convolutional layers to extract feature,
|
||||
# and using PCA for dimensionality reduction
|
||||
# Student ID: 2022010639
|
||||
# Name: Yixuan Gao
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
import os
|
||||
import torchvision.transforms as transforms
|
||||
import torch
|
||||
from PIL import Image
|
||||
from networks import Classifier
|
||||
import matplotlib.pyplot as plt
|
||||
import argparse
|
||||
|
||||
|
||||
def preprocess(pre_conv, data_root, image_size, classes):
    """
    Extract features for the train / val / test splits, reduce them to 2 dimensions with PCA
    and save each reduced split to '<data_root>/<split>.pt'
    :param pre_conv: pretrained network used by loaddata() to extract features
    :param data_root: the path of all datasets, also the directory the .pt files are written to
    :param image_size: the preset size that each image try to zoom to
    :param classes: two classes that need to be classified
    """
    # (split name, message printed before processing, message printed after saving)
    splits = (
        ('train', "Start preprocessing the training dataset !!!", "training dataset saved !!!"),
        ('val', "Start preprocessing the validation dataset!!!", "validation dataset saved !!!"),
        ('test', "Start preprocessing the testing dataset!!!", "testing dataset saved !!!"),
    )

    data_mean = None
    projection = None
    for split, start_message, saved_message in splits:
        print(start_message)
        features, targets = loaddata(pre_conv, data_root, split, image_size, classes)

        if projection is None:
            # the first split is the training set: the mean and the PCA projection matrix are
            # computed on it once and reused unchanged for the validation and testing sets
            data_mean, projection = PCA(features, 2)
            # the projection matrix is scaled by a constant factor of 20
            projection = projection * 20

        # subtract the training mean, then project onto the principal directions
        reduced = (features - data_mean) @ projection

        visualize(reduced, targets, split)
        savedata(reduced, targets, data_root + "/" + split + ".pt")
        print(saved_message)
|
||||
|
||||
|
||||
def savedata(data, label, save_path):
    """
    Save samples and labels together with torch.save
    :param data: the samples to store under the key 'data'
    :param label: the labels to store under the key 'label'
    :param save_path: destination handed to torch.save
    """
    torch.save(dict(data=data, label=label), save_path)
|
||||
|
||||
|
||||
def visualize(datas, labels, mode):
    """
    Display feature points after dimensionality reduction
    -------------------------------
    :param datas: the samples after dimensionality reduction, with the shape of [N, 2]
                  (only the first two columns are drawn)
    :param labels: the labels (chosen from {-1, +1}) corresponding to the samples
    :param mode: chosen from {'train', 'val', 'test'}, used as the figure title
    :return:
    """
    plt.figure()
    # one scatter series per class label; the two labels are enumerated explicitly because
    # the number of classes is unrelated to the feature dimension datas.shape[1]
    for class_label in (-1, 1):
        selected = labels == class_label
        plt.scatter(datas[selected, 0], datas[selected, 1], label=class_label)
    plt.legend()
    plt.title(mode)
    plt.show()
|
||||
|
||||
|
||||
def PCA(data, dim=2):
    """
    calculate the mean value of the data and the projection matrix for PCA
    :param data: the sample features extracted by the pretrained network in homework2, with the shape of [N, 2048]
    :param dim: the data dimension after projection
    :return:
        data_mean: the mean value of the data
        u: the projection matrix for PCA, with the shape of [2048, dim]
    """
    # mean over the samples (rows)
    data_mean = data.mean(dim=0)

    # covariance matrix of the centered features; torch.cov expects variables in rows,
    # hence the transpose
    centered = data - data_mean
    covariance = torch.cov(centered.T)

    # SVD of the covariance matrix
    # reference: https://pytorch.org/docs/1.11/generated/torch.linalg.svd.html
    left_vectors = torch.linalg.svd(covariance)[0]

    # keep the first `dim` columns, which belong to the largest singular values
    return data_mean, left_vectors[:, :dim]
|
||||
|
||||
def loaddata(pre_conv, data_root, mode, image_size, classes):
    """
    load one dataset, and use pretrained network in homework 2 to extract feature
    :param pre_conv: pretrained network in homework 2
    :param data_root: the path of the dataset
    :param mode: chosen from {'train', 'val', 'test'}
    :param image_size: the preset size that each image try to zoom to
    :param classes: two classes that need to be classified
    :return:
        datas: the samples of extracted features with the shape of [N, 2048]
        labels: the corresponding labels for each sample (chosen from {-1, +1}), with the shape of [N]
    """
    assert len(classes) == 2

    datas, labels = [], []
    for position, class_name in enumerate(classes):
        # images of one class live in <data_root>/<mode>/<class name>
        class_dir = data_root + '/' + mode + '/' + class_name
        # the first class gets label -1, the second class gets label +1
        target = 2 * position - 1
        for file_name in os.listdir(class_dir):
            datas.append(readimg(pre_conv, class_dir + '/' + file_name, image_size))
            labels.append(target)

    return torch.stack(datas), torch.tensor(labels)
|
||||
|
||||
|
||||
def readimg(pre_conv, filepath, image_size):
    """
    Read one image and use pretrained network to extract the feature
    --------------------------
    :param pre_conv: pretrained network in homework 2
    :param filepath: the file path of one image
    :param image_size: the preset size that each image try to zoom to
    :return:
        data: the extracted feature with the length of 2048
    """
    # tensor conversion followed by normalization with mean 0.5 and std 0.5
    to_normalized_tensor = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(0.5, 0.5),
    ])

    # load as RGB and zoom to the preset size
    image = Image.open(filepath).convert('RGB').resize(image_size)

    # add a leading batch dimension, run the network and flatten its output to a vector
    batch = to_normalized_tensor(image).unsqueeze(0)
    return pre_conv(batch).reshape(-1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--pretrained_net", type=str, default="checkpoints/bn/ckpt_epoch_15.pth",
                        help="the filepath of the pretrained network in homework 2")
    parser.add_argument("--data_root", type=str, default="data", help="the path of all datasets")
    # two integers, e.g. `--image_size 32 32`; argparse's `type=tuple` would split the raw
    # command-line string into single characters instead of parsing numbers
    parser.add_argument("--image_size", type=int, nargs=2, default=(32, 32),
                        help="the preset size that each image try to zoom to")
    # exactly two class names, e.g. `--classes B C`
    parser.add_argument("--classes", type=str, nargs=2, default=["B", "C"],
                        help="two classes that need to be classified")

    args = parser.parse_args()
    # nargs=2 yields a list when the option is given; keep the tuple type used by the default
    image_size = tuple(args.image_size)

    # rebuild the homework 2 classifier from the configuration stored in its checkpoint
    pretrained_checkpoint = torch.load(args.pretrained_net, map_location="cpu")
    configs = pretrained_checkpoint["configs"]
    classifier = Classifier(
        configs["in_channels"],
        configs["num_classes"],
        configs["use_batch_norm"],
        configs["use_stn"],
        configs["dropout_prob"],
    )
    # NOTE(review): strict=False silently ignores missing / unexpected keys - confirm the
    # checkpoint really matches this architecture
    classifier.load_state_dict(pretrained_checkpoint["model_state"], strict=False)

    # feature extraction must be deterministic: evaluation mode switches off the dropout
    # layer inside conv_net, which would otherwise randomly zero feature channels
    classifier.eval()
    for param in classifier.parameters():
        param.requires_grad = False

    # only the convolutional part is used as the feature extractor
    conv = classifier.conv_net

    preprocess(conv, args.data_root, image_size, args.classes)
|
||||
26
hw3/code/datasets.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 3 Support Vector Machine
|
||||
# datasets.py - Define the data loader for the traffic sign classification dataset
|
||||
# Student ID:
|
||||
# Name:
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
|
||||
import torch
|
||||
import torch.utils.data as data
|
||||
|
||||
|
||||
class Traffic_Dataset(data.Dataset):
    """
    Dataset backed by one file loaded with torch.load; the loaded object must provide the
    entries "data" (samples) and "label" (labels), which are indexed in parallel.
    """

    def __init__(self, data_root):
        """
        :param data_root: file (path or file-like object) handed to torch.load
        """
        saved = torch.load(data_root)
        self.datas, self.labels = saved["data"], saved["label"]

    def __len__(self):
        # the number of samples defines the dataset length
        return len(self.datas)

    def __getitem__(self, index):
        # sample and label stored at the same position
        sample = self.datas[index]
        target = self.labels[index]
        return sample, target
|
||||
271
hw3/code/networks.py
Normal file
@@ -0,0 +1,271 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 2 Convolutional Neural Network
|
||||
# networks.py - Network definition
|
||||
# Student ID: 2022010639
|
||||
# Name: Gao Yixuan
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class ConvBlock(nn.Module):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        use_batch_norm=False,
        use_residual=False,
    ):
        """
        Convolutional block: conv -> (optional) batch normalization -> ReLU,
        optionally followed by a residual connection that adds the block input
        ----------------------
        :param in_channels: channel number of input image
        :param out_channels: channel number of output image
        :param kernel_size: size of convolutional kernel
        :param stride: stride of convolutional operation
        :param padding: padding of convolutional operation
        :param use_batch_norm: whether to use batch normalization in convolutional layers
        :param use_residual: whether to use residual connection
        """
        super().__init__()

        self.use_residual = use_residual

        # nn.Identity stands in for the normalization layer when batch norm is disabled;
        # it accepts and ignores the channel argument
        norm_layer = nn.BatchNorm2d if use_batch_norm else nn.Identity

        # network structure: conv -> batchnorm -> relu
        self.conv = nn.Conv2d(
            in_channels, out_channels, kernel_size, stride=stride, padding=padding
        )
        self.bn = norm_layer(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """
        :param x: input feature map
        :return: activated feature map, plus the input itself when use_residual is set
        """
        activated = self.relu(self.bn(self.conv(x)))
        # the residual sum needs the block output to have the same shape as its input
        return activated + x if self.use_residual else activated
|
||||
|
||||
|
||||
class Classifier(nn.Module):
    def __init__(
        self,
        in_channels,
        num_classes,
        use_batch_norm=False,
        use_stn=False,
        dropout_prob=0,
    ):
        """
        Convolutional Neural Networks
        ----------------------
        :param in_channels: channel number of input image
        :param num_classes: number of classes for the classification task
        :param use_batch_norm: whether to use batch normalization in the fully connected sub-network
        :param use_stn: whether to use spatial transformer network
        :param dropout_prob: dropout ratio of dropout layer which ranges from 0 to 1
        """
        super().__init__()

        if use_batch_norm:
            bn1d = nn.BatchNorm1d
        else:
            # use identity function to replace batch normalization
            bn1d = nn.Identity

        if use_stn:
            self.stn = STN(in_channels)
        else:
            # use identity function to replace spatial transformer network
            self.stn = nn.Identity(in_channels)

        # multilayer convolutional neural network
        # input image with size [batch_size, in_channels, img_h, img_w]
        # Network structure:
        #            kernel_size  stride  padding  out_channels  use_residual
        # ConvBlock       5          1        2          32         False
        # ConvBlock       5          2        2          64         False
        # maxpool         2          2        0
        # ConvBlock       3          1        1          64         True
        # ConvBlock       3          1        1          128        False
        # maxpool         2          2        0
        # ConvBlock       3          1        1          128        True
        # dropout(p), where p is input parameter of dropout ratio
        #
        # NOTE(review): the ConvBlocks are built with their default use_batch_norm=False, so
        # `use_batch_norm` does not reach the convolutional layers. Passing it on would add
        # parameters and change the state_dict keys of existing checkpoints, so it is left as is.
        self.conv_net = nn.Sequential(
            ConvBlock(in_channels=in_channels, out_channels=32, kernel_size=5, stride=1, padding=2),
            ConvBlock(in_channels=32, out_channels=64, kernel_size=5, stride=2, padding=2),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ConvBlock(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1, use_residual=True),
            ConvBlock(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            ConvBlock(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1, use_residual=True),
            nn.Dropout2d(p=dropout_prob),
        )

        # sub-network with two linear layers
        # With the default (3, 32, 32) input the convolutional output is 128 channels of
        # 4 x 4 (one stride-2 convolution and two 2x2 poolings), i.e. 2048 values per image.
        # Network structure:
        #            out_channels
        # linear          256
        # activation
        # batchnorm
        # dropout(p), where p is input parameter of dropout ratio
        # linear       num_classes
        self.fc_net = nn.Sequential(
            nn.Linear(2048, 256),
            nn.ReLU(),
            bn1d(256),
            # element-wise dropout: nn.Dropout1d would read the [batch_size, 256] input as an
            # unbatched (channels, length) tensor and zero out entire samples
            nn.Dropout(dropout_prob),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """
        Define the forward function
        :param x: input features with size [batch_size, in_channels, img_h, img_w]
        :return: output features with size [batch_size, num_classes]
        """
        # Step 1: apply spatial transformer network if applicable
        x = self.stn(x)

        # Step 2: forward process for the convolutional network
        x = self.conv_net(x)

        # Step 3: flatten the tensor to match the size of the input of the
        # fully connected layers
        x = x.view(x.shape[0], -1)

        # Step 4: forward process for the fully connected network
        out = self.fc_net(x)

        return out
|
||||
|
||||
|
||||
class STN(nn.Module):
    def __init__(self, in_channels):
        """
        The spatial transformer network (STN) learns how to perform spatial transformations on the
        input image in order to enhance the geometric invariance of the model. For example, it can
        crop a region of interest, scale and correct the orientation of an image. It can be a useful
        mechanism because CNNs are not invariant to rotation and scale and more general affine
        transformations.

        The spatial transformer network boils down to three main components:

        - The localization network is a regular CNN which regresses the transformation parameters.
          The transformation is never learned explicitly from this dataset, instead the network
          learns automatically the spatial transformations that enhances the global accuracy.
        - The grid generator generates a grid of coordinates in the input image corresponding
          to each pixel from the output image.
        - The sampler uses the parameters of the transformation and applies it to the input image.

        Here, we are going to implement an STN that performs affine transformations on the input images.
        For more information, please refer to the slides and
        https://pytorch.org/tutorials/intermediate/spatial_transformer_tutorial.html .

        ----------------------
        :param in_channels: channel number of input image
        """
        super().__init__()

        # Localization net, step 1: a convolutional network that extracts features from the
        # input images. Three down-sampling (stride 2) convolutional blocks with doubling
        # output channels, each using BN and ReLU.
        # The spatial sizes below hold for a 32 x 32 input, which the linear layer expects.
        self.localization_conv = nn.Sequential(
            # -> 8 x 16 x 16
            ConvBlock(in_channels=in_channels, out_channels=8, kernel_size=9, stride=2, padding=4, use_batch_norm=True),
            # -> 16 x 8 x 8
            ConvBlock(in_channels=8, out_channels=16, kernel_size=5, stride=2, padding=2, use_batch_norm=True),
            # -> 32 x 4 x 4
            ConvBlock(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1, use_batch_norm=True),
        )

        # Localization net, step 2: a fully connected network that predicts the 6 parameters
        # of the affine transformation from the extracted features.
        self.localization_fc = nn.Sequential(
            nn.Linear(32 * 4 * 4, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, 6)
        )

        # The STN should generate the identity transformation before training: with a zero
        # weight the last layer outputs its bias for every input, so the bias is set to the
        # flattened 2 x 3 identity affine matrix [[1, 0, 0], [0, 1, 0]].
        last_fc = self.localization_fc[-1]
        nn.init.zeros_(last_fc.weight)
        with torch.no_grad():
            last_fc.bias.copy_(torch.tensor([1.0, 0.0, 0.0, 0.0, 1.0, 0.0]))

    def forward(self, x):
        """
        :param x: input images with size [batch_size, in_channels, img_h, img_w]
        :return: the spatially transformed images, with the same size as x
        """
        # Extract the features from input images and flatten them
        features = self.localization_conv(x)
        features = features.view(features.shape[0], -1)

        # Predict the parameters of affine transformation from the extracted features
        theta = self.localization_fc(features)
        theta = theta.view(-1, 2, 3)

        # Apply affine transformation to input images
        grid = F.affine_grid(theta, x.shape, align_corners=False)
        x = F.grid_sample(x, grid, align_corners=False)

        return x
|
||||
148
hw3/code/svm_hw.py
Normal file
@@ -0,0 +1,148 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 3 Support Vector Machine
|
||||
# svm_hw.py - The implementation of SVM using hinge loss
|
||||
# Student ID: 2022010639
|
||||
# Name: Yixuan Gao
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
# TODO 1: complete the forward and backward propagation processes of the linear layer
|
||||
class LinearFunction(torch.autograd.Function):
    '''
    we will implement the linear function:
    y = xW^T + b
    as well as its gradient computation process
    '''

    @staticmethod
    def forward(ctx, x, W, b):
        '''
        Input:
        :param ctx: a context object that can be used to stash information for backward computation
        :param x: input features with size [batch_size, input_size]
        :param W: weight matrix with size [output_size, input_size]
        :param b: bias with size [output_size]
        Return:
        y :output features with size [batch_size, output_size]
        '''
        y = torch.matmul(x, W.T) + b
        # x and W are needed again in backward; b is not, its gradient depends on dL/dy only
        ctx.save_for_backward(x, W)

        return y

    @staticmethod
    def backward(ctx, grad_output):
        '''
        Input:
        :param ctx: a context object with saved variables
        :param grad_output: dL/dy, with size [batch_size, output_size]
        Return:
        grad_input: dL/dx, with size [batch_size, input_size]
        grad_W: dL/dW, with size [output_size, input_size], summed for data in the batch
        grad_b: dL/db, with size [output_size], summed for data in the batch
        '''
        # saved_tensors is the supported accessor (saved_variables is deprecated)
        x, W = ctx.saved_tensors

        # dL/dx = dL/dy * W
        grad_input = torch.matmul(grad_output, W)
        # dL/dW = (dL/dy)^T * x, the matrix product sums over the batch
        grad_W = torch.matmul(grad_output.T, x)
        # dL/db = dL/dy summed over the batch
        grad_b = grad_output.sum(0)

        return grad_input, grad_W, grad_b
|
||||
|
||||
|
||||
# TODO 2: complete the forward and backward propagation processes of the hinge loss
|
||||
class Hinge(torch.autograd.Function):
    """
    Hinge loss with L2 regularization for SVM, together with its gradient:
    loss = 0.5*||w||^2 + C*\\sum_i{max(0, 1 - y_i*output_i)}
    """

    @staticmethod
    def forward(ctx, output, W, label, C):
        """
        Compute the hinge loss
        --------------------------------------
        :param ctx: a context object that can be used to stash information for backward computation
        :param output: the output of the linear layer with size [batch_size, 1], i.e. output = W^T*x + b
        :param W: weight matrix with size [1, input_size]
        :param label: the ground truth y in the equation for loss calculation, with size [batch_size]
        :param C: the regularization coefficient of hinge loss with size [1, 1]
        :return: the hinge loss with size [1, 1]
        """
        C = C.type_as(W)

        # 1 - y_i * output_i for every sample; the transposes broadcast the labels over the
        # batch dimension of output and restore the shape of output afterwards
        margins = 1 - torch.mul(output.T, label).T
        hinge_term = F.relu(margins).sum()
        regularizer = 1 / 2 * (W @ W.T)
        loss = regularizer + C * hinge_term

        ctx.save_for_backward(output, W, label, C)

        return loss

    @staticmethod
    def backward(ctx, grad_loss):
        """
        Compute the gradient of hinge loss
        :param ctx: a context object with saved variables
        :param grad_loss: dL/dloss, with size [1, 1], the gradient of the final target loss with respect to the output (variable 'loss') of the forward function
        :return:
            grad_output: dL/doutput, with size [batch_size, 1]
            grad_W: dL/dW, with size [1, channels]
            (label and C receive no gradient)
        """
        output, W, label, C = ctx.saved_tensors

        margins = 1 - torch.mul(output.T, label).T
        # indicator of the samples whose hinge term is active; a margin of exactly 0 counts
        # as active (second argument of heaviside)
        value_at_zero = torch.tensor(1).type_as(output)
        active = torch.heaviside(margins, value_at_zero)
        # d max(0, 1 - y*o) / do = -y for active samples and 0 otherwise
        per_sample = (active.T * (-label)).T
        grad_output = grad_loss * C * per_sample

        # d(0.5 * W W^T) / dW = W
        grad_W = grad_loss * W

        return grad_output, grad_W, None, None
|
||||
|
||||
|
||||
# TODO 3: complete the structure of SVM model
|
||||
class SVM_HINGE(nn.Module):
    """
    Linear SVM trained with the hinge loss: a linear layer (LinearFunction) followed by
    the regularized hinge loss (Hinge).
    """

    def __init__(self, in_channels, C):
        """
        :param in_channels: number of feature channels for SVM input
        :param C: regularization coefficient of hinge loss, stored as a tensor with size [1, 1]
        """
        super().__init__()

        # trainable parameters: W has shape [1, in_channels] and b has shape [1, ],
        # both initialized with torch.randn() as the assignment requires
        self.W = nn.Parameter(torch.randn(1, in_channels), requires_grad=True)
        self.b = nn.Parameter(torch.randn(1, ), requires_grad=True)
        # constant coefficient, kept as a plain tensor attribute (not a parameter or buffer);
        # Hinge casts it to the type of W before use
        self.C = torch.tensor([[C]], requires_grad=False)

    def forward(self, x, label=None):
        """
        :param x: input features with size [batch_size, in_channels]
        :param label: optional ground truth labels; the loss is only computed when they are given
        :return:
            output: predicted labels, +1 where the linear output is positive and -1 otherwise
            loss: the hinge loss with size [1, 1], or None if label is None
        """
        # SVM calculation
        output = LinearFunction.apply(x, self.W, self.b)
        if label is not None:
            loss = Hinge.apply(output, self.W, label, self.C)
        else:
            loss = None
        # threshold the raw scores at 0 and map {False, True} to {-1, +1}
        output = (output > 0.0).type_as(x) * 2.0 - 1.0
        return output, loss
|
||||
110
hw3/code/test_svm.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 3 Support Vector Machine
|
||||
# test_svm.py - Test svm model for traffic sign
|
||||
# Student ID: 2022010639
|
||||
# Name: Yixuan Gao
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
# ==== Part 1: import libs
|
||||
import argparse
|
||||
import torch
|
||||
from datasets import Traffic_Dataset
|
||||
from svm_hw import SVM_HINGE
|
||||
from torch.utils.data import DataLoader
|
||||
import os.path
|
||||
|
||||
|
||||
# ==== Part 2: testing
|
||||
def test(
    data_root,
    model_save_path,
    device,
):
    """
    The main testing procedure of SVM model
    ----------------------------
    :param data_root: path to the root directory of dataset
    :param model_save_path: path to pretrained SVM model
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    """

    # ==================== load the test data and the pretrained SVM model ====================

    # testing data loader: one sample per batch, in dataset order
    test_data = Traffic_Dataset(os.path.join(data_root, 'test.pt'))
    test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

    # load the checkpoint; map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine
    model_svm = torch.load(model_save_path, map_location=device)

    # rebuild the SVM model from the configuration stored next to the weights
    svm = SVM_HINGE(model_svm["configs"]["feature_channel"], model_svm["configs"]["C"])

    # restore the trained parameters
    svm.load_state_dict(model_svm["state_dict"])

    # put the model on CPU or GPU
    svm.to(device)

    # ================================ testing ================================

    # set the model in evaluation mode
    svm.eval()

    # to calculate and save the testing accuracy
    n_correct = 0.  # number of images that are correctly classified
    n_feas = 0.  # number of total images

    with torch.no_grad():  # we do not need to compute gradients during testing
        for feas, label in test_loader:
            # set data type and device
            feas, label = (
                feas.type(torch.float).to(device),
                label.type(torch.float).to(device)
            )

            # without a label the model returns (prediction, None); the second value is not needed here
            out, _ = svm(feas)

            # the shapes of 'out' and 'label' are different, so align them before comparing
            n_correct += (out.reshape_as(label) == label).sum().item()

            # sum up the total image number
            n_feas += label.numel()

    # show prediction accuracy
    acc = 100 * n_correct / n_feas
    print('Test accuracy = {:.1f}%'.format(acc))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # command-line options of the testing process; every option is a string
    cli = argparse.ArgumentParser()
    option_table = (
        ("--data_root", {"default": "data", "help": "file list of training image paths and labels"}),
        ("--device", {"help": "cpu or cuda"}),
        ("--model_save_path", {"default": "checkpoints/svm.pth", "help": "path to save SVM model"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, type=str, **settings)

    opts = cli.parse_args()

    # when no device is requested, use the GPU if one is available and the CPU otherwise
    chosen_device = opts.device
    if chosen_device is None:
        chosen_device = "cuda" if torch.cuda.is_available() else "cpu"

    # run the testing procedure
    test(
        data_root=opts.data_root,
        model_save_path=opts.model_save_path,
        device=chosen_device,
    )
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
296
hw3/code/train_svm.py
Normal file
@@ -0,0 +1,296 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 3 Support Vector Machine
|
||||
# train_svm.py - Train svm model for traffic sign
|
||||
# Student ID: 2022010639
|
||||
# Name: Yixuan Gao
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
# ==== Part 1: import libs
|
||||
import argparse
|
||||
import matplotlib.pyplot as plt
|
||||
import torch
|
||||
import numpy as np
|
||||
import random
|
||||
from datasets import Traffic_Dataset
|
||||
from svm_hw import SVM_HINGE
|
||||
from torch.utils.data import DataLoader
|
||||
import os.path
|
||||
|
||||
|
||||
# ==== Part 2: training and validation
|
||||
def train(
    data_root,
    feature_channel,
    batch_size,
    n_epoch,
    lr,
    C,
    model_save_path,
    device,
):
    """
    The main training procedure of SVM model
    ----------------------------
    :param data_root: path to the root directory of dataset
    :param feature_channel: number of feature channels for SVM input
    :param batch_size: batch size of training
    :param n_epoch: number of training epochs
    :param lr: learning rate
    :param C: regularization coefficient in hinge loss
    :param model_save_path: path to save SVM model
    :param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
    """

    # training and validation data loaders; only the training data is shuffled,
    # validation accuracy does not depend on the sample order
    train_data = Traffic_Dataset(os.path.join(data_root, 'train.pt'))
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_data = Traffic_Dataset(os.path.join(data_root, 'val.pt'))
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    # scale the regularization coefficient by the number of training batches
    C = C * len(train_loader)

    # initialize the SVM model and put it on CPU or GPU
    svm = SVM_HINGE(feature_channel, C)
    svm.to(device)

    # define the Adam optimizer
    optimizer = torch.optim.Adam(svm.parameters(), lr)

    # make sure the checkpoint directory exists, otherwise torch.save() below fails on a fresh checkout
    save_dir = os.path.dirname(model_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # to save the training loss, training accuracy, validation accuracy, and the epoch index of each training epoch
    train_loss = []
    train_acc = []
    val_acc = []
    epochs = []

    for epoch in range(n_epoch):
        # save the (1-based) index of current epoch in the array 'epochs'
        epochs.append(epoch + 1)

        # ========================= training =======================
        # set the model in training mode
        svm.train()

        # to calculate and save the training loss and training accuracy
        total_loss = 0.  # to save total training loss in one epoch
        n_correct = 0.  # number of images that are correctly classified
        n_feas = 0.  # number of total images

        for step, (feas, label) in enumerate(train_loader):
            # set data type and device
            feas, label = (
                feas.type(torch.float).to(device),
                label.type(torch.float).to(device)
            )

            # clear gradients in the optimizer
            optimizer.zero_grad()

            # run the model with hinge loss; the model needs two inputs: feas and labels
            out, loss = svm(feas, label)

            # back-propagation on the computation graph
            loss.backward()

            # loss.item() returns the value of the tensor as a standard python number
            total_loss += loss.item()

            # update the parameters of the model
            optimizer.step()

            # the shapes of 'out' and 'label' are different, so align them before comparing
            n_correct += (out.reshape_as(label) == label).sum().item()

            # sum up the total image number
            n_feas += label.numel()

        # accuracy over the epoch and average loss per iteration
        acc = 100 * n_correct / n_feas
        avg_loss = total_loss / len(train_loader)
        train_acc.append(acc)
        train_loss.append(avg_loss)
        print('Epoch {:02d}: loss = {:.3f}, training accuracy = {:.1f}%'.format(epoch + 1, avg_loss, acc))

        # ========================== validation ======================================
        # set the model in evaluation mode
        svm.eval()

        # to calculate and save the validation accuracy
        n_correct = 0.  # number of images that are correctly classified
        n_feas = 0.  # number of total images

        with torch.no_grad():  # we do not need to compute gradients during validation
            for feas, label in val_loader:
                # set data type and device
                feas, label = (
                    feas.type(torch.float).to(device),
                    label.type(torch.float).to(device)
                )

                # without a label the model returns (prediction, None); the second value is not needed here
                out, _ = svm(feas)

                # the shapes of 'out' and 'label' are different, so align them before comparing
                n_correct += (out.reshape_as(label) == label).sum().item()

                # sum up the total image number
                n_feas += label.numel()

        # show prediction accuracy
        acc = 100 * n_correct / n_feas
        print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(epoch + 1, acc))
        val_acc.append(acc)

        # save model parameters (and the configuration needed to rebuild the model) after every epoch
        torch.save({'state_dict': svm.state_dict(),
                    'configs': {
                        'feature_channel': feature_channel,
                        'C': C}
                    }, model_save_path)
        print('Model saved in {}\n'.format(model_save_path))

    # trained decision boundary, moved to the CPU for the analysis and plots below
    W = svm.W.data.cpu()
    b = svm.b.data.cpu()

    # indices of the support vectors among the training samples:
    # 'sv' is a python list of length K holding every index whose sample satisfies label * (W x + b) <= 1
    sv = [idx for idx, (data, label) in enumerate(zip(train_data.datas, train_data.labels)) if label * ((W @ data) + b) <= 1]

    plot(train_loss, train_acc, val_acc, epochs)
    plot_feature(train_features=train_data.datas, val_features=val_data.datas, train_labels=train_data.labels,
                 val_labels=val_data.labels, sv=sv, W=W, b=b)
|
||||
|
||||
|
||||
def plot_feature(train_features, val_features, train_labels, val_labels, sv, W, b):
    """
    Show two figures: the training samples with the support vectors highlighted, and the
    validation samples, each together with the SVM decision boundary.
    ---------------------
    :param train_features: training samples with the shape of [B, 2]
    :param val_features: validation samples with the shape of [B, 2]
    :param train_labels: the labels (chosen from{-1, +1}) corresponding to training samples, with the shape of [B, 1]
    :param val_labels: the labels (chosen from{-1, +1}) corresponding to validation samples, with the shape of [B, 1]
    :param sv: a list with the index of support vectors in training samples, with the shape of [K] (K is the number of support vectors)
    :param W: the weight vector of SVM decision boundary (W^Tx + b), with the shape of [1, feature_channel]
    :param b: the bias of SVM decision boundary (W^Tx + b), with the shape of [1,]
    """

    def draw_boundary(axis):
        # the line W[0, 0] * x + W[0, 1] * y + b = 0, solved for y over x in [-20, 20]
        xs = np.linspace(-20, 20, 100)
        axis.plot(xs, -W[0, 0] / W[0, 1] * xs - b / W[0, 1], c='y')

    train_labels = (train_labels > 0.0).int()
    val_labels = (val_labels > 0.0).int()
    train_labels[sv] = 2

    # the samples are split by position: the first half is drawn as "-1", the second half as "+1"
    half_train = train_labels.shape[0] // 2
    first_half = set(range(half_train))
    second_half = set(range(half_train, 2 * half_train))
    sv_indices = set(sv)

    foreground = list(first_half - sv_indices)
    foreground_sv = list(first_half - set(foreground))
    background = list(second_half - sv_indices)
    background_sv = list(second_half - set(background))

    # ---- figure 1: training samples, support vectors in their own colors ----
    f, ax = plt.subplots()
    ax.set_title("training dataset")
    train_groups = (
        (foreground, '.', 'r', "-1"),
        (foreground_sv, '.', 'darkorange', "-1 (support vector)"),
        (background, 'x', 'b', "+1"),
        (background_sv, 'x', 'c', "+1 (support vector)"),
    )
    for rows, marker, color, name in train_groups:
        ax.scatter(train_features[rows, 0], train_features[rows, 1], marker=marker, c=color, label=name)
    draw_boundary(ax)
    ax.legend(loc="best")
    ax.set_ylim([-30, 30])
    plt.show()

    # ---- figure 2: validation samples ----
    half_val = val_labels.shape[0] // 2
    foreground_val = list(range(half_val))
    background_val = list(range(half_val, 2 * half_val))

    f, ax = plt.subplots()
    ax.set_title("validation dataset")
    ax.scatter(val_features[foreground_val, 0], val_features[foreground_val, 1], marker='.', c='r', label="-1")
    ax.scatter(val_features[background_val, 0], val_features[background_val, 1], marker='x', c='b', label="+1")
    draw_boundary(ax)
    ax.legend(loc="best")
    ax.set_ylim([-30, 30])
    plt.show()
|
||||
|
||||
|
||||
def plot(train_loss, train_acc, val_acc, epochs):
    """
    Show the training-loss curve and the training/validation accuracy curves in two figures
    ------------------
    :param train_loss: a list with loss of each training epoch
    :param train_acc: a list with accuracy on training dataset of each training epoch
    :param val_acc: a list with accuracy on validation dataset of each training epoch
    :param epochs: a list with the index of all training epochs
    """

    # figure 1: training loss per epoch
    loss_fig, loss_ax = plt.subplots()
    loss_ax.set_title("Training Loss")
    loss_ax.plot(epochs, train_loss, color="tab:blue")
    loss_ax.set_xlabel("Training epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend(["training loss"], loc="best")
    plt.show()

    # figure 2: training and validation accuracy per epoch
    acc_fig, acc_ax = plt.subplots()
    acc_ax.set_title("Training and Validation Accuracy")
    for curve, color in ((train_acc, "tab:orange"), (val_acc, "tab:green")):
        acc_ax.plot(epochs, curve, color=color)
    acc_ax.legend(["training accuracy","validation accuracy"], loc="best")
    acc_ax.set_xlabel("Training epoch")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.set_ylim(0, 101)
    plt.show()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # seed every random number generator in use so that runs are reproducible
    seed = 2024
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True

    # command-line options of the model and training process
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--data_root", type=str, default="data",
        help="file list of training image paths and labels",
    )
    cli.add_argument(
        "--n_epoch", type=int, default=50,
        help="number of training epochs",
    )
    cli.add_argument(
        "--batch_size", type=int, default=20,
        help="training batch size",
    )
    cli.add_argument(
        "--lr", type=float, default=1e-2,
        help="learning rate",
    )
    cli.add_argument(
        "--C", type=float, default=1e-3,
        help="regularization coefficient in hinge loss",
    )
    cli.add_argument(
        "--device", type=str,
        help="cpu or cuda",
    )
    cli.add_argument(
        "--feature_channel", type=int, default=2,
        help="number of pre-extracted feature channel by pretrained network",
    )
    cli.add_argument(
        "--model_save_path", type=str, default="checkpoints/svm.pth",
        help="path to save SVM model",
    )
    opts = cli.parse_args()

    # when no device is requested, use the GPU if one is available and the CPU otherwise
    chosen_device = opts.device
    if chosen_device is None:
        chosen_device = "cuda" if torch.cuda.is_available() else "cpu"

    # run the training procedure
    train(
        data_root=opts.data_root,
        feature_channel=opts.feature_channel,
        batch_size=opts.batch_size,
        n_epoch=opts.n_epoch,
        lr=opts.lr,
        C=opts.C,
        model_save_path=opts.model_save_path,
        device=chosen_device,
    )
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
132
hw3/report/dtx-style.sty
Normal file
@@ -0,0 +1,132 @@
|
||||
%%
|
||||
%% This is file `dtx-style.sty',
|
||||
%% generated with the docstrip utility.
|
||||
%%
|
||||
%% The original source files were:
|
||||
%%
|
||||
%% thucoursework.dtx (with options: `dtx-style')
|
||||
%%
|
||||
%% This is a generated file.
|
||||
%%
|
||||
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
|
||||
%%
|
||||
%% This work may be distributed and/or modified under the
|
||||
%% conditions of the LaTeX Project Public License, either version 1.3
|
||||
%% of this license or (at your option) any later version.
|
||||
%% The latest version of this license is in
|
||||
%% http://www.latex-project.org/lppl.txt
|
||||
%% and version 1.3 or later is part of all distributions of LaTeX
|
||||
%% version 2005/12/01 or later.
|
||||
%%
|
||||
%% To produce the documentation run the original source files ending with `.dtx'
|
||||
%% through LaTeX.
|
||||
%%
|
||||
|
||||
\ProvidesPackage{dtx-style}
|
||||
\RequirePackage{hypdoc}
|
||||
\RequirePackage[UTF8,scheme=chinese]{ctex}
|
||||
\RequirePackage{newpxtext}
|
||||
\RequirePackage{newpxmath}
|
||||
\RequirePackage[
|
||||
top=2.5cm, bottom=2.5cm,
|
||||
left=4cm, right=2cm,
|
||||
headsep=3mm]{geometry}
|
||||
\RequirePackage{array,longtable,booktabs}
|
||||
\RequirePackage{listings}
|
||||
\RequirePackage{fancyhdr}
|
||||
\RequirePackage{xcolor}
|
||||
\RequirePackage{enumitem}
|
||||
\RequirePackage{etoolbox}
|
||||
\RequirePackage{metalogo}
|
||||
|
||||
\colorlet{thu@macro}{blue!60!black}
|
||||
\colorlet{thu@env}{blue!70!black}
|
||||
\colorlet{thu@option}{purple}
|
||||
\patchcmd{\PrintMacroName}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
|
||||
\patchcmd{\PrintDescribeMacro}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
|
||||
\patchcmd{\PrintDescribeEnv}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
|
||||
\patchcmd{\PrintEnvName}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
|
||||
|
||||
\def\DescribeOption{%
|
||||
\leavevmode\@bsphack\begingroup\MakePrivateLetters%
|
||||
\Describe@Option}
|
||||
\def\Describe@Option#1{\endgroup
|
||||
\marginpar{\raggedleft\PrintDescribeOption{#1}}%
|
||||
\thu@special@index{option}{#1}\@esphack\ignorespaces}
|
||||
\def\PrintDescribeOption#1{\strut \MacroFont\bfseries\sffamily\color{thu@option} #1\ }
|
||||
\def\thu@special@index#1#2{\@bsphack
|
||||
\begingroup
|
||||
\HD@target
|
||||
\let\HDorg@encapchar\encapchar
|
||||
\edef\encapchar usage{%
|
||||
\HDorg@encapchar hdclindex{\the\c@HD@hypercount}{usage}%
|
||||
}%
|
||||
\index{#2\actualchar{\string\ttfamily\space#2}
|
||||
(#1)\encapchar usage}%
|
||||
\index{#1:\levelchar#2\actualchar
|
||||
{\string\ttfamily\space#2}\encapchar usage}%
|
||||
\endgroup
|
||||
\@esphack}
|
||||
|
||||
\lstdefinestyle{lstStyleBase}{%
|
||||
basicstyle=\small\ttfamily,
|
||||
aboveskip=\medskipamount,
|
||||
belowskip=\medskipamount,
|
||||
lineskip=0pt,
|
||||
boxpos=c,
|
||||
showlines=false,
|
||||
extendedchars=true,
|
||||
upquote=true,
|
||||
tabsize=2,
|
||||
showtabs=false,
|
||||
showspaces=false,
|
||||
showstringspaces=false,
|
||||
numbers=none,
|
||||
linewidth=\linewidth,
|
||||
xleftmargin=4pt,
|
||||
xrightmargin=0pt,
|
||||
resetmargins=false,
|
||||
breaklines=true,
|
||||
breakatwhitespace=false,
|
||||
breakindent=0pt,
|
||||
breakautoindent=true,
|
||||
columns=flexible,
|
||||
keepspaces=true,
|
||||
gobble=2,
|
||||
framesep=3pt,
|
||||
rulesep=1pt,
|
||||
framerule=1pt,
|
||||
backgroundcolor=\color{gray!5},
|
||||
stringstyle=\color{green!40!black!100},
|
||||
keywordstyle=\bfseries\color{blue!50!black},
|
||||
commentstyle=\slshape\color{black!60}}
|
||||
|
||||
\lstdefinestyle{lstStyleShell}{%
|
||||
style=lstStyleBase,
|
||||
frame=l,
|
||||
rulecolor=\color{purple},
|
||||
language=bash}
|
||||
|
||||
\lstdefinestyle{lstStyleLaTeX}{%
|
||||
style=lstStyleBase,
|
||||
frame=l,
|
||||
rulecolor=\color{violet},
|
||||
language=[LaTeX]TeX}
|
||||
|
||||
\lstnewenvironment{latex}{\lstset{style=lstStyleLaTeX}}{}
|
||||
\lstnewenvironment{shell}{\lstset{style=lstStyleShell}}{}
|
||||
|
||||
\setlist{nosep}
|
||||
|
||||
\DeclareDocumentCommand{\option}{m}{\textsf{#1}}
|
||||
\DeclareDocumentCommand{\env}{m}{\texttt{#1}}
|
||||
\DeclareDocumentCommand{\pkg}{s m}{%
|
||||
\texttt{#2}\IfBooleanF#1{\thu@special@index{package}{#2}}}
|
||||
\DeclareDocumentCommand{\file}{s m}{%
|
||||
\texttt{#2}\IfBooleanF#1{\thu@special@index{file}{#2}}}
|
||||
\newcommand{\myentry}[1]{%
|
||||
\marginpar{\raggedleft\color{purple}\bfseries\strut #1}}
|
||||
\newcommand{\note}[2][Note]{{%
|
||||
\color{magenta}{\bfseries #1}\emph{#2}}}
|
||||
|
||||
\def\thucoursework{\textsc{Thu}\-\textsc{Coursework}}
|
||||
153
hw3/report/iidef.sty
Normal file
@@ -0,0 +1,153 @@
|
||||
%%
|
||||
%% This is file `iidef.sty',
|
||||
%% generated with the docstrip utility.
|
||||
%%
|
||||
%% The original source files were:
|
||||
%%
|
||||
%% thucoursework.dtx (with options: `sty')
|
||||
%%
|
||||
%% This is a generated file.
|
||||
%%
|
||||
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
|
||||
%%
|
||||
%% This work may be distributed and/or modified under the
|
||||
%% conditions of the LaTeX Project Public License, either version 1.3
|
||||
%% of this license or (at your option) any later version.
|
||||
%% The latest version of this license is in
|
||||
%% http://www.latex-project.org/lppl.txt
|
||||
%% and version 1.3 or later is part of all distributions of LaTeX
|
||||
%% version 2005/12/01 or later.
|
||||
%%
|
||||
%% To produce the documentation run the original source files ending with `.dtx'
|
||||
%% through LaTeX.
|
||||
%%
|
||||
|
||||
\NeedsTeXFormat{LaTeX2e}[1999/12/01]
|
||||
\ProvidesClass{iidef}
|
||||
[2020/09/09 2.6 Tsinghua University Coursework Template]
|
||||
%% configuration of nested enumerate env
|
||||
\RequirePackage{enumitem}
|
||||
%% set hwcount key-value option
|
||||
\RequirePackage{kvoptions}
|
||||
%% required by macro DeclareMathOperator
|
||||
\RequirePackage{amsmath}
|
||||
%% Set up page headers using with fancyhdr
|
||||
\@ifundefined{lhead}{\RequirePackage{fancyhdr}}
|
||||
{\def\@thulhead{thulhead}}
|
||||
\RequirePackage{amsthm}
|
||||
%% semester
|
||||
\def\@term{term}
|
||||
\newcommand{\theterm}[1]{\renewcommand\@term{#1}}
|
||||
%% institute
|
||||
\newcommand{\@courseinstitute}[1]{institute}
|
||||
\newcommand{\thecourseinstitute}[1]{\renewcommand\@courseinstitute{#1}}
|
||||
%% coursename
|
||||
\newcommand{\@coursename}[1]{coursename}
|
||||
\newcommand{\thecoursename}[1]{\renewcommand\@coursename{\textsc{#1}}}
|
||||
%% user can rewrite homework name
|
||||
\def\@hwname{Homework}
|
||||
\def\hwname#1{\renewcommand\@hwname{#1}}
|
||||
%% \iidef@thehwcnt = 1
|
||||
\DeclareStringOption[1]{thehwcnt}
|
||||
\ProcessKeyvalOptions*
|
||||
\def\thehwcnt{\iidef@thehwcnt}
|
||||
%% page header setup, distinguish between first page(plain style)
|
||||
%% and second page on (runningpage style)
|
||||
%%***************************************************************************
|
||||
\newcommand{\courseheader}{
|
||||
\thispagestyle{plain}%first page use native plain style to suppress header
|
||||
\vspace*{-1in}
|
||||
\begin{center}
|
||||
\@courseinstitute\\
|
||||
\@coursename\\
|
||||
\@term
|
||||
\vspace*{0.1in}
|
||||
\hrule
|
||||
\end{center}
|
||||
\begin{center}
|
||||
\underline{\bf \@hwname\;\thehwcnt} \\
|
||||
\end{center}
|
||||
}
|
||||
\@ifundefined{@thulhead}{
|
||||
\fancypagestyle{runningpage}
|
||||
{
|
||||
\fancyhead[L]{\small\@coursename}
|
||||
\fancyhead[R]{\small\@courseinstitute}
|
||||
}
|
||||
%% use runningpage style from second page on
|
||||
\pagestyle{runningpage}
|
||||
}{}
|
||||
%% *********************************************************************************************
|
||||
%%name command macro
|
||||
%%*************************
|
||||
\newcommand{\name}[1]{
|
||||
\begin{flushleft}
|
||||
#1\hfill
|
||||
\today
|
||||
\end{flushleft}
|
||||
\hrule
|
||||
|
||||
\vspace{2em}
|
||||
|
||||
\flushleft
|
||||
}
|
||||
%%*************************
|
||||
%% enumitem related configuration
|
||||
\setlist[enumerate,1]{label=\thehwcnt.\arabic*.}
|
||||
\setlist[enumerate,2]{label=(\alph*)}
|
||||
\setlist[enumerate,3]{label=\roman*.}
|
||||
\setlist[enumerate,4]{label=\greek*}
|
||||
%%******************************
|
||||
\def\@slname{Solution}
|
||||
\def\slname#1{\renewcommand\@slname{#1}}
|
||||
|
||||
\@ifundefined{solution}{
|
||||
\newenvironment{solution}
|
||||
{
|
||||
\proof[\@slname]
|
||||
}
|
||||
{
|
||||
%% no qed symbol in solution env
|
||||
\renewcommand{\qedsymbol}{}
|
||||
\endproof
|
||||
}
|
||||
}{}
|
||||
%%******************************
|
||||
%%common math symbols go here
|
||||
%%*************************************************
|
||||
\def\v#1{\underline{#1}}
|
||||
\newcommand{\uc}{\underline{c}} % c, vec
|
||||
\newcommand{\uv}{\underline{v}} % v, vec
|
||||
\newcommand{\uw}{\underline{w}} % w, vec
|
||||
\newcommand{\ux}{\underline{x}} % x, vec
|
||||
\newcommand{\uy}{\underline{y}} % y, vec
|
||||
\newcommand{\uz}{\underline{z}} % z, vec
|
||||
\newcommand{\um}{\underline{m}} % m, vec
|
||||
\newcommand{\rvx}{\mathsf{x}} % x, r.v.
|
||||
\newcommand{\rvy}{\mathsf{y}} % y, r.v.
|
||||
\newcommand{\rvz}{\mathsf{z}} % z, r.v.
|
||||
\newcommand{\rvw}{\mathsf{w}} % w, r.v.
|
||||
\newcommand{\rvH}{\mathsf{H}} % H, r.v.
|
||||
\newcommand{\urvx}{\underline{\mathsf{x}}} % x, r.v. vec
|
||||
\newcommand{\urvy}{\underline{\mathsf{y}}} % y, r.v. vec
|
||||
\newcommand{\urvz}{\underline{\mathsf{z}}} % z, r.v. vec
|
||||
\newcommand{\urvw}{\underline{\mathsf{w}}} % w, r.v. vec
|
||||
|
||||
\newcommand{\defas}{\triangleq} %\coloneqq
|
||||
\newcommand{\reals}{\mathbb{R}}
|
||||
\newcommand{\TT}{\mathrm{T}} % transpose
|
||||
\DeclareMathOperator*{\argmax}{arg\,max}
|
||||
\DeclareMathOperator*{\argmin}{arg\,min}
|
||||
\DeclareMathOperator*{\argsup}{arg\,sup}
|
||||
\DeclareMathOperator*{\arginf}{arg\,inf}
|
||||
\DeclareMathOperator{\diag}{diag}
|
||||
\DeclareMathOperator{\Var}{Var}
|
||||
\DeclareMathOperator{\Cov}{Cov}
|
||||
\DeclareMathOperator{\MSE}{MSE}
|
||||
\DeclareMathOperator{\1}{\mathds{1}}
|
||||
\DeclareMathOperator{\In}{\mathbb{I}}
|
||||
\DeclareMathOperator{\E}{\mathbb{E}}
|
||||
\DeclareMathOperator{\Prob}{\mathbb{P}}
|
||||
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
|
||||
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
|
||||
%%************************************************************************************
|
||||
BIN
hw3/report/img/check/check.png
Normal file
|
After Width: | Height: | Size: 21 KiB |
BIN
hw3/report/img/preprocess/preprocess_test.png
Normal file
|
After Width: | Height: | Size: 17 KiB |
BIN
hw3/report/img/preprocess/preprocess_train.png
Normal file
|
After Width: | Height: | Size: 20 KiB |
BIN
hw3/report/img/preprocess/preprocess_val.png
Normal file
|
After Width: | Height: | Size: 15 KiB |
BIN
hw3/report/img/train/1/accu.png
Normal file
|
After Width: | Height: | Size: 23 KiB |
BIN
hw3/report/img/train/1/loss.png
Normal file
|
After Width: | Height: | Size: 19 KiB |
BIN
hw3/report/img/train/1/sv.png
Normal file
|
After Width: | Height: | Size: 30 KiB |
BIN
hw3/report/img/train/1/test.png
Normal file
|
After Width: | Height: | Size: 17 KiB |
BIN
hw3/report/img/train/1/val.png
Normal file
|
After Width: | Height: | Size: 20 KiB |
BIN
hw3/report/img/train/1e-6/accu.png
Normal file
|
After Width: | Height: | Size: 25 KiB |
BIN
hw3/report/img/train/1e-6/loss.png
Normal file
|
After Width: | Height: | Size: 18 KiB |
BIN
hw3/report/img/train/1e-6/sv.png
Normal file
|
After Width: | Height: | Size: 26 KiB |
BIN
hw3/report/img/train/1e-6/test.png
Normal file
|
After Width: | Height: | Size: 17 KiB |
BIN
hw3/report/img/train/1e-6/val.png
Normal file
|
After Width: | Height: | Size: 16 KiB |
BIN
hw3/report/img/train/default/loss.png
Normal file
|
After Width: | Height: | Size: 21 KiB |
BIN
hw3/report/img/train/default/sv.png
Normal file
|
After Width: | Height: | Size: 31 KiB |
BIN
hw3/report/img/train/default/test.png
Normal file
|
After Width: | Height: | Size: 17 KiB |
BIN
hw3/report/img/train/default/train_accu.png
Normal file
|
After Width: | Height: | Size: 23 KiB |
BIN
hw3/report/img/train/default/val.png
Normal file
|
After Width: | Height: | Size: 19 KiB |
379
hw3/report/main.tex
Normal file
@@ -0,0 +1,379 @@
|
||||
% Homework Template
|
||||
\documentclass[a4paper]{article}
|
||||
\usepackage{ctex}
|
||||
\usepackage{amsmath, amssymb, amsthm}
|
||||
\usepackage{moreenum}
|
||||
\usepackage{mathtools}
|
||||
\usepackage{url}
|
||||
\usepackage{bm}
|
||||
\usepackage{enumitem}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{booktabs} % toprule
|
||||
\usepackage[mathcal]{eucal}
|
||||
\usepackage[thehwcnt = 3]{iidef}
|
||||
\usepackage{listings}
|
||||
\usepackage{fontspec}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{float}
|
||||
\usepackage{siunitx}
|
||||
|
||||
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
|
||||
\newfontfamily\cascadia{Cascadia Code}
|
||||
|
||||
\lstset{
|
||||
basicstyle = \small\codefont,
|
||||
% ---
|
||||
tabsize = 4,
|
||||
showstringspaces = false,
|
||||
numbers = left,
|
||||
numberstyle = \codefont,
|
||||
% ---
|
||||
breaklines = true,
|
||||
captionpos = t,
|
||||
% ---
|
||||
frame = l,
|
||||
flexiblecolumns,
|
||||
}
|
||||
|
||||
\lstdefinestyle{Python}{
|
||||
language = Python, % 语言选Python
|
||||
keywordstyle = \color{blue},
|
||||
keywordstyle = [2] \color{teal},
|
||||
stringstyle = \color{orange!80!black},
|
||||
commentstyle = \color{red},
|
||||
identifierstyle = \color{blue!80!white},
|
||||
}
|
||||
|
||||
\lstdefinestyle{Bash}{
|
||||
language = bash
|
||||
}
|
||||
|
||||
\thecourseinstitute{清华大学电子工程系}
|
||||
\thecoursename{\textbf{媒体与认知}}
|
||||
\theterm{2023-2024学年春季学期}
|
||||
\hwname{作业}
|
||||
\begin{document}
|
||||
\courseheader
|
||||
% 请在YOUR NAME处填写自己的姓名
|
||||
\name{高艺轩}
|
||||
\vspace{3mm}
|
||||
\centerline{\textbf{\Large{理论部分}}}
|
||||
|
||||
\section{单选题(15分)}
|
||||
% 请在?处填写答案
|
||||
\subsection{\underline{D}}
|
||||
|
||||
\subsection{\underline{C}}
|
||||
|
||||
\subsection{\underline{D}}
|
||||
|
||||
\subsection{\underline{D}}
|
||||
|
||||
\subsection{\underline{B}}
|
||||
|
||||
\section{计算题(15 分)}
|
||||
|
||||
|
||||
\subsection{给定两个类别的样本分别为:
|
||||
\begin{align*}
|
||||
&\omega_1:\{(3,1),(2,2),(4,3),(3,2)\} \\
|
||||
&\omega_2:\{(1,3),(1,2),(-1,1),(-1,2)\}
|
||||
\end{align*}
|
||||
试利用LDA,将样本特征维数压缩为一维。
|
||||
}
|
||||
|
||||
\begin{proof}[解]
|
||||
首先计算$\mu_1 = (3, 2), \mu_2 = (0, 2), \mu = (1.5, 2)$。因此
|
||||
\[S_1 = \frac{1}{4}
|
||||
\left(
|
||||
\begin{bmatrix}
|
||||
0 & 0\\
|
||||
0 & 1
|
||||
\end{bmatrix}
|
||||
+
|
||||
\begin{bmatrix}
|
||||
1 & 0\\
|
||||
0 & 0
|
||||
\end{bmatrix}
|
||||
+
|
||||
\begin{bmatrix}
|
||||
1 & 1\\
|
||||
1 & 1
|
||||
\end{bmatrix}
|
||||
+
|
||||
\begin{bmatrix}
|
||||
0 & 0\\
|
||||
0 & 0
|
||||
\end{bmatrix}
|
||||
\right)
|
||||
=
|
||||
\begin{bmatrix}
|
||||
0.5 & 0.25\\
|
||||
0.25 & 0.5
|
||||
\end{bmatrix}\]
|
||||
\[S_2 = \frac{1}{4}
\left(
\begin{bmatrix}
1 & 1\\
1 & 1
\end{bmatrix}
+
\begin{bmatrix}
1 & 0\\
0 & 0
\end{bmatrix}
+
\begin{bmatrix}
1 & 1\\
1 & 1
\end{bmatrix}
+
\begin{bmatrix}
1 & 0\\
0 & 0
\end{bmatrix}
\right)
=
\begin{bmatrix}
1 & 0.5\\
0.5 & 0.5
\end{bmatrix}\]
进一步地,
\[S_w = \frac{1}{2} (S_1 + S_2) =
\begin{bmatrix}
0.75 & 0.375\\
0.375 & 0.5
\end{bmatrix}\]
\[S_b = \frac{1}{2} \left(
\begin{bmatrix}
2.25 & 0\\
0 & 0
\end{bmatrix}
+
\begin{bmatrix}
2.25 & 0\\
0 & 0
\end{bmatrix}
\right)
=
\begin{bmatrix}
2.25 & 0\\
0 & 0
\end{bmatrix}\]
广义特征值分解($S_b v = \lambda S_w v$)得到$\lambda = 4.8$,$v = (0.8, -0.6)$。投影后的样本为
\[\omega_1: \left\{1.8, 0.4, 1.4, 1.2\right\}\]
\[\omega_2: \left\{-1.0, -0.4, -1.4, -2.0\right\}\]
|
||||
\end{proof}
|
||||
|
||||
|
||||
|
||||
\vspace{3mm}
|
||||
\subsection{模型训练通常需要大量的数据,假设某采集的数据集包含80\%的有效数据和20\%的无效数据。采用一种算法判断数据是否有效,其中无效数据被成功判别为无效数据的概率为90\%,而有效数据被误判为无效数据的概率为5\%。如果某条数据经过该算法被判别为无效数据,则根据贝叶斯定理,这条数据是无效数据的概率是多少?(提示:全概率公式$P(Y)=\sum^{N}_{i=1}P(Y|X_i)P(X_i)$)\\}
|
||||
|
||||
\begin{proof}[解]
|
||||
\begin{align*}
|
||||
& P(\text{无效数据} \mid \text{判定无效})\\
|
||||
= & \frac{P(\text{判定无效} \mid \text{无效数据})P(\text{无效数据})}{P(\text{判定无效} \mid \text{无效数据})P(\text{无效数据}) + P(\text{判定无效} \mid \text{有效数据})P(\text{有效数据})}\\
|
||||
= & \frac{0.9 \times 0.2}{0.9 \times 0.2 + 0.05 \times 0.8}\\
|
||||
= & \frac{0.18}{0.18 + 0.04}\\
|
||||
= & \frac{9}{11}
|
||||
\end{align*}
|
||||
\end{proof}
|
||||
|
||||
\vspace{3mm}
|
||||
\subsection{设有两类正态分布的样本集,第一类均值为$\mu_1=[2,-1]^T$,第二类均值为$\mu_2=[1,1]^T$。两类样本集的协方差矩阵和出现的先验概率都相等:$\Sigma_1=\Sigma_2=\Sigma=\left[ \begin{array}{cc}
|
||||
4 & 2 \\
|
||||
2 & \frac{4}{3}
|
||||
\end{array} \right]$,$p(\omega_1)=p(\omega_2)$。试计算分类界面,并对特征向量$x=[6,2]^T$分类。}
|
||||
|
||||
\begin{proof}[解]
|
||||
\[\Sigma^{-1} = \begin{bmatrix}
|
||||
1 & -1.5\\
|
||||
-1.5 & 3
|
||||
\end{bmatrix}\]
|
||||
决策方程
|
||||
\[g_{LDF1}(\boldsymbol{x}) = (\Sigma^{-1} \mu_1)^T \boldsymbol{x} - \frac{1}{2} \mu_1^T \Sigma^{-1} \mu_1 = (3.5, -6) \boldsymbol{x} - 6.5\]
类似地可以得到
\[g_{LDF2}(\boldsymbol{x}) = (\Sigma^{-1} \mu_2)^T \boldsymbol{x} - \frac{1}{2} \mu_2^T \Sigma^{-1} \mu_2 = (-0.5, 1.5) \boldsymbol{x} - 0.5\]
因此分类界面为
\begin{align*}
(3.5, -6) \boldsymbol{x} - 6.5 & = (-0.5, 1.5) \boldsymbol{x} - 0.5\\
(4, -7.5) \boldsymbol{x} & = 6
\end{align*}
对于$(6, 2)$,计算$g_{LDF1}((6, 2)) = 2.5$,$g_{LDF2}((6, 2)) = -0.5$,由于$g_{LDF1} > g_{LDF2}$,因此属于第一类。
|
||||
\end{proof}
|
||||
|
||||
\vspace{3mm}
|
||||
\subsection{给定异或的样本集$D=\left\{\left((0,0)^T,-1\right),\left((0,1)^T,1\right),\left((1,0)^T,1\right),\left((1,1)^T,-1\right)\right\}$该样本集是线性不可分的,可采用如下所示的多项式函数$\phi(\mathbf{x})$将样本$D=\left\{(\mathbf{x}_n,y_n)\right\}$映射为$D_\phi=\left\{(\phi(\mathbf{x}_n),y_n)\right\}$,其中$\phi(\mathbf{x})$满足
|
||||
\begin{equation*}
|
||||
\begin{aligned}
|
||||
\phi_1(\mathbf{x})&=2(x_1-0.5) \\
|
||||
\phi_2(\mathbf{x})&=4(x_1-0.5)(x_2-0.5)
|
||||
\end{aligned}
|
||||
\end{equation*}
|
||||
\\
|
||||
\qquad(1) 给出映射后的样本集;\\
|
||||
\qquad(2) 在映射后的样本集中,设计一个线性SVM分类器,给出支持向量及分类界面。
|
||||
}
|
||||
|
||||
\begin{proof}[解]
|
||||
映射后的样本集
|
||||
\[D_{\phi} = \left\lbrace\left((-1, 1)^T, -1\right), \left((-1, -1)^T, 1\right), \left((1, -1)^T, 1\right), \left((1, 1)^T, -1\right)\right\rbrace\]
|
||||
|
||||
待优化的对偶问题为
\[\max_{\boldsymbol{\alpha}} L(\boldsymbol{\alpha}) = \sum_{i = 1}^4 \alpha_i - \frac{1}{2} \sum_{i = 1}^4 \sum_{j = 1}^4 \alpha_i \alpha_j y_i y_j \boldsymbol{x}_i^T \boldsymbol{x}_j, \quad \text{s.t. } \alpha_i \geq 0,\ \sum_{i = 1}^4 \alpha_i y_i = 0\]
其中$\boldsymbol{x}_i$表示映射后的样本$\phi(\mathbf{x}_i)$。由于$\boldsymbol{x}_i^T \boldsymbol{x}_i = 2$,$y_1 y_3 \boldsymbol{x}_1^T \boldsymbol{x}_3 = y_2 y_4 \boldsymbol{x}_2^T \boldsymbol{x}_4 = 2$,其余$\boldsymbol{x}_i^T \boldsymbol{x}_j$均为0,因此
\begin{align*}
\frac{\partial L}{\partial \alpha_1} & = 1 - \sum_{j = 1}^4 \alpha_j y_1 y_j \boldsymbol{x}_1^T \boldsymbol{x}_j\\
& = 1 - 2 \alpha_1 - 2 \alpha_3\\
\frac{\partial L}{\partial \alpha_2} & = 1 - 2 \alpha_2 - 2 \alpha_4\\
\frac{\partial L}{\partial \alpha_3} & = 1 - 2 \alpha_1 - 2 \alpha_3\\
\frac{\partial L}{\partial \alpha_4} & = 1 - 2 \alpha_2 - 2 \alpha_4
\end{align*}
令四个偏导数均为0,并结合约束$\sum_{i} \alpha_i y_i = 0$,由对称性取$\alpha_1 = \alpha_2 = \alpha_3 = \alpha_4 = \frac{1}{4}$。全部的点均为支持向量。因此
\[\boldsymbol{w} = \sum_{i = 1}^4 \alpha_i y_i \boldsymbol{x}_i = \left(0, -1\right)^T\]

为求偏置量,代入$\boldsymbol{x}_1$:
\[(-1) (\boldsymbol{w}^T \boldsymbol{x}_1 + b) = 1\]
得到$b = 0$。

分类界面$\boldsymbol{w}^T \boldsymbol{x} + b = 0$,即
\[\begin{bmatrix}
0 & -1
\end{bmatrix} \boldsymbol{x} = 0\]
得到$\phi_2(\mathbf{x}) = 0$,因此在原空间中,分类界面为
\[4(x_1 - 0.5)(x_2 - 0.5) = 0\]
|
||||
|
||||
\end{proof}
|
||||
|
||||
|
||||
|
||||
\vspace{3mm}
|
||||
\subsection{使用KMeans算法对2维空间中的6个点$(0,2)$,$(2,0)$,$(2,3)$,$(3,2)$,$(4,0)$,$(5,4)$进行聚类,距离函数选择欧氏距离$d=\sqrt{(x_1-x_2)^2+(y_1-y_2)^2}$。\\
|
||||
\qquad (1)起始聚类中心选择(0,0)和(4,3),计算聚类中心;\\
|
||||
\qquad (2)起始聚类中心选择(1,4)和(3,1),计算聚类中心。\\
|
||||
}
|
||||
|
||||
\begin{proof}[解]
|
||||
中心选择$(0, 0), (4, 3)$,第一次分为$(0, 2), (2,0)$与$(2, 3), (3, 2), (4, 0), (5, 4)$,更新后的中心为$(1, 1)$与$\left(\frac{7}{2}, \frac{9}{4}\right)$。收敛。
|
||||
|
||||
中心选择$(1, 4)$与$(3, 1)$,第一次分为$(0, 2), (2, 3)$与$(2, 0), (4, 0), (3, 2), (5, 4)$,更新后中心为$(1, \frac{5}{2})$与$(\frac{7}{2}, \frac{3}{2})$,收敛。
|
||||
\end{proof}
|
||||
|
||||
\vspace{3mm}
|
||||
\centerline{\textbf{\Large{编程部分}}}
|
||||
|
||||
|
||||
\vspace{3mm}
|
||||
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题进度汇报”中的一项完成
|
||||
\section{编程作业报告}
|
||||
\subsection{程序验证}
|
||||
与助教给出的图片相比,我写出的程序PCA得到的结果的xy坐标都在$[-1, 1]$之间,不利于之后的分类。我将所有的PCA之后的坐标都扩大了20倍。
|
||||
|
||||
运行\lstinline{check.py}进行检查:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/check/check.png}
|
||||
\end{figure}
|
||||
|
||||
\subsection{数据预处理}
|
||||
运行
|
||||
\begin{lstlisting}[style=Bash]
|
||||
python data_preprocess.py
|
||||
\end{lstlisting}
|
||||
得到的输出为
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/preprocess/preprocess_train.png}
|
||||
\caption{训练集preprocess结果}
|
||||
\end{subfigure}
|
||||
\hspace{0.5cm}
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/preprocess/preprocess_val.png}
|
||||
\caption{验证集preprocess结果}
|
||||
\end{subfigure}\\[2ex]
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/preprocess/preprocess_test.png}
|
||||
\caption{测试集preprocess结果}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
|
||||
\subsection{训练、验证及测试}
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/default/loss.png}
|
||||
\end{subfigure}
|
||||
\hspace{0.5cm}
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/default/train_accu.png}
|
||||
\end{subfigure}\\[2ex]
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/default/sv.png}
|
||||
\end{subfigure}
|
||||
\hspace{0.5cm}
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/default/val.png}
|
||||
\end{subfigure}\\[2ex]
|
||||
\begin{subfigure}[t]{.8\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/default/test.png}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
|
||||
\subsection{调整正则化系数}
|
||||
\subsubsection{C = \num{1e-6}}
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/1e-6/loss.png}
|
||||
\end{subfigure}
|
||||
\hspace{0.5cm}
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/1e-6/accu.png}
|
||||
\end{subfigure}\\[2ex]
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/1e-6/sv.png}
|
||||
\end{subfigure}
|
||||
\hspace{0.5cm}
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/1e-6/val.png}
|
||||
\end{subfigure}\\[2ex]
|
||||
\begin{subfigure}[t]{.8\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/1e-6/test.png}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
可以看到出现了严重的欠拟合,分类界面超出了绘图的范围。这是因为C过小,导致不能正确地分辨合适的分类界面。
|
||||
|
||||
\subsubsection{C = 1}
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/1/loss.png}
|
||||
\end{subfigure}
|
||||
\hspace{0.5cm}
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/1/accu.png}
|
||||
\end{subfigure}\\[2ex]
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/1/sv.png}
|
||||
\end{subfigure}
|
||||
\hspace{0.5cm}
|
||||
\begin{subfigure}[t]{.45\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/1/val.png}
|
||||
\end{subfigure}\\[2ex]
|
||||
\begin{subfigure}[t]{.8\linewidth}
|
||||
\includegraphics[width=\textwidth]{img/train/1/test.png}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
发生了过拟合,直线被交界面的点限制,斜率不是最优。
|
||||
|
||||
\end{document}
|
||||
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: t
|
||||
%%% End:
|
||||
163
hw4/code/attnvis.ipynb
Normal file
13426
hw4/code/data/quansongci/data.json
Normal file
22640
hw4/code/data/quansongci/train.json
Normal file
11904
hw4/code/data/quansongci/val.json
Normal file
2
hw4/code/data/vis/vis_1.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
+++如梦令
|
||||
昨夜雨疏风骤。浓睡不消残酒。试问卷帘人,却道海棠依旧。知否。知否。应是绿肥红瘦。
|
||||
3
hw4/code/data/vis/vis_2.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
+++鹧鸪天(秋思)
|
||||
红蓼烟收尽暮鸦。秋风吹叶荻花秋。断肠天色连云散,黄叶荻花秋水流。
|
||||
楼上角,笛声悠。兴王莫上叹人头。明朝归去无消息,只有当时一望流。
|
||||
75
hw4/code/dataset.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import torch
|
||||
from torch.utils.data import Dataset
|
||||
import numpy as np
|
||||
import os
|
||||
import json
|
||||
|
||||
class LMDataset(Dataset):
    """Language-modeling dataset read from `<data_dir>/<split>.json`.

    The JSON file must contain the keys 'data', 'stoi', 'itos' and
    'vocab_size'; each is exposed as an attribute of the same name.
    """

    def __init__(self, data_dir, split):
        super().__init__()
        # locate and parse the metadata file for the requested split
        json_path = os.path.join(data_dir, '%s.json' % split)
        with open(json_path, 'r', encoding='utf-8') as fp:
            payload = json.load(fp)

        self.data = payload['data']              # list of samples
        self.stoi = payload['stoi']              # character -> integer
        self.itos = payload['itos']              # string of integer -> character
        self.vocab_size = payload['vocab_size']  # vocab size

    def __len__(self):
        """Number of samples in this split."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at `index`, unmodified."""
        return self.data[index]
|
||||
|
||||
class Converter:
    '''
    Convert strings to integer sequences and back.

    Index 0 is used as the padding token '<pad>' and index 1 as the
    end-of-sequence token '<eos>'.
    '''
    def __init__(self, stoi, itos):
        self.stoi = stoi  # a dict that maps character to integer
        self.itos = itos  # a dict that maps string of integer to character

    def single_encode(self, s):
        '''
        Encode one string into an int64 numpy array, one entry per character.
        Raises KeyError if a character is missing from `stoi`.
        '''
        return np.array([self.stoi[ch] for ch in s], dtype=np.int64)

    def single_decode(self, l):
        '''
        Decode one integer sequence into a string, stopping at the first
        end-of-sequence index (1). `itos` is keyed by the string form of
        each integer.
        '''
        chars = []
        for i in l:
            # everything from <eos> onwards is ignored
            if i == 1:
                break
            chars.append(self.itos[str(i)])
        return ''.join(chars)

    def encode(self, data):
        '''
        Encode a list of strings into a pair of LongTensors (x, y).

        Every string is terminated with <eos> (1) and right-padded with
        <pad> (0) up to the longest string in the batch. `x` is that padded
        matrix without its last column and `y` is the same matrix without its
        first column, i.e. `y` is `x` shifted left by one position.
        An empty `data` list yields two tensors of shape (0, 0).
        '''
        # default=0 keeps an empty batch from crashing in max()
        max_len = max((len(s) for s in data), default=0)
        out = np.zeros((len(data), max_len + 1), dtype=np.int64)
        for row, s in enumerate(data):
            out[row, :len(s)] = self.single_encode(s)
            out[row, len(s)] = 1
        x = torch.from_numpy(out[:, :-1])
        y = torch.from_numpy(out[:, 1:])
        return x, y

    def decode(self, data):
        '''
        Decode a batch tensor of integer sequences into a list of strings.
        '''
        rows = data.cpu().numpy().astype(np.int64)
        return [self.single_decode(rows[i]) for i in range(len(rows))]
|
||||
356
hw4/code/model.py
Normal file
@@ -0,0 +1,356 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 4 Sequence Modeling
|
||||
# model.py - Model definition
|
||||
# Student ID: 2022010639
|
||||
# Name: Yixuan Gao
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
|
||||
# Import required libraries
|
||||
############################################################
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import functional as F
|
||||
import numpy as np
|
||||
|
||||
############################################################
|
||||
|
||||
# Define the GELU activation function used in OpenAI GPT
|
||||
############################################################
|
||||
def gelu(z):
    """
    Tanh approximation of the Gaussian Error Linear Unit.
    Reference: GELU paper, https://arxiv.org/abs/1606.08415
    gelu(z) = 0.5z(1 + tanh[(2/π)^(1/2) * (z + 0.044715 z^3)])
    """
    scale = math.sqrt(2.0 / math.pi)
    inner = scale * (z + 0.044715 * torch.pow(z, 3.0))
    return 0.5 * z * (1.0 + torch.tanh(inner))
|
||||
|
||||
############################################################
|
||||
|
||||
# Define the Multi-Head SelfAttention module
|
||||
############################################################
|
||||
class SelfAttention(nn.Module):
    """
    Causal multi-head self-attention.

    embed_dim: size of the input/output feature dimension
    num_head:  number of attention heads; must divide embed_dim
    dropout:   dropout ratio applied to the attention weights and to the output

    forward(x) takes x of shape (batch_size, seq_len, embed_dim) and returns
    `(result, attn)` where `result` has the same shape as `x` and `attn` holds
    the (post-dropout) attention weights with shape
    (batch_size, num_head, seq_len, seq_len).
    """

    def __init__(self, embed_dim, num_head, dropout):
        super().__init__()

        # fail early with a clear message instead of an obscure reshape error in forward()
        if embed_dim % num_head != 0:
            raise ValueError(
                "embed_dim (%d) must be divisible by num_head (%d)" % (embed_dim, num_head)
            )

        # three separate linear layers producing q, k, v
        self.q_layer = nn.Linear(embed_dim, embed_dim)
        self.k_layer = nn.Linear(embed_dim, embed_dim)
        self.v_layer = nn.Linear(embed_dim, embed_dim)

        # output projection applied after the heads are concatenated
        self.proj_layer = nn.Linear(embed_dim, embed_dim)

        # dropout on the attention weights and on the projected output
        self.attn_drop = nn.Dropout(dropout)
        self.proj_drop = nn.Dropout(dropout)

        self.num_head = num_head
        self.head_dim = embed_dim // num_head

    def forward(self, x):
        batch_size, seq_len, dim = x.shape

        def split_heads(t):
            # (batch_size, seq_len, num_head * head_dim) -> (batch_size, num_head, seq_len, head_dim)
            return t.reshape(batch_size, seq_len, self.num_head, self.head_dim).transpose(1, 2)

        q = split_heads(self.q_layer(x))
        k = split_heads(self.k_layer(x))
        v = split_heads(self.v_layer(x))

        # scaled dot-product scores: (batch_size, num_head, seq_len, seq_len)
        attn = (q @ k.transpose(2, 3)) / math.sqrt(self.head_dim)

        # causal mask: True strictly above the main diagonal, so position i
        # cannot attend to any position j > i
        attn_mask = torch.triu(torch.ones(seq_len, seq_len, device=x.device), diagonal=1).bool()
        attn = attn.masked_fill(attn_mask, float('-inf'))

        # normalise over the key dimension, then apply dropout
        attn = torch.softmax(attn, dim=-1)
        attn = self.attn_drop(attn)

        # weighted sum of values: (batch_size, num_head, seq_len, head_dim)
        out = attn @ v

        # concatenate the heads back to (batch_size, seq_len, num_head * head_dim)
        out = out.transpose(1, 2).reshape(batch_size, seq_len, self.num_head * self.head_dim)

        # final projection and dropout
        result = self.proj_drop(self.proj_layer(out))

        # return the final results `result` and attention weights `attn`
        return result, attn
|
||||
|
||||
############################################################
|
||||
|
||||
# Define the feed forward network (FFN)
|
||||
############################################################
|
||||
class FFN(nn.Module):
    """Feed-forward network: Linear -> GELU -> Linear -> Dropout."""

    def __init__(self, embed_dim, feedforward_dim, dropout):
        super().__init__()
        # expand to feedforward_dim, then project back to embed_dim
        self.fc1 = nn.Linear(embed_dim, feedforward_dim)
        self.fc2 = nn.Linear(feedforward_dim, embed_dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        hidden = gelu(self.fc1(x))
        return self.drop(self.fc2(hidden))
|
||||
############################################################
|
||||
|
||||
# Define the TransformerLayer
|
||||
############################################################
|
||||
class TransformerLayer(nn.Module):
    """
    One pre-norm transformer layer: LayerNorm -> SelfAttention, then
    LayerNorm -> FFN. Both sub-blocks are wrapped in a residual connection
    unless `no_res` is True.
    """

    def __init__(self, embed_dim, num_head, feedforward_dim, dropout, no_res):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = SelfAttention(embed_dim, num_head, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, feedforward_dim, dropout)
        self.no_res = no_res  # True disables the residual connections

    def forward(self, x):
        # attention sub-block on the normalised input
        x_attn, attn = self.attn(self.norm1(x))
        if not self.no_res:
            x_attn = x_attn + x

        # feed-forward sub-block on the normalised attention output
        x_ffn = self.ffn(self.norm2(x_attn))
        out = x_ffn if self.no_res else x_attn + x_ffn

        # layer output together with this layer's attention weights
        return out, attn
|
||||
############################################################
|
||||
|
||||
# Define the GPT module
|
||||
############################################################
|
||||
class GPT(nn.Module):
    """
    GPT-style language model: token (+ position) embeddings, a stack of
    TransformerLayer modules, a final LayerNorm and a linear head whose
    weight is tied to the token embedding.
    """

    def __init__(self, vocab_size, max_seq_len, num_layer, embed_dim, num_head, feedforward_dim, dropout, no_res=False, no_pos=False):
        '''
        vocab_size: the size of vocabulary
        max_seq_len: the maximum length of input texts
        num_layer: the number of transformer layers
        embed_dim: the embedding dimension
        num_head: the number of heads in Multi-Head Self Attention
        feedforward_dim: the dimension in the feed forward network
        dropout: dropout ratio
        no_res: if True, residual connections are NOT used in transformer layers
        no_pos: if True, position embeddings are NOT added to the token embeddings
        '''
        super().__init__()
        self.num_layer = num_layer
        self.max_seq_len = max_seq_len
        self.no_pos = no_pos

        # Define Embedding Layer to transfer input text tokens and positions to embeddings
        self.word_token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.word_pos_embedding = nn.Embedding(max_seq_len, embed_dim)

        self.drop = nn.Dropout(dropout)
        # Define the transformer layers
        self.transformer = nn.ModuleList([TransformerLayer(embed_dim, num_head, feedforward_dim, dropout, no_res) for _ in range(num_layer)])

        # Define the head layer to predict output
        self.norm = nn.LayerNorm(embed_dim)
        self.language_model_head = nn.Linear(embed_dim, vocab_size, bias=False)

        """
        Weight tying improves the performance of language models by tying (sharing) the weights of the embedding and softmax layers.
        Reference: https://paperswithcode.com/method/weight-tying
        """
        self.word_token_embedding.weight = self.language_model_head.weight

        self.init_weights()

    def init_weights(self):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero the Linear biases."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)

        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('proj_layer.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * self.num_layer))

    def forward(self, word_idx, targets=None):
        """
        word_idx: LongTensor of token indices with shape (batch_size, seq_len)
        targets:  optional LongTensor of target indices with the same shape;
                  positions whose target is 0 are ignored by the loss

        Returns `(logits, loss)` when `targets` is given, otherwise
        `(logits, attention_weights)` where `attention_weights` is a list
        holding the attention weights of every transformer layer.
        """
        batch_size, seq_len = word_idx.shape

        # token embeddings, plus position embeddings for [0, 1, ..., seq_len-1] unless disabled
        x = self.word_token_embedding(word_idx)
        if not self.no_pos:
            pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)
            x = x + self.word_pos_embedding(pos)

        # apply dropout to the input embeddings
        x = self.drop(x)

        # run the transformer layers, collecting the attention weights of each one
        attention_weights = list()
        for layer in self.transformer:
            x, attn = layer(x)
            attention_weights.append(attn)

        # final normalisation and projection to vocabulary logits
        # Note: no softmax here since it is included in the cross entropy loss function
        x = self.norm(x)
        logits = self.language_model_head(x)

        # return logits and loss or attention weights
        if targets is not None:
            loss = torch.nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=0)
            return logits, loss
        assert isinstance(attention_weights, list), "attention_weights must be a list, please check whether to append the attention weights of all transformer layers into it!"
        return logits, attention_weights

    def configure_optimizers(self, weight_decay):
        """
        Separate all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        Returns the list of two optimizer parameter groups (dicts with "params" and
        "weight_decay"); the caller is expected to pass it to an optimizer.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (nn.Linear, )
        blacklist_weight_modules = (nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
                # because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # subtle: 'word_token_embedding.weight' and 'language_model_head.weight' are tied,
        # so they land in the no_decay and decay sets respectively after the loop above.
        # named_parameters() does not return duplicates, so the shared tensor only shows up
        # in param_dict below under 'word_token_embedding.weight'. Remove the head's name
        # from the decay set: the tensor is then optimized once, via the embedding name,
        # and is not decayed.
        decay.remove('language_model_head.weight')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # build the parameter groups (sorted for a deterministic order)
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        return optim_groups

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        Returns the full sequence (prompt + generated tokens) as a numpy array,
        with size-1 dimensions squeezed out.
        """
        for _ in range(max_new_tokens):
            # the position embedding table only has max_seq_len rows, so once the
            # running sequence grows beyond that, condition on the last max_seq_len tokens only
            idx_cond = idx if idx.size(1) <= self.max_seq_len else idx[:, -self.max_seq_len:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx.squeeze().cpu().numpy()
|
||||
############################################################
|
||||
|
||||
# Architecture presets keyed by model name; each value is a dict of
# hyperparameters. The feed-forward width is always 4x the embedding width.
GPTConfig = {
    'mygpt': {
        'num_layer': 4,
        'embed_dim': 128,
        'num_head': 4,
        'feedforward_dim': 4 * 128,
        'dropout': 0.0,
    },
    'gpt2-mini': {
        'num_layer': 6,
        'embed_dim': 384,
        'num_head': 6,
        'feedforward_dim': 4 * 384,
        'dropout': 0.2,
    },
    'gpt2': {
        'num_layer': 12,
        'embed_dim': 768,
        'num_head': 12,
        'feedforward_dim': 4 * 768,
        'dropout': 0.2,
    },
}
|
||||
61
hw4/code/prepare.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Prepare the dataset for character-level language modeling.
|
||||
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
|
||||
"""
|
||||
import os
|
||||
import numpy as np
|
||||
import argparse
|
||||
import json
|
||||
|
||||
# command-line options
parser = argparse.ArgumentParser()
parser.add_argument('--data_root', type=str, default='data/quansongci', help='data directory')
args = parser.parse_args()

# the raw corpus is read from <data_root>/data.json, under its 'data' key
input_file_path = os.path.join(args.data_root, 'data.json')
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)['data']
print(f"length of dataset: {len(data):,}")

# every distinct character that occurs anywhere in the corpus, in sorted order
chars = sorted(set(''.join(data)))
vocab_size = len(chars) + 2 # two extra ids are reserved for <pad> and <eos>
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# character <-> integer lookup tables; characters start at id 2 because
# ids 0 and 1 are taken by the special tokens appended below
stoi = {}
itos = {}
for token_id, ch in enumerate(chars, start=2):
    stoi[ch] = token_id
    itos[token_id] = ch
for token_id, special in enumerate(('<pad>', '<eos>')):
    stoi[special] = token_id
    itos[token_id] = special

# split the samples 90% / 10% into train and validation sets
n = len(data)
split_at = int(n*0.9)
train_data = data[:split_at]
val_data = data[split_at:]
print(f"train has {len(train_data):,} samples")
print(f"val has {len(val_data):,} samples")

# write each split together with the vocabulary metadata, so that the
# encode/decode tables travel with the data
for file_name, split_data in (('train.json', train_data), ('val.json', val_data)):
    meta = {
        'data': split_data,
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }
    with open(os.path.join(args.data_root, file_name), 'w', encoding='utf-8') as f:
        json.dump(meta, f, ensure_ascii=False, indent=4)
|
||||
|
||||
76
hw4/code/sample.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""
|
||||
Sample from a trained model
|
||||
"""
|
||||
import os
|
||||
import pickle
|
||||
from contextlib import nullcontext
|
||||
import torch
|
||||
from model import GPTConfig, GPT
|
||||
import argparse
|
||||
from dataset import Converter, LMDataset
|
||||
|
||||
def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root, device):
    """
    Load the best checkpoint and print text sampled from it.

    Args:
        start: prompt string that every sample is conditioned on.
        num_samples: how many samples to generate and print.
        max_new_tokens: number of tokens generated after the prompt.
        model_name: key into GPTConfig, used when the checkpoint carries no
            'model_args' entry.
        ckpt_path: directory that contains 'best.pth'.
        data_root: data directory handed to LMDataset (its stoi/itos tables
            are used for encoding and decoding).
        device: torch device string, e.g. 'cpu', 'cuda', 'cuda:0' or 'mps'.
    """
    dataset = LMDataset(data_root, 'train')
    converter = Converter(dataset.stoi, dataset.itos)
    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability

    # mixed-precision context
    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    # torch.autocast wants the bare device type ('cuda'), not an indexed
    # device string such as 'cuda:0'
    device_type = torch.device(device).type
    if device_type in ('cpu', 'mps'):
        ctx = nullcontext()
    else:
        ctx = torch.autocast(device_type=device_type, dtype=ptdtype)

    # init from a model saved in a specific directory
    ckpt_path = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_path)
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust
    checkpoint = torch.load(ckpt_path, map_location=device)
    # prefer the architecture recorded in the checkpoint over the named preset
    gptconf = checkpoint['model_args'] if 'model_args' in checkpoint else GPTConfig[model_name]
    model = GPT(**gptconf)
    model.load_state_dict(checkpoint['state_dict'])

    model.eval()
    model.to(device)

    # encode the beginning of the prompt and add a leading batch dimension
    start_ids = converter.single_encode(start)
    x = torch.from_numpy(start_ids)[None, ...].to(device).long()

    # run generation
    with torch.no_grad(), ctx:
        for _ in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(converter.single_decode(y))
            print('---------------')
|
||||
|
||||
if __name__ == '__main__':

    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

    # set configurations of the model and sampling process
    parser = argparse.ArgumentParser()
    parser.add_argument('--start', type=str, default='+++', help='start of the sample, e.g. "+++" or "+++清平乐"')
    # the default is an int, matching the declared type
    parser.add_argument('--num_samples', type=int, default=10, help='the number of samples')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to load checkpoints')
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--device', type=str, help='cpu or cuda')

    opt = parser.parse_args()
    # fall back to CUDA when available, otherwise CPU
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # NOTE(review): 128 presumably mirrors the max_seq_len the model was
    # trained with, so prompt + generated tokens fill one context -- confirm.
    sample(opt.start, opt.num_samples, 128-len(opt.start), opt.model_name, opt.ckpt_path, opt.data_root, opt.device)
|
||||
219
hw4/code/train.py
Normal file
@@ -0,0 +1,219 @@
|
||||
import os
|
||||
import time
|
||||
import math
|
||||
import pickle
|
||||
from contextlib import nullcontext
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from model import GPT, GPTConfig
|
||||
from dataset import LMDataset, Converter
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# learning rate decay scheduler (cosine with warmup)
|
||||
def get_lr(it, learning_rate, min_lr=1e-4, warmup_iters=100, lr_decay_iters=6000):
    """
    Learning rate for iteration `it`: linear warmup followed by cosine decay.

    Args:
        it: current iteration index.
        learning_rate: peak learning rate, reached at the end of warmup.
        min_lr: learning rate returned once the decay has finished.
        warmup_iters: number of linear warmup iterations.
        lr_decay_iters: iteration at which the schedule reaches min_lr.

    Returns:
        The learning rate to use at iteration `it`.
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) once lr_decay_iters is reached, return min learning rate.
    #    `>=` instead of `>`: at it == lr_decay_iters the cosine term is zero
    #    anyway, and this avoids a 0/0 below when lr_decay_iters == warmup_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)
|
||||
|
||||
def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, device='cpu', no_res=False, no_pos=False):
    """
    Train a GPT model from scratch, validating and checkpointing periodically.

    Args:
        data_root: data directory handed to LMDataset for the 'train' and 'val' splits.
        model_name: key into GPTConfig selecting the architecture preset.
        batch_size: training batch size (cast to int).
        n_iters: total number of optimizer steps to run.
        ckpt_path: directory receiving 'latest.pth', 'best.pth' and the loss plot.
        val_interval: validate and save a checkpoint every this many iterations.
        device: torch device string, e.g. 'cpu', 'cuda', 'cuda:0' or 'mps'.
        no_res: forwarded to GPT as its `no_res` argument.
        no_pos: forwarded to GPT as its `no_pos` argument.
    """
    train_dataset = LMDataset(data_root, 'train')
    val_dataset = LMDataset(data_root, 'val')
    train_loader = DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    converter = Converter(train_dataset.stoi, train_dataset.itos)

    # adamw optimizer
    learning_rate = 5e-3 # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.99
    grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0

    # system
    # torch.autocast wants the bare device type ('cuda'), not 'cuda:0'
    device_type = torch.device(device).type
    dtype = 'bfloat16' if device_type == 'cpu' else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    if device_type in ('cpu', 'mps'):
        ctx = nullcontext()
    else:
        ctx = torch.autocast(device_type=device_type, dtype=ptdtype)
    best_val_loss = 1e9
    iter_num = 0 # number of iterations in the lifetime of this process

    # model init -- copy the preset so the shared GPTConfig entry is not mutated
    model_args = dict(GPTConfig[model_name])
    model_args['vocab_size'] = train_dataset.vocab_size
    model_args['max_seq_len'] = 128
    model_args['no_res'] = no_res
    model_args['no_pos'] = no_pos

    # init a new model from scratch
    print("Initializing a new model from scratch")
    model = GPT(**model_args)
    model.to(device)

    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

    # optimizer
    optim_groups = model.configure_optimizers(weight_decay)
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))

    print('training...')
    # enough passes over the training set to cover n_iters optimizer steps
    epoch_num = int(np.ceil(n_iters * int(batch_size) / float(len(train_dataset))))
    t0 = time.time()
    model.train()
    train_losses = []
    val_losses = []
    for _ in range(epoch_num):
        for inputs in train_loader:
            if iter_num >= n_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)

            # set the scheduled learning rate for this step
            lr = get_lr(iter_num, learning_rate, lr_decay_iters=n_iters)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # forward pass under the (optional) autocast context
            with ctx:
                _, loss = model(X, Y)

            # backward pass, with gradient scaling if training in fp16
            scaler.scale(loss).backward()
            # clip the gradient
            if grad_clip != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            # step the optimizer and scaler if training in fp16
            scaler.step(optimizer)
            scaler.update()
            # flush the gradients as soon as we can, no need for this memory anymore
            optimizer.zero_grad(set_to_none=True)

            iter_num += 1
            lossf = loss.item()
            train_losses.append(lossf)

            # evaluate the loss on the val set and write checkpoints
            if iter_num % val_interval == 0:
                # timing and logging
                t1 = time.time()
                dt = t1 - t0
                t0 = t1
                print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
                losses = estimate_loss(model, val_loader, converter, ctx, device)
                val_losses.append(losses['val'])
                print(f"iter {iter_num}: val loss {losses['val']:.4f}")

                # update the running best *before* building the checkpoint so
                # that the stored 'best_val_loss' includes this evaluation
                improved = losses['val'] < best_val_loss
                if improved:
                    best_val_loss = losses['val']

                print(f"saving latest checkpoint to {ckpt_path}")
                checkpoint = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                }
                torch.save(checkpoint, os.path.join(ckpt_path, 'latest.pth'))

                if improved:
                    print(f"saving best checkpoint to {ckpt_path}")
                    torch.save(checkpoint, os.path.join(ckpt_path, 'best.pth'))

    plot(n_iters, train_losses, val_losses, val_interval, ckpt_path)
|
||||
|
||||
def plot(n_iters, train_losses, val_losses, val_interval, ckpt_path):
    """
    Plot training/validation loss and perplexity, save the figure, then show it.

    Args:
        n_iters: total number of training iterations (unused; kept so existing
            callers keep working).
        train_losses: one training loss per iteration.
        val_losses: one validation loss per validation run.
        val_interval: number of iterations between validation runs.
        ckpt_path: directory where 'loss&perplexity.jpg' is written.
    """
    # create a plot
    f, ax = plt.subplots(1,2,figsize=(18,6))
    # The k-th validation run happens after iteration k*val_interval, i.e. at
    # 0-based index k*val_interval - 1 on the train_losses axis. Deriving the
    # x values from len(val_losses) keeps x and y the same length even when
    # n_iters is not a multiple of val_interval.
    val_iters = val_interval * np.arange(1, len(val_losses) + 1) - 1

    # draw loss
    ax[0].plot(train_losses)
    ax[0].plot(val_iters, val_losses, 'r')

    # set labels
    ax[0].set_xlabel('training iters')
    ax[0].legend(['training loss', 'validation loss'])

    # perplexity is exp(loss)
    train_perplexity = [np.exp(x) for x in train_losses]
    val_perplexity = [np.exp(x) for x in val_losses]
    # draw perplexity
    ax[1].plot(train_perplexity)
    ax[1].plot(val_iters, val_perplexity, 'r')

    # set labels
    ax[1].set_xlabel('training iters')
    ax[1].legend(['training perplexity', 'validation perplexity'])
    plt.tight_layout()

    # save the image, then show it
    plt.savefig(os.path.join(ckpt_path, 'loss&perplexity.jpg'), dpi=300)
    plt.show()
|
||||
|
||||
# helps estimate an arbitrarily accurate loss over either split using many batches
|
||||
@torch.no_grad()
def estimate_loss(model, val_loader, converter, ctx, device, max_iters=100):
    """
    Average the model's loss over at most `max_iters` batches of `val_loader`.

    The model is switched to eval mode for the measurement and back to train
    mode before returning.

    Args:
        model: callable returning (logits, loss) for a pair (X, Y); must
            provide eval() and train().
        val_loader: iterable of raw batches accepted by converter.encode.
        converter: object whose encode(inputs) returns the tensors (X, Y).
        ctx: context manager wrapped around the forward pass.
        device: device the encoded tensors are moved to.
        max_iters: upper bound on the number of batches evaluated.

    Returns:
        dict with key 'val' holding the mean loss over the evaluated batches,
        or NaN when no batch was evaluated.
    """
    out = {}
    model.eval()
    total_loss = 0.0
    n_batches = 0
    for inputs in val_loader:
        if n_batches >= max_iters:
            break
        n_batches += 1
        X, Y = converter.encode(inputs)
        X, Y = X.to(device), Y.to(device)
        with ctx:
            _, loss = model(X, Y)
        total_loss += loss.item()
    # divide by the number of batches actually seen, not by max_iters, so the
    # mean stays correct when the loader yields fewer than max_iters batches
    out['val'] = total_loss / n_batches if n_batches > 0 else float('nan')
    model.train()
    return out
|
||||
|
||||
if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

    # set configurations of the model and training process
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    # the model is always initialised from scratch, so this names a config preset
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model configuration')
    # --iters counts optimizer steps, not epochs
    parser.add_argument('--iters', type=int, default=1000, help='number of training iterations')
    parser.add_argument('--batchsize', type=int, default=16, help='training batch size')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to save checkpoints')
    parser.add_argument('--val_interval', type=int, default=20, help='iter intervals of validation')
    # both flags are store_true switches that turn the feature OFF
    parser.add_argument('--no_res', action='store_true', help='disable residual connections')
    parser.add_argument('--no_pos', action='store_true', help='disable positional encoding')
    parser.add_argument('--device', type=str, help='cpu or cuda')

    opt = parser.parse_args()
    # fall back to CUDA when available, otherwise CPU
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    os.makedirs(opt.ckpt_path, exist_ok=True)
    train(opt.data_root, opt.model_name, opt.batchsize, opt.iters, opt.ckpt_path, opt.val_interval, opt.device, opt.no_res, opt.no_pos)
|
||||
|
||||
|
||||
132
hw4/report/dtx-style.sty
Normal file
@@ -0,0 +1,132 @@
|
||||
%%
|
||||
%% This is file `dtx-style.sty',
|
||||
%% generated with the docstrip utility.
|
||||
%%
|
||||
%% The original source files were:
|
||||
%%
|
||||
%% thucoursework.dtx (with options: `dtx-style')
|
||||
%%
|
||||
%% This is a generated file.
|
||||
%%
|
||||
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
|
||||
%%
|
||||
%% This work may be distributed and/or modified under the
|
||||
%% conditions of the LaTeX Project Public License, either version 1.3
|
||||
%% of this license or (at your option) any later version.
|
||||
%% The latest version of this license is in
|
||||
%% http://www.latex-project.org/lppl.txt
|
||||
%% and version 1.3 or later is part of all distributions of LaTeX
|
||||
%% version 2005/12/01 or later.
|
||||
%%
|
||||
%% To produce the documentation run the original source files ending with `.dtx'
|
||||
%% through LaTeX.
|
||||
%%
|
||||
|
||||
\ProvidesPackage{dtx-style}
|
||||
\RequirePackage{hypdoc}
|
||||
\RequirePackage[UTF8,scheme=chinese]{ctex}
|
||||
\RequirePackage{newpxtext}
|
||||
\RequirePackage{newpxmath}
|
||||
\RequirePackage[
|
||||
top=2.5cm, bottom=2.5cm,
|
||||
left=4cm, right=2cm,
|
||||
headsep=3mm]{geometry}
|
||||
\RequirePackage{array,longtable,booktabs}
|
||||
\RequirePackage{listings}
|
||||
\RequirePackage{fancyhdr}
|
||||
\RequirePackage{xcolor}
|
||||
\RequirePackage{enumitem}
|
||||
\RequirePackage{etoolbox}
|
||||
\RequirePackage{metalogo}
|
||||
|
||||
\colorlet{thu@macro}{blue!60!black}
|
||||
\colorlet{thu@env}{blue!70!black}
|
||||
\colorlet{thu@option}{purple}
|
||||
\patchcmd{\PrintMacroName}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
|
||||
\patchcmd{\PrintDescribeMacro}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
|
||||
\patchcmd{\PrintDescribeEnv}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
|
||||
\patchcmd{\PrintEnvName}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
|
||||
|
||||
\def\DescribeOption{%
|
||||
\leavevmode\@bsphack\begingroup\MakePrivateLetters%
|
||||
\Describe@Option}
|
||||
\def\Describe@Option#1{\endgroup
|
||||
\marginpar{\raggedleft\PrintDescribeOption{#1}}%
|
||||
\thu@special@index{option}{#1}\@esphack\ignorespaces}
|
||||
\def\PrintDescribeOption#1{\strut \MacroFont\bfseries\sffamily\color{thu@option} #1\ }
|
||||
\def\thu@special@index#1#2{\@bsphack
|
||||
\begingroup
|
||||
\HD@target
|
||||
\let\HDorg@encapchar\encapchar
|
||||
\edef\encapchar usage{%
|
||||
\HDorg@encapchar hdclindex{\the\c@HD@hypercount}{usage}%
|
||||
}%
|
||||
\index{#2\actualchar{\string\ttfamily\space#2}
|
||||
(#1)\encapchar usage}%
|
||||
\index{#1:\levelchar#2\actualchar
|
||||
{\string\ttfamily\space#2}\encapchar usage}%
|
||||
\endgroup
|
||||
\@esphack}
|
||||
|
||||
\lstdefinestyle{lstStyleBase}{%
|
||||
basicstyle=\small\ttfamily,
|
||||
aboveskip=\medskipamount,
|
||||
belowskip=\medskipamount,
|
||||
lineskip=0pt,
|
||||
boxpos=c,
|
||||
showlines=false,
|
||||
extendedchars=true,
|
||||
upquote=true,
|
||||
tabsize=2,
|
||||
showtabs=false,
|
||||
showspaces=false,
|
||||
showstringspaces=false,
|
||||
numbers=none,
|
||||
linewidth=\linewidth,
|
||||
xleftmargin=4pt,
|
||||
xrightmargin=0pt,
|
||||
resetmargins=false,
|
||||
breaklines=true,
|
||||
breakatwhitespace=false,
|
||||
breakindent=0pt,
|
||||
breakautoindent=true,
|
||||
columns=flexible,
|
||||
keepspaces=true,
|
||||
gobble=2,
|
||||
framesep=3pt,
|
||||
rulesep=1pt,
|
||||
framerule=1pt,
|
||||
backgroundcolor=\color{gray!5},
|
||||
stringstyle=\color{green!40!black!100},
|
||||
keywordstyle=\bfseries\color{blue!50!black},
|
||||
commentstyle=\slshape\color{black!60}}
|
||||
|
||||
\lstdefinestyle{lstStyleShell}{%
|
||||
style=lstStyleBase,
|
||||
frame=l,
|
||||
rulecolor=\color{purple},
|
||||
language=bash}
|
||||
|
||||
\lstdefinestyle{lstStyleLaTeX}{%
|
||||
style=lstStyleBase,
|
||||
frame=l,
|
||||
rulecolor=\color{violet},
|
||||
language=[LaTeX]TeX}
|
||||
|
||||
\lstnewenvironment{latex}{\lstset{style=lstStyleLaTeX}}{}
|
||||
\lstnewenvironment{shell}{\lstset{style=lstStyleShell}}{}
|
||||
|
||||
\setlist{nosep}
|
||||
|
||||
\DeclareDocumentCommand{\option}{m}{\textsf{#1}}
|
||||
\DeclareDocumentCommand{\env}{m}{\texttt{#1}}
|
||||
\DeclareDocumentCommand{\pkg}{s m}{%
|
||||
\texttt{#2}\IfBooleanF#1{\thu@special@index{package}{#2}}}
|
||||
\DeclareDocumentCommand{\file}{s m}{%
|
||||
\texttt{#2}\IfBooleanF#1{\thu@special@index{file}{#2}}}
|
||||
\newcommand{\myentry}[1]{%
|
||||
\marginpar{\raggedleft\color{purple}\bfseries\strut #1}}
|
||||
\newcommand{\note}[2][Note]{{%
|
||||
\color{magenta}{\bfseries #1}\emph{#2}}}
|
||||
|
||||
\def\thucoursework{\textsc{Thu}\-\textsc{Coursework}}
|
||||
153
hw4/report/iidef.sty
Normal file
@@ -0,0 +1,153 @@
|
||||
%%
|
||||
%% This is file `iidef.sty',
|
||||
%% generated with the docstrip utility.
|
||||
%%
|
||||
%% The original source files were:
|
||||
%%
|
||||
%% thucoursework.dtx (with options: `sty')
|
||||
%%
|
||||
%% This is a generated file.
|
||||
%%
|
||||
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
|
||||
%%
|
||||
%% This work may be distributed and/or modified under the
|
||||
%% conditions of the LaTeX Project Public License, either version 1.3
|
||||
%% of this license or (at your option) any later version.
|
||||
%% The latest version of this license is in
|
||||
%% http://www.latex-project.org/lppl.txt
|
||||
%% and version 1.3 or later is part of all distributions of LaTeX
|
||||
%% version 2005/12/01 or later.
|
||||
%%
|
||||
%% To produce the documentation run the original source files ending with `.dtx'
|
||||
%% through LaTeX.
|
||||
%%
|
||||
|
||||
\NeedsTeXFormat{LaTeX2e}[1999/12/01]
|
||||
\ProvidesClass{iidef}
|
||||
[2020/09/09 2.6 Tsinghua University Coursework Template]
|
||||
%% configuration of nested enumerate env
|
||||
\RequirePackage{enumitem}
|
||||
%% set hwcount key-value option
|
||||
\RequirePackage{kvoptions}
|
||||
%% required by macro DeclareMathOperator
|
||||
\RequirePackage{amsmath}
|
||||
%% Set up page headers using with fancyhdr
|
||||
\@ifundefined{lhead}{\RequirePackage{fancyhdr}}
|
||||
{\def\@thulhead{thulhead}}
|
||||
\RequirePackage{amsthm}
|
||||
%% semester
|
||||
\def\@term{term}
|
||||
\newcommand{\theterm}[1]{\renewcommand\@term{#1}}
|
||||
%% institute
|
||||
\newcommand{\@courseinstitute}[1]{institute}
|
||||
\newcommand{\thecourseinstitute}[1]{\renewcommand\@courseinstitute{#1}}
|
||||
%% coursename
|
||||
\newcommand{\@coursename}[1]{coursename}
|
||||
\newcommand{\thecoursename}[1]{\renewcommand\@coursename{\textsc{#1}}}
|
||||
%% user can rewrite homework name
|
||||
\def\@hwname{Homework}
|
||||
\def\hwname#1{\renewcommand\@hwname{#1}}
|
||||
%% \iidef@thehwcnt = 1
|
||||
\DeclareStringOption[1]{thehwcnt}
|
||||
\ProcessKeyvalOptions*
|
||||
\def\thehwcnt{\iidef@thehwcnt}
|
||||
%% page header setup, distinguish between first page(plain style)
|
||||
%% and second page on (runningpage style)
|
||||
%%***************************************************************************
|
||||
\newcommand{\courseheader}{
|
||||
\thispagestyle{plain}%first page use native plain style to suppress header
|
||||
\vspace*{-1in}
|
||||
\begin{center}
|
||||
\@courseinstitute\\
|
||||
\@coursename\\
|
||||
\@term
|
||||
\vspace*{0.1in}
|
||||
\hrule
|
||||
\end{center}
|
||||
\begin{center}
|
||||
\underline{\bf \@hwname\;\thehwcnt} \\
|
||||
\end{center}
|
||||
}
|
||||
\@ifundefined{@thulhead}{
|
||||
\fancypagestyle{runningpage}
|
||||
{
|
||||
\fancyhead[L]{\small\@coursename}
|
||||
\fancyhead[R]{\small\@courseinstitute}
|
||||
}
|
||||
%% use runningpage style from second page on
|
||||
\pagestyle{runningpage}
|
||||
}{}
|
||||
%% *********************************************************************************************
|
||||
%%name command macro
|
||||
%%*************************
|
||||
\newcommand{\name}[1]{
|
||||
\begin{flushleft}
|
||||
#1\hfill
|
||||
\today
|
||||
\end{flushleft}
|
||||
\hrule
|
||||
|
||||
\vspace{2em}
|
||||
|
||||
\flushleft
|
||||
}
|
||||
%%*************************
|
||||
%% enumitem related configuration
|
||||
\setlist[enumerate,1]{label=\thehwcnt.\arabic*.}
|
||||
\setlist[enumerate,2]{label=(\alph*)}
|
||||
\setlist[enumerate,3]{label=\roman*.}
|
||||
\setlist[enumerate,4]{label=\greek*}
|
||||
%%******************************
|
||||
\def\@slname{Solution}
|
||||
\def\slname#1{\renewcommand\@slname{#1}}
|
||||
|
||||
\@ifundefined{solution}{
|
||||
\newenvironment{solution}
|
||||
{
|
||||
\proof[\@slname]
|
||||
}
|
||||
{
|
||||
%% no qed symbol in solution env
|
||||
\renewcommand{\qedsymbol}{}
|
||||
\endproof
|
||||
}
|
||||
}{}
|
||||
%%******************************
|
||||
%%common math symbols go here
|
||||
%%*************************************************
|
||||
\def\v#1{\underline{#1}}
|
||||
\newcommand{\uc}{\underline{c}} % c, vec
|
||||
\newcommand{\uv}{\underline{v}} % x, vec
|
||||
\newcommand{\uw}{\underline{w}} % w, vec
|
||||
\newcommand{\ux}{\underline{x}} % x, vec
|
||||
\newcommand{\uy}{\underline{y}} % y, vec
|
||||
\newcommand{\uz}{\underline{z}} % z, vec
|
||||
\newcommand{\um}{\underline{m}} % m, vec
|
||||
\newcommand{\rvx}{\mathsf{x}} % x, r.v.
|
||||
\newcommand{\rvy}{\mathsf{y}} % y, r.v.
|
||||
\newcommand{\rvz}{\mathsf{z}} % z, r.v.
|
||||
\newcommand{\rvw}{\mathsf{w}} % w, r.v.
|
||||
\newcommand{\rvH}{\mathsf{H}} % H, r.v.
|
||||
\newcommand{\urvx}{\underline{\mathsf{x}}} % x, r.v. vec
|
||||
\newcommand{\urvy}{\underline{\mathsf{y}}} % y, r.v. vec
|
||||
\newcommand{\urvz}{\underline{\mathsf{z}}} % z, r.v. vec
|
||||
\newcommand{\urvw}{\underline{\mathsf{w}}} % w, r.v. vec
|
||||
|
||||
\newcommand{\defas}{\triangleq} %\coloneqq
|
||||
\newcommand{\reals}{\mathbb{R}}
|
||||
\newcommand{\TT}{\mathrm{T}} % transpose
|
||||
\DeclareMathOperator*{\argmax}{arg\,max}
|
||||
\DeclareMathOperator*{\argmin}{arg\,min}
|
||||
\DeclareMathOperator*{\argsup}{arg\,sup}
|
||||
\DeclareMathOperator*{\arginf}{arg\,inf}
|
||||
\DeclareMathOperator{\diag}{diag}
|
||||
\DeclareMathOperator{\Var}{Var}
|
||||
\DeclareMathOperator{\Cov}{Cov}
|
||||
\DeclareMathOperator{\MSE}{MSE}
|
||||
\DeclareMathOperator{\1}{\mathds{1}}
|
||||
\DeclareMathOperator{\In}{\mathbb{I}}
|
||||
\DeclareMathOperator{\E}{\mathbb{E}}
|
||||
\DeclareMathOperator{\Prob}{\mathbb{P}}
|
||||
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
|
||||
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
|
||||
%%************************************************************************************
|
||||
BIN
hw4/report/img/20240526_155701910_iOS.png
Normal file
|
After Width: | Height: | Size: 186 KiB |
BIN
hw4/report/img/attention_vis.png
Normal file
|
After Width: | Height: | Size: 24 KiB |
49
hw4/report/img/default_sample.txt
Normal file
@@ -0,0 +1,49 @@
|
||||
sample from workdirs/quansongci/best.pth
|
||||
+++水调歌头
|
||||
黄花满疏雨,月扫三宫。月明月明人去,绿绵声里,风光残霞。屈指两小天天静,绿满阶外,更相逢。那处得何曾小,泪断肠头。
|
||||
|
||||
---------------
|
||||
+++浣溪沙(五清)
|
||||
翠雾玉奁烘蝉。轻姿未放花光。青袍有客已暮花。
|
||||
天人未遇向西楼。小阳春水一线清。玉壶重重重。
|
||||
|
||||
---------------
|
||||
+++菩萨蛮(梅)
|
||||
江南窗前月远中花。水高远。暗还花色碧。只恨欢事清。
|
||||
楼上宴琼线。更欢归消息。柳边女碧云。便是天涯时。
|
||||
|
||||
---------------
|
||||
+++菩萨蛮
|
||||
江上秋移香无度。凉风闹愁风。莺声瘦了归时未。小楼闲愁忆。
|
||||
豆蔻风前好因缘。送通住。试问三山同。人间无处难。
|
||||
|
||||
---------------
|
||||
+++秦楼月
|
||||
练雨梳妆。桃叶半枝,冰肌红子春寒。半枝都奈。吹香飞絮,记清凉。
|
||||
无限夜云春风护。玉阑无数转。碎帽孤情君,小海东风。
|
||||
|
||||
---------------
|
||||
+++浪淘沙
|
||||
橘上园阳关路早。绿钗风雨散,犹被东湖见楼。
|
||||
仿佛风前坡上去日,月如流。想取东南风。犹慵尘尽比重归。
|
||||
|
||||
---------------
|
||||
+++诉衷情(高人)
|
||||
时候又来深。长是红帘前。醉眼风入春期。
|
||||
应是时时,何处在、应厮续。
|
||||
|
||||
---------------
|
||||
+++浣溪沙(咏梅)
|
||||
离斟客太白犹如。不知常是西篱中。岂怜旧君些儿以言。
|
||||
素娥小山小曲,水朝元有长安。一榻了共取大家。
|
||||
|
||||
---------------
|
||||
+++浣溪沙(和怀)
|
||||
纵图清露歌黛倚,寒题金銮声珊瑚。十年人来懒舞丝。
|
||||
|
||||
---------------
|
||||
+++满江月
|
||||
风月不如旧,柔条欲到春风。掩花间心,道处难臾、相逢。
|
||||
陇头情不物里,阿谁向娇几。且看东词,还明红云与,一笑认教梳灯。
|
||||
|
||||
---------------
|
||||
49
hw4/report/img/no_pos_sample.txt
Normal file
@@ -0,0 +1,49 @@
|
||||
sample from workdirs/quansongci_no_pos/best.pth
|
||||
++++++++菩萨蛮(牡丹月近)
|
||||
江月明月明月桂华开客。金交风枝残月到东风前。天色浸柳前风垂杨花更觉。坐角雪初开花小屏。断断头春风光薄。
|
||||
春色悄。隔帘前阴转香千里。好破云深岸波波。不恨相思量。羞酌炉香何处。
|
||||
|
||||
---------------
|
||||
++++浣溪沙
|
||||
清歌灯未无限。佳期时更传人不醉里,可奈有芳菲节懒。
|
||||
双蛾罗带向西楼。小小槛春寒人都怨,燕子未销眉花。
|
||||
|
||||
---------------
|
||||
++++++++++++++++++++临江仙歌香花天
|
||||
九月桃源长风留春风投宴琼桃李仙。一曾东风迟丽女
|
||||
放萧词传天稼时常相逢,还记,酒,占春寒花间风光相住,月劝花往事,占春留思,应春风到上,无人间一线秀船归来,点面皱。□□□□□□□□□□□□。都为谁老还来
|
||||
---------------
|
||||
++++鹧鸪天(十二之二)
|
||||
此见元是一声砧。紫鹤收残梳匀舞、谁家。正是平樵春发,忍因缘凝理通。
|
||||
试语三岛不下,松径何处。问清将春愁易全窟,且识斗重阳。
|
||||
|
||||
---------------
|
||||
++++浣溪沙(赋木犀)
|
||||
芙蓉水浮冰雪梅子。东风半枝都奈粉吹。飞落蕊满清凉。
|
||||
枝开夜忽春风护,玉阑凉痕转新碎香。有君恩多少载酒,且道有春风流。
|
||||
|
||||
---------------
|
||||
+++++++++++++++++++++++++++++++++++++++++++++南歌头香慵尘中柳梢青玉案(西江仙香花宫春令(与梅子
|
||||
绿碧梧桐梢落后西浣天云隐越山外、宿舟断乱,秋风露满庭芳菲节难过,紫。绿门好,十分飞燕子
|
||||
红,秋寒庭楼小西西风,春暮
|
||||
---------------
|
||||
++++++鹧鸪天(和坡衮侑觞)
|
||||
薰风须见前衢醉急风入鼎、花生绝团。不问何人公身口厮续厌
|
||||
春色肃熟燕子,无限是道行气东风吹。看雨起梦三年。想余春事断自愁厌君。
|
||||
|
||||
---------------
|
||||
++++菩萨蛮(用时春)
|
||||
竹花梅犹道何人时节。西篱上花前红。吹落帽风光深。素娥小金。
|
||||
暮水朝秋寒。玉堂下梅花共取。小窗堂几举。从教著梅和雨。
|
||||
|
||||
---------------
|
||||
++++++++++最仙歌子(和尉生查子题)
|
||||
绿阴山淡黄未泛湘神神仙,美酒,长唱玉纤纤纤手。元何穷何处重约,清寒食、酒家流光光渐、寄新春花晓,小院映烟微香,正是十年瑶楼酒,水暖花枝枝黄昏昏不语,乍见月寂寞痴愠痕、落醉,看花梢啼红裳篆拂堕风流。
|
||||
东风吹泪过,
|
||||
---------------
|
||||
+++++++++++++点绛唇头春事近
|
||||
花艳心头道酒前春风雨,欲春惨,春去,深自有极目娇几粉,看春词,还爱红云归,绿杨花,旧谢去年时节节,十分真时及华明月。
|
||||
醉眼底莺声中秋光幸有豆皇子
|
||||
杏花开后黄梅梢仙子,且占客里春风吹乱。
|
||||
细雨过春风轻椒香闺催春,小离
|
||||
---------------
|
||||
BIN
hw4/report/img/no_pos_train.png
Normal file
|
After Width: | Height: | Size: 71 KiB |
56
hw4/report/img/no_res_sample.txt
Normal file
@@ -0,0 +1,56 @@
|
||||
sample from workdirs/quansongci_no_res/best.pth
|
||||
+++藕上空都未。消
|
||||
---------------
|
||||
+++。水。香,清干灯翠无月。佳
|
||||
---------------
|
||||
+++烟
|
||||
莫。。一
|
||||
真。,。,手)+(。当,。,还花。
|
||||
。。饱)花清生失楼犹。拂念。。。
|
||||
+东+柳人。碧放萧似天天饮时
|
||||
---------------
|
||||
+++,一+
|
||||
楼。。移。无度此
|
||||
,+路风砧东
|
||||
---------------
|
||||
+++,。常明香天。早。+。色。,大,梅子春上妆半枝。奈。吹。飞、,歌。阑故溪枝开夜忽春花。情,重凉痕转。碎沙相,君有园海。奈。
|
||||
。会
|
||||
---------------
|
||||
+++。。晓宫。。园。+二盈
|
||||
|
||||
钗。+。,恁尾。
|
||||
见楼风
|
||||
寿到+。尽+。日。。
|
||||
---------------
|
||||
+++。看。月。
|
||||
(
|
||||
时衮红。自。意
|
||||
须去前。醉急风入鼎人花
|
||||
。团时。丹翁怨在身云厮。厌
|
||||
秋海花拟燕
|
||||
,无共宿道行气东。,鸾+雨。梦,
|
||||
。。余采
|
||||
---------------
|
||||
++++俊去莺浮
|
||||
时重。+功太。犹。头(人一溪+者。斋算。旧
|
||||
---------------
|
||||
+++,人花长和寞。。纵图清孔歌幽
|
||||
---------------
|
||||
+++髻
|
||||
。+风与不,干
|
||||
柔
|
||||
。头余说。花
|
||||
。心头道。前,枕相
|
||||
。
|
||||
忘,情+物。自水极初。几晶
|
||||
看。词光。明红主与,。。认,旧。去
|
||||
户萨尽玉罢
|
||||
不时家。亭,行翠厚情青
|
||||
+中思难梦。底南星
|
||||
。自马
|
||||
黄
|
||||
我来
|
||||
,中+。花
|
||||
禁,,也
|
||||
。花、。风儿。堂莺催旧,+离
|
||||
---------------
|
||||
BIN
hw4/report/img/no_res_train.png
Normal file
|
After Width: | Height: | Size: 75 KiB |
51
hw4/report/img/specific_start_sample.txt
Normal file
@@ -0,0 +1,51 @@
|
||||
sample from workdirs/quansongci/best.pth
|
||||
+++清平乐(上赋)
|
||||
黄花小。相逢去。三得东风何处。人去去年年。谁与他年道。
|
||||
屈指两小天。留连心事。最思无意悠悠。无得何曾宽。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
京梅晚。几日一清声恶。无限作佳穷时。直见横户快愁儿。
|
||||
客已暮云梦,天人未老。心事有天涯无数。人都不须关,只是秋千千里。
|
||||
|
||||
---------------
|
||||
+++清平乐(春)
|
||||
红雪动。莫遣梅花开了。不解闲句中花妍。当时未问还近。
|
||||
一枝上晚妆清明。帘犹有清香样。欢事消息意迟。东郊飞后便好天。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
银烛斜阳。斜雨初飞。日日楼前草移。无限此情休住。
|
||||
小莺欲瘦收残梳。更有谁闲愁。却入豆蔻风前。因缘凝理通。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
江上秋波。一声歌舞。烟雨里常明烟雨。早来不见人归,犹唤梅子春去。
|
||||
好都奈。吹回飞飞来。清凉不知无限夜,春风护雨晚梁归。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
春光西去。桂花清扇。天上一声伤春晓。却被园花不尽早。
|
||||
钗边绿阴阴犹好。无计不知否。到少离愁去。谁知何处魂。
|
||||
|
||||
---------------
|
||||
+++清平乐(即回)
|
||||
六钱地遍。楼前作花间。春暮云愁。月高斜阳远。困红衣自醉。
|
||||
好去前时醉,风入泥袖。挼黄团时时问。怨在月明千片春水。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
晓来争觉。碧云花向楼。我似秋光也。花来日明月边莺怨。
|
||||
春不语飞花知。玉浆不枉劳和困。坐中岂共旧。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
残花晚。清闲鬓欲开。金盏一多时。菊花无计绪。娇花开花长。
|
||||
谁把酒醒清声。幽心到寒题酒。一片香淡得春人。懒捻黄金眉。
|
||||
|
||||
---------------
|
||||
+++清平乐(月明月)
|
||||
醉来人在。春知何时到花时。似来东风识,时时倍度。
|
||||
风月不识旧时春宵。万中说枉似、真心头道。前意追相逢。
|
||||
|
||||
---------------
|
||||
BIN
hw4/report/img/train.png
Normal file
|
After Width: | Height: | Size: 72 KiB |
187
hw4/report/main.tex
Normal file
@@ -0,0 +1,187 @@
|
||||
% Homework template for Inference and Information
|
||||
% UPDATE: September 26, 2017 by Xiangxiang
|
||||
\documentclass[a4paper]{article}
|
||||
\usepackage{ctex}
|
||||
\usepackage{amsmath, amssymb, amsthm}
|
||||
\usepackage{moreenum}
|
||||
\usepackage{mathtools}
|
||||
\usepackage{url}
|
||||
\usepackage{bm}
|
||||
\usepackage{enumitem}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{listings}
|
||||
\usepackage{color}
|
||||
\usepackage{float}
|
||||
|
||||
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
|
||||
\newfontfamily\cascadia{Cascadia Code}
|
||||
|
||||
\lstset{
|
||||
basicstyle = \small\codefont,
|
||||
% ---
|
||||
tabsize = 4,
|
||||
showstringspaces = false,
|
||||
numbers = left,
|
||||
numberstyle = \codefont,
|
||||
% ---
|
||||
breaklines = true,
|
||||
captionpos = t,
|
||||
% ---
|
||||
frame = l,
|
||||
flexiblecolumns,
|
||||
}
|
||||
|
||||
\lstdefinestyle{Python}{
|
||||
language = Python, % 语言选Python
|
||||
keywordstyle = \color{blue},
|
||||
keywordstyle = [2] \color{teal},
|
||||
stringstyle = \color{orange!80!black},
|
||||
commentstyle = \color{red},
|
||||
identifierstyle = \color{blue!80!white},
|
||||
}
|
||||
|
||||
\lstdefinestyle{Bash}{
|
||||
language = bash
|
||||
}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{booktabs} % toprule
|
||||
\usepackage[mathcal]{eucal}
|
||||
\usepackage[thehwcnt = 4]{iidef}
|
||||
|
||||
\thecourseinstitute{清华大学电子工程系}
|
||||
\thecoursename{\textbf{媒体与认知}}
|
||||
\theterm{2023-2024学年春季学期}
|
||||
\hwname{作业}
|
||||
\begin{document}
|
||||
\courseheader
|
||||
\name{高艺轩}
|
||||
\vspace{3mm}
|
||||
\centerline{\textbf{\Large{理论部分}}}
|
||||
|
||||
\section{单选题(15分)}
|
||||
\subsection{\underline{D}}
|
||||
|
||||
\subsection{\underline{A}}
|
||||
|
||||
\subsection{\underline{A}}
|
||||
|
||||
\subsection{\underline{C}}
|
||||
|
||||
\subsection{\underline{B}}
|
||||
|
||||
\section{计算题(15 分)}
|
||||
% 计算题1
|
||||
\subsection{隐含马尔可夫模型}
|
||||
|
||||
\hspace{2em}暑假中,小E每天进行一项体育活动,包括跑步(R)、游泳(S)和打球(B),所选择的体育活动受某种潜在因素(如心情)的影响。小E每天把进行体育活动的照片发至微信朋友圈,我们可以根据观测信息推测该潜在因素的状态。
|
||||
|
||||
\hspace{2em}假设该潜在因素分为$S_1$和$S_2$两种状态。在$S_1$时,小E选择三种体育活动的概率分别为0.6,0.2,0.2;在$S_2$时,小E选择三种体育活动的概率分别为0.1,0.6,0.3。
|
||||
|
||||
\hspace{2em}该潜在因素的变化也有一定规律,若某天处于$S_1$的状态,第二天处于$S_1$和$S_2$的状态的概率分别为0.5,0.5;若某天处于$S_2$的状态,第二天处于$S_1$和$S_2$的状态的概率分别为0.6,0.4。
|
||||
|
||||
\hspace{2em}暑假第一天处于$S_1$和$S_2$的状态的概率均为0.5。
|
||||
|
||||
\vspace{3mm}
|
||||
(1) 采用隐含马尔可夫模型(HMM)对小E暑假体育活动安排进行建模,{\color{blue}请写出HMM对应的参数$\lambda=\{\pi, A, B\}$}。
|
||||
|
||||
\begin{proof}[解]
|
||||
\[\pi = \begin{bmatrix}
|
||||
0.5\\0.5
|
||||
\end{bmatrix}\]
|
||||
\[A = \begin{bmatrix}
|
||||
0.5 & 0.5\\
|
||||
0.6 & 0.4\\
|
||||
\end{bmatrix}\]
|
||||
\[B = \begin{bmatrix}
|
||||
0.6 & 0.2 & 0.2\\
|
||||
0.1 & 0.6 & 0.3
|
||||
\end{bmatrix}\]
|
||||
\end{proof}
|
||||
|
||||
\vspace{3mm}
|
||||
(2) 假设暑假第1、2、3天小E所进行的体育活动依次为跑步(R)、打球(B)和游泳(S),{\color{blue}请计算出现该观测序列的概率}。
|
||||
|
||||
\begin{proof}[解]
|
||||
\begin{align*}
|
||||
\alpha_1(S_1) & = 0.5 \times 0.6 = 0.3\\
|
||||
\alpha_1(S_2) & = 0.5 \times 0.1 = 0.05\\
|
||||
\alpha_2(S_1) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.6) \times 0.2\\
|
||||
& = 0.036\\
|
||||
\alpha_2(S_2) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.4) \times 0.3\\
|
||||
& = 0.051\\
|
||||
\alpha_3(S_1) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.6) \times 0.2\\
|
||||
& = 0.00972\\
|
||||
\alpha_3(S_2) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.4) \times 0.6\\
|
||||
& = 0.02304\\
|
||||
P(O \mid \lambda) & = \alpha_3(S_1) + \alpha_3(S_2) = 0.03276
|
||||
\end{align*}
|
||||
\end{proof}
|
||||
|
||||
\vspace{3mm}
|
||||
(3) 在(2)的条件下,{\color{blue}请利用Viterbi算法推测暑假第1、2、3天最可能的隐含状态序列}。
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/20240526_155701910_iOS.png}
|
||||
\end{figure}
|
||||
|
||||
|
||||
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
|
||||
\section{编程作业报告}
|
||||
\subsection{模型的训练与测试}
|
||||
首先进行数据预处理。预处理后进行模型训练,训练的结果见图\ref{fig:default_train}。
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/train.png}
|
||||
\caption{默认测试}
|
||||
\label{fig:default_train}
|
||||
\end{figure}
|
||||
|
||||
默认配置的生成样本:
|
||||
\begin{lstlisting}
|
||||
python sample.py --ckpt_path workdirs/quansongci
|
||||
\end{lstlisting}
|
||||
得到的输出为
|
||||
\lstinputlisting{img/default_sample.txt}
|
||||
若指定初始文本:
|
||||
\begin{lstlisting}
|
||||
python sample.py --ckpt_path workdirs/quansongci --start +++清平乐
|
||||
\end{lstlisting}
|
||||
得到的输出为
|
||||
\lstinputlisting{img/specific_start_sample.txt}
|
||||
|
||||
\subsection{探究位置编码和残差连接在模型中的作用}
|
||||
关闭位置编码的训练:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/no_pos_train.png}
|
||||
\end{figure}
|
||||
得到的生成结果:
|
||||
\lstinputlisting{img/no_pos_sample.txt}
|
||||
可以看到,模型没有很好地理解句子长度之间的关系。
|
||||
|
||||
关闭残差连接的训练:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/no_res_train.png}
|
||||
\end{figure}
|
||||
得到的生成结果:
|
||||
\lstinputlisting{img/no_res_sample.txt}
|
||||
模型训练遇到了梯度消失的问题,很难有效地训练。
|
||||
|
||||
\subsection{可视化}
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=.8\linewidth]{img/attention_vis.png}
|
||||
\end{figure}
|
||||
|
||||
许多词语的注意力系数都集中在题目的几个字上,可以看出模型主要关注的是词牌名与内容之间的相关性。
|
||||
|
||||
\end{document}
|
||||
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: t
|
||||
%%% End:
|
||||
123
testtorch.ipynb
@@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -10,7 +10,9 @@
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"\n",
|
||||
"import torchvision.transforms as transforms"
|
||||
"import torchvision.transforms as transforms\n",
|
||||
"\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -152,6 +154,123 @@
|
||||
"print(conv_1(a).size())\n",
|
||||
"print(conv_2(conv_1(a)).size())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([0., 1.])\n",
|
||||
"1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = torch.Tensor([1.0, 2.0])\n",
|
||||
"b = torch.Tensor([1.0, 1.0])\n",
|
||||
"print((a > b).type_as(a))\n",
|
||||
"print((a == b).sum().item())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor(2.5000)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = torch.Tensor([[1.0, 2.0], [3.0, 4.0]])\n",
|
||||
"mu = a.mean(dim=0)\n",
|
||||
"print(mu, a - mu)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([[5.],\n",
|
||||
" [4.]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = torch.Tensor([[5], [4]])\n",
|
||||
"b = torch.Tensor([1])\n",
|
||||
"print((a.T * b).T)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([[False, True, True, True, True],\n",
|
||||
" [False, False, True, True, True],\n",
|
||||
" [False, False, False, True, True],\n",
|
||||
" [False, False, False, False, True],\n",
|
||||
" [False, False, False, False, False]])\n",
|
||||
"tensor([[-0.1170, 0.6130, 0.9644, -1.2733, -0.9671],\n",
|
||||
" [-0.7806, 0.5082, -0.2731, 0.1660, -0.5451],\n",
|
||||
" [-2.1527, -0.5059, -0.0079, -0.5796, -1.1107],\n",
|
||||
" [-1.8357, -0.8010, -0.0424, 0.1491, -1.5009],\n",
|
||||
" [-1.3666, -0.8209, 0.0483, -1.3165, -0.9222]])\n",
|
||||
"tensor([[-0.1170, -inf, -inf, -inf, -inf],\n",
|
||||
" [-0.7806, 0.5082, -inf, -inf, -inf],\n",
|
||||
" [-2.1527, -0.5059, -0.0079, -inf, -inf],\n",
|
||||
" [-1.8357, -0.8010, -0.0424, 0.1491, -inf],\n",
|
||||
" [-1.3666, -0.8209, 0.0483, -1.3165, -0.9222]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"mask = torch.triu(torch.ones(5, 5), diagonal=1).bool()\n",
|
||||
"print(mask)\n",
|
||||
"attn = torch.randn(5, 5)\n",
|
||||
"print(attn)\n",
|
||||
"print(attn.masked_fill(mask, -np.inf))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([0.1402, 0.2312, 0.6285])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"Q = torch.Tensor([1, 0, 1, 1])\n",
|
||||
"K = torch.Tensor([[0, 0, 0, 2],\n",
|
||||
" [2, 0, 1, 0],\n",
|
||||
" [2, 1, 2, 1]])\n",
|
||||
"\n",
|
||||
"print(torch.softmax((Q @ K.T) / 2, dim=0))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||