Homework4 Submit.

2024-05-27 00:01:48 +08:00
parent c6b2420b85
commit 76a643ebc4
19 changed files with 35031 additions and 76 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,5 @@ __pycache__/
 hw2/code/checkpoints/
 hw2/code/visualized/
 hw3/code/data/
-hw3/code/checkpoints/
+hw3/code/checkpoints/
+hw4/code/workdirs/
--- a/hw4/code/attnvis.ipynb
+++ b/hw4/code/attnvis.ipynb
--- a/hw4/code/data/quansongci/train.json
+++ b/hw4/code/data/quansongci/train.json
--- a/hw4/code/data/quansongci/val.json
+++ b/hw4/code/data/quansongci/val.json
--- a/hw4/code/model.py
+++ b/hw4/code/model.py
@@ -2,8 +2,8 @@
 #             Media and Cognition
 #             Homework 4  Sequence Modeling
 #             model.py - Model definition
-#             Student ID:
-#             Name:
+#             Student ID: 2022010639
+#             Name: Yixuan Gao
 #             Tsinghua University
 #             (C) Copyright 2024
 # ========================================================
@@ -60,54 +60,54 @@ class SelfAttention(nn.Module):
        # Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
        # the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
        # num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
-        q = ???
-        k = ???
-        v = ???
+        q = self.q_layer(x)
+        k = self.k_layer(x)
+        v = self.v_layer(x)

        # Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
        # first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
-        q = ???
-        k = ???
-        v = ???
+        q = q.reshape(batch_size, seq_len, self.num_head, self.head_dim)
+        k = k.reshape(batch_size, seq_len, self.num_head, self.head_dim)
+        v = v.reshape(batch_size, seq_len, self.num_head, self.head_dim)

        # then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dim of q, k, v for matrix multiplication
        # the shape of q, k, v from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
-        q = ???
-        k = ???
-        v = ???
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)

        # Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
        # Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
        # the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
-        attn = ???
+        attn = (q @ k.transpose(2, 3)) / torch.sqrt(torch.tensor(self.head_dim))
        
        # Step 1.3.2: for Auto-Regressive Language Model, the predictions for position i can depend only on the input at positions less than i.
        # Therefore, a mask is used to prevent positions from attending to subsequent positions
        # attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is an upper triangular matrix (zeros on the main diagonal) with shape (seq_len, seq_len)
        # Hint:
        # use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
-        attn_mask = ???
+        attn_mask = torch.ones(seq_len, seq_len, device=attn.device)
        # use torch.triu(?, diagonal=1) to obtain a upper triangular matrix where elements on the main diagonal are 0
-        attn_mask = ???
+        attn_mask = torch.triu(attn_mask, diagonal=1)
        # use Tensor.bool() to convert the matrix to a boolean matrix
-        attn_mask = ???
+        attn_mask = attn_mask.bool()
        # fill the position of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
-        attn = ???
+        attn = attn.masked_fill(attn_mask, -np.inf)

        # Step 1.3.3: normalize `attn` via softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
-        attn = ???
+        attn = torch.softmax(attn, dim=3)
        # Step 1.3.4: apply dropout to `attn` via self.attn_drop()
-        attn = ???
+        attn = self.attn_drop(attn)
        # Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
        # the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
-        out = ???
+        out = attn @ v

        # Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate output of different heads
        # the shape of `out` from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
-        out = ???
+        out = out.transpose(1, 2).reshape(batch_size, seq_len, self.num_head * self.head_dim)

        # Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
-        result = ??? 
+        result = self.proj_drop(self.proj_layer(out))
        # <<< TODO 1

        # return the final results `result` and attention weights `attn`
@@ -147,24 +147,24 @@ class TransformerLayer(nn.Module):
        # >>> TODO 2: complete the forward process of the TransformerLayer module.
        # Step 2.1: calculate the output of multi-head self-attention
        # normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
-        x_norm = ???
+        x_norm = self.norm1(x)

        # calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
-        x_attn, attn = ???
+        x_attn, attn = self.attn(x_norm)

        # add the input 'x' to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
-        if ???:
-            x_attn = ???
+        if not self.no_res:
+            x_attn = x_attn + x

        # Step 2.2: calculate the output of feed forward network
        # calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
-        x_ffn = ???
+        x_ffn = self.ffn(self.norm2(x_attn))

        # add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
-        if ???:
-            out = ???
+        if not self.no_res:
+            out = x_attn + x_ffn
        else:
-            out = ???
+            out = x_ffn
        # <<< TODO 2
        
        return out, attn
@@ -230,36 +230,36 @@ class GPT(nn.Module):

        # >>> TODO 3: complete the forward process of GPT
        # Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1] 
-        pos = ???
+        pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)

        # Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to convert `word_idx` and `pos` into embeddings (`token_embed` and `pos_embed`)
-        token_embed = ???
-        pos_embed = ???
+        token_embed = self.word_token_embedding(word_idx)
+        pos_embed = self.word_pos_embedding(pos)

        # Step 3.3: initialize the input embeddings `x` of transformer layers
        # add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
-        if ???:
-            x = ???
+        if not self.no_pos:
+            x = token_embed + pos_embed
        else:
-            x = ???
+            x = token_embed

        # apply dropout to the input embeddings via `self.drop()`
-        x = ???
+        x = self.drop(x)

        # Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
        # define a list `attention_weights` and append the attention weights of each transformer layer into the list
-        attention_weights = ??? 
-        for ???:
+        attention_weights = list()
+        for i in range(self.num_layer):
            # Step 3.4.1: obtain the output and attention weights of transformer layers
-            x, attn = ???
+            x, attn = self.transformer[i](x)
            # Step 3.4.2: append the attention weights of transformer layers into the list `attention_weights`
-            ???
+            attention_weights.append(attn)
     
        # Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
        # self.language_model_head() is a linear layer defined in __init__() function
        # Note: do not add softmax here since it is included in the cross entropy loss function
-        x = ???
-        logits = ???
+        x = self.norm(x)
+        logits = self.language_model_head(x)
        # <<< TODO 3

        # return logits and loss or attention weights
--- a/hw4/code/sample.py
+++ b/hw4/code/sample.py
@@ -17,7 +17,7 @@ def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root,
    # model
    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
-    ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
+    ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_path)
--- a/hw4/code/train.py
+++ b/hw4/code/train.py
@@ -45,7 +45,7 @@ def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, d
    
    dtype = 'bfloat16' if device == 'cpu' else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
-    ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
+    ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
    #ctx = torch.autocast(device_type=device, dtype=ptdtype)
    best_val_loss = 1e9
    iter_num = 0 # number of iterations in the lifetime of this process
--- a/hw4/report/img/20240526_155701910_iOS.png
+++ b/hw4/report/img/20240526_155701910_iOS.png
--- a/hw4/report/img/attention_vis.png
+++ b/hw4/report/img/attention_vis.png
--- a/hw4/report/img/default_sample.txt
+++ b/hw4/report/img/default_sample.txt
@@ -0,0 +1,49 @@
+sample from workdirs/quansongci/best.pth
+++水调歌头
+黄花满疏雨，月扫三宫。月明月明人去，绿绵声里，风光残霞。屈指两小天天静，绿满阶外，更相逢。那处得何曾小，泪断肠头。
+
+---------------
+++浣溪沙（五清）
+翠雾玉奁烘蝉。轻姿未放花光。青袍有客已暮花。
+天人未遇向西楼。小阳春水一线清。玉壶重重重。
+
+---------------
+++菩萨蛮（梅）
+江南窗前月远中花。水高远。暗还花色碧。只恨欢事清。
+楼上宴琼线。更欢归消息。柳边女碧云。便是天涯时。
+
+---------------
+++菩萨蛮
+江上秋移香无度。凉风闹愁风。莺声瘦了归时未。小楼闲愁忆。
+豆蔻风前好因缘。送通住。试问三山同。人间无处难。
+
+---------------
+++秦楼月
+练雨梳妆。桃叶半枝，冰肌红子春寒。半枝都奈。吹香飞絮，记清凉。
+无限夜云春风护。玉阑无数转。碎帽孤情君，小海东风。
+
+---------------
+++浪淘沙
+橘上园阳关路早。绿钗风雨散，犹被东湖见楼。
+仿佛风前坡上去日，月如流。想取东南风。犹慵尘尽比重归。
+
+---------------
+++诉衷情（高人）
+时候又来深。长是红帘前。醉眼风入春期。
+应是时时，何处在、应厮续。
+
+---------------
+++浣溪沙（咏梅）
+离斟客太白犹如。不知常是西篱中。岂怜旧君些儿以言。
+素娥小山小曲，水朝元有长安。一榻了共取大家。
+
+---------------
+++浣溪沙（和怀）
+纵图清露歌黛倚，寒题金銮声珊瑚。十年人来懒舞丝。
+
+---------------
+++满江月
+风月不如旧，柔条欲到春风。掩花间心，道处难臾、相逢。
+陇头情不物里，阿谁向娇几。且看东词，还明红云与，一笑认教梳灯。
+
+---------------
--- a/hw4/report/img/no_pos_sample.txt
+++ b/hw4/report/img/no_pos_sample.txt
@@ -0,0 +1,49 @@
+sample from workdirs/quansongci_no_pos/best.pth
++++++++菩萨蛮（牡丹月近）
+江月明月明月桂华开客。金交风枝残月到东风前。天色浸柳前风垂杨花更觉。坐角雪初开花小屏。断断头春风光薄。
+春色悄。隔帘前阴转香千里。好破云深岸波波。不恨相思量。羞酌炉香何处。
+
+---------------
++++浣溪沙
+清歌灯未无限。佳期时更传人不醉里，可奈有芳菲节懒。
+双蛾罗带向西楼。小小槛春寒人都怨，燕子未销眉花。
+
+---------------
++++++++++++++++++++临江仙歌香花天
+九月桃源长风留春风投宴琼桃李仙。一曾东风迟丽女
+放萧词传天稼时常相逢，还记，酒，占春寒花间风光相住，月劝花往事，占春留思，应春风到上，无人间一线秀船归来，点面皱。□□□□□□□□□□□□。都为谁老还来
+---------------
++++鹧鸪天（十二之二）
+此见元是一声砧。紫鹤收残梳匀舞、谁家。正是平樵春发，忍因缘凝理通。
+试语三岛不下，松径何处。问清将春愁易全窟，且识斗重阳。
+
+---------------
++++浣溪沙（赋木犀）
+芙蓉水浮冰雪梅子。东风半枝都奈粉吹。飞落蕊满清凉。
+枝开夜忽春风护，玉阑凉痕转新碎香。有君恩多少载酒，且道有春风流。
+
+---------------
+++++++++++++++++++++++++++++++++++++++++++++南歌头香慵尘中柳梢青玉案（西江仙香花宫春令（与梅子
+绿碧梧桐梢落后西浣天云隐越山外、宿舟断乱，秋风露满庭芳菲节难过，紫。绿门好，十分飞燕子
+红，秋寒庭楼小西西风，春暮
+---------------
++++++鹧鸪天（和坡衮侑觞）
+薰风须见前衢醉急风入鼎、花生绝团。不问何人公身口厮续厌
+春色肃熟燕子，无限是道行气东风吹。看雨起梦三年。想余春事断自愁厌君。
+
+---------------
++++菩萨蛮（用时春）
+竹花梅犹道何人时节。西篱上花前红。吹落帽风光深。素娥小金。
+暮水朝秋寒。玉堂下梅花共取。小窗堂几举。从教著梅和雨。
+
+---------------
++++++++++最仙歌子（和尉生查子题）
+绿阴山淡黄未泛湘神神仙，美酒，长唱玉纤纤纤手。元何穷何处重约，清寒食、酒家流光光渐、寄新春花晓，小院映烟微香，正是十年瑶楼酒，水暖花枝枝黄昏昏不语，乍见月寂寞痴愠痕、落醉，看花梢啼红裳篆拂堕风流。
+东风吹泪过，
+---------------
+++++++++++++点绛唇头春事近
+花艳心头道酒前春风雨，欲春惨，春去，深自有极目娇几粉，看春词，还爱红云归，绿杨花，旧谢去年时节节，十分真时及华明月。
+醉眼底莺声中秋光幸有豆皇子
+杏花开后黄梅梢仙子，且占客里春风吹乱。
+细雨过春风轻椒香闺催春，小离
+---------------
--- a/hw4/report/img/no_pos_train.png
+++ b/hw4/report/img/no_pos_train.png
--- a/hw4/report/img/no_res_sample.txt
+++ b/hw4/report/img/no_res_sample.txt
@@ -0,0 +1,56 @@
+sample from workdirs/quansongci_no_res/best.pth
+++藕上空都未。消
+---------------
+++。水。香，清干灯翠无月。佳
+---------------
+++烟
+莫。。一
+真。，。，手）+（。当，。，还花。
+。。饱）花清生失楼犹。拂念。。。
+东+柳人。碧放萧似天天饮时
+---------------
+++，一+
+楼。。移。无度此
+，+路风砧东
+---------------
+++，。常明香天。早。+。色。，大，梅子春上妆半枝。奈。吹。飞、，歌。阑故溪枝开夜忽春花。情，重凉痕转。碎沙相，君有园海。奈。
+。会
+---------------
+++。。晓宫。。园。+二盈
+
+钗。+。，恁尾。
+见楼风
+寿到+。尽+。日。。
+---------------
+++。看。月。
+（
+时衮红。自。意
+须去前。醉急风入鼎人花
+。团时。丹翁怨在身云厮。厌
+秋海花拟燕
+，无共宿道行气东。，鸾+雨。梦，
+。。余采
+---------------
++++俊去莺浮
+时重。+功太。犹。头（人一溪+者。斋算。旧
+---------------
+++，人花长和寞。。纵图清孔歌幽
+---------------
+++髻
+。+风与不，干
+柔
+。头余说。花
+。心头道。前，枕相
+。
+忘，情+物。自水极初。几晶
+看。词光。明红主与，。。认，旧。去
+户萨尽玉罢
+不时家。亭，行翠厚情青
+中思难梦。底南星
+。自马
+黄
+我来
+，中+。花
+禁，，也
+。花、。风儿。堂莺催旧，+离
+---------------
--- a/hw4/report/img/no_res_train.png
+++ b/hw4/report/img/no_res_train.png
--- a/hw4/report/img/specific_start_sample.txt
+++ b/hw4/report/img/specific_start_sample.txt
@@ -0,0 +1,51 @@
+sample from workdirs/quansongci/best.pth
+++清平乐（上赋）
+黄花小。相逢去。三得东风何处。人去去年年。谁与他年道。
+屈指两小天。留连心事。最思无意悠悠。无得何曾宽。
+
+---------------
+++清平乐
+京梅晚。几日一清声恶。无限作佳穷时。直见横户快愁儿。
+客已暮云梦，天人未老。心事有天涯无数。人都不须关，只是秋千千里。
+
+---------------
+++清平乐（春）
+红雪动。莫遣梅花开了。不解闲句中花妍。当时未问还近。
+一枝上晚妆清明。帘犹有清香样。欢事消息意迟。东郊飞后便好天。
+
+---------------
+++清平乐
+银烛斜阳。斜雨初飞。日日楼前草移。无限此情休住。
+小莺欲瘦收残梳。更有谁闲愁。却入豆蔻风前。因缘凝理通。
+
+---------------
+++清平乐
+江上秋波。一声歌舞。烟雨里常明烟雨。早来不见人归，犹唤梅子春去。
+好都奈。吹回飞飞来。清凉不知无限夜，春风护雨晚梁归。
+
+---------------
+++清平乐
+春光西去。桂花清扇。天上一声伤春晓。却被园花不尽早。
+钗边绿阴阴犹好。无计不知否。到少离愁去。谁知何处魂。
+
+---------------
+++清平乐（即回）
+六钱地遍。楼前作花间。春暮云愁。月高斜阳远。困红衣自醉。
+好去前时醉，风入泥袖。挼黄团时时问。怨在月明千片春水。
+
+---------------
+++清平乐
+晓来争觉。碧云花向楼。我似秋光也。花来日明月边莺怨。
+春不语飞花知。玉浆不枉劳和困。坐中岂共旧。
+
+---------------
+++清平乐
+残花晚。清闲鬓欲开。金盏一多时。菊花无计绪。娇花开花长。
+谁把酒醒清声。幽心到寒题酒。一片香淡得春人。懒捻黄金眉。
+
+---------------
+++清平乐（月明月）
+醉来人在。春知何时到花时。似来东风识，时时倍度。
+风月不识旧时春宵。万中说枉似、真心头道。前意追相逢。
+
+---------------
--- a/hw4/report/img/train.png
+++ b/hw4/report/img/train.png
--- a/hw4/report/main.tex
+++ b/hw4/report/main.tex
@@ -11,32 +11,37 @@
 \usepackage{graphicx}
 \usepackage{listings}
 \usepackage{color}
+\usepackage{float}
+
+\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
+\newfontfamily\cascadia{Cascadia Code}

 \lstset{
-    basicstyle          =   \sffamily,          % 基本代码风格
-    keywordstyle        =   \bfseries,          % 关键字风格
-    commentstyle        =   \rmfamily\itshape,  % 注释的风格，斜体
-    stringstyle         =   \ttfamily,  % 字符串风格
-    flexiblecolumns,                % 别问为什么，加上这个
-    numbers             =   left,   % 行号的位置在左边
-    showspaces          =   false,  % 是否显示空格，显示了有点乱，所以不现实了
-    numberstyle         =   \zihao{-5}\ttfamily,    % 行号的样式，小五号，tt等宽字体
+    basicstyle          =   \small\codefont,
+    % ---
+    tabsize             =   4,
    showstringspaces    =   false,
-    captionpos          =   t,      % 这段代码的名字所呈现的位置，t指的是top上面
-    frame               =   lrtb,   % 显示边框
+    numbers             =   left,
+    numberstyle         =   \codefont,
+    % ---
+    breaklines          =   true,
+    captionpos          =   t,      
+    % ---
+    frame               =   l,
+    flexiblecolumns,
 }

 \lstdefinestyle{Python}{
    language        =   Python, % 语言选Python
-    basicstyle      =   \zihao{-5}\ttfamily,
-    numberstyle     =   \zihao{-5}\ttfamily,
    keywordstyle    =   \color{blue},
    keywordstyle    =   [2] \color{teal},
-    stringstyle     =   \color{magenta},
-    commentstyle    =   \color{red}\ttfamily,
-    breaklines      =   true,   % 自动换行，建议不要写太长的行
-    columns         =   fixed,  % 如果不加这一句，字间距就不固定，很丑，必须加
-    basewidth       =   0.5em,
+    stringstyle     =   \color{orange!80!black},
+    commentstyle    =   \color{red},
+    identifierstyle =   \color{blue!80!white},
+}
+
+\lstdefinestyle{Bash}{
+    language        =   bash
 }
 \usepackage{subcaption}
 \usepackage{booktabs} % toprule
@@ -49,20 +54,20 @@
 \hwname{作业}
 \begin{document}
 \courseheader
-\name{YOUR NAME}
+\name{高艺轩}
 \vspace{3mm}
 \centerline{\textbf{\Large{理论部分}}}

 \section{单选题（15分）}
-\subsection{\underline{?}}
+\subsection{\underline{D}}

-\subsection{\underline{?}}
+\subsection{\underline{A}}

-\subsection{\underline{?}}
+\subsection{\underline{A}}

-\subsection{\underline{?}}
+\subsection{\underline{C}}

-\subsection{\underline{?}}
+\subsection{\underline{B}}

 \section{计算题（15 分）}
 % 计算题1
@@ -79,16 +84,98 @@
 \vspace{3mm}
 (1) 采用隐含马尔可夫模型(HMM)对小E暑假体育活动安排进行建模，{\color{blue}请写出HMM对应的参数$\lambda=\{\pi, A, B\}$}。

+\begin{proof}[解]
+    \[\pi = \begin{bmatrix}
+        0.5\\0.5
+    \end{bmatrix}\]
+    \[A = \begin{bmatrix}
+        0.5 & 0.5\\
+        0.6 & 0.4\\
+    \end{bmatrix}\]
+    \[B = \begin{bmatrix}
+        0.6 & 0.2 & 0.2\\
+        0.1 & 0.6 & 0.3
+    \end{bmatrix}\]
+\end{proof}
+
 \vspace{3mm}
 (2) 假设暑假第1、2、3天小E所进行的体育活动依次为跑步（R）、打球（B）和游泳（S），{\color{blue}请计算出现该观测序列的概率}。

+\begin{proof}[解]
+    \begin{align*}
+        \alpha_1(S_1) & = 0.5 \times 0.6 = 0.3\\
+        \alpha_1(S_2) & = 0.5 \times 0.1 = 0.05\\
+        \alpha_2(S_1) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.6) \times 0.2\\
+        & = 0.036\\
+        \alpha_2(S_2) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.4) \times 0.3\\
+        & = 0.051\\
+        \alpha_3(S_1) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.6) \times 0.2\\
+        & = 0.00972\\
+        \alpha_3(S_2) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.4) \times 0.6\\
+        & = 0.02304\\
+        P(O \mid \lambda) & = \alpha_3(S_1) + \alpha_3(S_2) = 0.03276
+    \end{align*}
+\end{proof}
+
 \vspace{3mm}
 (3) 在(2)的条件下。{\color{blue}请利用Viterbi算法推测暑假第1、2、3天最可能的隐含状态序列}。

+\begin{figure}[H]
+    \centering
+    \includegraphics[width=\linewidth]{img/20240526_155701910_iOS.png}
+\end{figure}
+

 % 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
 \section{编程作业报告}
-\section{自选课题工作进度汇报}
+\subsection{模型的训练与测试}
+首先进行数据预处理。预处理后进行模型训练，训练的结果见图\ref{fig:default_train}。
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=\linewidth]{img/train.png}
+    \caption{默认测试}
+    \label{fig:default_train}
+\end{figure}
+
+默认配置的生成样本：
+\begin{lstlisting}
+python sample.py --ckpt_path workdirs/quansongci
+\end{lstlisting}
+得到的输出为
+\lstinputlisting{img/default_sample.txt}
+若指定初始文本：
+\begin{lstlisting}
+python sample.py --ckpt_path workdirs/quansongci --start +++清平乐
+\end{lstlisting}
+得到的输出为
+\lstinputlisting{img/specific_start_sample.txt}
+
+\subsection{探究位置编码和残差连接在模型中的作用}
+关闭位置编码的训练：
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=\linewidth]{img/no_pos_train.png}
+\end{figure}
+得到的生成结果：
+\lstinputlisting{img/no_pos_sample.txt}
+可以看到，关闭位置编码后，模型没有很好地理解句子长度之间的关系。
+
+关闭残差连接的训练：
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=\linewidth]{img/no_res_train.png}
+\end{figure}
+得到的生成结果：
+\lstinputlisting{img/no_res_sample.txt}
+模型训练遇到了梯度消失的问题，很难有效地训练。
+
+\subsection{可视化}
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=.8\linewidth]{img/attention_vis.png}
+\end{figure}
+
+许多词语的注意力权重都集中在题目（词牌名）的几个字上，可以看到模型主要学习了词牌名与正文内容之间的相关性。

 \end{document}

--- a/j.ps1
+++ b/j.ps1
@@ -1 +1 @@
-cd ./hw3/code
+cd ./hw4/code
--- a/testtorch.ipynb
+++ b/testtorch.ipynb
@@ -10,7 +10,9 @@
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "\n",
-    "import torchvision.transforms as transforms"
+    "import torchvision.transforms as transforms\n",
+    "\n",
+    "import numpy as np"
   ]
  },
  {
@@ -212,6 +214,63 @@
    "b = torch.Tensor([1])\n",
    "print((a.T * b).T)"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[False,  True,  True,  True,  True],\n",
+      "        [False, False,  True,  True,  True],\n",
+      "        [False, False, False,  True,  True],\n",
+      "        [False, False, False, False,  True],\n",
+      "        [False, False, False, False, False]])\n",
+      "tensor([[-0.1170,  0.6130,  0.9644, -1.2733, -0.9671],\n",
+      "        [-0.7806,  0.5082, -0.2731,  0.1660, -0.5451],\n",
+      "        [-2.1527, -0.5059, -0.0079, -0.5796, -1.1107],\n",
+      "        [-1.8357, -0.8010, -0.0424,  0.1491, -1.5009],\n",
+      "        [-1.3666, -0.8209,  0.0483, -1.3165, -0.9222]])\n",
+      "tensor([[-0.1170,    -inf,    -inf,    -inf,    -inf],\n",
+      "        [-0.7806,  0.5082,    -inf,    -inf,    -inf],\n",
+      "        [-2.1527, -0.5059, -0.0079,    -inf,    -inf],\n",
+      "        [-1.8357, -0.8010, -0.0424,  0.1491,    -inf],\n",
+      "        [-1.3666, -0.8209,  0.0483, -1.3165, -0.9222]])\n"
+     ]
+    }
+   ],
+   "source": [
+    "mask = torch.triu(torch.ones(5, 5), diagonal=1).bool()\n",
+    "print(mask)\n",
+    "attn = torch.randn(5, 5)\n",
+    "print(attn)\n",
+    "print(attn.masked_fill(mask, -np.inf))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([0.1402, 0.2312, 0.6285])\n"
+     ]
+    }
+   ],
+   "source": [
+    "Q = torch.Tensor([1, 0, 1, 1])\n",
+    "K = torch.Tensor([[0, 0, 0, 2],\n",
+    "                  [2, 0, 1, 0],\n",
+    "                  [2, 1, 2, 1]])\n",
+    "\n",
+    "print(torch.softmax((Q @ K.T) / 2, dim=0))"
+   ]
  }
 ],
 "metadata": {