Homework4 Submit.
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -11,4 +11,5 @@ __pycache__/
|
||||
hw2/code/checkpoints/
|
||||
hw2/code/visualized/
|
||||
hw3/code/data/
|
||||
hw3/code/checkpoints/
|
||||
hw3/code/checkpoints/
|
||||
hw4/code/workdirs/
|
||||
File diff suppressed because one or more lines are too long
22640
hw4/code/data/quansongci/train.json
Normal file
22640
hw4/code/data/quansongci/train.json
Normal file
File diff suppressed because it is too large
Load Diff
11904
hw4/code/data/quansongci/val.json
Normal file
11904
hw4/code/data/quansongci/val.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -2,8 +2,8 @@
|
||||
# Media and Cognition
|
||||
# Homework 4 Sequence Modeling
|
||||
# model.py - Model definition
|
||||
# Student ID:
|
||||
# Name:
|
||||
# Student ID: 2022010639
|
||||
# Name: Yixuan Gao
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
@@ -60,54 +60,54 @@ class SelfAttention(nn.Module):
|
||||
# Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
|
||||
# the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
|
||||
# num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
|
||||
q = ???
|
||||
k = ???
|
||||
v = ???
|
||||
q = self.q_layer(x)
|
||||
k = self.k_layer(x)
|
||||
v = self.v_layer(x)
|
||||
|
||||
# Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
|
||||
# first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
|
||||
q = ???
|
||||
k = ???
|
||||
v = ???
|
||||
q = q.reshape(batch_size, seq_len, self.num_head, self.head_dim)
|
||||
k = k.reshape(batch_size, seq_len, self.num_head, self.head_dim)
|
||||
v = v.reshape(batch_size, seq_len, self.num_head, self.head_dim)
|
||||
|
||||
# then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dim of q, k, v for matrix multiplication
|
||||
# the shape of q, k, v from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
|
||||
q = ???
|
||||
k = ???
|
||||
v = ???
|
||||
q = q.transpose(1, 2)
|
||||
k = k.transpose(1, 2)
|
||||
v = v.transpose(1, 2)
|
||||
|
||||
# Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
|
||||
# Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
|
||||
# the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
|
||||
attn = ???
|
||||
attn = (q @ k.transpose(2, 3)) / torch.sqrt(torch.tensor(self.head_dim))
|
||||
|
||||
# Step 1.3.2: for an Auto-Regressive Language Model, the prediction at position i can depend only on the inputs at positions up to and including i.
|
||||
# Therefore, a mask is used to prevent positions from attending to subsequent positions
|
||||
# attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is an upper triangular matrix with shape (seq_len, seq_len)
|
||||
# Hint:
|
||||
# use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
|
||||
attn_mask = ???
|
||||
attn_mask = torch.ones(seq_len, seq_len, device=attn.device)
|
||||
# use torch.triu(?, diagonal=1) to obtain an upper triangular matrix where elements on the main diagonal are 0
|
||||
attn_mask = ???
|
||||
attn_mask = torch.triu(attn_mask, diagonal=1)
|
||||
# use Tensor.bool() to convert the matrix to a boolean matrix
|
||||
attn_mask = ???
|
||||
attn_mask = attn_mask.bool()
|
||||
# fill the position of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
|
||||
attn = ???
|
||||
attn = attn.masked_fill(attn_mask, -np.inf)
|
||||
|
||||
# Step 1.3.3: normalize `attn` via softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
|
||||
attn = ???
|
||||
attn = torch.softmax(attn, dim=3)
|
||||
# Step 1.3.4: apply dropout to `attn` via self.attn_drop()
|
||||
attn = ???
|
||||
attn = self.attn_drop(attn)
|
||||
# Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
|
||||
# the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
|
||||
out = ???
|
||||
out = attn @ v
|
||||
|
||||
# Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate output of different heads
|
||||
# the shape of `out` from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
|
||||
out = ???
|
||||
out = out.transpose(1, 2).reshape(batch_size, seq_len, self.num_head * self.head_dim)
|
||||
|
||||
# Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
|
||||
result = ???
|
||||
result = self.proj_drop(self.proj_layer(out))
|
||||
# <<< TODO 1
|
||||
|
||||
# return the final results `result` and attention weights `attn`
|
||||
@@ -147,24 +147,24 @@ class TransformerLayer(nn.Module):
|
||||
# >>> TODO 2: complete the forward process of the TransformerLayer module.
|
||||
# Step 2.1: calculate the output of multi-head self-attention
|
||||
# normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
|
||||
x_norm = ???
|
||||
x_norm = self.norm1(x)
|
||||
|
||||
# calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
|
||||
x_attn, attn = ???
|
||||
x_attn, attn = self.attn(x_norm)
|
||||
|
||||
# add the input 'x' to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
|
||||
if ???:
|
||||
x_attn = ???
|
||||
if not self.no_res:
|
||||
x_attn = x_attn + x
|
||||
|
||||
# Step 2.2: calculate the output of feed forward network
|
||||
# calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
|
||||
x_ffn = ???
|
||||
x_ffn = self.ffn(self.norm2(x_attn))
|
||||
|
||||
# add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
|
||||
if ???:
|
||||
out = ???
|
||||
if not self.no_res:
|
||||
out = x_attn + x_ffn
|
||||
else:
|
||||
out = ???
|
||||
out = x_ffn
|
||||
# <<< TODO 2
|
||||
|
||||
return out, attn
|
||||
@@ -230,36 +230,36 @@ class GPT(nn.Module):
|
||||
|
||||
# >>> TODO 3: complete the forward process of GPT
|
||||
# Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1]
|
||||
pos = ???
|
||||
pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)
|
||||
|
||||
# Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to convert `word_idx` and `pos` to embeddings (`token_embed` and `pos_embed`)
|
||||
token_embed = ???
|
||||
pos_embed = ???
|
||||
token_embed = self.word_token_embedding(word_idx)
|
||||
pos_embed = self.word_pos_embedding(pos)
|
||||
|
||||
# Step 3.3: initialize the input embeddings `x` of transformer layers
|
||||
# add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
|
||||
if ???:
|
||||
x = ???
|
||||
if not self.no_pos:
|
||||
x = token_embed + pos_embed
|
||||
else:
|
||||
x = ???
|
||||
x = token_embed
|
||||
|
||||
# apply dropout to the input embeddings via `self.drop()`
|
||||
x = ???
|
||||
x = self.drop(x)
|
||||
|
||||
# Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
|
||||
# define a list `attention_weights` and append the attention weights of each transformer layer into the list
|
||||
attention_weights = ???
|
||||
for ???:
|
||||
attention_weights = list()
|
||||
for i in range(self.num_layer):
|
||||
# Step 4.1: obtain the output and attention weights of transformer layers
|
||||
x, attn = ???
|
||||
x, attn = self.transformer[i](x)
|
||||
# Step 4.2: append the attention weights of transformer layers into the list `attention_weights`
|
||||
???
|
||||
attention_weights.append(attn)
|
||||
|
||||
# Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
|
||||
# self.language_model_head() is a linear layer defined in __init__() function
|
||||
# Note: do not add softmax here since it is included in the cross entropy loss function
|
||||
x = ???
|
||||
logits = ???
|
||||
x = self.norm(x)
|
||||
logits = self.language_model_head(x)
|
||||
# <<< TODO 3
|
||||
|
||||
# return logits and loss or attention weights
|
||||
|
||||
@@ -17,7 +17,7 @@ def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root,
|
||||
# model
|
||||
dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
|
||||
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
|
||||
ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
|
||||
ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
|
||||
# init from a model saved in a specific directory
|
||||
ckpt_path = os.path.join(ckpt_path, 'best.pth')
|
||||
print("sample from %s"%ckpt_path)
|
||||
|
||||
@@ -45,7 +45,7 @@ def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, d
|
||||
|
||||
dtype = 'bfloat16' if device == 'cpu' else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
|
||||
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
|
||||
ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
|
||||
ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
|
||||
#ctx = torch.autocast(device_type=device, dtype=ptdtype)
|
||||
best_val_loss = 1e9
|
||||
iter_num = 0 # number of iterations in the lifetime of this process
|
||||
|
||||
BIN
hw4/report/img/20240526_155701910_iOS.png
Normal file
BIN
hw4/report/img/20240526_155701910_iOS.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 186 KiB |
BIN
hw4/report/img/attention_vis.png
Normal file
BIN
hw4/report/img/attention_vis.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
49
hw4/report/img/default_sample.txt
Normal file
49
hw4/report/img/default_sample.txt
Normal file
@@ -0,0 +1,49 @@
|
||||
sample from workdirs/quansongci/best.pth
|
||||
+++水调歌头
|
||||
黄花满疏雨,月扫三宫。月明月明人去,绿绵声里,风光残霞。屈指两小天天静,绿满阶外,更相逢。那处得何曾小,泪断肠头。
|
||||
|
||||
---------------
|
||||
+++浣溪沙(五清)
|
||||
翠雾玉奁烘蝉。轻姿未放花光。青袍有客已暮花。
|
||||
天人未遇向西楼。小阳春水一线清。玉壶重重重。
|
||||
|
||||
---------------
|
||||
+++菩萨蛮(梅)
|
||||
江南窗前月远中花。水高远。暗还花色碧。只恨欢事清。
|
||||
楼上宴琼线。更欢归消息。柳边女碧云。便是天涯时。
|
||||
|
||||
---------------
|
||||
+++菩萨蛮
|
||||
江上秋移香无度。凉风闹愁风。莺声瘦了归时未。小楼闲愁忆。
|
||||
豆蔻风前好因缘。送通住。试问三山同。人间无处难。
|
||||
|
||||
---------------
|
||||
+++秦楼月
|
||||
练雨梳妆。桃叶半枝,冰肌红子春寒。半枝都奈。吹香飞絮,记清凉。
|
||||
无限夜云春风护。玉阑无数转。碎帽孤情君,小海东风。
|
||||
|
||||
---------------
|
||||
+++浪淘沙
|
||||
橘上园阳关路早。绿钗风雨散,犹被东湖见楼。
|
||||
仿佛风前坡上去日,月如流。想取东南风。犹慵尘尽比重归。
|
||||
|
||||
---------------
|
||||
+++诉衷情(高人)
|
||||
时候又来深。长是红帘前。醉眼风入春期。
|
||||
应是时时,何处在、应厮续。
|
||||
|
||||
---------------
|
||||
+++浣溪沙(咏梅)
|
||||
离斟客太白犹如。不知常是西篱中。岂怜旧君些儿以言。
|
||||
素娥小山小曲,水朝元有长安。一榻了共取大家。
|
||||
|
||||
---------------
|
||||
+++浣溪沙(和怀)
|
||||
纵图清露歌黛倚,寒题金銮声珊瑚。十年人来懒舞丝。
|
||||
|
||||
---------------
|
||||
+++满江月
|
||||
风月不如旧,柔条欲到春风。掩花间心,道处难臾、相逢。
|
||||
陇头情不物里,阿谁向娇几。且看东词,还明红云与,一笑认教梳灯。
|
||||
|
||||
---------------
|
||||
49
hw4/report/img/no_pos_sample.txt
Normal file
49
hw4/report/img/no_pos_sample.txt
Normal file
@@ -0,0 +1,49 @@
|
||||
sample from workdirs/quansongci_no_pos/best.pth
|
||||
++++++++菩萨蛮(牡丹月近)
|
||||
江月明月明月桂华开客。金交风枝残月到东风前。天色浸柳前风垂杨花更觉。坐角雪初开花小屏。断断头春风光薄。
|
||||
春色悄。隔帘前阴转香千里。好破云深岸波波。不恨相思量。羞酌炉香何处。
|
||||
|
||||
---------------
|
||||
++++浣溪沙
|
||||
清歌灯未无限。佳期时更传人不醉里,可奈有芳菲节懒。
|
||||
双蛾罗带向西楼。小小槛春寒人都怨,燕子未销眉花。
|
||||
|
||||
---------------
|
||||
++++++++++++++++++++临江仙歌香花天
|
||||
九月桃源长风留春风投宴琼桃李仙。一曾东风迟丽女
|
||||
放萧词传天稼时常相逢,还记,酒,占春寒花间风光相住,月劝花往事,占春留思,应春风到上,无人间一线秀船归来,点面皱。□□□□□□□□□□□□。都为谁老还来
|
||||
---------------
|
||||
++++鹧鸪天(十二之二)
|
||||
此见元是一声砧。紫鹤收残梳匀舞、谁家。正是平樵春发,忍因缘凝理通。
|
||||
试语三岛不下,松径何处。问清将春愁易全窟,且识斗重阳。
|
||||
|
||||
---------------
|
||||
++++浣溪沙(赋木犀)
|
||||
芙蓉水浮冰雪梅子。东风半枝都奈粉吹。飞落蕊满清凉。
|
||||
枝开夜忽春风护,玉阑凉痕转新碎香。有君恩多少载酒,且道有春风流。
|
||||
|
||||
---------------
|
||||
+++++++++++++++++++++++++++++++++++++++++++++南歌头香慵尘中柳梢青玉案(西江仙香花宫春令(与梅子
|
||||
绿碧梧桐梢落后西浣天云隐越山外、宿舟断乱,秋风露满庭芳菲节难过,紫。绿门好,十分飞燕子
|
||||
红,秋寒庭楼小西西风,春暮
|
||||
---------------
|
||||
++++++鹧鸪天(和坡衮侑觞)
|
||||
薰风须见前衢醉急风入鼎、花生绝团。不问何人公身口厮续厌
|
||||
春色肃熟燕子,无限是道行气东风吹。看雨起梦三年。想余春事断自愁厌君。
|
||||
|
||||
---------------
|
||||
++++菩萨蛮(用时春)
|
||||
竹花梅犹道何人时节。西篱上花前红。吹落帽风光深。素娥小金。
|
||||
暮水朝秋寒。玉堂下梅花共取。小窗堂几举。从教著梅和雨。
|
||||
|
||||
---------------
|
||||
++++++++++最仙歌子(和尉生查子题)
|
||||
绿阴山淡黄未泛湘神神仙,美酒,长唱玉纤纤纤手。元何穷何处重约,清寒食、酒家流光光渐、寄新春花晓,小院映烟微香,正是十年瑶楼酒,水暖花枝枝黄昏昏不语,乍见月寂寞痴愠痕、落醉,看花梢啼红裳篆拂堕风流。
|
||||
东风吹泪过,
|
||||
---------------
|
||||
+++++++++++++点绛唇头春事近
|
||||
花艳心头道酒前春风雨,欲春惨,春去,深自有极目娇几粉,看春词,还爱红云归,绿杨花,旧谢去年时节节,十分真时及华明月。
|
||||
醉眼底莺声中秋光幸有豆皇子
|
||||
杏花开后黄梅梢仙子,且占客里春风吹乱。
|
||||
细雨过春风轻椒香闺催春,小离
|
||||
---------------
|
||||
BIN
hw4/report/img/no_pos_train.png
Normal file
BIN
hw4/report/img/no_pos_train.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 71 KiB |
56
hw4/report/img/no_res_sample.txt
Normal file
56
hw4/report/img/no_res_sample.txt
Normal file
@@ -0,0 +1,56 @@
|
||||
sample from workdirs/quansongci_no_res/best.pth
|
||||
+++藕上空都未。消
|
||||
---------------
|
||||
+++。水。香,清干灯翠无月。佳
|
||||
---------------
|
||||
+++烟
|
||||
莫。。一
|
||||
真。,。,手)+(。当,。,还花。
|
||||
。。饱)花清生失楼犹。拂念。。。
|
||||
+东+柳人。碧放萧似天天饮时
|
||||
---------------
|
||||
+++,一+
|
||||
楼。。移。无度此
|
||||
,+路风砧东
|
||||
---------------
|
||||
+++,。常明香天。早。+。色。,大,梅子春上妆半枝。奈。吹。飞、,歌。阑故溪枝开夜忽春花。情,重凉痕转。碎沙相,君有园海。奈。
|
||||
。会
|
||||
---------------
|
||||
+++。。晓宫。。园。+二盈
|
||||
|
||||
钗。+。,恁尾。
|
||||
见楼风
|
||||
寿到+。尽+。日。。
|
||||
---------------
|
||||
+++。看。月。
|
||||
(
|
||||
时衮红。自。意
|
||||
须去前。醉急风入鼎人花
|
||||
。团时。丹翁怨在身云厮。厌
|
||||
秋海花拟燕
|
||||
,无共宿道行气东。,鸾+雨。梦,
|
||||
。。余采
|
||||
---------------
|
||||
++++俊去莺浮
|
||||
时重。+功太。犹。头(人一溪+者。斋算。旧
|
||||
---------------
|
||||
+++,人花长和寞。。纵图清孔歌幽
|
||||
---------------
|
||||
+++髻
|
||||
。+风与不,干
|
||||
柔
|
||||
。头余说。花
|
||||
。心头道。前,枕相
|
||||
。
|
||||
忘,情+物。自水极初。几晶
|
||||
看。词光。明红主与,。。认,旧。去
|
||||
户萨尽玉罢
|
||||
不时家。亭,行翠厚情青
|
||||
+中思难梦。底南星
|
||||
。自马
|
||||
黄
|
||||
我来
|
||||
,中+。花
|
||||
禁,,也
|
||||
。花、。风儿。堂莺催旧,+离
|
||||
---------------
|
||||
BIN
hw4/report/img/no_res_train.png
Normal file
BIN
hw4/report/img/no_res_train.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 75 KiB |
51
hw4/report/img/specific_start_sample.txt
Normal file
51
hw4/report/img/specific_start_sample.txt
Normal file
@@ -0,0 +1,51 @@
|
||||
sample from workdirs/quansongci/best.pth
|
||||
+++清平乐(上赋)
|
||||
黄花小。相逢去。三得东风何处。人去去年年。谁与他年道。
|
||||
屈指两小天。留连心事。最思无意悠悠。无得何曾宽。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
京梅晚。几日一清声恶。无限作佳穷时。直见横户快愁儿。
|
||||
客已暮云梦,天人未老。心事有天涯无数。人都不须关,只是秋千千里。
|
||||
|
||||
---------------
|
||||
+++清平乐(春)
|
||||
红雪动。莫遣梅花开了。不解闲句中花妍。当时未问还近。
|
||||
一枝上晚妆清明。帘犹有清香样。欢事消息意迟。东郊飞后便好天。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
银烛斜阳。斜雨初飞。日日楼前草移。无限此情休住。
|
||||
小莺欲瘦收残梳。更有谁闲愁。却入豆蔻风前。因缘凝理通。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
江上秋波。一声歌舞。烟雨里常明烟雨。早来不见人归,犹唤梅子春去。
|
||||
好都奈。吹回飞飞来。清凉不知无限夜,春风护雨晚梁归。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
春光西去。桂花清扇。天上一声伤春晓。却被园花不尽早。
|
||||
钗边绿阴阴犹好。无计不知否。到少离愁去。谁知何处魂。
|
||||
|
||||
---------------
|
||||
+++清平乐(即回)
|
||||
六钱地遍。楼前作花间。春暮云愁。月高斜阳远。困红衣自醉。
|
||||
好去前时醉,风入泥袖。挼黄团时时问。怨在月明千片春水。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
晓来争觉。碧云花向楼。我似秋光也。花来日明月边莺怨。
|
||||
春不语飞花知。玉浆不枉劳和困。坐中岂共旧。
|
||||
|
||||
---------------
|
||||
+++清平乐
|
||||
残花晚。清闲鬓欲开。金盏一多时。菊花无计绪。娇花开花长。
|
||||
谁把酒醒清声。幽心到寒题酒。一片香淡得春人。懒捻黄金眉。
|
||||
|
||||
---------------
|
||||
+++清平乐(月明月)
|
||||
醉来人在。春知何时到花时。似来东风识,时时倍度。
|
||||
风月不识旧时春宵。万中说枉似、真心头道。前意追相逢。
|
||||
|
||||
---------------
|
||||
BIN
hw4/report/img/train.png
Normal file
BIN
hw4/report/img/train.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 72 KiB |
@@ -11,32 +11,37 @@
|
||||
\usepackage{graphicx}
|
||||
\usepackage{listings}
|
||||
\usepackage{color}
|
||||
\usepackage{float}
|
||||
|
||||
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
|
||||
\newfontfamily\cascadia{Cascadia Code}
|
||||
|
||||
\lstset{
|
||||
basicstyle = \sffamily, % 基本代码风格
|
||||
keywordstyle = \bfseries, % 关键字风格
|
||||
commentstyle = \rmfamily\itshape, % 注释的风格,斜体
|
||||
stringstyle = \ttfamily, % 字符串风格
|
||||
flexiblecolumns, % 别问为什么,加上这个
|
||||
numbers = left, % 行号的位置在左边
|
||||
showspaces = false, % 是否显示空格,显示了有点乱,所以不现实了
|
||||
numberstyle = \zihao{-5}\ttfamily, % 行号的样式,小五号,tt等宽字体
|
||||
basicstyle = \small\codefont,
|
||||
% ---
|
||||
tabsize = 4,
|
||||
showstringspaces = false,
|
||||
captionpos = t, % 这段代码的名字所呈现的位置,t指的是top上面
|
||||
frame = lrtb, % 显示边框
|
||||
numbers = left,
|
||||
numberstyle = \codefont,
|
||||
% ---
|
||||
breaklines = true,
|
||||
captionpos = t,
|
||||
% ---
|
||||
frame = l,
|
||||
flexiblecolumns,
|
||||
}
|
||||
|
||||
\lstdefinestyle{Python}{
|
||||
language = Python, % 语言选Python
|
||||
basicstyle = \zihao{-5}\ttfamily,
|
||||
numberstyle = \zihao{-5}\ttfamily,
|
||||
keywordstyle = \color{blue},
|
||||
keywordstyle = [2] \color{teal},
|
||||
stringstyle = \color{magenta},
|
||||
commentstyle = \color{red}\ttfamily,
|
||||
breaklines = true, % 自动换行,建议不要写太长的行
|
||||
columns = fixed, % 如果不加这一句,字间距就不固定,很丑,必须加
|
||||
basewidth = 0.5em,
|
||||
stringstyle = \color{orange!80!black},
|
||||
commentstyle = \color{red},
|
||||
identifierstyle = \color{blue!80!white},
|
||||
}
|
||||
|
||||
\lstdefinestyle{Bash}{
|
||||
language = bash
|
||||
}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{booktabs} % toprule
|
||||
@@ -49,20 +54,20 @@
|
||||
\hwname{作业}
|
||||
\begin{document}
|
||||
\courseheader
|
||||
\name{YOUR NAME}
|
||||
\name{高艺轩}
|
||||
\vspace{3mm}
|
||||
\centerline{\textbf{\Large{理论部分}}}
|
||||
|
||||
\section{单选题(15分)}
|
||||
\subsection{\underline{?}}
|
||||
\subsection{\underline{D}}
|
||||
|
||||
\subsection{\underline{?}}
|
||||
\subsection{\underline{A}}
|
||||
|
||||
\subsection{\underline{?}}
|
||||
\subsection{\underline{A}}
|
||||
|
||||
\subsection{\underline{?}}
|
||||
\subsection{\underline{C}}
|
||||
|
||||
\subsection{\underline{?}}
|
||||
\subsection{\underline{B}}
|
||||
|
||||
\section{计算题(15 分)}
|
||||
% 计算题1
|
||||
@@ -79,16 +84,98 @@
|
||||
\vspace{3mm}
|
||||
(1) 采用隐含马尔可夫模型(HMM)对小E暑假体育活动安排进行建模,{\color{blue}请写出HMM对应的参数$\lambda=\{\pi, A, B\}$}。
|
||||
|
||||
\begin{proof}[解]
|
||||
\[\pi = \begin{bmatrix}
|
||||
0.5\\0.5
|
||||
\end{bmatrix}\]
|
||||
\[A = \begin{bmatrix}
|
||||
0.5 & 0.5\\
|
||||
0.6 & 0.4\\
|
||||
\end{bmatrix}\]
|
||||
\[B = \begin{bmatrix}
|
||||
0.6 & 0.2 & 0.2\\
|
||||
0.1 & 0.6 & 0.3
|
||||
\end{bmatrix}\]
|
||||
\end{proof}
|
||||
|
||||
\vspace{3mm}
|
||||
(2) 假设暑假第1、2、3天小E所进行的体育活动依次为跑步(R)、打球(B)和游泳(S),{\color{blue}请计算出现该观测序列的概率}。
|
||||
|
||||
\begin{proof}[解]
|
||||
\begin{align*}
|
||||
\alpha_1(S_1) & = 0.5 \times 0.6 = 0.3\\
|
||||
\alpha_1(S_2) & = 0.5 \times 0.1 = 0.05\\
|
||||
\alpha_2(S_1) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.6) \times 0.2\\
|
||||
& = 0.036\\
|
||||
\alpha_2(S_2) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.4) \times 0.3\\
|
||||
& = 0.051\\
|
||||
\alpha_3(S_1) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.6) \times 0.2\\
|
||||
& = 0.00972\\
|
||||
\alpha_3(S_2) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.4) \times 0.6\\
|
||||
& = 0.02304\\
|
||||
P(O \mid \lambda) & = \alpha_3(S_1) + \alpha_3(S_2) = 0.03276\\
|
||||
\end{align*}
|
||||
\end{proof}
|
||||
|
||||
\vspace{3mm}
|
||||
(3) 在(2)的条件下,{\color{blue}请利用Viterbi算法推测暑假第1、2、3天最可能的隐含状态序列}。
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/20240526_155701910_iOS.png}
|
||||
\end{figure}
|
||||
|
||||
|
||||
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
|
||||
\section{编程作业报告}
|
||||
\section{自选课题工作进度汇报}
|
||||
\subsection{模型的训练与测试}
|
||||
首先进行数据预处理。预处理后进行模型训练,训练的结果见图\ref{fig:default_train}。
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/train.png}
|
||||
\caption{默认测试}
|
||||
\label{fig:default_train}
|
||||
\end{figure}
|
||||
|
||||
默认配置的生成样本:
|
||||
\begin{lstlisting}
|
||||
python sample.py --ckpt_path workdirs/quansongci
|
||||
\end{lstlisting}
|
||||
得到的输出为
|
||||
\lstinputlisting{img/default_sample.txt}
|
||||
若指定初始文本:
|
||||
\begin{lstlisting}
|
||||
python sample.py --ckpt_path workdirs/quansongci --start +++清平乐
|
||||
\end{lstlisting}
|
||||
得到的输出为
|
||||
\lstinputlisting{img/specific_start_sample.txt}
|
||||
|
||||
\subsection{探究位置编码和残差连接在模型中的作用}
|
||||
关闭位置编码的训练:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/no_pos_train.png}
|
||||
\end{figure}
|
||||
得到的生成结果:
|
||||
\lstinputlisting{img/no_pos_sample.txt}
|
||||
可以看到,模型没有很好理解句子的长度的关系。
|
||||
|
||||
关闭残差连接的训练:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/no_res_train.png}
|
||||
\end{figure}
|
||||
得到的生成结果:
|
||||
\lstinputlisting{img/no_res_sample.txt}
|
||||
模型训练遇到了梯度消失的问题,很难有效地训练。
|
||||
|
||||
\subsection{可视化}
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=.8\linewidth]{img/attention_vis.png}
|
||||
\end{figure}
|
||||
|
||||
许多词语的注意力系数都会集中在题目的几个字上,可以看到模型主要是分析了不同词牌名与内容之间的相关性。
|
||||
|
||||
\end{document}
|
||||
|
||||
|
||||
@@ -10,7 +10,9 @@
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"\n",
|
||||
"import torchvision.transforms as transforms"
|
||||
"import torchvision.transforms as transforms\n",
|
||||
"\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -212,6 +214,63 @@
|
||||
"b = torch.Tensor([1])\n",
|
||||
"print((a.T * b).T)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([[False, True, True, True, True],\n",
|
||||
" [False, False, True, True, True],\n",
|
||||
" [False, False, False, True, True],\n",
|
||||
" [False, False, False, False, True],\n",
|
||||
" [False, False, False, False, False]])\n",
|
||||
"tensor([[-0.1170, 0.6130, 0.9644, -1.2733, -0.9671],\n",
|
||||
" [-0.7806, 0.5082, -0.2731, 0.1660, -0.5451],\n",
|
||||
" [-2.1527, -0.5059, -0.0079, -0.5796, -1.1107],\n",
|
||||
" [-1.8357, -0.8010, -0.0424, 0.1491, -1.5009],\n",
|
||||
" [-1.3666, -0.8209, 0.0483, -1.3165, -0.9222]])\n",
|
||||
"tensor([[-0.1170, -inf, -inf, -inf, -inf],\n",
|
||||
" [-0.7806, 0.5082, -inf, -inf, -inf],\n",
|
||||
" [-2.1527, -0.5059, -0.0079, -inf, -inf],\n",
|
||||
" [-1.8357, -0.8010, -0.0424, 0.1491, -inf],\n",
|
||||
" [-1.3666, -0.8209, 0.0483, -1.3165, -0.9222]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"mask = torch.triu(torch.ones(5, 5), diagonal=1).bool()\n",
|
||||
"print(mask)\n",
|
||||
"attn = torch.randn(5, 5)\n",
|
||||
"print(attn)\n",
|
||||
"print(attn.masked_fill(mask, -np.inf))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([0.1402, 0.2312, 0.6285])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"Q = torch.Tensor([1, 0, 1, 1])\n",
|
||||
"K = torch.Tensor([[0, 0, 0, 2],\n",
|
||||
" [2, 0, 1, 0],\n",
|
||||
" [2, 1, 2, 1]])\n",
|
||||
"\n",
|
||||
"print(torch.softmax((Q @ K.T) / 2, dim=0))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
||||
Reference in New Issue
Block a user