Homework4 Submit.

This commit is contained in:
unlockable
2024-05-27 00:01:48 +08:00
parent c6b2420b85
commit 76a643ebc4
19 changed files with 35031 additions and 76 deletions

3
.gitignore vendored
View File

@@ -11,4 +11,5 @@ __pycache__/
hw2/code/checkpoints/
hw2/code/visualized/
hw3/code/data/
hw3/code/checkpoints/
hw3/code/checkpoints/
hw4/code/workdirs/

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -2,8 +2,8 @@
# Media and Cognition
# Homework 4 Sequence Modeling
# model.py - Model definition
# Student ID:
# Name:
# Student ID: 2022010639
# Name: Yixuan Gao
# Tsinghua University
# (C) Copyright 2024
# ========================================================
@@ -60,54 +60,54 @@ class SelfAttention(nn.Module):
# Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
# the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
# num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
q = ???
k = ???
v = ???
q = self.q_layer(x)
k = self.k_layer(x)
v = self.v_layer(x)
# Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
# first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
q = ???
k = ???
v = ???
q = q.reshape(batch_size, seq_len, self.num_head, self.head_dim)
k = k.reshape(batch_size, seq_len, self.num_head, self.head_dim)
v = v.reshape(batch_size, seq_len, self.num_head, self.head_dim)
# then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dim of q, k, v for matrix multiplication
# the shape of q, k, v from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
q = ???
k = ???
v = ???
q = q.transpose(1, 2)
k = k.transpose(1, 2)
v = v.transpose(1, 2)
# Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
# Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
# the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
attn = ???
attn = (q @ k.transpose(2, 3)) / torch.sqrt(torch.tensor(self.head_dim))
# Step 1.3.2: for Auto-Regressive Language Model, the prediction at position i can depend only on the inputs at positions up to and including i.
# Therefore, a mask is used to prevent positions from attending to subsequent positions
# attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is an upper triangular matrix with shape (seq_len, seq_len)
# Hint:
# use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
attn_mask = ???
attn_mask = torch.ones(seq_len, seq_len, device=attn.device)
# use torch.triu(?, diagonal=1) to obtain an upper triangular matrix where elements on the main diagonal are 0
attn_mask = ???
attn_mask = torch.triu(attn_mask, diagonal=1)
# use Tensor.bool() to convert the matrix to a boolean matrix
attn_mask = ???
attn_mask = attn_mask.bool()
# fill the position of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
attn = ???
attn = attn.masked_fill(attn_mask, -np.inf)
# Step 1.3.3: normalize `attn` via softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
attn = ???
attn = torch.softmax(attn, dim=3)
# Step 1.3.4: apply dropout to `attn` via self.attn_drop()
attn = ???
attn = self.attn_drop(attn)
# Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
# the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
out = ???
out = attn @ v
# Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate output of different heads
# the shape of `out` from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
out = ???
out = out.transpose(1, 2).reshape(batch_size, seq_len, self.num_head * self.head_dim)
# Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
result = ???
result = self.proj_drop(self.proj_layer(out))
# <<< TODO 1
# return the final results `result` and attention weights `attn`
@@ -147,24 +147,24 @@ class TransformerLayer(nn.Module):
# >>> TODO 2: complete the forward process of the TransformerLayer module.
# Step 2.1: calculate the output of multi-head self-attention
# normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
x_norm = ???
x_norm = self.norm1(x)
# calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
x_attn, attn = ???
x_attn, attn = self.attn(x_norm)
# add the input 'x' to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
if ???:
x_attn = ???
if not self.no_res:
x_attn = x_attn + x
# Step 2.2: calculate the output of feed forward network
# calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
x_ffn = ???
x_ffn = self.ffn(self.norm2(x_attn))
# add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
if ???:
out = ???
if not self.no_res:
out = x_attn + x_ffn
else:
out = ???
out = x_ffn
# <<< TODO 2
return out, attn
@@ -230,36 +230,36 @@ class GPT(nn.Module):
# >>> TODO 3: complete the forward process of GPT
# Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1]
pos = ???
pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)
# Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to convert `word_idx` and `pos` to embeddings (`token_embed` and `pos_embed`)
token_embed = ???
pos_embed = ???
token_embed = self.word_token_embedding(word_idx)
pos_embed = self.word_pos_embedding(pos)
# Step 3.3: initialize the input embeddings `x` of transformer layers
# add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
if ???:
x = ???
if not self.no_pos:
x = token_embed + pos_embed
else:
x = ???
x = token_embed
# apply dropout to the input embeddings via `self.drop()`
x = ???
x = self.drop(x)
# Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
# define a list `attention_weights` and append the attention weights of each transformer layer into the list
attention_weights = ???
for ???:
attention_weights = list()
for i in range(self.num_layer):
# Step 3.4.1: obtain the output and attention weights of transformer layers
x, attn = ???
x, attn = self.transformer[i](x)
# Step 3.4.2: append the attention weights of transformer layers into the list `attention_weights`
???
attention_weights.append(attn)
# Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
# self.language_model_head() is a linear layer defined in __init__() function
# Note: do not add softmax here since it is included in the cross entropy loss function
x = ???
logits = ???
x = self.norm(x)
logits = self.language_model_head(x)
# <<< TODO 3
# return logits and loss or attention weights

View File

@@ -17,7 +17,7 @@ def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root,
# model
dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
# init from a model saved in a specific directory
ckpt_path = os.path.join(ckpt_path, 'best.pth')
print("sample from %s"%ckpt_path)

View File

@@ -45,7 +45,7 @@ def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, d
dtype = 'bfloat16' if device == 'cpu' else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
#ctx = torch.autocast(device_type=device, dtype=ptdtype)
best_val_loss = 1e9
iter_num = 0 # number of iterations in the lifetime of this process

Binary file not shown.

After

Width:  |  Height:  |  Size: 186 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

View File

@@ -0,0 +1,49 @@
sample from workdirs/quansongci/best.pth
+++水调歌头
黄花满疏雨,月扫三宫。月明月明人去,绿绵声里,风光残霞。屈指两小天天静,绿满阶外,更相逢。那处得何曾小,泪断肠头。
---------------
+++浣溪沙(五清)
翠雾玉奁烘蝉。轻姿未放花光。青袍有客已暮花。
天人未遇向西楼。小阳春水一线清。玉壶重重重。
---------------
+++菩萨蛮(梅)
江南窗前月远中花。水高远。暗还花色碧。只恨欢事清。
楼上宴琼线。更欢归消息。柳边女碧云。便是天涯时。
---------------
+++菩萨蛮
江上秋移香无度。凉风闹愁风。莺声瘦了归时未。小楼闲愁忆。
豆蔻风前好因缘。送通住。试问三山同。人间无处难。
---------------
+++秦楼月
练雨梳妆。桃叶半枝,冰肌红子春寒。半枝都奈。吹香飞絮,记清凉。
无限夜云春风护。玉阑无数转。碎帽孤情君,小海东风。
---------------
+++浪淘沙
橘上园阳关路早。绿钗风雨散,犹被东湖见楼。
仿佛风前坡上去日,月如流。想取东南风。犹慵尘尽比重归。
---------------
+++诉衷情(高人)
时候又来深。长是红帘前。醉眼风入春期。
应是时时,何处在、应厮续。
---------------
+++浣溪沙(咏梅)
离斟客太白犹如。不知常是西篱中。岂怜旧君些儿以言。
素娥小山小曲,水朝元有长安。一榻了共取大家。
---------------
+++浣溪沙(和怀)
纵图清露歌黛倚,寒题金銮声珊瑚。十年人来懒舞丝。
---------------
+++满江月
风月不如旧,柔条欲到春风。掩花间心,道处难臾、相逢。
陇头情不物里,阿谁向娇几。且看东词,还明红云与,一笑认教梳灯。
---------------

View File

@@ -0,0 +1,49 @@
sample from workdirs/quansongci_no_pos/best.pth
++++++++菩萨蛮(牡丹月近)
江月明月明月桂华开客。金交风枝残月到东风前。天色浸柳前风垂杨花更觉。坐角雪初开花小屏。断断头春风光薄。
春色悄。隔帘前阴转香千里。好破云深岸波波。不恨相思量。羞酌炉香何处。
---------------
++++浣溪沙
清歌灯未无限。佳期时更传人不醉里,可奈有芳菲节懒。
双蛾罗带向西楼。小小槛春寒人都怨,燕子未销眉花。
---------------
++++++++++++++++++++临江仙歌香花天
九月桃源长风留春风投宴琼桃李仙。一曾东风迟丽女
放萧词传天稼时常相逢,还记,酒,占春寒花间风光相住,月劝花往事,占春留思,应春风到上,无人间一线秀船归来,点面皱。□□□□□□□□□□□□。都为谁老还来
---------------
++++鹧鸪天(十二之二)
此见元是一声砧。紫鹤收残梳匀舞、谁家。正是平樵春发,忍因缘凝理通。
试语三岛不下,松径何处。问清将春愁易全窟,且识斗重阳。
---------------
++++浣溪沙(赋木犀)
芙蓉水浮冰雪梅子。东风半枝都奈粉吹。飞落蕊满清凉。
枝开夜忽春风护,玉阑凉痕转新碎香。有君恩多少载酒,且道有春风流。
---------------
+++++++++++++++++++++++++++++++++++++++++++++南歌头香慵尘中柳梢青玉案(西江仙香花宫春令(与梅子
绿碧梧桐梢落后西浣天云隐越山外、宿舟断乱,秋风露满庭芳菲节难过,紫。绿门好,十分飞燕子
红,秋寒庭楼小西西风,春暮
---------------
++++++鹧鸪天(和坡衮侑觞)
薰风须见前衢醉急风入鼎、花生绝团。不问何人公身口厮续厌
春色肃熟燕子,无限是道行气东风吹。看雨起梦三年。想余春事断自愁厌君。
---------------
++++菩萨蛮(用时春)
竹花梅犹道何人时节。西篱上花前红。吹落帽风光深。素娥小金。
暮水朝秋寒。玉堂下梅花共取。小窗堂几举。从教著梅和雨。
---------------
++++++++++最仙歌子(和尉生查子题)
绿阴山淡黄未泛湘神神仙,美酒,长唱玉纤纤纤手。元何穷何处重约,清寒食、酒家流光光渐、寄新春花晓,小院映烟微香,正是十年瑶楼酒,水暖花枝枝黄昏昏不语,乍见月寂寞痴愠痕、落醉,看花梢啼红裳篆拂堕风流。
东风吹泪过,
---------------
+++++++++++++点绛唇头春事近
花艳心头道酒前春风雨,欲春惨,春去,深自有极目娇几粉,看春词,还爱红云归,绿杨花,旧谢去年时节节,十分真时及华明月。
醉眼底莺声中秋光幸有豆皇子
杏花开后黄梅梢仙子,且占客里春风吹乱。
细雨过春风轻椒香闺催春,小离
---------------

Binary file not shown.

After

Width:  |  Height:  |  Size: 71 KiB

View File

@@ -0,0 +1,56 @@
sample from workdirs/quansongci_no_res/best.pth
+++藕上空都未。消
---------------
+++。水。香,清干灯翠无月。佳
---------------
+++烟
莫。。一
真。,。,手)+(。当,。,还花。
。。饱)花清生失楼犹。拂念。。。
+东+柳人。碧放萧似天天饮时
---------------
+++,一+
楼。。移。无度此
+路风砧东
---------------
+++,。常明香天。早。+。色。,大,梅子春上妆半枝。奈。吹。飞、,歌。阑故溪枝开夜忽春花。情,重凉痕转。碎沙相,君有园海。奈。
。会
---------------
+++。。晓宫。。园。+二盈
钗。+。,恁尾。
见楼风
寿到+。尽+。日。。
---------------
+++。看。月。
时衮红。自。意
须去前。醉急风入鼎人花
。团时。丹翁怨在身云厮。厌
秋海花拟燕
,无共宿道行气东。,鸾+雨。梦,
。。余采
---------------
++++俊去莺浮
时重。+功太。犹。头(人一溪+者。斋算。旧
---------------
+++,人花长和寞。。纵图清孔歌幽
---------------
+++髻
。+风与不,干
。头余说。花
。心头道。前,枕相
忘,情+物。自水极初。几晶
看。词光。明红主与,。。认,旧。去
户萨尽玉罢
不时家。亭,行翠厚情青
+中思难梦。底南星
。自马
我来
,中+。花
禁,,也
。花、。风儿。堂莺催旧,+离
---------------

Binary file not shown.

After

Width:  |  Height:  |  Size: 75 KiB

View File

@@ -0,0 +1,51 @@
sample from workdirs/quansongci/best.pth
+++清平乐(上赋)
黄花小。相逢去。三得东风何处。人去去年年。谁与他年道。
屈指两小天。留连心事。最思无意悠悠。无得何曾宽。
---------------
+++清平乐
京梅晚。几日一清声恶。无限作佳穷时。直见横户快愁儿。
客已暮云梦,天人未老。心事有天涯无数。人都不须关,只是秋千千里。
---------------
+++清平乐(春)
红雪动。莫遣梅花开了。不解闲句中花妍。当时未问还近。
一枝上晚妆清明。帘犹有清香样。欢事消息意迟。东郊飞后便好天。
---------------
+++清平乐
银烛斜阳。斜雨初飞。日日楼前草移。无限此情休住。
小莺欲瘦收残梳。更有谁闲愁。却入豆蔻风前。因缘凝理通。
---------------
+++清平乐
江上秋波。一声歌舞。烟雨里常明烟雨。早来不见人归,犹唤梅子春去。
好都奈。吹回飞飞来。清凉不知无限夜,春风护雨晚梁归。
---------------
+++清平乐
春光西去。桂花清扇。天上一声伤春晓。却被园花不尽早。
钗边绿阴阴犹好。无计不知否。到少离愁去。谁知何处魂。
---------------
+++清平乐(即回)
六钱地遍。楼前作花间。春暮云愁。月高斜阳远。困红衣自醉。
好去前时醉,风入泥袖。挼黄团时时问。怨在月明千片春水。
---------------
+++清平乐
晓来争觉。碧云花向楼。我似秋光也。花来日明月边莺怨。
春不语飞花知。玉浆不枉劳和困。坐中岂共旧。
---------------
+++清平乐
残花晚。清闲鬓欲开。金盏一多时。菊花无计绪。娇花开花长。
谁把酒醒清声。幽心到寒题酒。一片香淡得春人。懒捻黄金眉。
---------------
+++清平乐(月明月)
醉来人在。春知何时到花时。似来东风识,时时倍度。
风月不识旧时春宵。万中说枉似、真心头道。前意追相逢。
---------------

BIN
hw4/report/img/train.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 72 KiB

View File

@@ -11,32 +11,37 @@
\usepackage{graphicx}
\usepackage{listings}
\usepackage{color}
\usepackage{float}
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
\newfontfamily\cascadia{Cascadia Code}
\lstset{
basicstyle = \sffamily, % 基本代码风格
keywordstyle = \bfseries, % 关键字风格
commentstyle = \rmfamily\itshape, % 注释的风格,斜体
stringstyle = \ttfamily, % 字符串风格
flexiblecolumns, % 别问为什么,加上这个
numbers = left, % 行号的位置在左边
showspaces = false, % 是否显示空格,显示了有点乱,所以不现实了
numberstyle = \zihao{-5}\ttfamily, % 行号的样式小五号tt等宽字体
basicstyle = \small\codefont,
% ---
tabsize = 4,
showstringspaces = false,
captionpos = t, % 这段代码的名字所呈现的位置t指的是top上面
frame = lrtb, % 显示边框
numbers = left,
numberstyle = \codefont,
% ---
breaklines = true,
captionpos = t,
% ---
frame = l,
flexiblecolumns,
}
\lstdefinestyle{Python}{
language = Python, % 语言选Python
basicstyle = \zihao{-5}\ttfamily,
numberstyle = \zihao{-5}\ttfamily,
keywordstyle = \color{blue},
keywordstyle = [2] \color{teal},
stringstyle = \color{magenta},
commentstyle = \color{red}\ttfamily,
breaklines = true, % 自动换行,建议不要写太长的行
columns = fixed, % 如果不加这一句,字间距就不固定,很丑,必须加
basewidth = 0.5em,
stringstyle = \color{orange!80!black},
commentstyle = \color{red},
identifierstyle = \color{blue!80!white},
}
\lstdefinestyle{Bash}{
language = bash
}
\usepackage{subcaption}
\usepackage{booktabs} % toprule
@@ -49,20 +54,20 @@
\hwname{作业}
\begin{document}
\courseheader
\name{YOUR NAME}
\name{高艺轩}
\vspace{3mm}
\centerline{\textbf{\Large{理论部分}}}
\section{单选题(15分)}
\subsection{\underline{?}}
\subsection{\underline{D}}
\subsection{\underline{?}}
\subsection{\underline{A}}
\subsection{\underline{?}}
\subsection{\underline{A}}
\subsection{\underline{?}}
\subsection{\underline{C}}
\subsection{\underline{?}}
\subsection{\underline{B}}
\section{计算题(15分)}
% 计算题1
@@ -79,16 +84,98 @@
\vspace{3mm}
(1) 采用隐含马尔可夫模型(HMM)对小E暑假体育活动安排进行建模{\color{blue}请写出HMM对应的参数$\lambda=\{\pi, A, B\}$}
\begin{proof}[解]
\[\pi = \begin{bmatrix}
0.5\\0.5
\end{bmatrix}\]
\[A = \begin{bmatrix}
0.5 & 0.5\\
0.6 & 0.4\\
\end{bmatrix}\]
\[B = \begin{bmatrix}
0.6 & 0.2 & 0.2\\
0.1 & 0.6 & 0.3
\end{bmatrix}\]
\end{proof}
\vspace{3mm}
(2) 假设暑假第1、2、3天小E所进行的体育活动依次为跑步R、打球B和游泳S{\color{blue}请计算出现该观测序列的概率}
\begin{proof}[解]
\begin{align*}
\alpha_1(S_1) & = 0.5 \times 0.6 = 0.3\\
\alpha_1(S_2) & = 0.5 \times 0.1 = 0.05\\
\alpha_2(S_1) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.6) \times 0.2\\
& = 0.036\\
\alpha_2(S_2) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.4) \times 0.3\\
& = 0.051\\
\alpha_3(S_1) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.6) \times 0.2\\
& = 0.00972\\
\alpha_3(S_2) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.4) \times 0.6\\
& = 0.02304\\
P(O \mid \lambda) & = \alpha_3(S_1) + \alpha_3(S_2) = 0.03276
\end{align*}
\end{proof}
\vspace{3mm}
(3) 在(2)的条件下。{\color{blue}请利用Viterbi算法推测暑假第1、2、3天最可能的隐含状态序列}
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{img/20240526_155701910_iOS.png}
\end{figure}
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
\section{编程作业报告}
\section{自选课题工作进度汇报}
\subsection{模型的训练与测试}
首先进行数据预处理。预处理后进行模型训练,训练的结果见图~\ref{fig:default_train}。
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{img/train.png}
\caption{默认测试}
\label{fig:default_train}
\end{figure}
默认配置的生成样本:
\begin{lstlisting}
python sample.py --ckpt_path workdirs/quansongci
\end{lstlisting}
得到的输出为
\lstinputlisting{img/default_sample.txt}
若指定初始文本:
\begin{lstlisting}
python sample.py --ckpt_path workdirs/quansongci --start +++清平乐
\end{lstlisting}
得到的输出为
\lstinputlisting{img/specific_start_sample.txt}
\subsection{探究位置编码和残差连接在模型中的作用}
关闭位置编码的训练:
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{img/no_pos_train.png}
\end{figure}
得到的生成结果:
\lstinputlisting{img/no_pos_sample.txt}
可以看到,去掉位置编码后,模型无法很好地把握句子长度与句式结构:生成的句子长短失控,开头“+”的数量也不再固定。
关闭残差连接的训练:
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{img/no_res_train.png}
\end{figure}
得到的生成结果:
\lstinputlisting{img/no_res_sample.txt}
模型训练遇到了梯度消失的问题,很难有效地训练。
\subsection{可视化}
\begin{figure}[H]
\centering
\includegraphics[width=.8\linewidth]{img/attention_vis.png}
\end{figure}
许多词语的注意力权重都集中在题目(词牌名)的几个字上,可以看到模型主要学习到了词牌名与正文内容之间的相关性。
\end{document}

2
j.ps1
View File

@@ -1 +1 @@
cd ./hw3/code
cd ./hw4/code

View File

@@ -10,7 +10,9 @@
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"import torchvision.transforms as transforms"
"import torchvision.transforms as transforms\n",
"\n",
"import numpy as np"
]
},
{
@@ -212,6 +214,63 @@
"b = torch.Tensor([1])\n",
"print((a.T * b).T)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[False, True, True, True, True],\n",
" [False, False, True, True, True],\n",
" [False, False, False, True, True],\n",
" [False, False, False, False, True],\n",
" [False, False, False, False, False]])\n",
"tensor([[-0.1170, 0.6130, 0.9644, -1.2733, -0.9671],\n",
" [-0.7806, 0.5082, -0.2731, 0.1660, -0.5451],\n",
" [-2.1527, -0.5059, -0.0079, -0.5796, -1.1107],\n",
" [-1.8357, -0.8010, -0.0424, 0.1491, -1.5009],\n",
" [-1.3666, -0.8209, 0.0483, -1.3165, -0.9222]])\n",
"tensor([[-0.1170, -inf, -inf, -inf, -inf],\n",
" [-0.7806, 0.5082, -inf, -inf, -inf],\n",
" [-2.1527, -0.5059, -0.0079, -inf, -inf],\n",
" [-1.8357, -0.8010, -0.0424, 0.1491, -inf],\n",
" [-1.3666, -0.8209, 0.0483, -1.3165, -0.9222]])\n"
]
}
],
"source": [
"mask = torch.triu(torch.ones(5, 5), diagonal=1).bool()\n",
"print(mask)\n",
"attn = torch.randn(5, 5)\n",
"print(attn)\n",
"print(attn.masked_fill(mask, -np.inf))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([0.1402, 0.2312, 0.6285])\n"
]
}
],
"source": [
"Q = torch.Tensor([1, 0, 1, 1])\n",
"K = torch.Tensor([[0, 0, 0, 2],\n",
" [2, 0, 1, 0],\n",
" [2, 1, 2, 1]])\n",
"\n",
"print(torch.softmax((Q @ K.T) / 2, dim=0))"
]
}
],
"metadata": {