Homework4 Submit.
This commit is contained in:
File diff suppressed because one or more lines are too long
22640
hw4/code/data/quansongci/train.json
Normal file
22640
hw4/code/data/quansongci/train.json
Normal file
File diff suppressed because it is too large
Load Diff
11904
hw4/code/data/quansongci/val.json
Normal file
11904
hw4/code/data/quansongci/val.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -2,8 +2,8 @@
|
||||
# Media and Cognition
|
||||
# Homework 4 Sequence Modeling
|
||||
# model.py - Model definition
|
||||
# Student ID:
|
||||
# Name:
|
||||
# Student ID: 2022010639
|
||||
# Name: Yixuan Gao
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
@@ -60,54 +60,54 @@ class SelfAttention(nn.Module):
|
||||
# Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
|
||||
# the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
|
||||
# num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
|
||||
q = ???
|
||||
k = ???
|
||||
v = ???
|
||||
q = self.q_layer(x)
|
||||
k = self.k_layer(x)
|
||||
v = self.v_layer(x)
|
||||
|
||||
# Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
|
||||
# first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
|
||||
q = ???
|
||||
k = ???
|
||||
v = ???
|
||||
q = q.reshape(batch_size, seq_len, self.num_head, self.head_dim)
|
||||
k = k.reshape(batch_size, seq_len, self.num_head, self.head_dim)
|
||||
v = v.reshape(batch_size, seq_len, self.num_head, self.head_dim)
|
||||
|
||||
# then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dim of q, k, v for matrix multiplication
|
||||
# the shape of q, k, v from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
|
||||
q = ???
|
||||
k = ???
|
||||
v = ???
|
||||
q = q.transpose(1, 2)
|
||||
k = k.transpose(1, 2)
|
||||
v = v.transpose(1, 2)
|
||||
|
||||
# Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
|
||||
# Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
|
||||
# the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
|
||||
attn = ???
|
||||
attn = (q @ k.transpose(2, 3)) / torch.sqrt(torch.tensor(self.head_dim))
|
||||
|
||||
# Step 1.3.2: for Auto-Regressive Language Model, the predictions for position i can depend only on the input at positions less than i.
|
||||
# Therefore, a mask is used to prevent positions from attending to subsequent positions
|
||||
# attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is a strictly upper triangular matrix (zeros on the main diagonal) with shape (seq_len, seq_len)
|
||||
# Hint:
|
||||
# use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
|
||||
attn_mask = ???
|
||||
attn_mask = torch.ones(seq_len, seq_len, device=attn.device)
|
||||
# use torch.triu(?, diagonal=1) to obtain an upper triangular matrix where elements on the main diagonal are 0
|
||||
attn_mask = ???
|
||||
attn_mask = torch.triu(attn_mask, diagonal=1)
|
||||
# use Tensor.bool() to convert the matrix to a boolean matrix
|
||||
attn_mask = ???
|
||||
attn_mask = attn_mask.bool()
|
||||
# fill the position of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
|
||||
attn = ???
|
||||
attn = attn.masked_fill(attn_mask, -np.inf)
|
||||
|
||||
# Step 1.3.3: normalize `attn` via softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
|
||||
attn = ???
|
||||
attn = torch.softmax(attn, dim=3)
|
||||
# Step 1.3.4: apply dropout to `attn` via self.attn_drop()
|
||||
attn = ???
|
||||
attn = self.attn_drop(attn)
|
||||
# Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
|
||||
# the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
|
||||
out = ???
|
||||
out = attn @ v
|
||||
|
||||
# Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate output of different heads
|
||||
# the shape of `out` from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
|
||||
out = ???
|
||||
out = out.transpose(1, 2).reshape(batch_size, seq_len, self.num_head * self.head_dim)
|
||||
|
||||
# Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
|
||||
result = ???
|
||||
result = self.proj_drop(self.proj_layer(out))
|
||||
# <<< TODO 1
|
||||
|
||||
# return the final results `result` and attention weights `attn`
|
||||
@@ -147,24 +147,24 @@ class TransformerLayer(nn.Module):
|
||||
# >>> TODO 2: complete the forward process of the TransformerLayer module.
|
||||
# Step 2.1: calculate the output of multi-head self-attention
|
||||
# normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
|
||||
x_norm = ???
|
||||
x_norm = self.norm1(x)
|
||||
|
||||
# calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
|
||||
x_attn, attn = ???
|
||||
x_attn, attn = self.attn(x_norm)
|
||||
|
||||
# add the input 'x' to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
|
||||
if ???:
|
||||
x_attn = ???
|
||||
if not self.no_res:
|
||||
x_attn = x_attn + x
|
||||
|
||||
# Step 2.2: calculate the output of feed forward network
|
||||
# calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
|
||||
x_ffn = ???
|
||||
x_ffn = self.ffn(self.norm2(x_attn))
|
||||
|
||||
# add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
|
||||
if ???:
|
||||
out = ???
|
||||
if not self.no_res:
|
||||
out = x_attn + x_ffn
|
||||
else:
|
||||
out = ???
|
||||
out = x_ffn
|
||||
# <<< TODO 2
|
||||
|
||||
return out, attn
|
||||
@@ -230,36 +230,36 @@ class GPT(nn.Module):
|
||||
|
||||
# >>> TODO 3: complete the forward process of GPT
|
||||
# Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1]
|
||||
pos = ???
|
||||
pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)
|
||||
|
||||
# Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to convert `word_idx` and `pos` to embeddings (`token_embed` and `pos_embed`)
|
||||
token_embed = ???
|
||||
pos_embed = ???
|
||||
token_embed = self.word_token_embedding(word_idx)
|
||||
pos_embed = self.word_pos_embedding(pos)
|
||||
|
||||
# Step 3.3: initialize the input embeddings `x` of transformer layers
|
||||
# add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
|
||||
if ???:
|
||||
x = ???
|
||||
if not self.no_pos:
|
||||
x = token_embed + pos_embed
|
||||
else:
|
||||
x = ???
|
||||
x = token_embed
|
||||
|
||||
# apply dropout to the input embeddings via `self.drop()`
|
||||
x = ???
|
||||
x = self.drop(x)
|
||||
|
||||
# Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
|
||||
# define a list `attention_weights` and append the attention weights of each transformer layer into the list
|
||||
attention_weights = ???
|
||||
for ???:
|
||||
attention_weights = list()
|
||||
for i in range(self.num_layer):
|
||||
# Step 3.4.1: obtain the output and attention weights of transformer layers
|
||||
x, attn = ???
|
||||
x, attn = self.transformer[i](x)
|
||||
# Step 3.4.2: append the attention weights of transformer layers into the list `attention_weights`
|
||||
???
|
||||
attention_weights.append(attn)
|
||||
|
||||
# Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
|
||||
# self.language_model_head() is a linear layer defined in __init__() function
|
||||
# Note: do not add softmax here since it is included in the cross entropy loss function
|
||||
x = ???
|
||||
logits = ???
|
||||
x = self.norm(x)
|
||||
logits = self.language_model_head(x)
|
||||
# <<< TODO 3
|
||||
|
||||
# return logits and loss or attention weights
|
||||
|
||||
@@ -17,7 +17,7 @@ def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root,
|
||||
# model
|
||||
dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
|
||||
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
|
||||
ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
|
||||
ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
|
||||
# init from a model saved in a specific directory
|
||||
ckpt_path = os.path.join(ckpt_path, 'best.pth')
|
||||
print("sample from %s"%ckpt_path)
|
||||
|
||||
@@ -45,7 +45,7 @@ def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, d
|
||||
|
||||
dtype = 'bfloat16' if device == 'cpu' else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
|
||||
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
|
||||
ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
|
||||
ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
|
||||
#ctx = torch.autocast(device_type=device, dtype=ptdtype)
|
||||
best_val_loss = 1e9
|
||||
iter_num = 0 # number of iterations in the lifetime of this process
|
||||
|
||||
Reference in New Issue
Block a user