Homework4 Submit.

This commit is contained in:
unlockable
2024-05-27 00:01:48 +08:00
parent c6b2420b85
commit 76a643ebc4
19 changed files with 35031 additions and 76 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -2,8 +2,8 @@
# Media and Cognition
# Homework 4 Sequence Modeling
# model.py - Model definition
# Student ID:
# Name:
# Student ID: 2022010639
# Name: Yixuan Gao
# Tsinghua University
# (C) Copyright 2024
# ========================================================
@@ -60,54 +60,54 @@ class SelfAttention(nn.Module):
# Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
# the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
# num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
q = ???
k = ???
v = ???
q = self.q_layer(x)
k = self.k_layer(x)
v = self.v_layer(x)
# Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
# first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
q = ???
k = ???
v = ???
q = q.reshape(batch_size, seq_len, self.num_head, self.head_dim)
k = k.reshape(batch_size, seq_len, self.num_head, self.head_dim)
v = v.reshape(batch_size, seq_len, self.num_head, self.head_dim)
# then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dim of q, k, v for matrix multiplication
# the shape of q, k, v from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
q = ???
k = ???
v = ???
q = q.transpose(1, 2)
k = k.transpose(1, 2)
v = v.transpose(1, 2)
# Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
# Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
# the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
attn = ???
attn = (q @ k.transpose(2, 3)) / torch.sqrt(torch.tensor(self.head_dim))
# Step 1.3.2: for an auto-regressive language model, the prediction at position i may depend only on the inputs at positions up to and including i (the mask below leaves the main diagonal unmasked).
# Therefore, a mask is used to prevent positions from attending to subsequent positions
# attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is a strictly upper triangular matrix with shape (seq_len, seq_len)
# Hint:
# use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
attn_mask = ???
attn_mask = torch.ones(seq_len, seq_len, device=attn.device)
# use torch.triu(?, diagonal=1) to obtain an upper triangular matrix where elements on the main diagonal are 0
attn_mask = ???
attn_mask = torch.triu(attn_mask, diagonal=1)
# use Tensor.bool() to convert the matrix to a boolean matrix
attn_mask = ???
attn_mask = attn_mask.bool()
# fill the position of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
attn = ???
attn = attn.masked_fill(attn_mask, -np.inf)
# Step 1.3.3: normalize `attn` via the softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
attn = ???
attn = torch.softmax(attn, dim=3)
# Step 1.3.4: apply dropout to `attn` via self.attn_drop()
attn = ???
attn = self.attn_drop(attn)
# Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
# the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
out = ???
out = attn @ v
# Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate the outputs of the different heads
# the shape of `out` from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
out = ???
out = out.transpose(1, 2).reshape(batch_size, seq_len, self.num_head * self.head_dim)
# Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
result = ???
result = self.proj_drop(self.proj_layer(out))
# <<< TODO 1
# return the final results `result` and attention weights `attn`
@@ -147,24 +147,24 @@ class TransformerLayer(nn.Module):
# >>> TODO 2: complete the forward process of the TransformerLayer module.
# Step 2.1: calculate the output of multi-head self-attention
# normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
x_norm = ???
x_norm = self.norm1(x)
# calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
x_attn, attn = ???
x_attn, attn = self.attn(x_norm)
# add the input 'x' to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
if ???:
x_attn = ???
if not self.no_res:
x_attn = x_attn + x
# Step 2.2: calculate the output of feed forward network
# calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
x_ffn = ???
x_ffn = self.ffn(self.norm2(x_attn))
# add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
if ???:
out = ???
if not self.no_res:
out = x_attn + x_ffn
else:
out = ???
out = x_ffn
# <<< TODO 2
return out, attn
@@ -230,36 +230,36 @@ class GPT(nn.Module):
# >>> TODO 3: complete the forward process of GPT
# Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1]
pos = ???
pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)
# Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to convert `word_idx` and `pos` to embeddings (`token_embed` and `pos_embed`)
token_embed = ???
pos_embed = ???
token_embed = self.word_token_embedding(word_idx)
pos_embed = self.word_pos_embedding(pos)
# Step 3.3: initialize the input embeddings `x` of transformer layers
# add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
if ???:
x = ???
if not self.no_pos:
x = token_embed + pos_embed
else:
x = ???
x = token_embed
# apply dropout to the input embeddings via `self.drop()`
x = ???
x = self.drop(x)
# Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
# define a list `attention_weights` and append the attention weights of each transformer layer into the list
attention_weights = ???
for ???:
attention_weights = list()
for i in range(self.num_layer):
# Step 3.4.1: obtain the output and attention weights of transformer layers
x, attn = ???
x, attn = self.transformer[i](x)
# Step 3.4.2: append the attention weights of transformer layers into the list `attention_weights`
???
attention_weights.append(attn)
# Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
# self.language_model_head() is a linear layer defined in __init__() function
# Note: do not add softmax here since it is included in the cross entropy loss function
x = ???
logits = ???
x = self.norm(x)
logits = self.language_model_head(x)
# <<< TODO 3
# return logits and loss or attention weights

View File

@@ -17,7 +17,7 @@ def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root,
# model
dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
# init from a model saved in a specific directory
ckpt_path = os.path.join(ckpt_path, 'best.pth')
print("sample from %s"%ckpt_path)

View File

@@ -45,7 +45,7 @@ def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, d
dtype = 'bfloat16' if device == 'cpu' else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
#ctx = torch.autocast(device_type=device, dtype=ptdtype)
best_val_loss = 1e9
iter_num = 0 # number of iterations in the lifetime of this process