Homework4 Submit.

2024-05-27 00:01:48 +08:00
parent c6b2420b85
commit 76a643ebc4
19 changed files with 35031 additions and 76 deletions
--- a/hw4/code/attnvis.ipynb
+++ b/hw4/code/attnvis.ipynb
--- a/hw4/code/data/quansongci/train.json
+++ b/hw4/code/data/quansongci/train.json
--- a/hw4/code/data/quansongci/val.json
+++ b/hw4/code/data/quansongci/val.json
--- a/hw4/code/model.py
+++ b/hw4/code/model.py
@@ -2,8 +2,8 @@
 #             Media and Cognition
 #             Homework 4  Sequence Modeling
 #             model.py - Model definition
-#             Student ID:
-#             Name:
+#             Student ID: 2022010639
+#             Name: Yixuan Gao
 #             Tsinghua University
 #             (C) Copyright 2024
 # ========================================================
@@ -60,54 +60,54 @@ class SelfAttention(nn.Module):
        # Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
        # the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
        # num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
-        q = ???
-        k = ???
-        v = ???
+        q = self.q_layer(x)
+        k = self.k_layer(x)
+        v = self.v_layer(x)

        # Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
        # first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
-        q = ???
-        k = ???
-        v = ???
+        q = q.reshape(batch_size, seq_len, self.num_head, self.head_dim)
+        k = k.reshape(batch_size, seq_len, self.num_head, self.head_dim)
+        v = v.reshape(batch_size, seq_len, self.num_head, self.head_dim)

        # then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dim of q, k, v for matrix multiplication
        # the shape of q, k, v from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
-        q = ???
-        k = ???
-        v = ???
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)

        # Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
        # Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
        # the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
-        attn = ???
+        attn = (q @ k.transpose(2, 3)) / torch.sqrt(torch.tensor(self.head_dim))
        
        # Step 1.3.2: for an Auto-Regressive Language Model, the prediction at position i can depend only on the inputs at positions up to and including i.
        # Therefore, a mask is used to prevent positions from attending to subsequent positions
        # attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is an upper triangular matrix with shape (seq_len, seq_len)
        # Hint:
        # use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
-        attn_mask = ???
+        attn_mask = torch.ones(seq_len, seq_len, device=attn.device)
        # use torch.triu(?, diagonal=1) to obtain an upper triangular matrix where elements on the main diagonal are 0
-        attn_mask = ???
+        attn_mask = torch.triu(attn_mask, diagonal=1)
        # use Tensor.bool() to convert the matrix to a boolean matrix
-        attn_mask = ???
+        attn_mask = attn_mask.bool()
        # fill the position of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
-        attn = ???
+        attn = attn.masked_fill(attn_mask, -np.inf)

        # Step 1.3.3: normalize `attn` via softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
-        attn = ???
+        attn = torch.softmax(attn, dim=3)
        # Step 1.3.4: apply dropout to `attn` via self.attn_drop()
-        attn = ???
+        attn = self.attn_drop(attn)
        # Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
        # the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
-        out = ???
+        out = attn @ v

        # Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate output of different heads
        # the shape of `out` from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
-        out = ???
+        out = out.transpose(1, 2).reshape(batch_size, seq_len, self.num_head * self.head_dim)

        # Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
-        result = ??? 
+        result = self.proj_drop(self.proj_layer(out))
        # <<< TODO 1

        # return the final results `result` and attention weights `attn`
@@ -147,24 +147,24 @@ class TransformerLayer(nn.Module):
        # >>> TODO 2: complete the forward process of the TransformerLayer module.
        # Step 2.1: calculate the output of multi-head self-attention
        # normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
-        x_norm = ???
+        x_norm = self.norm1(x)

        # calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
-        x_attn, attn = ???
+        x_attn, attn = self.attn(x_norm)

        # add the input 'x' to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
-        if ???:
-            x_attn = ???
+        if not self.no_res:
+            x_attn = x_attn + x

        # Step 2.2: calculate the output of feed forward network
        # calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
-        x_ffn = ???
+        x_ffn = self.ffn(self.norm2(x_attn))

        # add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
-        if ???:
-            out = ???
+        if not self.no_res:
+            out = x_attn + x_ffn
        else:
-            out = ???
+            out = x_ffn
        # <<< TODO 2
        
        return out, attn
@@ -230,36 +230,36 @@ class GPT(nn.Module):

        # >>> TODO 3: complete the forward process of GPT
        # Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1] 
-        pos = ???
+        pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)

        # Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to convert `word_idx` and `pos` to embeddings (`token_embed` and `pos_embed`)
-        token_embed = ???
-        pos_embed = ???
+        token_embed = self.word_token_embedding(word_idx)
+        pos_embed = self.word_pos_embedding(pos)

        # Step 3.3: initialize the input embeddings `x` of transformer layers
        # add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
-        if ???:
-            x = ???
+        if not self.no_pos:
+            x = token_embed + pos_embed
        else:
-            x = ???
+            x = token_embed

        # apply dropout to the input embeddings via `self.drop()`
-        x = ???
+        x = self.drop(x)

        # Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
        # define a list `attention_weights` and append the attention weights of each transformer layer into the list
-        attention_weights = ??? 
-        for ???:
+        attention_weights = list()
+        for i in range(self.num_layer):
            # Step 3.4.1: obtain the output and attention weights of transformer layers
-            x, attn = ???
+            x, attn = self.transformer[i](x)
            # Step 3.4.2: append the attention weights of transformer layers into the list `attention_weights`
-            ???
+            attention_weights.append(attn)
     
        # Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
        # self.language_model_head() is a linear layer defined in __init__() function
        # Note: do not add softmax here since it is included in the cross entropy loss function
-        x = ???
-        logits = ???
+        x = self.norm(x)
+        logits = self.language_model_head(x)
        # <<< TODO 3

        # return logits and loss or attention weights
--- a/hw4/code/sample.py
+++ b/hw4/code/sample.py
@@ -17,7 +17,7 @@ def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root,
    # model
    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
-    ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
+    ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_path)
--- a/hw4/code/train.py
+++ b/hw4/code/train.py
@@ -45,7 +45,7 @@ def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, d
    
    dtype = 'bfloat16' if device == 'cpu' else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
-    ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)
+    ctx = nullcontext() if device == 'cpu' or device == 'mps' else torch.autocast(device_type=device, dtype=ptdtype)
    #ctx = torch.autocast(device_type=device, dtype=ptdtype)
    best_val_loss = 1e9
    iter_num = 0 # number of iterations in the lifetime of this process