TA Release homework4.

2024-05-22 20:22:47 +08:00
parent c850f38778
commit c6b2420b85
12 changed files with 14707 additions and 0 deletions
--- a/hw4/code/attnvis.ipynb
+++ b/hw4/code/attnvis.ipynb
@@ -0,0 +1,104 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from contextlib import nullcontext\n",
+    "import torch\n",
+    "from model import GPTConfig, GPT\n",
+    "from bertviz import head_view\n",
+    "from dataset import Converter, LMDataset\n",
+    "\n",
+    "# set random seed for reproducibility\n",
+    "seed = 2024\n",
+    "torch.manual_seed(seed)\n",
+    "torch.cuda.manual_seed(seed)\n",
+    "torch.cuda.manual_seed_all(seed)\n",
+    "torch.backends.cudnn.deterministic = True\n",
+    "torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul\n",
+    "torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn\n",
+    "\n",
+    "#################################################\n",
+    "# \n",
+    "model_name = 'mygpt'\n",
+    "ckpt_path = 'workdirs/quansongci'\n",
+    "data_root = 'data/quansongci'\n",
+    "vis_text_path = 'data/vis/vis_1.txt'\n",
+    "#################################################\n",
+    "\n",
+    "device = 'cpu'\n",
+    "\n",
+    "dataset = LMDataset(data_root, 'train')\n",
+    "converter = Converter(dataset.stoi, dataset.itos)\n",
+    "\n",
+    "\n",
+    "with open(vis_text_path, 'r', encoding='utf-8') as f:\n",
+    "    start = f.read()\n",
+    "start_ids = converter.single_encode(start)\n",
+    "start_texts = [c for c in start]\n",
+    "x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])\n",
+    "print(f\"Input texts: {start}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c0792738",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# model\n",
+    "dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'\n",
+    "ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]\n",
+    "ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)\n",
+    "# init from a model saved in a specific directory\n",
+    "ckpt_path = os.path.join(ckpt_path, 'best.pth')\n",
+    "print(\"loading model params from %s\"%ckpt_path)\n",
+    "checkpoint = torch.load(ckpt_path, map_location=device)\n",
+    "gptconf = GPTConfig[model_name]\n",
+    "if 'model_args' in checkpoint:\n",
+    "    gptconf = checkpoint['model_args']\n",
+    "model = GPT(**gptconf)\n",
+    "state_dict = checkpoint['state_dict']\n",
+    "model.load_state_dict(state_dict)\n",
+    "\n",
+    "model.eval()\n",
+    "model.to(device)\n",
+    "\n",
+    "# single forward pass (no sampling) to collect the attention weights\n",
+    "with torch.no_grad():\n",
+    "    with ctx:\n",
+    "        _, attn_weights = model(x)\n",
+    "\n",
+    "head_view(attn_weights, start_texts)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/hw4/code/data/quansongci/data.json
+++ b/hw4/code/data/quansongci/data.json
--- a/hw4/code/data/vis/vis_1.txt
+++ b/hw4/code/data/vis/vis_1.txt
@@ -0,0 +1,2 @@
+++如梦令
+昨夜雨疏风骤。浓睡不消残酒。试问卷帘人，却道海棠依旧。知否。知否。应是绿肥红瘦。
--- a/hw4/code/data/vis/vis_2.txt
+++ b/hw4/code/data/vis/vis_2.txt
@@ -0,0 +1,3 @@
+++鹧鸪天（秋思）
+红蓼烟收尽暮鸦。秋风吹叶荻花秋。断肠天色连云散，黄叶荻花秋水流。
+楼上角，笛声悠。兴王莫上叹人头。明朝归去无消息，只有当时一望流。
--- a/hw4/code/dataset.py
+++ b/hw4/code/dataset.py
@@ -0,0 +1,75 @@
+import torch
+from torch.utils.data import Dataset
+import numpy as np
+import os
+import json
+
+class LMDataset(Dataset):
+    """Text samples loaded from ``<data_dir>/<split>.json``.
+
+    The JSON file must contain the keys ``data``, ``stoi``, ``itos`` and
+    ``vocab_size``; each one is exposed as an attribute of the same name.
+    """
+
+    def __init__(self, data_dir, split):
+        super().__init__()
+        json_path = os.path.join(data_dir, '%s.json' % split)
+        with open(json_path, 'r', encoding='utf-8') as fp:
+            payload = json.load(fp)
+
+        # samples first, then the vocabulary tables stored next to them
+        self.data = payload['data']              # list of samples
+        self.stoi = payload['stoi']              # character -> integer
+        self.itos = payload['itos']              # string of integer -> character
+        self.vocab_size = payload['vocab_size']  # vocab size
+
+    def __len__(self):
+        """Number of samples in this split."""
+        return len(self.data)
+
+    def __getitem__(self, index):
+        """Return the raw (un-encoded) sample stored at `index`."""
+        return self.data[index]
+
+class Converter:
+    '''
+    This class helps us convert strings to integers and back
+    We use "0" to denote the padding character '<pad>', and "1" to denote the end of the sequence '<eos>'
+    '''
+    PAD_ID = 0 # id of '<pad>'
+    EOS_ID = 1 # id of '<eos>'
+
+    def __init__(self, stoi, itos):
+        self.stoi = stoi # a dict that maps character to integer
+        self.itos = itos # a dict that maps string of integer to character
+
+    def single_encode(self, s):
+        '''
+        encode one string into a 1-D int64 numpy array, one id per character
+        raises KeyError if a character is missing from `stoi`
+        '''
+        return np.array([self.stoi[ch] for ch in s], dtype=np.int64)
+
+    def single_decode(self, l):
+        '''
+        decode a sequence of integer ids into a string, stopping at the first '<eos>' id
+        raises KeyError if an id is missing from `itos`
+        '''
+        chars = []
+        for i in l:
+            # int() normalises numpy / 0-d tensor scalars so that str() yields a bare integer key
+            i = int(i)
+            # if we meet the end of the sequence, stop decoding
+            if i == self.EOS_ID:
+                break
+            # `itos` is keyed by the string form of the integer
+            chars.append(self.itos[str(i)])
+        # join once instead of growing a string inside the loop
+        return ''.join(chars)
+
+    def encode(self, data):
+        '''
+        encode a list of strings into integers
+
+        every row is the encoded string followed by '<eos>' and then '<pad>' up to the longest length
+        returns (x, y): x is the rows without their last column, y is the rows shifted left by one
+        an empty list yields two empty tensors of shape (0, 0)
+        '''
+        lens = [len(s) for s in data]
+        max_len = max(lens, default=0) # default keeps an empty batch from raising
+        out = np.full((len(data), max_len + 1), self.PAD_ID, dtype=np.int64)
+        for row, s in enumerate(data):
+            out[row, :len(s)] = self.single_encode(s)
+            out[row, len(s)] = self.EOS_ID
+        x = torch.from_numpy(out[:, :-1])
+        y = torch.from_numpy(out[:, 1:])
+        return x, y
+
+    def decode(self, data):
+        '''
+        decode a 2-D tensor of integers into a list of strings, one per row
+        '''
+        data = data.cpu().numpy().astype(np.int64)
+        return [self.single_decode(row) for row in data]
--- a/hw4/code/model.py
+++ b/hw4/code/model.py
@@ -0,0 +1,356 @@
+# ========================================================
+#             Media and Cognition
+#             Homework 4  Sequence Modeling
+#             model.py - Model definition
+#             Student ID:
+#             Name:
+#             Tsinghua University
+#             (C) Copyright 2024
+# ========================================================
+
+
+# Import required libraries
+############################################################
+import math
+import torch
+import torch.nn as nn 
+from torch.nn import functional as F
+import numpy as np
+
+############################################################
+
+# Define the GELU activation function used in OpenAI GPT
+############################################################
+def gelu(z):
+    """
+    Tanh approximation of the Gaussian Error Linear Unit, applied element-wise to tensor `z`.
+    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
+    gelu(z) = 0.5 z (1 + tanh[(2/π)^(1/2) * (z + 0.044715 z^3)])
+    """
+    cubic_term = 0.044715 * torch.pow(z, 3.0)
+    tanh_arg = math.sqrt(2.0 / math.pi) * (z + cubic_term)
+    return 0.5 * z * (1.0 + torch.tanh(tanh_arg))
+
+############################################################
+
+# Define the Multi-Head SelfAttention module
+############################################################
+class SelfAttention(nn.Module):
+    """
+    Multi-head causal self-attention (homework template).
+    The `???` placeholders in forward() are left for the student to complete (TODO 1).
+    """
+
+    def __init__(self, embed_dim, num_head, dropout):
+        super().__init__()
+
+        # define three linear layers for q, k, v generation separately
+        self.q_layer = nn.Linear(embed_dim, embed_dim)
+        self.k_layer = nn.Linear(embed_dim, embed_dim)
+        self.v_layer = nn.Linear(embed_dim, embed_dim)
+
+        # define the projection layer for output
+        self.proj_layer = nn.Linear(embed_dim, embed_dim)
+
+        # define the dropout layers for attention and output calculation
+        self.attn_drop = nn.Dropout(dropout)
+        self.proj_drop = nn.Dropout(dropout)
+
+        self.num_head = num_head
+        # integer division: embed_dim is presumably expected to be a multiple of num_head
+        self.head_dim = embed_dim // num_head
+    
+    def forward(self, x):
+        # x has shape (batch_size, seq_len, dim); returns (result, attn)
+
+        batch_size, seq_len, dim = x.shape
+
+        # >>> TODO 1: complete the forward process of the Multi-Head SelfAttention module.
+        # Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
+        # the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
+        # num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
+        q = ???
+        k = ???
+        v = ???
+
+        # Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
+        # first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
+        q = ???
+        k = ???
+        v = ???
+
+        # then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dims of q, k, v for matrix multiplication
+        # the shape of q, k, v changes from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
+        q = ???
+        k = ???
+        v = ???
+
+        # Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
+        # Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
+        # the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
+        attn = ???
+        
+        # Step 1.3.2: for an Auto-Regressive Language Model, the prediction for position i may only depend on the inputs at positions up to i.
+        # Therefore, a mask is used to prevent positions from attending to subsequent positions
+        # attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is an upper triangular matrix with shape (seq_len, seq_len)
+        # Hint:
+        # use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
+        attn_mask = ???
+        # use torch.triu(?, diagonal=1) to obtain an upper triangular matrix where elements on the main diagonal are 0
+        attn_mask = ???
+        # use Tensor.bool() to convert the matrix to a boolean matrix
+        attn_mask = ???
+        # fill the positions of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
+        attn = ???
+
+        # Step 1.3.3: normalize `attn` via the softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
+        attn = ???
+        # Step 1.3.4: apply dropout to `attn` via self.attn_drop()
+        attn = ???
+        # Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
+        # the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
+        out = ???
+
+        # Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate the outputs of different heads
+        # the shape of `out` changes from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
+        out = ???
+
+        # Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
+        result = ??? 
+        # <<< TODO 1
+
+        # return the final results `result` and attention weights `attn`
+        return result, attn
+    
+############################################################
+    
+# Define the feed forward network (FFN)
+############################################################
+class FFN(nn.Module):
+    """Feed-forward network: Linear -> GELU -> Linear -> Dropout."""
+
+    def __init__(self, embed_dim, feedforward_dim, dropout):
+        super().__init__()
+        # expand to the hidden width, then project back to the embedding width
+        self.fc1 = nn.Linear(embed_dim, feedforward_dim)
+        self.fc2 = nn.Linear(feedforward_dim, embed_dim)
+        self.drop = nn.Dropout(dropout)
+
+    def forward(self, x):
+        """Apply fc1, gelu, fc2 and dropout to `x`, in that order."""
+        hidden = gelu(self.fc1(x))
+        return self.drop(self.fc2(hidden))
+############################################################
+
+# Define the TransformerLayer
+############################################################
+class TransformerLayer(nn.Module):
+    """
+    One transformer layer: LayerNorm -> SelfAttention, then LayerNorm -> FFN (homework template).
+    The `???` placeholders in forward() are left for the student to complete (TODO 2).
+    """
+    def __init__(self, embed_dim, num_head, feedforward_dim, dropout, no_res):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(embed_dim)
+        self.attn = SelfAttention(embed_dim, num_head, dropout)
+        self.norm2 = nn.LayerNorm(embed_dim)
+        self.ffn = FFN(embed_dim, feedforward_dim, dropout)
+        self.no_res = no_res # if True, the residual connections are skipped; if False they are used (see forward)
+
+    def forward(self, x):
+        # returns (out, attn): the layer output and the attention weights from self.attn
+        # >>> TODO 2: complete the forward process of the TransformerLayer module.
+        # Step 2.1: calculate the output of multi-head self-attention
+        # normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
+        x_norm = ???
+
+        # calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
+        x_attn, attn = ???
+
+        # add the input `x` to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
+        if ???:
+            x_attn = ???
+
+        # Step 2.2: calculate the output of feed forward network
+        # calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
+        x_ffn = ???
+
+        # add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
+        if ???:
+            out = ???
+        else:
+            out = ???
+        # <<< TODO 2
+        
+        return out, attn
+############################################################
+
+# Define the GPT module
+############################################################
+class GPT(nn.Module):
+    '''
+    Auto-regressive language model (homework template): token and position embeddings,
+    a stack of TransformerLayer modules, a final LayerNorm and a linear prediction head
+    whose weight is shared with the token embedding.
+    The `???` placeholders in forward() are left for the student to complete (TODO 3).
+    '''
+    def __init__(self, vocab_size, max_seq_len, num_layer, embed_dim, num_head, feedforward_dim, dropout, no_res=False, no_pos=False):
+        '''
+            vocab_size: the size of vocabulary
+            max_seq_len: the maximum length of input texts
+            num_layer: the number of transformer layers
+            embed_dim: the embedding dimension
+            num_head: the number of heads in Multi-Head Self Attention
+            feedforward_dim: the dimension in the feed forward network
+            dropout: dropout ratio
+            no_res: if True, the transformer layers do NOT use residual connections
+            no_pos: if True, position embeddings are NOT added to the token embeddings
+        '''
+        super().__init__()
+        self.num_layer = num_layer
+        self.max_seq_len = max_seq_len
+        self.no_pos = no_pos
+
+        # Define Embedding Layer to transfer input text tokens and positions to embeddings
+        self.word_token_embedding = nn.Embedding(vocab_size, embed_dim)
+        self.word_pos_embedding = nn.Embedding(max_seq_len, embed_dim)
+
+        self.drop = nn.Dropout(dropout)
+        # Define the transformer layers
+        self.transformer = nn.ModuleList([TransformerLayer(embed_dim, num_head, feedforward_dim, dropout, no_res) for _ in range(num_layer)])
+        
+        # Define the head layer to predict output
+        self.norm = nn.LayerNorm(embed_dim)
+        self.language_model_head = nn.Linear(embed_dim, vocab_size, bias=False)
+
+        # Weight tying improves the performance of language models by tying (sharing) the weights
+        # of the embedding and softmax layers.
+        # Reference: https://paperswithcode.com/method/weight-tying
+        self.word_token_embedding.weight = self.language_model_head.weight
+
+        self.init_weights()
+
+    def init_weights(self):
+        '''
+        Initialise Linear and Embedding weights from N(0, 0.02^2) and Linear biases with zeros,
+        then re-initialise every `proj_layer.weight` with a std scaled by 1/sqrt(2 * num_layer).
+        '''
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
+                if m.bias is not None:
+                    torch.nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Embedding):
+                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
+        
+        # apply special scaled init to the residual projections, per GPT-2 paper
+        for pn, p in self.named_parameters():
+            if pn.endswith('proj_layer.weight'):
+                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * self.num_layer))
+
+    
+    def forward(self, word_idx, targets=None):
+        '''
+        word_idx: LongTensor of token ids with shape (batch_size, seq_len)
+        targets: optional tensor of target ids; positions equal to 0 are ignored by the loss
+        returns (logits, loss) when `targets` is given, otherwise (logits, attention_weights)
+        '''
+        batch_size, seq_len = word_idx.shape
+
+        # >>> TODO 3: complete the forward process of GPT
+        # Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1] 
+        pos = ???
+
+        # Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to transfer `word_idx` and `pos` to embeddings (`token_embed` and `pos_embed`)
+        token_embed = ???
+        pos_embed = ???
+
+        # Step 3.3: initialize the input embeddings `x` of transformer layers
+        # add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
+        if ???:
+            x = ???
+        else:
+            x = ???
+
+        # apply dropout to the input embeddings via `self.drop()`
+        x = ???
+
+        # Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
+        # define a list `attention_weights` and append the attention weights of each transformer layer into the list
+        attention_weights = ??? 
+        for ???:
+            # Step 3.4.1: obtain the output and attention weights of transformer layers
+            x, attn = ???
+            # Step 3.4.2: append the attention weights of transformer layers into the list `attention_weights`
+            ???
+     
+        # Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
+        # self.language_model_head() is a linear layer defined in __init__() function
+        # Note: do not add softmax here since it is included in the cross entropy loss function
+        x = ???
+        logits = ???
+        # <<< TODO 3
+
+        # return logits and loss or attention weights
+        if targets is not None:
+            # ignore_index=0 skips target positions holding id 0
+            loss = torch.nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=0)
+            return logits, loss
+        assert isinstance(attention_weights, list), "attention_weights must be a list, please check whether to append the attention weights of all transformer layers into it!"
+        return logits, attention_weights
+
+    def configure_optimizers(self, weight_decay):
+        """
+        Separate all parameters of the model into two buckets: those that will experience
+        weight decay for regularization (Linear weights) and those that won't
+        (biases, and LayerNorm/Embedding weights).
+
+        Returns a list of two parameter-group dicts with keys "params" and "weight_decay".
+        The optimizer itself is NOT created here.
+        """
+
+        # separate out all parameters to those that will and won't experience regularizing weight decay
+        decay = set()
+        no_decay = set()
+        whitelist_weight_modules = (nn.Linear, )
+        blacklist_weight_modules = (nn.LayerNorm, torch.nn.Embedding)
+        for mn, m in self.named_modules():
+            for pn, p in m.named_parameters():
+                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
+                # random note: because named_modules and named_parameters are recursive
+                # we will see the same tensors p many many times. but doing it this way
+                # allows us to know which parent module any tensor p belongs to...
+                if pn.endswith('bias'):
+                    # all biases will not be decayed
+                    no_decay.add(fpn)
+                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
+                    # weights of whitelist modules will be weight decayed
+                    decay.add(fpn)
+                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
+                    # weights of blacklist modules will NOT be weight decayed
+                    no_decay.add(fpn)
+
+        # subtle: 'word_token_embedding.weight' and 'language_model_head.weight' are tied, so they
+        # will appear in the no_decay and decay sets respectively after the above.
+        # In addition, because named_parameters() doesn't return duplicates, it
+        # will only return the first occurrence, key'd by 'word_token_embedding.weight', below.
+        # so let's manually remove 'language_model_head.weight' from the decay set. This will include
+        # this tensor into optimization via word_token_embedding.weight only, and not decayed.
+        decay.remove('language_model_head.weight')
+
+        # validate that we considered every parameter
+        param_dict = {pn: p for pn, p in self.named_parameters()}
+        inter_params = decay & no_decay
+        union_params = decay | no_decay
+        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
+        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
+                                                    % (str(param_dict.keys() - union_params), )
+
+        # build the parameter groups (sorted by name for a deterministic order)
+        optim_groups = [
+            {"params": [param_dict[pn] for pn in sorted(decay)], "weight_decay": weight_decay},
+            {"params": [param_dict[pn] for pn in sorted(no_decay)], "weight_decay": 0.0},
+        ]
+        return optim_groups
+    
+    @torch.no_grad()
+    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
+        """
+        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
+        the sequence max_new_tokens times, feeding the predictions back into the model each time.
+        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
+
+        Only the last `max_seq_len` tokens are fed to the model at each step, so the running
+        sequence may grow beyond the size of the position-embedding table.
+
+        Returns the full sequence (prompt + sampled tokens) as a numpy array with
+        singleton dimensions squeezed out.
+        """
+        for _ in range(max_new_tokens):
+            # if the sequence context is growing too long we must crop it at max_seq_len,
+            # otherwise the position indices would run past word_pos_embedding
+            idx_cond = idx if idx.size(1) <= self.max_seq_len else idx[:, -self.max_seq_len:]
+            # forward the model to get the logits for the index in the sequence
+            logits, _ = self(idx_cond)
+            # pluck the logits at the final step and scale by desired temperature
+            logits = logits[:, -1, :] / temperature
+            # optionally crop the logits to only the top k options
+            if top_k is not None:
+                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                logits[logits < v[:, [-1]]] = -float('Inf')
+            # apply softmax to convert logits to (normalized) probabilities
+            probs = F.softmax(logits, dim=-1)
+            # sample from the distribution
+            idx_next = torch.multinomial(probs, num_samples=1)
+            # append sampled index to the running sequence and continue
+            idx = torch.cat((idx, idx_next), dim=1)
+
+        return idx.squeeze().cpu().numpy()
+############################################################
+
+# Named hyper-parameter presets; each value is a dict of keyword arguments for GPT(...).
+# NOTE(review): the presets do not include `vocab_size` or `max_seq_len`, which GPT.__init__
+# takes as required arguments — presumably callers add them or rely on checkpoint['model_args']; confirm.
+GPTConfig = {
+    'mygpt': dict(num_layer=4, embed_dim=128, num_head=4, feedforward_dim=128*4, dropout=0.0),
+    'gpt2-mini': dict(num_layer=6, embed_dim=384, num_head=6, feedforward_dim=384*4, dropout=0.2),
+    'gpt2': dict(num_layer=12, embed_dim=768, num_head=12, feedforward_dim=768*4, dropout=0.2),
+}
--- a/hw4/code/prepare.py
+++ b/hw4/code/prepare.py
@@ -0,0 +1,61 @@
+"""
+Prepare the dataset for character-level language modeling.
+So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
+"""
+import os
+import numpy as np
+import argparse
+import json
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--data_root', type=str, default='data/quansongci', help='data directory')
+args = parser.parse_args()
+
+# the raw corpus is read from <data_root>/data.json, under the key 'data'
+input_file_path = os.path.join(args.data_root, 'data.json')
+with open(input_file_path, 'r', encoding='utf-8') as f:
+    data = json.load(f)['data']
+print(f"length of dataset: {len(data):,}")
+
+# vocabulary: every distinct character of the corpus in sorted order, plus <pad> and <eos>
+chars = sorted(set(''.join(data)))
+vocab_size = len(chars) + 2
+print("all the unique characters:", ''.join(chars))
+print(f"vocab size: {vocab_size:,}")
+
+# ids 0 and 1 are reserved for <pad> and <eos>, so real characters start at 2
+stoi = {}
+itos = {}
+for idx, ch in enumerate(chars, start=2):
+    stoi[ch] = idx
+    itos[idx] = ch
+stoi['<pad>'] = 0
+itos[0] = '<pad>'
+stoi['<eos>'] = 1
+itos[1] = '<eos>'
+
+# first 90% of the samples for training, the remainder for validation
+split_at = int(len(data) * 0.9)
+train_data, val_data = data[:split_at], data[split_at:]
+print(f"train has {len(train_data):,} samples")
+print(f"val has {len(val_data):,} samples")
+
+# each split is saved together with the vocabulary tables, to help us encode/decode later
+# (json.dump writes the integer keys of `itos` as strings)
+for file_name, samples in (('train.json', train_data), ('val.json', val_data)):
+    meta = {
+        'data': samples,
+        'vocab_size': vocab_size,
+        'itos': itos,
+        'stoi': stoi,
+    }
+    with open(os.path.join(args.data_root, file_name), 'w', encoding='utf-8') as f:
+        json.dump(meta, f, ensure_ascii=False, indent=4)
+
--- a/hw4/code/sample.py
+++ b/hw4/code/sample.py
@@ -0,0 +1,76 @@
+"""
+Sample from a trained model
+"""
+import os
+import pickle
+from contextlib import nullcontext
+import torch
+from model import GPTConfig, GPT
+import argparse
+from dataset import Converter, LMDataset
+
+def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root, device):
+    """
+    Load `<ckpt_path>/best.pth` and print `num_samples` continuations of the prompt `start`.
+
+    start: prompt string; every character must be in the training vocabulary
+    num_samples: how many samples to generate and print
+    max_new_tokens: number of tokens generated after the prompt for each sample
+    model_name: key into GPTConfig, used unless the checkpoint carries 'model_args'
+    ckpt_path: directory that contains 'best.pth'
+    data_root: directory that contains 'train.json' (source of the vocabulary)
+    device: torch device string such as 'cpu', 'cuda' or 'cuda:0'
+    """
+    dataset = LMDataset(data_root, 'train')
+    converter = Converter(dataset.stoi, dataset.itos)
+    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
+    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
+    # model
+    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
+    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+    # torch.autocast needs the bare device type, so strip an index such as the ':0' in 'cuda:0'
+    device_type = torch.device(device).type
+    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)
+    # init from a model saved in a specific directory
+    ckpt_path = os.path.join(ckpt_path, 'best.pth')
+    print("sample from %s"%ckpt_path)
+    # NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source
+    checkpoint = torch.load(ckpt_path, map_location=device)
+    gptconf = GPTConfig[model_name]
+    if 'model_args' in checkpoint:
+        gptconf = checkpoint['model_args']
+    model = GPT(**gptconf)
+    state_dict = checkpoint['state_dict']
+    #unwanted_prefix = '_orig_mod.'
+    #for k,v in list(state_dict.items()):
+    #    if k.startswith(unwanted_prefix):
+    #        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+    model.load_state_dict(state_dict)
+
+    model.eval()
+    model.to(device)
+
+    # encode the beginning of the prompt
+    start_ids = converter.single_encode(start)
+    x = torch.from_numpy(start_ids)[None, ...].to(device).long()
+
+    # run generation
+    with torch.no_grad():
+        with ctx:
+            for _ in range(num_samples):
+                y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
+                print(converter.single_decode(y))
+                print('---------------')
+
+if __name__ == '__main__':
+
+    # set random seed for reproducibility
+    seed = 2024
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
+    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
+
+    # set configurations of the model and sampling process
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--start', type=str, default='+++', help='start of the sample, e.g. "+++" or "+++清平乐"')
+    # the default is a real int so it matches type=int without relying on argparse converting a string
+    parser.add_argument('--num_samples', type=int, default=10, help='the number of samples')
+    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model')
+    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to load checkpoints')
+    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
+    parser.add_argument('--device', type=str, help='cpu or cuda')
+
+    opt = parser.parse_args()
+    # fall back to CUDA when it is available and no device was requested
+    if opt.device is None:
+        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    
+
+    # NOTE(review): 128 is presumably the model's max_seq_len, so prompt + new tokens fill one context — confirm
+    sample(opt.start, opt.num_samples, 128-len(opt.start), opt.model_name, opt.ckpt_path, opt.data_root, opt.device)
--- a/hw4/code/train.py
+++ b/hw4/code/train.py
@@ -0,0 +1,219 @@
+import os
+import time
+import math
+import pickle
+from contextlib import nullcontext
+import argparse
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+
+from model import GPT, GPTConfig
+from dataset import LMDataset, Converter
+import matplotlib.pyplot as plt
+
+# learning rate decay scheduler (cosine with warmup)
+def get_lr(it, learning_rate, min_lr=1e-4, warmup_iters=100, lr_decay_iters=6000):
+    """Return the learning rate for iteration `it`.
+
+    Linear warmup from 0 to `learning_rate` over `warmup_iters` steps, then cosine
+    decay to `min_lr` at `lr_decay_iters`; `min_lr` is returned from there on."""
+    if it < warmup_iters:
+        return learning_rate * it / warmup_iters
+    if it >= lr_decay_iters:  # `>=` also avoids 0/0 below when warmup_iters == lr_decay_iters
+        return min_lr
+    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)  # in [0, 1)
+    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
+    return min_lr + coeff * (learning_rate - min_lr)
+
+def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, device='cpu', no_res=False, no_pos=False):
+    """Train a GPT language model from scratch, saving checkpoints and a loss plot.
+
+    Trains `GPTConfig[model_name]` on the `LMDataset` under `data_root` for `n_iters`
+    optimizer steps. Every `val_interval` steps the validation loss is estimated and
+    'latest.pth' (plus 'best.pth' on improvement) is written to the existing
+    directory `ckpt_path`. `device` is a torch device string such as 'cpu', 'cuda'
+    or 'cuda:0'; `no_res` and `no_pos` are forwarded to the `GPT` constructor.
+    """
+    train_dataset = LMDataset(data_root, 'train')
+    val_dataset = LMDataset(data_root, 'val')
+    train_loader = DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True)
+    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
+    converter = Converter(train_dataset.stoi, train_dataset.itos)
+
+    # adamw optimizer
+    learning_rate = 5e-3 # max learning rate
+    weight_decay = 1e-1
+    beta1 = 0.9
+    beta2 = 0.99
+    grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
+
+    # system: autocast wants the bare device type, so 'cuda:0' must become 'cuda'
+    device_type = torch.device(device).type
+    dtype = 'bfloat16' if device_type == 'cpu' else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
+    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)
+    best_val_loss = 1e9
+    iter_num = 0 # number of iterations in the lifetime of this process
+
+    # model init; copy the config so the shared GPTConfig entry is not mutated
+    model_args = dict(GPTConfig[model_name])
+    model_args['vocab_size'] = train_dataset.vocab_size
+    model_args['max_seq_len'] = 128
+    model_args['no_res'] = no_res
+    model_args['no_pos'] = no_pos
+
+    print("Initializing a new model from scratch")
+    model = GPT(**model_args)
+    model.to(device)
+
+    # initialize a GradScaler. If enabled=False scaler is a no-op
+    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
+
+    # optimizer
+    optim_groups = model.configure_optimizers(weight_decay)
+    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))
+
+    print('training...')
+    # enough epochs to cover n_iters steps; the inner loop breaks once n_iters is reached
+    epoch_num = np.ceil(n_iters * int(batch_size) / float(len(train_dataset))).astype(np.int32)
+    t0 = time.time()
+    model.train()
+    train_losses = []
+    val_losses = []
+    for epoch in range(epoch_num):
+        for step, inputs in enumerate(train_loader):
+            if iter_num >= n_iters:
+                break
+            X, Y = converter.encode(inputs)
+            X, Y = X.to(device), Y.to(device)
+            lr = get_lr(iter_num, learning_rate, lr_decay_iters=n_iters)
+            for param_group in optimizer.param_groups:
+                param_group['lr'] = lr
+
+            # forward pass under autocast (a plain nullcontext on cpu)
+            with ctx:
+                logits, loss = model(X, Y)
+
+            # backward pass, with gradient scaling if training in fp16
+            scaler.scale(loss).backward()
+            if grad_clip != 0.0:
+                scaler.unscale_(optimizer)
+                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
+            # step the optimizer and scaler if training in fp16
+            scaler.step(optimizer)
+            scaler.update()
+            # flush the gradients as soon as we can, no need for this memory anymore
+            optimizer.zero_grad(set_to_none=True)
+
+            iter_num += 1
+            lossf = loss.item()
+            train_losses.append(lossf)
+            # evaluate the loss on the val set and write checkpoints
+            if iter_num % val_interval == 0:
+                t1 = time.time()
+                dt = t1 - t0
+                t0 = t1
+                print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
+                losses = estimate_loss(model, val_loader, converter, ctx, device)
+                val_losses.append(losses['val'])
+                print(f"iter {iter_num}: val loss {losses['val']:.4f}")
+                # update the best loss first so the saved value includes this evaluation
+                is_best = losses['val'] < best_val_loss
+                best_val_loss = min(best_val_loss, losses['val'])
+                print(f"saving latest checkpoint to {ckpt_path}")
+                checkpoint = {
+                        'state_dict': model.state_dict(),
+                        'optimizer': optimizer.state_dict(),
+                        'model_args': model_args,
+                        'iter_num': iter_num,
+                        'best_val_loss': best_val_loss,
+                    }
+                torch.save(checkpoint, os.path.join(ckpt_path, 'latest.pth'))
+                if is_best:
+                    print(f"saving best checkpoint to {ckpt_path}")
+                    torch.save(checkpoint, os.path.join(ckpt_path, 'best.pth'))
+
+    plot(n_iters, train_losses, val_losses, val_interval, ckpt_path)
+
+def plot(n_iters, train_losses, val_losses, val_interval, ckpt_path):
+    """Plot loss and perplexity curves, save them under `ckpt_path`, and show them.
+
+    `train_losses` holds one value per training iteration and `val_losses` one
+    value per validation run (every `val_interval` iterations). `n_iters` is
+    unused; it is kept so existing callers keep working.
+    """
+    f, ax = plt.subplots(1,2,figsize=(18,6))
+    # x positions come from the list lengths, so x and y always match even when
+    # n_iters is not a multiple of val_interval (validation runs at k*val_interval)
+    train_iters = np.arange(1, len(train_losses) + 1)
+    val_iters = val_interval * np.arange(1, len(val_losses) + 1)
+
+    # draw loss
+    ax[0].plot(train_iters, train_losses)
+    ax[0].plot(val_iters, val_losses, 'r')
+    ax[0].set_xlabel('training iters')
+    ax[0].legend(['training loss', 'validation loss'])
+
+    # draw perplexity, i.e. exp(loss)
+    ax[1].plot(train_iters, [np.exp(x) for x in train_losses])
+    ax[1].plot(val_iters, [np.exp(x) for x in val_losses], 'r')
+    ax[1].set_xlabel('training iters')
+    ax[1].legend(['training perplexity', 'validation perplexity'])
+    plt.tight_layout()
+    plt.savefig(os.path.join(ckpt_path, 'loss&perplexity.jpg'), dpi=300)
+    plt.show()
+
+# estimates the validation loss by averaging over the first batches of val_loader
+@torch.no_grad()
+def estimate_loss(model, val_loader, converter, ctx, device, max_iters=100):
+    """Return {'val': mean loss} over at most `max_iters` batches of `val_loader`.
+
+    The model is switched to eval mode and back to train mode afterwards. The mean
+    is taken over the batches actually evaluated (not `max_iters`), so a short
+    loader is not under-reported; an empty loader yields 0.0.
+    """
+    model.eval()
+    total, n_batches = 0.0, 0
+    for inputs in val_loader:
+        if n_batches >= max_iters:
+            break
+        X, Y = converter.encode(inputs)
+        with ctx:
+            _, loss = model(X.to(device), Y.to(device))
+        total += loss.item()
+        n_batches += 1
+    model.train()
+    return {'val': total / max(n_batches, 1)}
+
+if __name__ == '__main__':
+    # set random seed for reproducibility
+    seed = 2024
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
+    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
+
+    # set configurations of the model and training process
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
+    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model configuration in GPTConfig')
+    parser.add_argument('--iters', type=int, default=1000, help='number of training iterations')
+    parser.add_argument('--batchsize', type=int, default=16, help='training batch size')
+    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to save checkpoints')
+    parser.add_argument('--val_interval', type=int, default=20, help='iter intervals of validation')
+    parser.add_argument('--no_res', action='store_true', help='pass this flag to train without residual connection')
+    parser.add_argument('--no_pos', action='store_true', help='pass this flag to train without positional encoding')
+    parser.add_argument('--device', type=str, help='cpu or cuda')
+
+    opt = parser.parse_args()
+    if opt.device is None:
+        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    # train() writes checkpoints and the loss plot here, so the directory must exist
+    os.makedirs(opt.ckpt_path, exist_ok=True)
+    train(opt.data_root, opt.model_name, opt.batchsize, opt.iters, opt.ckpt_path, opt.val_interval, opt.device, opt.no_res, opt.no_pos)
+
+