Merge pull request 'Homework4 Submit' (#5) from homework4 into main

Reviewed-on: #5
2024-05-27 00:04:04 +08:00
parent f1459069da 76a643ebc4
commit 69e52e0e50
26 changed files with 49665 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ hw2/code/checkpoints/
 hw2/code/visualized/
 hw3/code/data/
 hw3/code/checkpoints/
 hw4/code/workdirs/
--- a/hw4/code/attnvis.ipynb
+++ b/hw4/code/attnvis.ipynb
--- a/hw4/code/data/quansongci/data.json
+++ b/hw4/code/data/quansongci/data.json
--- a/hw4/code/data/quansongci/train.json
+++ b/hw4/code/data/quansongci/train.json
--- a/hw4/code/data/quansongci/val.json
+++ b/hw4/code/data/quansongci/val.json
--- a/hw4/code/data/vis/vis_1.txt
+++ b/hw4/code/data/vis/vis_1.txt
@@ -0,0 +1,2 @@
 +++如梦令
 昨夜雨疏风骤。浓睡不消残酒。试问卷帘人，却道海棠依旧。知否。知否。应是绿肥红瘦。
--- a/hw4/code/data/vis/vis_2.txt
+++ b/hw4/code/data/vis/vis_2.txt
@@ -0,0 +1,3 @@
 +++鹧鸪天（秋思）
 红蓼烟收尽暮鸦。秋风吹叶荻花秋。断肠天色连云散，黄叶荻花秋水流。
 楼上角，笛声悠。兴王莫上叹人头。明朝归去无消息，只有当时一望流。
--- a/hw4/code/dataset.py
+++ b/hw4/code/dataset.py
@@ -0,0 +1,75 @@
 import torch
 from torch.utils.data import Dataset
 import numpy as np
 import os
 import json
 class LMDataset(Dataset):
    """
    Dataset of text samples read from ``<data_dir>/<split>.json``.
    The JSON file must contain the keys ``data`` (list of samples), ``stoi``
    (character -> integer), ``itos`` (string of integer -> character) and
    ``vocab_size``; each of them is exposed as an attribute of the same name.
    """
    def __init__(self, data_dir, split):
        super().__init__()
        json_path = os.path.join(data_dir, f'{split}.json')
        with open(json_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        # copy the four expected entries onto the instance, in a fixed order
        for field in ('data', 'stoi', 'itos', 'vocab_size'):
            setattr(self, field, payload[field])
    def __len__(self):
        # number of samples in this split
        return len(self.data)
    def __getitem__(self, index):
        # samples are returned as stored in the JSON file (no encoding here)
        return self.data[index]
 class Converter:
    '''
    Convert strings to integer ids and back.
    Id 0 denotes the padding character '<pad>' and id 1 denotes the end of the sequence '<eos>'.
    '''
    def __init__(self, stoi, itos):
        self.stoi = stoi # a dict that maps character to integer
        self.itos = itos # a dict that maps string of integer to character
    def single_encode(self, s):
        '''
        Encode one string into a 1-D int64 numpy array (no <eos> appended).
        Raises KeyError for a character that is missing from `stoi`.
        '''
        return np.array([self.stoi[ch] for ch in s], dtype=np.int64)
    def single_decode(self, l):
        '''
        Decode an iterable of integer ids into a string, stopping at the first <eos> (id 1).
        '''
        chars = []
        for token in l:
            # int() normalises numpy / torch scalars so the lookup key is a plain digit string
            token = int(token)
            if token == 1:
                break
            chars.append(self.itos[str(token)])
        return ''.join(chars)
    def encode(self, data):
        '''
        Encode a list of strings into a pair of LongTensors (x, y).
        Every string is followed by <eos> and right-padded with <pad> to a common length;
        x is that matrix without its last column and y is the same matrix without its
        first column (the next-token targets). An empty list yields two (0, 0) tensors.
        '''
        # default=0 keeps an empty batch from raising ValueError
        max_len = max((len(s) for s in data), default=0)
        out = np.zeros((len(data), max_len + 1), dtype=np.int64)
        for row, s in enumerate(data):
            out[row, :len(s)] = self.single_encode(s)
            out[row, len(s)] = 1
        x = torch.from_numpy(out[:, :-1])
        y = torch.from_numpy(out[:, 1:])
        return x, y
    def decode(self, data):
        '''
        Decode a 2-D tensor of integer ids into a list of strings, one per row.
        '''
        rows = data.cpu().numpy().astype(np.int64)
        return [self.single_decode(row) for row in rows]
--- a/hw4/code/model.py
+++ b/hw4/code/model.py
@@ -0,0 +1,356 @@
 # ========================================================
 #             Media and Cognition
 #             Homework 4  Sequence Modeling
 #             model.py - Model definition
 #             Student ID: 2022010639
 #             Name: Yixuan Gao
 #             Tsinghua University
 #             (C) Copyright 2024
 # ========================================================
 # Import required libraries
 ############################################################
 import math
 import torch
 import torch.nn as nn 
 from torch.nn import functional as F
 import numpy as np
 ############################################################
 # Define the GELU activation function used in OpenAI GPT
 ############################################################
 def gelu(z):
    """
    Tanh approximation of the Gaussian Error Linear Unit, applied element-wise to tensor `z`.
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    gelu(z) = 0.5 * z * (1 + tanh(sqrt(2/pi) * (z + 0.044715 * z^3)))
    """
    scale = math.sqrt(2.0 / math.pi)
    inner = scale * (z + 0.044715 * torch.pow(z, 3.0))
    return 0.5 * z * (1.0 + torch.tanh(inner))
 ############################################################
 # Define the Multi-Head SelfAttention module
 ############################################################
 class SelfAttention(nn.Module):
    """
    Causal multi-head self-attention.
    forward(x) takes x of shape (batch_size, seq_len, embed_dim) and returns
    `result` of shape (batch_size, seq_len, embed_dim) together with the attention
    weights `attn` of shape (batch_size, num_head, seq_len, seq_len).
    Raises ValueError at construction if embed_dim is not divisible by num_head.
    """
    def __init__(self, embed_dim, num_head, dropout):
        super().__init__()
        # fail early: otherwise the reshape in forward() fails with an opaque shape error
        if embed_dim % num_head != 0:
            raise ValueError("embed_dim (%d) must be divisible by num_head (%d)" % (embed_dim, num_head))
        # three separate linear layers that generate q, k, v
        self.q_layer = nn.Linear(embed_dim, embed_dim)
        self.k_layer = nn.Linear(embed_dim, embed_dim)
        self.v_layer = nn.Linear(embed_dim, embed_dim)
        # projection layer applied to the concatenated heads
        self.proj_layer = nn.Linear(embed_dim, embed_dim)
        # dropout on the attention weights and on the projected output
        self.attn_drop = nn.Dropout(dropout)
        self.proj_drop = nn.Dropout(dropout)
        self.num_head = num_head
        self.head_dim = embed_dim // num_head
    def forward(self, x):
        batch_size, seq_len, _ = x.shape
        # (batch_size, seq_len, num_head * head_dim) -> (batch_size, num_head, seq_len, head_dim)
        def split_heads(t):
            return t.reshape(batch_size, seq_len, self.num_head, self.head_dim).transpose(1, 2)
        q = split_heads(self.q_layer(x))
        k = split_heads(self.k_layer(x))
        v = split_heads(self.v_layer(x))
        # scaled dot product: qk^T / sqrt(head_dim), shape (batch_size, num_head, seq_len, seq_len)
        attn = (q @ k.transpose(2, 3)) / math.sqrt(self.head_dim)
        # causal mask: True strictly above the main diagonal, i.e. position i may not attend to j > i
        attn_mask = torch.triu(torch.ones(seq_len, seq_len, device=attn.device), diagonal=1).bool()
        attn = attn.masked_fill(attn_mask, float('-inf'))
        # normalise over the key dimension, then apply dropout to the weights
        attn = torch.softmax(attn, dim=3)
        attn = self.attn_drop(attn)
        # weighted sum of values: (batch_size, num_head, seq_len, head_dim)
        out = attn @ v
        # concatenate the heads back to (batch_size, seq_len, num_head * head_dim)
        out = out.transpose(1, 2).reshape(batch_size, seq_len, self.num_head * self.head_dim)
        # output projection followed by dropout
        result = self.proj_drop(self.proj_layer(out))
        return result, attn
 ############################################################
 # Define the feed forward network (FFN)
 ############################################################
 class FFN(nn.Module):
    """
    Feed-forward network: Linear(embed_dim -> feedforward_dim), GELU,
    Linear(feedforward_dim -> embed_dim), then dropout.
    """
    def __init__(self, embed_dim, feedforward_dim, dropout):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, feedforward_dim)
        self.fc2 = nn.Linear(feedforward_dim, embed_dim)
        self.drop = nn.Dropout(dropout)
    def forward(self, x):
        # expand, apply the non-linearity, project back, regularise
        hidden = gelu(self.fc1(x))
        return self.drop(self.fc2(hidden))
 ############################################################
 # Define the TransformerLayer
 ############################################################
 class TransformerLayer(nn.Module):
    """
    One transformer layer: LayerNorm -> self-attention, then LayerNorm -> FFN.
    Both sub-layers get a residual connection unless `no_res` is True.
    forward(x) returns the layer output and the attention weights of its attention module.
    """
    def __init__(self, embed_dim, num_head, feedforward_dim, dropout, no_res):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = SelfAttention(embed_dim, num_head, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, feedforward_dim, dropout)
        self.no_res = no_res # True disables the residual connections
    def forward(self, x):
        # attention sub-layer on the normalised input
        attn_out, attn = self.attn(self.norm1(x))
        if self.no_res:
            hidden = attn_out
        else:
            hidden = attn_out + x
        # feed-forward sub-layer on the normalised attention output
        ffn_out = self.ffn(self.norm2(hidden))
        if self.no_res:
            return ffn_out, attn
        return hidden + ffn_out, attn
 ############################################################
 # Define the GPT module
 ############################################################
 class GPT(nn.Module):
    """
    Decoder-only transformer language model: token + position embeddings, a stack of
    TransformerLayer modules, a final LayerNorm and a linear head whose weight is
    shared (tied) with the token embedding.
    """
    def __init__(self, vocab_size, max_seq_len, num_layer, embed_dim, num_head, feedforward_dim, dropout, no_res=False, no_pos=False):
        '''
            vocab_size: the size of vocabulary
            max_seq_len: the maximum length of input texts
            num_layer: the number of transformer layers
            embed_dim: the embedding dimension
            num_head: the number of heads in Multi-Head Self Attention
            feedforward_dim: the dimension in the feed forward network
            dropout: dropout ratio
            no_res: if True, residual connections in the transformer layers are disabled
            no_pos: if True, position embeddings are not added to the token embeddings
        '''
        super().__init__()
        self.num_layer = num_layer
        self.max_seq_len = max_seq_len
        self.no_pos = no_pos
        # embedding layers that map token ids and positions to vectors
        self.word_token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.word_pos_embedding = nn.Embedding(max_seq_len, embed_dim)
        self.drop = nn.Dropout(dropout)
        # the transformer layers
        self.transformer = nn.ModuleList([TransformerLayer(embed_dim, num_head, feedforward_dim, dropout, no_res) for _ in range(num_layer)])
        # final normalisation and the head that predicts the next token
        self.norm = nn.LayerNorm(embed_dim)
        self.language_model_head = nn.Linear(embed_dim, vocab_size, bias=False)
        # Weight tying: the token embedding and the output head share one weight tensor.
        # Reference: https://paperswithcode.com/method/weight-tying
        self.word_token_embedding.weight = self.language_model_head.weight
        self.init_weights()
    def init_weights(self):
        """Initialise Linear/Embedding weights from N(0, 0.02) and Linear biases with zeros."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('proj_layer.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * self.num_layer))
    def forward(self, word_idx, targets=None):
        """
        Run the model on token ids `word_idx` of shape (batch_size, seq_len).
        Returns (logits, loss) when `targets` is given, where the loss is the cross entropy
        that ignores target id 0; otherwise returns (logits, attention_weights) with one
        attention tensor per transformer layer.
        Raises ValueError if seq_len exceeds max_seq_len.
        """
        batch_size, seq_len = word_idx.shape
        # the position table only has max_seq_len rows; report over-long input clearly
        if seq_len > self.max_seq_len:
            raise ValueError("sequence length %d exceeds max_seq_len %d" % (seq_len, self.max_seq_len))
        token_embed = self.word_token_embedding(word_idx)
        if self.no_pos:
            x = token_embed
        else:
            # positions 0, 1, ..., seq_len-1, broadcast over the batch
            pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)
            x = token_embed + self.word_pos_embedding(pos)
        x = self.drop(x)
        # collect the attention weights of every layer
        attention_weights = []
        for layer in self.transformer:
            x, attn = layer(x)
            attention_weights.append(attn)
        # no softmax here: it is part of the cross entropy loss
        logits = self.language_model_head(self.norm(x))
        if targets is not None:
            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=0)
            return logits, loss
        return logits, attention_weights
    def configure_optimizers(self, weight_decay):
        """
        Split the parameters into two groups: Linear weights, which receive `weight_decay`,
        and biases plus LayerNorm/Embedding weights, which receive no weight decay.
        Returns a list of two parameter-group dicts ("params", "weight_decay") that can be
        passed to a torch optimizer; the optimizer itself is not created here.
        """
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (nn.Linear, )
        blacklist_weight_modules = (nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # named_modules and named_parameters are both recursive, so the same tensor
                # is visited several times; only the visit from its direct parent module
                # (where pn is 'weight') can match the isinstance checks below
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)
        # 'word_token_embedding.weight' and 'language_model_head.weight' are the same tied
        # tensor, so after the loop it sits in no_decay under the first name and in decay
        # under the second. named_parameters() reports a shared tensor only once, under the
        # name registered first ('word_token_embedding.weight'), so drop the head alias:
        # the tensor is then optimised once and without weight decay.
        decay.remove('language_model_head.weight')
        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )
        # sorted so the parameter order inside each group is deterministic
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(decay)], "weight_decay": weight_decay},
            {"params": [param_dict[pn] for pn in sorted(no_decay)], "weight_decay": 0.0},
        ]
        return optim_groups
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        The model only sees the last max_seq_len tokens once the sequence grows beyond that.
        Returns the full sequence, squeezed and converted to a numpy array.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # crop the context to the last max_seq_len tokens so the position table is not overrun
            idx_cond = idx if idx.size(1) <= self.max_seq_len else idx[:, -self.max_seq_len:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
        return idx.squeeze().cpu().numpy()
 ############################################################
 # Named hyper-parameter presets; feedforward_dim is always four times embed_dim.
 GPTConfig = {
    'mygpt': {'num_layer': 4, 'embed_dim': 128, 'num_head': 4, 'feedforward_dim': 4 * 128, 'dropout': 0.0},
    'gpt2-mini': {'num_layer': 6, 'embed_dim': 384, 'num_head': 6, 'feedforward_dim': 4 * 384, 'dropout': 0.2},
    'gpt2': {'num_layer': 12, 'embed_dim': 768, 'num_head': 12, 'feedforward_dim': 4 * 768, 'dropout': 0.2},
 }
--- a/hw4/code/prepare.py
+++ b/hw4/code/prepare.py
@@ -0,0 +1,61 @@
 """
 Prepare the dataset for character-level language modeling.
 So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
 """
 import os
 import numpy as np
 import argparse
 import json
 # command line: only the data directory is configurable
 parser = argparse.ArgumentParser()
 parser.add_argument('--data_root', type=str, default='data/quansongci', help='data directory')
 args = parser.parse_args()
 # read the raw samples from <data_root>/data.json
 input_file_path = os.path.join(args.data_root, 'data.json')
 with open(input_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)['data']
 print(f"length of dataset: {len(data):,}")
 # vocabulary: every distinct character in the data, plus <pad> and <eos>
 chars = sorted(set(''.join(data)))
 vocab_size = len(chars) + 2
 print("all the unique characters:", ''.join(chars))
 print(f"vocab size: {vocab_size:,}")
 # characters get ids starting at 2; ids 0 and 1 are reserved for <pad> and <eos>
 stoi = {ch: idx for idx, ch in enumerate(chars, start=2)}
 itos = {idx: ch for ch, idx in stoi.items()}
 stoi.update({'<pad>': 0, '<eos>': 1})
 itos.update({0: '<pad>', 1: '<eos>'})
 # first 90% of the samples for training, the rest for validation
 n = len(data)
 split_at = int(n*0.9)
 train_data, val_data = data[:split_at], data[split_at:]
 print(f"train has {len(train_data):,} samples")
 print(f"val has {len(val_data):,} samples")
 # each split file also carries the vocabulary, to help us encode/decode later
 train_meta = dict(data=train_data, vocab_size=vocab_size, itos=itos, stoi=stoi)
 val_meta = dict(train_meta, data=val_data)
 for file_name, meta in (('train.json', train_meta), ('val.json', val_meta)):
    with open(os.path.join(args.data_root, file_name), 'w', encoding='utf-8') as f:
        json.dump(meta, f, ensure_ascii=False, indent=4)
--- a/hw4/code/sample.py
+++ b/hw4/code/sample.py
@@ -0,0 +1,76 @@
 """
 Sample from a trained model
 """
 import os
 import pickle
 from contextlib import nullcontext
 import torch
 from model import GPTConfig, GPT
 import argparse
 from dataset import Converter, LMDataset
 def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root, device):
    """
    Load `best.pth` from `ckpt_path` and print `num_samples` generated texts that start with `start`.
    The vocabulary is taken from the 'train' split in `data_root`. The model is built from the
    checkpoint's 'model_args' when present, otherwise from GPTConfig[model_name].
    `device` may carry an index (e.g. 'cuda:0').
    """
    dataset = LMDataset(data_root, 'train')
    converter = Converter(dataset.stoi, dataset.itos)
    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
    # model
    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    # autocast expects the bare device type ('cuda'), not an indexed device such as 'cuda:0'
    device_type = torch.device(device).type
    ctx = nullcontext() if device_type in ('cpu', 'mps') else torch.autocast(device_type=device_type, dtype=ptdtype)
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_path)
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig[model_name]
    if 'model_args' in checkpoint:
        gptconf = checkpoint['model_args']
    model = GPT(**gptconf)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    model.to(device)
    # encode the beginning of the prompt as a (1, len(start)) LongTensor
    start_ids = converter.single_encode(start)
    x = torch.from_numpy(start_ids)[None, ...].to(device).long()
    # run generation
    with torch.no_grad(), ctx:
        for _ in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(converter.single_decode(y))
            print('---------------')
 if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
    # set configurations of the model and sampling process
    parser = argparse.ArgumentParser()
    parser.add_argument('--start', type=str, default='+++', help='start of the sample, e.g. "+++" or "+++清平乐"')
    # the default is an int, matching type=int (it used to be the string '10')
    parser.add_argument('--num_samples', type=int, default=10, help='the number of samples')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to load checkpoints')
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--device', type=str, help='cpu or cuda')
    opt = parser.parse_args()
    # fall back to CUDA when available, otherwise CPU
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # prompt plus generated tokens together are capped at 128 tokens
    sample(opt.start, opt.num_samples, 128-len(opt.start), opt.model_name, opt.ckpt_path, opt.data_root, opt.device)
--- a/hw4/code/train.py
+++ b/hw4/code/train.py
@@ -0,0 +1,219 @@
 import os
 import time
 import math
 import pickle
 from contextlib import nullcontext
 import argparse
 import numpy as np
 import torch
 from torch.utils.data import DataLoader
 from model import GPT, GPTConfig
 from dataset import LMDataset, Converter
 import matplotlib.pyplot as plt
 # learning rate decay scheduler (cosine with warmup)
 def get_lr(it, learning_rate, min_lr=1e-4, warmup_iters=100, lr_decay_iters=6000):
    """
    Learning rate for iteration `it`: linear warmup followed by cosine decay.
    it: current iteration (0-based)
    learning_rate: peak learning rate, reached at the end of the warmup
    min_lr: learning rate returned from `lr_decay_iters` onwards
    warmup_iters: number of linear warmup iterations
    lr_decay_iters: iteration at which the cosine decay reaches `min_lr`
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) from lr_decay_iters onwards, return the min learning rate
    #    (>= rather than > also avoids a 0/0 below when warmup_iters == lr_decay_iters)
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)
 def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, device='cpu', no_res=False, no_pos=False):
    """
    Train a GPT model from scratch for `n_iters` iterations.
    data_root: directory that holds train.json and val.json
    model_name: key into GPTConfig
    batch_size: training batch size
    n_iters: total number of optimizer steps
    ckpt_path: directory for latest.pth, best.pth and the loss plot (created if missing)
    val_interval: validate and write checkpoints every `val_interval` iterations
    device: torch device string; may carry an index (e.g. 'cuda:0')
    no_res / no_pos: forwarded to the GPT constructor
    """
    train_dataset = LMDataset(data_root, 'train')
    val_dataset = LMDataset(data_root, 'val')
    train_loader = DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    converter = Converter(train_dataset.stoi, train_dataset.itos)
    # adamw optimizer
    learning_rate = 5e-3 # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.99
    grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
    # system: autocast expects the bare device type ('cuda'), not an indexed device such as 'cuda:0'
    device_type = torch.device(device).type
    dtype = 'bfloat16' if device_type == 'cpu' else 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = nullcontext() if device_type in ('cpu', 'mps') else torch.autocast(device_type=device_type, dtype=ptdtype)
    best_val_loss = 1e9
    iter_num = 0 # number of iterations in the lifetime of this process
    # model init: work on a copy so the shared GPTConfig preset is not modified
    model_args = dict(GPTConfig[model_name])
    model_args['vocab_size'] = train_dataset.vocab_size
    model_args['max_seq_len'] = 128
    model_args['no_res'] = no_res
    model_args['no_pos'] = no_pos
    # init a new model from scratch
    print("Initializing a new model from scratch")
    model = GPT(**model_args)
    model.to(device)
    # gradient scaling is only used for float16 autocast on CUDA; otherwise the scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(device_type == 'cuda' and dtype == 'float16'))
    # optimizer
    optim_groups = model.configure_optimizers(weight_decay)
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))
    # make sure the checkpoint directory exists before the first save
    os.makedirs(ckpt_path, exist_ok=True)
    print('training...')
    # enough passes over the training data to reach n_iters iterations
    epoch_num = int(np.ceil(n_iters * int(batch_size) / float(len(train_dataset))))
    t0 = time.time()
    model.train()
    train_losses = []
    val_losses = []
    for _ in range(epoch_num):
        for inputs in train_loader:
            if iter_num >= n_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)
            # set the scheduled learning rate for this iteration
            lr = get_lr(iter_num, learning_rate, lr_decay_iters=n_iters)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            with ctx:
                _, loss = model(X, Y)
            # backward pass, with gradient scaling if training in fp16
            scaler.scale(loss).backward()
            # clip the gradient
            if grad_clip != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            # step the optimizer and scaler if training in fp16
            scaler.step(optimizer)
            scaler.update()
            # flush the gradients as soon as we can, no need for this memory anymore
            optimizer.zero_grad(set_to_none=True)
            iter_num += 1
            lossf = loss.item()
            train_losses.append(lossf)
            # evaluate the loss on the val set and write checkpoints
            if iter_num % val_interval == 0:
                # timing and logging
                t1 = time.time()
                dt = t1 - t0
                t0 = t1
                print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
                losses = estimate_loss(model, val_loader, converter, ctx, device)
                val_losses.append(losses['val'])
                print(f"iter {iter_num}: val loss {losses['val']:.4f}")
                # update the best loss first so the checkpoint records the current value
                is_best = losses['val'] < best_val_loss
                if is_best:
                    best_val_loss = losses['val']
                print(f"saving latest checkpoint to {ckpt_path}")
                checkpoint = {
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'model_args': model_args,
                        'iter_num': iter_num,
                        'best_val_loss': best_val_loss,
                    }
                torch.save(checkpoint, os.path.join(ckpt_path, 'latest.pth'))
                if is_best:
                    print(f"saving best checkpoint to {ckpt_path}")
                    torch.save(checkpoint, os.path.join(ckpt_path, 'best.pth'))
    plot(n_iters, train_losses, val_losses, val_interval, ckpt_path)
 def plot(n_iters, train_losses, val_losses, val_interval, ckpt_path):
    """
    Plot loss and perplexity (exp of the loss) curves and save them to
    `<ckpt_path>/loss&perplexity.jpg`, then show the figure.
    n_iters: kept for interface compatibility; the x axes are derived from the lists
    train_losses: one training loss per iteration
    val_losses: one validation loss per `val_interval` iterations
    """
    # create a plot
    fig, ax = plt.subplots(1,2,figsize=(18,6))
    # iteration numbers: training losses at 1, 2, ...; the k-th validation loss at k * val_interval.
    # Deriving both from the list lengths keeps x and y the same size for any n_iters.
    train_iters = np.arange(1, len(train_losses) + 1)
    val_iters = val_interval * np.arange(1, len(val_losses) + 1)
    # draw loss
    ax[0].plot(train_iters, train_losses)
    ax[0].plot(val_iters, val_losses, 'r')
    # set labels
    ax[0].set_xlabel('training iters')
    ax[0].legend(['training loss', 'validation loss'])
    train_perplexity = [np.exp(x) for x in train_losses]
    val_perplexity = [np.exp(x) for x in val_losses]
    # draw perplexity
    ax[1].plot(train_iters, train_perplexity)
    ax[1].plot(val_iters, val_perplexity, 'r')
    # set labels
    ax[1].set_xlabel('training iters')
    ax[1].legend(['training perplexity', 'validation perplexity'])
    plt.tight_layout()
    # save, then show the image
    plt.savefig(os.path.join(ckpt_path, 'loss&perplexity.jpg'), dpi=300)
    plt.show()
 # helps estimate an arbitrarily accurate loss over either split using many batches
@torch.no_grad()
def estimate_loss(model, val_loader, converter, ctx, device, max_iters=100):
    """Estimate the validation loss by averaging over at most ``max_iters`` batches.

    The model is switched to eval mode for the duration of the estimate and is
    always put back into train mode afterwards, even if a batch raises.

    Args:
        model: callable as ``model(X, Y)`` returning ``(logits, loss)``; must
            also provide ``eval()`` and ``train()``.
        val_loader: iterable yielding raw validation batches.
        converter: object whose ``encode(batch)`` returns the ``(X, Y)`` pair
            fed to the model; both must support ``.to(device)``.
        ctx: context manager entered around every forward pass
            (presumably an autocast/null context -- confirm with the caller).
        device: target passed to ``.to(...)`` for both tensors.
        max_iters: maximum number of batches to evaluate.

    Returns:
        dict with the single key ``'val'`` holding the mean loss over the
        batches that were actually evaluated, or NaN if there were none.
    """
    model.eval()
    total_loss = 0.0
    n_batches = 0
    try:
        for inputs in val_loader:
            if n_batches >= max_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)
            with ctx:
                _, loss = model(X, Y)
            total_loss += loss.item()
            n_batches += 1
    finally:
        # never leave the caller's model stuck in eval mode
        model.train()
    # Divide by the number of batches really seen: the loader may hold fewer
    # than max_iters batches, and dividing by max_iters would then
    # under-report the loss.
    mean_loss = total_loss / n_batches if n_batches else float('nan')
    return {'val': mean_loss}
 if __name__ == '__main__':
    # Script entry point: seed torch, parse the command line, make sure the
    # checkpoint directory exists, then launch training.

    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

    # set configurations of the model and training process
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='directory containing the training and validation data')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the pretrained model')
    parser.add_argument('--iters', type=int, default=1000, help='number of training iterations')
    parser.add_argument('--batchsize', type=int, default=16, help='training batch size')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to save checkpoints')
    parser.add_argument('--val_interval', type=int, default=20, help='iter intervals of validation')
    # both switches are opt-out flags: passing them turns the feature OFF
    parser.add_argument('--no_res', action='store_true', help='disable residual connections')
    parser.add_argument('--no_pos', action='store_true', help='disable positional encoding')
    parser.add_argument('--device', type=str, help='cpu or cuda (default: cuda if available, otherwise cpu)')
    opt = parser.parse_args()

    # pick the device automatically when none was given on the command line
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    os.makedirs(opt.ckpt_path, exist_ok=True)
    train(opt.data_root, opt.model_name, opt.batchsize, opt.iters, opt.ckpt_path, opt.val_interval, opt.device, opt.no_res, opt.no_pos)
--- a/hw4/report/dtx-style.sty
+++ b/hw4/report/dtx-style.sty
@@ -0,0 +1,132 @@
 %%
 %% This is file `dtx-style.sty',
 %% generated with the docstrip utility.
 %%
 %% The original source files were:
 %%
 %% thucoursework.dtx  (with options: `dtx-style')
 %% 
 %% This is a generated file.
 %% 
 %% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
 %% 
 %% This work may be distributed and/or modified under the
 %% conditions of the LaTeX Project Public License, either version 1.3
 %% of this license or (at your option) any later version.
 %% The latest version of this license is in
 %%   http://www.latex-project.org/lppl.txt
 %% and version 1.3 or later is part of all distributions of LaTeX
 %% version 2005/12/01 or later.
 %% 
 %% To produce the documentation run the original source files ending with `.dtx'
 %% through LaTeX.
 %% 
 \ProvidesPackage{dtx-style}
 \RequirePackage{hypdoc}
 \RequirePackage[UTF8,scheme=chinese]{ctex}
 \RequirePackage{newpxtext}
 \RequirePackage{newpxmath}
 \RequirePackage[
  top=2.5cm, bottom=2.5cm,
  left=4cm, right=2cm,
  headsep=3mm]{geometry}
 \RequirePackage{array,longtable,booktabs}
 \RequirePackage{listings}
 \RequirePackage{fancyhdr}
 \RequirePackage{xcolor}
 \RequirePackage{enumitem}
 \RequirePackage{etoolbox}
 \RequirePackage{metalogo}
 \colorlet{thu@macro}{blue!60!black}
 \colorlet{thu@env}{blue!70!black}
 \colorlet{thu@option}{purple}
 \patchcmd{\PrintMacroName}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
 \patchcmd{\PrintDescribeMacro}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
 \patchcmd{\PrintDescribeEnv}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
 \patchcmd{\PrintEnvName}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
 \def\DescribeOption{%
  \leavevmode\@bsphack\begingroup\MakePrivateLetters%
  \Describe@Option}
 \def\Describe@Option#1{\endgroup
  \marginpar{\raggedleft\PrintDescribeOption{#1}}%
  \thu@special@index{option}{#1}\@esphack\ignorespaces}
 \def\PrintDescribeOption#1{\strut \MacroFont\bfseries\sffamily\color{thu@option} #1\ }
 \def\thu@special@index#1#2{\@bsphack
  \begingroup
    \HD@target
    \let\HDorg@encapchar\encapchar
    \edef\encapchar usage{%
      \HDorg@encapchar hdclindex{\the\c@HD@hypercount}{usage}%
    }%
    \index{#2\actualchar{\string\ttfamily\space#2}
           (#1)\encapchar usage}%
    \index{#1:\levelchar#2\actualchar
           {\string\ttfamily\space#2}\encapchar usage}%
  \endgroup
  \@esphack}
 \lstdefinestyle{lstStyleBase}{%
   basicstyle=\small\ttfamily,
   aboveskip=\medskipamount,
   belowskip=\medskipamount,
   lineskip=0pt,
   boxpos=c,
   showlines=false,
   extendedchars=true,
   upquote=true,
   tabsize=2,
   showtabs=false,
   showspaces=false,
   showstringspaces=false,
   numbers=none,
   linewidth=\linewidth,
   xleftmargin=4pt,
   xrightmargin=0pt,
   resetmargins=false,
   breaklines=true,
   breakatwhitespace=false,
   breakindent=0pt,
   breakautoindent=true,
   columns=flexible,
   keepspaces=true,
   gobble=2,
   framesep=3pt,
   rulesep=1pt,
   framerule=1pt,
   backgroundcolor=\color{gray!5},
   stringstyle=\color{green!40!black!100},
   keywordstyle=\bfseries\color{blue!50!black},
   commentstyle=\slshape\color{black!60}}
 \lstdefinestyle{lstStyleShell}{%
   style=lstStyleBase,
   frame=l,
   rulecolor=\color{purple},
   language=bash}
 \lstdefinestyle{lstStyleLaTeX}{%
   style=lstStyleBase,
   frame=l,
   rulecolor=\color{violet},
   language=[LaTeX]TeX}
 \lstnewenvironment{latex}{\lstset{style=lstStyleLaTeX}}{}
 \lstnewenvironment{shell}{\lstset{style=lstStyleShell}}{}
 \setlist{nosep}
 \DeclareDocumentCommand{\option}{m}{\textsf{#1}}
 \DeclareDocumentCommand{\env}{m}{\texttt{#1}}
 \DeclareDocumentCommand{\pkg}{s m}{%
  \texttt{#2}\IfBooleanF#1{\thu@special@index{package}{#2}}}
 \DeclareDocumentCommand{\file}{s m}{%
  \texttt{#2}\IfBooleanF#1{\thu@special@index{file}{#2}}}
 \newcommand{\myentry}[1]{%
  \marginpar{\raggedleft\color{purple}\bfseries\strut #1}}
 \newcommand{\note}[2][Note]{{%
  \color{magenta}{\bfseries #1}\emph{#2}}}
 \def\thucoursework{\textsc{Thu}\-\textsc{Coursework}}
--- a/hw4/report/iidef.sty
+++ b/hw4/report/iidef.sty
@@ -0,0 +1,153 @@
 %%
 %% This is file `iidef.sty',
 %% generated with the docstrip utility.
 %%
 %% The original source files were:
 %%
 %% thucoursework.dtx  (with options: `sty')
 %% 
 %% This is a generated file.
 %% 
 %% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
 %% 
 %% This work may be distributed and/or modified under the
 %% conditions of the LaTeX Project Public License, either version 1.3
 %% of this license or (at your option) any later version.
 %% The latest version of this license is in
 %%   http://www.latex-project.org/lppl.txt
 %% and version 1.3 or later is part of all distributions of LaTeX
 %% version 2005/12/01 or later.
 %% 
 %% To produce the documentation run the original source files ending with `.dtx'
 %% through LaTeX.
 %% 
 \NeedsTeXFormat{LaTeX2e}[1999/12/01]
 \ProvidesClass{iidef}
 [2020/09/09 2.6 Tsinghua University Coursework Template]
 %% configuration of nested enumerate env
 \RequirePackage{enumitem}
 %% set hwcount key-value option
 \RequirePackage{kvoptions}
 %% required by macro DeclareMathOperator
 \RequirePackage{amsmath}
 %% Set up page headers using with fancyhdr
 \@ifundefined{lhead}{\RequirePackage{fancyhdr}}
 {\def\@thulhead{thulhead}}
 \RequirePackage{amsthm}
 %% semester
 \def\@term{term}
 \newcommand{\theterm}[1]{\renewcommand\@term{#1}}
 %% institute
 \newcommand{\@courseinstitute}[1]{institute}
 \newcommand{\thecourseinstitute}[1]{\renewcommand\@courseinstitute{#1}}
 %% coursename
 \newcommand{\@coursename}[1]{coursename}
 \newcommand{\thecoursename}[1]{\renewcommand\@coursename{\textsc{#1}}}
 %% user can rewrite homework name
 \def\@hwname{Homework}
 \def\hwname#1{\renewcommand\@hwname{#1}}
 %% \iidef@thehwcnt = 1
 \DeclareStringOption[1]{thehwcnt}
 \ProcessKeyvalOptions*
 \def\thehwcnt{\iidef@thehwcnt}
 %% page header setup, distinguish between first page(plain style)
 %% and second page on (runningpage style)
 %%***************************************************************************
 \newcommand{\courseheader}{
 \thispagestyle{plain}%first page use native plain style to suppress header
 \vspace*{-1in}
 \begin{center}
 \@courseinstitute\\
 \@coursename\\
 \@term
 \vspace*{0.1in}
 \hrule
 \end{center}
 \begin{center}
  \underline{\bf \@hwname\;\thehwcnt} \\
 \end{center}
 }
 \@ifundefined{@thulhead}{
 \fancypagestyle{runningpage}
 {
  \fancyhead[L]{\small\@coursename}
  \fancyhead[R]{\small\@courseinstitute}
 }
 %% use runningpage style from second page on
 \pagestyle{runningpage}
 }{}
 %% *********************************************************************************************
 %%name command macro
 %%*************************
 \newcommand{\name}[1]{
 \begin{flushleft}
  #1\hfill
  \today
 \end{flushleft}
 \hrule
 \vspace{2em}
 \flushleft
 }
 %%*************************
 %% enumitem related configuration
 \setlist[enumerate,1]{label=\thehwcnt.\arabic*.}
 \setlist[enumerate,2]{label=(\alph*)}
 \setlist[enumerate,3]{label=\roman*.}
 \setlist[enumerate,4]{label=\greek*}
 %%******************************
 \def\@slname{Solution}
 \def\slname#1{\renewcommand\@slname{#1}}
 \@ifundefined{solution}{
 \newenvironment{solution}
 {
 \proof[\@slname]
 }
 {
 %% no qed symbol in solution env
 \renewcommand{\qedsymbol}{}
 \endproof
 }
 }{}
 %%******************************
 %%common math symbols go here
 %%*************************************************
 \def\v#1{\underline{#1}}
 \newcommand{\uc}{\underline{c}}    % c, vec
 \newcommand{\uv}{\underline{v}}    % v, vec
 \newcommand{\uw}{\underline{w}}    % w, vec
 \newcommand{\ux}{\underline{x}}    % x, vec
 \newcommand{\uy}{\underline{y}}    % y, vec
 \newcommand{\uz}{\underline{z}}    % z, vec
 \newcommand{\um}{\underline{m}}    % m, vec
 \newcommand{\rvx}{\mathsf{x}}    % x, r.v.
 \newcommand{\rvy}{\mathsf{y}}    % y, r.v.
 \newcommand{\rvz}{\mathsf{z}}    % z, r.v.
 \newcommand{\rvw}{\mathsf{w}}    % w, r.v.
 \newcommand{\rvH}{\mathsf{H}}    % H, r.v.
 \newcommand{\urvx}{\underline{\mathsf{x}}}    % x, r.v. vec
 \newcommand{\urvy}{\underline{\mathsf{y}}}    % y, r.v. vec
 \newcommand{\urvz}{\underline{\mathsf{z}}}    % z, r.v. vec
 \newcommand{\urvw}{\underline{\mathsf{w}}}    % w, r.v. vec
 \newcommand{\defas}{\triangleq} %\coloneqq
 \newcommand{\reals}{\mathbb{R}}
 \newcommand{\TT}{\mathrm{T}}    % transpose
 \DeclareMathOperator*{\argmax}{arg\,max}
 \DeclareMathOperator*{\argmin}{arg\,min}
 \DeclareMathOperator*{\argsup}{arg\,sup}
 \DeclareMathOperator*{\arginf}{arg\,inf}
 \DeclareMathOperator{\diag}{diag}
 \DeclareMathOperator{\Var}{Var}
 \DeclareMathOperator{\Cov}{Cov}
 \DeclareMathOperator{\MSE}{MSE}
 \DeclareMathOperator{\1}{\mathds{1}}
 \DeclareMathOperator{\In}{\mathbb{I}}
 \DeclareMathOperator{\E}{\mathbb{E}}
 \DeclareMathOperator{\Prob}{\mathbb{P}}
 \newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
 \def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
 %%************************************************************************************
--- a/hw4/report/img/20240526_155701910_iOS.png
+++ b/hw4/report/img/20240526_155701910_iOS.png
--- a/hw4/report/img/attention_vis.png
+++ b/hw4/report/img/attention_vis.png
--- a/hw4/report/img/default_sample.txt
+++ b/hw4/report/img/default_sample.txt
@@ -0,0 +1,49 @@
 sample from workdirs/quansongci/best.pth
 +++水调歌头
 黄花满疏雨，月扫三宫。月明月明人去，绿绵声里，风光残霞。屈指两小天天静，绿满阶外，更相逢。那处得何曾小，泪断肠头。
 ---------------
 +++浣溪沙（五清）
 翠雾玉奁烘蝉。轻姿未放花光。青袍有客已暮花。
 天人未遇向西楼。小阳春水一线清。玉壶重重重。
 ---------------
 +++菩萨蛮（梅）
 江南窗前月远中花。水高远。暗还花色碧。只恨欢事清。
 楼上宴琼线。更欢归消息。柳边女碧云。便是天涯时。
 ---------------
 +++菩萨蛮
 江上秋移香无度。凉风闹愁风。莺声瘦了归时未。小楼闲愁忆。
 豆蔻风前好因缘。送通住。试问三山同。人间无处难。
 ---------------
 +++秦楼月
 练雨梳妆。桃叶半枝，冰肌红子春寒。半枝都奈。吹香飞絮，记清凉。
 无限夜云春风护。玉阑无数转。碎帽孤情君，小海东风。
 ---------------
 +++浪淘沙
 橘上园阳关路早。绿钗风雨散，犹被东湖见楼。
 仿佛风前坡上去日，月如流。想取东南风。犹慵尘尽比重归。
 ---------------
 +++诉衷情（高人）
 时候又来深。长是红帘前。醉眼风入春期。
 应是时时，何处在、应厮续。
 ---------------
 +++浣溪沙（咏梅）
 离斟客太白犹如。不知常是西篱中。岂怜旧君些儿以言。
 素娥小山小曲，水朝元有长安。一榻了共取大家。
 ---------------
 +++浣溪沙（和怀）
 纵图清露歌黛倚，寒题金銮声珊瑚。十年人来懒舞丝。
 ---------------
 +++满江月
 风月不如旧，柔条欲到春风。掩花间心，道处难臾、相逢。
 陇头情不物里，阿谁向娇几。且看东词，还明红云与，一笑认教梳灯。
 ---------------
--- a/hw4/report/img/no_pos_sample.txt
+++ b/hw4/report/img/no_pos_sample.txt
@@ -0,0 +1,49 @@
 sample from workdirs/quansongci_no_pos/best.pth
 ++++++++菩萨蛮（牡丹月近）
 江月明月明月桂华开客。金交风枝残月到东风前。天色浸柳前风垂杨花更觉。坐角雪初开花小屏。断断头春风光薄。
 春色悄。隔帘前阴转香千里。好破云深岸波波。不恨相思量。羞酌炉香何处。
 ---------------
 ++++浣溪沙
 清歌灯未无限。佳期时更传人不醉里，可奈有芳菲节懒。
 双蛾罗带向西楼。小小槛春寒人都怨，燕子未销眉花。
 ---------------
 ++++++++++++++++++++临江仙歌香花天
 九月桃源长风留春风投宴琼桃李仙。一曾东风迟丽女
 放萧词传天稼时常相逢，还记，酒，占春寒花间风光相住，月劝花往事，占春留思，应春风到上，无人间一线秀船归来，点面皱。□□□□□□□□□□□□。都为谁老还来
 ---------------
 ++++鹧鸪天（十二之二）
 此见元是一声砧。紫鹤收残梳匀舞、谁家。正是平樵春发，忍因缘凝理通。
 试语三岛不下，松径何处。问清将春愁易全窟，且识斗重阳。
 ---------------
 ++++浣溪沙（赋木犀）
 芙蓉水浮冰雪梅子。东风半枝都奈粉吹。飞落蕊满清凉。
 枝开夜忽春风护，玉阑凉痕转新碎香。有君恩多少载酒，且道有春风流。
 ---------------
 +++++++++++++++++++++++++++++++++++++++++++++南歌头香慵尘中柳梢青玉案（西江仙香花宫春令（与梅子
 绿碧梧桐梢落后西浣天云隐越山外、宿舟断乱，秋风露满庭芳菲节难过，紫。绿门好，十分飞燕子
 红，秋寒庭楼小西西风，春暮
 ---------------
 ++++++鹧鸪天（和坡衮侑觞）
 薰风须见前衢醉急风入鼎、花生绝团。不问何人公身口厮续厌
 春色肃熟燕子，无限是道行气东风吹。看雨起梦三年。想余春事断自愁厌君。
 ---------------
 ++++菩萨蛮（用时春）
 竹花梅犹道何人时节。西篱上花前红。吹落帽风光深。素娥小金。
 暮水朝秋寒。玉堂下梅花共取。小窗堂几举。从教著梅和雨。
 ---------------
 ++++++++++最仙歌子（和尉生查子题）
 绿阴山淡黄未泛湘神神仙，美酒，长唱玉纤纤纤手。元何穷何处重约，清寒食、酒家流光光渐、寄新春花晓，小院映烟微香，正是十年瑶楼酒，水暖花枝枝黄昏昏不语，乍见月寂寞痴愠痕、落醉，看花梢啼红裳篆拂堕风流。
 东风吹泪过，
 ---------------
 +++++++++++++点绛唇头春事近
 花艳心头道酒前春风雨，欲春惨，春去，深自有极目娇几粉，看春词，还爱红云归，绿杨花，旧谢去年时节节，十分真时及华明月。
 醉眼底莺声中秋光幸有豆皇子
 杏花开后黄梅梢仙子，且占客里春风吹乱。
 细雨过春风轻椒香闺催春，小离
 ---------------
--- a/hw4/report/img/no_pos_train.png
+++ b/hw4/report/img/no_pos_train.png
--- a/hw4/report/img/no_res_sample.txt
+++ b/hw4/report/img/no_res_sample.txt
@@ -0,0 +1,56 @@
 sample from workdirs/quansongci_no_res/best.pth
 +++藕上空都未。消
 ---------------
 +++。水。香，清干灯翠无月。佳
 ---------------
 +++烟
 莫。。一
 真。，。，手）+（。当，。，还花。
 。。饱）花清生失楼犹。拂念。。。
 +东+柳人。碧放萧似天天饮时
 ---------------
 +++，一+
 楼。。移。无度此
 ，+路风砧东
 ---------------
 +++，。常明香天。早。+。色。，大，梅子春上妆半枝。奈。吹。飞、，歌。阑故溪枝开夜忽春花。情，重凉痕转。碎沙相，君有园海。奈。
 。会
 ---------------
 +++。。晓宫。。园。+二盈
 钗。+。，恁尾。
 见楼风
 寿到+。尽+。日。。
 ---------------
 +++。看。月。
 （
 时衮红。自。意
 须去前。醉急风入鼎人花
 。团时。丹翁怨在身云厮。厌
 秋海花拟燕
 ，无共宿道行气东。，鸾+雨。梦，
 。。余采
 ---------------
 ++++俊去莺浮
 时重。+功太。犹。头（人一溪+者。斋算。旧
 ---------------
 +++，人花长和寞。。纵图清孔歌幽
 ---------------
 +++髻
 。+风与不，干
 柔
 。头余说。花
 。心头道。前，枕相
 。
 忘，情+物。自水极初。几晶
 看。词光。明红主与，。。认，旧。去
 户萨尽玉罢
 不时家。亭，行翠厚情青
 +中思难梦。底南星
 。自马
 黄
 我来
 ，中+。花
 禁，，也
 。花、。风儿。堂莺催旧，+离
 ---------------
--- a/hw4/report/img/no_res_train.png
+++ b/hw4/report/img/no_res_train.png
--- a/hw4/report/img/specific_start_sample.txt
+++ b/hw4/report/img/specific_start_sample.txt
@@ -0,0 +1,51 @@
 sample from workdirs/quansongci/best.pth
 +++清平乐（上赋）
 黄花小。相逢去。三得东风何处。人去去年年。谁与他年道。
 屈指两小天。留连心事。最思无意悠悠。无得何曾宽。
 ---------------
 +++清平乐
 京梅晚。几日一清声恶。无限作佳穷时。直见横户快愁儿。
 客已暮云梦，天人未老。心事有天涯无数。人都不须关，只是秋千千里。
 ---------------
 +++清平乐（春）
 红雪动。莫遣梅花开了。不解闲句中花妍。当时未问还近。
 一枝上晚妆清明。帘犹有清香样。欢事消息意迟。东郊飞后便好天。
 ---------------
 +++清平乐
 银烛斜阳。斜雨初飞。日日楼前草移。无限此情休住。
 小莺欲瘦收残梳。更有谁闲愁。却入豆蔻风前。因缘凝理通。
 ---------------
 +++清平乐
 江上秋波。一声歌舞。烟雨里常明烟雨。早来不见人归，犹唤梅子春去。
 好都奈。吹回飞飞来。清凉不知无限夜，春风护雨晚梁归。
 ---------------
 +++清平乐
 春光西去。桂花清扇。天上一声伤春晓。却被园花不尽早。
 钗边绿阴阴犹好。无计不知否。到少离愁去。谁知何处魂。
 ---------------
 +++清平乐（即回）
 六钱地遍。楼前作花间。春暮云愁。月高斜阳远。困红衣自醉。
 好去前时醉，风入泥袖。挼黄团时时问。怨在月明千片春水。
 ---------------
 +++清平乐
 晓来争觉。碧云花向楼。我似秋光也。花来日明月边莺怨。
 春不语飞花知。玉浆不枉劳和困。坐中岂共旧。
 ---------------
 +++清平乐
 残花晚。清闲鬓欲开。金盏一多时。菊花无计绪。娇花开花长。
 谁把酒醒清声。幽心到寒题酒。一片香淡得春人。懒捻黄金眉。
 ---------------
 +++清平乐（月明月）
 醉来人在。春知何时到花时。似来东风识，时时倍度。
 风月不识旧时春宵。万中说枉似、真心头道。前意追相逢。
 ---------------
--- a/hw4/report/img/train.png
+++ b/hw4/report/img/train.png
--- a/hw4/report/main.tex
+++ b/hw4/report/main.tex
@@ -0,0 +1,187 @@
 % Homework template for Inference and Information
 % UPDATE: September 26, 2017 by Xiangxiang
 \documentclass[a4paper]{article}
 \usepackage{ctex}
 \usepackage{amsmath, amssymb, amsthm}
 \usepackage{moreenum}
 \usepackage{mathtools}
 \usepackage{url}
 \usepackage{bm}
 \usepackage{enumitem}
 \usepackage{graphicx}
 \usepackage{listings}
 \usepackage{color}
 \usepackage{float}
 \newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
 \newfontfamily\cascadia{Cascadia Code}
 \lstset{
    basicstyle          =   \small\codefont,
    % ---
    tabsize             =   4,
    showstringspaces    =   false,
    numbers             =   left,
    numberstyle         =   \codefont,
    % ---
    breaklines          =   true,
    captionpos          =   t,      
    % ---
    frame               =   l,
    flexiblecolumns,
 }
 \lstdefinestyle{Python}{
    language        =   Python, % 语言选Python
    keywordstyle    =   \color{blue},
    keywordstyle    =   [2] \color{teal},
    stringstyle     =   \color{orange!80!black},
    commentstyle    =   \color{red},
    identifierstyle =   \color{blue!80!white},
 }
 \lstdefinestyle{Bash}{
    language        =   bash
 }
 \usepackage{subcaption}
 \usepackage{booktabs} % toprule
 \usepackage[mathcal]{eucal}
 \usepackage[thehwcnt = 4]{iidef}
 \thecourseinstitute{清华大学电子工程系}
 \thecoursename{\textbf{媒体与认知}}
 \theterm{2023-2024学年春季学期}
 \hwname{作业}
 \begin{document}
 \courseheader
 \name{高艺轩}
 \vspace{3mm}
 \centerline{\textbf{\Large{理论部分}}}
 \section{单选题（15分）}
 \subsection{\underline{D}}
 \subsection{\underline{A}}
 \subsection{\underline{A}}
 \subsection{\underline{C}}
 \subsection{\underline{B}}
 \section{计算题（15 分）}
 % 计算题1
 \subsection{隐含马尔可夫模型}
 \hspace{2em}暑假中，小E每天进行一项体育活动，包括跑步（R）、游泳（S）和打球（B），所选择的体育活动受某种潜在因素（如心情）的影响。小E每天把进行体育活动的照片发至微信朋友圈，我们可以根据观测信息推测该潜在因素的状态。
 \hspace{2em}假设该潜在因素分为$S_1$和$S_2$两种状态。在$S_1$时，小E选择三种体育活动的概率分别为0.6，0.2，0.2；在$S_2$时，小E选择三种体育活动的概率分别为0.1，0.6，0.3。
 \hspace{2em}该潜在因素的变化也有一定规律，若某天处于$S_1$的状态，第二天处于$S_1$和$S_2$的状态的概率分别为0.5，0.5；若某天处于$S_2$的状态，第二天处于$S_1$和$S_2$的状态的概率分别为0.6，0.4。
 \hspace{2em}暑假第一天处于$S_1$和$S_2$的状态的概率均为0.5。
 \vspace{3mm}
 (1) 采用隐含马尔可夫模型(HMM)对小E暑假体育活动安排进行建模，{\color{blue}请写出HMM对应的参数$\lambda=\{\pi, A, B\}$}。
 \begin{proof}[解]
    \[\pi = \begin{bmatrix}
        0.5\\0.5
    \end{bmatrix}\]
    \[A = \begin{bmatrix}
        0.5 & 0.5\\
        0.6 & 0.4\\
    \end{bmatrix}\]
    \[B = \begin{bmatrix}
        0.6 & 0.2 & 0.2\\
        0.1 & 0.6 & 0.3
    \end{bmatrix}\]
 \end{proof}
 \vspace{3mm}
 (2) 假设暑假第1、2、3天小E所进行的体育活动依次为跑步（R）、打球（B）和游泳（S），{\color{blue}请计算出现该观测序列的概率}。
 \begin{proof}[解]
    \begin{align*}
        \alpha_1(S_1) & = 0.5 \times 0.6 = 0.3\\
        \alpha_1(S_2) & = 0.5 \times 0.1 = 0.05\\
        \alpha_2(S_1) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.6) \times 0.2\\
        & = 0.036\\
        \alpha_2(S_2) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.4) \times 0.3\\
        & = 0.051\\
        \alpha_3(S_1) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.6) \times 0.2\\
        & = 0.00972\\
        \alpha_3(S_2) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.4) \times 0.6\\
        & = 0.02304\\
        P(O \mid \lambda) & = \alpha_3(S_1) + \alpha_3(S_2) = 0.03276\\
    \end{align*}
 \end{proof}
 \vspace{3mm}
 (3) 在(2)的条件下。{\color{blue}请利用Viterbi算法推测暑假第1、2、3天最可能的隐含状态序列}。
 \begin{figure}[H]
    \centering
    \includegraphics[width=\linewidth]{img/20240526_155701910_iOS.png}
 \end{figure}
 % 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
 \section{编程作业报告}
 \subsection{模型的训练与测试}
 首先进行数据预处理。预处理后进行模型训练，训练的结果见图\ref{fig:default_train}。
 \begin{figure}[H]
    \centering
    \includegraphics[width=\linewidth]{img/train.png}
    \caption{默认测试}
    \label{fig:default_train}
 \end{figure}
 默认配置的生成样本：
 \begin{lstlisting}
 python sample.py --ckpt_path workdirs/quansongci
 \end{lstlisting}
 得到的输出为
 \lstinputlisting{img/default_sample.txt}
 若指定初始文本：
 \begin{lstlisting}
 python sample.py --ckpt_path workdirs/quansongci --start +++清平乐
 \end{lstlisting}
 得到的输出为
 \lstinputlisting{img/specific_start_sample.txt}
 \subsection{探究位置编码和残差连接在模型中的作用}
 关闭位置编码的训练：
 \begin{figure}[H]
    \centering
    \includegraphics[width=\linewidth]{img/no_pos_train.png}
 \end{figure}
 得到的生成结果：
 \lstinputlisting{img/no_pos_sample.txt}
 可以看到，关闭位置编码后，模型无法很好地把握句子的长度与位置关系。
 关闭残差连接的训练：
 \begin{figure}[H]
    \centering
    \includegraphics[width=\linewidth]{img/no_res_train.png}
 \end{figure}
 得到的生成结果：
 \lstinputlisting{img/no_res_sample.txt}
 模型训练遇到了梯度消失的问题，很难有效地训练。
 \subsection{可视化}
 \begin{figure}[H]
    \centering
    \includegraphics[width=.8\linewidth]{img/attention_vis.png}
 \end{figure}
 许多词语的注意力系数都集中在题目的几个字上，可以看到模型主要关注词牌名与正文内容之间的相关性。
 \end{document}
 %%% Local Variables:
 %%% mode: latex
 %%% TeX-master: t
 %%% End:
--- a/j.ps1
+++ b/j.ps1
@@ -1 +1 @@
-cd ./hw3/code
+cd ./hw4/code
--- a/testtorch.ipynb
+++ b/testtorch.ipynb
@@ -10,7 +10,9 @@
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "\n",
-    "import torchvision.transforms as transforms"
+    "import torchvision.transforms as transforms\n",
    "\n",
    "import numpy as np"
   ]
  },
  {
@@ -212,6 +214,63 @@
    "b = torch.Tensor([1])\n",
    "print((a.T * b).T)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[False,  True,  True,  True,  True],\n",
      "        [False, False,  True,  True,  True],\n",
      "        [False, False, False,  True,  True],\n",
      "        [False, False, False, False,  True],\n",
      "        [False, False, False, False, False]])\n",
      "tensor([[-0.1170,  0.6130,  0.9644, -1.2733, -0.9671],\n",
      "        [-0.7806,  0.5082, -0.2731,  0.1660, -0.5451],\n",
      "        [-2.1527, -0.5059, -0.0079, -0.5796, -1.1107],\n",
      "        [-1.8357, -0.8010, -0.0424,  0.1491, -1.5009],\n",
      "        [-1.3666, -0.8209,  0.0483, -1.3165, -0.9222]])\n",
      "tensor([[-0.1170,    -inf,    -inf,    -inf,    -inf],\n",
      "        [-0.7806,  0.5082,    -inf,    -inf,    -inf],\n",
      "        [-2.1527, -0.5059, -0.0079,    -inf,    -inf],\n",
      "        [-1.8357, -0.8010, -0.0424,  0.1491,    -inf],\n",
      "        [-1.3666, -0.8209,  0.0483, -1.3165, -0.9222]])\n"
     ]
    }
   ],
   "source": [
    "mask = torch.triu(torch.ones(5, 5), diagonal=1).bool()\n",
    "print(mask)\n",
    "attn = torch.randn(5, 5)\n",
    "print(attn)\n",
    "print(attn.masked_fill(mask, -np.inf))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([0.1402, 0.2312, 0.6285])\n"
     ]
    }
   ],
   "source": [
    "Q = torch.Tensor([1, 0, 1, 1])\n",
    "K = torch.Tensor([[0, 0, 0, 2],\n",
    "                  [2, 0, 1, 0],\n",
    "                  [2, 1, 2, 1]])\n",
    "\n",
    "print(torch.softmax((Q @ K.T) / 2, dim=0))"
   ]
  }
 ],
 "metadata": {
		`@@ -0,0 +1,2 @@`
							`+++如梦令`
							`昨夜雨疏风骤。浓睡不消残酒。试问卷帘人，却道海棠依旧。知否。知否。应是绿肥红瘦。`