TA Release homework4.

2024-05-22 20:22:47 +08:00
parent c850f38778
commit c6b2420b85
12 changed files with 14707 additions and 0 deletions
--- a/hw4/code/attnvis.ipynb
+++ b/hw4/code/attnvis.ipynb
@@ -0,0 +1,104 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from contextlib import nullcontext\n",
    "import torch\n",
    "from model import GPTConfig, GPT\n",
    "from bertviz import head_view\n",
    "from dataset import Converter, LMDataset\n",
    "\n",
    "# set random seed for reproducibility\n",
    "seed = 2024\n",
    "torch.manual_seed(seed)\n",
    "torch.cuda.manual_seed(seed)\n",
    "torch.cuda.manual_seed_all(seed)\n",
    "torch.backends.cudnn.deterministic = True\n",
    "torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul\n",
    "torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn\n",
    "\n",
    "#################################################\n",
    "# \n",
    "model_name = 'mygpt'\n",
    "ckpt_path = 'workdirs/quansongci'\n",
    "data_root = 'data/quansongci'\n",
    "vis_text_path = 'data/vis/vis_1.txt'\n",
    "#################################################\n",
    "\n",
    "device = 'cpu'\n",
    "\n",
    "dataset = LMDataset(data_root, 'train')\n",
    "converter = Converter(dataset.stoi, dataset.itos)\n",
    "\n",
    "\n",
    "with open(vis_text_path, 'r', encoding='utf-8') as f:\n",
    "    start = f.read()\n",
    "start_ids = converter.single_encode(start)\n",
    "start_texts = [c for c in start]\n",
    "x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])\n",
    "print(f\"Input texts: {start}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0792738",
   "metadata": {},
   "outputs": [],
   "source": [
    "# model\n",
    "dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'\n",
    "ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]\n",
    "ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)\n",
    "# init from a model saved in a specific directory\n",
    "ckpt_path = os.path.join(ckpt_path, 'best.pth')\n",
    "print(\"loading model params from %s\"%ckpt_path)\n",
    "checkpoint = torch.load(ckpt_path, map_location=device)\n",
    "gptconf = GPTConfig[model_name]\n",
    "if 'model_args' in checkpoint:\n",
    "    gptconf = checkpoint['model_args']\n",
    "model = GPT(**gptconf)\n",
    "state_dict = checkpoint['state_dict']\n",
    "model.load_state_dict(state_dict)\n",
    "\n",
    "model.eval()\n",
    "model.to(device)\n",
    "\n",
    "# run a single forward pass to collect the attention weights of every layer\n",
    "with torch.no_grad():\n",
    "    with ctx:\n",
    "        _, attn_weights = model(x)\n",
    "\n",
    "head_view(attn_weights, start_texts)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/hw4/code/data/quansongci/data.json
+++ b/hw4/code/data/quansongci/data.json
--- a/hw4/code/data/vis/vis_1.txt
+++ b/hw4/code/data/vis/vis_1.txt
@@ -0,0 +1,2 @@
 +++如梦令
 昨夜雨疏风骤。浓睡不消残酒。试问卷帘人，却道海棠依旧。知否。知否。应是绿肥红瘦。
--- a/hw4/code/data/vis/vis_2.txt
+++ b/hw4/code/data/vis/vis_2.txt
@@ -0,0 +1,3 @@
 +++鹧鸪天（秋思）
 红蓼烟收尽暮鸦。秋风吹叶荻花秋。断肠天色连云散，黄叶荻花秋水流。
 楼上角，笛声悠。兴王莫上叹人头。明朝归去无消息，只有当时一望流。
--- a/hw4/code/dataset.py
+++ b/hw4/code/dataset.py
@@ -0,0 +1,75 @@
 import torch
 from torch.utils.data import Dataset
 import numpy as np
 import os
 import json
 class LMDataset(Dataset):
    """
    Dataset of raw samples read from ``<data_dir>/<split>.json``.

    The JSON file must provide the keys 'data', 'stoi', 'itos' and 'vocab_size';
    each of them is exposed as an attribute of the same name.
    """
    def __init__(self, data_dir, split):
        super().__init__()
        # the whole split lives in a single JSON file named after the split
        meta_path = os.path.join(data_dir, '%s.json'%split)
        with open(meta_path, 'r', encoding='utf-8') as fp:
            meta = json.load(fp)
        # list of samples
        self.data = meta['data']
        # dict that maps a character to its integer id
        self.stoi = meta['stoi']
        # dict that maps the string form of an integer id back to its character
        self.itos = meta['itos']
        # number of entries in the vocabulary
        self.vocab_size = meta['vocab_size']
    def __len__(self):
        # one item per stored sample
        return len(self.data)
    def __getitem__(self, index):
        # samples are returned unmodified; encoding is done by the caller
        sample = self.data[index]
        return sample
 class Converter:
    '''
    Convert strings to integer ids and back.

    Id 0 denotes the padding character '<pad>' and id 1 denotes the end of the
    sequence '<eos>'.
    '''
    PAD_ID = 0  # id used to right-pad shorter samples in a batch
    EOS_ID = 1  # id appended after the last character of every sample
    def __init__(self, stoi, itos):
        self.stoi = stoi # a dict that maps character to integer
        self.itos = itos # a dict that maps string of integer to character
    def single_encode(self, s):
        '''
        Encode one string into a 1-D int64 numpy array, one id per character.

        Raises KeyError if a character is not present in `stoi`.
        '''
        return np.array([self.stoi[ch] for ch in s], dtype=np.int64)
    def single_decode(self, l):
        '''
        Decode an iterable of integer ids into a string.

        Decoding stops at the first end-of-sequence id; the ids after it are ignored.
        '''
        chars = []
        for i in l:
            if i == self.EOS_ID:
                break
            # `itos` is keyed by the string form of the id
            chars.append(self.itos[str(i)])
        return ''.join(chars)
    def encode(self, data):
        '''
        Encode a list of strings into a pair of int64 tensors (x, y).

        Every sample is followed by the end-of-sequence id and right-padded with
        the padding id up to the longest sample of the batch. `x` is that matrix
        without its last column and `y` is the same matrix shifted left by one,
        i.e. the next-token targets of `x`. An empty list gives two (0, 0) tensors.
        '''
        # default=0 keeps an empty batch from crashing inside max()
        max_len = max((len(s) for s in data), default=0)
        out = np.full((len(data), max_len + 1), self.PAD_ID, dtype=np.int64)
        for row, s in enumerate(data):
            out[row, :len(s)] = self.single_encode(s)
            out[row, len(s)] = self.EOS_ID
        x = torch.from_numpy(out[:, :-1])
        y = torch.from_numpy(out[:, 1:])
        return x, y
    def decode(self, data):
        '''
        Decode a batch tensor of integer ids into a list of strings, one per row.
        '''
        rows = data.cpu().numpy().astype(np.int64)
        return [self.single_decode(row) for row in rows]
--- a/hw4/code/model.py
+++ b/hw4/code/model.py
@@ -0,0 +1,356 @@
 # ========================================================
 #             Media and Cognition
 #             Homework 4  Sequence Modeling
 #             model.py - Model definition
 #             Student ID:
 #             Name:
 #             Tsinghua University
 #             (C) Copyright 2024
 # ========================================================
 # Import required libraries
 ############################################################
 import math
 import torch
 import torch.nn as nn 
 from torch.nn import functional as F
 import numpy as np
 ############################################################
 # Define the GELU activation function used in OpenAI GPT
 ############################################################
 def gelu(z):
    """
    Tanh approximation of the Gaussian Error Linear Unit, applied element-wise.

    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    gelu(z) = 0.5 * z * (1 + tanh(sqrt(2/pi) * (z + 0.044715 * z^3)))
    """
    scale = math.sqrt(2.0 / math.pi)
    inner = z + 0.044715 * torch.pow(z, 3.0)
    gate = 1.0 + torch.tanh(scale * inner)
    return 0.5 * z * gate
 ############################################################
 # Define the Multi-Head SelfAttention module
 ############################################################
 class SelfAttention(nn.Module):
    """
    Multi-head causal self-attention (homework template).

    The `???` placeholders in `forward` are the parts students have to fill in
    (TODO 1); this class does not run until they are completed.
    """
    def __init__(self, embed_dim, num_head, dropout):
        super().__init__()
        # define three linear layers for q, k, v generation separately
        self.q_layer = nn.Linear(embed_dim, embed_dim)
        self.k_layer = nn.Linear(embed_dim, embed_dim)
        self.v_layer = nn.Linear(embed_dim, embed_dim)
        # define the projection layer for output
        self.proj_layer = nn.Linear(embed_dim, embed_dim)
        # define the dropout layer for attention and output calculation
        self.attn_drop = nn.Dropout(dropout)
        self.proj_drop = nn.Dropout(dropout)
        self.num_head = num_head
        # integer division: embed_dim is expected to be a multiple of num_head
        self.head_dim = embed_dim // num_head
    def forward(self, x):
        """
        Compute causal multi-head self-attention for `x`.

        `x` is unpacked as (batch_size, seq_len, dim). Returns the tuple
        (result, attn): the projected output and the attention weights, whose
        intended shapes are given in the step comments below.
        """
        batch_size, seq_len, dim = x.shape
        # >>> TODO 1: complete the forward process of the Multi-Head SelfAttention module.
        # Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
        # the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
        # num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
        q = ???
        k = ???
        v = ???
        # Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
        # first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
        q = ???
        k = ???
        v = ???
        # then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dim of q, k, v for matrix multiplication
        # the shape of q, k, v from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
        q = ???
        k = ???
        v = ???
        # Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
        # Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
        # the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
        attn = ???
        # Step 1.3.2: for an Auto-Regressive Language Model, the predictions for position i can depend only on the inputs at positions up to and including i.
        # Therefore, a mask is used to prevent positions from attending to subsequent positions
        # attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is a strictly upper triangular matrix with shape (seq_len, seq_len)
        # Hint:
        # use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
        attn_mask = ???
        # use torch.triu(?, diagonal=1) to obtain an upper triangular matrix where elements on the main diagonal are 0
        attn_mask = ???
        # use Tensor.bool() to convert the matrix to a boolean matrix
        attn_mask = ???
        # fill the position of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
        attn = ???
        # Step 1.3.3: normalize `attn` via softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
        attn = ???
        # Step 1.3.4: apply dropout to `attn` via self.attn_drop()
        attn = ???
        # Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
        # the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
        out = ???
        # Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate output of different heads
        # the shape of `out` from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
        out = ???
        # Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
        result = ??? 
        # <<< TODO 1
        # return the final results `result` and attention weights `attn`
        return result, attn
 ############################################################
 # Define the feed forward network (FFN)
 ############################################################
 class FFN(nn.Module):
    """Feed forward network: Linear -> GELU -> Linear -> Dropout."""
    def __init__(self, embed_dim, feedforward_dim, dropout):
        super().__init__()
        # expand to the hidden width, then project back to the embedding width
        self.fc1 = nn.Linear(embed_dim, feedforward_dim)
        self.fc2 = nn.Linear(feedforward_dim, embed_dim)
        # dropout is applied once, on the output of the second linear layer
        self.drop = nn.Dropout(dropout)
    def forward(self, x):
        hidden = gelu(self.fc1(x))
        return self.drop(self.fc2(hidden))
 ############################################################
 # Define the TransformerLayer
 ############################################################
 class TransformerLayer(nn.Module):
    """
    One pre-norm transformer layer: self-attention followed by a feed forward network (homework template).

    The `???` placeholders in `forward` are the parts students have to fill in
    (TODO 2); this class does not run until they are completed.
    """
    def __init__(self, embed_dim, num_head, feedforward_dim, dropout, no_res):
        super().__init__()
        # LayerNorm applied to the input of the attention sub-layer
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = SelfAttention(embed_dim, num_head, dropout)
        # LayerNorm applied to the input of the feed forward sub-layer
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, feedforward_dim, dropout)
        self.no_res = no_res # True disables the residual connections (see the steps in forward)
    def forward(self, x):
        """
        Apply the attention and feed forward sub-layers to `x`.

        Returns the tuple (out, attn): the layer output and the attention
        weights produced by `self.attn`.
        """
        # >>> TODO 2: complete the forward process of the TransformerLayer module.
        # Step 2.1: calculate the output of multi-head self-attention
        # normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
        x_norm = ???
        # calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
        x_attn, attn = ???
        # add the input `x` to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
        if ???:
            x_attn = ???
        # Step 2.2: calculate the output of feed forward network
        # calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
        x_ffn = ???
        # add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
        if ???:
            out = ???
        else:
            out = ???
        # <<< TODO 2
        return out, attn
 ############################################################
 # Define the GPT module
 ############################################################
 class GPT(nn.Module):
    def __init__(self, vocab_size, max_seq_len, num_layer, embed_dim, num_head, feedforward_dim, dropout, no_res=False, no_pos=False):
        '''
            Build the embeddings, the stack of transformer layers and the prediction head.

            vocab_size: the size of vocabulary
            max_seq_len: the maximum length of input texts (size of the position embedding table)
            num_layer: the number of transformer layers
            embed_dim: the embedding dimension
            num_head: the number of heads in Multi-Head Self Attention
            feedforward_dim: the dimension in the feed forward network
            dropout: dropout ratio
            no_res: if True, the transformer layers do NOT use residual connections
            no_pos: if True, position embeddings are NOT added to the token embeddings
        '''
        super().__init__()
        self.num_layer = num_layer
        self.max_seq_len = max_seq_len
        self.no_pos = no_pos
        # Define Embedding Layer to transfer input text tokens and positions to embeddings
        self.word_token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.word_pos_embedding = nn.Embedding(max_seq_len, embed_dim)
        self.drop = nn.Dropout(dropout)
        # Define the transformer layers
        self.transformer = nn.ModuleList([TransformerLayer(embed_dim, num_head, feedforward_dim, dropout, no_res) for _ in range(num_layer)])
        # Define the head layer to predict output
        self.norm = nn.LayerNorm(embed_dim)
        self.language_model_head = nn.Linear(embed_dim, vocab_size, bias=False)
        """
        Weight tying improves the performance of language models by tying (sharing) the weights of the embedding and softmax layers.
        Reference: https://paperswithcode.com/method/weight-tying
        """
        # after this assignment both modules hold the very same Parameter object
        self.word_token_embedding.weight = self.language_model_head.weight
        # (re-)initialize all weights once every sub-module has been created
        self.init_weights()
    def init_weights(self):
        """
        Initialize the parameters of all sub-modules in place.

        Linear and Embedding weights are drawn from N(0, 0.02^2) and Linear biases
        are zeroed. Parameters whose name ends with 'proj_layer.weight' are then
        re-drawn with the standard deviation scaled by 1/sqrt(2 * num_layer).
        """
        base_std = 0.02
        for module in self.modules():
            if not isinstance(module, (nn.Linear, nn.Embedding)):
                continue
            torch.nn.init.normal_(module.weight, mean=0.0, std=base_std)
            # only Linear layers can carry a bias, and it may be disabled
            if isinstance(module, nn.Linear) and module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        # apply special scaled init to the residual projections, per GPT-2 paper
        proj_std = base_std/math.sqrt(2 * self.num_layer)
        for name, param in self.named_parameters():
            if name.endswith('proj_layer.weight'):
                torch.nn.init.normal_(param, mean=0.0, std=proj_std)
    def forward(self, word_idx, targets=None):
        """
        Run the language model on a batch of token ids (homework template, TODO 3).

        `word_idx` is unpacked as (batch_size, seq_len). When `targets` is given,
        returns (logits, loss) where the loss is the cross entropy that ignores
        target id 0; otherwise returns (logits, attention_weights), the latter
        being a list with one entry per transformer layer.
        """
        batch_size, seq_len = word_idx.shape
        # >>> TODO 3: complete the forward process of GPT
        # Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1]
        pos = ???
        # Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to transfer `word_idx` and `pos` to embeddings (`token_embed` and `pos_embed`)
        token_embed = ???
        pos_embed = ???
        # Step 3.3: initialize the input embeddings `x` of transformer layers
        # add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
        if ???:
            x = ???
        else:
            x = ???
        # apply dropout to the input embeddings via `self.drop()`
        x = ???
        # Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
        # define a list `attention_weights` and append the attention weights of each transformer layer into the list
        attention_weights = ??? 
        for ???:
            # Step 3.4.1: obtain the output and attention weights of transformer layers
            x, attn = ???
            # Step 3.4.2: append the attention weights of transformer layers into the list `attention_weights`
            ???
        # Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
        # self.language_model_head() is a linear layer defined in __init__() function
        # Note: do not add softmax here since it is included in the cross entropy loss function
        x = ???
        logits = ???
        # <<< TODO 3
        # return logits and loss or attention weights
        if targets is not None:
            # flatten batch and time so every position is one classification sample; id 0 is ignored in the loss
            loss = torch.nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=0)
            return logits, loss
        assert isinstance(attention_weights, list), "attention_weights must be a list, please check whether to append the attention weights of all transformer layers into it!"
        return logits, attention_weights
    def configure_optimizers(self, weight_decay):
        """
        Split the parameters into a weight-decayed and a non-decayed group.

        Weights of Linear layers are decayed with `weight_decay`; all biases and
        the weights of LayerNorm/Embedding layers are not decayed. Returns a list
        of two parameter-group dicts ("params", "weight_decay") that can be passed
        to a PyTorch optimizer; the optimizer itself is created by the caller.
        """
        decayed_types = (nn.Linear, )
        undecayed_types = (nn.LayerNorm, torch.nn.Embedding)
        decay, no_decay = set(), set()
        for mn, m in self.named_modules():
            # named_parameters() is recursive, so a tensor is visited once per
            # ancestor module; it is only classified at the module that owns it
            for pn, _ in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                    continue
                if not pn.endswith('weight'):
                    continue
                if isinstance(m, decayed_types):
                    decay.add(fpn)
                elif isinstance(m, undecayed_types):
                    no_decay.add(fpn)
        # 'word_token_embedding.weight' and 'language_model_head.weight' are tied, so the
        # same tensor lands in no_decay and decay respectively. named_parameters() reports
        # a shared tensor only once, under 'word_token_embedding.weight', so the head's
        # entry is dropped: the tensor is optimized via the embedding name and not decayed.
        decay.remove('language_model_head.weight')
        # validate that we considered every parameter
        param_dict = dict(self.named_parameters())
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )
        # sorted names give a deterministic parameter order inside each group
        decay_params = [param_dict[pn] for pn in sorted(decay)]
        no_decay_params = [param_dict[pn] for pn in sorted(no_decay)]
        optim_groups = [
            {"params": decay_params, "weight_decay": weight_decay},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]
        return optim_groups
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        idx: LongTensor of shape (b, t) holding the prompt token ids
        max_new_tokens: number of tokens to sample and append
        temperature: the logits are divided by this value before the softmax
        top_k: if given, only the top_k most likely tokens can be sampled

        Returns a numpy array with the prompt followed by the sampled tokens. The
        batch dimension is dropped when b == 1, so a single prompt always yields a
        1-D array.
        """
        for _ in range(max_new_tokens):
            # the position embedding table only has max_seq_len entries, so keep at
            # most the last max_seq_len tokens as context
            if idx.size(1) > self.max_seq_len:
                idx_cond = idx[:, -self.max_seq_len:]
            else:
                idx_cond = idx
            # forward the model to get the logits for the index in the sequence
            logits, _attn = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _indices = torch.topk(logits, min(top_k, logits.size(-1)))
                # v[:, [-1]] is the k-th largest logit of every row
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
        # squeeze only the batch dimension: a bare squeeze() would also collapse a
        # length-1 sequence into a 0-d array that cannot be iterated
        return idx.squeeze(0).cpu().numpy()
 ############################################################
 # Architecture presets, keyed by model name. Every entry lists the keyword arguments of
 # GPT that do not depend on the data; feedforward_dim is always four times embed_dim.
 GPTConfig = {
    'mygpt': {
        'num_layer': 4, 'embed_dim': 128, 'num_head': 4, 'feedforward_dim': 4 * 128, 'dropout': 0.0,
    },
    'gpt2-mini': {
        'num_layer': 6, 'embed_dim': 384, 'num_head': 6, 'feedforward_dim': 4 * 384, 'dropout': 0.2,
    },
    'gpt2': {
        'num_layer': 12, 'embed_dim': 768, 'num_head': 12, 'feedforward_dim': 4 * 768, 'dropout': 0.2,
    },
 }
--- a/hw4/code/prepare.py
+++ b/hw4/code/prepare.py
@@ -0,0 +1,61 @@
 """
 Prepare the dataset for character-level language modeling.
 So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
 """
 import os
 import numpy as np
 import argparse
 import json
 # command line: the only option is the directory that holds data.json
 parser = argparse.ArgumentParser()
 parser.add_argument('--data_root', type=str, default='data/quansongci', help='data directory')
 args = parser.parse_args()
 # read the raw samples
 input_file_path = os.path.join(args.data_root, 'data.json')
 with open(input_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)['data']
 print(f"length of dataset: {len(data):,}")
 # the vocabulary is every distinct character of the corpus, in sorted order
 chars = sorted(set(''.join(data)))
 vocab_size = len(chars) + 2 # for <pad> and <eos>
 print("all the unique characters:", ''.join(chars))
 print(f"vocab size: {vocab_size:,}")
 # character <-> integer mappings; ids 0 and 1 are reserved for <pad> and <eos>,
 # so real characters start at 2
 stoi = {}
 itos = {}
 for char_id, ch in enumerate(chars, start=2):
    stoi[ch] = char_id
    itos[char_id] = ch
 stoi['<pad>'] = 0
 itos[0] = '<pad>'
 stoi['<eos>'] = 1
 itos[1] = '<eos>'
 # the first 90% of the samples are used for training, the rest for validation
 n = len(data)
 split_at = int(n*0.9)
 train_data = data[:split_at]
 val_data = data[split_at:]
 print(f"train has {len(train_data):,} samples")
 print(f"val has {len(val_data):,} samples")
 # every split file also stores the vocabulary, to help us encode/decode later
 for out_name, split_data in (('train.json', train_data), ('val.json', val_data)):
    split_meta = {
        'data': split_data,
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }
    with open(os.path.join(args.data_root, out_name), 'w', encoding='utf-8') as f:
        json.dump(split_meta, f, ensure_ascii=False, indent=4)
--- a/hw4/code/sample.py
+++ b/hw4/code/sample.py
@@ -0,0 +1,76 @@
 """
 Sample from a trained model
 """
 import os
 import pickle
 from contextlib import nullcontext
 import torch
 from model import GPTConfig, GPT
 import argparse
 from dataset import Converter, LMDataset
 def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root, device):
    """
    Load the best checkpoint and print `num_samples` texts generated from the prompt `start`.

    start: prompt string; every character must exist in the training vocabulary
    num_samples: number of independent samples to print
    max_new_tokens: number of tokens generated after the prompt for each sample
    model_name: key of GPTConfig, only used when the checkpoint stores no 'model_args'
    ckpt_path: directory that contains 'best.pth'
    data_root: directory of the dataset; its train split provides the vocabulary
    device: device string such as 'cpu', 'cuda' or 'cuda:0'
    """
    dataset = LMDataset(data_root, 'train')
    converter = Converter(dataset.stoi, dataset.itos)
    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
    # model
    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    # autocast wants the bare device type ('cuda'), not an indexed string such as 'cuda:0'
    device_type = torch.device(device).type
    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_path)
    checkpoint = torch.load(ckpt_path, map_location=device)
    if 'model_args' in checkpoint:
        gptconf = checkpoint['model_args']
    else:
        # the presets of GPTConfig lack the two data-dependent arguments of GPT;
        # max_seq_len mirrors the value the training script uses
        gptconf = dict(GPTConfig[model_name], vocab_size=dataset.vocab_size, max_seq_len=128)
    model = GPT(**gptconf)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    model.to(device)
    # encode the beginning of the prompt and add the batch dimension
    start_ids = converter.single_encode(start)
    x = torch.from_numpy(start_ids)[None, ...].to(device).long()
    # run generation
    with torch.no_grad():
        with ctx:
            for _ in range(num_samples):
                y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
                print(converter.single_decode(y))
                print('---------------')
 if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
    # set configurations of the model and sampling process
    parser = argparse.ArgumentParser()
    parser.add_argument('--start', type=str, default='+++', help='start of the sample, e.g. "+++" or "+++清平乐"')
    # an int option gets an int default (it used to be the string '10')
    parser.add_argument('--num_samples', type=int, default=10, help='the number of samples')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to load checkpoints')
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--device', type=str, help='cpu or cuda')
    opt = parser.parse_args()
    # fall back to the GPU when one is available
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # prompt plus generated tokens are budgeted to 128 tokens in total;
    # never hand a negative token count to the sampler
    max_new_tokens = max(0, 128 - len(opt.start))
    sample(opt.start, opt.num_samples, max_new_tokens, opt.model_name, opt.ckpt_path, opt.data_root, opt.device)
--- a/hw4/code/train.py
+++ b/hw4/code/train.py
@@ -0,0 +1,219 @@
 import os
 import time
 import math
 import pickle
 from contextlib import nullcontext
 import argparse
 import numpy as np
 import torch
 from torch.utils.data import DataLoader
 from model import GPT, GPTConfig
 from dataset import LMDataset, Converter
 import matplotlib.pyplot as plt
 # learning rate decay scheduler (cosine with warmup)
 def get_lr(it, learning_rate, min_lr=1e-4, warmup_iters=100, lr_decay_iters=6000):
    """
    Learning rate at iteration `it` for a cosine schedule with linear warmup.

    it: current iteration (0-based)
    learning_rate: peak learning rate, reached at the end of the warmup
    min_lr: learning rate at and after `lr_decay_iters`
    warmup_iters: number of iterations of the linear warmup from 0 to `learning_rate`
    lr_decay_iters: iteration at which the cosine decay reaches `min_lr`

    Returns the learning rate as a float.
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) at or beyond lr_decay_iters, return min learning rate. `>=` gives the same
    #    value as the cosine at it == lr_decay_iters and avoids a division by zero
    #    below when warmup_iters == lr_decay_iters
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate;
    #    here warmup_iters <= it < lr_decay_iters, so decay_ratio is in [0, 1)
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)
 def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, device='cpu', no_res=False, no_pos=False):
    """
    Train a GPT model from scratch, validating and checkpointing every `val_interval` iterations.

    data_root: directory that holds the train/val splits of the dataset
    model_name: key of GPTConfig that selects the architecture preset
    batch_size: number of samples per training batch
    n_iters: total number of optimizer steps
    ckpt_path: directory that receives 'latest.pth', 'best.pth' and the loss plot
    val_interval: number of iterations between two validations
    device: device string such as 'cpu', 'cuda' or 'cuda:0'
    no_res: forwarded to GPT; True disables residual connections
    no_pos: forwarded to GPT; True disables position embeddings
    """
    train_dataset = LMDataset(data_root, 'train')
    val_dataset = LMDataset(data_root, 'val')
    train_loader = DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    converter = Converter(train_dataset.stoi, train_dataset.itos)
    # torch.save() does not create missing directories
    os.makedirs(ckpt_path, exist_ok=True)
    # adamw optimizer
    learning_rate = 5e-3 # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.99
    grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
    # system
    # autocast wants the bare device type ('cuda'), not an indexed string such as 'cuda:0'
    device_type = torch.device(device).type
    dtype = 'bfloat16' if device_type == 'cpu' else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)
    best_val_loss = 1e9
    iter_num = 0 # number of iterations in the lifetime of this process
    # model init: work on a copy so the shared GPTConfig preset is not modified
    model_args = dict(GPTConfig[model_name])
    model_args['vocab_size'] = train_dataset.vocab_size
    model_args['max_seq_len'] = 128
    model_args['no_res'] = no_res
    model_args['no_pos'] = no_pos
    # init a new model from scratch
    print("Initializing a new model from scratch")
    model = GPT(**model_args)
    model.to(device)
    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
    # optimizer
    optim_groups = model.configure_optimizers(weight_decay)
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))
    print('training...')
    # number of passes over the training set needed to reach n_iters steps
    epoch_num = int(np.ceil(n_iters * int(batch_size) / float(len(train_dataset))))
    t0 = time.time()
    model.train()
    train_losses = []
    val_losses = []
    for _ in range(epoch_num):
        if iter_num >= n_iters:
            break
        for inputs in train_loader:
            if iter_num >= n_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)
            lr = get_lr(iter_num, learning_rate, lr_decay_iters=n_iters)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
            # forward pass, under autocast when not on the cpu
            with ctx:
                _, loss = model(X, Y)
            # backward pass, with gradient scaling if training in fp16
            scaler.scale(loss).backward()
            # clip the gradient
            if grad_clip != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            # step the optimizer and scaler if training in fp16
            scaler.step(optimizer)
            scaler.update()
            # flush the gradients as soon as we can, no need for this memory anymore
            optimizer.zero_grad(set_to_none=True)
            iter_num += 1
            lossf = loss.item()
            train_losses.append(lossf)
            # evaluate the loss on the val set and write checkpoints
            if iter_num % val_interval == 0:
                # timing and logging
                t1 = time.time()
                dt = t1 - t0
                t0 = t1
                print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
                losses = estimate_loss(model, val_loader, converter, ctx, device)
                val_losses.append(losses['val'])
                print(f"iter {iter_num}: val loss {losses['val']:.4f}")
                # update the running best first so the checkpoints store the current value
                is_best = losses['val'] < best_val_loss
                if is_best:
                    best_val_loss = losses['val']
                print(f"saving latest checkpoint to {ckpt_path}")
                checkpoint = {
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'model_args': model_args,
                        'iter_num': iter_num,
                        'best_val_loss': best_val_loss,
                    }
                torch.save(checkpoint, os.path.join(ckpt_path, 'latest.pth'))
                if is_best:
                    print(f"saving best checkpoint to {ckpt_path}")
                    torch.save(checkpoint, os.path.join(ckpt_path, 'best.pth'))
    plot(n_iters, train_losses, val_losses, val_interval, ckpt_path)
 def plot(n_iters, train_losses, val_losses, val_interval, ckpt_path):
    """
    Plot loss and perplexity curves, save them to `ckpt_path` and show the figure.

    n_iters: total number of training iterations (kept for compatibility; the
        x axes are derived from the lengths of the loss lists)
    train_losses: one training loss per iteration
    val_losses: one validation loss per validation, taken every `val_interval` iterations
    val_interval: number of iterations between two validations
    ckpt_path: directory that receives 'loss&perplexity.jpg'
    """
    # create a plot
    f, ax = plt.subplots(1,2,figsize=(18,6))
    # x positions in 1-based iteration numbers: the k-th validation runs after
    # iteration k * val_interval. Deriving them from len(val_losses) keeps x and y
    # the same length even if n_iters is not a multiple of val_interval.
    train_iters = np.arange(1, len(train_losses) + 1)
    val_iters = val_interval * np.arange(1, len(val_losses) + 1)
    # draw loss
    ax[0].plot(train_iters, train_losses)
    ax[0].plot(val_iters, val_losses, 'r')
    # set labels
    ax[0].set_xlabel('training iters')
    ax[0].legend(['training loss', 'validation loss'])
    # perplexity is the exponential of the cross entropy loss
    train_perplexity = np.exp(np.asarray(train_losses, dtype=float))
    val_perplexity = np.exp(np.asarray(val_losses, dtype=float))
    # draw perplexity
    ax[1].plot(train_iters, train_perplexity)
    ax[1].plot(val_iters, val_perplexity, 'r')
    # set labels
    ax[1].set_xlabel('training iters')
    ax[1].legend(['training perplexity', 'validation perplexity'])
    plt.tight_layout()
    # save before show(), then display the image
    plt.savefig(os.path.join(ckpt_path, 'loss&perplexity.jpg'), dpi=300)
    plt.show()
 # helps estimate an arbitrarily accurate loss over either split using many batches
@torch.no_grad()
def estimate_loss(model, val_loader, converter, ctx, device, max_iters=100):
    """Estimate the mean validation loss over at most ``max_iters`` batches.

    The model is switched to eval mode for the measurement and switched back
    to train mode afterwards, even if a batch raises.

    Args:
        model: callable invoked as ``model(X, Y)`` that returns a
            ``(logits, loss)`` pair; must also provide ``eval()`` and ``train()``.
        val_loader: iterable of raw validation batches.
        converter: object whose ``encode(batch)`` returns the ``(X, Y)`` pair
            fed to the model; both must support ``.to(device)``.
        ctx: context manager entered around every forward pass.
        device: target passed to ``.to(...)`` for the inputs and targets.
        max_iters: maximum number of batches to evaluate.

    Returns:
        dict with a single key ``'val'`` holding the mean loss over the
        batches that were actually evaluated, or NaN if there were none.
    """
    model.eval()
    total_loss = 0.0
    n_batches = 0
    try:
        for inputs in val_loader:
            if n_batches >= max_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)
            with ctx:
                _, loss = model(X, Y)
            total_loss += loss.item()
            n_batches += 1
    finally:
        # never leave the caller's model stuck in eval mode
        model.train()
    # Average over the batches really seen: dividing by max_iters would
    # under-report the loss whenever the loader is shorter than max_iters.
    mean_loss = total_loss / n_batches if n_batches else float('nan')
    return {'val': mean_loss}
 if __name__ == '__main__':
    # Script entry point: seed the RNGs, parse the command line, make sure the
    # checkpoint directory exists, then hand everything over to train().

    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
    # set configurations of the model and training process
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the pretrained model')
    parser.add_argument('--iters', type=int, default=1000, help='number of training iterations')
    parser.add_argument('--batchsize', type=int, default=16, help='training batch size')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to save checkpoints')
    parser.add_argument('--val_interval', type=int, default=20, help='iter intervals of validation')
    # The two switches below are opt-out flags: the feature is on unless the flag is given.
    parser.add_argument('--no_res', action='store_true', help='disable residual connections (used unless this flag is given)')
    parser.add_argument('--no_pos', action='store_true', help='disable positional encoding (used unless this flag is given)')
    parser.add_argument('--device', type=str, help='cpu or cuda (default: cuda if available, otherwise cpu)')
    opt = parser.parse_args()
    # no explicit device: prefer the GPU when one is usable
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs(opt.ckpt_path, exist_ok=True)
    train(opt.data_root, opt.model_name, opt.batchsize, opt.iters, opt.ckpt_path, opt.val_interval, opt.device, opt.no_res, opt.no_pos)
--- a/hw4/report/dtx-style.sty
+++ b/hw4/report/dtx-style.sty
@@ -0,0 +1,132 @@
 %%
 %% This is file `dtx-style.sty',
 %% generated with the docstrip utility.
 %%
 %% The original source files were:
 %%
 %% thucoursework.dtx  (with options: `dtx-style')
 %% 
 %% This is a generated file.
 %% 
 %% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
 %% 
 %% This work may be distributed and/or modified under the
 %% conditions of the LaTeX Project Public License, either version 1.3
 %% of this license or (at your option) any later version.
 %% The latest version of this license is in
 %%   http://www.latex-project.org/lppl.txt
 %% and version 1.3 or later is part of all distributions of LaTeX
 %% version 2005/12/01 or later.
 %% 
 %% To produce the documentation run the original source files ending with `.dtx'
 %% through LaTeX.
 %% 
 \ProvidesPackage{dtx-style}
 \RequirePackage{hypdoc}
 \RequirePackage[UTF8,scheme=chinese]{ctex}
 \RequirePackage{newpxtext}
 \RequirePackage{newpxmath}
 \RequirePackage[
  top=2.5cm, bottom=2.5cm,
  left=4cm, right=2cm,
  headsep=3mm]{geometry}
 \RequirePackage{array,longtable,booktabs}
 \RequirePackage{listings}
 \RequirePackage{fancyhdr}
 \RequirePackage{xcolor}
 \RequirePackage{enumitem}
 \RequirePackage{etoolbox}
 \RequirePackage{metalogo}
 \colorlet{thu@macro}{blue!60!black}
 \colorlet{thu@env}{blue!70!black}
 \colorlet{thu@option}{purple}
 \patchcmd{\PrintMacroName}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
 \patchcmd{\PrintDescribeMacro}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
 \patchcmd{\PrintDescribeEnv}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
 \patchcmd{\PrintEnvName}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
 \def\DescribeOption{%
  \leavevmode\@bsphack\begingroup\MakePrivateLetters%
  \Describe@Option}
 \def\Describe@Option#1{\endgroup
  \marginpar{\raggedleft\PrintDescribeOption{#1}}%
  \thu@special@index{option}{#1}\@esphack\ignorespaces}
 \def\PrintDescribeOption#1{\strut \MacroFont\bfseries\sffamily\color{thu@option} #1\ }
 \def\thu@special@index#1#2{\@bsphack
  \begingroup
    \HD@target
    \let\HDorg@encapchar\encapchar
    \edef\encapchar usage{%
      \HDorg@encapchar hdclindex{\the\c@HD@hypercount}{usage}%
    }%
    \index{#2\actualchar{\string\ttfamily\space#2}
           (#1)\encapchar usage}%
    \index{#1:\levelchar#2\actualchar
           {\string\ttfamily\space#2}\encapchar usage}%
  \endgroup
  \@esphack}
 \lstdefinestyle{lstStyleBase}{%
   basicstyle=\small\ttfamily,
   aboveskip=\medskipamount,
   belowskip=\medskipamount,
   lineskip=0pt,
   boxpos=c,
   showlines=false,
   extendedchars=true,
   upquote=true,
   tabsize=2,
   showtabs=false,
   showspaces=false,
   showstringspaces=false,
   numbers=none,
   linewidth=\linewidth,
   xleftmargin=4pt,
   xrightmargin=0pt,
   resetmargins=false,
   breaklines=true,
   breakatwhitespace=false,
   breakindent=0pt,
   breakautoindent=true,
   columns=flexible,
   keepspaces=true,
   gobble=2,
   framesep=3pt,
   rulesep=1pt,
   framerule=1pt,
   backgroundcolor=\color{gray!5},
   stringstyle=\color{green!40!black!100},
   keywordstyle=\bfseries\color{blue!50!black},
   commentstyle=\slshape\color{black!60}}
 \lstdefinestyle{lstStyleShell}{%
   style=lstStyleBase,
   frame=l,
   rulecolor=\color{purple},
   language=bash}
 \lstdefinestyle{lstStyleLaTeX}{%
   style=lstStyleBase,
   frame=l,
   rulecolor=\color{violet},
   language=[LaTeX]TeX}
 \lstnewenvironment{latex}{\lstset{style=lstStyleLaTeX}}{}
 \lstnewenvironment{shell}{\lstset{style=lstStyleShell}}{}
 \setlist{nosep}
 \DeclareDocumentCommand{\option}{m}{\textsf{#1}}
 \DeclareDocumentCommand{\env}{m}{\texttt{#1}}
 \DeclareDocumentCommand{\pkg}{s m}{%
  \texttt{#2}\IfBooleanF#1{\thu@special@index{package}{#2}}}
 \DeclareDocumentCommand{\file}{s m}{%
  \texttt{#2}\IfBooleanF#1{\thu@special@index{file}{#2}}}
 \newcommand{\myentry}[1]{%
  \marginpar{\raggedleft\color{purple}\bfseries\strut #1}}
 \newcommand{\note}[2][Note]{{%
  \color{magenta}{\bfseries #1}\emph{#2}}}
 \def\thucoursework{\textsc{Thu}\-\textsc{Coursework}}
--- a/hw4/report/iidef.sty
+++ b/hw4/report/iidef.sty
@@ -0,0 +1,153 @@
 %%
 %% This is file `iidef.sty',
 %% generated with the docstrip utility.
 %%
 %% The original source files were:
 %%
 %% thucoursework.dtx  (with options: `sty')
 %% 
 %% This is a generated file.
 %% 
 %% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
 %% 
 %% This work may be distributed and/or modified under the
 %% conditions of the LaTeX Project Public License, either version 1.3
 %% of this license or (at your option) any later version.
 %% The latest version of this license is in
 %%   http://www.latex-project.org/lppl.txt
 %% and version 1.3 or later is part of all distributions of LaTeX
 %% version 2005/12/01 or later.
 %% 
 %% To produce the documentation run the original source files ending with `.dtx'
 %% through LaTeX.
 %% 
 \NeedsTeXFormat{LaTeX2e}[1999/12/01]
 %% This file is a package (.sty), so it must identify itself with
 %% \ProvidesPackage; \ProvidesClass would record the version under iidef.cls.
 \ProvidesPackage{iidef}
 [2020/09/09 2.6 Tsinghua University Coursework Template]
 %% configuration of nested enumerate env
 \RequirePackage{enumitem}
 %% set hwcount key-value option
 \RequirePackage{kvoptions}
 %% required by macro DeclareMathOperator
 \RequirePackage{amsmath}
 %% Set up page headers using with fancyhdr
 \@ifundefined{lhead}{\RequirePackage{fancyhdr}}
 {\def\@thulhead{thulhead}}
 \RequirePackage{amsthm}
 %% semester
 \def\@term{term}
 \newcommand{\theterm}[1]{\renewcommand\@term{#1}}
 %% institute
 %% The placeholder takes no argument: \@courseinstitute is always used bare,
 %% so a mandatory argument would swallow the token that follows it whenever
 %% \thecourseinstitute has not been called.
 \newcommand{\@courseinstitute}{institute}
 %% \thecourseinstitute{<text>} stores <text> as the institute name.
 \newcommand{\thecourseinstitute}[1]{\renewcommand\@courseinstitute{#1}}
 %% coursename
 %% The placeholder takes no argument: \@coursename is always used bare,
 %% so a mandatory argument would swallow the token that follows it whenever
 %% \thecoursename has not been called.
 \newcommand{\@coursename}{coursename}
 %% \thecoursename{<text>} stores <text>, set in small caps, as the course name.
 \newcommand{\thecoursename}[1]{\renewcommand\@coursename{\textsc{#1}}}
 %% user can rewrite homework name
 \def\@hwname{Homework}
 \def\hwname#1{\renewcommand\@hwname{#1}}
 %% \iidef@thehwcnt = 1
 \DeclareStringOption[1]{thehwcnt}
 \ProcessKeyvalOptions*
 \def\thehwcnt{\iidef@thehwcnt}
 %% page header setup, distinguish between first page(plain style)
 %% and second page on (runningpage style)
 %%***************************************************************************
 \newcommand{\courseheader}{
 \thispagestyle{plain}%first page use native plain style to suppress header
 \vspace*{-1in}
 \begin{center}
 \@courseinstitute\\
 \@coursename\\
 \@term
 \vspace*{0.1in}
 \hrule
 \end{center}
 \begin{center}
  \underline{\bf \@hwname\;\thehwcnt} \\
 \end{center}
 }
 \@ifundefined{@thulhead}{
 \fancypagestyle{runningpage}
 {
  \fancyhead[L]{\small\@coursename}
  \fancyhead[R]{\small\@courseinstitute}
 }
 %% use runningpage style from second page on
 \pagestyle{runningpage}
 }{}
 %% *********************************************************************************************
 %%name command macro
 %%*************************
 \newcommand{\name}[1]{
 \begin{flushleft}
  #1\hfill
  \today
 \end{flushleft}
 \hrule
 \vspace{2em}
 \flushleft
 }
 %%*************************
 %% enumitem related configuration
 %% Labels of nested enumerate environments, outermost level first.
 %% level 1: <homework number>.<item number>.
 \setlist[enumerate,1]{label=\thehwcnt.\arabic*.}
 %% level 2: (a), (b), ...
 \setlist[enumerate,2]{label=(\alph*)}
 %% level 3: i., ii., ...
 \setlist[enumerate,3]{label=\roman*.}
 %% level 4: NOTE(review): \greek is not defined by enumitem or by anything
 %% else loaded in this file (presumably it comes from moreenum) -- confirm
 %% the document loads it before a fourth-level list is used.
 \setlist[enumerate,4]{label=\greek*}
 %%******************************
 \def\@slname{Solution}
 \def\slname#1{\renewcommand\@slname{#1}}
 \@ifundefined{solution}{
 \newenvironment{solution}
 {
 \proof[\@slname]
 }
 {
 %% no qed symbol in solution env
 \renewcommand{\qedsymbol}{}
 \endproof
 }
 }{}
 %%******************************
 %%common math symbols go here
 %%*************************************************
 \def\v#1{\underline{#1}}
 \newcommand{\uc}{\underline{c}}    % c, vec
 \newcommand{\uv}{\underline{v}}    % v, vec
 \newcommand{\uw}{\underline{w}}    % w, vec
 \newcommand{\ux}{\underline{x}}    % x, vec
 \newcommand{\uy}{\underline{y}}    % y, vec
 \newcommand{\uz}{\underline{z}}    % z, vec
 \newcommand{\um}{\underline{m}}    % m, vec
 \newcommand{\rvx}{\mathsf{x}}    % x, r.v.
 \newcommand{\rvy}{\mathsf{y}}    % y, r.v.
 \newcommand{\rvz}{\mathsf{z}}    % z, r.v.
 \newcommand{\rvw}{\mathsf{w}}    % w, r.v.
 \newcommand{\rvH}{\mathsf{H}}    % H, r.v.
 \newcommand{\urvx}{\underline{\mathsf{x}}}    % x, r.v. vec
 \newcommand{\urvy}{\underline{\mathsf{y}}}    % y, r.v. vec
 \newcommand{\urvz}{\underline{\mathsf{z}}}    % z, r.v. vec
 \newcommand{\urvw}{\underline{\mathsf{w}}}    % w, r.v. vec
 \newcommand{\defas}{\triangleq} %\coloneqq
 \newcommand{\reals}{\mathbb{R}}
 \newcommand{\TT}{\mathrm{T}}    % transpose
 \DeclareMathOperator*{\argmax}{arg\,max}
 \DeclareMathOperator*{\argmin}{arg\,min}
 \DeclareMathOperator*{\argsup}{arg\,sup}
 \DeclareMathOperator*{\arginf}{arg\,inf}
 \DeclareMathOperator{\diag}{diag}
 \DeclareMathOperator{\Var}{Var}
 \DeclareMathOperator{\Cov}{Cov}
 \DeclareMathOperator{\MSE}{MSE}
 %% NOTE(review): the operators below rely on math alphabets that this file
 %% does not load itself: \mathds (presumably from dsfont) and \mathbb
 %% (presumably from amsfonts/amssymb). Confirm the document loads them
 %% before any of these macros is used.
 \DeclareMathOperator{\1}{\mathds{1}}
 \DeclareMathOperator{\In}{\mathbb{I}}
 \DeclareMathOperator{\E}{\mathbb{E}}
 \DeclareMathOperator{\Prob}{\mathbb{P}}
 \newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
 \def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
 %%************************************************************************************
--- a/hw4/report/main.tex
+++ b/hw4/report/main.tex
@@ -0,0 +1,100 @@
 % Homework template for Inference and Information
 % UPDATE: September 26, 2017 by Xiangxiang
 \documentclass[a4paper]{article}
 \usepackage{ctex}
 \usepackage{amsmath, amssymb, amsthm}
 \usepackage{moreenum}
 \usepackage{mathtools}
 \usepackage{url}
 \usepackage{bm}
 \usepackage{enumitem}
 \usepackage{graphicx}
 \usepackage{listings}
 \usepackage{color}
 \lstset{
    basicstyle          =   \sffamily,          % base font for code
    keywordstyle        =   \bfseries,          % keyword style
    commentstyle        =   \rmfamily\itshape,  % comment style: italic
    stringstyle         =   \ttfamily,  % string style
    flexiblecolumns,                % original note: "don't ask why, just add this"
    numbers             =   left,   % line numbers on the left
    showspaces          =   false,  % whether to show spaces; showing them looks messy, so it is off
    numberstyle         =   \zihao{-5}\ttfamily,    % line-number style: \zihao{-5} size, monospaced tt font
    showstringspaces    =   false,
    captionpos          =   t,      % where the listing caption goes; t means top
    frame               =   lrtb,   % draw a frame around the listing
 }
 \lstdefinestyle{Python}{
    language        =   Python, % language: Python
    basicstyle      =   \zihao{-5}\ttfamily,
    numberstyle     =   \zihao{-5}\ttfamily,
    keywordstyle    =   \color{blue},
    keywordstyle    =   [2] \color{teal},
    stringstyle     =   \color{magenta},
    commentstyle    =   \color{red}\ttfamily,
    breaklines      =   true,   % automatic line breaking; avoid very long lines anyway
    columns         =   fixed,  % without this the character spacing is not fixed and looks ugly; required
    basewidth       =   0.5em,
 }
 \usepackage{subcaption}
 \usepackage{booktabs} % toprule
 \usepackage[mathcal]{eucal}
 \usepackage[thehwcnt = 4]{iidef}
 \thecourseinstitute{清华大学电子工程系}
 \thecoursename{\textbf{媒体与认知}}
 \theterm{2023-2024学年春季学期}
 \hwname{作业}
 \begin{document}
 \courseheader
 \name{YOUR NAME}
 \vspace{3mm}
 \centerline{\textbf{\Large{理论部分}}}
 \section{单选题（15分）}
 \subsection{\underline{?}}
 \subsection{\underline{?}}
 \subsection{\underline{?}}
 \subsection{\underline{?}}
 \subsection{\underline{?}}
 \section{计算题（15 分）}
 % 计算题1
 \subsection{隐含马尔可夫模型}
 \hspace{2em}暑假中，小E每天进行一项体育活动，包括跑步（R）、游泳（S）和打球（B），所选择的体育活动受某种潜在因素（如心情）的影响。小E每天把进行体育活动的照片发至微信朋友圈，我们可以根据观测信息推测该潜在因素的状态。
 \hspace{2em}假设该潜在因素分为$S_1$和$S_2$两种状态。在$S_1$时，小E选择三种体育活动的概率分别为0.6，0.2，0.2；在$S_2$时，小E选择三种体育活动的概率分别为0.1，0.6，0.3。
 \hspace{2em}该潜在因素的变化也有一定规律，若某天处于$S_1$的状态，第二天处于$S_1$和$S_2$的状态的概率分别为0.5，0.5；若某天处于$S_2$的状态，第二天处于$S_1$和$S_2$的状态的概率分别为0.6，0.4。
 \hspace{2em}暑假第一天处于$S_1$和$S_2$的状态的概率均为0.5。
 \vspace{3mm}
 (1) 采用隐含马尔可夫模型(HMM)对小E暑假体育活动安排进行建模，{\color{blue}请写出HMM对应的参数$\lambda=\{\pi, A, B\}$}。
 \vspace{3mm}
 (2) 假设暑假第1、2、3天小E所进行的体育活动依次为跑步（R）、打球（B）和游泳（S），{\color{blue}请计算出现该观测序列的概率}。
 \vspace{3mm}
 (3) 在(2)的条件下，{\color{blue}请利用Viterbi算法推测暑假第1、2、3天最可能的隐含状态序列}。
 % 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
 \section{编程作业报告}
 \section{自选课题工作进度汇报}
 \end{document}
 %%% Local Variables:
 %%% mode: latex
 %%% TeX-master: t
 %%% End:
		`@@ -0,0 +1,2 @@`
							`+++如梦令`
							`昨夜雨疏风骤。浓睡不消残酒。试问卷帘人，却道海棠依旧。知否。知否。应是绿肥红瘦。`