TA Release homework4.
This commit is contained in:
104
hw4/code/attnvis.ipynb
Normal file
104
hw4/code/attnvis.ipynb
Normal file
@@ -0,0 +1,104 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from contextlib import nullcontext\n",
|
||||
"import torch\n",
|
||||
"from model import GPTConfig, GPT\n",
|
||||
"from bertviz import head_view\n",
|
||||
"from dataset import Converter, LMDataset\n",
|
||||
"\n",
|
||||
"# set random seed for reproducibility\n",
|
||||
"seed = 2024\n",
|
||||
"torch.manual_seed(seed)\n",
|
||||
"torch.cuda.manual_seed(seed)\n",
|
||||
"torch.cuda.manual_seed_all(seed)\n",
|
||||
"torch.backends.cudnn.deterministic = True\n",
|
||||
"torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul\n",
|
||||
"torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn\n",
|
||||
"\n",
|
||||
"#################################################\n",
|
||||
"# \n",
|
||||
"model_name = 'mygpt'\n",
|
||||
"ckpt_path = 'workdirs/quansongci'\n",
|
||||
"data_root = 'data/quansongci'\n",
|
||||
"vis_text_path = 'data/vis/vis_1.txt'\n",
|
||||
"#################################################\n",
|
||||
"\n",
|
||||
"device = 'cpu'\n",
|
||||
"\n",
|
||||
"dataset = LMDataset(data_root, 'train')\n",
|
||||
"converter = Converter(dataset.stoi, dataset.itos)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"with open(vis_text_path, 'r', encoding='utf-8') as f:\n",
|
||||
" start = f.read()\n",
|
||||
"start_ids = converter.single_encode(start)\n",
|
||||
"start_texts = [c for c in start]\n",
|
||||
"x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])\n",
|
||||
"print(f\"Input texts: {start}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c0792738",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# model\n",
|
||||
"dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'\n",
|
||||
"ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]\n",
|
||||
"ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)\n",
|
||||
"# init from a model saved in a specific directory\n",
|
||||
"ckpt_path = os.path.join(ckpt_path, 'best.pth')\n",
|
||||
"print(\"loading model params from %s\"%ckpt_path)\n",
|
||||
"checkpoint = torch.load(ckpt_path, map_location=device)\n",
|
||||
"gptconf = GPTConfig[model_name]\n",
|
||||
"if 'model_args' in checkpoint:\n",
|
||||
" gptconf = checkpoint['model_args']\n",
|
||||
"model = GPT(**gptconf)\n",
|
||||
"state_dict = checkpoint['state_dict']\n",
|
||||
"model.load_state_dict(state_dict)\n",
|
||||
"\n",
|
||||
"model.eval()\n",
|
||||
"model.to(device)\n",
|
||||
"\n",
|
||||
"# run a single forward pass to collect the attention weights\n",
|
||||
"with torch.no_grad():\n",
|
||||
" with ctx:\n",
|
||||
" _, attn_weights = model(x)\n",
|
||||
"\n",
|
||||
"head_view(attn_weights, start_texts)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
13426
hw4/code/data/quansongci/data.json
Normal file
13426
hw4/code/data/quansongci/data.json
Normal file
File diff suppressed because it is too large
Load Diff
2
hw4/code/data/vis/vis_1.txt
Normal file
2
hw4/code/data/vis/vis_1.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
+++如梦令
|
||||
昨夜雨疏风骤。浓睡不消残酒。试问卷帘人,却道海棠依旧。知否。知否。应是绿肥红瘦。
|
||||
3
hw4/code/data/vis/vis_2.txt
Normal file
3
hw4/code/data/vis/vis_2.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
+++鹧鸪天(秋思)
|
||||
红蓼烟收尽暮鸦。秋风吹叶荻花秋。断肠天色连云散,黄叶荻花秋水流。
|
||||
楼上角,笛声悠。兴王莫上叹人头。明朝归去无消息,只有当时一望流。
|
||||
75
hw4/code/dataset.py
Normal file
75
hw4/code/dataset.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import torch
|
||||
from torch.utils.data import Dataset
|
||||
import numpy as np
|
||||
import os
|
||||
import json
|
||||
|
||||
class LMDataset(Dataset):
    """Dataset of text samples read from `<data_dir>/<split>.json`.

    The JSON file must provide the keys 'data', 'stoi', 'itos' and
    'vocab_size'; each of them is exposed as an attribute of the same name.
    """

    def __init__(self, data_dir, split):
        """
        data_dir: directory that contains the `<split>.json` files
        split: name of the split to load, e.g. 'train' or 'val'
        """
        super().__init__()
        meta_path = os.path.join(data_dir, f'{split}.json')
        with open(meta_path, 'r', encoding='utf-8') as handle:
            meta = json.load(handle)

        # data:       list of samples
        # stoi:       dict that maps character to integer
        # itos:       dict that maps string of integer to character
        # vocab_size: vocab size
        for key in ('data', 'stoi', 'itos', 'vocab_size'):
            setattr(self, key, meta[key])

    def __len__(self):
        """Return the number of samples in this split."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at position `index`."""
        return self.data[index]
|
||||
|
||||
class Converter:
    '''
    This class helps us convert strings to integers and back
    We use "0" to denote the padding character '<pad>', and "1" to denote the end of the sequence '<eos>'
    '''
    def __init__(self, stoi, itos):
        self.stoi = stoi  # a dict that maps character to integer
        self.itos = itos  # a dict that maps string of integer to character

    def single_encode(self, s):
        '''
        encode one string into a 1-D int64 numpy array, one integer per character
        raises KeyError if a character is missing from `self.stoi`
        '''
        return np.array([self.stoi[ch] for ch in s], dtype=np.int64)

    def single_decode(self, l):
        '''
        decode one sequence of integers into a string
        decoding stops at the first end-of-sequence integer (value 1)
        '''
        chars = []
        for i in l:
            # if we meet the end of the sequence (the value of integer is equal to 1), break
            if i == 1:
                break
            # the keys of `itos` are strings of integers, so convert before the lookup
            chars.append(self.itos[str(i)])
        # join once instead of growing a string character by character
        return ''.join(chars)

    def encode(self, data):
        '''
        encode a list of strings into integers

        every string is followed by <eos> (1) and padded with <pad> (0) up to the longest string;
        returns (x, y) where y is x shifted left by one position, both of shape (len(data), max_len);
        an empty list gives two empty tensors of shape (0, 0)
        '''
        lens = [len(s) for s in data]
        # default=0 keeps an empty batch from raising ValueError in max()
        max_len = max(lens, default=0)
        out = np.zeros((len(data), max_len + 1), dtype=np.int64)
        for row, s in enumerate(data):
            out[row, :len(s)] = self.single_encode(s)
            out[row, len(s)] = 1
        x = torch.from_numpy(out[:, :-1])
        y = torch.from_numpy(out[:, 1:])
        return x, y

    def decode(self, data):
        '''
        decode a batch of integer sequences (a tensor) into a list of strings
        '''
        data = data.cpu().numpy().astype(np.int64)
        return [self.single_decode(row) for row in data]
|
||||
356
hw4/code/model.py
Normal file
356
hw4/code/model.py
Normal file
@@ -0,0 +1,356 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 4 Sequence Modeling
|
||||
# model.py - Model definition
|
||||
# Student ID:
|
||||
# Name:
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
|
||||
# Import required libraries
|
||||
############################################################
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import functional as F
|
||||
import numpy as np
|
||||
|
||||
############################################################
|
||||
|
||||
# Define the GELU activation function used in OpenAI GPT
|
||||
############################################################
|
||||
def gelu(z):
    """
    GELU activation, tanh approximation.

    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    gelu(z) = 0.5z(1 + tanh[(2/π)^(1/2) * (z + 0.044715 z^3)])
    """
    # argument of the tanh: sqrt(2/pi) * (z + 0.044715 * z^3)
    inner = math.sqrt(2.0 / math.pi) * (z + 0.044715 * torch.pow(z, 3.0))
    return 0.5 * z * (1.0 + torch.tanh(inner))
|
||||
|
||||
############################################################
|
||||
|
||||
# Define the Multi-Head SelfAttention module
|
||||
############################################################
|
||||
class SelfAttention(nn.Module):
|
||||
|
||||
def __init__(self, embed_dim, num_head, dropout):
|
||||
super().__init__()
|
||||
|
||||
# define three linear layers for q, k, v generation separately
|
||||
self.q_layer = nn.Linear(embed_dim, embed_dim)
|
||||
self.k_layer = nn.Linear(embed_dim, embed_dim)
|
||||
self.v_layer = nn.Linear(embed_dim, embed_dim)
|
||||
|
||||
# define the projection layer for output
|
||||
self.proj_layer = nn.Linear(embed_dim, embed_dim)
|
||||
|
||||
# define the dropout layer for attention and output calculation
|
||||
self.attn_drop = nn.Dropout(dropout)
|
||||
self.proj_drop = nn.Dropout(dropout)
|
||||
|
||||
self.num_head = num_head
|
||||
self.head_dim = embed_dim // num_head
|
||||
|
||||
def forward(self, x):
|
||||
|
||||
batch_size, seq_len, dim = x.shape
|
||||
|
||||
# >>> TODO 1: complete the forward process of the Multi-Head SelfAttention module.
|
||||
# Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
|
||||
# the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
|
||||
# num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
|
||||
q = ???
|
||||
k = ???
|
||||
v = ???
|
||||
|
||||
# Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
|
||||
# first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
|
||||
q = ???
|
||||
k = ???
|
||||
v = ???
|
||||
|
||||
# then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dim of q, k, v for matrix multiplication
|
||||
# the shape of q, k, v from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
|
||||
q = ???
|
||||
k = ???
|
||||
v = ???
|
||||
|
||||
# Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
|
||||
# Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
|
||||
# the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
|
||||
attn = ???
|
||||
|
||||
# Step 1.3.2: for Auto-Regressive Language Model, the predictions for position i can depend only on the input at positions less than i.
|
||||
# Therefore, a mask is used to prevent positions from attending to subsequent positions
|
||||
# attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is an upper triangular matrix with shape (seq_len, seq_len)
|
||||
# Hint:
|
||||
# use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
|
||||
attn_mask = ???
|
||||
# use torch.triu(?, diagonal=1) to obtain an upper triangular matrix where elements on the main diagonal are 0
|
||||
attn_mask = ???
|
||||
# use Tensor.bool() to convert the matrix to a boolean matrix
|
||||
attn_mask = ???
|
||||
# fill the position of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
|
||||
attn = ???
|
||||
|
||||
# Step 1.3.3: normalize `attn` via softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
|
||||
attn = ???
|
||||
# Step 1.3.4: apply dropout to `attn` via self.attn_drop()
|
||||
attn = ???
|
||||
# Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
|
||||
# the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
|
||||
out = ???
|
||||
|
||||
# Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate output of different heads
|
||||
# the shape of `out` from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
|
||||
out = ???
|
||||
|
||||
# Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
|
||||
result = ???
|
||||
# <<< TODO 1
|
||||
|
||||
# return the final results `result` and attention weights `attn`
|
||||
return result, attn
|
||||
|
||||
############################################################
|
||||
|
||||
# Define the feed forward network (FFN)
|
||||
############################################################
|
||||
class FFN(nn.Module):
    """Feed forward network: Linear -> GELU -> Linear -> Dropout."""

    def __init__(self, embed_dim, feedforward_dim, dropout):
        """
        embed_dim: input and output feature dimension
        feedforward_dim: feature dimension between the two linear layers
        dropout: dropout ratio applied to the output
        """
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, feedforward_dim)
        self.fc2 = nn.Linear(feedforward_dim, embed_dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # expand with fc1, apply the activation, project back with fc2, then dropout
        hidden = gelu(self.fc1(x))
        return self.drop(self.fc2(hidden))
|
||||
############################################################
|
||||
|
||||
# Define the TransformerLayer
|
||||
############################################################
|
||||
class TransformerLayer(nn.Module):
|
||||
def __init__(self, embed_dim, num_head, feedforward_dim, dropout, no_res):
|
||||
super().__init__()
|
||||
self.norm1 = nn.LayerNorm(embed_dim)
|
||||
self.attn = SelfAttention(embed_dim, num_head, dropout)
|
||||
self.norm2 = nn.LayerNorm(embed_dim)
|
||||
self.ffn = FFN(embed_dim, feedforward_dim, dropout)
|
||||
self.no_res = no_res # if True, the residual connections are skipped
|
||||
|
||||
def forward(self, x):
|
||||
# >>> TODO 2: complete the forward process of the TransformerLayer module.
|
||||
# Step 2.1: calculate the output of multi-head self-attention
|
||||
# normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
|
||||
x_norm = ???
|
||||
|
||||
# calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
|
||||
x_attn, attn = ???
|
||||
|
||||
# add the input 'x' to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
|
||||
if ???:
|
||||
x_attn = ???
|
||||
|
||||
# Step 2.2: calculate the output of feed forward network
|
||||
# calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
|
||||
x_ffn = ???
|
||||
|
||||
# add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
|
||||
if ???:
|
||||
out = ???
|
||||
else:
|
||||
out = ???
|
||||
# <<< TODO 2
|
||||
|
||||
return out, attn
|
||||
############################################################
|
||||
|
||||
# Define the GPT module
|
||||
############################################################
|
||||
class GPT(nn.Module):
|
||||
def __init__(self, vocab_size, max_seq_len, num_layer, embed_dim, num_head, feedforward_dim, dropout, no_res=False, no_pos=False):
|
||||
'''
|
||||
vocab_size: the size of vocabulary
|
||||
max_seq_len: the maximum length of input texts
|
||||
num_layer: the number of transformer layers
|
||||
embed_dim: the embedding dimension
|
||||
num_head: the number of heads in Multi-Head Self Attention
|
||||
feedforward_dim: the dimension in the feed forward network
|
||||
dropout: dropout ratio
|
||||
no_res: if True, do not use residual connections in transformer layers
|
||||
no_pos: if True, do not add position embeddings to the token embeddings
|
||||
'''
|
||||
super().__init__()
|
||||
self.num_layer = num_layer
|
||||
self.max_seq_len = max_seq_len
|
||||
self.no_pos = no_pos
|
||||
|
||||
# Define Embedding Layer to transfer input text tokens and positions to embeddings
|
||||
self.word_token_embedding = nn.Embedding(vocab_size, embed_dim)
|
||||
self.word_pos_embedding = nn.Embedding(max_seq_len, embed_dim)
|
||||
|
||||
self.drop = nn.Dropout(dropout)
|
||||
# Define the transformer layers
|
||||
self.transformer = nn.ModuleList([TransformerLayer(embed_dim, num_head, feedforward_dim, dropout, no_res) for _ in range(num_layer)])
|
||||
|
||||
# Define the head layer to predict output
|
||||
self.norm = nn.LayerNorm(embed_dim)
|
||||
self.language_model_head = nn.Linear(embed_dim, vocab_size, bias=False)
|
||||
|
||||
"""
|
||||
Weight tying improves the performance of language models by tying (sharing) the weights of the embedding and softmax layers.
|
||||
Reference: https://paperswithcode.com/method/weight-tying
|
||||
"""
|
||||
self.word_token_embedding.weight = self.language_model_head.weight
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def init_weights(self):
    """
    Initialize the weights of all sub-modules in place.

    Linear and Embedding weights are drawn from N(0, 0.02^2) and Linear biases are set to zero.
    Parameters whose name ends with 'proj_layer.weight' are then re-drawn with
    std 0.02 / sqrt(2 * num_layer).
    """
    for module in self.modules():
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            continue
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        # only Linear layers carry a bias, and it may be disabled
        if isinstance(module, nn.Linear) and module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    # apply special scaled init to the residual projections, per GPT-2 paper
    for name, param in self.named_parameters():
        if not name.endswith('proj_layer.weight'):
            continue
        torch.nn.init.normal_(param, mean=0.0, std=0.02/math.sqrt(2 * self.num_layer))
|
||||
|
||||
|
||||
def forward(self, word_idx, targets=None):
|
||||
batch_size, seq_len = word_idx.shape
|
||||
|
||||
# >>> TODO 3: complete the forward process of GPT
|
||||
# Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1]
|
||||
pos = ???
|
||||
|
||||
# Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to transfer `word_idx` and `pos` to embeddings (`token_embed` and `pos_embed`)
|
||||
token_embed = ???
|
||||
pos_embed = ???
|
||||
|
||||
# Step 3.3: initialize the input embeddings `x` of transformer layers
|
||||
# add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
|
||||
if ???:
|
||||
x = ???
|
||||
else:
|
||||
x = ???
|
||||
|
||||
# apply dropout to the input embeddings via `self.drop()`
|
||||
x = ???
|
||||
|
||||
# Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
|
||||
# define a list `attention_weights` and append the attention weights of each transformer layer into the list
|
||||
attention_weights = ???
|
||||
for ???:
|
||||
# Step 3.4.1: obtain the output and attention weights of transformer layers
|
||||
x, attn = ???
|
||||
# Step 3.4.2: append the attention weights of transformer layers into the list `attention_weights`
|
||||
???
|
||||
|
||||
# Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
|
||||
# self.language_model_head() is a linear layer defined in __init__() function
|
||||
# Note: do not add softmax here since it is included in the cross entropy loss function
|
||||
x = ???
|
||||
logits = ???
|
||||
# <<< TODO 3
|
||||
|
||||
# return logits and loss or attention weights
|
||||
if targets is not None:
|
||||
loss = torch.nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=0)
|
||||
return logits, loss
|
||||
assert isinstance(attention_weights, list), "attention_weights must be a list, please check whether to append the attention weights of all transformer layers into it!"
|
||||
return logits, attention_weights
|
||||
|
||||
def configure_optimizers(self, weight_decay):
    """
    Split the model parameters into two optimizer parameter groups.

    This long function is doing something very simple and is being very defensive:
    all parameters are separated into two buckets, those that will experience
    weight decay for regularization (nn.Linear weights) and those that won't
    (biases, and nn.LayerNorm / nn.Embedding weights).

    weight_decay: weight decay coefficient of the decayed group

    Returns a list of two dicts, {"params": [...], "weight_decay": weight_decay} and
    {"params": [...], "weight_decay": 0.0}. The optimizer itself is NOT created here;
    the caller passes the groups to an optimizer.

    Raises AssertionError if a parameter lands in both buckets or in neither, and
    KeyError if 'language_model_head.weight' was not collected into the decay set.
    """

    # separate out all parameters to those that will and won't experience regularizing weight decay
    decay = set()
    no_decay = set()
    whitelist_weight_modules = (nn.Linear, )
    blacklist_weight_modules = (nn.LayerNorm, torch.nn.Embedding)
    for mn, m in self.named_modules():
        for pn, _ in m.named_parameters():
            fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
            # because named_modules and named_parameters are recursive we see the same
            # tensors many times, but only the visit through the direct parent module
            # matches one of the isinstance checks below
            if pn.endswith('bias'):
                # all biases will not be decayed
                no_decay.add(fpn)
            elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                # weights of whitelist modules will be weight decayed
                decay.add(fpn)
            elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                # weights of blacklist modules will NOT be weight decayed
                no_decay.add(fpn)

    # subtle: 'word_token_embedding.weight' and 'language_model_head.weight' are tied, so they
    # appear in the no_decay and decay sets respectively after the loop above.
    # named_parameters() does not return duplicates, so the shared tensor is reported under
    # one name only. Remove 'language_model_head.weight' from the decay set so that the tensor
    # is optimized once, under the embedding name, and is not decayed.
    decay.remove('language_model_head.weight')

    # validate that we considered every parameter
    param_dict = dict(self.named_parameters())
    inter_params = decay & no_decay
    union_params = decay | no_decay
    assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
    assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                % (str(param_dict.keys() - union_params), )

    # build the parameter groups; names are sorted to keep the order deterministic
    optim_groups = [
        {"params": [param_dict[pn] for pn in sorted(decay)], "weight_decay": weight_decay},
        {"params": [param_dict[pn] for pn in sorted(no_decay)], "weight_decay": 0.0},
    ]
    return optim_groups
|
||||
|
||||
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    """
    Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
    the sequence max_new_tokens times, feeding the predictions back into the model each time.
    Most likely you'll want to make sure to be in model.eval() mode of operation for this.

    idx: conditioning indices, LongTensor of shape (b,t)
    max_new_tokens: number of tokens to sample and append
    temperature: the logits are divided by this value before the softmax
    top_k: if not None, only the top_k most likely tokens can be sampled

    Returns the completed sequence(s) as a numpy array, with size-1 dimensions squeezed out.
    """
    for _ in range(max_new_tokens):
        # if the sequence context is growing too long we must crop it to the last
        # max_seq_len tokens: the position embedding table is created with only
        # max_seq_len entries
        idx_cond = idx if idx.size(1) <= self.max_seq_len else idx[:, -self.max_seq_len:]
        # forward the model to get the logits for the index in the sequence
        logits, _attn = self(idx_cond)
        # pluck the logits at the final step and scale by desired temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop the logits to only the top k options
        if top_k is not None:
            topk_vals, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            # everything below the k-th largest logit gets probability 0
            logits[logits < topk_vals[:, [-1]]] = -float('Inf')
        # apply softmax to convert logits to (normalized) probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1)
        # append sampled index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)

    return idx.squeeze().cpu().numpy()
|
||||
############################################################
|
||||
|
||||
GPTConfig = {
|
||||
'mygpt': dict(num_layer=4, embed_dim=128, num_head=4, feedforward_dim=128*4, dropout=0.0),
|
||||
'gpt2-mini': dict(num_layer=6, embed_dim=384, num_head=6, feedforward_dim=384*4, dropout=0.2),
|
||||
'gpt2': dict(num_layer=12, embed_dim=768, num_head=12, feedforward_dim=768*4, dropout=0.2),
|
||||
}
|
||||
61
hw4/code/prepare.py
Normal file
61
hw4/code/prepare.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Prepare the dataset for character-level language modeling.
|
||||
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
|
||||
"""
|
||||
import os
|
||||
import numpy as np
|
||||
import argparse
|
||||
import json
|
||||
|
||||
# command line options
parser = argparse.ArgumentParser()
parser.add_argument('--data_root', type=str, default='data/quansongci', help='data directory')
args = parser.parse_args()

# read the raw samples from <data_root>/data.json
input_file_path = os.path.join(args.data_root, 'data.json')
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)['data']
print(f"length of dataset: {len(data):,}")

# collect every distinct character of the corpus, in sorted order
chars = sorted(set(''.join(data)))
vocab_size = len(chars) + 2  # two extra entries for <pad> and <eos>
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# character <-> integer tables; ids 0 and 1 are reserved, so characters start at 2
stoi = {}
itos = {}
for token_id, ch in enumerate(chars, start=2):
    stoi[ch] = token_id
    itos[token_id] = ch
stoi['<pad>'] = 0
itos[0] = '<pad>'
stoi['<eos>'] = 1
itos[1] = '<eos>'

# first 90% of the samples for training, the rest for validation
n = len(data)
split_at = int(n*0.9)
train_data = data[:split_at]
val_data = data[split_at:]
print(f"train has {len(train_data):,} samples")
print(f"val has {len(val_data):,} samples")

# each split file also stores the vocabulary tables, to help us encode/decode later
for file_name, split_data in (('train.json', train_data), ('val.json', val_data)):
    meta = {
        'data': split_data,
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }
    with open(os.path.join(args.data_root, file_name), 'w', encoding='utf-8') as f:
        json.dump(meta, f, ensure_ascii=False, indent=4)
|
||||
|
||||
76
hw4/code/sample.py
Normal file
76
hw4/code/sample.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""
|
||||
Sample from a trained model
|
||||
"""
|
||||
import os
|
||||
import pickle
|
||||
from contextlib import nullcontext
|
||||
import torch
|
||||
from model import GPTConfig, GPT
|
||||
import argparse
|
||||
from dataset import Converter, LMDataset
|
||||
|
||||
def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root, device):
    """
    Load the checkpoint `<ckpt_path>/best.pth` and print `num_samples` generated texts.

    start: prompt text; every character must exist in the training vocabulary
    num_samples: number of texts to generate
    max_new_tokens: number of tokens generated after the prompt, per text
    model_name: key into GPTConfig, used when the checkpoint has no 'model_args'
    ckpt_path: directory that contains 'best.pth'
    data_root: directory of the prepared dataset (its train split provides the vocabulary)
    device: device string such as 'cpu', 'cuda' or 'cuda:0'
    """
    dataset = LMDataset(data_root, 'train')
    converter = Converter(dataset.stoi, dataset.itos)
    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability

    # mixed precision context
    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    # torch.autocast wants the bare device type ('cuda'), not an indexed device such as 'cuda:0'
    device_type = torch.device(device).type
    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)

    # init from a model saved in a specific directory
    ckpt_path = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_path)
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig[model_name]
    if 'model_args' in checkpoint:
        # the arguments stored with the checkpoint take precedence over the named config
        gptconf = checkpoint['model_args']
    model = GPT(**gptconf)
    model.load_state_dict(checkpoint['state_dict'])

    model.eval()
    model.to(device)

    # encode the beginning of the prompt and add a batch dimension
    start_ids = converter.single_encode(start)
    x = torch.from_numpy(start_ids)[None, ...].to(device).long()

    # run generation
    with torch.no_grad(), ctx:
        for _ in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(converter.single_decode(y))
            print('---------------')
|
||||
|
||||
if __name__ == '__main__':

    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

    # set configurations of the model and sampling process
    parser = argparse.ArgumentParser()
    parser.add_argument('--start', type=str, default='+++', help='start of the sample, e.g. "+++" or "+++清平乐"')
    # the default is a real int so that it matches type=int
    parser.add_argument('--num_samples', type=int, default=10, help='the number of samples')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to load checkpoints')
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--device', type=str, help='cpu or cuda')

    opt = parser.parse_args()
    if opt.device is None:
        # no device given: use the GPU when one is available
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # prompt plus generated tokens add up to 128 tokens
    # NOTE(review): 128 is presumably the max_seq_len used for training -- confirm against the checkpoint
    sample(opt.start, opt.num_samples, 128-len(opt.start), opt.model_name, opt.ckpt_path, opt.data_root, opt.device)
|
||||
219
hw4/code/train.py
Normal file
219
hw4/code/train.py
Normal file
@@ -0,0 +1,219 @@
|
||||
import os
|
||||
import time
|
||||
import math
|
||||
import pickle
|
||||
from contextlib import nullcontext
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from model import GPT, GPTConfig
|
||||
from dataset import LMDataset, Converter
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# learning rate decay scheduler (cosine with warmup)
|
||||
def get_lr(it, learning_rate, min_lr=1e-4, warmup_iters=100, lr_decay_iters=6000):
    """
    Learning rate at iteration `it`: linear warmup followed by cosine decay.

    it: current iteration number
    learning_rate: maximum learning rate, reached at the end of the warmup
    min_lr: learning rate at and after `lr_decay_iters`
    warmup_iters: number of linear warmup iterations
    lr_decay_iters: iteration at which the cosine decay reaches `min_lr`

    Returns the learning rate as a float.
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) if it >= lr_decay_iters, return min learning rate
    #    (the cosine term is exactly 0 at it == lr_decay_iters, so including the boundary
    #    here keeps the value and avoids a 0/0 when warmup_iters == lr_decay_iters)
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)
|
||||
|
||||
def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, device='cpu', no_res=False, no_pos=False):
    """Train a GPT model from scratch, validating and checkpointing periodically.

    Args:
        data_root: Passed to ``LMDataset`` to load the 'train' and 'val' splits.
        model_name: Key into ``GPTConfig`` selecting the model arguments.
        batch_size: Training batch size (converted with ``int``).
        n_iters: Total number of optimizer steps to run.
        ckpt_path: Directory receiving ``latest.pth``, ``best.pth`` and the loss plot.
        val_interval: Validate and checkpoint every this many iterations.
        device: Device string such as 'cpu', 'cuda' or 'cuda:0'.
        no_res: Stored in the model arguments under ``'no_res'``.
        no_pos: Stored in the model arguments under ``'no_pos'``.
    """
    batch_size = int(batch_size)
    train_dataset = LMDataset(data_root, 'train')
    val_dataset = LMDataset(data_root, 'val')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    converter = Converter(train_dataset.stoi, train_dataset.itos)

    # adamw optimizer
    learning_rate = 5e-3  # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.99
    grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0

    # system
    # torch.autocast wants the bare device type ('cuda'), not an indexed
    # device string ('cuda:0'), so normalise before using it.
    device_type = torch.device(device).type
    # 'float32', 'bfloat16', or 'float16'; float16 enables the GradScaler below
    dtype = 'bfloat16' if device_type == 'cpu' else 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)
    best_val_loss = 1e9
    iter_num = 0  # number of iterations in the lifetime of this process

    # model init
    # NOTE(review): this updates the GPTConfig[model_name] object in place —
    # confirm that sharing the mutation across calls is intended.
    model_args = GPTConfig[model_name]
    model_args['vocab_size'] = train_dataset.vocab_size
    model_args['max_seq_len'] = 128
    model_args['no_res'] = no_res
    model_args['no_pos'] = no_pos

    # init a new model from scratch
    print("Initializing a new model from scratch")
    model = GPT(**model_args)
    model.to(device)

    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

    # optimizer
    optim_groups = model.configure_optimizers(weight_decay)
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))

    print('training...')
    # enough passes over the training set to reach n_iters optimizer steps
    epoch_num = int(np.ceil(n_iters * batch_size / float(len(train_dataset))))
    t0 = time.time()
    model.train()
    train_losses = []
    val_losses = []
    for _ in range(epoch_num):
        # stop before building another DataLoader iterator once we are done
        if iter_num >= n_iters:
            break
        for inputs in train_loader:
            if iter_num >= n_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)

            # set the scheduled learning rate on every parameter group
            lr = get_lr(iter_num, learning_rate, lr_decay_iters=n_iters)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # forward pass (under autocast when not on CPU)
            with ctx:
                _, loss = model(X, Y)

            # backward pass, with gradient scaling if training in fp16
            scaler.scale(loss).backward()
            # clip the gradient
            if grad_clip != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            # step the optimizer and scaler if training in fp16
            scaler.step(optimizer)
            scaler.update()
            # flush the gradients as soon as we can, no need for this memory anymore
            optimizer.zero_grad(set_to_none=True)

            iter_num += 1
            train_losses.append(loss.item())

            # evaluate the loss on the val set and write checkpoints
            if iter_num % val_interval == 0:
                # timing and logging
                t1 = time.time()
                dt = t1 - t0
                t0 = t1
                lossf = loss.item()
                print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
                losses = estimate_loss(model, val_loader, converter, ctx, device)
                val_loss = losses['val']
                val_losses.append(val_loss)
                print(f"iter {iter_num}: val loss {val_loss:.4f}")

                # Update the running best BEFORE building the checkpoint so
                # the saved 'best_val_loss' reflects this evaluation.
                is_best = val_loss < best_val_loss
                if is_best:
                    best_val_loss = val_loss

                print(f"saving latest checkpoint to {ckpt_path}")
                checkpoint = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                }
                torch.save(checkpoint, os.path.join(ckpt_path, 'latest.pth'))

                if is_best:
                    print(f"saving best checkpoint to {ckpt_path}")
                    torch.save(checkpoint, os.path.join(ckpt_path, 'best.pth'))

    plot(n_iters, train_losses, val_losses, val_interval, ckpt_path)
|
||||
|
||||
def plot(n_iters, train_losses, val_losses, val_interval, ckpt_path):
    """Plot loss and perplexity curves, save them under ``ckpt_path`` and show them.

    Args:
        n_iters: Kept for backward compatibility; the x positions are derived
            from the lengths of the loss lists instead.
        train_losses: One training loss per completed iteration.
        val_losses: One validation loss per validation round.
        val_interval: Number of iterations between validation rounds.
        ckpt_path: Directory in which 'loss&perplexity.jpg' is written.
    """
    fig, ax = plt.subplots(1, 2, figsize=(18, 6))

    # Iteration k's loss is drawn at x = k. Validation round j is drawn at
    # x = j * val_interval; deriving the positions from len(val_losses)
    # keeps x and y the same length for any n_iters / val_interval.
    train_iters = np.arange(1, len(train_losses) + 1)
    val_iters = val_interval * np.arange(1, len(val_losses) + 1)

    # draw loss
    ax[0].plot(train_iters, train_losses)
    ax[0].plot(val_iters, val_losses, 'r')
    ax[0].set_xlabel('training iters')
    ax[0].legend(['training loss', 'validation loss'])

    # perplexity is exp(loss)
    train_perplexity = np.exp(np.asarray(train_losses, dtype=float))
    val_perplexity = np.exp(np.asarray(val_losses, dtype=float))

    # draw perplexity
    ax[1].plot(train_iters, train_perplexity)
    ax[1].plot(val_iters, val_perplexity, 'r')
    ax[1].set_xlabel('training iters')
    ax[1].legend(['training perplexity', 'validation perplexity'])
    plt.tight_layout()

    # save first, then show the image
    plt.savefig(os.path.join(ckpt_path, 'loss&perplexity.jpg'), dpi=300)
    plt.show()
|
||||
|
||||
# helps estimate an arbitrarily accurate loss over either split using many batches
|
||||
@torch.no_grad()
def estimate_loss(model, val_loader, converter, ctx, device, max_iters=100):
    """Average the model loss over at most ``max_iters`` batches of ``val_loader``.

    Args:
        model: Callable as ``model(X, Y)`` returning ``(logits, loss)``; its
            ``eval()`` is called before and ``train()`` after the evaluation.
        val_loader: Iterable of raw batches.
        converter: Object whose ``encode(batch)`` returns the ``(X, Y)`` tensors.
        ctx: Context manager wrapped around the forward pass.
        device: Device the encoded tensors are moved to.
        max_iters: Upper bound on the number of batches evaluated.

    Returns:
        ``{'val': mean_loss}``; the mean is NaN when no batch was evaluated.
    """
    model.eval()
    total = 0.0
    n_batches = 0
    try:
        for inputs in val_loader:
            if n_batches >= max_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)
            with ctx:
                _, loss = model(X, Y)
            total += loss.item()
            n_batches += 1
    finally:
        # always hand the model back in training mode, even on error
        model.train()
    # Divide by the batches actually seen, not by max_iters, so a loader
    # shorter than max_iters is not under-reported.
    mean_loss = total / n_batches if n_batches else float('nan')
    return {'val': mean_loss}
|
||||
|
||||
if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn

    # set configurations of the model and training process
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    # the name is looked up in GPTConfig; the model itself is trained from scratch
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model configuration')
    # passed to train() as n_iters, i.e. a count of optimizer steps, not epochs
    parser.add_argument('--iters', type=int, default=1000, help='number of training iterations')
    parser.add_argument('--batchsize', type=int, default=16, help='training batch size')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to save checkpoints')
    parser.add_argument('--val_interval', type=int, default=20, help='iter intervals of validation')
    # store_true flags: giving the flag sets no_res / no_pos to True
    parser.add_argument('--no_res', action='store_true', help='disable residual connection')
    parser.add_argument('--no_pos', action='store_true', help='disable positional encoding')
    parser.add_argument('--device', type=str, help='cpu or cuda')

    opt = parser.parse_args()
    # default to CUDA when it is available
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    os.makedirs(opt.ckpt_path, exist_ok=True)
    train(opt.data_root, opt.model_name, opt.batchsize, opt.iters, opt.ckpt_path, opt.val_interval, opt.device, opt.no_res, opt.no_pos)
|
||||
|
||||
|
||||
132
hw4/report/dtx-style.sty
Normal file
132
hw4/report/dtx-style.sty
Normal file
@@ -0,0 +1,132 @@
|
||||
%%
|
||||
%% This is file `dtx-style.sty',
|
||||
%% generated with the docstrip utility.
|
||||
%%
|
||||
%% The original source files were:
|
||||
%%
|
||||
%% thucoursework.dtx (with options: `dtx-style')
|
||||
%%
|
||||
%% This is a generated file.
|
||||
%%
|
||||
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
|
||||
%%
|
||||
%% This work may be distributed and/or modified under the
|
||||
%% conditions of the LaTeX Project Public License, either version 1.3
|
||||
%% of this license or (at your option) any later version.
|
||||
%% The latest version of this license is in
|
||||
%% http://www.latex-project.org/lppl.txt
|
||||
%% and version 1.3 or later is part of all distributions of LaTeX
|
||||
%% version 2005/12/01 or later.
|
||||
%%
|
||||
%% To produce the documentation run the original source files ending with `.dtx'
|
||||
%% through LaTeX.
|
||||
%%
|
||||
|
||||
\ProvidesPackage{dtx-style}
|
||||
\RequirePackage{hypdoc}
|
||||
\RequirePackage[UTF8,scheme=chinese]{ctex}
|
||||
\RequirePackage{newpxtext}
|
||||
\RequirePackage{newpxmath}
|
||||
\RequirePackage[
|
||||
top=2.5cm, bottom=2.5cm,
|
||||
left=4cm, right=2cm,
|
||||
headsep=3mm]{geometry}
|
||||
\RequirePackage{array,longtable,booktabs}
|
||||
\RequirePackage{listings}
|
||||
\RequirePackage{fancyhdr}
|
||||
\RequirePackage{xcolor}
|
||||
\RequirePackage{enumitem}
|
||||
\RequirePackage{etoolbox}
|
||||
\RequirePackage{metalogo}
|
||||
|
||||
\colorlet{thu@macro}{blue!60!black}
|
||||
\colorlet{thu@env}{blue!70!black}
|
||||
\colorlet{thu@option}{purple}
|
||||
\patchcmd{\PrintMacroName}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
|
||||
\patchcmd{\PrintDescribeMacro}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
|
||||
\patchcmd{\PrintDescribeEnv}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
|
||||
\patchcmd{\PrintEnvName}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
|
||||
|
||||
\def\DescribeOption{%
|
||||
\leavevmode\@bsphack\begingroup\MakePrivateLetters%
|
||||
\Describe@Option}
|
||||
\def\Describe@Option#1{\endgroup
|
||||
\marginpar{\raggedleft\PrintDescribeOption{#1}}%
|
||||
\thu@special@index{option}{#1}\@esphack\ignorespaces}
|
||||
\def\PrintDescribeOption#1{\strut \MacroFont\bfseries\sffamily\color{thu@option} #1\ }
|
||||
\def\thu@special@index#1#2{\@bsphack
|
||||
\begingroup
|
||||
\HD@target
|
||||
\let\HDorg@encapchar\encapchar
|
||||
\edef\encapchar usage{%
|
||||
\HDorg@encapchar hdclindex{\the\c@HD@hypercount}{usage}%
|
||||
}%
|
||||
\index{#2\actualchar{\string\ttfamily\space#2}
|
||||
(#1)\encapchar usage}%
|
||||
\index{#1:\levelchar#2\actualchar
|
||||
{\string\ttfamily\space#2}\encapchar usage}%
|
||||
\endgroup
|
||||
\@esphack}
|
||||
|
||||
\lstdefinestyle{lstStyleBase}{%
|
||||
basicstyle=\small\ttfamily,
|
||||
aboveskip=\medskipamount,
|
||||
belowskip=\medskipamount,
|
||||
lineskip=0pt,
|
||||
boxpos=c,
|
||||
showlines=false,
|
||||
extendedchars=true,
|
||||
upquote=true,
|
||||
tabsize=2,
|
||||
showtabs=false,
|
||||
showspaces=false,
|
||||
showstringspaces=false,
|
||||
numbers=none,
|
||||
linewidth=\linewidth,
|
||||
xleftmargin=4pt,
|
||||
xrightmargin=0pt,
|
||||
resetmargins=false,
|
||||
breaklines=true,
|
||||
breakatwhitespace=false,
|
||||
breakindent=0pt,
|
||||
breakautoindent=true,
|
||||
columns=flexible,
|
||||
keepspaces=true,
|
||||
gobble=2,
|
||||
framesep=3pt,
|
||||
rulesep=1pt,
|
||||
framerule=1pt,
|
||||
backgroundcolor=\color{gray!5},
|
||||
stringstyle=\color{green!40!black!100},
|
||||
keywordstyle=\bfseries\color{blue!50!black},
|
||||
commentstyle=\slshape\color{black!60}}
|
||||
|
||||
\lstdefinestyle{lstStyleShell}{%
|
||||
style=lstStyleBase,
|
||||
frame=l,
|
||||
rulecolor=\color{purple},
|
||||
language=bash}
|
||||
|
||||
\lstdefinestyle{lstStyleLaTeX}{%
|
||||
style=lstStyleBase,
|
||||
frame=l,
|
||||
rulecolor=\color{violet},
|
||||
language=[LaTeX]TeX}
|
||||
|
||||
\lstnewenvironment{latex}{\lstset{style=lstStyleLaTeX}}{}
|
||||
\lstnewenvironment{shell}{\lstset{style=lstStyleShell}}{}
|
||||
|
||||
\setlist{nosep}
|
||||
|
||||
\DeclareDocumentCommand{\option}{m}{\textsf{#1}}
|
||||
\DeclareDocumentCommand{\env}{m}{\texttt{#1}}
|
||||
\DeclareDocumentCommand{\pkg}{s m}{%
|
||||
\texttt{#2}\IfBooleanF#1{\thu@special@index{package}{#2}}}
|
||||
\DeclareDocumentCommand{\file}{s m}{%
|
||||
\texttt{#2}\IfBooleanF#1{\thu@special@index{file}{#2}}}
|
||||
\newcommand{\myentry}[1]{%
|
||||
\marginpar{\raggedleft\color{purple}\bfseries\strut #1}}
|
||||
\newcommand{\note}[2][Note]{{%
|
||||
\color{magenta}{\bfseries #1}\emph{#2}}}
|
||||
|
||||
\def\thucoursework{\textsc{Thu}\-\textsc{Coursework}}
|
||||
153
hw4/report/iidef.sty
Normal file
153
hw4/report/iidef.sty
Normal file
@@ -0,0 +1,153 @@
|
||||
%%
|
||||
%% This is file `iidef.sty',
|
||||
%% generated with the docstrip utility.
|
||||
%%
|
||||
%% The original source files were:
|
||||
%%
|
||||
%% thucoursework.dtx (with options: `sty')
|
||||
%%
|
||||
%% This is a generated file.
|
||||
%%
|
||||
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
|
||||
%%
|
||||
%% This work may be distributed and/or modified under the
|
||||
%% conditions of the LaTeX Project Public License, either version 1.3
|
||||
%% of this license or (at your option) any later version.
|
||||
%% The latest version of this license is in
|
||||
%% http://www.latex-project.org/lppl.txt
|
||||
%% and version 1.3 or later is part of all distributions of LaTeX
|
||||
%% version 2005/12/01 or later.
|
||||
%%
|
||||
%% To produce the documentation run the original source files ending with `.dtx'
|
||||
%% through LaTeX.
|
||||
%%
|
||||
|
||||
\NeedsTeXFormat{LaTeX2e}[1999/12/01]
|
||||
\ProvidesClass{iidef}
|
||||
[2020/09/09 2.6 Tsinghua University Coursework Template]
|
||||
%% configuration of nested enumerate env
|
||||
\RequirePackage{enumitem}
|
||||
%% set hwcount key-value option
|
||||
\RequirePackage{kvoptions}
|
||||
%% required by macro DeclareMathOperator
|
||||
\RequirePackage{amsmath}
|
||||
%% Set up page headers using with fancyhdr
|
||||
\@ifundefined{lhead}{\RequirePackage{fancyhdr}}
|
||||
{\def\@thulhead{thulhead}}
|
||||
\RequirePackage{amsthm}
|
||||
%% semester
|
||||
\def\@term{term}
|
||||
\newcommand{\theterm}[1]{\renewcommand\@term{#1}}
|
||||
%% institute
|
||||
%% default takes no argument: it is used bare (e.g. \@courseinstitute\\),
%% and \thecourseinstitute redefines it without arguments as well
\newcommand{\@courseinstitute}{institute}
|
||||
\newcommand{\thecourseinstitute}[1]{\renewcommand\@courseinstitute{#1}}
|
||||
%% coursename
|
||||
%% default takes no argument: it is used bare (e.g. \@coursename\\),
%% and \thecoursename redefines it without arguments as well
\newcommand{\@coursename}{coursename}
|
||||
\newcommand{\thecoursename}[1]{\renewcommand\@coursename{\textsc{#1}}}
|
||||
%% user can rewrite homework name
|
||||
\def\@hwname{Homework}
|
||||
\def\hwname#1{\renewcommand\@hwname{#1}}
|
||||
%% \iidef@thehwcnt = 1
|
||||
\DeclareStringOption[1]{thehwcnt}
|
||||
\ProcessKeyvalOptions*
|
||||
\def\thehwcnt{\iidef@thehwcnt}
|
||||
%% page header setup, distinguish between first page(plain style)
|
||||
%% and second page on (runningpage style)
|
||||
%%***************************************************************************
|
||||
\newcommand{\courseheader}{
|
||||
\thispagestyle{plain}%first page use native plain style to suppress header
|
||||
\vspace*{-1in}
|
||||
\begin{center}
|
||||
\@courseinstitute\\
|
||||
\@coursename\\
|
||||
\@term
|
||||
\vspace*{0.1in}
|
||||
\hrule
|
||||
\end{center}
|
||||
\begin{center}
|
||||
\underline{\bf \@hwname\;\thehwcnt} \\
|
||||
\end{center}
|
||||
}
|
||||
\@ifundefined{@thulhead}{
|
||||
\fancypagestyle{runningpage}
|
||||
{
|
||||
\fancyhead[L]{\small\@coursename}
|
||||
\fancyhead[R]{\small\@courseinstitute}
|
||||
}
|
||||
%% use runningpage style from second page on
|
||||
\pagestyle{runningpage}
|
||||
}{}
|
||||
%% *********************************************************************************************
|
||||
%%name command macro
|
||||
%%*************************
|
||||
\newcommand{\name}[1]{
|
||||
\begin{flushleft}
|
||||
#1\hfill
|
||||
\today
|
||||
\end{flushleft}
|
||||
\hrule
|
||||
|
||||
\vspace{2em}
|
||||
|
||||
\flushleft
|
||||
}
|
||||
%%*************************
|
||||
%% enumitem related configuration
|
||||
\setlist[enumerate,1]{label=\thehwcnt.\arabic*.}
|
||||
\setlist[enumerate,2]{label=(\alph*)}
|
||||
\setlist[enumerate,3]{label=\roman*.}
|
||||
\setlist[enumerate,4]{label=\greek*}
|
||||
%%******************************
|
||||
\def\@slname{Solution}
|
||||
\def\slname#1{\renewcommand\@slname{#1}}
|
||||
|
||||
\@ifundefined{solution}{
|
||||
\newenvironment{solution}
|
||||
{
|
||||
\proof[\@slname]
|
||||
}
|
||||
{
|
||||
%% no qed symbol in solution env
|
||||
\renewcommand{\qedsymbol}{}
|
||||
\endproof
|
||||
}
|
||||
}{}
|
||||
%%******************************
|
||||
%%common math symbols go here
|
||||
%%*************************************************
|
||||
\def\v#1{\underline{#1}}
|
||||
\newcommand{\uc}{\underline{c}} % c, vec
|
||||
\newcommand{\uv}{\underline{v}} % v, vec
|
||||
\newcommand{\uw}{\underline{w}} % w, vec
|
||||
\newcommand{\ux}{\underline{x}} % x, vec
|
||||
\newcommand{\uy}{\underline{y}} % y, vec
|
||||
\newcommand{\uz}{\underline{z}} % z, vec
|
||||
\newcommand{\um}{\underline{m}} % m, vec
|
||||
\newcommand{\rvx}{\mathsf{x}} % x, r.v.
|
||||
\newcommand{\rvy}{\mathsf{y}} % y, r.v.
|
||||
\newcommand{\rvz}{\mathsf{z}} % z, r.v.
|
||||
\newcommand{\rvw}{\mathsf{w}} % w, r.v.
|
||||
\newcommand{\rvH}{\mathsf{H}} % H, r.v.
|
||||
\newcommand{\urvx}{\underline{\mathsf{x}}} % x, r.v. vec
|
||||
\newcommand{\urvy}{\underline{\mathsf{y}}} % y, r.v. vec
|
||||
\newcommand{\urvz}{\underline{\mathsf{z}}} % z, r.v. vec
|
||||
\newcommand{\urvw}{\underline{\mathsf{w}}} % w, r.v. vec
|
||||
|
||||
\newcommand{\defas}{\triangleq} %\coloneqq
|
||||
\newcommand{\reals}{\mathbb{R}}
|
||||
\newcommand{\TT}{\mathrm{T}} % transpose
|
||||
\DeclareMathOperator*{\argmax}{arg\,max}
|
||||
\DeclareMathOperator*{\argmin}{arg\,min}
|
||||
\DeclareMathOperator*{\argsup}{arg\,sup}
|
||||
\DeclareMathOperator*{\arginf}{arg\,inf}
|
||||
\DeclareMathOperator{\diag}{diag}
|
||||
\DeclareMathOperator{\Var}{Var}
|
||||
\DeclareMathOperator{\Cov}{Cov}
|
||||
\DeclareMathOperator{\MSE}{MSE}
|
||||
\DeclareMathOperator{\1}{\mathds{1}}
|
||||
\DeclareMathOperator{\In}{\mathbb{I}}
|
||||
\DeclareMathOperator{\E}{\mathbb{E}}
|
||||
\DeclareMathOperator{\Prob}{\mathbb{P}}
|
||||
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
|
||||
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
|
||||
%%************************************************************************************
|
||||
100
hw4/report/main.tex
Normal file
100
hw4/report/main.tex
Normal file
@@ -0,0 +1,100 @@
|
||||
% Homework template for Inference and Information
|
||||
% UPDATE: September 26, 2017 by Xiangxiang
|
||||
\documentclass[a4paper]{article}
|
||||
\usepackage{ctex}
|
||||
\usepackage{amsmath, amssymb, amsthm}
|
||||
\usepackage{moreenum}
|
||||
\usepackage{mathtools}
|
||||
\usepackage{url}
|
||||
\usepackage{bm}
|
||||
\usepackage{enumitem}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{listings}
|
||||
\usepackage{color}
|
||||
|
||||
\lstset{
|
||||
basicstyle = \sffamily, % 基本代码风格
|
||||
keywordstyle = \bfseries, % 关键字风格
|
||||
commentstyle = \rmfamily\itshape, % 注释的风格,斜体
|
||||
stringstyle = \ttfamily, % 字符串风格
|
||||
flexiblecolumns, % 别问为什么,加上这个
|
||||
numbers = left, % 行号的位置在左边
|
||||
showspaces = false, % 是否显示空格,显示了有点乱,所以不现实了
|
||||
numberstyle = \zihao{-5}\ttfamily, % 行号的样式,小五号,tt等宽字体
|
||||
showstringspaces = false,
|
||||
captionpos = t, % 这段代码的名字所呈现的位置,t指的是top上面
|
||||
frame = lrtb, % 显示边框
|
||||
}
|
||||
|
||||
\lstdefinestyle{Python}{
|
||||
language = Python, % 语言选Python
|
||||
basicstyle = \zihao{-5}\ttfamily,
|
||||
numberstyle = \zihao{-5}\ttfamily,
|
||||
keywordstyle = \color{blue},
|
||||
keywordstyle = [2] \color{teal},
|
||||
stringstyle = \color{magenta},
|
||||
commentstyle = \color{red}\ttfamily,
|
||||
breaklines = true, % 自动换行,建议不要写太长的行
|
||||
columns = fixed, % 如果不加这一句,字间距就不固定,很丑,必须加
|
||||
basewidth = 0.5em,
|
||||
}
|
||||
\usepackage{subcaption}
|
||||
\usepackage{booktabs} % toprule
|
||||
\usepackage[mathcal]{eucal}
|
||||
\usepackage[thehwcnt = 4]{iidef}
|
||||
|
||||
\thecourseinstitute{清华大学电子工程系}
|
||||
\thecoursename{\textbf{媒体与认知}}
|
||||
\theterm{2023-2024学年春季学期}
|
||||
\hwname{作业}
|
||||
\begin{document}
|
||||
\courseheader
|
||||
\name{YOUR NAME}
|
||||
\vspace{3mm}
|
||||
\centerline{\textbf{\Large{理论部分}}}
|
||||
|
||||
\section{单选题(15分)}
|
||||
\subsection{\underline{?}}
|
||||
|
||||
\subsection{\underline{?}}
|
||||
|
||||
\subsection{\underline{?}}
|
||||
|
||||
\subsection{\underline{?}}
|
||||
|
||||
\subsection{\underline{?}}
|
||||
|
||||
\section{计算题(15 分)}
|
||||
% 计算题1
|
||||
\subsection{隐含马尔可夫模型}
|
||||
|
||||
\hspace{2em}暑假中,小E每天进行一项体育活动,包括跑步(R)、游泳(S)和打球(B),所选择的体育活动受某种潜在因素(如心情)的影响。小E每天把进行体育活动的照片发至微信朋友圈,我们可以根据观测信息推测该潜在因素的状态。
|
||||
|
||||
\hspace{2em}假设该潜在因素分为$S_1$和$S_2$两种状态。在$S_1$时,小E选择三种体育活动的概率分别为0.6,0.2,0.2;在$S_2$时,小E选择三种体育活动的概率分别为0.1,0.6,0.3。
|
||||
|
||||
\hspace{2em}该潜在因素的变化也有一定规律,若某天处于$S_1$的状态,第二天处于$S_1$和$S_2$的状态的概率分别为0.5,0.5;若某天处于$S_2$的状态,第二天处于$S_1$和$S_2$的状态的概率分别为0.6,0.4。
|
||||
|
||||
\hspace{2em}暑假第一天处于$S_1$和$S_2$的状态的概率均为0.5。
|
||||
|
||||
\vspace{3mm}
|
||||
(1) 采用隐含马尔可夫模型(HMM)对小E暑假体育活动安排进行建模,{\color{blue}请写出HMM对应的参数$\lambda=\{\pi, A, B\}$}。
|
||||
|
||||
\vspace{3mm}
|
||||
(2) 假设暑假第1、2、3天小E所进行的体育活动依次为跑步(R)、打球(B)和游泳(S),{\color{blue}请计算出现该观测序列的概率}。
|
||||
|
||||
\vspace{3mm}
|
||||
(3) 在(2)的条件下,{\color{blue}请利用Viterbi算法推测暑假第1、2、3天最可能的隐含状态序列}。
|
||||
|
||||
|
||||
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题工作进度汇报”中的一项完成
|
||||
\section{编程作业报告}
|
||||
\section{自选课题工作进度汇报}
|
||||
|
||||
\end{document}
|
||||
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: t
|
||||
%%% End:
|
||||
Reference in New Issue
Block a user