TA Release homework4.

This commit is contained in:
unlockable
2024-05-22 20:22:47 +08:00
parent c850f38778
commit c6b2420b85
12 changed files with 14707 additions and 0 deletions

104
hw4/code/attnvis.ipynb Normal file
View File

@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"from contextlib import nullcontext\n",
"import torch\n",
"from model import GPTConfig, GPT\n",
"from bertviz import head_view\n",
"from dataset import Converter, LMDataset\n",
"\n",
"# set random seed for reproducibility\n",
"seed = 2024\n",
"torch.manual_seed(seed)\n",
"torch.cuda.manual_seed(seed)\n",
"torch.cuda.manual_seed_all(seed)\n",
"torch.backends.cudnn.deterministic = True\n",
"torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul\n",
"torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn\n",
"\n",
"#################################################\n",
"# \n",
"model_name = 'mygpt'\n",
"ckpt_path = 'workdirs/quansongci'\n",
"data_root = 'data/quansongci'\n",
"vis_text_path = 'data/vis/vis_1.txt'\n",
"#################################################\n",
"\n",
"device = 'cpu'\n",
"\n",
"dataset = LMDataset(data_root, 'train')\n",
"converter = Converter(dataset.stoi, dataset.itos)\n",
"\n",
"\n",
"with open(vis_text_path, 'r', encoding='utf-8') as f:\n",
" start = f.read()\n",
"start_ids = converter.single_encode(start)\n",
"start_texts = [c for c in start]\n",
"x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])\n",
"print(f\"Input texts: {start}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0792738",
"metadata": {},
"outputs": [],
"source": [
"# model\n",
"dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'\n",
"ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]\n",
"ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)\n",
"# init from a model saved in a specific directory\n",
"ckpt_path = os.path.join(ckpt_path, 'best.pth')\n",
"print(\"loading model params from %s\"%ckpt_path)\n",
"checkpoint = torch.load(ckpt_path, map_location=device)\n",
"gptconf = GPTConfig[model_name]\n",
"if 'model_args' in checkpoint:\n",
" gptconf = checkpoint['model_args']\n",
"model = GPT(**gptconf)\n",
"state_dict = checkpoint['state_dict']\n",
"model.load_state_dict(state_dict)\n",
"\n",
"model.eval()\n",
"model.to(device)\n",
"\n",
"# run generation\n",
"with torch.no_grad():\n",
" with ctx:\n",
" _, attn_weights = model(x)\n",
"\n",
"head_view(attn_weights, start_texts)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,2 @@
+++如梦令
昨夜雨疏风骤。浓睡不消残酒。试问卷帘人,却道海棠依旧。知否。知否。应是绿肥红瘦。

View File

@@ -0,0 +1,3 @@
+++鹧鸪天(秋思)
红蓼烟收尽暮鸦。秋风吹叶荻花秋。断肠天色连云散,黄叶荻花秋水流。
楼上角,笛声悠。兴王莫上叹人头。明朝归去无消息,只有当时一望流。

75
hw4/code/dataset.py Normal file
View File

@@ -0,0 +1,75 @@
import torch
from torch.utils.data import Dataset
import numpy as np
import os
import json
class LMDataset(Dataset):
    """Dataset of text samples loaded from ``<data_dir>/<split>.json``.

    The JSON file must provide the keys ``data``, ``stoi``, ``itos`` and
    ``vocab_size``; each of them is exposed as an attribute of the same name.
    """

    def __init__(self, data_dir, split):
        super().__init__()
        meta_path = os.path.join(data_dir, f'{split}.json')
        with open(meta_path, 'r', encoding='utf-8') as fp:
            meta = json.load(fp)
        # list of samples
        self.data = meta['data']
        # dict mapping a character to its integer id
        self.stoi = meta['stoi']
        # dict mapping the *string* form of an integer id back to its character
        self.itos = meta['itos']
        # vocabulary size stored alongside the data
        self.vocab_size = meta['vocab_size']

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at ``index`` unchanged."""
        return self.data[index]
class Converter:
    """Translate between strings and integer token ids.

    Id ``0`` denotes the padding token '<pad>' and id ``1`` denotes the
    end-of-sequence token '<eos>'.
    """

    def __init__(self, stoi, itos):
        self.stoi = stoi  # dict: character -> integer id
        self.itos = itos  # dict: string form of an integer id -> character

    def single_encode(self, s):
        """Encode one string as an int64 numpy array of token ids."""
        return np.array([self.stoi[ch] for ch in s], dtype=np.int64)

    def single_decode(self, l):
        """Decode a sequence of ids into a string, stopping at the first '<eos>' (id 1)."""
        pieces = []
        for token in l:
            if token == 1:
                # end of sequence reached: ignore everything after it
                break
            # itos is keyed by the string form of the id
            pieces.append(self.itos[str(token)])
        return ''.join(pieces)

    def encode(self, data):
        '''
        Encode a list of strings into a pair of id tensors ``(x, y)``.

        Every string is followed by '<eos>' (1) and padded with '<pad>' (0) up
        to the longest string plus one. ``x`` drops the last column and ``y``
        drops the first one, so ``y`` is ``x`` shifted left by one position.
        '''
        longest = max(map(len, data))
        padded = np.zeros((len(data), longest + 1), dtype=np.int64)
        for row, text in enumerate(data):
            size = len(text)
            padded[row, :size] = self.single_encode(text)
            padded[row, size] = 1
        inputs = torch.from_numpy(padded[:, :-1])
        targets = torch.from_numpy(padded[:, 1:])
        return inputs, targets

    def decode(self, data):
        '''
        Decode a batch tensor of ids into a list of strings (one per row).
        '''
        rows = data.cpu().numpy().astype(np.int64)
        return [self.single_decode(row) for row in rows]

356
hw4/code/model.py Normal file
View File

@@ -0,0 +1,356 @@
# ========================================================
# Media and Cognition
# Homework 4 Sequence Modeling
# model.py - Model definition
# Student ID:
# Name:
# Tsinghua University
# (C) Copyright 2024
# ========================================================
# Import required libraries
############################################################
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
############################################################
# Define the GELU activation function used in OpenAI GPT
############################################################
def gelu(z):
    """
    Tanh approximation of the Gaussian Error Linear Unit.

    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    gelu(z) = 0.5 * z * (1 + tanh[(2/π)^(1/2) * (z + 0.044715 * z^3)])
    """
    scale = math.sqrt(2.0 / math.pi)
    inner = z + 0.044715 * torch.pow(z, 3.0)
    return 0.5 * z * (1.0 + torch.tanh(scale * inner))
############################################################
# Define the Multi-Head SelfAttention module
############################################################
class SelfAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        embed_dim: size of the input/output embedding (last dimension of x).
        num_head: number of attention heads; must divide ``embed_dim``.
        dropout: dropout ratio applied to the attention weights and to the
            projected output.
    """

    def __init__(self, embed_dim, num_head, dropout):
        super().__init__()
        if embed_dim % num_head != 0:
            # the reshape in forward() splits embed_dim evenly across heads
            raise ValueError(
                "embed_dim (%d) must be divisible by num_head (%d)" % (embed_dim, num_head))
        # define three linear layers for q, k, v generation separately
        self.q_layer = nn.Linear(embed_dim, embed_dim)
        self.k_layer = nn.Linear(embed_dim, embed_dim)
        self.v_layer = nn.Linear(embed_dim, embed_dim)
        # define the projection layer for output
        self.proj_layer = nn.Linear(embed_dim, embed_dim)
        # define the dropout layers for attention and output calculation
        self.attn_drop = nn.Dropout(dropout)
        self.proj_drop = nn.Dropout(dropout)
        self.num_head = num_head
        self.head_dim = embed_dim // num_head

    def forward(self, x):
        """Apply causal multi-head attention.

        Args:
            x: tensor of shape (batch_size, seq_len, embed_dim).

        Returns:
            result: tensor of shape (batch_size, seq_len, embed_dim).
            attn: attention weights (after softmax and attention dropout) of
                shape (batch_size, num_head, seq_len, seq_len).
        """
        batch_size, seq_len, dim = x.shape

        def split_heads(t):
            # (batch, seq, num_head * head_dim) -> (batch, num_head, seq, head_dim)
            return t.reshape(batch_size, seq_len, self.num_head, self.head_dim).transpose(1, 2)

        # Step 1.1 / 1.2: project to q, k, v and split into heads
        q = split_heads(self.q_layer(x))
        k = split_heads(self.k_layer(x))
        v = split_heads(self.v_layer(x))

        # Step 1.3.1: scaled dot product, shape (batch, num_head, seq, seq)
        attn = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Step 1.3.2: causal mask - True strictly above the diagonal, i.e. at
        # the positions a query must not attend to (later tokens)
        attn_mask = torch.ones(seq_len, seq_len, device=x.device)
        attn_mask = torch.triu(attn_mask, diagonal=1).bool()
        attn = attn.masked_fill(attn_mask, float('-inf'))

        # Step 1.3.3 / 1.3.4: normalize over the key dimension, then dropout
        attn = F.softmax(attn, dim=-1)
        attn = self.attn_drop(attn)

        # Step 1.3.5: weighted sum of values, shape (batch, num_head, seq, head_dim)
        out = torch.matmul(attn, v)

        # Step 1.4: concatenate the heads back to (batch, seq, num_head * head_dim)
        out = out.transpose(1, 2).reshape(batch_size, seq_len, dim)

        # Step 1.5: output projection followed by dropout
        result = self.proj_drop(self.proj_layer(out))

        # return the final results `result` and attention weights `attn`
        return result, attn
############################################################
# Define the feed forward network (FFN)
############################################################
class FFN(nn.Module):
    """Position-wise feed forward network: Linear -> GELU -> Linear -> Dropout."""

    def __init__(self, embed_dim, feedforward_dim, dropout):
        super().__init__()
        # expand to the hidden width, then project back to the embedding width
        self.fc1 = nn.Linear(embed_dim, feedforward_dim)
        self.fc2 = nn.Linear(feedforward_dim, embed_dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return Dropout(fc2(gelu(fc1(x))))."""
        hidden = gelu(self.fc1(x))
        return self.drop(self.fc2(hidden))
############################################################
# Define the TransformerLayer
############################################################
class TransformerLayer(nn.Module):
    """Pre-norm transformer block: self-attention followed by a feed forward network.

    Args:
        embed_dim: embedding dimension.
        num_head: number of attention heads.
        feedforward_dim: hidden dimension of the feed forward network.
        dropout: dropout ratio passed to both sub-modules.
        no_res: if True, the residual connections are removed.
    """

    def __init__(self, embed_dim, num_head, feedforward_dim, dropout, no_res):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = SelfAttention(embed_dim, num_head, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, feedforward_dim, dropout)
        self.no_res = no_res  # True disables the residual connections

    def forward(self, x):
        """Run the block on ``x``.

        Returns:
            out: the block output, same shape as ``x``.
            attn: the attention weights returned by the attention module.
        """
        # Step 2.1: multi-head self-attention on the normalized input
        x_attn, attn = self.attn(self.norm1(x))
        if not self.no_res:
            x_attn = x + x_attn

        # Step 2.2: feed forward network on the normalized attention output
        x_ffn = self.ffn(self.norm2(x_attn))
        if not self.no_res:
            out = x_attn + x_ffn
        else:
            out = x_ffn
        return out, attn
############################################################
# Define the GPT module
############################################################
class GPT(nn.Module):
    """Decoder-only transformer language model."""

    def __init__(self, vocab_size, max_seq_len, num_layer, embed_dim, num_head, feedforward_dim, dropout, no_res=False, no_pos=False):
        '''
        vocab_size: the size of vocabulary
        max_seq_len: the maximum length of input texts
        num_layer: the number of transformer layers
        embed_dim: the embedding dimension
        num_head: the number of heads in Multi-Head Self Attention
        feedforward_dim: the dimension in the feed forward network
        dropout: dropout ratio
        no_res: if True, residual connections in the transformer layers are disabled
        no_pos: if True, position embeddings are not added to the token embeddings
        '''
        super().__init__()
        self.num_layer = num_layer
        self.max_seq_len = max_seq_len
        self.no_pos = no_pos
        # Define Embedding Layer to transfer input text tokens and positions to embeddings
        self.word_token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.word_pos_embedding = nn.Embedding(max_seq_len, embed_dim)
        self.drop = nn.Dropout(dropout)
        # Define the transformer layers
        self.transformer = nn.ModuleList([TransformerLayer(embed_dim, num_head, feedforward_dim, dropout, no_res) for _ in range(num_layer)])
        # Define the head layer to predict output
        self.norm = nn.LayerNorm(embed_dim)
        self.language_model_head = nn.Linear(embed_dim, vocab_size, bias=False)
        # Weight tying: the token embedding and the output head share one
        # weight tensor. Reference: https://paperswithcode.com/method/weight-tying
        self.word_token_embedding.weight = self.language_model_head.weight
        self.init_weights()

    def init_weights(self):
        """Initialize Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('proj_layer.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * self.num_layer))

    def forward(self, word_idx, targets=None):
        """Compute next-token logits.

        Args:
            word_idx: LongTensor of token ids with shape (batch_size, seq_len).
            targets: optional LongTensor of target ids with the same shape;
                positions equal to 0 are ignored by the loss.

        Returns:
            ``(logits, loss)`` when ``targets`` is given, otherwise
            ``(logits, attention_weights)`` where ``attention_weights`` is a
            list holding the attention weights of every transformer layer.

        Raises:
            ValueError: if position embeddings are used and ``seq_len``
                exceeds ``max_seq_len``.
        """
        batch_size, seq_len = word_idx.shape

        # Step 3.2 / 3.3: token embeddings, plus position embeddings unless disabled
        x = self.word_token_embedding(word_idx)
        if not self.no_pos:
            if seq_len > self.max_seq_len:
                # the position table only has max_seq_len rows
                raise ValueError(
                    "sequence length %d exceeds max_seq_len %d" % (seq_len, self.max_seq_len))
            # Step 3.1: position sequence [0, 1, ..., seq_len-1]
            pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)
            # (seq_len, embed_dim) broadcasts over the batch dimension
            x = x + self.word_pos_embedding(pos)
        x = self.drop(x)

        # Step 3.4: run the transformer layers, collecting their attention weights
        attention_weights = []
        for layer in self.transformer:
            x, attn = layer(x)
            attention_weights.append(attn)

        # Step 3.5: final normalization and projection to vocabulary logits
        # Note: no softmax here since it is included in the cross entropy loss
        logits = self.language_model_head(self.norm(x))

        # return logits and loss or attention weights
        if targets is not None:
            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=0)
            return logits, loss
        return logits, attention_weights

    def configure_optimizers(self, weight_decay):
        """
        Split the parameters into two optimizer groups and return them.

        Weights of nn.Linear modules receive ``weight_decay``; biases and the
        weights of LayerNorm/Embedding modules receive no decay. The return
        value is the list of parameter-group dicts (not an optimizer); the
        caller passes it to the optimizer constructor.
        """
        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (nn.Linear, )
        blacklist_weight_modules = (nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, _ in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # named_modules and named_parameters are both recursive, so the
                # same tensor is visited several times; only the visit through
                # its direct parent module matches one of the type checks below
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)
        # subtle: 'word_token_embedding.weight' and 'language_model_head.weight'
        # are the same tied tensor, so after the loop it sits in both sets.
        # self.named_parameters() does not return duplicates and reports it
        # under 'word_token_embedding.weight' only, so drop the head's name
        # here: the tensor is then optimized once, without weight decay.
        decay.remove('language_model_head.weight')
        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
            % (str(param_dict.keys() - union_params), )
        # build the parameter groups
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(decay)], "weight_decay": weight_decay},
            {"params": [param_dict[pn] for pn in sorted(no_decay)], "weight_decay": 0.0},
        ]
        return optim_groups

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        Returns the full sequence (prompt plus sampled tokens) as a numpy
        array after ``squeeze()``.
        """
        for _ in range(max_new_tokens):
            # crop the context to the last max_seq_len tokens so the position
            # embedding lookup stays in range once the sequence grows long
            idx_cond = idx if idx.size(1) <= self.max_seq_len else idx[:, -self.max_seq_len:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
        return idx.squeeze().cpu().numpy()
############################################################
# Named model presets. Each entry holds the architecture keyword arguments of
# GPT that do not depend on the dataset; vocab_size and max_seq_len are not
# part of a preset and must be supplied by the caller.
GPTConfig = {
    'mygpt': {
        'num_layer': 4,
        'embed_dim': 128,
        'num_head': 4,
        'feedforward_dim': 128 * 4,
        'dropout': 0.0,
    },
    'gpt2-mini': {
        'num_layer': 6,
        'embed_dim': 384,
        'num_head': 6,
        'feedforward_dim': 384 * 4,
        'dropout': 0.2,
    },
    'gpt2': {
        'num_layer': 12,
        'embed_dim': 768,
        'num_head': 12,
        'feedforward_dim': 768 * 4,
        'dropout': 0.2,
    },
}

61
hw4/code/prepare.py Normal file
View File

@@ -0,0 +1,61 @@
"""
Prepare the dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
"""
import os
import numpy as np
import argparse
import json
parser = argparse.ArgumentParser()
parser.add_argument('--data_root', type=str, default='data/quansongci', help='data directory')
args = parser.parse_args()

# read the raw corpus: the 'data' entry of data.json holds the samples
input_file_path = os.path.join(args.data_root, 'data.json')
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)['data']
print(f"length of dataset: {len(data):,}")

# collect every distinct character of the corpus in sorted order
chars = sorted(set(''.join(data)))
vocab_size = len(chars) + 2  # two extra ids for <pad> and <eos>
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# character ids start at 2; ids 0 and 1 are reserved for <pad> and <eos>
stoi = {ch: idx for idx, ch in enumerate(chars, start=2)}
itos = {idx: ch for idx, ch in enumerate(chars, start=2)}
stoi.update({'<pad>': 0, '<eos>': 1})
itos.update({0: '<pad>', 1: '<eos>'})

# first 90% of the samples for training, the rest for validation
n = len(data)
split_at = int(n * 0.9)
train_data, val_data = data[:split_at], data[split_at:]
print(f"train has {len(train_data):,} samples")
print(f"val has {len(val_data):,} samples")

# write one JSON file per split; each carries the vocabulary next to its
# samples so that encoding/decoding is possible later
for split_name, split_data in (('train', train_data), ('val', val_data)):
    split_meta = {
        'data': split_data,
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }
    with open(os.path.join(args.data_root, f'{split_name}.json'), 'w', encoding='utf-8') as f:
        json.dump(split_meta, f, ensure_ascii=False, indent=4)

76
hw4/code/sample.py Normal file
View File

@@ -0,0 +1,76 @@
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch
from model import GPTConfig, GPT
import argparse
from dataset import Converter, LMDataset
def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root, device):
    """Load ``best.pth`` from ``ckpt_path`` and print ``num_samples`` generated texts.

    Args:
        start: prompt string every sample starts with.
        num_samples: number of texts to generate.
        max_new_tokens: number of tokens generated after the prompt.
        model_name: key into GPTConfig, used when the checkpoint carries no 'model_args'.
        ckpt_path: directory containing 'best.pth'.
        data_root: directory of the training data; its 'train' split supplies the vocabulary.
        device: device string or torch.device, e.g. 'cpu', 'cuda' or 'cuda:0'.
    """
    dataset = LMDataset(data_root, 'train')
    converter = Converter(dataset.stoi, dataset.itos)
    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
    # model
    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    # autocast needs the bare device type ('cuda'), not an indexed string such as 'cuda:0'
    device_type = torch.device(device).type
    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)
    # init from a model saved in a specific directory
    ckpt_path = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_path)
    checkpoint = torch.load(ckpt_path, map_location=device)
    gptconf = GPTConfig[model_name]
    if 'model_args' in checkpoint:
        # prefer the exact arguments the checkpointed model was built with
        gptconf = checkpoint['model_args']
    model = GPT(**gptconf)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    model.to(device)
    # encode the beginning of the prompt
    start_ids = converter.single_encode(start)
    x = torch.from_numpy(start_ids)[None, ...].to(device).long()
    # run generation
    with torch.no_grad():
        with ctx:
            for _ in range(num_samples):
                y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
                print(converter.single_decode(y))
                print('---------------')
if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
    # set configurations of the model and sampling process
    parser = argparse.ArgumentParser()
    parser.add_argument('--start', type=str, default='+++', help='start of the sample, e.g. "+++" or "+++清平乐"')
    # the default is an int to agree with type=int (it used to be the string '10')
    parser.add_argument('--num_samples', type=int, default=10, help='the number of samples')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to load checkpoints')
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--device', type=str, help='cpu or cuda')
    opt = parser.parse_args()
    if opt.device is None:
        # fall back to the GPU when one is available
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # the prompt and the generated tokens together span 128 tokens
    sample(opt.start, opt.num_samples, 128-len(opt.start), opt.model_name, opt.ckpt_path, opt.data_root, opt.device)

219
hw4/code/train.py Normal file
View File

@@ -0,0 +1,219 @@
import os
import time
import math
import pickle
from contextlib import nullcontext
import argparse
import numpy as np
import torch
from torch.utils.data import DataLoader
from model import GPT, GPTConfig
from dataset import LMDataset, Converter
import matplotlib.pyplot as plt
# learning rate decay scheduler (cosine with warmup)
def get_lr(it, learning_rate, min_lr=1e-4, warmup_iters=100, lr_decay_iters=6000):
    """Return the learning rate for iteration ``it`` (linear warmup, then cosine decay).

    - ``it < warmup_iters``: linear ramp ``learning_rate * it / warmup_iters``.
    - ``it > lr_decay_iters``: constant ``min_lr``.
    - otherwise: cosine interpolation from ``learning_rate`` down to ``min_lr``.
    """
    if warmup_iters <= it <= lr_decay_iters:
        # fraction of the decay phase completed so far, in [0, 1]
        progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
        assert 0 <= progress <= 1
        # cosine weight going from 1 (start of decay) to 0 (end of decay)
        weight = 0.5 * (1.0 + math.cos(math.pi * progress))
        return min_lr + weight * (learning_rate - min_lr)
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    return min_lr
def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, device='cpu', no_res=False, no_pos=False):
    """Train a GPT model from scratch and write checkpoints and loss curves.

    Args:
        data_root: directory holding the 'train' and 'val' splits.
        model_name: key into GPTConfig selecting the architecture preset.
        batch_size: training batch size.
        n_iters: total number of optimizer steps.
        ckpt_path: directory receiving 'latest.pth', 'best.pth' and the plot.
        val_interval: run validation and checkpointing every this many steps.
        device: device string or torch.device, e.g. 'cpu', 'cuda' or 'cuda:0'.
        no_res: if True, build the model without residual connections.
        no_pos: if True, build the model without position embeddings.
    """
    batch_size = int(batch_size)
    train_dataset = LMDataset(data_root, 'train')
    val_dataset = LMDataset(data_root, 'val')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    converter = Converter(train_dataset.stoi, train_dataset.itos)

    # adamw optimizer
    learning_rate = 5e-3 # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.99
    grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0

    # system
    # autocast needs the bare device type ('cuda'), not an indexed string such as 'cuda:0'
    device_type = torch.device(device).type
    dtype = 'bfloat16' if device_type == 'cpu' else 'float16' # float16 enables the GradScaler below
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)

    best_val_loss = 1e9
    iter_num = 0 # number of iterations in the lifetime of this process

    # model init: copy the preset so the shared GPTConfig entry is not mutated
    model_args = dict(GPTConfig[model_name])
    model_args['vocab_size'] = train_dataset.vocab_size
    model_args['max_seq_len'] = 128
    model_args['no_res'] = no_res
    model_args['no_pos'] = no_pos
    # init a new model from scratch
    print("Initializing a new model from scratch")
    model = GPT(**model_args)
    model.to(device)

    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

    # optimizer
    optim_groups = model.configure_optimizers(weight_decay)
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))

    print('training...')
    # enough passes over the training set to reach n_iters optimizer steps
    epoch_num = math.ceil(n_iters * batch_size / float(len(train_dataset)))
    t0 = time.time()
    model.train()
    train_losses = []
    val_losses = []
    for epoch in range(epoch_num):
        if iter_num >= n_iters:
            break
        for inputs in train_loader:
            if iter_num >= n_iters:
                break

            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)

            # set the scheduled learning rate for this step
            lr = get_lr(iter_num, learning_rate, lr_decay_iters=n_iters)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # forward pass (under autocast when not on cpu)
            with ctx:
                logits, loss = model(X, Y)
            # backward pass, with gradient scaling if training in fp16
            scaler.scale(loss).backward()
            # clip the gradient
            if grad_clip != 0.0:
                # gradients must be unscaled before their norm is clipped
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            # step the optimizer and scaler if training in fp16
            scaler.step(optimizer)
            scaler.update()
            # flush the gradients as soon as we can, no need for this memory anymore
            optimizer.zero_grad(set_to_none=True)

            iter_num += 1
            train_losses.append(loss.item())

            # evaluate the loss on the val set and write checkpoints
            if iter_num % val_interval == 0:
                # timing and logging
                t1 = time.time()
                dt = t1 - t0
                t0 = t1
                lossf = loss.item()
                print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
                losses = estimate_loss(model, val_loader, converter, ctx, device)
                val_losses.append(losses['val'])
                print(f"iter {iter_num}: val loss {losses['val']:.4f}")

                # update the running best first so that the checkpoints record
                # the up-to-date value instead of the one from the previous round
                is_best = losses['val'] < best_val_loss
                if is_best:
                    best_val_loss = losses['val']

                print(f"saving latest checkpoint to {ckpt_path}")
                checkpoint = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                }
                torch.save(checkpoint, os.path.join(ckpt_path, 'latest.pth'))
                if is_best:
                    print(f"saving best checkpoint to {ckpt_path}")
                    torch.save(checkpoint, os.path.join(ckpt_path, 'best.pth'))
    plot(n_iters, train_losses, val_losses, val_interval, ckpt_path)
def plot(n_iters, train_losses, val_losses, val_interval, ckpt_path):
    """Plot loss and perplexity curves and save them as 'loss&perplexity.jpg' in ``ckpt_path``.

    Args:
        n_iters: total number of training iterations (kept for interface
            compatibility; the x positions are derived from ``val_losses``).
        train_losses: one training loss per iteration.
        val_losses: one validation loss per validation round.
        val_interval: number of iterations between two validation rounds.
        ckpt_path: directory the figure is written to.
    """
    # create a plot
    f, ax = plt.subplots(1, 2, figsize=(18, 6))
    # the k-th validation round runs after k * val_interval iterations; sizing
    # the axis from val_losses keeps x and y the same length even when n_iters
    # is not a multiple of val_interval
    val_iters = np.arange(1, len(val_losses) + 1) * val_interval
    # draw loss
    ax[0].plot(train_losses)
    ax[0].plot(val_iters, val_losses, 'r')
    # set labels
    ax[0].set_xlabel('training iters')
    ax[0].legend(['training loss', 'validation loss'])
    # perplexity is the exponential of the loss
    train_perplexity = [np.exp(x) for x in train_losses]
    val_perplexity = [np.exp(x) for x in val_losses]
    # draw perplexity
    ax[1].plot(train_perplexity)
    ax[1].plot(val_iters, val_perplexity, 'r')
    # set labels
    ax[1].set_xlabel('training iters')
    ax[1].legend(['training perplexity', 'validation perplexity'])
    plt.tight_layout()
    # save, then show the image
    plt.savefig(os.path.join(ckpt_path, 'loss&perplexity.jpg'), dpi=300)
    plt.show()
# estimates the validation loss by averaging over a bounded number of batches
@torch.no_grad()
def estimate_loss(model, val_loader, converter, ctx, device):
    """Return ``{'val': mean loss}`` over at most 100 batches of ``val_loader``.

    The model is switched to eval mode for the measurement and back to train
    mode afterwards. The mean is taken over the batches actually evaluated, so
    a loader with fewer than 100 batches is not under-reported. If the loader
    yields no batch at all the result is NaN.
    """
    max_iters = 100
    model.eval()
    total_loss = 0.0
    n_batches = 0
    for inputs in val_loader:
        if n_batches >= max_iters:
            break
        X, Y = converter.encode(inputs)
        X, Y = X.to(device), Y.to(device)
        with ctx:
            _, loss = model(X, Y)
        total_loss += loss.item()
        n_batches += 1
    model.train()
    mean_loss = total_loss / n_batches if n_batches else float('nan')
    return {'val': mean_loss}
if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
    # set configurations of the model and training process
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model configuration')
    # --iters counts optimizer steps, not epochs
    parser.add_argument('--iters', type=int, default=1000, help='number of training iterations')
    parser.add_argument('--batchsize', type=int, default=16, help='training batch size')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to save checkpoints')
    parser.add_argument('--val_interval', type=int, default=20, help='iter intervals of validation')
    # both switches turn a feature OFF when given
    parser.add_argument('--no_res', action='store_true', help='disable residual connections')
    parser.add_argument('--no_pos', action='store_true', help='disable position embeddings')
    parser.add_argument('--device', type=str, help='cpu or cuda')
    opt = parser.parse_args()
    if opt.device is None:
        # fall back to the GPU when one is available
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    os.makedirs(opt.ckpt_path, exist_ok=True)
    train(opt.data_root, opt.model_name, opt.batchsize, opt.iters, opt.ckpt_path, opt.val_interval, opt.device, opt.no_res, opt.no_pos)

132
hw4/report/dtx-style.sty Normal file
View File

@@ -0,0 +1,132 @@
%%
%% This is file `dtx-style.sty',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% thucoursework.dtx (with options: `dtx-style')
%%
%% This is a generated file.
%%
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
%%
%% This work may be distributed and/or modified under the
%% conditions of the LaTeX Project Public License, either version 1.3
%% of this license or (at your option) any later version.
%% The latest version of this license is in
%% http://www.latex-project.org/lppl.txt
%% and version 1.3 or later is part of all distributions of LaTeX
%% version 2005/12/01 or later.
%%
%% To produce the documentation run the original source files ending with `.dtx'
%% through LaTeX.
%%
\ProvidesPackage{dtx-style}
\RequirePackage{hypdoc}
\RequirePackage[UTF8,scheme=chinese]{ctex}
\RequirePackage{newpxtext}
\RequirePackage{newpxmath}
\RequirePackage[
top=2.5cm, bottom=2.5cm,
left=4cm, right=2cm,
headsep=3mm]{geometry}
\RequirePackage{array,longtable,booktabs}
\RequirePackage{listings}
\RequirePackage{fancyhdr}
\RequirePackage{xcolor}
\RequirePackage{enumitem}
\RequirePackage{etoolbox}
\RequirePackage{metalogo}
%% Colours used when printing macro names, environment names and options.
\colorlet{thu@macro}{blue!60!black}
\colorlet{thu@env}{blue!70!black}
\colorlet{thu@option}{purple}
%% Patch the name-printing commands: \MacroFont is replaced by
%% \MacroFont\bfseries plus the matching colour defined above.  The two
%% trailing empty groups are the (unused) success and failure hooks.
%% NOTE(review): the \Print... commands are not defined in this file;
%% presumably they come from the doc package loaded through hypdoc -- confirm.
\patchcmd{\PrintMacroName}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
\patchcmd{\PrintDescribeMacro}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
\patchcmd{\PrintDescribeEnv}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
\patchcmd{\PrintEnvName}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
%% \DescribeOption{<option>}: print <option> in the margin and add it to the
%% index under the category "option".
%% \DescribeOption opens a group and calls \MakePrivateLetters before
%% \Describe@Option reads the argument; \Describe@Option closes that group.
%% \PrintDescribeOption formats the marginal text (bold sans-serif, option
%% colour).
\def\DescribeOption{%
\leavevmode\@bsphack\begingroup\MakePrivateLetters%
\Describe@Option}
\def\Describe@Option#1{\endgroup
\marginpar{\raggedleft\PrintDescribeOption{#1}}%
\thu@special@index{option}{#1}\@esphack\ignorespaces}
\def\PrintDescribeOption#1{\strut \MacroFont\bfseries\sffamily\color{thu@option} #1\ }
%% \thu@special@index{<category>}{<name>}: write two \index entries for
%% <name>: one keyed on <name> itself with "(<category>)" appended, and one
%% filed as a sub-entry of "<category>:".  Inside the group, "\encapchar usage"
%% is redefined so that both entries carry the current value of the
%% HD@hypercount counter.
%% NOTE(review): \HD@target, \encapchar, \actualchar, \levelchar and the
%% HD@hypercount counter are not defined in this file; presumably they are
%% provided by hypdoc/doc -- confirm.
\def\thu@special@index#1#2{\@bsphack
\begingroup
\HD@target
\let\HDorg@encapchar\encapchar
\edef\encapchar usage{%
\HDorg@encapchar hdclindex{\the\c@HD@hypercount}{usage}%
}%
\index{#2\actualchar{\string\ttfamily\space#2}
(#1)\encapchar usage}%
\index{#1:\levelchar#2\actualchar
{\string\ttfamily\space#2}\encapchar usage}%
\endgroup
\@esphack}
%% Base listings style inherited by lstStyleShell and lstStyleLaTeX:
%% small typewriter font, no line numbers, automatic line breaking,
%% flexible columns and a very light grey background.
%% NOTE(review): gobble=2 presumably strips the first two characters of every
%% listing line, so listings are expected to be indented by two -- confirm.
\lstdefinestyle{lstStyleBase}{%
basicstyle=\small\ttfamily,
aboveskip=\medskipamount,
belowskip=\medskipamount,
lineskip=0pt,
boxpos=c,
showlines=false,
extendedchars=true,
upquote=true,
tabsize=2,
showtabs=false,
showspaces=false,
showstringspaces=false,
numbers=none,
linewidth=\linewidth,
xleftmargin=4pt,
xrightmargin=0pt,
resetmargins=false,
breaklines=true,
breakatwhitespace=false,
breakindent=0pt,
breakautoindent=true,
columns=flexible,
keepspaces=true,
gobble=2,
framesep=3pt,
rulesep=1pt,
framerule=1pt,
backgroundcolor=\color{gray!5},
stringstyle=\color{green!40!black!100},
keywordstyle=\bfseries\color{blue!50!black},
commentstyle=\slshape\color{black!60}}
%% Shell listings: base style, bash highlighting, purple rule on the left.
\lstdefinestyle{lstStyleShell}{%
style=lstStyleBase,
frame=l,
rulecolor=\color{purple},
language=bash}
%% LaTeX listings: base style, LaTeX highlighting, violet rule on the left.
\lstdefinestyle{lstStyleLaTeX}{%
style=lstStyleBase,
frame=l,
rulecolor=\color{violet},
language=[LaTeX]TeX}
%% Environments "latex" and "shell" select the corresponding style.
\lstnewenvironment{latex}{\lstset{style=lstStyleLaTeX}}{}
\lstnewenvironment{shell}{\lstset{style=lstStyleShell}}{}
%% Remove the extra vertical spacing from all lists.
\setlist{nosep}
%% Inline markup: \option{..} is set in sans-serif, \env{..} in typewriter.
\DeclareDocumentCommand{\option}{m}{\textsf{#1}}
\DeclareDocumentCommand{\env}{m}{\texttt{#1}}
%% \pkg{..} and \file{..} print their argument in typewriter and index it
%% under "package" / "file"; the starred forms print without indexing.
\DeclareDocumentCommand{\pkg}{s m}{%
\texttt{#2}\IfBooleanF#1{\thu@special@index{package}{#2}}}
\DeclareDocumentCommand{\file}{s m}{%
\texttt{#2}\IfBooleanF#1{\thu@special@index{file}{#2}}}
%% \myentry{..}: bold purple marginal note.
\newcommand{\myentry}[1]{%
\marginpar{\raggedleft\color{purple}\bfseries\strut #1}}
%% \note[<label>]{<text>}: magenta remark; the bold label defaults to "Note",
%% the text is emphasised.
\newcommand{\note}[2][Note]{{%
\color{magenta}{\bfseries #1}\emph{#2}}}
%% Logo for the package name, with a discretionary hyphen between the parts.
\def\thucoursework{\textsc{Thu}\-\textsc{Coursework}}

153
hw4/report/iidef.sty Normal file
View File

@@ -0,0 +1,153 @@
%%
%% This is file `iidef.sty',
%% generated with the docstrip utility.
%%
%% The original source files were:
%%
%% thucoursework.dtx (with options: `sty')
%%
%% This is a generated file.
%%
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
%%
%% This work may be distributed and/or modified under the
%% conditions of the LaTeX Project Public License, either version 1.3
%% of this license or (at your option) any later version.
%% The latest version of this license is in
%% http://www.latex-project.org/lppl.txt
%% and version 1.3 or later is part of all distributions of LaTeX
%% version 2005/12/01 or later.
%%
%% To produce the documentation run the original source files ending with `.dtx'
%% through LaTeX.
%%
\NeedsTeXFormat{LaTeX2e}[1999/12/01]
%% iidef.sty is a package (it is loaded with \usepackage), so it identifies
%% itself with \ProvidesPackage; the log then lists it as a package.
\ProvidesPackage{iidef}
[2020/09/09 2.6 Tsinghua University Coursework Template]
%% configuration of nested enumerate env
\RequirePackage{enumitem}
%% set hwcount key-value option
\RequirePackage{kvoptions}
%% required by macro DeclareMathOperator
\RequirePackage{amsmath}
%% Set up page headers using fancyhdr.
%% If \lhead is undefined, load fancyhdr here; otherwise define the marker
%% \@thulhead, which is tested later in this file to skip installing the
%% package's own "runningpage" header.
%% NOTE(review): \lhead being defined is taken to mean that the document has
%% already loaded fancyhdr itself -- confirm.
\@ifundefined{lhead}{\RequirePackage{fancyhdr}}
{\def\@thulhead{thulhead}}
\RequirePackage{amsthm}
%% Document metadata.  Each internal macro \@xxx holds a placeholder default
%% and takes NO argument; the public setter redefines it with the user's text.
%% semester: \theterm{<text>}
\def\@term{term}
\newcommand{\theterm}[1]{\renewcommand\@term{#1}}
%% institute: \thecourseinstitute{<text>}
%% The default is declared without a parameter because \@courseinstitute is
%% always used bare (e.g. "\@courseinstitute\\"); a one-argument default would
%% swallow the token that follows it.
\newcommand{\@courseinstitute}{institute}
\newcommand{\thecourseinstitute}[1]{\renewcommand\@courseinstitute{#1}}
%% coursename: \thecoursename{<text>} (stored wrapped in \textsc)
\newcommand{\@coursename}{coursename}
\newcommand{\thecoursename}[1]{\renewcommand\@coursename{\textsc{#1}}}
%% user can rewrite homework name: \hwname{<text>}
\def\@hwname{Homework}
\def\hwname#1{\renewcommand\@hwname{#1}}
%% Package option "thehwcnt=<n>" (homework number, default 1).  The value is
%% stored in \iidef@thehwcnt and exposed to the document as \thehwcnt.
%% \iidef@thehwcnt = 1
\DeclareStringOption[1]{thehwcnt}
\ProcessKeyvalOptions*
\def\thehwcnt{\iidef@thehwcnt}
%% page header setup, distinguish between first page(plain style)
%% and second page on (runningpage style)
%%***************************************************************************
%% \courseheader: typeset the title block of the first page -- institute,
%% course name and term centred above a horizontal rule, followed by the
%% underlined homework name and number.  The block is pulled up by 1in and the
%% page uses the "plain" page style.
\newcommand{\courseheader}{
\thispagestyle{plain}%first page use native plain style to suppress header
\vspace*{-1in}
\begin{center}
\@courseinstitute\\
\@coursename\\
\@term
\vspace*{0.1in}
\hrule
\end{center}
\begin{center}
\underline{\bf \@hwname\;\thehwcnt} \\
\end{center}
}
%% Install the "runningpage" style (course name on the left, institute on the
%% right of the header) only when \@thulhead is undefined, i.e. when this
%% package loaded fancyhdr itself earlier in the file.  If \@thulhead is
%% defined, nothing is done and the document's own header setup is kept.
\@ifundefined{@thulhead}{
\fancypagestyle{runningpage}
{
\fancyhead[L]{\small\@coursename}
\fancyhead[R]{\small\@courseinstitute}
}
%% use runningpage style from second page on
\pagestyle{runningpage}
}{}
%% *********************************************************************************************
%%name command macro
%%*************************
%% \name{<author>}: print <author> flush left and today's date flush right on
%% one line, then a horizontal rule and 2em of vertical space.
%% NOTE(review): the trailing \flushleft is never closed, so it appears to
%% leave the text that follows left-aligned -- confirm this is intended.
\newcommand{\name}[1]{
\begin{flushleft}
#1\hfill
\today
\end{flushleft}
\hrule
\vspace{2em}
\flushleft
}
%%*************************
%% enumitem related configuration
%% Labels for nested enumerate lists:
%%   level 1: <homework number>.<arabic>.   level 2: (<lowercase letter>)
%%   level 3: <lowercase roman>.            level 4: <greek letter>
%% NOTE(review): \greek is not defined in this file; presumably the document
%% must load a package that provides it (e.g. moreenum) -- confirm.
\setlist[enumerate,1]{label=\thehwcnt.\arabic*.}
\setlist[enumerate,2]{label=(\alph*)}
\setlist[enumerate,3]{label=\roman*.}
\setlist[enumerate,4]{label=\greek*}
%%******************************
%% Heading used by the solution environment; change it with \slname{<text>}.
\def\@slname{Solution}
\def\slname#1{\renewcommand\@slname{#1}}
%% Define the "solution" environment unless one already exists.  It is built
%% on amsthm's proof environment, with \@slname as the heading and the
%% end-of-proof symbol emptied.
\@ifundefined{solution}{
\newenvironment{solution}
{
\proof[\@slname]
}
{
%% no qed symbol in solution env
\renewcommand{\qedsymbol}{}
\endproof
}
}{}
%%******************************
%%common math symbols go here
%%*************************************************
%% NOTE(review): this redefines \v, which in standard LaTeX is the caron
%% (hacek) text accent -- confirm that losing the accent is acceptable.
\def\v#1{\underline{#1}}
%% Underlined letters denote vectors.
\newcommand{\uc}{\underline{c}} % c, vec
\newcommand{\uv}{\underline{v}} % v, vec
\newcommand{\uw}{\underline{w}} % w, vec
\newcommand{\ux}{\underline{x}} % x, vec
\newcommand{\uy}{\underline{y}} % y, vec
\newcommand{\uz}{\underline{z}} % z, vec
\newcommand{\um}{\underline{m}} % m, vec
%% Sans-serif letters denote random variables (r.v.).
\newcommand{\rvx}{\mathsf{x}} % x, r.v.
\newcommand{\rvy}{\mathsf{y}} % y, r.v.
\newcommand{\rvz}{\mathsf{z}} % z, r.v.
\newcommand{\rvw}{\mathsf{w}} % w, r.v.
\newcommand{\rvH}{\mathsf{H}} % H, r.v.
%% Underlined sans-serif letters denote random vectors.
\newcommand{\urvx}{\underline{\mathsf{x}}} % x, r.v. vec
\newcommand{\urvy}{\underline{\mathsf{y}}} % y, r.v. vec
\newcommand{\urvz}{\underline{\mathsf{z}}} % z, r.v. vec
\newcommand{\urvw}{\underline{\mathsf{w}}} % w, r.v. vec
%% NOTE(review): \triangleq, \mathbb and \mathds are not defined by the
%% packages this file loads; presumably the document has to load amssymb and
%% dsfont (or equivalents) before using \defas, \reals, \In, \E, \Prob
%% and \1 -- confirm.
\newcommand{\defas}{\triangleq} %\coloneqq
\newcommand{\reals}{\mathbb{R}}
\newcommand{\TT}{\mathrm{T}} % transpose
%% Starred operators take their limits underneath in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argsup}{arg\,sup}
\DeclareMathOperator*{\arginf}{arg\,inf}
\DeclareMathOperator{\diag}{diag}
\DeclareMathOperator{\Var}{Var}
\DeclareMathOperator{\Cov}{Cov}
\DeclareMathOperator{\MSE}{MSE}
\DeclareMathOperator{\1}{\mathds{1}}
\DeclareMathOperator{\In}{\mathbb{I}}
\DeclareMathOperator{\E}{\mathbb{E}}
\DeclareMathOperator{\Prob}{\mathbb{P}}
%% \independent: relation symbol built from two \perp signs set 2mu apart,
%% sized for the current math style through \mathpalette.
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
%%************************************************************************************

100
hw4/report/main.tex Normal file
View File

@@ -0,0 +1,100 @@
% Homework template for Inference and Information
% UPDATE: September 26, 2017 by Xiangxiang
\documentclass[a4paper]{article}
\usepackage{ctex}
\usepackage{amsmath, amssymb, amsthm}
\usepackage{moreenum}
\usepackage{mathtools}
\usepackage{url}
\usepackage{bm}
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{listings}
\usepackage{color}
% Global defaults for all code listings.
\lstset{
basicstyle = \sffamily, % basic code style
keywordstyle = \bfseries, % keyword style
commentstyle = \rmfamily\itshape, % comment style: italic
stringstyle = \ttfamily, % string style
flexiblecolumns, % flexible column format (original note gives no rationale)
numbers = left, % line numbers on the left
showspaces = false, % do not mark spaces visibly; showing them looks cluttered
numberstyle = \zihao{-5}\ttfamily, % line-number style: font size -5, monospaced
showstringspaces = false,
captionpos = t, % caption position: t means top
frame = lrtb, % draw a frame on all four sides
}
% Listing style for Python source code.
\lstdefinestyle{Python}{
language = Python, % language: Python
basicstyle = \zihao{-5}\ttfamily,
numberstyle = \zihao{-5}\ttfamily,
keywordstyle = \color{blue},
keywordstyle = [2] \color{teal},
stringstyle = \color{magenta},
commentstyle = \color{red}\ttfamily,
breaklines = true, % break long lines automatically; still, avoid very long lines
columns = fixed, % fixed column width; without it the character spacing is uneven
basewidth = 0.5em,
}
\usepackage{subcaption}
\usepackage{booktabs} % toprule
\usepackage[mathcal]{eucal}
\usepackage[thehwcnt = 4]{iidef}
\thecourseinstitute{清华大学电子工程系}
\thecoursename{\textbf{媒体与认知}}
\theterm{2023-2024学年春季学期}
\hwname{作业}
\begin{document}
\courseheader
\name{YOUR NAME}
\vspace{3mm}
\centerline{\textbf{\Large{理论部分}}}
\section{单选题(15分)}
\subsection{\underline{?}}
\subsection{\underline{?}}
\subsection{\underline{?}}
\subsection{\underline{?}}
\subsection{\underline{?}}
\section{计算题(15 分)}
% 计算题1
\subsection{隐含马尔可夫模型}
\hspace{2em}暑假中,小E每天进行一项体育活动,包括跑步(R)、游泳(S)和打球(B),所选择的体育活动受某种潜在因素(如心情)的影响。小E每天把进行体育活动的照片发至微信朋友圈,我们可以根据观测信息推测该潜在因素的状态。
\hspace{2em}假设该潜在因素分为$S_1$,$S_2$两种状态。在$S_1$状态下,小E选择三种体育活动的概率分别为0.6,0.2,0.2;在$S_2$状态下,小E选择三种体育活动的概率分别为0.1,0.6,0.3。
\hspace{2em}该潜在因素的变化也有一定规律,若某天处于$S_1$的状态,第二天处于$S_1$,$S_2$的状态的概率分别为0.5,0.5;若某天处于$S_2$的状态,第二天处于$S_1$,$S_2$的状态的概率分别为0.6,0.4。
\hspace{2em}暑假第一天处于$S_1$,$S_2$的状态的概率均为0.5。
\vspace{3mm}
(1) 采用隐含马尔可夫模型(HMM)对小E暑假体育活动安排进行建模,{\color{blue}请写出HMM对应的参数$\lambda=\{\pi, A, B\}$}。
\vspace{3mm}
(2) 假设暑假第1、2、3天,小E所进行的体育活动依次为跑步(R)、打球(B)和游泳(S),{\color{blue}请计算出现该观测序列的概率}。
\vspace{3mm}
(3) 在(2)的条件下,{\color{blue}请利用Viterbi算法推测暑假第1、2、3天最可能的隐含状态序列}。
% 请根据是否选择自选课题的情况,选择“编程作业报告”或“自选课题工作进度汇报”中的一项完成
\section{编程作业报告}
\section{自选课题工作进度汇报}
\end{document}
%%% Local Variables:
%%% mode: latex
%%% TeX-master: t
%%% End: