TA Release homework4.

This commit is contained in:
unlockable
2024-05-22 20:22:47 +08:00
parent c850f38778
commit c6b2420b85
12 changed files with 14707 additions and 0 deletions

104
hw4/code/attnvis.ipynb Normal file
View File

@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"from contextlib import nullcontext\n",
"import torch\n",
"from model import GPTConfig, GPT\n",
"from bertviz import head_view\n",
"from dataset import Converter, LMDataset\n",
"\n",
"# set random seed for reproducibility\n",
"seed = 2024\n",
"torch.manual_seed(seed)\n",
"torch.cuda.manual_seed(seed)\n",
"torch.cuda.manual_seed_all(seed)\n",
"torch.backends.cudnn.deterministic = True\n",
"torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul\n",
"torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn\n",
"\n",
"#################################################\n",
"# \n",
"model_name = 'mygpt'\n",
"ckpt_path = 'workdirs/quansongci'\n",
"data_root = 'data/quansongci'\n",
"vis_text_path = 'data/vis/vis_1.txt'\n",
"#################################################\n",
"\n",
"device = 'cpu'\n",
"\n",
"dataset = LMDataset(data_root, 'train')\n",
"converter = Converter(dataset.stoi, dataset.itos)\n",
"\n",
"\n",
"with open(vis_text_path, 'r', encoding='utf-8') as f:\n",
" start = f.read()\n",
"start_ids = converter.single_encode(start)\n",
"start_texts = [c for c in start]\n",
"x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])\n",
"print(f\"Input texts: {start}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0792738",
"metadata": {},
"outputs": [],
"source": [
"# model\n",
"dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'\n",
"ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]\n",
"ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)\n",
"# init from a model saved in a specific directory\n",
"ckpt_path = os.path.join(ckpt_path, 'best.pth')\n",
"print(\"loading model params from %s\"%ckpt_path)\n",
"checkpoint = torch.load(ckpt_path, map_location=device)\n",
"gptconf = GPTConfig[model_name]\n",
"if 'model_args' in checkpoint:\n",
" gptconf = checkpoint['model_args']\n",
"model = GPT(**gptconf)\n",
"state_dict = checkpoint['state_dict']\n",
"model.load_state_dict(state_dict)\n",
"\n",
"model.eval()\n",
"model.to(device)\n",
"\n",
"# run a single forward pass to collect the attention weights of every layer\n",
"with torch.no_grad():\n",
" with ctx:\n",
" _, attn_weights = model(x)\n",
"\n",
"head_view(attn_weights, start_texts)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,2 @@
+++如梦令
昨夜雨疏风骤。浓睡不消残酒。试问卷帘人,却道海棠依旧。知否。知否。应是绿肥红瘦。

View File

@@ -0,0 +1,3 @@
+++鹧鸪天(秋思)
红蓼烟收尽暮鸦。秋风吹叶荻花秋。断肠天色连云散,黄叶荻花秋水流。
楼上角,笛声悠。兴王莫上叹人头。明朝归去无消息,只有当时一望流。

75
hw4/code/dataset.py Normal file
View File

@@ -0,0 +1,75 @@
import torch
from torch.utils.data import Dataset
import numpy as np
import os
import json
class LMDataset(Dataset):
    """Dataset of text samples read from ``<data_dir>/<split>.json``.

    The JSON file must contain the keys ``data``, ``stoi``, ``itos`` and
    ``vocab_size``; each of them is exposed as an attribute of the same name.
    """

    def __init__(self, data_dir, split):
        super().__init__()
        meta_path = os.path.join(data_dir, '%s.json' % split)
        with open(meta_path, 'r', encoding='utf-8') as fp:
            meta = json.load(fp)
        # samples, followed by the vocabulary tables stored alongside them
        self.data = meta['data']  # list of samples
        self.stoi = meta['stoi']  # character -> integer
        self.itos = meta['itos']  # string of integer -> character
        self.vocab_size = meta['vocab_size']

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at position `index`."""
        return self.data[index]
class Converter:
    '''
    Translate between strings and sequences of integer token ids.
    Id 0 is the padding token '<pad>' and id 1 is the end-of-sequence token '<eos>'.
    '''

    def __init__(self, stoi, itos):
        self.stoi = stoi  # character -> integer id
        self.itos = itos  # str(integer id) -> character

    def single_encode(self, s):
        '''
        map every character of `s` to its id and return the ids as an int64 numpy array
        '''
        ids = [self.stoi[ch] for ch in s]
        return np.array(ids, dtype=np.int64)

    def single_decode(self, l):
        '''
        turn a sequence of ids back into a string, stopping at the first <eos> (id 1)
        '''
        pieces = []
        for token in l:
            if token == 1:
                break
            # the lookup table is keyed by the string form of the id
            pieces.append(self.itos[str(token)])
        return ''.join(pieces)

    def encode(self, data):
        '''
        encode a list of strings into a pair of id tensors (x, y)
        every row is terminated with <eos> (1) and right-padded with <pad> (0);
        y is x shifted one position to the left
        '''
        lens = [len(s) for s in data]
        width = max(lens) + 1  # one extra column for <eos>
        padded = np.zeros((len(data), width), dtype=np.int64)
        for row, text in enumerate(data):
            end = len(text)
            padded[row, :end] = self.single_encode(text)
            padded[row, end] = 1
        inputs = torch.from_numpy(padded[:, :-1])
        targets = torch.from_numpy(padded[:, 1:])
        return inputs, targets

    def decode(self, data):
        '''
        decode a batch of id sequences (a tensor) into a list of strings
        '''
        rows = data.cpu().numpy().astype(np.int64)
        return [self.single_decode(row) for row in rows]

356
hw4/code/model.py Normal file
View File

@@ -0,0 +1,356 @@
# ========================================================
# Media and Cognition
# Homework 4 Sequence Modeling
# model.py - Model definition
# Student ID:
# Name:
# Tsinghua University
# (C) Copyright 2024
# ========================================================
# Import required libraries
############################################################
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
############################################################
# Define the GELU activation function used in OpenAI GPT
############################################################
def gelu(z):
    """
    Tanh approximation of the Gaussian Error Linear Unit (GELU).
    Reference: https://arxiv.org/abs/1606.08415
    gelu(z) = 0.5 * z * (1 + tanh[sqrt(2/π) * (z + 0.044715 * z^3)])
    """
    scale = math.sqrt(2.0 / math.pi)
    inner = scale * (z + 0.044715 * torch.pow(z, 3.0))
    return 0.5 * z * (1.0 + torch.tanh(inner))
############################################################
# Define the Multi-Head SelfAttention module
############################################################
class SelfAttention(nn.Module):
    """
    Multi-head self-attention with a causal mask (homework skeleton).
    The `???` placeholders in `forward` are the parts to be completed for TODO 1.
    """
    def __init__(self, embed_dim, num_head, dropout):
        super().__init__()
        # define three linear layers for q, k, v generation separately
        self.q_layer = nn.Linear(embed_dim, embed_dim)
        self.k_layer = nn.Linear(embed_dim, embed_dim)
        self.v_layer = nn.Linear(embed_dim, embed_dim)
        # define the projection layer for output
        self.proj_layer = nn.Linear(embed_dim, embed_dim)
        # define the dropout layers for the attention weights and for the projected output
        self.attn_drop = nn.Dropout(dropout)
        self.proj_drop = nn.Dropout(dropout)
        self.num_head = num_head
        # integer division: embed_dim is expected to be a multiple of num_head
        self.head_dim = embed_dim // num_head
    def forward(self, x):
        # x is unpacked as (batch_size, seq_len, dim)
        batch_size, seq_len, dim = x.shape
        # >>> TODO 1: complete the forward process of the Multi-Head SelfAttention module.
        # Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
        # the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
        # num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
        q = ???
        k = ???
        v = ???
        # Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
        # first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
        q = ???
        k = ???
        v = ???
        # then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dims of q, k, v for matrix multiplication
        # the shape of q, k, v changes from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
        q = ???
        k = ???
        v = ???
        # Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
        # Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
        # the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
        attn = ???
        # Step 1.3.2: for an Auto-Regressive Language Model, the predictions for position i can depend only on the inputs at positions up to and including i.
        # Therefore, a mask is used to prevent positions from attending to subsequent positions
        # attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is a strictly upper triangular matrix with shape (seq_len, seq_len)
        # Hint:
        # use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
        attn_mask = ???
        # use torch.triu(?, diagonal=1) to obtain an upper triangular matrix where elements on the main diagonal are 0
        attn_mask = ???
        # use Tensor.bool() to convert the matrix to a boolean matrix
        attn_mask = ???
        # fill the positions of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
        attn = ???
        # Step 1.3.3: normalize `attn` via the softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
        attn = ???
        # Step 1.3.4: apply dropout to `attn` via self.attn_drop()
        attn = ???
        # Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
        # the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
        out = ???
        # Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate the outputs of different heads
        # the shape of `out` changes from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
        out = ???
        # Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
        result = ???
        # <<< TODO 1
        # return the final results `result` and attention weights `attn`
        return result, attn
############################################################
# Define the feed forward network (FFN)
############################################################
class FFN(nn.Module):
    """Feed forward network: Linear -> GELU -> Linear -> Dropout."""

    def __init__(self, embed_dim, feedforward_dim, dropout):
        super().__init__()
        # expand to the hidden width, then project back to the embedding width
        self.fc1 = nn.Linear(embed_dim, feedforward_dim)
        self.fc2 = nn.Linear(feedforward_dim, embed_dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        hidden = gelu(self.fc1(x))
        return self.drop(self.fc2(hidden))
############################################################
# Define the TransformerLayer
############################################################
class TransformerLayer(nn.Module):
    """
    One transformer layer: LayerNorm -> SelfAttention, then LayerNorm -> FFN (homework skeleton).
    The `???` placeholders in `forward` are the parts to be completed for TODO 2.
    """
    def __init__(self, embed_dim, num_head, feedforward_dim, dropout, no_res):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = SelfAttention(embed_dim, num_head, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, feedforward_dim, dropout)
        self.no_res = no_res # if True, residual connections are NOT used (see the steps in forward)
    def forward(self, x):
        # >>> TODO 2: complete the forward process of the TransformerLayer module.
        # Step 2.1: calculate the output of multi-head self-attention
        # normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
        x_norm = ???
        # calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
        x_attn, attn = ???
        # add the input `x` to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
        if ???:
            x_attn = ???
        # Step 2.2: calculate the output of feed forward network
        # calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
        x_ffn = ???
        # add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
        if ???:
            out = ???
        else:
            out = ???
        # <<< TODO 2
        # return the layer output together with the attention weights produced by self.attn
        return out, attn
############################################################
# Define the GPT module
############################################################
class GPT(nn.Module):
def __init__(self, vocab_size, max_seq_len, num_layer, embed_dim, num_head, feedforward_dim, dropout, no_res=False, no_pos=False):
'''
vocab_size: the size of vocabulary
max_seq_len: the maximum length of input texts
num_layer: the number of transformer layers
embed_dim: the embedding dimension
num_head: the number of heads in Multi-Head Self Attention
feedforward_dim: the dimension in the feed forward network
dropout: dropout ratio
no_res: whether to use residual connection in transformer layers
no_pos: whether to use position embeddings
'''
super().__init__()
self.num_layer = num_layer
self.max_seq_len = max_seq_len
self.no_pos = no_pos
# Define Embedding Layer to transfer input text tokens and positions to embeddings
self.word_token_embedding = nn.Embedding(vocab_size, embed_dim)
self.word_pos_embedding = nn.Embedding(max_seq_len, embed_dim)
self.drop = nn.Dropout(dropout)
# Define the transformer layers
self.transformer = nn.ModuleList([TransformerLayer(embed_dim, num_head, feedforward_dim, dropout, no_res) for _ in range(num_layer)])
# Define the head layer to predict output
self.norm = nn.LayerNorm(embed_dim)
self.language_model_head = nn.Linear(embed_dim, vocab_size, bias=False)
"""
Weight tying improves the performance of language models by tying (sharing) the weights of the embedding and softmax layers.
Reference: https://paperswithcode.com/method/weight-tying
"""
self.word_token_embedding.weight = self.language_model_head.weight
self.init_weights()
def init_weights(self):
for m in self.modules():
if isinstance(m, nn.Linear):
torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
if m.bias is not None:
torch.nn.init.zeros_(m.bias)
elif isinstance(m, nn.Embedding):
torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
# apply special scaled init to the residual projections, per GPT-2 paper
for pn, p in self.named_parameters():
if pn.endswith('proj_layer.weight'):
torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * self.num_layer))
def forward(self, word_idx, targets=None):
batch_size, seq_len = word_idx.shape
# >>> TODO 3: complete the forward process of GPT
# Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1]
pos = ???
# Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to transfer `word_idx` and `pos` to embeddings ('token_embed` and `pos_embed`)
token_embed = ???
pos_embed = ???
# Step 3.3: initialize the input embeddings `x` of transformer layers
# add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
if ???:
x = ???
else:
x = ???
# apply dropout to the input embeddings via `self.drop()`
x = ???
# Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
# define a list `attention_weights` and append the attention weights of each transformer layer into the list
attention_weights = ???
for ???:
# Step 4.1: obtain the output and attention weights of transformer layers
x, attn = ???
# Step 4.2: append the attention weights of transformer layers into the list `attention_weights`
???
# Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
# self.language_model_head() is a linear layer defined in __init__() function
# Note: do not add softmax here since it is included in the cross entropy loss function
x = ???
logits = ???
# <<< TODO 3
# return logits and loss or attention weights
if targets is not None:
loss = torch.nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=0)
return logits, loss
assert isinstance(attention_weights, list), "attention_weights must be a list, please check whether to append the attention weights of all transformer layers into it!"
return logits, attention_weights
def configure_optimizers(self, weight_decay):
"""
This long function is unfortunately doing something very simple and is being very defensive:
We are separating out all parameters of the model into two buckets: those that will experience
weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
We are then returning the PyTorch optimizer object.
"""
# separate out all parameters to those that will and won't experience regularizing weight decay
decay = set()
no_decay = set()
whitelist_weight_modules = (nn.Linear, )
blacklist_weight_modules = (nn.LayerNorm, torch.nn.Embedding)
for mn, m in self.named_modules():
for pn, p in m.named_parameters():
fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
# random note: because named_modules and named_parameters are recursive
# we will see the same tensors p many many times. but doing it this way
# allows us to know which parent module any tensor p belongs to...
if pn.endswith('bias'):
# all biases will not be decayed
no_decay.add(fpn)
elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
# weights of whitelist modules will be weight decayed
decay.add(fpn)
elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
# weights of blacklist modules will NOT be weight decayed
no_decay.add(fpn)
# subtle: 'transformer.wte.weight' and 'lm_head.weight' are tied, so they
# will appear in the no_decay and decay sets respectively after the above.
# In addition, because named_parameters() doesn't return duplicates, it
# will only return the first occurence, key'd by 'transformer.wte.weight', below.
# so let's manually remove 'lm_head.weight' from decay set. This will include
# this tensor into optimization via transformer.wte.weight only, and not decayed.
decay.remove('language_model_head.weight')
# validate that we considered every parameter
param_dict = {pn: p for pn, p in self.named_parameters()}
inter_params = decay & no_decay
union_params = decay | no_decay
assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
% (str(param_dict.keys() - union_params), )
# create the pytorch optimizer object
optim_groups = [
{"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": weight_decay},
{"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
]
return optim_groups
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
"""
Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
the sequence max_new_tokens times, feeding the predictions back into the model each time.
Most likely you'll want to make sure to be in model.eval() mode of operation for this.
"""
for _ in range(max_new_tokens):
# if the sequence context is growing too long we must crop it at block_size
idx_cond = idx
# forward the model to get the logits for the index in the sequence
logits, _ = self(idx_cond)
# pluck the logits at the final step and scale by desired temperature
logits = logits[:, -1, :] / temperature
# optionally crop the logits to only the top k options
if top_k is not None:
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
logits[logits < v[:, [-1]]] = -float('Inf')
# apply softmax to convert logits to (normalized) probabilities
probs = F.softmax(logits, dim=-1)
# sample from the distribution
idx_next = torch.multinomial(probs, num_samples=1)
# append sampled index to the running sequence and continue
idx = torch.cat((idx, idx_next), dim=1)
return idx.squeeze().cpu().numpy()
############################################################
# Named model presets, looked up by model name.
# Each entry holds keyword arguments for the GPT constructor; the remaining
# constructor arguments (e.g. vocab_size, max_seq_len) must be supplied by the caller.
GPTConfig = {
    'mygpt': dict(num_layer=4, embed_dim=128, num_head=4, feedforward_dim=128*4, dropout=0.0),
    'gpt2-mini': dict(num_layer=6, embed_dim=384, num_head=6, feedforward_dim=384*4, dropout=0.2),
    'gpt2': dict(num_layer=12, embed_dim=768, num_head=12, feedforward_dim=768*4, dropout=0.2),
}

61
hw4/code/prepare.py Normal file
View File

@@ -0,0 +1,61 @@
"""
Prepare the dataset for character-level language modeling.
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
"""
import os
import numpy as np
import argparse
import json
parser = argparse.ArgumentParser()
parser.add_argument('--data_root', type=str, default='data/quansongci', help='data directory')
args = parser.parse_args()

# read the raw samples
input_file_path = os.path.join(args.data_root, 'data.json')
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)['data']
print(f"length of dataset: {len(data):,}")

# collect every distinct character of the corpus, in sorted order
chars = sorted(set(''.join(data)))
vocab_size = len(chars) + 2  # two extra ids for <pad> and <eos>
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# build both lookup tables: characters take ids 2.., the special tokens take ids 0 and 1
stoi = {}
itos = {}
for idx, ch in enumerate(chars, start=2):
    stoi[ch] = idx
    itos[idx] = ch
for idx, special in enumerate(('<pad>', '<eos>')):
    stoi[special] = idx
    itos[idx] = special

# first 90% of the samples for training, the rest for validation
n = len(data)
split_point = int(n*0.9)
train_data = data[:split_point]
val_data = data[split_point:]
print(f"train has {len(train_data):,} samples")
print(f"val has {len(val_data):,} samples")

# every split file carries the vocabulary as well, to help us encode/decode later
for file_name, split_data in (('train.json', train_data), ('val.json', val_data)):
    meta = {
        'data': split_data,
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }
    with open(os.path.join(args.data_root, file_name), 'w', encoding='utf-8') as f:
        json.dump(meta, f, ensure_ascii=False, indent=4)

76
hw4/code/sample.py Normal file
View File

@@ -0,0 +1,76 @@
"""
Sample from a trained model
"""
import os
import pickle
from contextlib import nullcontext
import torch
from model import GPTConfig, GPT
import argparse
from dataset import Converter, LMDataset
def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root, device, temperature=0.8, top_k=200):
    """
    Load `<ckpt_path>/best.pth` and print `num_samples` generated texts that start with `start`.

    start: prompt string; every character must be present in the training vocabulary
    num_samples: number of texts to generate
    max_new_tokens: number of tokens generated after the prompt
    model_name: key into GPTConfig, used only if the checkpoint has no 'model_args'
    ckpt_path: directory that contains 'best.pth'
    data_root: directory of the dataset (the vocabulary is read from its train split)
    device: device string such as 'cpu', 'cuda' or 'cuda:0'
    temperature: 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k: retain only the top_k most likely tokens, clamp others to have 0 probability
    """
    dataset = LMDataset(data_root, 'train')
    converter = Converter(dataset.stoi, dataset.itos)

    # model
    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    # torch.autocast expects the bare device type ('cuda'), not an indexed device such as 'cuda:0'
    device_type = torch.device(device).type
    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)

    # init from a model saved in a specific directory
    ckpt_file = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_file)
    checkpoint = torch.load(ckpt_file, map_location=device)
    gptconf = GPTConfig[model_name]
    # prefer the exact arguments the checkpointed model was built with
    if 'model_args' in checkpoint:
        gptconf = checkpoint['model_args']
    model = GPT(**gptconf)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    model.to(device)

    # encode the beginning of the prompt and add a batch dimension
    start_ids = converter.single_encode(start)
    x = torch.from_numpy(start_ids)[None, ...].to(device).long()

    # run generation
    with torch.no_grad():
        with ctx:
            for _ in range(num_samples):
                y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
                print(converter.single_decode(y))
                print('---------------')
if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

    # set configurations of the model and sampling process
    parser = argparse.ArgumentParser()
    parser.add_argument('--start', type=str, default='+++', help='start of the sample, e.g. "+++" or "+++清平乐"')
    # integer default to match type=int (it used to be the string '10')
    parser.add_argument('--num_samples', type=int, default=10, help='the number of samples')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to load checkpoints')
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--device', type=str, help='cpu or cuda')
    opt = parser.parse_args()

    # fall back to cuda when available if no device was requested
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # prompt plus generated tokens add up to 128 tokens in total
    sample(opt.start, opt.num_samples, 128-len(opt.start), opt.model_name, opt.ckpt_path, opt.data_root, opt.device)

219
hw4/code/train.py Normal file
View File

@@ -0,0 +1,219 @@
import os
import time
import math
import pickle
from contextlib import nullcontext
import argparse
import numpy as np
import torch
from torch.utils.data import DataLoader
from model import GPT, GPTConfig
from dataset import LMDataset, Converter
import matplotlib.pyplot as plt
# learning rate decay scheduler (cosine with warmup)
def get_lr(it, learning_rate, min_lr=1e-4, warmup_iters=100, lr_decay_iters=6000):
    """
    Return the learning rate for iteration `it`.

    it: current iteration number
    learning_rate: peak learning rate reached at the end of the warmup
    min_lr: learning rate returned once `lr_decay_iters` is reached
    warmup_iters: number of linear warmup iterations
    lr_decay_iters: iteration at which the cosine decay reaches `min_lr`
    """
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) from lr_decay_iters on, return min learning rate
    #    (`>=` gives the same value as the cosine branch at it == lr_decay_iters and avoids
    #    a division by zero when warmup_iters == lr_decay_iters)
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)
def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, device='cpu', no_res=False, no_pos=False):
    """
    Train a GPT model from scratch and write checkpoints and loss curves to `ckpt_path`.

    data_root: directory that holds the train/val splits
    model_name: key into GPTConfig
    batch_size: training batch size
    n_iters: total number of training iterations
    ckpt_path: directory for 'latest.pth', 'best.pth' and the loss plot
    val_interval: run validation and save checkpoints every `val_interval` iterations
    device: device string such as 'cpu', 'cuda' or 'cuda:0'
    no_res: if True, build the model without residual connections
    no_pos: if True, build the model without position embeddings
    """
    train_dataset = LMDataset(data_root, 'train')
    val_dataset = LMDataset(data_root, 'val')
    train_loader = DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    converter = Converter(train_dataset.stoi, train_dataset.itos)

    # adamw optimizer
    learning_rate = 5e-3 # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.99
    grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0

    # system
    # torch.autocast expects the bare device type ('cuda'), not an indexed device such as 'cuda:0'
    device_type = torch.device(device).type
    dtype = 'bfloat16' if device_type == 'cpu' else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)

    best_val_loss = 1e9
    iter_num = 0 # number of iterations in the lifetime of this process

    # model init; copy the preset so the shared GPTConfig entry is not modified in place
    model_args = dict(GPTConfig[model_name])
    model_args['vocab_size'] = train_dataset.vocab_size
    model_args['max_seq_len'] = 128
    model_args['no_res'] = no_res
    model_args['no_pos'] = no_pos

    # init a new model from scratch
    print("Initializing a new model from scratch")
    model = GPT(**model_args)
    model.to(device)

    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

    # optimizer
    optim_groups = model.configure_optimizers(weight_decay)
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))

    print('training...')
    # training loop: enough epochs to cover n_iters iterations
    epoch_num = np.ceil(n_iters * int(batch_size) / float(len(train_dataset))).astype(np.int32)
    t0 = time.time()
    model.train()
    train_losses = []
    val_losses = []
    for _ in range(epoch_num):
        for inputs in train_loader:
            if iter_num >= n_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)

            # set the learning rate of this iteration
            lr = get_lr(iter_num, learning_rate, lr_decay_iters=n_iters)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # forward pass, using autocast when not running on cpu
            with ctx:
                _, loss = model(X, Y)
            # backward pass, with gradient scaling if training in fp16
            scaler.scale(loss).backward()
            # clip the gradient
            if grad_clip != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            # step the optimizer and scaler if training in fp16
            scaler.step(optimizer)
            scaler.update()
            # flush the gradients as soon as we can, no need for this memory anymore
            optimizer.zero_grad(set_to_none=True)

            iter_num += 1
            train_losses.append(loss.item())

            # evaluate the loss on the val set and write checkpoints
            if iter_num % val_interval == 0:
                # timing and logging
                t1 = time.time()
                dt = t1 - t0
                t0 = t1
                lossf = loss.item()
                print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
                losses = estimate_loss(model, val_loader, converter, ctx, device)
                val_losses.append(losses['val'])
                print(f"iter {iter_num}: val loss {losses['val']:.4f}")

                # update the best loss first so that the saved checkpoints record the current value
                is_best = losses['val'] < best_val_loss
                if is_best:
                    best_val_loss = losses['val']

                print(f"saving latest checkpoint to {ckpt_path}")
                checkpoint = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                }
                torch.save(checkpoint, os.path.join(ckpt_path, 'latest.pth'))
                if is_best:
                    print(f"saving best checkpoint to {ckpt_path}")
                    torch.save(checkpoint, os.path.join(ckpt_path, 'best.pth'))

    plot(n_iters, train_losses, val_losses, val_interval, ckpt_path)
def plot(n_iters, train_losses, val_losses, val_interval, ckpt_path):
    """
    Plot the loss and perplexity curves, save them to `<ckpt_path>/loss&perplexity.jpg` and show them.

    n_iters: total number of training iterations (kept for interface compatibility; the x axes
             are derived from the lengths of the loss lists)
    train_losses: one training loss per iteration
    val_losses: one validation loss per validation run
    val_interval: number of iterations between two validation runs
    ckpt_path: directory where the figure is saved
    """
    # create a plot
    f, ax = plt.subplots(1, 2, figsize=(18, 6))
    # x positions: training losses belong to iterations 1, 2, ...; validation losses to
    # iterations val_interval, 2*val_interval, ... Deriving both from the list lengths keeps
    # x and y the same size even when n_iters is not a multiple of val_interval.
    train_iters = np.arange(1, len(train_losses) + 1)
    val_iters = np.arange(1, len(val_losses) + 1) * val_interval

    # draw loss
    ax[0].plot(train_iters, train_losses)
    ax[0].plot(val_iters, val_losses, 'r')
    # set labels
    ax[0].set_xlabel('training iters')
    ax[0].legend(['training loss', 'validation loss'])

    # perplexity is the exponential of the loss
    train_perplexity = [np.exp(x) for x in train_losses]
    val_perplexity = [np.exp(x) for x in val_losses]
    # draw perplexity
    ax[1].plot(train_iters, train_perplexity)
    ax[1].plot(val_iters, val_perplexity, 'r')
    # set labels
    ax[1].set_xlabel('training iters')
    ax[1].legend(['training perplexity', 'validation perplexity'])

    plt.tight_layout()
    # save and show the image
    plt.savefig(os.path.join(ckpt_path, 'loss&perplexity.jpg'), dpi=300)
    plt.show()
# estimates the validation loss as the mean loss over at most `max_iters` batches
@torch.no_grad()
def estimate_loss(model, val_loader, converter, ctx, device, max_iters=100):
    """
    Return {'val': mean loss} computed over the first `max_iters` batches of `val_loader`.

    model: called as model(X, Y), must return (logits, loss); switched to eval mode for the
           estimate and back to train mode afterwards
    val_loader: iterable of batches of strings
    converter: object whose encode(batch) returns the (X, Y) tensors
    ctx: context manager wrapped around the forward pass (e.g. autocast or nullcontext)
    device: device the tensors are moved to
    max_iters: maximum number of batches to evaluate
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    for inputs in val_loader:
        if num_batches >= max_iters:
            break
        X, Y = converter.encode(inputs)
        X, Y = X.to(device), Y.to(device)
        with ctx:
            _, loss = model(X, Y)
        total_loss += loss.item()
        num_batches += 1
    model.train()
    # average over the batches actually evaluated; the loader may hold fewer than max_iters
    # (max(..., 1) keeps the result at 0.0 for an empty loader instead of dividing by zero)
    return {'val': total_loss / max(num_batches, 1)}
if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

    # set configurations of the model and training process
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model configuration in GPTConfig')
    parser.add_argument('--iters', type=int, default=1000, help='number of training iterations')
    parser.add_argument('--batchsize', type=int, default=16, help='training batch size')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to save checkpoints')
    parser.add_argument('--val_interval', type=int, default=20, help='iter intervals of validation')
    # both switches DISABLE a component when given
    parser.add_argument('--no_res', action='store_true', help='disable residual connections')
    parser.add_argument('--no_pos', action='store_true', help='disable position embeddings')
    parser.add_argument('--device', type=str, help='cpu or cuda')
    opt = parser.parse_args()

    # fall back to cuda when available if no device was requested
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    os.makedirs(opt.ckpt_path, exist_ok=True)
    train(opt.data_root, opt.model_name, opt.batchsize, opt.iters, opt.ckpt_path, opt.val_interval, opt.device, opt.no_res, opt.no_pos)