TA Release homework4.
This commit is contained in:
104
hw4/code/attnvis.ipynb
Normal file
104
hw4/code/attnvis.ipynb
Normal file
@@ -0,0 +1,104 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"from contextlib import nullcontext\n",
|
||||
"import torch\n",
|
||||
"from model import GPTConfig, GPT\n",
|
||||
"from bertviz import head_view\n",
|
||||
"from dataset import Converter, LMDataset\n",
|
||||
"\n",
|
||||
"# set random seed for reproducibility\n",
|
||||
"seed = 2024\n",
|
||||
"torch.manual_seed(seed)\n",
|
||||
"torch.cuda.manual_seed(seed)\n",
|
||||
"torch.cuda.manual_seed_all(seed)\n",
|
||||
"torch.backends.cudnn.deterministic = True\n",
|
||||
"torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul\n",
|
||||
"torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn\n",
|
||||
"\n",
|
||||
"#################################################\n",
|
||||
"# \n",
|
||||
"model_name = 'mygpt'\n",
|
||||
"ckpt_path = 'workdirs/quansongci'\n",
|
||||
"data_root = 'data/quansongci'\n",
|
||||
"vis_text_path = 'data/vis/vis_1.txt'\n",
|
||||
"#################################################\n",
|
||||
"\n",
|
||||
"device = 'cpu'\n",
|
||||
"\n",
|
||||
"dataset = LMDataset(data_root, 'train')\n",
|
||||
"converter = Converter(dataset.stoi, dataset.itos)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"with open(vis_text_path, 'r', encoding='utf-8') as f:\n",
|
||||
" start = f.read()\n",
|
||||
"start_ids = converter.single_encode(start)\n",
|
||||
"start_texts = [c for c in start]\n",
|
||||
"x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])\n",
|
||||
"print(f\"Input texts: {start}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c0792738",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# model\n",
|
||||
"dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'\n",
|
||||
"ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]\n",
|
||||
"ctx = nullcontext() if device == 'cpu' else torch.autocast(device_type=device, dtype=ptdtype)\n",
|
||||
"# init from a model saved in a specific directory\n",
|
||||
"ckpt_path = os.path.join(ckpt_path, 'best.pth')\n",
|
||||
"print(\"loading model params from %s\"%ckpt_path)\n",
|
||||
"checkpoint = torch.load(ckpt_path, map_location=device)\n",
|
||||
"gptconf = GPTConfig[model_name]\n",
|
||||
"if 'model_args' in checkpoint:\n",
|
||||
" gptconf = checkpoint['model_args']\n",
|
||||
"model = GPT(**gptconf)\n",
|
||||
"state_dict = checkpoint['state_dict']\n",
|
||||
"model.load_state_dict(state_dict)\n",
|
||||
"\n",
|
||||
"model.eval()\n",
|
||||
"model.to(device)\n",
|
||||
"\n",
|
||||
"# run generation\n",
|
||||
"with torch.no_grad():\n",
|
||||
" with ctx:\n",
|
||||
" _, attn_weights = model(x)\n",
|
||||
"\n",
|
||||
"head_view(attn_weights, start_texts)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
13426
hw4/code/data/quansongci/data.json
Normal file
13426
hw4/code/data/quansongci/data.json
Normal file
File diff suppressed because it is too large
Load Diff
2
hw4/code/data/vis/vis_1.txt
Normal file
2
hw4/code/data/vis/vis_1.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
+++如梦令
|
||||
昨夜雨疏风骤。浓睡不消残酒。试问卷帘人,却道海棠依旧。知否。知否。应是绿肥红瘦。
|
||||
3
hw4/code/data/vis/vis_2.txt
Normal file
3
hw4/code/data/vis/vis_2.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
+++鹧鸪天(秋思)
|
||||
红蓼烟收尽暮鸦。秋风吹叶荻花秋。断肠天色连云散,黄叶荻花秋水流。
|
||||
楼上角,笛声悠。兴王莫上叹人头。明朝归去无消息,只有当时一望流。
|
||||
75
hw4/code/dataset.py
Normal file
75
hw4/code/dataset.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import torch
|
||||
from torch.utils.data import Dataset
|
||||
import numpy as np
|
||||
import os
|
||||
import json
|
||||
|
||||
class LMDataset(Dataset):
    '''
    Dataset of text samples read from `<data_dir>/<split>.json`.

    The JSON file must provide the keys 'data', 'stoi', 'itos' and 'vocab_size'.
    Indexing the dataset returns the raw entry of `data` (no encoding is done here).
    '''
    def __init__(self, data_dir, split):
        super().__init__()
        # read the whole split file into memory
        json_path = os.path.join(data_dir, '%s.json'%split)
        with open(json_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # list of samples
        self.data = payload['data']
        # dict: character -> integer id
        self.stoi = payload['stoi']
        # dict: string of integer id -> character
        self.itos = payload['itos']
        # number of entries in the vocabulary
        self.vocab_size = payload['vocab_size']

    def __len__(self):
        # number of samples in this split
        sample_count = len(self.data)
        return sample_count

    def __getitem__(self, index):
        # return the sample stored at position `index`
        sample = self.data[index]
        return sample
|
||||
|
||||
class Converter:
    '''
    This class helps us convert strings to integers and back
    We use "0" to denote the padding character '<pad>', and "1" to denote the end of the sequence '<eos>'
    '''
    def __init__(self, stoi, itos):
        self.stoi = stoi # a dict that maps character to integer
        self.itos = itos # a dict that maps string of integer to character

    def single_encode(self, s):
        '''
        encode one string into a 1-D int64 numpy array, one id per character
        raises KeyError if a character is not in `self.stoi`
        '''
        return np.array([self.stoi[ch] for ch in s], dtype=np.int64)

    def single_decode(self, l):
        '''
        decode one sequence of integer ids into a string
        decoding stops at the first id equal to 1 ('<eos>'), which is not emitted
        '''
        chars = []
        for token in l:
            # if we meet the end of the sequence (the value of integer is equal to 1), break
            if token == 1:
                break
            # `itos` is keyed by the string form of the id
            chars.append(self.itos[str(token)])
        # join once instead of growing a string with repeated `+=`
        return ''.join(chars)

    def encode(self, data):
        '''
        encode a list of strings into integers

        every string is followed by '<eos>' (1) and right-padded with '<pad>' (0)
        up to the longest string in the batch.
        returns (x, y): x is the padded batch without its last column and y is the
        same batch shifted left by one position (the next-token targets).
        raises ValueError (from max()) if `data` is empty.
        '''
        lens = [len(s) for s in data]
        max_len = max(lens)
        # one extra column so that the longest sample also gets its '<eos>'
        out = np.zeros((len(data), max_len+1), dtype=np.int64)
        for row, s in enumerate(data):
            out[row, :len(s)] = self.single_encode(s)
            out[row, len(s)] = 1
        x = torch.from_numpy(out[:, :-1])
        y = torch.from_numpy(out[:, 1:])
        return x, y

    def decode(self, data):
        '''
        decode a batch (tensor) of integer ids into a list of strings, one per row
        '''
        rows = data.cpu().numpy().astype(np.int64)
        return [self.single_decode(row) for row in rows]
|
||||
356
hw4/code/model.py
Normal file
356
hw4/code/model.py
Normal file
@@ -0,0 +1,356 @@
|
||||
# ========================================================
|
||||
# Media and Cognition
|
||||
# Homework 4 Sequence Modeling
|
||||
# model.py - Model definition
|
||||
# Student ID:
|
||||
# Name:
|
||||
# Tsinghua University
|
||||
# (C) Copyright 2024
|
||||
# ========================================================
|
||||
|
||||
|
||||
# Import required libraries
|
||||
############################################################
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import functional as F
|
||||
import numpy as np
|
||||
|
||||
############################################################
|
||||
|
||||
# Define the GELU activation function used in OpenAI GPT
|
||||
############################################################
|
||||
def gelu(z):
    """
    Tanh approximation of the Gaussian Error Linear Unit, applied element-wise.
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    gelu(z) = 0.5 * z * (1 + tanh[(2/π)^(1/2) * (z + 0.044715 z^3)])
    """
    # constant factor (2/π)^(1/2) in front of the cubic polynomial
    scale = math.sqrt(2.0 / math.pi)
    # inner polynomial z + 0.044715 z^3
    cubic = z + 0.044715 * torch.pow(z, 3.0)
    gate = 1.0 + torch.tanh(scale * cubic)
    return 0.5 * z * gate
|
||||
|
||||
############################################################
|
||||
|
||||
# Define the Multi-Head SelfAttention module
|
||||
############################################################
|
||||
class SelfAttention(nn.Module):
    """
    Multi-head self-attention with a causal (look-ahead) mask.

    This is a homework template: every `???` in `forward` is a placeholder that
    has to be replaced (TODO 1) before the module can be imported or run.
    """

    def __init__(self, embed_dim, num_head, dropout):
        """
        embed_dim: input/output feature dimension of the module
        num_head: the number of attention heads
        dropout: dropout ratio used for both the attention weights and the output
        """
        super().__init__()

        # define three linear layers for q, k, v generation separately
        self.q_layer = nn.Linear(embed_dim, embed_dim)
        self.k_layer = nn.Linear(embed_dim, embed_dim)
        self.v_layer = nn.Linear(embed_dim, embed_dim)

        # define the projection layer for output
        self.proj_layer = nn.Linear(embed_dim, embed_dim)

        # define the dropout layer for attention and output calculation
        self.attn_drop = nn.Dropout(dropout)
        self.proj_drop = nn.Dropout(dropout)

        self.num_head = num_head
        # integer division; presumably embed_dim is a multiple of num_head - TODO confirm
        self.head_dim = embed_dim // num_head

    def forward(self, x):
        """
        x: input tensor with three dimensions, unpacked below as (batch_size, seq_len, dim)
        returns: (result, attn) - the projected attention output and the attention weights
        """

        batch_size, seq_len, dim = x.shape

        # >>> TODO 1: complete the forward process of the Multi-Head SelfAttention module.
        # Step 1.1: obtain q, k, v via self.q_layer(), self.k_layer(), self.v_layer() respectively.
        # the shape of q, k, v: (batch_size, seq_len, num_heads * head_dim)
        # num_heads denotes the number of heads in multi-head attention, head_dim denotes the dimension of each head
        q = ???
        k = ???
        v = ???

        # Step 1.2: in order to calculate multi-head attention in parallel, reshape q, k, v first.
        # first use `Tensor.reshape()` to reshape q, k, v to: (batch_size, seq_len, num_heads, head_dim)
        q = ???
        k = ???
        v = ???

        # then use `Tensor.transpose()` or `Tensor.permute()` to exchange the dim of q, k, v for matrix multiplication
        # the shape of q, k, v from (batch_size, seq_len, num_heads, head_dim) to (batch_size, num_heads, seq_len, head_dim)
        q = ???
        k = ???
        v = ???

        # Step 1.3: calculate multi-head attention in parallel: Attention(q, k, v) = softmax(qk^T / sqrt(head_dim)) v
        # Step 1.3.1: do matrix multiplication via `torch.matmul()`: attn = qk^T / sqrt(head_dim)
        # the shape of `attn`: (batch_size, num_heads, seq_len, seq_len)
        attn = ???

        # Step 1.3.2: for Auto-Regressive Language Model, the predictions for position i can depend only on the input at positions up to and including i.
        # Therefore, a mask is used to prevent positions from attending to subsequent positions
        # attn_mask = (0,1,...,1; 0,0,1,...,1; ...; 0,...,0) is a strictly upper triangular matrix with shape (seq_len, seq_len)
        # Hint:
        # use torch.ones(?, device=x.device) to generate a matrix filled with value 1 with shape (seq_len, seq_len)
        attn_mask = ???
        # use torch.triu(?, diagonal=1) to obtain an upper triangular matrix where elements on the main diagonal are 0
        attn_mask = ???
        # use Tensor.bool() to convert the matrix to a boolean matrix
        attn_mask = ???
        # fill the position of `attn` where `attn_mask==True` with value `float('-inf')` via `Tensor.masked_fill()`
        attn = ???

        # Step 1.3.3: normalize `attn` via softmax function: attn = Softmax(attn) = Softmax(qk^T / sqrt(head_dim))
        attn = ???
        # Step 1.3.4: apply dropout to `attn` via self.attn_drop()
        attn = ???
        # Step 1.3.5: multiply v by `attn` via torch.matmul(): out = Attention(q, k, v) = attn v
        # the shape of `out`: (batch_size, num_heads, seq_len, head_dim)
        out = ???

        # Step 1.4: use `Tensor.transpose()` and `Tensor.reshape()` to concatenate output of different heads
        # the shape of `out` from (batch_size, num_heads, seq_len, head_dim) to (batch_size, seq_len, num_heads*head_dim)
        out = ???

        # Step 1.5: obtain the final results via self.proj_layer() and self.proj_drop(): result = Dropout(MultiHead(Q, K, V)) = Dropout(out W^O)
        result = ???
        # <<< TODO 1

        # return the final results `result` and attention weights `attn`
        return result, attn
|
||||
|
||||
############################################################
|
||||
|
||||
# Define the feed forward network (FFN)
|
||||
############################################################
|
||||
class FFN(nn.Module):
    """
    Feed forward network: Linear -> GELU -> Linear -> Dropout.
    """
    def __init__(self, embed_dim, feedforward_dim, dropout):
        """
        embed_dim: input and output feature dimension
        feedforward_dim: hidden feature dimension between the two linear layers
        dropout: dropout ratio applied to the output
        """
        super().__init__()
        # expand to the hidden dimension, then project back
        self.fc1 = nn.Linear(embed_dim, feedforward_dim)
        self.fc2 = nn.Linear(feedforward_dim, embed_dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # hidden activation after the first linear layer and GELU
        hidden = gelu(self.fc1(x))
        # project back and apply dropout
        return self.drop(self.fc2(hidden))
|
||||
############################################################
|
||||
|
||||
# Define the TransformerLayer
|
||||
############################################################
|
||||
class TransformerLayer(nn.Module):
    """
    One transformer layer: LayerNorm + SelfAttention followed by LayerNorm + FFN,
    each optionally wrapped in a residual connection.

    This is a homework template: every `???` in `forward` is a placeholder that
    has to be replaced (TODO 2) before the module can be imported or run.
    """
    def __init__(self, embed_dim, num_head, feedforward_dim, dropout, no_res):
        """
        embed_dim: the embedding dimension
        num_head: the number of heads passed to SelfAttention
        feedforward_dim: the hidden dimension passed to FFN
        dropout: dropout ratio passed to SelfAttention and FFN
        no_res: stored flag; per the TODO comments below, residual connections are added only when it is False
        """
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = SelfAttention(embed_dim, num_head, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, feedforward_dim, dropout)
        self.no_res = no_res # if True, the residual connections are meant to be skipped

    def forward(self, x):
        """
        x: input of the layer
        returns: (out, attn) - the layer output and the attention weights from self.attn
        """
        # >>> TODO 2: complete the forward process of the TransformerLayer module.
        # Step 2.1: calculate the output of multi-head self-attention
        # normalize the input via `self.norm1()`: x_norm = LayerNorm(x)
        x_norm = ???

        # calculate the output of multi-head self-attention via `self.attn()`: x_attn, attn = SelfAttention(x_norm)
        x_attn, attn = ???

        # add the input `x` to the output of attention (x_attn) if self.no_res is False: x_attn = x + x_attn if no_res is False else x_attn
        if ???:
            x_attn = ???

        # Step 2.2: calculate the output of feed forward network
        # calculate the output of feed forward network via `self.ffn()` and `self.norm2()`: x_ffn = FFN(LayerNorm(x_attn))
        x_ffn = ???

        # add the output of attention (x_attn) to the output of feed forward network (x_ffn) if self.no_res is False: out = x_attn + x_ffn if no_res is False else x_ffn
        if ???:
            out = ???
        else:
            out = ???
        # <<< TODO 2

        return out, attn
|
||||
############################################################
|
||||
|
||||
# Define the GPT module
|
||||
############################################################
|
||||
class GPT(nn.Module):
    def __init__(self, vocab_size, max_seq_len, num_layer, embed_dim, num_head, feedforward_dim, dropout, no_res=False, no_pos=False):
        '''
        vocab_size: the size of vocabulary
        max_seq_len: the maximum length of input texts (size of the position embedding table)
        num_layer: the number of transformer layers
        embed_dim: the embedding dimension
        num_head: the number of heads in Multi-Head Self Attention
        feedforward_dim: the dimension in the feed forward network
        dropout: dropout ratio
        no_res: if True, transformer layers are meant to skip their residual connections (default False keeps them)
        no_pos: if True, position embeddings are meant to be left out of the input (default False adds them)
        '''
        super().__init__()
        self.num_layer = num_layer
        self.max_seq_len = max_seq_len
        self.no_pos = no_pos

        # Define Embedding Layer to transfer input text tokens and positions to embeddings
        self.word_token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.word_pos_embedding = nn.Embedding(max_seq_len, embed_dim)

        self.drop = nn.Dropout(dropout)
        # Define the transformer layers
        self.transformer = nn.ModuleList([TransformerLayer(embed_dim, num_head, feedforward_dim, dropout, no_res) for _ in range(num_layer)])

        # Define the head layer to predict output
        self.norm = nn.LayerNorm(embed_dim)
        self.language_model_head = nn.Linear(embed_dim, vocab_size, bias=False)

        """
        Weight tying improves the performance of language models by tying (sharing) the weights of the embedding and softmax layers.
        Reference: https://paperswithcode.com/method/weight-tying
        """
        # after this assignment both modules hold the very same Parameter object
        self.word_token_embedding.weight = self.language_model_head.weight

        # (re-)initialize all Linear/Embedding weights, see init_weights()
        self.init_weights()
|
||||
|
||||
def init_weights(self):
    """
    Initialize the weights of every Linear and Embedding sub-module from N(0, 0.02^2)
    and zero the Linear biases, then re-draw the attention output projections with a
    smaller standard deviation.
    """
    for module in self.modules():
        # Linear and Embedding weights share the same normal initialization
        if isinstance(module, (nn.Linear, nn.Embedding)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        # only Linear layers can carry a bias, and it may be disabled
        if isinstance(module, nn.Linear) and module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    # apply special scaled init to the residual projections, per GPT-2 paper
    residual_std = 0.02/math.sqrt(2 * self.num_layer)
    for name, param in self.named_parameters():
        if name.endswith('proj_layer.weight'):
            torch.nn.init.normal_(param, mean=0.0, std=residual_std)
|
||||
|
||||
|
||||
def forward(self, word_idx, targets=None):
    """
    word_idx: 2-D tensor of token ids, unpacked below as (batch_size, seq_len)
    targets: optional tensor of target token ids; positions equal to 0 are ignored by the loss

    returns: (logits, loss) when `targets` is given, otherwise (logits, attention_weights)
    where `attention_weights` is a list with one entry per transformer layer.

    This is a homework template: every `???` is a placeholder that has to be
    replaced (TODO 3) before the module can be imported or run.
    """
    batch_size, seq_len = word_idx.shape

    # >>> TODO 3: complete the forward process of GPT
    # Step 3.1: use torch.arange(?, dtype=torch.long, device=word_idx.device) to generate the position sequence `pos` [0, 1, ..., seq_len-1]
    pos = ???

    # Step 3.2: use self.word_token_embedding() and self.word_pos_embedding() to transfer `word_idx` and `pos` to embeddings (`token_embed` and `pos_embed`)
    token_embed = ???
    pos_embed = ???

    # Step 3.3: initialize the input embeddings `x` of transformer layers
    # add the token embeddings and position embeddings to obtain the input embeddings `x` if self.no_pos is False
    if ???:
        x = ???
    else:
        x = ???

    # apply dropout to the input embeddings via `self.drop()`
    x = ???

    # Step 3.4: use for loop to obtain the output and attention weights of multiple transformer layers
    # define a list `attention_weights` and append the attention weights of each transformer layer into the list
    attention_weights = ???
    for ???:
        # Step 3.4.1: obtain the output and attention weights of transformer layers
        x, attn = ???
        # Step 3.4.2: append the attention weights of transformer layers into the list `attention_weights`
        ???

    # Step 3.5: use self.norm() to normalize the output of transformer layers and then use self.language_model_head() to obtain the `logits` for prediction
    # self.language_model_head() is a linear layer defined in __init__() function
    # Note: do not add softmax here since it is included in the cross entropy loss function
    x = ???
    logits = ???
    # <<< TODO 3

    # return logits and loss or attention weights
    if targets is not None:
        # flatten all positions into one batch dimension; target id 0 is excluded from the loss via ignore_index
        loss = torch.nn.functional.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=0)
        return logits, loss
    assert isinstance(attention_weights, list), "attention_weights must be a list, please check whether to append the attention weights of all transformer layers into it!"
    return logits, attention_weights
|
||||
|
||||
def configure_optimizers(self, weight_decay):
    """
    Split the parameters of the model into two optimizer groups and return them.

    The first group (weights of nn.Linear modules) uses `weight_decay`; the second
    group (all biases and the weights of LayerNorm/Embedding modules) uses 0.0.
    The result is a list of two dicts with the keys "params" and "weight_decay",
    in the format accepted by PyTorch optimizers.
    """
    decay_types = (nn.Linear, )
    no_decay_types = (nn.LayerNorm, torch.nn.Embedding)
    decay, no_decay = set(), set()

    # named_modules() and named_parameters() are both recursive, so each tensor is
    # visited several times; only the visit through its direct parent module has a
    # module type that matches one of the two tuples above.
    for module_name, module in self.named_modules():
        for param_name, _ in module.named_parameters():
            # full param name
            full_name = '%s.%s' % (module_name, param_name) if module_name else param_name
            if param_name.endswith('bias'):
                # all biases will not be decayed
                no_decay.add(full_name)
                continue
            if not param_name.endswith('weight'):
                continue
            if isinstance(module, decay_types):
                # weights of Linear modules will be weight decayed
                decay.add(full_name)
            elif isinstance(module, no_decay_types):
                # weights of LayerNorm/Embedding modules will NOT be weight decayed
                no_decay.add(full_name)

    # subtle: 'word_token_embedding.weight' and 'language_model_head.weight' are tied,
    # so the same tensor ended up in no_decay and decay respectively. named_parameters()
    # doesn't return duplicates and only reports the first occurrence, so the second
    # name is dropped here: the tensor is optimized once, and not decayed.
    decay.remove('language_model_head.weight')

    # validate that we considered every parameter
    named = {name: param for name, param in self.named_parameters()}
    overlap = decay & no_decay
    covered = decay | no_decay
    assert len(overlap) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(overlap), )
    assert len(named.keys() - covered) == 0, "parameters %s were not separated into either decay/no_decay set!" \
        % (str(named.keys() - covered), )

    # build the parameter groups (sorted by name for a stable order)
    decayed_params = [named[name] for name in sorted(decay)]
    plain_params = [named[name] for name in sorted(no_decay)]
    optim_groups = [
        {"params": decayed_params, "weight_decay": weight_decay},
        {"params": plain_params, "weight_decay": 0.0},
    ]
    return optim_groups
|
||||
|
||||
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    """
    Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
    the sequence max_new_tokens times, feeding the predictions back into the model each time.
    Most likely you'll want to make sure to be in model.eval() mode of operation for this.

    idx: LongTensor of shape (b,t) with the conditioning token ids
    max_new_tokens: number of tokens to sample and append
    temperature: logits are divided by this value before the softmax
    top_k: if not None, only the top_k most likely tokens can be sampled
    returns: the extended sequence as a numpy array, with size-1 dimensions squeezed out
    """
    for _ in range(max_new_tokens):
        # if the sequence context is growing too long we must crop it at max_seq_len,
        # because the position embedding table only has max_seq_len entries
        idx_cond = idx if idx.size(1) <= self.max_seq_len else idx[:, -self.max_seq_len:]
        # forward the model to get the logits for the index in the sequence
        logits, _ = self(idx_cond)
        # pluck the logits at the final step and scale by desired temperature
        logits = logits[:, -1, :] / temperature
        # optionally crop the logits to only the top k options
        if top_k is not None:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            # v[:, [-1]] is the k-th largest logit of each row; everything below it is masked out
            logits[logits < v[:, [-1]]] = -float('Inf')
        # apply softmax to convert logits to (normalized) probabilities
        probs = F.softmax(logits, dim=-1)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1)
        # append sampled index to the running sequence and continue
        idx = torch.cat((idx, idx_next), dim=1)

    return idx.squeeze().cpu().numpy()
|
||||
############################################################
|
||||
|
||||
# Preset hyper-parameters, looked up by model name (e.g. `GPTConfig['mygpt']`).
# The selected dict is expanded into keyword arguments of GPT(...); it does not
# contain `vocab_size` or `max_seq_len`, which have to be supplied by the caller.
# NOTE(review): train.py writes extra keys into the selected dict in place, so the
# presets are mutated for the rest of the process - confirm this is intended.
GPTConfig = {
    'mygpt': dict(num_layer=4, embed_dim=128, num_head=4, feedforward_dim=128*4, dropout=0.0),
    'gpt2-mini': dict(num_layer=6, embed_dim=384, num_head=6, feedforward_dim=384*4, dropout=0.2),
    'gpt2': dict(num_layer=12, embed_dim=768, num_head=12, feedforward_dim=768*4, dropout=0.2),
}
|
||||
61
hw4/code/prepare.py
Normal file
61
hw4/code/prepare.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""
|
||||
Prepare the dataset for character-level language modeling.
|
||||
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
|
||||
"""
|
||||
import os
|
||||
import numpy as np
|
||||
import argparse
|
||||
import json
|
||||
|
||||
# ---- command line ----
parser = argparse.ArgumentParser()
parser.add_argument('--data_root', type=str, default='data/quansongci', help='data directory')
args = parser.parse_args()

# ---- load the raw samples ----
input_file_path = os.path.join(args.data_root, 'data.json')
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)['data']
print(f"length of dataset: {len(data):,}")

# ---- build the character vocabulary ----
# every distinct character of the corpus, in sorted order
chars = sorted(list(set(''.join(data))))
# two extra entries: <pad> and <eos>
vocab_size = len(chars) + 2
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# ids 0 and 1 are reserved, so real characters start at 2
stoi = {}
itos = {}
for char_id, ch in enumerate(chars, start=2):
    stoi[ch] = char_id
    itos[char_id] = ch
# the special tokens are added last, one pair at a time
for special_id, special in ((0, '<pad>'), (1, '<eos>')):
    stoi[special] = special_id
    itos[special_id] = special

# ---- 90% / 10% train / val split, in file order ----
n = len(data)
split_at = int(n*0.9)
train_data = data[:split_at]
val_data = data[split_at:]
print(f"train has {len(train_data):,} samples")
print(f"val has {len(val_data):,} samples")

# ---- write one JSON file per split ----
# the vocabulary is stored next to the samples, to help us encode/decode later
for split_name, split_data in (('train', train_data), ('val', val_data)):
    meta = {
        'data': split_data,
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }
    out_path = os.path.join(args.data_root, split_name + '.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(meta, f, ensure_ascii=False, indent=4)
|
||||
|
||||
76
hw4/code/sample.py
Normal file
76
hw4/code/sample.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""
|
||||
Sample from a trained model
|
||||
"""
|
||||
import os
|
||||
import pickle
|
||||
from contextlib import nullcontext
|
||||
import torch
|
||||
from model import GPTConfig, GPT
|
||||
import argparse
|
||||
from dataset import Converter, LMDataset
|
||||
|
||||
def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root, device):
    """
    Load `<ckpt_path>/best.pth` and print `num_samples` texts generated from the prompt `start`.

    start: prompt string; every character must be in the training vocabulary
    num_samples: how many samples to generate and print
    max_new_tokens: number of tokens generated per sample
    model_name: key into GPTConfig, used unless the checkpoint carries 'model_args'
    ckpt_path: directory that contains 'best.pth'
    data_root: directory of the training data (its vocabulary is used for encoding/decoding)
    device: 'cpu' or a device type accepted by torch.autocast
    """
    # sampling hyper-parameters
    temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability

    # the vocabulary comes from the training split
    train_set = LMDataset(data_root, 'train')
    converter = Converter(train_set.stoi, train_set.itos)

    # autocast setup; on cpu no autocast context is used at all
    dtype = 'float16' # 'float32' or 'bfloat16' or 'float16'
    dtype_table = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}
    ptdtype = dtype_table[dtype]
    if device == 'cpu':
        ctx = nullcontext()
    else:
        ctx = torch.autocast(device_type=device, dtype=ptdtype)

    # init from a model saved in a specific directory
    ckpt_path = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_path)
    checkpoint = torch.load(ckpt_path, map_location=device)
    # the preset is looked up first; arguments stored in the checkpoint take precedence
    preset = GPTConfig[model_name]
    gptconf = checkpoint['model_args'] if 'model_args' in checkpoint else preset
    model = GPT(**gptconf)
    model.load_state_dict(checkpoint['state_dict'])

    model.eval()
    model.to(device)

    # encode the beginning of the prompt and add a batch dimension
    prompt_ids = converter.single_encode(start)
    x = torch.from_numpy(prompt_ids)[None, ...].to(device).long()

    # run generation
    with torch.no_grad(), ctx:
        for _ in range(num_samples):
            generated = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(converter.single_decode(generated))
            print('---------------')
|
||||
|
||||
if __name__ == '__main__':

    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn

    # set configurations of the model and sampling process
    parser = argparse.ArgumentParser()
    parser.add_argument('--start', type=str, default='+++', help='start of the sample, e.g. "+++" or "+++清平乐"')
    # the default is an int, matching type=int (it used to be the string '10')
    parser.add_argument('--num_samples', type=int, default=10, help='the number of samples')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to load checkpoints')
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--device', type=str, help='cpu or cuda')

    opt = parser.parse_args()
    # pick the device automatically when it is not given on the command line
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # the prompt plus the generated tokens add up to 128 tokens in total
    sample(opt.start, opt.num_samples, 128-len(opt.start), opt.model_name, opt.ckpt_path, opt.data_root, opt.device)
|
||||
219
hw4/code/train.py
Normal file
219
hw4/code/train.py
Normal file
@@ -0,0 +1,219 @@
|
||||
import os
|
||||
import time
|
||||
import math
|
||||
import pickle
|
||||
from contextlib import nullcontext
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from model import GPT, GPTConfig
|
||||
from dataset import LMDataset, Converter
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# learning rate decay scheduler (cosine with warmup)
|
||||
def get_lr(it, learning_rate, min_lr=1e-4, warmup_iters=100, lr_decay_iters=6000):
    """
    Learning rate at iteration `it`: linear warmup followed by cosine decay.

    it: current iteration number
    learning_rate: peak learning rate, reached at the end of the warmup
    min_lr: learning rate at the end of (and after) the decay
    warmup_iters: number of warmup iterations
    lr_decay_iters: iteration at which the cosine decay reaches `min_lr`
    """
    # phase 1: ramp up linearly from 0 to learning_rate
    in_warmup = it < warmup_iters
    if in_warmup:
        return learning_rate * it / warmup_iters

    # phase 3: past the decay horizon, stay at the floor
    past_decay = it > lr_decay_iters
    if past_decay:
        return min_lr

    # phase 2: cosine decay from learning_rate down to min_lr
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    # cosine_weight goes from 1 (start of decay) to 0 (end of decay)
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
    return min_lr + cosine_weight * (learning_rate - min_lr)
|
||||
|
||||
def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, device='cpu', no_res=False, no_pos=False):
    """Train a GPT model from scratch, validating and checkpointing periodically.

    Every ``val_interval`` iterations the validation loss is estimated and
    ``latest.pth`` is written to ``ckpt_path``; ``best.pth`` is also written
    whenever the validation loss improves. When training ends, the loss and
    perplexity curves are plotted via ``plot``.

    Args:
        data_root: Location passed to ``LMDataset`` for the 'train' and 'val' splits.
        model_name: Key into ``GPTConfig`` selecting the model hyper-parameters.
        batch_size: Training batch size (cast to ``int``).
        n_iters: Total number of optimizer steps to run.
        ckpt_path: Directory where checkpoints and the plot are saved.
        val_interval: Number of iterations between validation runs.
        device: Device string such as 'cpu', 'cuda' or 'cuda:0'.
        no_res: Forwarded to the model as ``no_res``.
        no_pos: Forwarded to the model as ``no_pos``.
    """
    train_dataset = LMDataset(data_root, 'train')
    val_dataset = LMDataset(data_root, 'val')
    train_loader = DataLoader(train_dataset, batch_size=int(batch_size), shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    converter = Converter(train_dataset.stoi, train_dataset.itos)

    # adamw optimizer
    learning_rate = 5e-3  # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.99
    grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0

    # system
    # autocast only accepts the bare device type ('cuda'), not an indexed
    # device string ('cuda:0'), so normalise before using it.
    device_type = torch.device(device).type
    # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
    dtype = 'bfloat16' if device_type == 'cpu' else 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = nullcontext() if device_type == 'cpu' else torch.autocast(device_type=device_type, dtype=ptdtype)
    best_val_loss = 1e9
    iter_num = 0  # number of iterations in the lifetime of this process

    # model init: work on a copy so the shared GPTConfig entry is not mutated
    model_args = dict(GPTConfig[model_name])
    model_args['vocab_size'] = train_dataset.vocab_size
    model_args['max_seq_len'] = 128
    model_args['no_res'] = no_res
    model_args['no_pos'] = no_pos

    # init a new model from scratch
    print("Initializing a new model from scratch")
    model = GPT(**model_args)
    model.to(device)

    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

    # optimizer
    optim_groups = model.configure_optimizers(weight_decay)
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))

    # make sure checkpoints can be written even when train() is called directly
    os.makedirs(ckpt_path, exist_ok=True)

    print('training...')
    # enough passes over the data to reach n_iters optimizer steps
    epoch_num = int(np.ceil(n_iters * int(batch_size) / float(len(train_dataset))))
    t0 = time.time()
    model.train()
    train_losses = []
    val_losses = []
    for _ in range(epoch_num):
        if iter_num >= n_iters:
            break
        for inputs in train_loader:
            if iter_num >= n_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)

            # set this iteration's learning rate on every parameter group
            lr = get_lr(iter_num, learning_rate, lr_decay_iters=n_iters)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # forward pass, under autocast when not on CPU
            with ctx:
                _, loss = model(X, Y)

            # backward pass, with gradient scaling if training in fp16
            scaler.scale(loss).backward()
            # clip the gradient (unscale first so the threshold applies to true grads)
            if grad_clip != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            # step the optimizer and scaler if training in fp16
            scaler.step(optimizer)
            scaler.update()
            # flush the gradients as soon as we can, no need for this memory anymore
            optimizer.zero_grad(set_to_none=True)

            iter_num += 1
            lossf = loss.item()
            train_losses.append(lossf)

            # evaluate the loss on the val set and write checkpoints
            if iter_num % val_interval == 0:
                # timing and logging
                t1 = time.time()
                dt = t1 - t0
                t0 = t1
                print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")

                losses = estimate_loss(model, val_loader, converter, ctx, device)
                val_loss = losses['val']
                val_losses.append(val_loss)
                print(f"iter {iter_num}: val loss {val_loss:.4f}")

                # update the best loss BEFORE building the checkpoint so the
                # saved 'best_val_loss' reflects this evaluation
                is_best = val_loss < best_val_loss
                if is_best:
                    best_val_loss = val_loss

                print(f"saving latest checkpoint to {ckpt_path}")
                checkpoint = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                }
                torch.save(checkpoint, os.path.join(ckpt_path, 'latest.pth'))

                if is_best:
                    print(f"saving best checkpoint to {ckpt_path}")
                    torch.save(checkpoint, os.path.join(ckpt_path, 'best.pth'))

    plot(n_iters, train_losses, val_losses, val_interval, ckpt_path)
|
||||
|
||||
def plot(n_iters, train_losses, val_losses, val_interval, ckpt_path):
    """Plot loss and perplexity curves, save them to ``ckpt_path`` and show them.

    Args:
        n_iters: Planned number of training iterations. Kept for interface
            compatibility; the x-axes are derived from the recorded losses so
            that the curves always line up with the data actually collected.
        train_losses: Per-iteration training losses.
        val_losses: Validation losses, one per ``val_interval`` iterations.
        val_interval: Number of training iterations between validations.
        ckpt_path: Directory in which 'loss&perplexity.jpg' is written.
    """
    # create a plot
    fig, (loss_ax, ppl_ax) = plt.subplots(1, 2, figsize=(18, 6))

    # Training losses are recorded at iterations 1..N and validation losses at
    # val_interval, 2*val_interval, ... Deriving both axes from the data keeps
    # x and y the same length even when n_iters is not a multiple of
    # val_interval (the previous arange(1, n_iters+1, val_interval) was not).
    train_iters = np.arange(1, len(train_losses) + 1)
    val_iters = val_interval * np.arange(1, len(val_losses) + 1)

    # draw loss
    loss_ax.plot(train_iters, train_losses)
    loss_ax.plot(val_iters, val_losses, 'r')

    # set labels
    loss_ax.set_xlabel('training iters')
    loss_ax.legend(['training loss', 'validation loss'])

    # perplexity is exp(loss)
    train_perplexity = np.exp(np.asarray(train_losses, dtype=float))
    val_perplexity = np.exp(np.asarray(val_losses, dtype=float))

    # draw perplexity
    ppl_ax.plot(train_iters, train_perplexity)
    ppl_ax.plot(val_iters, val_perplexity, 'r')

    # set labels
    ppl_ax.set_xlabel('training iters')
    ppl_ax.legend(['training perplexity', 'validation perplexity'])
    plt.tight_layout()

    # save, then show the image
    plt.savefig(os.path.join(ckpt_path, 'loss&perplexity.jpg'), dpi=300)
    plt.show()
|
||||
|
||||
# helps estimate the validation loss by averaging over many batches
@torch.no_grad()
def estimate_loss(model, val_loader, converter, ctx, device, max_iters=100):
    """Estimate the mean validation loss over at most ``max_iters`` batches.

    The model is switched to eval mode for the duration of the estimate and
    put back into train mode afterwards, even if evaluation raises.

    Args:
        model: Callable as ``model(X, Y)`` returning ``(logits, loss)``.
        val_loader: Iterable of raw validation batches.
        converter: Object whose ``encode(batch)`` returns the ``(X, Y)`` pair.
        ctx: Context manager wrapping the forward pass (e.g. autocast).
        device: Device the encoded batches are moved to.
        max_iters: Maximum number of batches to evaluate.

    Returns:
        A dict ``{'val': mean_loss}``. The mean is taken over the batches
        actually evaluated; it is NaN when no batch was evaluated.
    """
    total = 0.0
    n_batches = 0
    model.eval()
    try:
        for inputs in val_loader:
            if n_batches >= max_iters:
                break
            n_batches += 1
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)
            with ctx:
                _, loss = model(X, Y)
            total += loss.item()
    finally:
        model.train()
    # divide by the number of batches seen, not the cap: the loader may be
    # shorter than max_iters, which previously under-reported the loss
    mean_loss = total / n_batches if n_batches else float('nan')
    return {'val': mean_loss}
|
||||
|
||||
if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn

    # set configurations of the model and training process
    # (help texts corrected: --iters counts iterations, not epochs; the model is
    # built from scratch from a GPTConfig entry; --no_res/--no_pos are opt-outs)
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model configuration in GPTConfig')
    parser.add_argument('--iters', type=int, default=1000, help='number of training iterations')
    parser.add_argument('--batchsize', type=int, default=16, help='training batch size')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to save checkpoints')
    parser.add_argument('--val_interval', type=int, default=20, help='iter intervals of validation')
    parser.add_argument('--no_res', action='store_true', help='train without residual connections')
    parser.add_argument('--no_pos', action='store_true', help='train without positional encoding')
    parser.add_argument('--device', type=str, help='cpu or cuda')

    opt = parser.parse_args()
    # default to the GPU when one is available and no device was requested
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    os.makedirs(opt.ckpt_path, exist_ok=True)
    train(opt.data_root, opt.model_name, opt.batchsize, opt.iters, opt.ckpt_path, opt.val_interval, opt.device, opt.no_res, opt.no_pos)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user