Merge pull request 'Homework4 Submit' (#5) from homework4 into main
Reviewed-on: #5
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -12,3 +12,4 @@ hw2/code/checkpoints/
|
|||||||
hw2/code/visualized/
|
hw2/code/visualized/
|
||||||
hw3/code/data/
|
hw3/code/data/
|
||||||
hw3/code/checkpoints/
|
hw3/code/checkpoints/
|
||||||
|
hw4/code/workdirs/
|
||||||
163
hw4/code/attnvis.ipynb
Normal file
163
hw4/code/attnvis.ipynb
Normal file
File diff suppressed because one or more lines are too long
13426
hw4/code/data/quansongci/data.json
Normal file
13426
hw4/code/data/quansongci/data.json
Normal file
File diff suppressed because it is too large
Load Diff
22640
hw4/code/data/quansongci/train.json
Normal file
22640
hw4/code/data/quansongci/train.json
Normal file
File diff suppressed because it is too large
Load Diff
11904
hw4/code/data/quansongci/val.json
Normal file
11904
hw4/code/data/quansongci/val.json
Normal file
File diff suppressed because it is too large
Load Diff
2
hw4/code/data/vis/vis_1.txt
Normal file
2
hw4/code/data/vis/vis_1.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
+++如梦令
|
||||||
|
昨夜雨疏风骤。浓睡不消残酒。试问卷帘人,却道海棠依旧。知否。知否。应是绿肥红瘦。
|
||||||
3
hw4/code/data/vis/vis_2.txt
Normal file
3
hw4/code/data/vis/vis_2.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
+++鹧鸪天(秋思)
|
||||||
|
红蓼烟收尽暮鸦。秋风吹叶荻花秋。断肠天色连云散,黄叶荻花秋水流。
|
||||||
|
楼上角,笛声悠。兴王莫上叹人头。明朝归去无消息,只有当时一望流。
|
||||||
75
hw4/code/dataset.py
Normal file
75
hw4/code/dataset.py
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
import torch
|
||||||
|
from torch.utils.data import Dataset
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
|
||||||
|
class LMDataset(Dataset):
    """Text samples plus vocabulary metadata read from ``<data_dir>/<split>.json``.

    The JSON file must contain the keys ``data``, ``stoi``, ``itos`` and
    ``vocab_size``; each is exposed as an attribute of the same name.
    """

    def __init__(self, data_dir, split):
        super().__init__()
        json_path = os.path.join(data_dir, '%s.json' % split)
        with open(json_path, 'r', encoding='utf-8') as handle:
            meta = json.load(handle)

        self.data = meta['data']              # list of samples
        self.stoi = meta['stoi']              # character -> integer id
        self.itos = meta['itos']              # str(integer id) -> character
        self.vocab_size = meta['vocab_size']  # number of ids in the vocabulary

    def __len__(self):
        """Number of samples in this split."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample stored at ``index`` exactly as it appears in the file."""
        return self.data[index]
|
||||||
|
|
||||||
|
class Converter:
    '''
    Convert between strings and sequences of integer token ids.

    Id 0 is the padding token '<pad>' and id 1 is the end-of-sequence token '<eos>'.
    '''

    PAD_ID = 0  # id used to right-pad batched sequences
    EOS_ID = 1  # id that terminates a sequence

    def __init__(self, stoi, itos):
        self.stoi = stoi  # a dict that maps character to integer
        self.itos = itos  # a dict that maps string of integer to character

    def single_encode(self, s):
        '''
        Encode one string into an int64 numpy array of token ids.

        Raises KeyError if a character is missing from `stoi`.
        '''
        return np.array([self.stoi[ch] for ch in s], dtype=np.int64)

    def single_decode(self, l):
        '''
        Decode an iterable of token ids into a string, stopping at the first '<eos>'.

        Elements may be Python ints, numpy integers or 0-d torch tensors.
        '''
        chars = []
        for i in l:
            # normalise to a plain int so that str() yields the key format used by `itos`
            i = int(i)
            if i == self.EOS_ID:
                break
            chars.append(self.itos[str(i)])
        # join once instead of growing a string inside the loop
        return ''.join(chars)

    def encode(self, data):
        '''
        Encode a list of strings into a pair of (inputs, targets) LongTensors.

        Every string is followed by '<eos>' and right-padded with '<pad>' to the
        length of the longest string plus one. `x` drops the last column and `y`
        drops the first, so `y` is `x` shifted left by one position.

        Raises ValueError if `data` is empty.
        '''
        max_len = max(len(s) for s in data)
        out = np.full((len(data), max_len + 1), self.PAD_ID, dtype=np.int64)
        for row, s in enumerate(data):
            out[row, :len(s)] = self.single_encode(s)
            out[row, len(s)] = self.EOS_ID
        x = torch.from_numpy(out[:, :-1])
        y = torch.from_numpy(out[:, 1:])
        return x, y

    def decode(self, data):
        '''
        Decode a 2-D tensor of token ids into a list of strings, one per row.
        '''
        data = data.cpu().numpy().astype(np.int64)
        return [self.single_decode(row) for row in data]
|
||||||
356
hw4/code/model.py
Normal file
356
hw4/code/model.py
Normal file
@@ -0,0 +1,356 @@
|
|||||||
|
# ========================================================
|
||||||
|
# Media and Cognition
|
||||||
|
# Homework 4 Sequence Modeling
|
||||||
|
# model.py - Model definition
|
||||||
|
# Student ID: 2022010639
|
||||||
|
# Name: Yixuan Gao
|
||||||
|
# Tsinghua University
|
||||||
|
# (C) Copyright 2024
|
||||||
|
# ========================================================
|
||||||
|
|
||||||
|
|
||||||
|
# Import required libraries
|
||||||
|
############################################################
|
||||||
|
import math
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from torch.nn import functional as F
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
############################################################
|
||||||
|
|
||||||
|
# Define the GELU activation function used in OpenAI GPT
|
||||||
|
############################################################
|
||||||
|
def gelu(z):
    """
    Tanh approximation of the Gaussian Error Linear Unit.

    Reference: https://arxiv.org/abs/1606.08415
    gelu(z) = 0.5 * z * (1 + tanh(sqrt(2/pi) * (z + 0.044715 * z^3)))
    """
    scale = math.sqrt(2.0 / math.pi)
    inner = z + 0.044715 * torch.pow(z, 3.0)
    return 0.5 * z * (1.0 + torch.tanh(scale * inner))
|
||||||
|
|
||||||
|
############################################################
|
||||||
|
|
||||||
|
# Define the Multi-Head SelfAttention module
|
||||||
|
############################################################
|
||||||
|
class SelfAttention(nn.Module):
    """
    Multi-head causal self-attention.

    embed_dim: size of the input and output features
    num_head: number of attention heads; must divide embed_dim
    dropout: dropout ratio applied to the attention weights and to the output projection
    """

    def __init__(self, embed_dim, num_head, dropout):
        super().__init__()

        # fail early with a clear message instead of an opaque reshape error in forward()
        if embed_dim % num_head != 0:
            raise ValueError(
                "embed_dim (%d) must be divisible by num_head (%d)" % (embed_dim, num_head)
            )

        # three separate linear layers generate q, k and v
        self.q_layer = nn.Linear(embed_dim, embed_dim)
        self.k_layer = nn.Linear(embed_dim, embed_dim)
        self.v_layer = nn.Linear(embed_dim, embed_dim)

        # output projection applied to the concatenated heads
        self.proj_layer = nn.Linear(embed_dim, embed_dim)

        # dropout for the attention weights and for the projected output
        self.attn_drop = nn.Dropout(dropout)
        self.proj_drop = nn.Dropout(dropout)

        self.num_head = num_head
        self.head_dim = embed_dim // num_head

    def forward(self, x):
        """
        x: tensor of shape (batch_size, seq_len, embed_dim)

        Returns `(result, attn)`:
        result: (batch_size, seq_len, embed_dim), the projected attention output
        attn: (batch_size, num_head, seq_len, seq_len), the attention weights after dropout
        """
        batch_size, seq_len, _ = x.shape

        def split_heads(t):
            # (batch_size, seq_len, embed_dim) -> (batch_size, num_head, seq_len, head_dim)
            return t.reshape(batch_size, seq_len, self.num_head, self.head_dim).transpose(1, 2)

        q = split_heads(self.q_layer(x))
        k = split_heads(self.k_layer(x))
        v = split_heads(self.v_layer(x))

        # scaled dot-product scores; a Python float scale avoids building a tensor on every call
        attn = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # causal mask: True strictly above the diagonal, i.e. at positions in the future
        attn_mask = torch.triu(torch.ones(seq_len, seq_len, device=x.device), diagonal=1).bool()
        attn = attn.masked_fill(attn_mask, float('-inf'))

        # normalize over the key dimension, then apply dropout to the weights
        attn = torch.softmax(attn, dim=-1)
        attn = self.attn_drop(attn)

        # weighted sum of the values: (batch_size, num_head, seq_len, head_dim)
        out = torch.matmul(attn, v)

        # concatenate the heads back to (batch_size, seq_len, num_head * head_dim)
        out = out.transpose(1, 2).reshape(batch_size, seq_len, self.num_head * self.head_dim)

        result = self.proj_drop(self.proj_layer(out))
        return result, attn
|
||||||
|
|
||||||
|
############################################################
|
||||||
|
|
||||||
|
# Define the feed forward network (FFN)
|
||||||
|
############################################################
|
||||||
|
class FFN(nn.Module):
    """Feed-forward network: Linear -> GELU -> Linear -> Dropout."""

    def __init__(self, embed_dim, feedforward_dim, dropout):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, feedforward_dim)  # expand to the hidden width
        self.fc2 = nn.Linear(feedforward_dim, embed_dim)  # project back to embed_dim
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the two linear layers with a GELU in between, then dropout."""
        hidden = gelu(self.fc1(x))
        return self.drop(self.fc2(hidden))
|
||||||
|
############################################################
|
||||||
|
|
||||||
|
# Define the TransformerLayer
|
||||||
|
############################################################
|
||||||
|
class TransformerLayer(nn.Module):
    """
    Pre-LayerNorm transformer layer: self-attention followed by a feed-forward network.

    no_res: when True, both residual connections are skipped
    """

    def __init__(self, embed_dim, num_head, feedforward_dim, dropout, no_res):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = SelfAttention(embed_dim, num_head, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, feedforward_dim, dropout)
        self.no_res = no_res  # True disables the residual connections

    def forward(self, x):
        """Return `(out, attn)`: the layer output and the attention weights from `self.attn`."""
        # attention sub-layer on the normalized input
        attn_out, attn = self.attn(self.norm1(x))

        if self.no_res:
            # no residuals: the feed-forward output is the layer output
            return self.ffn(self.norm2(attn_out)), attn

        # residual around attention, then residual around the feed-forward network
        hidden = attn_out + x
        return hidden + self.ffn(self.norm2(hidden)), attn
|
||||||
|
############################################################
|
||||||
|
|
||||||
|
# Define the GPT module
|
||||||
|
############################################################
|
||||||
|
class GPT(nn.Module):
    """Decoder-only transformer language model with tied input/output embeddings."""

    def __init__(self, vocab_size, max_seq_len, num_layer, embed_dim, num_head, feedforward_dim, dropout, no_res=False, no_pos=False):
        '''
        vocab_size: the size of vocabulary
        max_seq_len: the maximum length of input texts
        num_layer: the number of transformer layers
        embed_dim: the embedding dimension
        num_head: the number of heads in Multi-Head Self Attention
        feedforward_dim: the dimension in the feed forward network
        dropout: dropout ratio
        no_res: if True, transformer layers skip their residual connections
        no_pos: if True, position embeddings are not added to the token embeddings
        '''
        super().__init__()
        self.num_layer = num_layer
        self.max_seq_len = max_seq_len
        self.no_pos = no_pos

        # embedding layers for input tokens and for positions
        self.word_token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.word_pos_embedding = nn.Embedding(max_seq_len, embed_dim)

        self.drop = nn.Dropout(dropout)

        # the stack of transformer layers
        self.transformer = nn.ModuleList([TransformerLayer(embed_dim, num_head, feedforward_dim, dropout, no_res) for _ in range(num_layer)])

        # final normalization and the head that predicts the next token
        self.norm = nn.LayerNorm(embed_dim)
        self.language_model_head = nn.Linear(embed_dim, vocab_size, bias=False)

        # Weight tying: the token embedding and the output head share one weight tensor.
        # Reference: https://paperswithcode.com/method/weight-tying
        self.word_token_embedding.weight = self.language_model_head.weight

        self.init_weights()

    def init_weights(self):
        """Initialize Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)

        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('proj_layer.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * self.num_layer))

    def forward(self, word_idx, targets=None):
        """
        word_idx: LongTensor of token ids with shape (batch_size, seq_len)
        targets: optional LongTensor of target ids with the same shape as `word_idx`

        Returns `(logits, loss)` when `targets` is given, where the loss is the
        cross entropy that ignores target id 0; otherwise returns
        `(logits, attention_weights)` with one attention tensor per transformer layer.

        Raises ValueError if position embeddings are used and seq_len exceeds max_seq_len.
        """
        _, seq_len = word_idx.shape

        x = self.word_token_embedding(word_idx)
        if not self.no_pos:
            # the position table only has max_seq_len rows; reject longer inputs up
            # front instead of failing inside the embedding lookup
            if seq_len > self.max_seq_len:
                raise ValueError(
                    "sequence length %d exceeds max_seq_len %d" % (seq_len, self.max_seq_len)
                )
            pos = torch.arange(seq_len, dtype=torch.long, device=word_idx.device)
            x = x + self.word_pos_embedding(pos)

        x = self.drop(x)

        # run the transformer layers, collecting the attention weights of each one
        attention_weights = []
        for layer in self.transformer:
            x, attn = layer(x)
            attention_weights.append(attn)

        # no softmax here: it is included in the cross entropy loss
        logits = self.language_model_head(self.norm(x))

        if targets is not None:
            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=0)
            return logits, loss
        return logits, attention_weights

    def configure_optimizers(self, weight_decay):
        """
        Split the parameters into two groups: those that receive weight decay
        (weights of Linear layers) and those that do not (biases, LayerNorm and
        Embedding weights).

        Returns a list of two param-group dicts suitable for a torch optimizer;
        this method does not create the optimizer itself.
        """
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (nn.Linear, )
        blacklist_weight_modules = (nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn  # full param name
                # named_modules and named_parameters are both recursive, so the same
                # tensor is seen several times; only the visit through its direct
                # parent module matches one of the isinstance checks below
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # 'word_token_embedding.weight' and 'language_model_head.weight' are the same
        # tied tensor, so it lands in no_decay and decay respectively. named_parameters()
        # on the whole model reports the tensor only once, under
        # 'word_token_embedding.weight', so drop the duplicate name from the decay set:
        # the tensor is optimized once and not decayed.
        decay.discard('language_model_head.weight')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
            % (str(param_dict.keys() - union_params), )

        # build the parameter groups in a deterministic (sorted) order
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(decay)], "weight_decay": weight_decay},
            {"params": [param_dict[pn] for pn in sorted(no_decay)], "weight_decay": 0.0},
        ]
        return optim_groups

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        Returns the full sequence (prompt plus generated ids) as a squeezed numpy array.
        """
        for _ in range(max_new_tokens):
            # keep only the last max_seq_len tokens as context once the sequence
            # grows past what the position embedding table can index
            idx_cond = idx if idx.size(1) <= self.max_seq_len else idx[:, -self.max_seq_len:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx.squeeze().cpu().numpy()
|
||||||
|
############################################################
|
||||||
|
|
||||||
|
# Architecture presets keyed by model name; feedforward_dim is always 4 * embed_dim.
GPTConfig = {
    name: dict(num_layer=layers, embed_dim=width, num_head=heads, feedforward_dim=width*4, dropout=drop)
    for name, layers, width, heads, drop in (
        ('mygpt', 4, 128, 4, 0.0),
        ('gpt2-mini', 6, 384, 6, 0.2),
        ('gpt2', 12, 768, 12, 0.2),
    )
}
|
||||||
61
hw4/code/prepare.py
Normal file
61
hw4/code/prepare.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
"""
|
||||||
|
Prepare the dataset for character-level language modeling.
|
||||||
|
So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser()
parser.add_argument('--data_root', type=str, default='data/quansongci', help='data directory')
args = parser.parse_args()

# read the raw samples from <data_root>/data.json
input_file_path = os.path.join(args.data_root, 'data.json')
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)['data']
print(f"length of dataset: {len(data):,}")

# the vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(list(set(''.join(data))))
vocab_size = len(chars) + 2  # two extra ids for <pad> and <eos>
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# character ids start at 2; ids 0 and 1 are reserved for <pad> and <eos>
stoi = {ch: idx for idx, ch in enumerate(chars, start=2)}
itos = {idx: ch for idx, ch in enumerate(chars, start=2)}
stoi.update({'<pad>': 0, '<eos>': 1})
itos.update({0: '<pad>', 1: '<eos>'})

# the first 90% of the samples form the train split, the rest the val split
n = len(data)
split_at = int(n*0.9)
train_data, val_data = data[:split_at], data[split_at:]
print(f"train has {len(train_data):,} samples")
print(f"val has {len(val_data):,} samples")

# write each split together with the vocabulary, to help us encode/decode later
for file_name, split_data in (('train.json', train_data), ('val.json', val_data)):
    split_meta = {
        'data': split_data,
        'vocab_size': vocab_size,
        'itos': itos,
        'stoi': stoi,
    }
    with open(os.path.join(args.data_root, file_name), 'w', encoding='utf-8') as f:
        json.dump(split_meta, f, ensure_ascii=False, indent=4)
|
||||||
|
|
||||||
76
hw4/code/sample.py
Normal file
76
hw4/code/sample.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""
|
||||||
|
Sample from a trained model
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
from contextlib import nullcontext
|
||||||
|
import torch
|
||||||
|
from model import GPTConfig, GPT
|
||||||
|
import argparse
|
||||||
|
from dataset import Converter, LMDataset
|
||||||
|
|
||||||
|
def sample(start, num_samples, max_new_tokens, model_name, ckpt_path, data_root, device):
    """
    Load `best.pth` from `ckpt_path` and print `num_samples` generated texts.

    start: prompt string; every character must be in the training vocabulary
    num_samples: number of texts to generate
    max_new_tokens: number of tokens generated after the prompt
    model_name: key into GPTConfig, used when the checkpoint stores no 'model_args'
    ckpt_path: directory that contains 'best.pth'
    data_root: directory that contains 'train.json' (source of the vocabulary)
    device: device string such as 'cpu', 'cuda' or 'cuda:0'
    """
    dataset = LMDataset(data_root, 'train')
    converter = Converter(dataset.stoi, dataset.itos)
    temperature = 0.8  # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    top_k = 200  # retain only the top_k most likely tokens, clamp others to have 0 probability

    # mixed precision: autocast expects a device *type*, so strip any ':<index>' suffix
    dtype = 'float16'  # 'float32' or 'bfloat16' or 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    device_type = str(device).split(':')[0]
    if device_type in ('cpu', 'mps'):
        ctx = nullcontext()
    else:
        ctx = torch.autocast(device_type=device_type, dtype=ptdtype)

    # init from a model saved in a specific directory
    ckpt_path = os.path.join(ckpt_path, 'best.pth')
    print("sample from %s"%ckpt_path)
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source
    checkpoint = torch.load(ckpt_path, map_location=device)

    # Copy the preset so the shared GPTConfig entry is never mutated, and fill in the
    # two constructor arguments the presets do not carry. 128 mirrors the max_seq_len
    # that train.py uses -- NOTE(review): confirm if that value ever changes.
    gptconf = dict(GPTConfig[model_name])
    gptconf.setdefault('vocab_size', dataset.vocab_size)
    gptconf.setdefault('max_seq_len', 128)
    if 'model_args' in checkpoint:
        # prefer the exact arguments the checkpoint was trained with
        gptconf = checkpoint['model_args']
    model = GPT(**gptconf)
    model.load_state_dict(checkpoint['state_dict'])

    model.eval()
    model.to(device)

    # encode the beginning of the prompt and add a batch dimension
    start_ids = converter.single_encode(start)
    x = torch.from_numpy(start_ids)[None, ...].to(device).long()

    # run generation
    with torch.no_grad(), ctx:
        for _ in range(num_samples):
            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
            print(converter.single_decode(y))
            print('---------------')
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn

    # set configurations of the model and sampling process
    parser = argparse.ArgumentParser()
    parser.add_argument('--start', type=str, default='+++', help='start of the sample, e.g. "+++" or "+++清平乐"')
    # the default is an int so it matches the declared type
    parser.add_argument('--num_samples', type=int, default=10, help='the number of samples')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to load checkpoints')
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='file of training and validation data')
    parser.add_argument('--device', type=str, help='cpu or cuda')

    opt = parser.parse_args()
    # fall back to CUDA when available if no device was requested
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # prompt plus generated tokens add up to 128 tokens in total
    sample(opt.start, opt.num_samples, 128-len(opt.start), opt.model_name, opt.ckpt_path, opt.data_root, opt.device)
|
||||||
219
hw4/code/train.py
Normal file
219
hw4/code/train.py
Normal file
@@ -0,0 +1,219 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
import math
|
||||||
|
import pickle
|
||||||
|
from contextlib import nullcontext
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
|
from model import GPT, GPTConfig
|
||||||
|
from dataset import LMDataset, Converter
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# learning rate decay scheduler (cosine with warmup)
|
||||||
|
def get_lr(it, learning_rate, min_lr=1e-4, warmup_iters=100, lr_decay_iters=6000):
    """
    Learning rate at iteration `it`: linear warmup followed by cosine decay.

    it: current iteration
    learning_rate: peak learning rate reached at the end of warmup
    min_lr: learning rate returned once `it` is past `lr_decay_iters`
    warmup_iters: number of linear warmup iterations
    lr_decay_iters: iteration at which the cosine decay reaches `min_lr`
    """
    # phase 1: ramp up linearly from 0 to the peak learning rate
    if it < warmup_iters:
        return learning_rate * it / warmup_iters

    # phase 3: past the decay horizon the rate stays at its floor
    if it > lr_decay_iters:
        return min_lr

    # phase 2: cosine interpolation from the peak down to min_lr
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))  # goes from 1 down to 0
    return min_lr + cosine * (learning_rate - min_lr)
|
||||||
|
|
||||||
|
def train(data_root, model_name, batch_size, n_iters, ckpt_path, val_interval, device='cpu', no_res=False, no_pos=False):
    """Train a GPT model from scratch, validating and checkpointing periodically.

    Args:
        data_root: Directory passed to ``LMDataset`` for the 'train' and 'val' splits.
        model_name: Key into ``GPTConfig`` selecting the model hyper-parameters.
        batch_size: Training batch size (converted with ``int``).
        n_iters: Total number of optimizer steps to run.
        ckpt_path: Directory where ``latest.pth``, ``best.pth`` and the loss
            plot are written.
        val_interval: Run validation and save checkpoints every this many steps.
        device: Torch device string, e.g. 'cpu', 'mps', 'cuda' or 'cuda:0'.
        no_res: Forwarded to the model as ``no_res``.
        no_pos: Forwarded to the model as ``no_pos``.
    """
    batch_size = int(batch_size)
    train_dataset = LMDataset(data_root, 'train')
    val_dataset = LMDataset(data_root, 'val')
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    converter = Converter(train_dataset.stoi, train_dataset.itos)

    # adamw optimizer
    learning_rate = 5e-3  # max learning rate
    weight_decay = 1e-1
    beta1 = 0.9
    beta2 = 0.99
    grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0

    # system
    # autocast wants the bare device type ('cuda'), not an indexed name ('cuda:0')
    device_type = torch.device(device).type
    use_autocast = device_type not in ('cpu', 'mps')
    dtype = 'bfloat16' if device_type == 'cpu' else 'float16'
    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
    ctx = torch.autocast(device_type=device_type, dtype=ptdtype) if use_autocast else nullcontext()
    best_val_loss = 1e9
    iter_num = 0  # number of iterations in the lifetime of this process

    # model init; copy the config entry so the shared GPTConfig is not mutated
    model_args = dict(GPTConfig[model_name])
    model_args['vocab_size'] = train_dataset.vocab_size
    model_args['max_seq_len'] = 128
    model_args['no_res'] = no_res
    model_args['no_pos'] = no_pos

    # init a new model from scratch
    print("Initializing a new model from scratch")
    model = GPT(**model_args)
    model.to(device)

    # initialize a GradScaler. If enabled=False scaler is a no-op;
    # only enable it when float16 autocast is actually in use
    scaler = torch.cuda.amp.GradScaler(enabled=(use_autocast and dtype == 'float16'))

    # optimizer
    optim_groups = model.configure_optimizers(weight_decay)
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(beta1, beta2))

    print('training...')
    # number of passes over the training set needed to reach n_iters steps
    epoch_num = math.ceil(n_iters * batch_size / len(train_dataset))
    t0 = time.time()
    model.train()
    train_losses = []
    val_losses = []
    for _ in range(epoch_num):
        # stop before building another loader iterator once training is done
        if iter_num >= n_iters:
            break
        for inputs in train_loader:
            if iter_num >= n_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)

            # set the scheduled learning rate for this step
            lr = get_lr(iter_num, learning_rate, lr_decay_iters=n_iters)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # forward pass under autocast (a no-op context on cpu/mps)
            with ctx:
                _, loss = model(X, Y)

            # backward pass, with gradient scaling if training in fp16
            scaler.scale(loss).backward()
            # clip the gradient
            if grad_clip != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            # step the optimizer and scaler if training in fp16
            scaler.step(optimizer)
            scaler.update()
            # flush the gradients as soon as we can, no need for this memory anymore
            optimizer.zero_grad(set_to_none=True)

            iter_num += 1
            lossf = loss.item()
            train_losses.append(lossf)

            # evaluate the loss on the val set and write checkpoints
            if iter_num % val_interval == 0:
                # timing and logging
                t1 = time.time()
                dt = t1 - t0
                t0 = t1
                print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
                losses = estimate_loss(model, val_loader, converter, ctx, device)
                val_losses.append(losses['val'])
                print(f"iter {iter_num}: val loss {losses['val']:.4f}")

                # update the best loss BEFORE building the checkpoint so the
                # value stored in latest.pth / best.pth is not one step stale
                is_best = losses['val'] < best_val_loss
                if is_best:
                    best_val_loss = losses['val']

                print(f"saving latest checkpoint to {ckpt_path}")
                checkpoint = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                }
                torch.save(checkpoint, os.path.join(ckpt_path, 'latest.pth'))

                if is_best:
                    print(f"saving best checkpoint to {ckpt_path}")
                    torch.save(checkpoint, os.path.join(ckpt_path, 'best.pth'))

    plot(n_iters, train_losses, val_losses, val_interval, ckpt_path)
|
||||||
|
|
||||||
|
def plot(n_iters, train_losses, val_losses, val_interval, ckpt_path):
    """Plot loss and perplexity curves, save them under ``ckpt_path`` and show them.

    Args:
        n_iters: Total number of training iterations. Kept for interface
            compatibility; the x axes are derived from the lengths of the
            loss lists so they always match the data.
        train_losses: Training loss recorded at every iteration.
        val_losses: Validation loss recorded every ``val_interval`` iterations.
        val_interval: Number of training iterations between validations.
        ckpt_path: Directory in which 'loss&perplexity.jpg' is written.
    """
    # create a plot
    f, ax = plt.subplots(1, 2, figsize=(18, 6))

    # x positions: the i-th training loss belongs to iteration i (1-based) and
    # the k-th validation loss to iteration k * val_interval; deriving both
    # from the list lengths prevents x/y length mismatches
    train_iters = np.arange(1, len(train_losses) + 1)
    val_iters = val_interval * np.arange(1, len(val_losses) + 1)

    # draw loss
    ax[0].plot(train_iters, train_losses)
    ax[0].plot(val_iters, val_losses, 'r')

    # set labels
    ax[0].set_xlabel('training iters')
    ax[0].legend(['training loss', 'validation loss'])

    # perplexity is exp(loss)
    train_perplexity = np.exp(np.asarray(train_losses, dtype=float))
    val_perplexity = np.exp(np.asarray(val_losses, dtype=float))
    # draw perplexity
    ax[1].plot(train_iters, train_perplexity)
    ax[1].plot(val_iters, val_perplexity, 'r')

    # set labels
    ax[1].set_xlabel('training iters')
    ax[1].legend(['training perplexity', 'validation perplexity'])
    plt.tight_layout()

    # save the image, then show it
    plt.savefig(os.path.join(ckpt_path, 'loss&perplexity.jpg'), dpi=300)
    plt.show()
|
||||||
|
|
||||||
|
# estimates the validation loss by averaging over a capped number of batches from val_loader
|
||||||
|
@torch.no_grad()
def estimate_loss(model, val_loader, converter, ctx, device):
    """Return the mean validation loss over at most 100 batches of ``val_loader``.

    The model is switched to eval mode for the evaluation and back to train
    mode afterwards, even if evaluation raises.

    Args:
        model: Callable as ``model(X, Y)`` returning ``(logits, loss)``; must
            provide ``eval()`` and ``train()``.
        val_loader: Iterable of raw validation batches.
        converter: Object whose ``encode(batch)`` returns the ``(X, Y)`` tensors.
        ctx: Context manager wrapping the forward pass (e.g. autocast).
        device: Device the tensors are moved to.

    Returns:
        ``{'val': mean_loss}``; ``mean_loss`` is NaN if the loader is empty.
    """
    max_iters = 100  # cap on the number of batches evaluated
    total = 0.0
    n_batches = 0
    model.eval()
    try:
        for inputs in val_loader:
            if n_batches >= max_iters:
                break
            X, Y = converter.encode(inputs)
            X, Y = X.to(device), Y.to(device)
            with ctx:
                _, loss = model(X, Y)
            total += loss.item()
            n_batches += 1
    finally:
        model.train()
    # average over the batches actually evaluated, not the cap, so a loader
    # with fewer than max_iters batches does not under-report the loss
    mean_loss = total / n_batches if n_batches else float('nan')
    return {'val': mean_loss}
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # set random seed for reproducibility
    seed = 2024
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn

    # set configurations of the model and training process
    # (help strings describe what each option actually controls)
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_root', type=str, default='data/quansongci', help='directory containing the training and validation data')
    parser.add_argument('--model_name', type=str, default='mygpt', help='name of the model configuration')
    parser.add_argument('--iters', type=int, default=1000, help='number of training iterations')
    parser.add_argument('--batchsize', type=int, default=16, help='training batch size')
    parser.add_argument('--ckpt_path', type=str, default='workdirs/quansongci', help='path to save checkpoints')
    parser.add_argument('--val_interval', type=int, default=20, help='iter intervals of validation')
    parser.add_argument('--no_res', action='store_true', help='disable residual connections')
    parser.add_argument('--no_pos', action='store_true', help='disable positional encoding')
    parser.add_argument('--device', type=str, help='cpu or cuda')

    opt = parser.parse_args()
    # default to the GPU when one is available
    if opt.device is None:
        opt.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # make sure the checkpoint directory exists before training writes to it
    os.makedirs(opt.ckpt_path, exist_ok=True)
    train(opt.data_root, opt.model_name, opt.batchsize, opt.iters, opt.ckpt_path, opt.val_interval, opt.device, opt.no_res, opt.no_pos)
|
||||||
|
|
||||||
|
|
||||||
132
hw4/report/dtx-style.sty
Normal file
132
hw4/report/dtx-style.sty
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
%%
|
||||||
|
%% This is file `dtx-style.sty',
|
||||||
|
%% generated with the docstrip utility.
|
||||||
|
%%
|
||||||
|
%% The original source files were:
|
||||||
|
%%
|
||||||
|
%% thucoursework.dtx (with options: `dtx-style')
|
||||||
|
%%
|
||||||
|
%% This is a generated file.
|
||||||
|
%%
|
||||||
|
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
|
||||||
|
%%
|
||||||
|
%% This work may be distributed and/or modified under the
|
||||||
|
%% conditions of the LaTeX Project Public License, either version 1.3
|
||||||
|
%% of this license or (at your option) any later version.
|
||||||
|
%% The latest version of this license is in
|
||||||
|
%% http://www.latex-project.org/lppl.txt
|
||||||
|
%% and version 1.3 or later is part of all distributions of LaTeX
|
||||||
|
%% version 2005/12/01 or later.
|
||||||
|
%%
|
||||||
|
%% To produce the documentation run the original source files ending with `.dtx'
|
||||||
|
%% through LaTeX.
|
||||||
|
%%
|
||||||
|
|
||||||
|
\ProvidesPackage{dtx-style}
|
||||||
|
\RequirePackage{hypdoc}
|
||||||
|
\RequirePackage[UTF8,scheme=chinese]{ctex}
|
||||||
|
\RequirePackage{newpxtext}
|
||||||
|
\RequirePackage{newpxmath}
|
||||||
|
\RequirePackage[
|
||||||
|
top=2.5cm, bottom=2.5cm,
|
||||||
|
left=4cm, right=2cm,
|
||||||
|
headsep=3mm]{geometry}
|
||||||
|
\RequirePackage{array,longtable,booktabs}
|
||||||
|
\RequirePackage{listings}
|
||||||
|
\RequirePackage{fancyhdr}
|
||||||
|
\RequirePackage{xcolor}
|
||||||
|
\RequirePackage{enumitem}
|
||||||
|
\RequirePackage{etoolbox}
|
||||||
|
\RequirePackage{metalogo}
|
||||||
|
|
||||||
|
\colorlet{thu@macro}{blue!60!black}
|
||||||
|
\colorlet{thu@env}{blue!70!black}
|
||||||
|
\colorlet{thu@option}{purple}
|
||||||
|
\patchcmd{\PrintMacroName}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
|
||||||
|
\patchcmd{\PrintDescribeMacro}{\MacroFont}{\MacroFont\bfseries\color{thu@macro}}{}{}
|
||||||
|
\patchcmd{\PrintDescribeEnv}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
|
||||||
|
\patchcmd{\PrintEnvName}{\MacroFont}{\MacroFont\bfseries\color{thu@env}}{}{}
|
||||||
|
|
||||||
|
\def\DescribeOption{%
|
||||||
|
\leavevmode\@bsphack\begingroup\MakePrivateLetters%
|
||||||
|
\Describe@Option}
|
||||||
|
\def\Describe@Option#1{\endgroup
|
||||||
|
\marginpar{\raggedleft\PrintDescribeOption{#1}}%
|
||||||
|
\thu@special@index{option}{#1}\@esphack\ignorespaces}
|
||||||
|
\def\PrintDescribeOption#1{\strut \MacroFont\bfseries\sffamily\color{thu@option} #1\ }
|
||||||
|
\def\thu@special@index#1#2{\@bsphack
|
||||||
|
\begingroup
|
||||||
|
\HD@target
|
||||||
|
\let\HDorg@encapchar\encapchar
|
||||||
|
\edef\encapchar usage{%
|
||||||
|
\HDorg@encapchar hdclindex{\the\c@HD@hypercount}{usage}%
|
||||||
|
}%
|
||||||
|
\index{#2\actualchar{\string\ttfamily\space#2}
|
||||||
|
(#1)\encapchar usage}%
|
||||||
|
\index{#1:\levelchar#2\actualchar
|
||||||
|
{\string\ttfamily\space#2}\encapchar usage}%
|
||||||
|
\endgroup
|
||||||
|
\@esphack}
|
||||||
|
|
||||||
|
\lstdefinestyle{lstStyleBase}{%
|
||||||
|
basicstyle=\small\ttfamily,
|
||||||
|
aboveskip=\medskipamount,
|
||||||
|
belowskip=\medskipamount,
|
||||||
|
lineskip=0pt,
|
||||||
|
boxpos=c,
|
||||||
|
showlines=false,
|
||||||
|
extendedchars=true,
|
||||||
|
upquote=true,
|
||||||
|
tabsize=2,
|
||||||
|
showtabs=false,
|
||||||
|
showspaces=false,
|
||||||
|
showstringspaces=false,
|
||||||
|
numbers=none,
|
||||||
|
linewidth=\linewidth,
|
||||||
|
xleftmargin=4pt,
|
||||||
|
xrightmargin=0pt,
|
||||||
|
resetmargins=false,
|
||||||
|
breaklines=true,
|
||||||
|
breakatwhitespace=false,
|
||||||
|
breakindent=0pt,
|
||||||
|
breakautoindent=true,
|
||||||
|
columns=flexible,
|
||||||
|
keepspaces=true,
|
||||||
|
gobble=2,
|
||||||
|
framesep=3pt,
|
||||||
|
rulesep=1pt,
|
||||||
|
framerule=1pt,
|
||||||
|
backgroundcolor=\color{gray!5},
|
||||||
|
stringstyle=\color{green!40!black!100},
|
||||||
|
keywordstyle=\bfseries\color{blue!50!black},
|
||||||
|
commentstyle=\slshape\color{black!60}}
|
||||||
|
|
||||||
|
\lstdefinestyle{lstStyleShell}{%
|
||||||
|
style=lstStyleBase,
|
||||||
|
frame=l,
|
||||||
|
rulecolor=\color{purple},
|
||||||
|
language=bash}
|
||||||
|
|
||||||
|
\lstdefinestyle{lstStyleLaTeX}{%
|
||||||
|
style=lstStyleBase,
|
||||||
|
frame=l,
|
||||||
|
rulecolor=\color{violet},
|
||||||
|
language=[LaTeX]TeX}
|
||||||
|
|
||||||
|
\lstnewenvironment{latex}{\lstset{style=lstStyleLaTeX}}{}
|
||||||
|
\lstnewenvironment{shell}{\lstset{style=lstStyleShell}}{}
|
||||||
|
|
||||||
|
\setlist{nosep}
|
||||||
|
|
||||||
|
\DeclareDocumentCommand{\option}{m}{\textsf{#1}}
|
||||||
|
\DeclareDocumentCommand{\env}{m}{\texttt{#1}}
|
||||||
|
\DeclareDocumentCommand{\pkg}{s m}{%
|
||||||
|
\texttt{#2}\IfBooleanF#1{\thu@special@index{package}{#2}}}
|
||||||
|
\DeclareDocumentCommand{\file}{s m}{%
|
||||||
|
\texttt{#2}\IfBooleanF#1{\thu@special@index{file}{#2}}}
|
||||||
|
\newcommand{\myentry}[1]{%
|
||||||
|
\marginpar{\raggedleft\color{purple}\bfseries\strut #1}}
|
||||||
|
\newcommand{\note}[2][Note]{{%
|
||||||
|
\color{magenta}{\bfseries #1}\emph{#2}}}
|
||||||
|
|
||||||
|
\def\thucoursework{\textsc{Thu}\-\textsc{Coursework}}
|
||||||
153
hw4/report/iidef.sty
Normal file
153
hw4/report/iidef.sty
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
%%
|
||||||
|
%% This is file `iidef.sty',
|
||||||
|
%% generated with the docstrip utility.
|
||||||
|
%%
|
||||||
|
%% The original source files were:
|
||||||
|
%%
|
||||||
|
%% thucoursework.dtx (with options: `sty')
|
||||||
|
%%
|
||||||
|
%% This is a generated file.
|
||||||
|
%%
|
||||||
|
%% Copyright (C) 2021 by zhaofeng-shu33 <616545598@qq.com>
|
||||||
|
%%
|
||||||
|
%% This work may be distributed and/or modified under the
|
||||||
|
%% conditions of the LaTeX Project Public License, either version 1.3
|
||||||
|
%% of this license or (at your option) any later version.
|
||||||
|
%% The latest version of this license is in
|
||||||
|
%% http://www.latex-project.org/lppl.txt
|
||||||
|
%% and version 1.3 or later is part of all distributions of LaTeX
|
||||||
|
%% version 2005/12/01 or later.
|
||||||
|
%%
|
||||||
|
%% To produce the documentation run the original source files ending with `.dtx'
|
||||||
|
%% through LaTeX.
|
||||||
|
%%
|
||||||
|
|
||||||
|
\NeedsTeXFormat{LaTeX2e}[1999/12/01]
|
||||||
|
\ProvidesClass{iidef}
|
||||||
|
[2020/09/09 2.6 Tsinghua University Coursework Template]
|
||||||
|
%% configuration of nested enumerate env
|
||||||
|
\RequirePackage{enumitem}
|
||||||
|
%% set hwcount key-value option
|
||||||
|
\RequirePackage{kvoptions}
|
||||||
|
%% required by macro DeclareMathOperator
|
||||||
|
\RequirePackage{amsmath}
|
||||||
|
%% Set up page headers using with fancyhdr
|
||||||
|
\@ifundefined{lhead}{\RequirePackage{fancyhdr}}
|
||||||
|
{\def\@thulhead{thulhead}}
|
||||||
|
\RequirePackage{amsthm}
|
||||||
|
%% semester
|
||||||
|
\def\@term{term}
|
||||||
|
\newcommand{\theterm}[1]{\renewcommand\@term{#1}}
|
||||||
|
%% institute
|
||||||
|
\newcommand{\@courseinstitute}[1]{institute}
|
||||||
|
\newcommand{\thecourseinstitute}[1]{\renewcommand\@courseinstitute{#1}}
|
||||||
|
%% coursename
|
||||||
|
\newcommand{\@coursename}[1]{coursename}
|
||||||
|
\newcommand{\thecoursename}[1]{\renewcommand\@coursename{\textsc{#1}}}
|
||||||
|
%% user can rewrite homework name
|
||||||
|
\def\@hwname{Homework}
|
||||||
|
\def\hwname#1{\renewcommand\@hwname{#1}}
|
||||||
|
%% \iidef@thehwcnt = 1
|
||||||
|
\DeclareStringOption[1]{thehwcnt}
|
||||||
|
\ProcessKeyvalOptions*
|
||||||
|
\def\thehwcnt{\iidef@thehwcnt}
|
||||||
|
%% page header setup, distinguish between first page(plain style)
|
||||||
|
%% and second page on (runningpage style)
|
||||||
|
%%***************************************************************************
|
||||||
|
\newcommand{\courseheader}{
|
||||||
|
\thispagestyle{plain}%first page use native plain style to suppress header
|
||||||
|
\vspace*{-1in}
|
||||||
|
\begin{center}
|
||||||
|
\@courseinstitute\\
|
||||||
|
\@coursename\\
|
||||||
|
\@term
|
||||||
|
\vspace*{0.1in}
|
||||||
|
\hrule
|
||||||
|
\end{center}
|
||||||
|
\begin{center}
|
||||||
|
\underline{\bf \@hwname\;\thehwcnt} \\
|
||||||
|
\end{center}
|
||||||
|
}
|
||||||
|
\@ifundefined{@thulhead}{
|
||||||
|
\fancypagestyle{runningpage}
|
||||||
|
{
|
||||||
|
\fancyhead[L]{\small\@coursename}
|
||||||
|
\fancyhead[R]{\small\@courseinstitute}
|
||||||
|
}
|
||||||
|
%% use runningpage style from second page on
|
||||||
|
\pagestyle{runningpage}
|
||||||
|
}{}
|
||||||
|
%% *********************************************************************************************
|
||||||
|
%%name command macro
|
||||||
|
%%*************************
|
||||||
|
\newcommand{\name}[1]{
|
||||||
|
\begin{flushleft}
|
||||||
|
#1\hfill
|
||||||
|
\today
|
||||||
|
\end{flushleft}
|
||||||
|
\hrule
|
||||||
|
|
||||||
|
\vspace{2em}
|
||||||
|
|
||||||
|
\flushleft
|
||||||
|
}
|
||||||
|
%%*************************
|
||||||
|
%% enumitem related configuration
|
||||||
|
\setlist[enumerate,1]{label=\thehwcnt.\arabic*.}
|
||||||
|
\setlist[enumerate,2]{label=(\alph*)}
|
||||||
|
\setlist[enumerate,3]{label=\roman*.}
|
||||||
|
\setlist[enumerate,4]{label=\greek*}
|
||||||
|
%%******************************
|
||||||
|
\def\@slname{Solution}
|
||||||
|
\def\slname#1{\renewcommand\@slname{#1}}
|
||||||
|
|
||||||
|
\@ifundefined{solution}{
|
||||||
|
\newenvironment{solution}
|
||||||
|
{
|
||||||
|
\proof[\@slname]
|
||||||
|
}
|
||||||
|
{
|
||||||
|
%% no qed symbol in solution env
|
||||||
|
\renewcommand{\qedsymbol}{}
|
||||||
|
\endproof
|
||||||
|
}
|
||||||
|
}{}
|
||||||
|
%%******************************
|
||||||
|
%%common math symbols go here
|
||||||
|
%%*************************************************
|
||||||
|
\def\v#1{\underline{#1}}
|
||||||
|
\newcommand{\uc}{\underline{c}} % c, vec
|
||||||
|
\newcommand{\uv}{\underline{v}} % x, vec
|
||||||
|
\newcommand{\uw}{\underline{w}} % w, vec
|
||||||
|
\newcommand{\ux}{\underline{x}} % x, vec
|
||||||
|
\newcommand{\uy}{\underline{y}} % y, vec
|
||||||
|
\newcommand{\uz}{\underline{z}} % z, vec
|
||||||
|
\newcommand{\um}{\underline{m}} % m, vec
|
||||||
|
\newcommand{\rvx}{\mathsf{x}} % x, r.v.
|
||||||
|
\newcommand{\rvy}{\mathsf{y}} % y, r.v.
|
||||||
|
\newcommand{\rvz}{\mathsf{z}} % z, r.v.
|
||||||
|
\newcommand{\rvw}{\mathsf{w}} % w, r.v.
|
||||||
|
\newcommand{\rvH}{\mathsf{H}} % H, r.v.
|
||||||
|
\newcommand{\urvx}{\underline{\mathsf{x}}} % x, r.v. vec
|
||||||
|
\newcommand{\urvy}{\underline{\mathsf{y}}} % y, r.v. vec
|
||||||
|
\newcommand{\urvz}{\underline{\mathsf{z}}} % z, r.v. vec
|
||||||
|
\newcommand{\urvw}{\underline{\mathsf{w}}} % w, r.v. vec
|
||||||
|
|
||||||
|
\newcommand{\defas}{\triangleq} %\coloneqq
|
||||||
|
\newcommand{\reals}{\mathbb{R}}
|
||||||
|
\newcommand{\TT}{\mathrm{T}} % transpose
|
||||||
|
\DeclareMathOperator*{\argmax}{arg\,max}
|
||||||
|
\DeclareMathOperator*{\argmin}{arg\,min}
|
||||||
|
\DeclareMathOperator*{\argsup}{arg\,sup}
|
||||||
|
\DeclareMathOperator*{\arginf}{arg\,inf}
|
||||||
|
\DeclareMathOperator{\diag}{diag}
|
||||||
|
\DeclareMathOperator{\Var}{Var}
|
||||||
|
\DeclareMathOperator{\Cov}{Cov}
|
||||||
|
\DeclareMathOperator{\MSE}{MSE}
|
||||||
|
\DeclareMathOperator{\1}{\mathds{1}}
|
||||||
|
\DeclareMathOperator{\In}{\mathbb{I}}
|
||||||
|
\DeclareMathOperator{\E}{\mathbb{E}}
|
||||||
|
\DeclareMathOperator{\Prob}{\mathbb{P}}
|
||||||
|
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}}
|
||||||
|
\def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
|
||||||
|
%%************************************************************************************
|
||||||
BIN
hw4/report/img/20240526_155701910_iOS.png
Normal file
BIN
hw4/report/img/20240526_155701910_iOS.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 186 KiB |
BIN
hw4/report/img/attention_vis.png
Normal file
BIN
hw4/report/img/attention_vis.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 24 KiB |
49
hw4/report/img/default_sample.txt
Normal file
49
hw4/report/img/default_sample.txt
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
sample from workdirs/quansongci/best.pth
|
||||||
|
+++水调歌头
|
||||||
|
黄花满疏雨,月扫三宫。月明月明人去,绿绵声里,风光残霞。屈指两小天天静,绿满阶外,更相逢。那处得何曾小,泪断肠头。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++浣溪沙(五清)
|
||||||
|
翠雾玉奁烘蝉。轻姿未放花光。青袍有客已暮花。
|
||||||
|
天人未遇向西楼。小阳春水一线清。玉壶重重重。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++菩萨蛮(梅)
|
||||||
|
江南窗前月远中花。水高远。暗还花色碧。只恨欢事清。
|
||||||
|
楼上宴琼线。更欢归消息。柳边女碧云。便是天涯时。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++菩萨蛮
|
||||||
|
江上秋移香无度。凉风闹愁风。莺声瘦了归时未。小楼闲愁忆。
|
||||||
|
豆蔻风前好因缘。送通住。试问三山同。人间无处难。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++秦楼月
|
||||||
|
练雨梳妆。桃叶半枝,冰肌红子春寒。半枝都奈。吹香飞絮,记清凉。
|
||||||
|
无限夜云春风护。玉阑无数转。碎帽孤情君,小海东风。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++浪淘沙
|
||||||
|
橘上园阳关路早。绿钗风雨散,犹被东湖见楼。
|
||||||
|
仿佛风前坡上去日,月如流。想取东南风。犹慵尘尽比重归。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++诉衷情(高人)
|
||||||
|
时候又来深。长是红帘前。醉眼风入春期。
|
||||||
|
应是时时,何处在、应厮续。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++浣溪沙(咏梅)
|
||||||
|
离斟客太白犹如。不知常是西篱中。岂怜旧君些儿以言。
|
||||||
|
素娥小山小曲,水朝元有长安。一榻了共取大家。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++浣溪沙(和怀)
|
||||||
|
纵图清露歌黛倚,寒题金銮声珊瑚。十年人来懒舞丝。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++满江月
|
||||||
|
风月不如旧,柔条欲到春风。掩花间心,道处难臾、相逢。
|
||||||
|
陇头情不物里,阿谁向娇几。且看东词,还明红云与,一笑认教梳灯。
|
||||||
|
|
||||||
|
---------------
|
||||||
49
hw4/report/img/no_pos_sample.txt
Normal file
49
hw4/report/img/no_pos_sample.txt
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
sample from workdirs/quansongci_no_pos/best.pth
|
||||||
|
++++++++菩萨蛮(牡丹月近)
|
||||||
|
江月明月明月桂华开客。金交风枝残月到东风前。天色浸柳前风垂杨花更觉。坐角雪初开花小屏。断断头春风光薄。
|
||||||
|
春色悄。隔帘前阴转香千里。好破云深岸波波。不恨相思量。羞酌炉香何处。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
++++浣溪沙
|
||||||
|
清歌灯未无限。佳期时更传人不醉里,可奈有芳菲节懒。
|
||||||
|
双蛾罗带向西楼。小小槛春寒人都怨,燕子未销眉花。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
++++++++++++++++++++临江仙歌香花天
|
||||||
|
九月桃源长风留春风投宴琼桃李仙。一曾东风迟丽女
|
||||||
|
放萧词传天稼时常相逢,还记,酒,占春寒花间风光相住,月劝花往事,占春留思,应春风到上,无人间一线秀船归来,点面皱。□□□□□□□□□□□□。都为谁老还来
|
||||||
|
---------------
|
||||||
|
++++鹧鸪天(十二之二)
|
||||||
|
此见元是一声砧。紫鹤收残梳匀舞、谁家。正是平樵春发,忍因缘凝理通。
|
||||||
|
试语三岛不下,松径何处。问清将春愁易全窟,且识斗重阳。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
++++浣溪沙(赋木犀)
|
||||||
|
芙蓉水浮冰雪梅子。东风半枝都奈粉吹。飞落蕊满清凉。
|
||||||
|
枝开夜忽春风护,玉阑凉痕转新碎香。有君恩多少载酒,且道有春风流。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++++++++++++++++++++++++++++++++++++++++++++南歌头香慵尘中柳梢青玉案(西江仙香花宫春令(与梅子
|
||||||
|
绿碧梧桐梢落后西浣天云隐越山外、宿舟断乱,秋风露满庭芳菲节难过,紫。绿门好,十分飞燕子
|
||||||
|
红,秋寒庭楼小西西风,春暮
|
||||||
|
---------------
|
||||||
|
++++++鹧鸪天(和坡衮侑觞)
|
||||||
|
薰风须见前衢醉急风入鼎、花生绝团。不问何人公身口厮续厌
|
||||||
|
春色肃熟燕子,无限是道行气东风吹。看雨起梦三年。想余春事断自愁厌君。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
++++菩萨蛮(用时春)
|
||||||
|
竹花梅犹道何人时节。西篱上花前红。吹落帽风光深。素娥小金。
|
||||||
|
暮水朝秋寒。玉堂下梅花共取。小窗堂几举。从教著梅和雨。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
++++++++++最仙歌子(和尉生查子题)
|
||||||
|
绿阴山淡黄未泛湘神神仙,美酒,长唱玉纤纤纤手。元何穷何处重约,清寒食、酒家流光光渐、寄新春花晓,小院映烟微香,正是十年瑶楼酒,水暖花枝枝黄昏昏不语,乍见月寂寞痴愠痕、落醉,看花梢啼红裳篆拂堕风流。
|
||||||
|
东风吹泪过,
|
||||||
|
---------------
|
||||||
|
+++++++++++++点绛唇头春事近
|
||||||
|
花艳心头道酒前春风雨,欲春惨,春去,深自有极目娇几粉,看春词,还爱红云归,绿杨花,旧谢去年时节节,十分真时及华明月。
|
||||||
|
醉眼底莺声中秋光幸有豆皇子
|
||||||
|
杏花开后黄梅梢仙子,且占客里春风吹乱。
|
||||||
|
细雨过春风轻椒香闺催春,小离
|
||||||
|
---------------
|
||||||
BIN
hw4/report/img/no_pos_train.png
Normal file
BIN
hw4/report/img/no_pos_train.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 71 KiB |
56
hw4/report/img/no_res_sample.txt
Normal file
56
hw4/report/img/no_res_sample.txt
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
sample from workdirs/quansongci_no_res/best.pth
|
||||||
|
+++藕上空都未。消
|
||||||
|
---------------
|
||||||
|
+++。水。香,清干灯翠无月。佳
|
||||||
|
---------------
|
||||||
|
+++烟
|
||||||
|
莫。。一
|
||||||
|
真。,。,手)+(。当,。,还花。
|
||||||
|
。。饱)花清生失楼犹。拂念。。。
|
||||||
|
+东+柳人。碧放萧似天天饮时
|
||||||
|
---------------
|
||||||
|
+++,一+
|
||||||
|
楼。。移。无度此
|
||||||
|
,+路风砧东
|
||||||
|
---------------
|
||||||
|
+++,。常明香天。早。+。色。,大,梅子春上妆半枝。奈。吹。飞、,歌。阑故溪枝开夜忽春花。情,重凉痕转。碎沙相,君有园海。奈。
|
||||||
|
。会
|
||||||
|
---------------
|
||||||
|
+++。。晓宫。。园。+二盈
|
||||||
|
|
||||||
|
钗。+。,恁尾。
|
||||||
|
见楼风
|
||||||
|
寿到+。尽+。日。。
|
||||||
|
---------------
|
||||||
|
+++。看。月。
|
||||||
|
(
|
||||||
|
时衮红。自。意
|
||||||
|
须去前。醉急风入鼎人花
|
||||||
|
。团时。丹翁怨在身云厮。厌
|
||||||
|
秋海花拟燕
|
||||||
|
,无共宿道行气东。,鸾+雨。梦,
|
||||||
|
。。余采
|
||||||
|
---------------
|
||||||
|
++++俊去莺浮
|
||||||
|
时重。+功太。犹。头(人一溪+者。斋算。旧
|
||||||
|
---------------
|
||||||
|
+++,人花长和寞。。纵图清孔歌幽
|
||||||
|
---------------
|
||||||
|
+++髻
|
||||||
|
。+风与不,干
|
||||||
|
柔
|
||||||
|
。头余说。花
|
||||||
|
。心头道。前,枕相
|
||||||
|
。
|
||||||
|
忘,情+物。自水极初。几晶
|
||||||
|
看。词光。明红主与,。。认,旧。去
|
||||||
|
户萨尽玉罢
|
||||||
|
不时家。亭,行翠厚情青
|
||||||
|
+中思难梦。底南星
|
||||||
|
。自马
|
||||||
|
黄
|
||||||
|
我来
|
||||||
|
,中+。花
|
||||||
|
禁,,也
|
||||||
|
。花、。风儿。堂莺催旧,+离
|
||||||
|
---------------
|
||||||
BIN
hw4/report/img/no_res_train.png
Normal file
BIN
hw4/report/img/no_res_train.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 75 KiB |
51
hw4/report/img/specific_start_sample.txt
Normal file
51
hw4/report/img/specific_start_sample.txt
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
sample from workdirs/quansongci/best.pth
|
||||||
|
+++清平乐(上赋)
|
||||||
|
黄花小。相逢去。三得东风何处。人去去年年。谁与他年道。
|
||||||
|
屈指两小天。留连心事。最思无意悠悠。无得何曾宽。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++清平乐
|
||||||
|
京梅晚。几日一清声恶。无限作佳穷时。直见横户快愁儿。
|
||||||
|
客已暮云梦,天人未老。心事有天涯无数。人都不须关,只是秋千千里。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++清平乐(春)
|
||||||
|
红雪动。莫遣梅花开了。不解闲句中花妍。当时未问还近。
|
||||||
|
一枝上晚妆清明。帘犹有清香样。欢事消息意迟。东郊飞后便好天。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++清平乐
|
||||||
|
银烛斜阳。斜雨初飞。日日楼前草移。无限此情休住。
|
||||||
|
小莺欲瘦收残梳。更有谁闲愁。却入豆蔻风前。因缘凝理通。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++清平乐
|
||||||
|
江上秋波。一声歌舞。烟雨里常明烟雨。早来不见人归,犹唤梅子春去。
|
||||||
|
好都奈。吹回飞飞来。清凉不知无限夜,春风护雨晚梁归。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++清平乐
|
||||||
|
春光西去。桂花清扇。天上一声伤春晓。却被园花不尽早。
|
||||||
|
钗边绿阴阴犹好。无计不知否。到少离愁去。谁知何处魂。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++清平乐(即回)
|
||||||
|
六钱地遍。楼前作花间。春暮云愁。月高斜阳远。困红衣自醉。
|
||||||
|
好去前时醉,风入泥袖。挼黄团时时问。怨在月明千片春水。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++清平乐
|
||||||
|
晓来争觉。碧云花向楼。我似秋光也。花来日明月边莺怨。
|
||||||
|
春不语飞花知。玉浆不枉劳和困。坐中岂共旧。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++清平乐
|
||||||
|
残花晚。清闲鬓欲开。金盏一多时。菊花无计绪。娇花开花长。
|
||||||
|
谁把酒醒清声。幽心到寒题酒。一片香淡得春人。懒捻黄金眉。
|
||||||
|
|
||||||
|
---------------
|
||||||
|
+++清平乐(月明月)
|
||||||
|
醉来人在。春知何时到花时。似来东风识,时时倍度。
|
||||||
|
风月不识旧时春宵。万中说枉似、真心头道。前意追相逢。
|
||||||
|
|
||||||
|
---------------
|
||||||
BIN
hw4/report/img/train.png
Normal file
BIN
hw4/report/img/train.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 72 KiB |
187
hw4/report/main.tex
Normal file
187
hw4/report/main.tex
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
% Homework template for Inference and Information
|
||||||
|
% UPDATE: September 26, 2017 by Xiangxiang
|
||||||
|
\documentclass[a4paper]{article}
|
||||||
|
\usepackage{ctex}
|
||||||
|
\usepackage{amsmath, amssymb, amsthm}
|
||||||
|
\usepackage{moreenum}
|
||||||
|
\usepackage{mathtools}
|
||||||
|
\usepackage{url}
|
||||||
|
\usepackage{bm}
|
||||||
|
\usepackage{enumitem}
|
||||||
|
\usepackage{graphicx}
|
||||||
|
\usepackage{listings}
|
||||||
|
\usepackage{color}
|
||||||
|
\usepackage{float}
|
||||||
|
|
||||||
|
\newfontfamily\codefont[Ligatures=ResetAll]{Fira Code}[Contextuals={Alternate}]
|
||||||
|
\newfontfamily\cascadia{Cascadia Code}
|
||||||
|
|
||||||
|
\lstset{
|
||||||
|
basicstyle = \small\codefont,
|
||||||
|
% ---
|
||||||
|
tabsize = 4,
|
||||||
|
showstringspaces = false,
|
||||||
|
numbers = left,
|
||||||
|
numberstyle = \codefont,
|
||||||
|
% ---
|
||||||
|
breaklines = true,
|
||||||
|
captionpos = t,
|
||||||
|
% ---
|
||||||
|
frame = l,
|
||||||
|
flexiblecolumns,
|
||||||
|
}
|
||||||
|
|
||||||
|
\lstdefinestyle{Python}{
|
||||||
|
language = Python, % 语言选Python
|
||||||
|
keywordstyle = \color{blue},
|
||||||
|
keywordstyle = [2] \color{teal},
|
||||||
|
stringstyle = \color{orange!80!black},
|
||||||
|
commentstyle = \color{red},
|
||||||
|
identifierstyle = \color{blue!80!white},
|
||||||
|
}
|
||||||
|
|
||||||
|
\lstdefinestyle{Bash}{
|
||||||
|
language = bash
|
||||||
|
}
|
||||||
|
\usepackage{subcaption}
|
||||||
|
\usepackage{booktabs} % toprule
|
||||||
|
\usepackage[mathcal]{eucal}
|
||||||
|
\usepackage[thehwcnt = 4]{iidef}
|
||||||
|
|
||||||
|
\thecourseinstitute{清华大学电子工程系}
|
||||||
|
\thecoursename{\textbf{媒体与认知}}
|
||||||
|
\theterm{2023-2024学年春季学期}
|
||||||
|
\hwname{作业}
|
||||||
|
\begin{document}
|
||||||
|
\courseheader
|
||||||
|
\name{高艺轩}
|
||||||
|
\vspace{3mm}
|
||||||
|
\centerline{\textbf{\Large{理论部分}}}
|
||||||
|
|
||||||
|
\section{单选题(15分)}
|
||||||
|
\subsection{\underline{D}}
|
||||||
|
|
||||||
|
\subsection{\underline{A}}
|
||||||
|
|
||||||
|
\subsection{\underline{A}}
|
||||||
|
|
||||||
|
\subsection{\underline{C}}
|
||||||
|
|
||||||
|
\subsection{\underline{B}}
|
||||||
|
|
||||||
|
\section{计算题(15 分)}
|
||||||
|
% 计算题1
|
||||||
|
\subsection{隐含马尔可夫模型}
|
||||||
|
|
||||||
|
\hspace{2em}暑假中,小E每天进行一项体育活动,包括跑步(R)、游泳(S)和打球(B),所选择的体育活动受某种潜在因素(如心情)的影响。小E每天把进行体育活动的照片发至微信朋友圈,我们可以根据观测信息推测该潜在因素的状态。
|
||||||
|
|
||||||
|
\hspace{2em}假设该潜在因素分为$S_1$和$S_2$两种状态。在$S_1$时,小E选择三种体育活动的概率分别为0.6,0.2,0.2;在$S_2$时,小E选择三种体育活动的概率分别为0.1,0.6,0.3。
|
||||||
|
|
||||||
|
\hspace{2em}该潜在因素的变化也有一定规律,若某天处于$S_1$的状态,第二天处于$S_1$和$S_2$的状态的概率分别为0.5,0.5;若某天处于$S_2$的状态,第二天处于$S_1$和$S_2$的状态的概率分别为0.6,0.4。
|
||||||
|
|
||||||
|
\hspace{2em}暑假第一天处于$S_1$和$S_2$的状态的概率均为0.5。
|
||||||
|
|
||||||
|
\vspace{3mm}
|
||||||
|
(1) 采用隐含马尔可夫模型(HMM)对小E暑假体育活动安排进行建模,{\color{blue}请写出HMM对应的参数$\lambda=\{\pi, A, B\}$}。
|
||||||
|
|
||||||
|
\begin{proof}[解]
|
||||||
|
\[\pi = \begin{bmatrix}
|
||||||
|
0.5\\0.5
|
||||||
|
\end{bmatrix}\]
|
||||||
|
\[A = \begin{bmatrix}
|
||||||
|
0.5 & 0.5\\
|
||||||
|
0.6 & 0.4\\
|
||||||
|
\end{bmatrix}\]
|
||||||
|
\[B = \begin{bmatrix}
|
||||||
|
0.6 & 0.2 & 0.2\\
|
||||||
|
0.1 & 0.6 & 0.3
|
||||||
|
\end{bmatrix}\]
|
||||||
|
\end{proof}
|
||||||
|
|
||||||
|
\vspace{3mm}
|
||||||
|
(2) 假设暑假第1、2、3天小E所进行的体育活动依次为跑步(R)、打球(B)和游泳(S),{\color{blue}请计算出现该观测序列的概率}。
|
||||||
|
|
||||||
|
\begin{proof}[解]
|
||||||
|
\begin{align*}
|
||||||
|
\alpha_1(S_1) & = 0.5 \times 0.6 = 0.3\\
|
||||||
|
\alpha_1(S_2) & = 0.5 \times 0.1 = 0.05\\
|
||||||
|
\alpha_2(S_1) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.6) \times 0.2\\
|
||||||
|
& = 0.036\\
|
||||||
|
\alpha_2(S_2) & = (\alpha_1(S_1) \times 0.5 + \alpha_1(S_2) \times 0.4) \times 0.3\\
|
||||||
|
& = 0.051\\
|
||||||
|
\alpha_3(S_1) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.6) \times 0.2\\
|
||||||
|
& = 0.00972\\
|
||||||
|
\alpha_3(S_2) & = (\alpha_2(S_1) \times 0.5 + \alpha_2(S_2) \times 0.4) \times 0.6\\
|
||||||
|
& = 0.02304\\
|
||||||
|
P(O \mid \lambda) & = \alpha_3(S_1) + \alpha_3(S_2) = 0.03276\\
|
||||||
|
\end{align*}
|
||||||
|
\end{proof}
|
||||||
|
|
||||||
|
\vspace{3mm}
|
||||||
|
(3) 在(2)的条件下。{\color{blue}请利用Viterbi算法推测暑假第1、2、3天最可能的隐含状态序列}。
|
||||||
|
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/20240526_155701910_iOS.png}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
|
||||||
|
% 请根据是否选择自选课题的情况选择“编程作业报告”或“自选课题开题报告”中的一项完成
|
||||||
|
\section{编程作业报告}
|
||||||
|
\subsection{模型的训练与测试}
|
||||||
|
首先进行数据预处理。预处理后进行模型训练,训练的结果见图\ref{fig:default_train}。
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/train.png}
|
||||||
|
\caption{默认测试}
|
||||||
|
\label{fig:default_train}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
默认配置的生成样本:
|
||||||
|
\begin{lstlisting}
|
||||||
|
python sample.py --ckpt_path workdirs/quansongci
|
||||||
|
\end{lstlisting}
|
||||||
|
得到的输出为
|
||||||
|
\lstinputlisting{img/default_sample.txt}
|
||||||
|
若指定初始文本:
|
||||||
|
\begin{lstlisting}
|
||||||
|
python sample.py --ckpt_path workdirs/quansongci --start +++清平乐
|
||||||
|
\end{lstlisting}
|
||||||
|
得到的输出为
|
||||||
|
\lstinputlisting{img/specific_start_sample.txt}
|
||||||
|
|
||||||
|
\subsection{探究位置编码和残差连接在模型中的作用}
|
||||||
|
关闭位置编码的训练:
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/no_pos_train.png}
|
||||||
|
\end{figure}
|
||||||
|
得到的生成结果:
|
||||||
|
\lstinputlisting{img/no_pos_sample.txt}
|
||||||
|
可以看到,关闭位置编码后,模型没有很好地掌握句子长度的规律。
|
||||||
|
|
||||||
|
关闭残差连接的训练:
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=\linewidth]{img/no_res_train.png}
|
||||||
|
\end{figure}
|
||||||
|
得到的生成结果:
|
||||||
|
\lstinputlisting{img/no_res_sample.txt}
|
||||||
|
模型训练遇到了梯度消失的问题,很难有效地训练。
|
||||||
|
|
||||||
|
\subsection{可视化}
|
||||||
|
\begin{figure}[H]
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=.8\linewidth]{img/attention_vis.png}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
许多词语的注意力系数都集中在题目的几个字上,可以看到模型主要分析了不同词牌名与内容之间的相关性。
|
||||||
|
|
||||||
|
\end{document}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
%%% Local Variables:
|
||||||
|
%%% mode: latex
|
||||||
|
%%% TeX-master: t
|
||||||
|
%%% End:
|
||||||
@@ -10,7 +10,9 @@
|
|||||||
"import torch.nn as nn\n",
|
"import torch.nn as nn\n",
|
||||||
"import torch.nn.functional as F\n",
|
"import torch.nn.functional as F\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import torchvision.transforms as transforms"
|
"import torchvision.transforms as transforms\n",
|
||||||
|
"\n",
|
||||||
|
"import numpy as np"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -212,6 +214,63 @@
|
|||||||
"b = torch.Tensor([1])\n",
|
"b = torch.Tensor([1])\n",
|
||||||
"print((a.T * b).T)"
|
"print((a.T * b).T)"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"tensor([[False, True, True, True, True],\n",
|
||||||
|
" [False, False, True, True, True],\n",
|
||||||
|
" [False, False, False, True, True],\n",
|
||||||
|
" [False, False, False, False, True],\n",
|
||||||
|
" [False, False, False, False, False]])\n",
|
||||||
|
"tensor([[-0.1170, 0.6130, 0.9644, -1.2733, -0.9671],\n",
|
||||||
|
" [-0.7806, 0.5082, -0.2731, 0.1660, -0.5451],\n",
|
||||||
|
" [-2.1527, -0.5059, -0.0079, -0.5796, -1.1107],\n",
|
||||||
|
" [-1.8357, -0.8010, -0.0424, 0.1491, -1.5009],\n",
|
||||||
|
" [-1.3666, -0.8209, 0.0483, -1.3165, -0.9222]])\n",
|
||||||
|
"tensor([[-0.1170, -inf, -inf, -inf, -inf],\n",
|
||||||
|
" [-0.7806, 0.5082, -inf, -inf, -inf],\n",
|
||||||
|
" [-2.1527, -0.5059, -0.0079, -inf, -inf],\n",
|
||||||
|
" [-1.8357, -0.8010, -0.0424, 0.1491, -inf],\n",
|
||||||
|
" [-1.3666, -0.8209, 0.0483, -1.3165, -0.9222]])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"mask = torch.triu(torch.ones(5, 5), diagonal=1).bool()\n",
|
||||||
|
"print(mask)\n",
|
||||||
|
"attn = torch.randn(5, 5)\n",
|
||||||
|
"print(attn)\n",
|
||||||
|
"print(attn.masked_fill(mask, -np.inf))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"tensor([0.1402, 0.2312, 0.6285])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"Q = torch.Tensor([1, 0, 1, 1])\n",
|
||||||
|
"K = torch.Tensor([[0, 0, 0, 2],\n",
|
||||||
|
" [2, 0, 1, 0],\n",
|
||||||
|
" [2, 1, 2, 1]])\n",
|
||||||
|
"\n",
|
||||||
|
"print(torch.softmax((Q @ K.T) / 2, dim=0))"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|||||||
Reference in New Issue
Block a user