398 lines
16 KiB
Python
398 lines
16 KiB
Python
#========================================================
|
|
# Media and Cognition
|
|
# Homework 1 Neural network basics
|
|
# recognition.py - character classification
|
|
# Student ID: 2022010639
|
|
# Name: Gao Yixuan
|
|
# Tsinghua University
|
|
# (C) Copyright 2024
|
|
#========================================================
|
|
|
|
# ==== Part 0: import libs
|
|
import torch
|
|
import torch.optim as optim
|
|
from torch.utils.data import Dataset, DataLoader
|
|
|
|
import json, cv2, os, string
|
|
import matplotlib.pyplot as plt
|
|
|
|
import numpy as np
|
|
|
|
# this time we implement our networks and loss functions in other python script, and import them here
|
|
from network import MLP
|
|
from losses import CrossEntropyLoss
|
|
|
|
# argparse is used to conveniently set our configurations
|
|
import argparse
|
|
|
|
# ==== Part 1: data loader
|
|
|
|
# construct a dataset and a data loader, more details can be found in
|
|
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html?highlight=dataloader
|
|
|
|
class ListDataset(Dataset):
|
|
def __init__(self, im_dir, file_path, norm_size=(32, 32)):
|
|
'''
|
|
:param im_dir: path to directory with images
|
|
:param file_path: json file containing image names and labels
|
|
:param norm_size: image normalization size, (height, width)
|
|
'''
|
|
|
|
# this time we will try to recognize 26 English letters (case-insensitive)
|
|
letters = string.ascii_letters[-26:] # ABCD...XYZ
|
|
self.alphabet = {letters[i]:i for i in range(len(letters))}
|
|
self.norm_size = norm_size
|
|
|
|
with open(file_path, 'r') as f:
|
|
imgs = json.load(f)
|
|
im_names = list(imgs.keys())
|
|
|
|
self.im_paths = [os.path.join(im_dir, im_name) for im_name in im_names]
|
|
self.labels = list(imgs.values())
|
|
|
|
def __len__(self):
|
|
# the __len__() function should return the total number of samples in the dataset
|
|
return len(self.im_paths)
|
|
|
|
def __getitem__(self, index):
|
|
assert index <= len(self), 'index range error'
|
|
|
|
# read an image and convert it to grey scale
|
|
im_path = self.im_paths[index]
|
|
im = cv2.imread(im_path)
|
|
im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
|
|
|
|
# image pre-processing, after pre-processing, the size of the image should be as norm_size and the values of image pixels should be within [-1,1]
|
|
im = cv2.resize(im, self.norm_size)
|
|
# im = im / 255.
|
|
""" The above command does not seems to be valid in my environment """
|
|
im = np.divide(im, 255.)
|
|
im = (im - 0.5) * 2.0
|
|
|
|
# get the label of the current image
|
|
# upper() is used to convert a letter into uppercase
|
|
label = self.labels[index].upper()
|
|
|
|
# convert an English letter into a number index
|
|
label = self.alphabet[label]
|
|
|
|
# TODO 1: return the image and its label
|
|
return im, label
|
|
|
|
|
|
|
|
def dataLoader(im_dir, file_path, norm_size, batch_size, workers=0):
|
|
'''
|
|
:param im_dir: path to directory with images
|
|
:param file_path: file with image paths and labels
|
|
:param norm_size: image normalization size, (height, width)
|
|
:param batch_size: batch size
|
|
:param workers: number of workers for loading data in multiple threads
|
|
:return: a data loader
|
|
'''
|
|
|
|
dataset = ListDataset(im_dir, file_path, norm_size)
|
|
return DataLoader(dataset,
|
|
batch_size=batch_size,
|
|
shuffle=True if 'train' in file_path else False, # shuffle images only when training
|
|
num_workers=workers)
|
|
|
|
|
|
# ==== Part 2: training, validation and testing
|
|
|
|
def train_val(model, trainloader, valloader, n_epochs,
|
|
lr, optim_type, momentum, weight_decay,
|
|
valInterval, device='cpu'):
|
|
'''
|
|
The main training procedure
|
|
----------------------------
|
|
:param model: the MLP model
|
|
:param trainloader: the dataloader of the train set
|
|
:param valloader: the dataloader of the validation set
|
|
:param n_epochs: number of training epochs
|
|
:param lr: learning rate
|
|
:param optim_type: optimizer, can be 'sgd', 'adagrad', 'rmsprop', 'adam', or 'adadelta'
|
|
:param momentum: only used if optim_type == 'sgd'
|
|
:param weight_decay: the factor of L2 penalty on network weights
|
|
:param valInterval: the frequency of validation, e.g., if valInterval = 5, then do validation after each 5 training epochs
|
|
:param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
|
|
'''
|
|
|
|
# define the cross entropy loss function.
|
|
ce_loss = CrossEntropyLoss.apply
|
|
|
|
# optimizer
|
|
if optim_type == 'sgd':
|
|
optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, weight_decay=weight_decay)
|
|
elif optim_type == 'adagrad':
|
|
optimizer = optim.Adagrad(model.parameters(), lr, weight_decay=weight_decay)
|
|
elif optim_type == 'rmsprop':
|
|
optimizer = optim.RMSprop(model.parameters(), lr, weight_decay=weight_decay)
|
|
elif optim_type == 'adam':
|
|
optimizer = optim.Adam(model.parameters(), lr, weight_decay=weight_decay)
|
|
elif optim_type == 'adadelta':
|
|
optimizer = optim.Adadelta(model.parameters(), lr, weight_decay=weight_decay)
|
|
else:
|
|
print('[Error] optim_type should be one of sgd, adagrad, rmsprop, adam, or adadelta')
|
|
raise NotImplementedError
|
|
|
|
# training
|
|
|
|
# to save loss of each training epoch in a python "list" data structure
|
|
losses = []
|
|
|
|
for epoch in range(n_epochs):
|
|
# set the model in training mode
|
|
model.train()
|
|
|
|
# to save total loss in one epoch
|
|
total_loss = 0.
|
|
|
|
#TODO 2: Calculate losses and train the network using the optimizer
|
|
for data, labels in trainloader: # get a batch of data
|
|
|
|
# step 1: set data type and device
|
|
# data = torch.from_numpy(data)
|
|
data = data.type(torch.float32)
|
|
data = data.to(device)
|
|
labels = labels.to(device)
|
|
|
|
# print(data.device)
|
|
|
|
# step 2: convert an image to a vector as the input of the MLP
|
|
data = torch.flatten(data, start_dim=1)
|
|
# print(data.size())
|
|
|
|
# hit: clear gradients in the optimizer
|
|
optimizer.zero_grad()
|
|
|
|
# step 3: run the model which is the forward process
|
|
output = model(data)
|
|
|
|
# step 4: compute the loss, and call backward propagation function
|
|
loss = ce_loss(output, labels)
|
|
loss.backward()
|
|
# I have no idea why pylance can't get the data type of what ce_loss returns
|
|
|
|
# step 5: sum up of total loss, loss.item() return the value of the tensor as a standard python number
|
|
# this operation is not differentiable
|
|
total_loss += loss.item()
|
|
|
|
# step 6: call a function, optimizer.step(), to update the parameters of the models
|
|
optimizer.step()
|
|
|
|
|
|
# average of the total loss for iterations
|
|
avg_loss = total_loss / len(trainloader)
|
|
losses.append(avg_loss)
|
|
print('Epoch {:02d}: loss = {:.3f}'.format(epoch + 1, avg_loss))
|
|
|
|
# validation
|
|
if (epoch + 1) % valInterval == 0:
|
|
val_acc = test(model, valloader, device)
|
|
# show prediction accuracy
|
|
print('Epoch {:02d}: validation accuracy = {:.1f}%'.format(epoch + 1, 100 * val_acc))
|
|
|
|
|
|
# save model parameters in a file
|
|
# model_save_path = 'saved_models/recognition.pth'.format(epoch + 1)
|
|
model_save_path = opt.model_path
|
|
|
|
torch.save({'state_dict': model.state_dict(),
|
|
}, model_save_path)
|
|
print('Model saved in {}\n'.format(model_save_path))
|
|
|
|
# draw the loss curve
|
|
plot_loss(losses)
|
|
|
|
|
|
def test(model, testloader, device):
|
|
'''
|
|
The testing procedure
|
|
----------------------------
|
|
:param model: the MLP model
|
|
:param testloader: the dataloader to be tested/validated
|
|
:param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
|
|
'''
|
|
# set the model in evaluation mode
|
|
model.eval()
|
|
|
|
n_correct = 0. # number of images that are correctly classified
|
|
n_imgs = 0. # number of total images
|
|
|
|
with torch.no_grad(): # we do not need to compute gradients during validation
|
|
|
|
#TODO 3: get the prediction of the data and calculate the accuracy
|
|
for imgs, labels in testloader:
|
|
# step 1: set data type and device
|
|
# imgs = torch.from_numpy(imgs)
|
|
imgs = imgs.type(torch.float32)
|
|
imgs = imgs.to(device)
|
|
labels = labels.to(device)
|
|
|
|
# step 2: convert an image to a vector as the input of the MLP
|
|
imgs = torch.flatten(imgs, start_dim=1)
|
|
|
|
# step 3: run the model which is the forward process
|
|
output = model(imgs)
|
|
|
|
# step 4: get the predicted value by the output using out.argmax(1)
|
|
pred = output.argmax(1)
|
|
|
|
# step 5: sum up the number of images correctly recognized and the total image number
|
|
for predict, label in zip(pred, labels):
|
|
if predict == label:
|
|
n_correct += 1
|
|
n_imgs += 1
|
|
|
|
accuracy = n_correct / n_imgs
|
|
return accuracy
|
|
|
|
|
|
# ==== Part 3: predict new images
|
|
def predict(model, im_path, norm_size, device):
|
|
'''
|
|
The predicting procedure
|
|
---------------
|
|
:param model: the MLP model
|
|
:param im_path: path of an image
|
|
:param norm_size: image normalization size, (height, width)
|
|
:param device: 'cpu' or 'cuda', we can use 'cpu' for our homework if GPU with cuda support is not available
|
|
'''
|
|
|
|
# TODO 4: enter the evaluation mode
|
|
model.eval()
|
|
|
|
# TODO 4: image pre-processing, similar to what we do in ListDataset()
|
|
im = cv2.imread(im_path)
|
|
im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
|
|
|
|
im = cv2.resize(im, norm_size)
|
|
im = np.divide(im, 255.)
|
|
im = (im - 0.5) * 2.0
|
|
|
|
# convert im from numpy.ndarray to torch.tensor
|
|
im = torch.from_numpy(im)
|
|
|
|
# input im into the model
|
|
with torch.no_grad():
|
|
input = im.view(1, -1).type(torch.float32).to(device)
|
|
out = model(input)
|
|
prediction = out.argmax(1)[0].item()
|
|
|
|
# convert index of prediction to the corresponding character
|
|
letters = string.ascii_letters[-26:] # ABCD...XYZ
|
|
prediction = letters[prediction]
|
|
|
|
print('Prediction: {}'.format(prediction))
|
|
|
|
|
|
# ==== Part 4: draw the loss curve
|
|
def plot_loss(losses):
|
|
'''
|
|
:param losses: list of losses for each epoch
|
|
:return:
|
|
'''
|
|
|
|
f, ax = plt.subplots()
|
|
|
|
# draw loss
|
|
ax.plot(losses)
|
|
|
|
# set labels
|
|
ax.set_xlabel('training epoch')
|
|
ax.set_ylabel('loss')
|
|
|
|
# show the plots
|
|
plt.show()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
# set random seed for reproducibility
|
|
seed = 2023
|
|
torch.manual_seed(seed)
|
|
torch.cuda.manual_seed(seed)
|
|
torch.cuda.manual_seed_all(seed)
|
|
torch.backends.cudnn.deterministic = True
|
|
|
|
# set configurations
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument('--mode', type=str, default='train', help='train, test or predict')
|
|
parser.add_argument('--im_dir', type=str, default='data/character_classification/images',
|
|
help='path to directory with images')
|
|
parser.add_argument('--train_file_path', type=str, default='data/character_classification/train.json',
|
|
help='file list of training image paths and labels')
|
|
parser.add_argument('--val_file_path', type=str, default='data/character_classification/validation.json',
|
|
help='file list of validation image paths and labels')
|
|
parser.add_argument('--test_file_path', type=str, default='data/character_classification/test.json',
|
|
help='file list of test image paths and labels')
|
|
parser.add_argument('--batchsize', type=int, default=8, help='batch size')
|
|
parser.add_argument('--device', type=str, default='cpu', help='cpu or cuda')
|
|
|
|
# configurations for training
|
|
parser.add_argument('--hsize', type=str, default='32', help='hidden size for each hidden layer, splitted by comma')
|
|
parser.add_argument('--layer', type=int, default=2, help='number of layers in the MLP')
|
|
parser.add_argument('--act', type=str, default='relu',
|
|
help='type of activation function, can be sigmoid, tanh, or relu')
|
|
parser.add_argument('--norm_size', type=tuple, default=(32, 32), help='image normalization size, (height, width)')
|
|
parser.add_argument('--epoch', type=int, default=50, help='number of training epochs')
|
|
parser.add_argument('--n_classes', type=int, default=26, help='number of classes')
|
|
parser.add_argument('--valInterval', type=int, default=10, help='the frequency of validation')
|
|
parser.add_argument('--lr', type=float, default=5e-4, help='learning rate')
|
|
parser.add_argument('--optim_type', type=str, default='sgd', help='type of optimizer, can be sgd, adagrad, rmsprop, adam, or adadelta')
|
|
parser.add_argument('--momentum', type=float, default=0.9, help='momentum of the SGD optimizer, only used if optim_type is sgd')
|
|
parser.add_argument('--weight_decay', type=float, default=0., help='the factor of L2 penalty on network weights')
|
|
|
|
# configurations for test and prediction
|
|
parser.add_argument('--model_path', type=str, default='saved_models/recognition.pth', help='path of a saved model')
|
|
parser.add_argument('--im_path', type=str, default='data/character_classification/new_images/predict01.png',
|
|
help='path of an image to be recognized')
|
|
|
|
opt = parser.parse_args()
|
|
|
|
# TODO 5: initialize the MLP model
|
|
# what is the input size of the MLP?
|
|
# hint 1: we convert an image to a vector as the input of the MLP
|
|
# hint 2: each image has shape [norm_size[0], norm_size[1]]
|
|
model = MLP(opt.norm_size[0] * opt.norm_size[1], 26, [int(num) for num in opt.hsize.split(',')], opt.layer, opt.act)
|
|
|
|
# for the 'test' and 'predict' mode, we should load the saved checkpoint into the model
|
|
if opt.mode == 'test' or opt.mode == 'predict':
|
|
checkpoint = torch.load(opt.model_path, map_location='cpu')
|
|
# """The above code did not consider device problem"""
|
|
# checkpoint = torch.load(opt.model_path, map_location=opt.device)
|
|
# load model parameters we saved in model_path
|
|
model.load_state_dict(checkpoint['state_dict'])
|
|
print('[Info] Load model from {}'.format(opt.model_path))
|
|
|
|
# put the model on CPU or GPU according to the device in args
|
|
model = model.to(opt.device)
|
|
|
|
# -- run the code for training and validation
|
|
if opt.mode == 'train':
|
|
# training and validation data loader
|
|
trainloader = dataLoader(opt.im_dir, opt.train_file_path, opt.norm_size, opt.batchsize)
|
|
valloader = dataLoader(opt.im_dir, opt.val_file_path, opt.norm_size, opt.batchsize)
|
|
train_val(model, trainloader, valloader,
|
|
n_epochs=opt.epoch,
|
|
lr=opt.lr,
|
|
optim_type=opt.optim_type,
|
|
momentum=opt.momentum,
|
|
weight_decay=opt.weight_decay,
|
|
valInterval=opt.valInterval,
|
|
device=opt.device)
|
|
|
|
# -- test the saved model
|
|
elif opt.mode == 'test':
|
|
testloader = dataLoader(opt.im_dir, opt.test_file_path, opt.norm_size, opt.batchsize)
|
|
acc = test(model, testloader, opt.device)
|
|
print('[Info] Test accuracy = {:.1f}%'.format(100 * acc))
|
|
|
|
# -- predict a new image
|
|
elif opt.mode == 'predict':
|
|
predict(model, im_path=opt.im_path, norm_size=opt.norm_size, device=opt.device)
|
|
|
|
else:
|
|
print('mode should be train, test, or predict')
|
|
raise NotImplementedError
|