View file src/ia/pytorch/rnn.py - Download
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from tqdm.autonotebook import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import pandas as pd
from sklearn.metrics import accuracy_score
import time
from idlmam import train_simple_network, Flatten, weight_reset
from IPython.display import set_matplotlib_formats
# set_matplotlib_formats('png', 'pdf')
torch.backends.cudnn.deterministic=True
from idlmam import set_seed, moveTo
set_seed(42)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
zip_file_url = "https://download.pytorch.org/tutorial/data.zip"
# zip_file_path = "C:\\Users\\jbailhache01\\Documents\\ia\\pytorch\\data.zip"
import requests, zipfile, io
r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
# z = zipfile.ZipFile(zip_file_path)
z.extractall()
#Zip file is organized as data/names/[LANG].txt , where [LANG] is a specific language
namge_language_data = {}
#We will use some code to remove UNICODE tokens to make life easy for us processing wise
#e.g., convert something like "Ślusàrski" to Slusarski
import unicodedata
import string
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
alphabet = {}
for i in range(n_letters):
alphabet[all_letters[i]] = i
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
and c in all_letters
)
#Loop through every language, open the zip file entry, and read all the lines from the text file.
for zip_path in z.namelist():
if "data/names/" in zip_path and zip_path.endswith(".txt"):
lang = zip_path[len("data/names/"):-len(".txt")]
with z.open(zip_path) as myfile:
lang_names = [unicodeToAscii(line).lower() for line in str(myfile.read(), encoding='utf-8').strip().split("\n")]
namge_language_data[lang] = lang_names
print(lang, ": ", len(lang_names)) #Print out the name of each language too.
class LanguageNameDataset(Dataset):
def __init__(self, lang_name_dict, vocabulary):
self.label_names = [x for x in lang_name_dict.keys()]
self.data = []
self.labels = []
self.vocabulary = vocabulary
for y, language in enumerate(self.label_names):
for sample in lang_name_dict[language]:
self.data.append(sample)
self.labels.append(y)
def __len__(self):
return len(self.data)
def string2InputVec(self, input_string):
"""
This method will convert any input string into a vector of long values, according to the vocabulary used by this object.
input_string: the string to convert to a tensor
"""
T = len(input_string) #How many characters long is the string?
#Create a new tensor to store the result in
name_vec = torch.zeros((T), dtype=torch.long)
#iterate through the string and place the appropriate values into the tensor
for pos, character in enumerate(input_string):
name_vec[pos] = self.vocabulary[character]
return name_vec
def __getitem__(self, idx):
name = self.data[idx]
label = self.labels[idx]
#Conver the correct class label into a tensor for PyTorch
label_vec = torch.tensor([label], dtype=torch.long)
return self.string2InputVec(name), label
dataset = LanguageNameDataset(namge_language_data, alphabet)
train_data, test_data = torch.utils.data.random_split(dataset, (len(dataset)-300, 300))
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)
class LastTimeStep(nn.Module):
"""
A class for extracting the hidden activations of the last time step following
the output of a PyTorch RNN module.
"""
def __init__(self, rnn_layers=1, bidirectional=False):
super(LastTimeStep, self).__init__()
self.rnn_layers = rnn_layers
if bidirectional:
self.num_driections = 2
else:
self.num_driections = 1
def forward(self, input):
#Result is either a tupe (out, h_t)
#or a tuple (out, (h_t, c_t))
rnn_output = input[0]
last_step = input[1] #this will be h_t
if(type(last_step) == tuple):#unless it's a tuple,
last_step = last_step[0]#then h_t is the first item in the tuple
batch_size = last_step.shape[1] #per docs, shape is: '(num_layers * num_directions, batch, hidden_size)'
#reshaping so that everything is separate
last_step = last_step.view(self.rnn_layers, self.num_driections, batch_size, -1)
#We want the last layer's results
last_step = last_step[self.rnn_layers-1]
#Re order so batch comes first
last_step = last_step.permute(1, 0, 2)
#Finally, flatten the last two dimensions into one
return last_step.reshape(batch_size, -1)
D = 64
vocab_size = len(all_letters)
hidden_nodes = 256
classes = len(dataset.label_names)
first_rnn = nn.Sequential(
nn.Embedding(vocab_size, D), #(B, T) -> (B, T, D)
nn.RNN(D, hidden_nodes, batch_first=True), #(B, T, D) -> ( (B,T,D) , (S, B, D) )
#the tanh activation is built into the RNN object, so we don't need to do it here
LastTimeStep(), #We need to take the RNN output and reduce it to one item, (B, D)
nn.Linear(hidden_nodes, classes), #(B, D) -> (B, classes)
)
loss_func = nn.CrossEntropyLoss()
batch_one_train = train_simple_network(first_rnn, loss_func, train_loader, test_loader=test_loader, score_funcs={'Accuracy': accuracy_score}, device=device, epochs=5)
sns.lineplot(x='epoch', y='test Accuracy', data=batch_one_train, label='RNN')
plt.show()
pred_rnn = first_rnn.to("cpu").eval()
with torch.no_grad():
preds = F.softmax(pred_rnn(dataset.string2InputVec("frank").reshape(1,-1)), dim=-1)
for class_id in range(len(dataset.label_names)):
print(dataset.label_names[class_id], ":", preds[0,class_id].item()*100 , "%")