# -*- coding: utf-8 -*-
"""wordmap.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1i4Hr6_VKnjll2e1Rn3P-GAYxXT2r9fh0
# Example of word embedding
Word embedding consists of representing each word by a vector; the distance between two vectors in the vector space corresponds to the similarity between the corresponding words.
In this notebook, we calculate the embeddings of a list of words after transforming them into tokens, then reduce the dimensionality of the embedding space to 2 to display a 2D word map.
Three algorithms are used to reduce the dimensionality of the embedding space:
* PCA: Principal Component Analysis
* TSNE: T-distributed Stochastic Neighbor Embedding
* UMAP: Uniform Manifold Approximation and Projection
PCA always gives the same result; the other two are stochastic and give different results when run several times in succession (see the reproducibility sketch at the end).
## Tokenization and embedding models
"""
import torch
# Set device
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device: {torch_device}")
diffusers_old_version = True
if diffusers_old_version:
    !pip install -q --upgrade transformers diffusers==0.2.4 ftfy
else:
    !pip install -q --upgrade transformers diffusers ftfy
from transformers import CLIPTextModel, CLIPTokenizer, logging
# Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = text_encoder.to(torch_device)
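"""Quick sanity check: inspect the token IDs the CLIP tokenizer produces for a single word. Each prompt is wrapped in start-of-text and end-of-text special tokens; `convert_ids_to_tokens` maps the IDs back to their string form."""
sample = tokenizer("king")
print(sample.input_ids)                                   # token IDs, special tokens included
print(tokenizer.convert_ids_to_tokens(sample.input_ids))  # e.g. ['<|startoftext|>', 'king</w>', '<|endoftext|>']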
"""Helper functions for embedding."""
def embed(text):
    # Tokenize, padding/truncating to the model's fixed context length (77 tokens for CLIP).
    text_input = tokenizer([text], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
    with torch.no_grad():
        text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
    # Flatten the (sequence_length, hidden_size) matrix into a single vector.
    # Alternative: keep only the first real token's embedding:
    # return text_embeddings[0][1]
    return torch.flatten(text_embeddings[0])
def get_embedding(text):
    # Return the embedding as a plain Python list (handy for stacking into an array).
    return embed(text).tolist()
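"""Since the map relies on distances between vectors, a minimal sketch of that idea: cosine similarity between the flattened embeddings of two words should score higher for related pairs than for unrelated ones."""
import torch.nn.functional as F

def similarity(a, b):
    # Cosine similarity between the 1-D embedding vectors of two words.
    return F.cosine_similarity(embed(a), embed(b), dim=0).item()

print(similarity("king", "queen"))  # expected: relatively high
print(similarity("king", "house"))  # expected: lower than king/queen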
"""Install packages for dimensional reduction and displaying."""
!pip install scipy scikit-learn umap-learn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
import matplotlib.pyplot as plt
"""Display a word map using a given reducer."""
def wordmap(words, reducer):
    # Embed every word, then project the high-dimensional vectors down to 2D.
    embeddings = [get_embedding(word) for word in words]
    embeddings_2d = reducer.fit_transform(torch.tensor(embeddings))
    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(111)
    ax.scatter(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1])
    # Label each point with (at most the first 30 characters of) its word.
    for i, txt in enumerate(words):
        ax.text(embeddings_2d[i, 0], embeddings_2d[i, 1], txt[:30])
    fig.show()
"""Example of word list."""
words = ["man", "woman", "boy", "girl", "king", "queen", "house", "castle",
"dog", "wolf", "cat", "lion"]
"""Examples of word maps."""
wordmap(words, PCA(n_components=2))
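"""PCA is the only deterministic reducer used here; as a side check, `explained_variance_ratio_` reports how much of the embeddings' variance the two displayed axes capture (a sketch reusing the same `words` list)."""
pca = PCA(n_components=2)
pca.fit([get_embedding(word) for word in words])
print(pca.explained_variance_ratio_)  # fraction of total variance captured by each of the 2 axes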
wordmap(words, TSNE(n_components=2, perplexity=6))
wordmap(words, TSNE(n_components=2, perplexity=6))
wordmap(words, UMAP())
wordmap(words, UMAP())
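"""TSNE and UMAP are stochastic, which is why the repeated calls above produce different maps. If reproducibility matters, both accept a `random_state` argument; a minimal sketch (the exact layout still depends on the installed library versions):"""
wordmap(words, TSNE(n_components=2, perplexity=6, random_state=42))
wordmap(words, UMAP(random_state=42))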