
# -*- coding: utf-8 -*-
"""wordmap.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1i4Hr6_VKnjll2e1Rn3P-GAYxXT2r9fh0

# Example of word embedding

Word embedding represents each word by a vector; the distance between two vectors in the embedding space corresponds to the similarity between the corresponding words.

In this notebook, we compute the embeddings of a list of words after converting them into tokens, then reduce the dimensionality of the embedding space to 2 in order to display a 2D word map.

Three algorithms are used to reduce the dimensionality of the embedding space:

*   PCA: Principal Component Analysis
*   t-SNE: t-distributed Stochastic Neighbor Embedding
*   UMAP: Uniform Manifold Approximation and Projection

PCA always gives the same result; the other two are stochastic and can produce a different layout each time they are run (fixing their random seed makes them reproducible, as shown at the end of the notebook).

## Tokenization and embedding models
"""

import torch

# Set device
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device: {torch_device}")

# Pin an older diffusers release if needed; only transformers is actually used for the embeddings below.
diffusers_old_version = True

if diffusers_old_version:
  !pip install -q --upgrade transformers diffusers==0.2.4 ftfy
else:
  !pip install -q --upgrade transformers diffusers ftfy

from transformers import CLIPTextModel, CLIPTokenizer, logging

# Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

text_encoder = text_encoder.to(torch_device)
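
"""As a quick sanity check, we can tokenize a single word and look at the resulting token ids (an illustrative sketch; the exact ids depend on the CLIP vocabulary, and the sequence is padded to the tokenizer's fixed length of 77)."""

sample = tokenizer(["castle"], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
print(sample.input_ids.shape)    # torch.Size([1, 77])
print(sample.input_ids[0][:5])   # start-of-text id, the word's token id(s), then end-of-text/padding ids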

"""Helper functions for embedding."""

def embed(text):
    # Tokenize the text, padding/truncating to the tokenizer's fixed length (77 tokens for CLIP).
    text_input = tokenizer([text], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
    # Encode the tokens; for this model the output has shape (1, 77, 768).
    with torch.no_grad():
        text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
    # Alternative: return only the embedding of the word's first token:
    # return text_embeddings[0][1]
    # Flatten the per-token embeddings of the single prompt into one vector.
    return torch.flatten(text_embeddings[0])

def get_embedding(text):
  return embed(text).tolist()
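
"""To illustrate the idea that distance in the embedding space reflects word similarity, here is a small sketch comparing a few word pairs with cosine similarity (the exact values depend on the text encoder; higher means more similar)."""

import torch.nn.functional as F

for a, b in [("king", "queen"), ("king", "castle"), ("king", "cat")]:
  sim = F.cosine_similarity(embed(a), embed(b), dim=0)
  print(f"{a} / {b}: {sim.item():.3f}")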

"""Install packages for dimensional reduction and displaying."""

!pip install scipy scikit-learn umap-learn

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP

import matplotlib.pyplot as plt

"""Display a word map using a given reducer."""

def wordmap(words, reducer):
  # Embed every word, then project the high-dimensional embeddings to 2D with the given reducer.
  embeddings = [get_embedding(word) for word in words]
  embeddings_2d = reducer.fit_transform(torch.tensor(embeddings))
  fig = plt.figure(figsize=(5, 5))
  ax = fig.add_subplot(111)
  ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])
  # Label each point with (at most the first 30 characters of) its word.
  for i, txt in enumerate(words):
    ax.text(embeddings_2d[i, 0], embeddings_2d[i, 1], txt[:30])
  plt.show()

"""Example of word list."""

words = ["man", "woman", "boy", "girl", "king", "queen", "house", "castle",
"dog", "wolf", "cat", "lion"]

"""Examples of word maps."""

wordmap(words, PCA(n_components=2))

wordmap(words, TSNE(n_components=2, perplexity=6))

wordmap(words, TSNE(n_components=2, perplexity=6))

wordmap(words, UMAP())

wordmap(words, UMAP())
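
"""t-SNE and UMAP are stochastic, which is why the repeated calls above can give different layouts. Both scikit-learn's TSNE and umap-learn's UMAP accept a random_state parameter; fixing it makes the maps reproducible (sketch below, seed value chosen arbitrarily)."""

wordmap(words, TSNE(n_components=2, perplexity=6, random_state=42))

wordmap(words, UMAP(random_state=42))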