# Source file: src/colab/bertopic_step_1_sbert_step_2_umap.py

# -*- coding: utf-8 -*-
"""bertopic-step-1-sbert-step-2-umap.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1hqn-KYyEGvdkrvoPbQccTt7opXwHYcmF

https://www.kaggle.com/code/johannareiml/bertopic-step-1-sbert-step-2-umap/
"""

!pip install sentence-transformers umap-learn

from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model ("all-MiniLM-L6-v2").
#    The same `model` object is reused further down to embed the dataset.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Bare expression: only meaningful as notebook cell output.
model

# 2. Define some sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
sentences

# 3. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
# presumably (number of sentences, embedding size) — verify in the cell output
embeddings.shape

# 4. Calculate the embedding similarities. `embeddings` is passed as both
#    arguments, so every sentence is compared with every sentence.
# NOTE(review): the similarity function is chosen by the model; cosine is the
# documented default for sentence-transformers — confirm for this model.
similarities = model.similarity(embeddings, embeddings)
similarities

!pip install datasets

from datasets import load_dataset

# Load our filtered 20newsgroups dataset from Hugging Face (train split only)
dataset = load_dataset("aihpi/20_newsgroups_demo", split="train")
dataset

# Get embeddings from the 20newsgroups documents, reusing the `model` loaded
# above. This rebinds `embeddings`, replacing the three-sentence demo
# embeddings computed earlier.
docs = dataset["text"]
embeddings = model.encode(docs)

# presumably (number of documents, embedding size) — verify in the cell output
embeddings.shape

from umap import UMAP

# Initialize UMAP model: project the embeddings down to 2 components, using
# cosine distance, 15 neighbours and min_dist=0.0.
# NOTE(review): no random_state is passed, so the layout presumably differs
# from run to run — confirm whether reproducible plots are wanted here.
umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric="cosine")

# Fit and transform the embeddings
umap_embeddings = umap_model.fit_transform(embeddings)

# presumably (number of documents, 2) — verify in the cell output
umap_embeddings.shape

# Create a dataframe for visualization
import plotly.express as px
import pandas as pd

# Create a dataframe holding only the two UMAP coordinates as columns "x" and
# "y". No hover text or label column is added for this first, plain plot.
df = pd.DataFrame(umap_embeddings, columns=["x", "y"])

# Plotting using Plotly: an uncoloured 2-D scatter of the projected documents
fig = px.scatter(df, x="x", y="y")
fig.show()

# Build the tooltip shown when hovering over a datapoint: the first 100
# characters of each document. The "..." marker is appended only when the
# document was actually cut off, so short documents are not shown as truncated.
hover_texts = [
    (doc[:100] + "...") if len(doc) > 100 else doc
    for doc in docs
]
# Use the labels from the original dataset as colors (these are the dataset's
# own labels, not predicted topics)
labels = dataset["label_text"]

# Rebuild the plotting dataframe from the 2-D UMAP coordinates and add the
# tooltip text and the label used for colouring
df = pd.DataFrame(umap_embeddings, columns=["x", "y"])
df["hover_text"] = hover_texts
df["label"] = labels

# Plotting using Plotly: colour by label, show the truncated text on hover
fig = px.scatter(df, x="x", y="y", color="label", hover_data=["hover_text"])
fig.show()

# Initialize UMAP model with n_components=3 for dimensionality reduction to 3 dimensions.
# n_neighbors is 5 here, whereas the 2-D model earlier in this file used 15.
# NOTE(review): no random_state is passed, so the layout presumably differs
# from run to run — confirm whether reproducible plots are wanted here.
umap_model = UMAP(n_neighbors=5, n_components=3, min_dist=0.0, metric="cosine")

# Fit and transform the embeddings. This rebinds `umap_embeddings`, replacing
# the 2-D projection computed earlier.
umap_embeddings = umap_model.fit_transform(embeddings)

# Size of the first axis of the result (presumably one row per document)
umap_embeddings.shape[0]

import numpy as np

# Assemble the plotting frame in one expression: the three UMAP coordinates
# ("x", "y", "z") plus the tooltip text and the dataset label per document.
df = pd.DataFrame(umap_embeddings, columns=["x", "y", "z"]).assign(
    hover_text=hover_texts,
    label=labels,
)

# Same idea as the 2-D scatter, but scatter_3d takes the additional "z" axis.
fig = px.scatter_3d(
    df,
    x="x",
    y="y",
    z="z",
    color="label",
    hover_data=["hover_text"],
)
# Set the marker size to 2 on every trace of the figure.
fig.update_traces(marker_size=2)
fig.show()