# File: src/colab/bertopic_step_1_sbert_step_2_umap.py
# -*- coding: utf-8 -*-
"""bertopic-step-1-sbert-step-2-umap.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1hqn-KYyEGvdkrvoPbQccTt7opXwHYcmF
https://www.kaggle.com/code/johannareiml/bertopic-step-1-sbert-step-2-umap/
"""
# Install the dependencies for this walkthrough. Colab's "!pip install ..." is
# IPython shell syntax and a SyntaxError in a plain .py file, so pip is invoked
# for the running interpreter instead.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "sentence-transformers", "umap-learn"]
)

from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")
# A bare expression is only echoed at the end of a notebook cell; print it
# explicitly so the value is also shown when this file runs as a script.
print(model)

# 2. Define some sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
print(sentences)

# 3. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)

# 4. Calculate the embedding similarities (every sentence against every sentence)
similarities = model.similarity(embeddings, embeddings)
print(similarities)
# Install the Hugging Face `datasets` package. Colab's "!pip install ..." is
# IPython shell syntax and a SyntaxError in a plain .py file, so pip is invoked
# for the running interpreter instead.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "datasets"])

from datasets import load_dataset

# Load our filtered 20newsgroups dataset from huggingface
dataset = load_dataset("aihpi/20_newsgroups_demo", split="train")
# Print explicitly: a bare expression shows nothing when run as a script.
print(dataset)

# Get embeddings from 20newsgroup documents.
# NOTE: this rebinds `embeddings`, replacing the three-sentence demo embeddings.
docs = dataset["text"]
embeddings = model.encode(docs)
print(embeddings.shape)
from umap import UMAP

# Initialize UMAP model: reduce the document embeddings to 2 components so they
# can be drawn on a flat scatter plot, using cosine distance between embeddings.
umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric="cosine")

# Fit and transform the embeddings
umap_embeddings = umap_model.fit_transform(embeddings)
# Print explicitly: a bare `umap_embeddings.shape` shows nothing when this file
# runs as a script instead of a notebook cell.
print(umap_embeddings.shape)
# Plotting dependencies: Plotly Express draws the figure, pandas holds the table
import plotly.express as px
import pandas as pd

# Name the two UMAP coordinates "x" and "y" so the plot can refer to them by column
df = pd.DataFrame(data=umap_embeddings, columns=["x", "y"])

# Draw a plain scatter plot of the 2D projection (no colours, no hover text yet)
fig = px.scatter(data_frame=df, x="x", y="y")
fig.show()
# Maximum number of characters of a document shown in the hover tooltip
HOVER_TEXT_MAX_CHARS = 100

# Take the first HOVER_TEXT_MAX_CHARS characters of each document for the tooltip
# shown when hovering on a datapoint. The "..." marker is appended only when the
# document was actually cut, so short documents do not look truncated.
hover_texts = [
    doc if len(doc) <= HOVER_TEXT_MAX_CHARS else doc[:HOVER_TEXT_MAX_CHARS] + "..."
    for doc in docs
]

# use the labels from the original dataset as colors (these are not the predicted topics)
labels = dataset["label_text"]

# Rebuild the dataframe from the UMAP coordinates and add columns for the
# truncated hover text and the label used for colour
df = pd.DataFrame(umap_embeddings, columns=["x", "y"])
df["hover_text"] = hover_texts
df["label"] = labels

# Plotting using Plotly: one colour per label, document preview on hover
fig = px.scatter(df, x="x", y="y", color="label", hover_data=["hover_text"])
fig.show()
# Initialize UMAP model with n_components=3 for dimensionality reduction to 3 dimensions
# NOTE(review): n_neighbors is 5 here but 15 in the 2D projection earlier in this
# file — confirm the change is intentional.
umap_model = UMAP(n_neighbors=5, n_components=3, min_dist=0.0, metric="cosine")

# Fit and transform the embeddings
umap_embeddings = umap_model.fit_transform(embeddings)
# Print the full shape (rows and components) instead of evaluating a bare
# `shape[0]`: a bare expression shows nothing in a script, and the row count
# alone does not confirm the reduction to 3 components.
print(umap_embeddings.shape)

# NOTE(review): numpy is not used in the visible code; the import is kept in case
# later code relies on `np`.
import numpy as np

# Add a third dimension "z"
df = pd.DataFrame(umap_embeddings, columns=["x", "y", "z"])
df["hover_text"] = hover_texts
df["label"] = labels

# Plotting using scatter_3d instead of scatter with extra "z" argument
fig = px.scatter_3d(df, x="x", y="y", z="z", color="label", hover_data=["hover_text"])
# Shrink the markers from the default size
fig.update_traces(marker_size=2)
fig.show()