# -*- coding: utf-8 -*-
"""Copie de bertopic_mathematics.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1OwBL7mF0ZXMy-X0V8J0oAY9CgNXGtHf_
https://colab.research.google.com/drive/1PLuuIF_OP5XGOh4K7CpVqCUyRIxkkOPi?usp=sharing
"""
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# %%capture
# BERTopic + llama-cpp-python
!CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=all" FORCE_CMAKE=1 pip install llama-cpp-python
!pip install bertopic datasets
# DataMapPlot
!git clone https://github.com/TutteInstitute/datamapplot.git
!pip install datamapplot/.
# GPU-accelerated HDBSCAN + UMAP
!pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
# The https://pip.cupy.dev/aarch64 index is only for ARM hosts; Colab runs x86_64
!pip install cupy-cuda12x
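# Sanity check (added sketch, not in the original notebook): confirm the RAPIDS
# wheels import cleanly before the pipeline below depends on them.
import cuml
print("cuML version:", cuml.__version__)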
# %%capture
# Clone DeepMind's generator so the local-directory install below has a target
!git clone https://github.com/deepmind/mathematics_dataset.git
!pip install --upgrade mathematics_dataset/
!python -m mathematics_dataset.generate_to_file --output_dir=./data --train_split=False --per_train_module=1000
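# Quick check (added sketch): confirm the generator wrote per-module .txt files
# into ./data/train, the folder the parsing code below expects.
print(sorted(os.listdir("./data/train"))[:5])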
# Function to read questions and answers from a file and store them in a list.
# In the generated files, questions and answers alternate line by line.
def read_questions_and_answers(file_path):
    qa_list = []
    with open(file_path, 'r') as file:
        lines = file.readlines()
        # Stop at len(lines) - 1 so a trailing unpaired line cannot raise IndexError
        for i in range(0, len(lines) - 1, 2):
            question = lines[i].strip()
            answer = lines[i + 1].strip()
            qa_list.append(f"{question}, {answer}")
    return qa_list

# Function to process all text files in a folder
def process_files_in_folder(folder_path):
    qa_combined_list = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            qa_list = read_questions_and_answers(file_path)
            qa_combined_list.extend(qa_list)
    return qa_combined_list
# Specify the folder path where your text files are located
folder_path = "./data/train"
# Call the function to process the files in the folder
docs = process_files_in_folder(folder_path)
print(len(docs))
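# Each parsed document is a "question, answer" string; print one to verify
print(docs[0])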
# !wget https://huggingface.co/TheBloke/dolphin-2.7-mixtral-8x7b-GGUF/resolve/main/dolphin-2.7-mixtral-8x7b.Q3_K_M.gguf
!wget https://huggingface.co/TheBloke/dolphin-2.7-mixtral-8x7b-GGUF/resolve/main/dolphin-2.7-mixtral-8x7b.Q4_K_M.gguf
# !wget https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf
from llama_cpp import Llama
# Use llama.cpp to load in a quantized LLM
# llm = Llama(model_path="dolphin-2.7-mixtral-8x7b.Q3_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop=["Q:", "\n"])
# main_gpu=0: CUDA_VISIBLE_DEVICES="1" above exposes a single GPU, re-indexed as device 0
llm = Llama(model_path="dolphin-2.7-mixtral-8x7b.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, main_gpu=0, offload_kqv=True, stop=["Q:", "\n"])
# llm = Llama(model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop=["Q:", "\n"])
from bertopic.representation import KeyBERTInspired, LlamaCPP
prompt = """ Q:
I have a topic you would learn in a high school mathematics class that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: '[KEYWORDS]'.
Based on the above information, give me the most likely topic you would learn in a high school mathematics class that describes this information.
Only give me the topic and nothing else, with at most 5 words.
A:
"""
representation_model = {
"KeyBERT": KeyBERTInspired(),
"LLM": LlamaCPP(llm, prompt=prompt)
}
from sentence_transformers import SentenceTransformer
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
# from umap import UMAP
# from hdbscan import HDBSCAN
# Pre-calculate embeddings
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(docs, show_progress_bar=False)
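# bge-small-en produces 384-dimensional vectors, so expect (len(docs), 384)
print(embeddings.shape)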
# Pre-reduce embeddings for visualization purposes
reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)
# Define sub-models
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=400, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
from bertopic import BERTopic
topic_model = BERTopic(
# Sub-models
embedding_model=embedding_model,
umap_model=umap_model,
hdbscan_model=hdbscan_model,
representation_model=representation_model,
# Hyperparameters
top_n_words=10,
verbose=True
)
# Train model
topics, probs = topic_model.fit_transform(docs, embeddings)
# Show topics
topic_model.get_topic_info()
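# Optional inspection (a sketch, not in the original notebook): map every
# document to its assigned topic and representative keywords for spot checks
topic_model.get_document_info(docs)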
import datamapplot
import re
# Create a label for each document: take the first line of each LLM answer,
# strip quotes and non-word characters, and fall back to "Unlabelled"
llm_labels = [re.sub(r'\W+', ' ', label[0][0].split("\n")[0].replace('"', '')) for label in topic_model.get_topics(full=True)["LLM"].values()]
llm_labels = [label if label else "Unlabelled" for label in llm_labels]
# Map each document's topic id to its label; outliers (topic -1) stay "Unlabelled"
all_labels = [llm_labels[topic + topic_model._outliers] if topic != -1 else "Unlabelled" for topic in topics]
# Run the visualization; create_plot returns a Matplotlib figure/axes pair
fig, ax = datamapplot.create_plot(
reduced_embeddings,
all_labels,
label_font_size=11,
title="Mathematical Question Types - BERTopic",
sub_title="Topics labeled with `dolphin-2.7-mixtral-8x7b`",
label_wrap_width=20,
use_medoids=True,
)
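# Save the rendered map (added sketch; the filename is an arbitrary choice)
fig.savefig("math_topics_datamap.png", bbox_inches="tight", dpi=300)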