# View file src/colab/copie_de_bertopic_mathematics.py - Download

# -*- coding: utf-8 -*-
"""Copie de bertopic_mathematics.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1OwBL7mF0ZXMy-X0V8J0oAY9CgNXGtHf_

https://colab.research.google.com/drive/1PLuuIF_OP5XGOh4K7CpVqCUyRIxkkOPi?usp=sharing
"""

import os
# Restrict CUDA to the physical GPU with index 1. This is set before any of
# the GPU libraries further down in this file are imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# NOTE: lines starting with `!` are notebook shell escapes; they only run
# inside Colab/IPython, not under the plain Python interpreter.

# %%capture
# BERTopic + llama-cpp-python
# FORCE_CMAKE=1 with CMAKE_ARGS is meant to build llama-cpp-python from source
# with cuBLAS (GPU) support.
# NOTE(review): CMAKE_ARGS contains a comma right after `-DLLAMA_CUBLAS=on` --
# confirm the flag is still parsed as `on` and the GPU build is really enabled.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on, -DCMAKE_CUDA_ARCHITECTURES=all" FORCE_CMAKE=1 pip install llama-cpp-python
!pip install bertopic datasets

# DataMapPlot
# Installed from a fresh clone of the upstream repository.
!git clone https://github.com/TutteInstitute/datamapplot.git
!pip install datamapplot/.

# GPU-accelerated HDBSCAN + UMAP
# RAPIDS packages (CUDA 12 builds) come from NVIDIA's package index.
!pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
# NOTE(review): this wheel index is for aarch64 -- confirm it matches the host CPU.
!pip install cupy-cuda12x -f https://pip.cupy.dev/aarch64

# %%capture
# Install the mathematics_dataset package from a local `mathematics_dataset/`
# directory and generate the question/answer text files under ./data.
# NOTE(review): no clone/download of `mathematics_dataset/` is visible in this
# file -- presumably it is checked out beforehand; confirm.
!pip install --upgrade mathematics_dataset/
!python -m mathematics_dataset.generate_to_file --output_dir=./data --train_split=False --per_train_module=1000

import os

# Function to read questions and answers from a file and store them in a list
def read_questions_and_answers(file_path):
    """Read alternating question/answer lines from a text file.

    The file is expected to hold a question on one line followed by its
    answer on the next line, repeated for every pair.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings, one per pair, formatted as "<question>, <answer>"
        with surrounding whitespace stripped from both parts. An empty file
        yields an empty list.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file]

    # Pair even-indexed lines (questions) with odd-indexed lines (answers).
    # zip() stops at the shorter slice, so a trailing line without a partner
    # (odd line count, stray blank line at the end) is skipped rather than
    # raising IndexError.
    return [
        f"{question}, {answer}"
        for question, answer in zip(lines[::2], lines[1::2])
    ]

# Function to process all text files in a folder
def process_files_in_folder(folder_path):
    """Collect question/answer strings from every ``.txt`` file in a folder.

    Only the direct entries of ``folder_path`` are considered; entries whose
    name does not end with ".txt" are ignored.

    Args:
        folder_path: Directory containing the question/answer text files.

    Returns:
        A single flat list with the strings returned by
        ``read_questions_and_answers`` for each file, concatenated in
        filename-sorted order.
    """
    qa_combined_list = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and everything indexed by it) reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            qa_combined_list.extend(read_questions_and_answers(file_path))
    return qa_combined_list

# Folder containing the question/answer text files to load
folder_path = "./data/train"

# Load every question/answer pair from the folder into one flat list of
# strings (`docs`), then print how many documents were read.
docs = process_files_in_folder(folder_path)
print(len(docs))

# Download the quantized GGUF weights used below. The commented-out lines are
# alternative models; enable the download matching the `Llama(...)` call in use.
# !wget https://huggingface.co/TheBloke/dolphin-2.7-mixtral-8x7b-GGUF/resolve/main/dolphin-2.7-mixtral-8x7b.Q3_K_M.gguf
!wget https://huggingface.co/TheBloke/dolphin-2.7-mixtral-8x7b-GGUF/resolve/main/dolphin-2.7-mixtral-8x7b.Q4_K_M.gguf
# !wget https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf

from llama_cpp import Llama

# Use llama.cpp to load in a quantized LLM
# NOTE(review): CUDA_VISIBLE_DEVICES is set to "1" at the top of this file, so
# presumably only one GPU is visible here and `main_gpu=2` may be out of
# range -- confirm the intended device index.
# llm = Llama(model_path="dolphin-2.7-mixtral-8x7b.Q3_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop=["Q:", "\n"])
llm = Llama(model_path="dolphin-2.7-mixtral-8x7b.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, main_gpu=2, offload_kqv=True, stop=["Q:", "\n"])
# llm = Llama(model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop=["Q:", "\n"])

from bertopic.representation import KeyBERTInspired, LlamaCPP


# Instruction sent to the LLM when naming a topic. The `[DOCUMENTS]` and
# `[KEYWORDS]` tags are BERTopic prompt placeholders.
prompt = """ Q:
I have a topic you would learn in a high school mathematics class that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the above information, give me the most likely topic you would learn in a high school mathematics class that describes this information.
Only give me the topic and nothing else, with at most 5 words.
A:
"""

# Two topic representations are kept side by side: keyword-based (KeyBERT)
# and a short LLM-generated label produced from the prompt above.
keybert_representation = KeyBERTInspired()
llm_representation = LlamaCPP(llm, prompt=prompt)

representation_model = {
    "KeyBERT": keybert_representation,
    "LLM": llm_representation,
}

from sentence_transformers import SentenceTransformer
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
# from umap import UMAP
# from hdbscan import HDBSCAN

# Embed all documents once up front so the vectors can be reused for both the
# 2-D visualization and the topic model.
embedding_model = SentenceTransformer("BAAI/bge-small-en")
embeddings = embedding_model.encode(docs, show_progress_bar=False)

# Settings shared by both UMAP reducers; only the output dimensionality differs.
umap_shared_kwargs = dict(n_neighbors=15, min_dist=0.0, metric='cosine', random_state=42)

# 2-D projection, used purely for plotting
visual_reducer = UMAP(n_components=2, **umap_shared_kwargs)
reduced_embeddings = visual_reducer.fit_transform(embeddings)

# Sub-models handed to BERTopic: 5-D reduction followed by density clustering
umap_model = UMAP(n_components=5, **umap_shared_kwargs)
hdbscan_model = HDBSCAN(
    min_cluster_size=400,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

from bertopic import BERTopic

# Assemble the topic model from the components prepared above.
topic_model = BERTopic(
    # Pipeline components
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    # Tuning knobs
    top_n_words=10,
    verbose=True,
)

# Fit on the documents, passing in the embeddings computed earlier.
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# Summary table of the discovered topics (rendered when run as a notebook cell)
topic_model.get_topic_info()

import datamapplot
import re

# Build one cleaned label per topic from the "LLM" representation:
# take the text of the first entry, keep only its first line, drop double
# quotes and collapse every run of non-word characters into a single space.
llm_labels = []
for label in topic_model.get_topics(full=True)["LLM"].values():
    first_line = label[0][0].split("\n")[0].replace('"', '')
    llm_labels.append(re.sub(r'\W+', ' ', first_line))

# Topics for which the LLM produced no usable text get a placeholder label.
llm_labels = [label if label else "Unlabelled" for label in llm_labels]

# Create a label for each document. Outlier documents (topic -1) are marked
# "Unlabelled"; for the rest the topic id is shifted by `_outliers` to index
# into `llm_labels`.
# NOTE(review): presumably `_outliers` is 1 when an outlier topic occupies the
# first slot of the label list and 0 otherwise -- confirm against BERTopic.
all_labels = [llm_labels[topic+topic_model._outliers] if topic != -1 else "Unlabelled" for topic in topics]

# Run the visualization
datamapplot.create_plot(
    reduced_embeddings,
    all_labels,
    label_font_size=11,
    title="Mathematical Question Types - BERTopic",
    # Name the model that is actually loaded in this file (dolphin-2.7-mixtral),
    # not the commented-out openhermes alternative.
    sub_title="Topics labeled with `dolphin-2.7-mixtral-8x7b`",
    label_wrap_width=20,
    use_medoids=True,
)