Spaces:
Sleeping
Sleeping
File size: 4,975 Bytes
7189553 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
from tabula import read_pdf
from bs4 import BeautifulSoup
import requests
from llama_cpp import Llama
from bertopic.representation import KeyBERTInspired, LlamaCPP
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
import PIL
import numpy as np
import datamapplot
import re
def get_links():
    """Extract bookmark URLs from the AI awesome-list PDF.

    Reads every table in the local PDF. The first table has no header
    row, so tabula exposes its URL column as 'Unnamed: 2'; the remaining
    tables carry a proper 'Url' header.

    Returns:
        list: all URLs found, in table order.
    """
    # One DataFrame per table detected in the PDF.
    dfs = read_pdf("Artificial_Intelligence_Bookmarks_AwesomeList.pdf", pages="all")
    links = dfs[0]['Unnamed: 2'].to_list()
    # Iterate the remaining tables directly instead of index arithmetic
    # (original used range(len(dfs)-1) with dfs[i+1]).
    for df in dfs[1:]:
        links.extend(df['Url'].to_list())
    return links
#--------------------------------------
# text processing
def remove_tags(html):
    """Strip markup from an HTML document and return its visible text.

    Style and script elements are removed entirely (their contents are
    code, not page text); the remaining text fragments are joined with
    single spaces.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Destroy non-content elements before extracting text.
    for non_content in soup.find_all(['style', 'script']):
        non_content.decompose()
    # stripped_strings yields each whitespace-trimmed text fragment.
    fragments = soup.stripped_strings
    return ' '.join(fragments)
def remove_emoji(data):
    """Delete emoji and pictographic symbols from *data*.

    The character class covers the common emoji code-point ranges:
    emoticons, pictographs, transport/map symbols, flags, dingbats and
    assorted miscellaneous symbols.
    """
    # Adjacent string literals concatenate into one big character-class body.
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
    )
    pattern = re.compile("[" + emoji_ranges + "]+", re.UNICODE)
    return pattern.sub('', data)
#-------------------------------------
def get_page(link):
    """Fetch *link* and return up to 1050 chars of cleaned page text.

    Args:
        link: URL to download.

    Returns:
        Tag- and emoji-stripped text (max 1050 chars), or None when the
        request fails — the failing URL is printed for later inspection
        and the caller filters the Nones out.
    """
    try:
        # Bound the request so one dead host can't hang the whole crawl.
        response = requests.get(link, timeout=30)
        raw_html = response.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP failures are
        # expected here; best-effort crawl, so just report and skip.
        print(link)
        return None
    clean_text = remove_tags(raw_html)[:1050]
    clean_text = remove_emoji(clean_text)
    return clean_text
def get_documents(links):
    """Download and clean every page in *links*.

    Failed downloads (None from get_page) and pages that yielded fewer
    than 1000 characters of text are dropped — short texts are typically
    error or placeholder pages.

    Returns:
        list: cleaned page texts, each at least 1000 characters long.
    """
    fetched = (get_page(link) for link in links)
    # Single pass replaces the original O(n^2) repeated
    # `while None in ...: .remove(None)` scan.
    return [text for text in fetched if text is not None and len(text) > 999]
#----------------------------------------
def get_topics(docs):
    """Fit a BERTopic model over *docs* and return the trained model.

    Topics are represented two ways: KeyBERT-inspired keywords, and a
    short label written by a quantized LLM (OpenHermes 2.5 Mistral 7B
    via llama.cpp) from a Q/A-style prompt.

    Args:
        docs: list of cleaned document strings.

    Returns:
        BERTopic: the fitted topic model.

    NOTE(review): ``reduced_embeddings`` (the 2-D projection used for
    plotting) is computed here but not returned; get_figure needs it, so
    it must be recomputed or passed along by the caller — confirm intent.
    """
    # Use llama.cpp to load in a quantized LLM; n_gpu_layers=-1 offloads
    # all layers to GPU, n_ctx=4096 bounds the context window, and
    # generation stops at "Q:" or a newline so only the label comes back.
    llm = Llama(model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop=["Q:", "\n"])
    prompt = """ Q:
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: '[KEYWORDS]'.
Based on the above information, can you give a short label of the topic of at most 5 words?
A:
"""
    representation_model = {
        "KeyBERT": KeyBERTInspired(),
        # NOTE(review): the original line was truncated mid-token
        # ("Llam") — this reconstructs the intended BERTopic llama.cpp
        # wrapper fed with the Q/A prompt above.
        "LLM": LlamaCPP(llm, prompt=prompt),
    }
    # Pre-calculate embeddings once so BERTopic doesn't re-embed.
    embedding_model = SentenceTransformer("BAAI/bge-small-en")
    embeddings = embedding_model.encode(docs, show_progress_bar=True)
    # Pre-reduce embeddings to 2-D for visualization purposes.
    reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)
    # Sub-models: 5-D UMAP for clustering; HDBSCAN with tiny clusters
    # (min_cluster_size=2) because the corpus is small.
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    topic_model = BERTopic(
        # Sub-models
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        # Hyperparameters
        top_n_words=10,
        verbose=True,
    )
    # Train model
    topics, probs = topic_model.fit_transform(docs, embeddings)
    return topic_model
#-------------------------------
# Visualize Topics
def get_figure(topic_model, topics=None, reduced_embeddings=None):
    """Render a datamapplot figure of the topic landscape.

    Args:
        topic_model: fitted BERTopic model carrying an "LLM"
            representation (see get_topics).
        topics: per-document topic assignments from fit_transform.
            NOTE(review): the original body referenced ``topics`` as a
            free variable (a NameError) — it is now an explicit
            parameter and must be supplied by the caller.
        reduced_embeddings: 2-D projection of the document embeddings,
            one row per document (also a free variable originally).

    Returns:
        The figure produced by datamapplot.create_plot.
    """
    # Prepare logo: stream=True keeps the raw byte stream open for PIL,
    # and a UA header avoids anonymous-client rejections.
    bertopic_logo_response = requests.get(
        "https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/logo.png",
        stream=True,
        headers={'User-Agent': 'My User Agent 1.0'}
    )
    bertopic_logo = np.asarray(PIL.Image.open(bertopic_logo_response.raw))
    # One LLM-generated label per topic: keep only the first output line,
    # drop quotes, and collapse non-word characters to spaces.
    llm_labels = [re.sub(r'\W+', ' ', label[0][0].split("\n")[0].replace('"', '')) for label in topic_model.get_topics(full=True)["LLM"].values()]
    llm_labels = [label if label else "Unlabelled" for label in llm_labels]
    # Map each document's topic id to its label; the _outliers offset
    # accounts for topic -1 occupying the first slot, and outlier docs
    # themselves stay "Unlabelled".
    all_labels = [llm_labels[topic + topic_model._outliers] if topic != -1 else "Unlabelled" for topic in topics]
    # Run the visualization
    fig = datamapplot.create_plot(
        reduced_embeddings,
        all_labels,
        label_font_size=11,
        title="ArXiv - BERTopic",
        sub_title="Topics labeled with `openhermes-2.5-mistral-7b`",
        label_wrap_width=20,
        use_medoids=True,
        logo=bertopic_logo,
        logo_width=0.16
    )
    return fig
|