"""Scrape bookmarked links from a PDF table and cluster the pages into topics.

Pipeline: ``get_links`` -> ``get_documents`` -> ``get_topics`` -> ``get_figure``.
"""

import re

from tabula import read_pdf
from bs4 import BeautifulSoup
import requests
from llama_cpp import Llama
from bertopic.representation import KeyBERTInspired, LlamaCPP
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
import PIL
import PIL.Image  # `import PIL` alone does not load the Image submodule
import numpy as np
import datamapplot

_DEFAULT_PDF_PATH = "Artificial_Intelligence_Bookmarks_AwesomeList.pdf"
_DEFAULT_LLM_PATH = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"
_LOGO_URL = (
    "https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/logo.png"
)

# Seconds to wait on any single HTTP request before giving up on it.
_REQUEST_TIMEOUT_S = 30
# Each page is truncated to this many characters before emoji removal.
_PAGE_CHAR_LIMIT = 1050
# Pages whose cleaned text is shorter than this are discarded.
_MIN_DOC_LENGTH = 1000

# Compiled once at import time instead of on every remove_emoji() call.
# NOTE: several ranges are far wider than "emoji". U+24C2-U+1F251 spans most
# of the BMP (CJK ideographs included) and U+10000-U+10FFFF covers every
# supplementary plane, so text in those scripts is stripped as well.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    re.UNICODE,
)

# Reproduced exactly as it appears in the source this file was recovered from.
# NOTE(review): any line breaks the prompt originally contained were lost when
# the file was collapsed; confirm the intended layout.
_LABEL_PROMPT = (
    " Q: I have a topic that contains the following documents: [DOCUMENTS]"
    " The topic is described by the following keywords: '[KEYWORDS]'."
    " Based on the above information, can you give a short label of the"
    " topic of at most 5 words? A: "
)


def get_links(pdf_path=_DEFAULT_PDF_PATH):
    """Return every link found in the tables of the bookmarks PDF.

    Args:
        pdf_path: Path of the PDF to read. Defaults to the bookmarks list
            shipped alongside this script.

    Returns:
        A list with the values of the ``'Unnamed: 2'`` column of the first
        extracted table followed by the ``'Url'`` column of every later table.

    Raises:
        IndexError: If no table could be extracted from the PDF.
        KeyError: If a table lacks the expected column.
    """
    tables = read_pdf(pdf_path, pages="all")
    # The first table exposes the link column under a different name than
    # the following ones, so it is read separately.
    links = tables[0]["Unnamed: 2"].to_list()
    for table in tables[1:]:
        links.extend(table["Url"].to_list())
    return links


# --------------------------------------
# text processing


def remove_tags(html):
    """Return the visible text of *html* as one space-separated string.

    ``<style>`` and ``<script>`` elements are dropped entirely before the
    remaining text fragments are stripped and joined.
    """
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["style", "script"]):
        tag.decompose()
    return " ".join(soup.stripped_strings)


def remove_emoji(data):
    """Return *data* with every character matched by ``_EMOJI_PATTERN`` removed."""
    return _EMOJI_PATTERN.sub("", data)


# -------------------------------------


def get_page(link):
    """Download *link* and return a cleaned excerpt of its text.

    The page is stripped of markup, truncated to ``_PAGE_CHAR_LIMIT``
    characters and then has emoji removed.

    Returns:
        The cleaned text, or ``None`` if anything went wrong; the failing
        link is printed so it can be inspected afterwards.
    """
    try:
        response = requests.get(link, timeout=_REQUEST_TIMEOUT_S)
        excerpt = remove_tags(response.text)[:_PAGE_CHAR_LIMIT]
        return remove_emoji(excerpt)
    except Exception:
        # Deliberately best-effort: one bad link must not abort the scrape.
        # `Exception` (not a bare except) keeps Ctrl+C and SystemExit working.
        print(link)
        return None


def get_documents(links):
    """Fetch every link and keep only the pages with enough text.

    Args:
        links: Iterable of URLs to download.

    Returns:
        A list of cleaned page excerpts, in input order, excluding pages
        that failed to download and those shorter than ``_MIN_DOC_LENGTH``
        characters.
    """
    pages = (get_page(link) for link in links)
    return [
        page
        for page in pages
        if page is not None and len(page) >= _MIN_DOC_LENGTH
    ]


# ----------------------------------------


def get_topics(docs, model_path=_DEFAULT_LLM_PATH):
    """Fit a BERTopic model on *docs* with KeyBERT and LLM topic labels.

    Args:
        docs: List of document strings to cluster.
        model_path: Path of the GGUF model loaded through llama.cpp to
            generate the topic labels.

    Returns:
        The fitted ``BERTopic`` model. The 2-D UMAP projection of the
        document embeddings is attached to it as ``reduced_embeddings_`` so
        that ``get_figure`` can plot the documents without recomputing it.
    """
    # Use llama.cpp to load in a Quantized LLM
    llm = Llama(
        model_path=model_path,
        n_gpu_layers=-1,
        n_ctx=4096,
        stop=["Q:", "\n"],
    )
    representation_model = {
        "KeyBERT": KeyBERTInspired(),
        "LLM": LlamaCPP(llm, prompt=_LABEL_PROMPT),
    }

    # Pre-calculate embeddings
    embedding_model = SentenceTransformer("BAAI/bge-small-en")
    embeddings = embedding_model.encode(docs, show_progress_bar=True)

    # Pre-reduce embeddings for visualization purposes
    reduced_embeddings = UMAP(
        n_neighbors=15,
        n_components=2,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
    ).fit_transform(embeddings)

    # Define sub-models
    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
    )
    hdbscan_model = HDBSCAN(
        min_cluster_size=2,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )

    topic_model = BERTopic(
        # Sub-models
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        # Hyperparameters
        top_n_words=10,
        verbose=True,
    )

    # Train model; the per-document topics stay available as
    # `topic_model.topics_`, so the return values are not kept here.
    topic_model.fit_transform(docs, embeddings)

    # Only the model is returned, so the 2-D projection travels with it.
    topic_model.reduced_embeddings_ = reduced_embeddings
    return topic_model


# -------------------------------
# Visualize Topics


def get_figure(
    topic_model,
    topics=None,
    reduced_embeddings=None,
    *,
    title="ArXiv - BERTopic",
):
    """Plot the documents in 2-D, labelled with their LLM topic label.

    Args:
        topic_model: Fitted ``BERTopic`` model with an ``"LLM"``
            representation, as returned by ``get_topics``.
        topics: Topic id per document. Defaults to ``topic_model.topics_``.
        reduced_embeddings: 2-D coordinates per document. Defaults to the
            ``reduced_embeddings_`` attribute set by ``get_topics``.
        title: Title shown on the plot.

    Returns:
        Whatever ``datamapplot.create_plot`` returns for the labelled map.

    Raises:
        ValueError: If no 2-D embeddings were passed and none are attached
            to *topic_model*.
    """
    if topics is None:
        topics = topic_model.topics_
    if reduced_embeddings is None:
        reduced_embeddings = getattr(topic_model, "reduced_embeddings_", None)
    if reduced_embeddings is None:
        raise ValueError(
            "reduced_embeddings was not given and topic_model has no "
            "'reduced_embeddings_' attribute; pass the 2-D embeddings "
            "explicitly or build the model with get_topics()."
        )

    # Prepare logo
    logo_response = requests.get(
        _LOGO_URL,
        stream=True,
        headers={"User-Agent": "My User Agent 1.0"},
        timeout=_REQUEST_TIMEOUT_S,
    )
    bertopic_logo = np.asarray(PIL.Image.open(logo_response.raw))

    # One cleaned label per topic: first line of the LLM output, quotes
    # removed and every run of non-word characters collapsed to a space.
    llm_labels = [
        re.sub(r"\W+", " ", label[0][0].split("\n")[0].replace('"', ""))
        for label in topic_model.get_topics(full=True)["LLM"].values()
    ]
    llm_labels = [label if label else "Unlabelled" for label in llm_labels]

    # Create a label for each document. `_outliers` offsets the index to
    # skip the outlier topic's entry at the front of `llm_labels`.
    all_labels = [
        llm_labels[topic + topic_model._outliers]
        if topic != -1
        else "Unlabelled"
        for topic in topics
    ]

    # Run the visualization
    fig = datamapplot.create_plot(
        reduced_embeddings,
        all_labels,
        label_font_size=11,
        title=title,
        sub_title="Topics labeled with `openhermes-2.5-mistral-7b`",
        label_wrap_width=20,
        use_medoids=True,
        logo=bertopic_logo,
        logo_width=0.16,
    )
    return fig