Spaces:
Sleeping
Sleeping
File size: 4,975 Bytes
7189553 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
from tabula import read_pdf
from bs4 import BeautifulSoup
import requests
from llama_cpp import Llama
from bertopic.representation import KeyBERTInspired, LlamaCPP
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
import PIL
import numpy as np
import datamapplot
import re
def get_links():
    """Extract bookmark URLs from the AI awesome-list PDF.

    Reads every table in the local PDF. The first table has no header
    row, so tabula exposes its URL column as 'Unnamed: 2'; the remaining
    tables carry a proper 'Url' header.

    Returns:
        list: all URLs found, in table order.
    """
    # One DataFrame per table detected in the PDF.
    dfs = read_pdf("Artificial_Intelligence_Bookmarks_AwesomeList.pdf", pages="all")
    links = dfs[0]['Unnamed: 2'].to_list()
    # Iterate the remaining tables directly instead of index arithmetic
    # (original used range(len(dfs)-1) with dfs[i+1]).
    for df in dfs[1:]:
        links.extend(df['Url'].to_list())
    return links
#--------------------------------------
# text processing
def remove_tags(html):
    """Strip markup from an HTML document and return its visible text.

    Style and script elements are removed entirely (their contents are
    code, not page text); the remaining text fragments are joined with
    single spaces.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Destroy non-content elements before extracting text.
    for non_content in soup.find_all(['style', 'script']):
        non_content.decompose()
    # stripped_strings yields each whitespace-trimmed text fragment.
    fragments = soup.stripped_strings
    return ' '.join(fragments)
def remove_emoji(data):
    """Delete emoji and pictographic symbols from *data*.

    The character class covers the common emoji code-point ranges:
    emoticons, pictographs, transport/map symbols, flags, dingbats and
    assorted miscellaneous symbols.
    """
    # Adjacent string literals concatenate into one big character-class body.
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
    )
    pattern = re.compile("[" + emoji_ranges + "]+", re.UNICODE)
    return pattern.sub('', data)
#-------------------------------------
def get_page(link):
    """Fetch *link* and return up to 1050 chars of cleaned page text.

    Args:
        link: URL to download.

    Returns:
        Tag- and emoji-stripped text (max 1050 chars), or None when the
        request fails — the failing URL is printed for later inspection
        and the caller filters the Nones out.
    """
    try:
        # Bound the request so one dead host can't hang the whole crawl.
        response = requests.get(link, timeout=30)
        raw_html = response.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP failures are
        # expected here; best-effort crawl, so just report and skip.
        print(link)
        return None
    clean_text = remove_tags(raw_html)[:1050]
    clean_text = remove_emoji(clean_text)
    return clean_text
def get_documents(links):
    """Download and clean every page in *links*.

    Failed downloads (None from get_page) and pages that yielded fewer
    than 1000 characters of text are dropped — short texts are typically
    error or placeholder pages.

    Returns:
        list: cleaned page texts, each at least 1000 characters long.
    """
    fetched = (get_page(link) for link in links)
    # Single pass replaces the original O(n^2) repeated
    # `while None in ...: .remove(None)` scan.
    return [text for text in fetched if text is not None and len(text) > 999]
#----------------------------------------
def get_topics(docs):
    """Fit a BERTopic model over *docs* and return the trained model.

    Topics are represented two ways: KeyBERT-inspired keywords, and a
    short label written by a quantized LLM (OpenHermes 2.5 Mistral 7B
    via llama.cpp) from a Q/A-style prompt.

    Args:
        docs: list of cleaned document strings.

    Returns:
        BERTopic: the fitted topic model.

    NOTE(review): ``reduced_embeddings`` (the 2-D projection used for
    plotting) is computed here but not returned; get_figure needs it, so
    it must be recomputed or passed along by the caller — confirm intent.
    """
    # Use llama.cpp to load in a quantized LLM; n_gpu_layers=-1 offloads
    # all layers to GPU, n_ctx=4096 bounds the context window, and
    # generation stops at "Q:" or a newline so only the label comes back.
    llm = Llama(model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop=["Q:", "\n"])
    prompt = """ Q:
I have a topic that contains the following documents:
[DOCUMENTS]
The topic is described by the following keywords: '[KEYWORDS]'.
Based on the above information, can you give a short label of the topic of at most 5 words?
A:
"""
    representation_model = {
        "KeyBERT": KeyBERTInspired(),
        # NOTE(review): the original line was truncated mid-token
        # ("Llam") — this reconstructs the intended BERTopic llama.cpp
        # wrapper fed with the Q/A prompt above.
        "LLM": LlamaCPP(llm, prompt=prompt),
    }
    # Pre-calculate embeddings once so BERTopic doesn't re-embed.
    embedding_model = SentenceTransformer("BAAI/bge-small-en")
    embeddings = embedding_model.encode(docs, show_progress_bar=True)
    # Pre-reduce embeddings to 2-D for visualization purposes.
    reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)
    # Sub-models: 5-D UMAP for clustering; HDBSCAN with tiny clusters
    # (min_cluster_size=2) because the corpus is small.
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    topic_model = BERTopic(
        # Sub-models
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        # Hyperparameters
        top_n_words=10,
        verbose=True,
    )
    # Train model
    topics, probs = topic_model.fit_transform(docs, embeddings)
    return topic_model
#-------------------------------
# Visualize Topics
def get_figure(topic_model, topics=None, reduced_embeddings=None):
    """Render a datamapplot figure of the topic landscape.

    Args:
        topic_model: fitted BERTopic model carrying an "LLM"
            representation (see get_topics).
        topics: per-document topic assignments from fit_transform.
            NOTE(review): the original body referenced ``topics`` as a
            free variable (a NameError) — it is now an explicit
            parameter and must be supplied by the caller.
        reduced_embeddings: 2-D projection of the document embeddings,
            one row per document (also a free variable originally).

    Returns:
        The figure produced by datamapplot.create_plot.
    """
    # Prepare logo: stream=True keeps the raw byte stream open for PIL,
    # and a UA header avoids anonymous-client rejections.
    bertopic_logo_response = requests.get(
        "https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/logo.png",
        stream=True,
        headers={'User-Agent': 'My User Agent 1.0'}
    )
    bertopic_logo = np.asarray(PIL.Image.open(bertopic_logo_response.raw))
    # One LLM-generated label per topic: keep only the first output line,
    # drop quotes, and collapse non-word characters to spaces.
    llm_labels = [re.sub(r'\W+', ' ', label[0][0].split("\n")[0].replace('"', '')) for label in topic_model.get_topics(full=True)["LLM"].values()]
    llm_labels = [label if label else "Unlabelled" for label in llm_labels]
    # Map each document's topic id to its label; the _outliers offset
    # accounts for topic -1 occupying the first slot, and outlier docs
    # themselves stay "Unlabelled".
    all_labels = [llm_labels[topic + topic_model._outliers] if topic != -1 else "Unlabelled" for topic in topics]
    # Run the visualization
    fig = datamapplot.create_plot(
        reduced_embeddings,
        all_labels,
        label_font_size=11,
        title="ArXiv - BERTopic",
        sub_title="Topics labeled with `openhermes-2.5-mistral-7b`",
        label_wrap_width=20,
        use_medoids=True,
        logo=bertopic_logo,
        logo_width=0.16
    )
    return fig
|