Polo123 committed on
Commit
7189553
·
verified ·
1 Parent(s): b9a7684

Create logic.py

Browse files
Files changed (1) hide show
  1. logic.py +161 -0
logic.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tabula import read_pdf
2
+ from bs4 import BeautifulSoup
3
+ import requests
4
+
5
+ from llama_cpp import Llama
6
+ from bertopic.representation import KeyBERTInspired, LlamaCPP
7
+ from sentence_transformers import SentenceTransformer
8
+ from umap import UMAP
9
+ from hdbscan import HDBSCAN
10
+ from bertopic import BERTopic
11
+
12
+ import PIL
13
+ import numpy as np
14
+ import datamapplot
15
+ import re
16
+
17
def get_links(pdf_path="Artificial_Intelligence_Bookmarks_AwesomeList.pdf"):
    """Collect the link column from every table extracted from a PDF.

    Args:
        pdf_path: Path of the PDF to read. Defaults to the bookmark list
            this module was written for.

    Returns:
        A flat list with the values of the link column of each table, in
        page order. An empty list is returned when no table is found.
    """
    # reads every table from the pdf file (one DataFrame per table)
    tables = read_pdf(pdf_path, pages="all")
    if not tables:
        return []

    # The first table exposes its link column as 'Unnamed: 2', every later
    # table as 'Url'.
    # NOTE(review): presumably the first table's header row is not detected
    # by tabula -- confirm against the PDF.
    links = tables[0]['Unnamed: 2'].to_list()
    for table in tables[1:]:
        links.extend(table['Url'].to_list())
    return links
24
+
25
+ #--------------------------------------
26
+ # text processing
27
+
28
def remove_tags(html):
    """Return the text content of an HTML document as a single string.

    Args:
        html: Raw HTML markup.

    Returns:
        All text fragments of the parsed document, each stripped of
        surrounding whitespace and joined with single spaces. The contents
        of ``<style>`` and ``<script>`` elements are excluded.
    """
    # parse html content
    soup = BeautifulSoup(html, "html.parser")

    # Drop CSS and JavaScript so their source does not leak into the text.
    for element in soup(['style', 'script']):
        element.decompose()

    # return data by retrieving the tag content
    return ' '.join(soup.stripped_strings)
39
+
40
# Built once at import time instead of on every call.
#
# NOTE(review): the ranges overlap heavily. Because of
# U+24C2-U+1F251 and U+10000-U+10FFFF the class effectively matches every
# code point from U+24C2 upwards, which also covers CJK ideographs, Hangul
# and other non-emoji scripts. Behaviour is kept as-is here; narrow the
# ranges if non-Latin text must survive.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # very wide range, see note above
    "\U0001f926-\U0001f937"  # gesture emoji
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"  # gender signs
    "\u2600-\u2B55"  # misc symbols
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"  # wavy dash
    "]+",
    re.UNICODE,
)


def remove_emoji(data):
    """Strip emoji and other symbol characters from a string.

    Args:
        data: Text to clean.

    Returns:
        ``data`` with every run of characters matched by the emoji pattern
        removed. Surrounding characters are left untouched, so whitespace
        on both sides of a removed run is preserved.
    """
    return _EMOJI_PATTERN.sub('', data)
61
+
62
+ #-------------------------------------
63
+
64
def get_page(link, timeout=30):
    """Download a page and return a short, cleaned text excerpt of it.

    Args:
        link: URL to fetch.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the whole crawl.

    Returns:
        The first 1050 characters of the page text with emoji removed, or
        ``None`` when the page could not be fetched or processed. Emoji are
        removed after truncation, so the result may be shorter than 1050
        characters.
    """
    try:
        response = requests.get(link, timeout=timeout)
        clean_text = remove_tags(response.text)[:1050]
        return remove_emoji(clean_text)
    except Exception:
        # Best-effort crawl: report the failing link and let the caller
        # skip it. `Exception` (rather than a bare except) keeps Ctrl-C and
        # interpreter shutdown working.
        print(link)
        return None
74
+
75
def get_documents(links):
    """Fetch every link and keep only pages with enough text.

    Args:
        links: Iterable of URLs.

    Returns:
        The cleaned text of each page, in input order, skipping pages that
        failed to download (``get_page`` returned ``None``) and pages whose
        cleaned text is 999 characters or shorter.
    """
    pages = (get_page(link) for link in links)
    # Single pass: drop failed downloads and too-short excerpts together
    # instead of repeatedly scanning the list for None.
    return [text for text in pages if text is not None and len(text) > 999]
81
+
82
+ #----------------------------------------
83
+
84
def get_topics(docs):
    """Fit a BERTopic model on ``docs`` and return it.

    Topics are represented both by KeyBERT-inspired keywords and by a short
    label generated with a local llama.cpp model.

    Args:
        docs: List of document strings.

    Returns:
        The fitted ``BERTopic`` model. The 2-D UMAP projection of the
        document embeddings is attached to it as ``reduced_embeddings_`` so
        that ``get_figure`` can plot the documents without re-embedding.
    """
    # Use llama.cpp to load in a Quantized LLM
    llm = Llama(
        model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf",
        n_gpu_layers=-1,
        n_ctx=4096,
        stop=["Q:", "\n"],
    )

    prompt = """ Q:
I have a topic that contains the following documents:
[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the above information, can you give a short label of the topic of at most 5 words?
A:
"""

    representation_model = {
        "KeyBERT": KeyBERTInspired(),
        "LLM": LlamaCPP(llm, prompt=prompt),
    }

    # Pre-calculate embeddings
    embedding_model = SentenceTransformer("BAAI/bge-small-en")
    embeddings = embedding_model.encode(docs, show_progress_bar=True)

    # Pre-reduce embeddings for visualization purposes
    reduced_embeddings = UMAP(
        n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42
    ).fit_transform(embeddings)

    # Define sub-models
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(
        min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True
    )

    topic_model = BERTopic(

        # Sub-models
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,

        # Hyperparameters
        top_n_words=10,
        verbose=True
    )

    # Train model
    topic_model.fit_transform(docs, embeddings)

    # Keep the 2-D projection with the model; get_figure needs it and only
    # receives the model.
    topic_model.reduced_embeddings_ = reduced_embeddings

    return topic_model

#-------------------------------
# Visualize Topics
def get_figure(topic_model, reduced_embeddings=None, topics=None):
    """Draw a labelled 2-D map of the documents of a fitted topic model.

    Args:
        topic_model: Fitted ``BERTopic`` model with an "LLM" representation.
        reduced_embeddings: 2-D coordinates, one row per document. When
            omitted, ``topic_model.reduced_embeddings_`` (set by
            ``get_topics``) is used.
        topics: Topic id per document. When omitted, ``topic_model.topics_``
            is used.

    Returns:
        Whatever ``datamapplot.create_plot`` returns.
        NOTE(review): datamapplot documents this as a ``(figure, axes)``
        pair rather than a bare figure -- confirm what callers expect.

    Raises:
        ValueError: If no 2-D embeddings are given or stored on the model.
    """
    # `import PIL` alone does not guarantee the Image submodule is loaded.
    from PIL import Image

    if reduced_embeddings is None:
        reduced_embeddings = getattr(topic_model, "reduced_embeddings_", None)
    if reduced_embeddings is None:
        raise ValueError(
            "reduced_embeddings was not given and topic_model has no "
            "'reduced_embeddings_' attribute"
        )
    if topics is None:
        topics = topic_model.topics_

    # Prepare logo
    bertopic_logo_response = requests.get(
        "https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/logo.png",
        stream=True,
        headers={'User-Agent': 'My User Agent 1.0'},
        timeout=30,
    )
    # Fail with a clear HTTP error instead of an image-decoding error.
    bertopic_logo_response.raise_for_status()
    bertopic_logo = np.asarray(Image.open(bertopic_logo_response.raw))

    # Create a label for each document
    # First line of each LLM label, with quotes and punctuation stripped.
    llm_labels = [
        re.sub(r'\W+', ' ', label[0][0].split("\n")[0].replace('"', ''))
        for label in topic_model.get_topics(full=True)["LLM"].values()
    ]
    llm_labels = [label if label else "Unlabelled" for label in llm_labels]
    # Topic ids are shifted by `_outliers` to index llm_labels; documents in
    # the outlier topic (-1) get the generic label.
    all_labels = [
        llm_labels[topic + topic_model._outliers] if topic != -1 else "Unlabelled"
        for topic in topics
    ]

    # Run the visualization
    fig = datamapplot.create_plot(
        reduced_embeddings,
        all_labels,
        label_font_size=11,
        title="ArXiv - BERTopic",
        sub_title="Topics labeled with `openhermes-2.5-mistral-7b`",
        label_wrap_width=20,
        use_medoids=True,
        logo=bertopic_logo,
        logo_width=0.16
    )

    return fig