Ashoka74 committed (verified) · Commit 07e89d3 · Parent(s): 254b4f5

create app.py

Files changed (1): app.py (+1361, -0)
app.py ADDED
@@ -0,0 +1,1361 @@
import os
import time
import json
import pickle
import textwrap
import logging
import concurrent.futures

import pandas as pd
import numpy as np
import torch
import umap
import hdbscan
import squarify
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import plotly.graph_objects as go
import plotly.express as px
import streamlit as st

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import chi2_contingency
from statsmodels.graphics.mosaicplot import mosaic
from statsmodels.api import stats
from xgboost import plot_importance
from Levenshtein import distance
from requests.exceptions import HTTPError
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import pytorch_cos_sim, pairwise_cos_sim
from stqdm import stqdm

stqdm.pandas()

# Load the shared sentence-embedding model once at import time; inference only,
# so no gradients are needed and the model is moved to the GPU.
with torch.no_grad():
    embed_model = SentenceTransformer('embaas/sentence-transformers-e5-large-v2')
    embed_model.to('cuda')

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class UAPAnalyzer:
    """
    A class for analyzing and clustering textual data within a pandas DataFrame using
    Natural Language Processing (NLP) techniques and machine learning models.

    Attributes:
        data (pd.DataFrame): The dataset containing textual data for analysis.
        column (str): The name of the column in the DataFrame to be analyzed.
        embeddings (np.ndarray): The vector representations of textual data.
        reduced_embeddings (np.ndarray): The dimensionality-reduced embeddings.
        cluster_labels (np.ndarray): The labels assigned to each data point after clustering.
        cluster_terms (list): The list of terms associated with each cluster.
        tfidf_matrix (sparse matrix): The Term Frequency-Inverse Document Frequency (TF-IDF) matrix.
        models (dict): A dictionary to store trained machine learning models.
        evaluations (dict): A dictionary to store evaluation results of models.
        data_nums (pd.DataFrame): The DataFrame with numerical encoding of categorical data.
    """

    def __init__(self, data, column, has_embeddings=False):
        """
        Initializes the UAPAnalyzer with a dataset and a specified column for analysis.

        Args:
            data (pd.DataFrame): The dataset for analysis.
            column (str): The column within the dataset to analyze.
            has_embeddings (bool): Whether the DataFrame already contains an 'embeddings' column.
        """
        assert isinstance(data, pd.DataFrame), "Data must be a pandas DataFrame"
        assert column in data.columns, f"Column '{column}' not found in DataFrame"
        self.has_embeddings = has_embeddings
        self.data = data
        self.column = column
        self.embeddings = None
        self.reduced_embeddings = None
        self.cluster_labels = None
        self.cluster_names = None
        self.cluster_terms = None
        self.cluster_terms_embeddings = None
        self.tfidf_matrix = None
        self.models = {}  # To store trained models
        self.evaluations = {}  # To store evaluation results
        self.data_nums = None  # Encoded numerical data
        self.x_train = None
        self.y_train = None
        self.x_test = None
        self.y_test = None
        self.preds = None
        self.new_dataset = None
        self.model = SentenceTransformer('embaas/sentence-transformers-e5-large-v2')
        self.model = self.model.to('cuda')
        #self.cluster_names_ = pd.DataFrame()

        logging.info("UAPAnalyzer initialized")

    def preprocess_data(self, trim=False, has_embeddings=False, top_n=32):
        """
        Preprocesses the data by optionally trimming the dataset to include only the top N labels and extracting embeddings.

        Args:
            trim (bool): Whether to trim the dataset to include only the top N labels.
            has_embeddings (bool): Whether to reuse precomputed embeddings from an 'embeddings' column.
            top_n (int): The number of top labels to retain if trimming is enabled.
        """
        logging.info("Preprocessing data")

        if trim:
            # Identify the top labels based on value counts
            top_labels = self.data[self.column].value_counts().nlargest(top_n).index.tolist()
            # Revise the column data, setting values to 'Other' if they are not in the top labels
            self.data[f'{self.column}_revised'] = np.where(self.data[self.column].isin(top_labels), self.data[self.column], 'Other')

        # Convert the column data to string type before extracting embeddings,
        # which is useful if the column is not originally of string dtype
        string_data = self.data[f'{self.column}'].astype(str)
        # Either reuse precomputed embeddings or encode the column text
        if has_embeddings:
            self.embeddings = self.data['embeddings'].to_list()
        else:
            self.embeddings = self._extract_embeddings(string_data)
        logging.info("Data preprocessing complete")

    def _extract_embeddings(self, data_column):
        """
        Extracts embeddings from the given data column.

        Args:
            data_column (pd.Series): The column from which to extract embeddings.

        Returns:
            np.ndarray: The extracted embeddings.
        """
        logging.info("Extracting embeddings")
        return embed_model.encode(data_column.tolist(), show_progress_bar=True)

    def reduce_dimensionality(self, method='UMAP', n_components=2, **kwargs):
        """
        Reduces the dimensionality of embeddings using the specified method.

        Args:
            method (str): The dimensionality reduction method to use ('UMAP' or 'PCA').
            n_components (int): The number of dimensions to reduce to.
            **kwargs: Additional keyword arguments for the dimensionality reduction method.
        """
        logging.info(f"Reducing dimensionality using {method}")
        if method == 'UMAP':
            reducer = umap.UMAP(n_components=n_components, **kwargs)
        elif method == 'PCA':
            reducer = PCA(n_components=n_components)
        else:
            raise ValueError("Unsupported dimensionality reduction method")

        self.reduced_embeddings = reducer.fit_transform(self.embeddings)
        logging.info(f"Dimensionality reduced using {method}")

    def cluster_data(self, method='HDBSCAN', **kwargs):
        """
        Clusters the reduced dimensionality data using the specified clustering method.

        Args:
            method (str): The clustering method to use ('HDBSCAN' or 'KMeans').
            **kwargs: Additional keyword arguments for the clustering method.
        """
        logging.info(f"Clustering data using {method}")
        if method == 'HDBSCAN':
            clusterer = hdbscan.HDBSCAN(**kwargs)
        elif method == 'KMeans':
            clusterer = KMeans(**kwargs)
        else:
            raise ValueError("Unsupported clustering method")

        clusterer.fit(self.reduced_embeddings)
        self.cluster_labels = clusterer.labels_
        logging.info(f"Data clustering complete using {method}")

    def get_tf_idf_clusters(self, top_n=2):
        """
        Names clusters using the most frequent terms based on TF-IDF analysis.

        Args:
            top_n (int): The number of top terms to consider for naming each cluster.
        """
        logging.info("Naming clusters based on top TF-IDF terms.")

        # Ensure data has been clustered
        assert self.cluster_labels is not None, "Data has not been clustered yet."
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

        # Fit the vectorizer to the text data and transform it into a TF-IDF matrix
        tfidf_matrix = vectorizer.fit_transform(self.data[f'{self.column}'].astype(str))

        # Initialize an empty list to store the cluster terms
        self.cluster_terms = []

        for cluster_id in np.unique(self.cluster_labels):
            # Skip noise if present (-1 in HDBSCAN)
            if cluster_id == -1:
                continue

            # Find indices of documents in the current cluster
            indices = np.where(self.cluster_labels == cluster_id)[0]

            # Compute the mean TF-IDF score for each term in the cluster
            cluster_tfidf_mean = np.mean(tfidf_matrix[indices], axis=0)

            # Ensure the mean vector is a dense array that supports indexing
            if hasattr(cluster_tfidf_mean, "toarray"):
                dense_mean = cluster_tfidf_mean.toarray().flatten()
            else:
                dense_mean = np.asarray(cluster_tfidf_mean).flatten()

            # Get the indices of the top_n terms
            top_n_indices = np.argsort(dense_mean)[-top_n:]

            # Get the corresponding terms for these top indices
            terms = vectorizer.get_feature_names_out()
            top_terms = [terms[i] for i in top_n_indices]

            # Join the top_n terms with a hyphen
            cluster_name = '-'.join(top_terms)

            # Append the cluster name to the list
            self.cluster_terms.append(cluster_name)

        # Convert the list of cluster terms to a categorical data type
        self.cluster_terms = pd.Categorical(self.cluster_terms)
        logging.info("Cluster naming completed.")

    def merge_similar_clusters(self, distance='cosine', char_diff_threshold=3, similarity_threshold=0.92, embeddings='SBERT'):
        """
        Merges similar clusters based on Levenshtein distance or cosine similarity of their associated terms.

        Args:
            distance (str): The distance measure to use ('levenshtein' or 'cosine').
            char_diff_threshold (int): Maximum Levenshtein distance for two cluster names to be merged.
            similarity_threshold (float): The similarity threshold above which clusters are considered similar enough to merge.

        Note: this method is overridden by the keyword-argument version defined later in the class.
        """
        from collections import defaultdict
        from Levenshtein import distance as levenshtein_distance  # avoid shadowing by the `distance` argument
        logging.info("Merging similar clusters")

        # A mapping from cluster names to a set of cluster names to be merged
        merge_mapping = defaultdict(set)
        merge_labels = defaultdict(set)

        if distance == 'levenshtein':
            for i, name1 in enumerate(self.cluster_terms):
                for j, name2 in enumerate(self.cluster_terms[i + 1:], start=i + 1):
                    dist = levenshtein_distance(name1, name2)
                    if dist <= char_diff_threshold:
                        logging.info(f"Merging '{name2}' into '{name1}'")
                        merge_mapping[name1].add(name2)

        elif distance == 'cosine':
            self.cluster_terms_embeddings = embed_model.encode(self.cluster_terms)
            cos_sim_matrix = pytorch_cos_sim(self.cluster_terms_embeddings, self.cluster_terms_embeddings)
            for i, name1 in enumerate(self.cluster_terms):
                for j, name2 in enumerate(self.cluster_terms[i + 1:], start=i + 1):
                    if cos_sim_matrix[i][j] > similarity_threshold:
                        logging.info(f"Merging cluster '{name2}' into cluster '{name1}' based on cosine similarity")
                        merge_mapping[name1].add(name2)

        # Flatten the merge mapping to a simple name change mapping
        name_change_mapping = {}
        for cluster_name, merges in merge_mapping.items():
            for merge_name in merges:
                name_change_mapping[merge_name] = cluster_name

        # Update cluster terms based on name changes
        updated_cluster_terms = []
        original_to_updated_index = {}
        for i, name in enumerate(self.cluster_terms):
            updated_name = name_change_mapping.get(name, name)
            if updated_name not in updated_cluster_terms:
                updated_cluster_terms.append(updated_name)
                original_to_updated_index[i] = len(updated_cluster_terms) - 1
            else:
                updated_index = updated_cluster_terms.index(updated_name)
                original_to_updated_index[i] = updated_index

        self.cluster_terms = updated_cluster_terms  # Update cluster terms with merged names
        # Update cluster labels according to the new index mapping; unmatched labels (noise) become -1
        self.cluster_labels = np.array([original_to_updated_index[label] if label in original_to_updated_index else -1 for label in self.cluster_labels])

        # Log the total number of merges
        total_merges = sum(len(merges) for merges in merge_mapping.values())
        logging.info(f"Total clusters merged: {total_merges}")

        # Re-index labels so they are contiguous, and expand terms to one entry per data point
        unique_labels = np.unique(self.cluster_labels)
        label_to_index = {label: index for index, label in enumerate(unique_labels)}
        self.cluster_labels = np.array([label_to_index[label] for label in self.cluster_labels])
        self.cluster_terms = [self.cluster_terms[label_to_index[label]] for label in self.cluster_labels]

    def merge_similar_clusters2(self, distance='cosine', char_diff_threshold=3, similarity_threshold=0.92):
        logging.info("Merging similar clusters based on distance: {}".format(distance))
        from collections import defaultdict
        from Levenshtein import distance as levenshtein_distance  # avoid shadowing by the `distance` argument
        merge_mapping = defaultdict(set)

        if distance == 'levenshtein':
            for i, name1 in enumerate(self.cluster_terms):
                for j, name2 in enumerate(self.cluster_terms[i + 1:], start=i + 1):
                    dist = levenshtein_distance(name1, name2)
                    if dist <= char_diff_threshold:
                        merge_mapping[name1].add(name2)
                        logging.info(f"Merging '{name2}' into '{name1}' based on Levenshtein distance")

        elif distance == 'cosine':
            if self.cluster_terms_embeddings is None:
                self.cluster_terms_embeddings = embed_model.encode(self.cluster_terms)
            cos_sim_matrix = pytorch_cos_sim(self.cluster_terms_embeddings, self.cluster_terms_embeddings)
            for i in range(len(self.cluster_terms)):
                for j in range(i + 1, len(self.cluster_terms)):
                    if cos_sim_matrix[i][j] > similarity_threshold:
                        merge_mapping[self.cluster_terms[i]].add(self.cluster_terms[j])
                        logging.info(f"Merging cluster '{self.cluster_terms[j]}' into cluster '{self.cluster_terms[i]}'")

        self._update_cluster_terms_and_labels(merge_mapping)

    def _update_cluster_terms_and_labels(self, merge_mapping):
        # Flatten the merge mapping to a simple name change mapping
        name_change_mapping = {old: new for new, olds in merge_mapping.items() for old in olds}
        # Collect the set of merge targets
        unique_new_terms = list(set(name_change_mapping.values()))
        # Replace merged terms with their target term; otherwise keep the original term
        self.cluster_terms = [name_change_mapping.get(term, term) for term in self.cluster_terms]
        self.cluster_labels = [unique_new_terms.index(term) if term in unique_new_terms else -1 for term in self.cluster_terms]

        logging.info(f"Total clusters merged: {len(merge_mapping)}")

    def cluster_levenshtein(self, cluster_terms, cluster_labels, char_diff_threshold=3):
        from Levenshtein import distance  # Levenshtein edit distance between cluster names

        merge_map = {}
        # Iterate over term pairs and decide on merging based on the edit distance
        for idx, term1 in enumerate(cluster_terms):
            for jdx, term2 in enumerate(cluster_terms):
                if idx < jdx and distance(term1, term2) <= char_diff_threshold:
                    labels_to_merge = [label for label, term_index in enumerate(cluster_labels) if term_index == jdx]
                    for label in labels_to_merge:
                        merge_map[label] = idx  # Map the label to use the term index of term1
                    logging.info(f"Merging '{term2}' into '{term1}'")
                    st.write(f"Merging '{term2}' into '{term1}'")
        # Update the cluster labels
        updated_cluster_labels = [merge_map.get(label, label) for label in cluster_labels]
        # Update string labels to reflect merged labels
        updated_string_labels = [cluster_terms[label] for label in updated_cluster_labels]
        return updated_string_labels

    def cluster_cosine(self, cluster_terms, cluster_labels, similarity_threshold):
        from sklearn.metrics.pairwise import cosine_similarity

        cluster_terms_embeddings = embed_model.encode(cluster_terms)
        # Compute the cosine similarity matrix in a vectorized form
        cos_sim_matrix = cosine_similarity(cluster_terms_embeddings, cluster_terms_embeddings)

        merge_map = {}
        n_terms = len(cluster_terms)
        # Iterate only over the upper triangle (excluding the diagonal) to avoid redundant comparisons
        for idx in range(n_terms):
            for jdx in range(idx + 1, n_terms):
                if cos_sim_matrix[idx, jdx] >= similarity_threshold:
                    labels_to_merge = [label for label, term_index in enumerate(cluster_labels) if term_index == jdx]
                    for label in labels_to_merge:
                        merge_map[label] = idx
                    st.write(f"Merging '{cluster_terms[jdx]}' into '{cluster_terms[idx]}'")
                    logging.info(f"Merging '{cluster_terms[jdx]}' into '{cluster_terms[idx]}'")
        # Update the cluster labels
        updated_cluster_labels = [merge_map.get(label, label) for label in cluster_labels]
        # Update string labels to reflect merged labels
        updated_string_labels = [cluster_terms[label] for label in updated_cluster_labels]
        return updated_string_labels

    def merge_similar_clusters(self, cluster_terms, cluster_labels, distance_type='cosine', char_diff_threshold=3, similarity_threshold=0.92):
        # This keyword-argument version overrides the earlier definition of the same name
        if distance_type == 'levenshtein':
            return self.cluster_levenshtein(cluster_terms, cluster_labels, char_diff_threshold)
        elif distance_type == 'cosine':
            return self.cluster_cosine(cluster_terms, cluster_labels, similarity_threshold)

    def plot_embeddings2(self, title=None):
        assert self.reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
        assert self.cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

        logging.info("Plotting embeddings with TF-IDF colors")

        fig = go.Figure()

        unique_cluster_terms = np.unique(self.cluster_terms)

        for cluster_term in unique_cluster_terms:
            # Indices of the points belonging to the current cluster
            indices = np.where(np.array(self.cluster_terms) == cluster_term)[0]
            if cluster_term != 'Noise':
                # Plot points in the current cluster
                fig.add_trace(
                    go.Scatter(
                        x=self.reduced_embeddings[indices, 0],
                        y=self.reduced_embeddings[indices, 1],
                        mode='markers',
                        marker=dict(
                            size=5,
                            opacity=0.8,
                        ),
                        name=cluster_term,
                        text=self.data[f'{self.column}'].iloc[indices],
                        hoverinfo='text',
                    )
                )
            else:
                # Plot noise points in grey with lower opacity
                fig.add_trace(
                    go.Scatter(
                        x=self.reduced_embeddings[indices, 0],
                        y=self.reduced_embeddings[indices, 1],
                        mode='markers',
                        marker=dict(
                            size=5,
                            opacity=0.5,
                            color='grey'
                        ),
                        name='Noise',
                        text=self.data[f'{self.column}'].iloc[indices],
                        hoverinfo='text',
                    )
                )

        fig.update_layout(title=title, showlegend=True, legend_title_text='Top TF-IDF Terms')
        st.plotly_chart(fig, use_container_width=True)

    def plot_embeddings3(self, title=None):
        assert self.reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
        assert self.cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

        logging.info("Plotting embeddings with TF-IDF colors")

        fig = go.Figure()

        unique_cluster_terms = np.unique(self.cluster_terms)

        # Order in which cluster terms first appear, used to sort the legend traces
        terms_order = {term: i for i, term in enumerate(np.unique(self.cluster_terms, return_index=True)[0])}

        # Handling color assignment, especially for noise
        colors = {term: ('grey' if term == 'Noise' else None) for term in unique_cluster_terms}
        color_map = px.colors.qualitative.Plotly  # Default color map from Plotly Express for consistency

        # Apply a custom color map, handling 'Noise' specifically
        color_idx = 0
        for cluster_term in unique_cluster_terms:
            indices = np.where(np.array(self.cluster_terms) == cluster_term)[0]
            if cluster_term != 'Noise':
                marker_color = color_map[color_idx % len(color_map)]
                color_idx += 1
            else:
                marker_color = 'grey'

            fig.add_trace(
                go.Scatter(
                    x=self.reduced_embeddings[indices, 0],
                    y=self.reduced_embeddings[indices, 1],
                    mode='markers',
                    marker=dict(
                        size=5,
                        opacity=(0.5 if cluster_term == 'Noise' else 0.8),
                        color=marker_color
                    ),
                    name=cluster_term,
                    text=self.data[f'{self.column}'].iloc[indices],
                    hoverinfo='text'
                )
            )
        fig.data = sorted(fig.data, key=lambda trace: terms_order[trace.name])
        fig.update_layout(title=title if title else "Embeddings Visualized", showlegend=True, legend_title_text='Top TF-IDF Terms')
        st.plotly_chart(fig, use_container_width=True)

    def plot_embeddings(self, title=None):
        """
        Plots the reduced dimensionality embeddings with clusters indicated.

        Args:
            title (str): The title of the plot.
        """
        # Ensure dimensionality reduction and TF-IDF based cluster naming have been performed
        assert self.reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
        assert self.cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

        logging.info("Plotting embeddings with TF-IDF colors")

        fig = go.Figure()

        unique_cluster_terms = np.unique(self.cluster_terms)
        unique_cluster_labels = np.unique(self.cluster_labels)

        for i, (cluster_id, cluster_term) in enumerate(zip(unique_cluster_labels, unique_cluster_terms)):
            indices = np.where(self.cluster_labels == cluster_id)[0]

            # Plot points in the current cluster
            fig.add_trace(
                go.Scatter(
                    x=self.reduced_embeddings[indices, 0],
                    y=self.reduced_embeddings[indices, 1],
                    mode='markers',
                    marker=dict(
                        size=5,
                        opacity=0.8,
                    ),
                    name=cluster_term,
                    text=self.data[f'{self.column}'].iloc[indices],
                    hoverinfo='text',
                )
            )

        fig.update_layout(title=title, showlegend=True, legend_title_text='Top TF-IDF Terms')
        st.plotly_chart(fig, use_container_width=True)
        logging.info("Embeddings plotted with TF-IDF colors")

    def plot_embeddings4(self, title=None, cluster_terms=None, cluster_labels=None, reduced_embeddings=None, column=None, data=None):
        """
        Plots the reduced dimensionality embeddings with clusters indicated.

        Args:
            title (str): The title of the plot.
        """
        # Ensure dimensionality reduction and TF-IDF based cluster naming have been performed
        assert reduced_embeddings is not None, "Dimensionality reduction has not been performed yet."
        assert cluster_terms is not None, "Cluster TF-IDF analysis has not been performed yet."

        logging.info("Plotting embeddings with TF-IDF colors")

        fig = go.Figure()

        # Determine unique cluster IDs and terms, and ensure consistent color mapping
        unique_cluster_ids = np.unique(cluster_labels)
        unique_cluster_terms = [cluster_terms[i] for i in unique_cluster_ids]

        color_map = px.colors.qualitative.Plotly  # Using Plotly Express's qualitative colors for consistency
        color_idx = 0

        # Map each cluster ID to a color
        cluster_colors = {}
        for cid in unique_cluster_ids:
            cluster_colors[cid] = color_map[color_idx % len(color_map)]
            color_idx += 1

        for cluster_id, cluster_term in zip(unique_cluster_ids, unique_cluster_terms):
            indices = np.where(cluster_labels == cluster_id)[0]
            fig.add_trace(
                go.Scatter(
                    x=reduced_embeddings[indices, 0],
                    y=reduced_embeddings[indices, 1],
                    mode='markers',
                    marker=dict(
                        color=cluster_colors[cluster_id],
                        size=5,
                        opacity=0.8,
                    ),
                    name=cluster_term,
                    text=data[f'{column}'].iloc[indices],
                    hoverinfo='text',
                )
            )

        fig.update_layout(
            title=title if title else "Embeddings Visualized",
            showlegend=True,
            legend_title_text='Top TF-IDF Terms',
            legend=dict(
                traceorder='normal',  # ensures that traces appear in the order they are added
                itemsizing='constant'
            )
        )
        st.plotly_chart(fig, use_container_width=True)
        logging.info("Embeddings plotted with TF-IDF colors")

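
# A minimal usage sketch of the UAPAnalyzer pipeline (the DataFrame `df` and the
# column name 'Description' below are illustrative placeholders, not values from this repo):
#
#   analyzer = UAPAnalyzer(df, 'Description')
#   analyzer.preprocess_data(top_n=32)                            # embed the text column
#   analyzer.reduce_dimensionality(method='UMAP', n_components=2)
#   analyzer.cluster_data(method='HDBSCAN', min_cluster_size=15)
#   analyzer.get_tf_idf_clusters(top_n=1)                         # name clusters by top TF-IDF term
#   analyzer.plot_embeddings2(title='Description clusters')       # render in Streamlit
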
def analyze_and_predict(data, analyzers, col_names):
    """
    Performs analysis on the data using provided analyzers and makes predictions on specified columns.

    Args:
        data (pd.DataFrame): The dataset for analysis.
        analyzers (list): A list of UAPAnalyzer instances.
        col_names (list): Column names to be analyzed and predicted.
    """
    new_data = pd.DataFrame()
    for i, (column, analyzer) in enumerate(zip(col_names, analyzers)):
        new_data[f'Analyzer_{column}'] = analyzer.__dict__['cluster_terms']
        logging.info(f"Cluster terms extracted for {column}")

    new_data = new_data.fillna('null').astype('category')
    data_nums = new_data.apply(lambda x: x.cat.codes)

    for col in data_nums.columns:
        try:
            categories = new_data[col].cat.categories
            x_train, x_test, y_train, y_test = train_test_split(data_nums.drop(columns=[col]), data_nums[col], test_size=0.2, random_state=42)
            bst, accuracy, preds = train_xgboost(x_train, y_train, x_test, y_test, len(categories))
            plot_results(new_data, bst, x_test, y_test, preds, categories, accuracy, col)
        except Exception as e:
            logging.error(f"Error processing {col}: {e}")
    return new_data


def train_xgboost(x_train, y_train, x_test, y_test, num_classes):
    """
    Trains an XGBoost model and evaluates its performance.

    Args:
        x_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training labels.
        x_test (pd.DataFrame): Test features.
        y_test (pd.Series): Test labels.
        num_classes (int): The number of unique classes in the target variable.

    Returns:
        bst (Booster): The trained XGBoost model.
        accuracy (float): The accuracy of the model on the test set.
        preds (np.ndarray): Predictions made on the test set.
    """
    dtrain = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
    dtest = xgb.DMatrix(x_test, label=y_test)

    params = {'objective': 'multi:softmax', 'num_class': num_classes, 'max_depth': 6, 'eta': 0.3}
    num_round = 100
    bst = xgb.train(dtrain=dtrain, params=params, num_boost_round=num_round)
    preds = bst.predict(dtest)
    accuracy = accuracy_score(y_test, preds)

    logging.info(f"XGBoost trained with accuracy: {accuracy:.2f}")
    return bst, accuracy, preds

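
# Sketch: mapping the integer predictions from 'multi:softmax' back to the original
# category labels (assumes `preds` and `categories` as produced above; purely illustrative):
#
#   predicted_terms = [categories[int(p)] for p in preds]
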
def plot_results(new_data, bst, x_test, y_test, preds, categories, accuracy, col):
    """
    Plots the feature importance, confusion matrix, and contingency table.

    Args:
        new_data (pd.DataFrame): The categorical cluster-term data used for the contingency table.
        bst (Booster): The trained XGBoost model.
        x_test (pd.DataFrame): Test features.
        y_test (pd.Series): Test labels.
        preds (np.array): Predictions made by the model.
        categories (Index): Category names for the target variable.
        accuracy (float): The accuracy of the model on the test set.
        col (str): The target column name being analyzed and predicted.
    """
    fig, axs = plt.subplots(1, 3, figsize=(25, 5), dpi=300)
    fig.suptitle(f'{col.split(sep=".")[-1]} prediction', fontsize=35)

    plot_importance(bst, ax=axs[0], importance_type='gain', show_values=False)
    conf_matrix = confusion_matrix(y_test, preds)
    sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', xticklabels=categories, yticklabels=categories, ax=axs[1])
    axs[1].set_title(f'Confusion Matrix\nAccuracy: {accuracy * 100:.2f}%')
    # Rotate the y-axis tick labels for readability
    axs[1].set_yticklabels(axs[1].get_yticklabels(), rotation=30, ha='right')

    # The most important feature is the first element after sorting by gain
    sorted_features = sorted(bst.get_score(importance_type="gain").items(), key=lambda x: x[1], reverse=True)
    most_important_feature = sorted_features[0][0]
    # Create a contingency table between the target column and its most important predictor
    contingency_table = pd.crosstab(new_data[col], new_data[most_important_feature])

    # Pearson residuals highlight which cells deviate most from independence
    table = stats.Table(contingency_table).resid_pearson
    # Perform the chi-squared test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi-squared test for {col} and {most_important_feature}: p-value = {p}")

    sns.heatmap(table, annot=True, cmap='Greens', ax=axs[2])
    axs[2].set_yticklabels(axs[2].get_yticklabels(), rotation=30, ha='right')
    axs[2].set_title(f'Contingency Table between {col.split(sep=".")[-1]} and {most_important_feature.split(sep=".")[-1]}\np-value = {p}')

    plt.tight_layout()
    return plt

def cramers_v(confusion_matrix):
    """Calculate Cramér's V statistic for categorical-categorical association."""
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # Bias-corrected phi-squared and table dimensions
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    r_corr = r - ((r-1)**2)/(n-1)
    k_corr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((k_corr-1), (r_corr-1)))


def plot_cramers_v_heatmap(data, significance_level=0.05):
    """Plot a heatmap of Cramér's V for each pair of categorical variables in a DataFrame."""
    # Initialize a DataFrame to store Cramér's V values
    cramers_v_df = pd.DataFrame(index=data.columns, columns=data.columns, data=np.nan)

    # Compute Cramér's V for each pair of columns
    for col1 in data.columns:
        for col2 in data.columns:
            if col1 != col2:  # Avoid self-comparison
                confusion_matrix = pd.crosstab(data[col1], data[col2])
                chi2, p, dof, expected = chi2_contingency(confusion_matrix)
                # Optionally filter on significance (p < significance_level);
                # here all pairs are included regardless of the p-value
                cramers_v_df.at[col1, col2] = cramers_v(confusion_matrix)

    # Plot the heatmap (upper triangle masked to avoid duplicate cells)
    plt.figure(figsize=(12, 10), dpi=200)
    mask = np.triu(np.ones_like(cramers_v_df, dtype=bool))
    sns.heatmap(cramers_v_df, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, mask=mask, square=True)
    plt.title(f"Heatmap of Cramér's V (p < {significance_level})")
    return plt

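
# Illustrative use of cramers_v on a small contingency table (toy data, not part of the
# app's pipeline); the statistic lies between 0 (no association) and 1 (perfect association):
#
#   toy = pd.DataFrame({'shape': ['disc'] * 20 + ['orb'] * 20,
#                       'color': ['white'] * 20 + ['red'] * 20})
#   ct = pd.crosstab(toy['shape'], toy['color'])
#   print(cramers_v(ct))
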
class UAPVisualizer:
    def __init__(self, data=None):
        pass  # Initialization can be added if needed

    def analyze_and_predict(self, data, analyzers, col_names):
        new_data = pd.DataFrame()
        for i, (column, analyzer) in enumerate(zip(col_names, analyzers)):
            new_data[f'Analyzer_{column}'] = analyzer.__dict__['cluster_terms']
            print(f"Cluster terms extracted for {column}")

        new_data = new_data.fillna('null').astype('category')
        data_nums = new_data.apply(lambda x: x.cat.codes)

        for col in data_nums.columns:
            try:
                categories = new_data[col].cat.categories
                x_train, x_test, y_train, y_test = train_test_split(data_nums.drop(columns=[col]), data_nums[col], test_size=0.2, random_state=42)
                bst, accuracy, preds = self.train_xgboost(x_train, y_train, x_test, y_test, len(categories))
                self.plot_results(new_data, bst, x_test, y_test, preds, categories, accuracy, col)
            except Exception as e:
                print(f"Error processing {col}: {e}")

    def train_xgboost(self, x_train, y_train, x_test, y_test, num_classes):
        dtrain = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
        dtest = xgb.DMatrix(x_test, label=y_test)

        params = {'objective': 'multi:softmax', 'num_class': num_classes, 'max_depth': 6, 'eta': 0.3}
        num_round = 100
        bst = xgb.train(dtrain=dtrain, params=params, num_boost_round=num_round)
        preds = bst.predict(dtest)
        accuracy = accuracy_score(y_test, preds)

        print(f"XGBoost trained with accuracy: {accuracy:.2f}")
        return bst, accuracy, preds

    def plot_results(self, new_data, bst, x_test, y_test, preds, categories, accuracy, col):
        fig, axs = plt.subplots(1, 3, figsize=(25, 5))
        fig.suptitle(f'{col.split(sep=".")[-1]} prediction', fontsize=35)

        plot_importance(bst, ax=axs[0], importance_type='gain', show_values=False)
        conf_matrix = confusion_matrix(y_test, preds)
        sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', xticklabels=categories, yticklabels=categories, ax=axs[1])
        axs[1].set_title(f'Confusion Matrix\nAccuracy: {accuracy * 100:.2f}%')

        sorted_features = sorted(bst.get_score(importance_type="gain").items(), key=lambda x: x[1], reverse=True)
        most_important_feature = sorted_features[0][0]
        contingency_table = pd.crosstab(new_data[col], new_data[most_important_feature])
        chi2, p, dof, expected = chi2_contingency(contingency_table)
        print(f"Chi-squared test for {col} and {most_important_feature}: p-value = {p}")

        sns.heatmap(contingency_table, annot=True, cmap='Greens', ax=axs[2])
        axs[2].set_title(f'Contingency Table between {col.split(sep=".")[-1]} and {most_important_feature.split(sep=".")[-1]}\np-value = {p}')

        plt.tight_layout()
        plt.savefig(f"{col}_{accuracy:.2f}_prediction_XGB.jpeg", dpi=300)
        # Return the pyplot module so callers (e.g. st.pyplot) can render the figure
        return plt

    @staticmethod
    def cramers_v(confusion_matrix):
        chi2 = chi2_contingency(confusion_matrix)[0]
        n = confusion_matrix.sum().sum()
        phi2 = chi2 / n
        r, k = confusion_matrix.shape
        phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
        r_corr = r - ((r-1)**2)/(n-1)
        k_corr = k - ((k-1)**2)/(n-1)
        return np.sqrt(phi2corr / min((k_corr-1), (r_corr-1)))

    def plot_cramers_v_heatmap(self, data, significance_level=0.05):
        cramers_v_df = pd.DataFrame(index=data.columns, columns=data.columns, data=np.nan)

        for col1 in data.columns:
            for col2 in data.columns:
                if col1 != col2:
                    confusion_matrix = pd.crosstab(data[col1], data[col2])
                    chi2, p, dof, expected = chi2_contingency(confusion_matrix)
                    # Only keep pairs whose association is significant at the chosen level
                    if p < significance_level:
                        cramers_v_df.at[col1, col2] = UAPVisualizer.cramers_v(confusion_matrix)

        plt.figure(figsize=(10, 8))
        mask = np.triu(np.ones_like(cramers_v_df, dtype=bool))
        sns.heatmap(cramers_v_df, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, mask=mask, square=True)
        plt.title(f"Heatmap of Cramér's V (p < {significance_level})")
        # Return the pyplot module so callers (e.g. st.pyplot) can render the figure
        return plt

    def plot_treemap(self, df, column, top_n=32):
        # Get the value counts and the top N labels
        value_counts = df[column].value_counts()
        top_labels = value_counts.iloc[:top_n].index

        # Use np.where to replace all values not in the top N with 'Other'
        revised_column = f'{column}_revised'
        df[revised_column] = np.where(df[column].isin(top_labels), df[column], 'Other')

        # Get the value counts including the 'Other' category
        sizes = df[revised_column].value_counts().values
        labels = df[revised_column].value_counts().index

        # Get a gradient of colors
        colors = list(mcolors.TABLEAU_COLORS.values())

        # Get % of each category
        percents = sizes / sizes.sum()

        # Prepare labels with percentages
        labels = [f'{label}\n {percent:.1%}' for label, percent in zip(labels, percents)]

        # Plot the treemap
        squarify.plot(sizes=sizes, label=labels, alpha=0.7, pad=True, color=colors, text_kwargs={'fontsize': 10})

        ax = plt.gca()

        # Iterate over text elements and rectangles (patches) in the axes for color adjustment
        for text, rect in zip(ax.texts, ax.patches):
            background_color = rect.get_facecolor()
            r, g, b, _ = mcolors.to_rgba(background_color)
            brightness = np.average([r, g, b])
            text.set_color('white' if brightness < 0.5 else 'black')

            # Adjust font size based on rectangle's area and wrap long text
            coef = 0.8
            font_size = np.sqrt(rect.get_width() * rect.get_height()) * coef
            text.set_fontsize(font_size)
            wrapped_text = textwrap.fill(text.get_text(), width=20)
            text.set_text(wrapped_text)

        plt.axis('off')
        plt.gca().invert_yaxis()
        plt.gcf().set_size_inches(20, 12)
        plt.show()

class UAPParser:
    def __init__(self, api_key, model="gpt-3.5-turbo-0125", col=None, format_long=None):
        os.environ['OPENAI_API_KEY'] = api_key
        self.client = OpenAI()
        self.model = model
        self.responses = {}
        self.col = col

    def fetch_response(self, description, format_long):
        INITIAL_WAIT_TIME = 5
        MAX_WAIT_TIME = 600
        MAX_RETRIES = 10

        wait_time = INITIAL_WAIT_TIME
        for attempt in range(MAX_RETRIES):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    response_format={"type": "json_object"},
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant which is tasked to assign a trustworthiness value between 0 and 100 to the given first-hand report."},
                        {"role": "user", "content": f'Input report: {description}\n\n Parse data following this json structure; leave missing data empty: {format_long} Output:'}
                    ]
                )
                return response
            except HTTPError as e:
                if 'TooManyRequests' in str(e):
                    time.sleep(wait_time)
                    wait_time = min(wait_time * 2, MAX_WAIT_TIME)  # Exponential backoff
                else:
                    raise
            except Exception as e:
                print(f"Unexpected error: {e}")
                break

        return None  # Return None if all retries fail

    def process_descriptions(self, descriptions, format_long, max_workers=32):
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_desc = {executor.submit(self.fetch_response, desc, format_long): desc for desc in descriptions}

            for future in stqdm(concurrent.futures.as_completed(future_to_desc), total=len(descriptions)):
                desc = future_to_desc[future]
                try:
                    response = future.result()
                    response_text = response.choices[0].message.content if response else None
                    if response_text:
                        self.responses[desc] = response_text
                except Exception as exc:
                    print(f'Error occurred for description {desc}: {exc}')

    def parse_responses(self):
        parsed_responses = {}
        not_parsed = 0
        try:
            for k, v in self.responses.items():
                try:
                    parsed_responses[k] = json.loads(v)
                except json.JSONDecodeError:
                    # Retry after swapping single quotes for double quotes
                    try:
                        parsed_responses[k] = json.loads(v.replace("'", '"'))
                    except json.JSONDecodeError:
                        not_parsed += 1
        except Exception as e:
            print(f"Error parsing responses: {e}")

        print(f"Number of unparsed responses: {not_parsed}")
        print(f"Number of parsed responses: {len(parsed_responses)}")
        return parsed_responses

    def responses_to_df(self, col, parsed_responses):
        parsed_df = pd.DataFrame(parsed_responses).T
        parsed_df2 = pd.json_normalize(parsed_df[col])
        parsed_df2.index = parsed_df.index
        return parsed_df2

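
# A minimal sketch of how UAPParser is meant to be driven (the API key, JSON schema string
# and 'sighting' field name below are placeholders, not values from this repo):
#
#   uap_parser = UAPParser(api_key="sk-...", model="gpt-3.5-turbo-0125")
#   uap_parser.process_descriptions(descriptions=df['Description'].tolist(),
#                                   format_long='{"sighting": {"shape": "", "duration": ""}}')
#   parsed = uap_parser.parse_responses()
#   parsed_df = uap_parser.responses_to_df('sighting', parsed)
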
# Streamlit front-end section: only imports not already pulled in above are added here
import streamlit.components.v1 as components
from dateutil import parser
from pandas.api.types import (
    is_categorical_dtype,
    is_datetime64_any_dtype,
    is_numeric_dtype,
    is_object_dtype,
)

st.set_option('deprecation.showPyplotGlobalUse', False)


def load_data(file_path, key='df'):
    return pd.read_hdf(file_path, key=key)

def gemini_query(question, selected_data, gemini_key):
    """Summarize or query the selected column values with the Gemini API."""
    if question == "":
        question = "Summarize the following data in relevant bullet points"

    import google.generativeai as genai
    from IPython.display import Markdown

    def to_markdown(text):
        text = text.replace('•', ' *')
        return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

    # selected_data is a list; drop empty entries and join the rest into a single context string
    filtered = [str(x) for x in selected_data if str(x) != '' and x is not None]
    context = '\n'.join(filtered)

    genai.configure(api_key=gemini_key)
    query_model = genai.GenerativeModel('models/gemini-1.5-pro-latest')
    response = query_model.generate_content([f"{question}\n Answer based on this context: {context}\n\n"])
    return response.text

def filter_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds a UI on top of a dataframe to let viewers filter columns

    Args:
        df (pd.DataFrame): Original dataframe

    Returns:
        pd.DataFrame: Filtered dataframe
    """
    # The function is called several times per run; falling back to a different checkbox
    # label avoids Streamlit's duplicate-widget error on repeated calls.
    modify = False
    try:
        modify = st.checkbox("Add filters on raw data")
    except Exception:
        try:
            modify = st.checkbox("Add filters on processed data")
        except Exception:
            try:
                modify = st.checkbox("Add filters on parsed data")
            except Exception:
                pass

    if not modify:
        return df

    df_ = df.copy()
    # Try to convert datetimes into a standard format (datetime, no timezone)
    for col in df_.columns:
        if is_object_dtype(df_[col]):
            try:
                df_[col] = pd.to_datetime(df_[col])
            except Exception:
                try:
                    df_[col] = df_[col].apply(parser.parse)
                except Exception:
                    pass
        if is_datetime64_any_dtype(df_[col]):
            df_[col] = df_[col].dt.tz_localize(None)

    modification_container = st.container()

    with modification_container:
        to_filter_columns = st.multiselect("Filter dataframe on", df_.columns)
        for column in to_filter_columns:
            left, right = st.columns((1, 20))
            # Categorical columns get a multiselect of their unique values
            if is_categorical_dtype(df_[column]):
                user_cat_input = right.multiselect(
                    f"Values for {column}",
                    df_[column].unique(),
                    default=list(df_[column].unique()),
                )
                df_ = df_[df_[column].isin(user_cat_input)]
            elif is_numeric_dtype(df_[column]):
                _min = float(df_[column].min())
                _max = float(df_[column].max())
                step = (_max - _min) / 100
                user_num_input = right.slider(
                    f"Values for {column}",
                    min_value=_min,
                    max_value=_max,
                    value=(_min, _max),
                    step=step,
                )
                df_ = df_[df_[column].between(*user_num_input)]
            elif is_datetime64_any_dtype(df_[column]):
                user_date_input = right.date_input(
                    f"Values for {column}",
                    value=(
                        df_[column].min(),
                        df_[column].max(),
                    ),
                )
                if len(user_date_input) == 2:
                    user_date_input = tuple(map(pd.to_datetime, user_date_input))
                    start_date, end_date = user_date_input
                    df_ = df_.loc[df_[column].between(start_date, end_date)]
            else:
                user_text_input = right.text_input(
                    f"Substring or regex in {column}",
                )
                if user_text_input:
                    df_ = df_[df_[column].astype(str).str.contains(user_text_input)]
    # Report the number of rows after filtering, with % of the original
    st.write(f"{len(df_)} rows ({len(df_) / len(df) * 100:.2f}%)")
    return df_

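
# Typical use inside the app (sketch): wrap any DataFrame shown to the user so that
# viewers can interactively narrow it down before querying or clustering.
#
#   filtered = filter_dataframe(parsed_responses)
#   st.dataframe(filtered)
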
def merge_clusters(df, column):
    cluster_terms_ = df.__dict__['cluster_terms']
    cluster_labels_ = df.__dict__['cluster_labels']
    label_name_map = {label: cluster_terms_[label] for label in set(cluster_labels_)}
    merge_map = {}
    # Iterate over term pairs and decide on merging based on the Levenshtein distance
    for idx, term1 in enumerate(cluster_terms_):
        for jdx, term2 in enumerate(cluster_terms_):
            if idx < jdx and distance(term1, term2) <= 3:  # Adjust threshold as needed
                # Merge labels corresponding to jdx into labels corresponding to idx
                labels_to_merge = [label for label, term_index in enumerate(cluster_labels_) if term_index == jdx]
                for label in labels_to_merge:
                    merge_map[label] = idx  # Map the label to use the term index of term1

    # Update the analyzer with the merged numeric labels
    updated_cluster_labels_ = [merge_map[label] if label in merge_map else label for label in cluster_labels_]

    df.__dict__['cluster_labels'] = updated_cluster_labels_
    # Optional: update string labels to reflect merged labels
    updated_string_labels = [cluster_terms_[label] for label in updated_cluster_labels_]
    df.__dict__['string_labels'] = updated_string_labels
    return updated_string_labels


def analyze_and_predict(data, analyzers, col_names, clusters):
    # This four-argument version supersedes the earlier module-level analyze_and_predict
    visualizer = UAPVisualizer()
    new_data = pd.DataFrame()
    for i, column in enumerate(col_names):
        new_data[f'Analyzer_{column}'] = clusters[column]
        data[f'Analyzer_{column}'] = clusters[column]
        print(f"Cluster terms extracted for {column}")

    for col in data.columns:
        if 'Analyzer' in col:
            data[col] = data[col].astype('category')

    new_data = new_data.fillna('null').astype('category')
    data_nums = new_data.apply(lambda x: x.cat.codes)

    for col in data_nums.columns:
        try:
            categories = new_data[col].cat.categories
            x_train, x_test, y_train, y_test = train_test_split(data_nums.drop(columns=[col]), data_nums[col], test_size=0.2, random_state=42)
            bst, accuracy, preds = visualizer.train_xgboost(x_train, y_train, x_test, y_test, len(categories))
            fig = visualizer.plot_results(new_data, bst, x_test, y_test, preds, categories, accuracy, col)
            with st.status(f"Charts Analyses: {col}", expanded=True) as status:
                st.pyplot(fig)
                status.update(label=f"Chart Processed: {col}", expanded=False)
        except Exception as e:
            print(f"Error processing {col}: {e}")
            continue
    return new_data, data

def main():
    from config import API_KEY, GEMINI_KEY, FORMAT_LONG

    with torch.no_grad():
        torch.cuda.empty_cache()

    st.set_page_config(
        page_title="UAP ANALYSIS",
        page_icon=":alien:",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    st.title('UAP Analysis Dashboard')

    # Initialize session state
    if 'analyzers' not in st.session_state:
        st.session_state['analyzers'] = []
    if 'col_names' not in st.session_state:
        st.session_state['col_names'] = []
    if 'clusters' not in st.session_state:
        st.session_state['clusters'] = {}
    if 'new_data' not in st.session_state:
        st.session_state['new_data'] = pd.DataFrame()
    if 'dataset' not in st.session_state:
        st.session_state['dataset'] = pd.DataFrame()
    if 'data_processed' not in st.session_state:
        st.session_state['data_processed'] = False
    if 'stage' not in st.session_state:
        st.session_state['stage'] = 0
    if 'filtered_data' not in st.session_state:
        st.session_state['filtered_data'] = None

    # Load dataset
    data_path = 'parsed_files_distance_embeds.h5'
    parsed = load_data(data_path).drop(columns=['embeddings']).head(10000)

    # Unparsed data
    unparsed_tickbox = st.checkbox('Unparsed Data')
    if unparsed_tickbox:
        unparsed = st.file_uploader("Upload Raw DataFrame", type=["csv", "xlsx"])
        if unparsed is not None:
            try:
                data = pd.read_csv(unparsed) if unparsed.type == "text/csv" else pd.read_excel(unparsed)
                filtered_data = filter_dataframe(data)
                st.dataframe(filtered_data)
            except Exception as e:
                st.error(f"An error occurred while reading the file: {e}")

    # Parsed data
    parsed_tickbox = st.checkbox('Parsed Data')
    if parsed_tickbox:
        parsed_responses = filter_dataframe(parsed)
        st.session_state['parsed_responses'] = parsed_responses
        col1, col2 = st.columns(2)
        st.dataframe(parsed_responses)
        with col1:
            col_parsed = st.selectbox("Which column do you want to query?", st.session_state['parsed_responses'].columns)
        with col2:
            GEMINI_KEY = st.text_input('Gemini API Key', GEMINI_KEY, type='password', help="Enter your Gemini API key")
        if col_parsed and GEMINI_KEY:
            selected_column_data = st.session_state['parsed_responses'][col_parsed].tolist()
            question = st.text_input("Ask a question or leave empty for summarization")
            if st.button("Generate Query") and selected_column_data:
                st.write(gemini_query(question, selected_column_data, GEMINI_KEY))
        st.session_state['stage'] = 1

    # Analyze data
    if st.session_state.stage > 0:
        columns_to_analyze = st.multiselect(
            label='Select columns to analyze',
            options=parsed_responses.columns
        )
        if columns_to_analyze:
            analyzers = []
            col_names = []
            clusters = {}
            for column in columns_to_analyze:
                with torch.no_grad():
                    with st.status(f"Processing {column}", expanded=True) as status:
                        analyzer = UAPAnalyzer(parsed_responses, column)
                        st.write(f"Processing {column}...")
                        analyzer.preprocess_data(top_n=32)
                        st.write("Reducing dimensionality...")
                        analyzer.reduce_dimensionality(method='UMAP', n_components=2, n_neighbors=15, min_dist=0.1)
                        st.write("Clustering data...")
                        analyzer.cluster_data(method='HDBSCAN', min_cluster_size=15)
                        analyzer.get_tf_idf_clusters(top_n=1)
                        st.write("Naming clusters...")
                        analyzers.append(analyzer)
                        col_names.append(column)
                        clusters[column] = analyzer.merge_similar_clusters(cluster_terms=analyzer.__dict__['cluster_terms'], cluster_labels=analyzer.__dict__['cluster_labels'])
                        status.update(label=f"Processing {column} complete", expanded=False)
            st.session_state['analyzers'] = analyzers
            st.session_state['col_names'] = col_names
            st.session_state['clusters'] = clusters

            # Free memory once the per-column analyzers are stored in session state
            parsed = None
            analyzers = None
            col_names = None
            clusters = None

            if st.session_state['clusters'] is not None:
                try:
                    new_data, parsed_responses = analyze_and_predict(parsed_responses, st.session_state['analyzers'], st.session_state['col_names'], st.session_state['clusters'])
                    st.session_state['dataset'] = parsed_responses
                    st.session_state['new_data'] = new_data
                    st.session_state['data_processed'] = True
                except Exception as e:
                    st.write(f"Error processing data: {e}")

    if st.session_state['data_processed']:
        try:
            visualizer = UAPVisualizer(data=st.session_state['new_data'])
            fig2 = visualizer.plot_cramers_v_heatmap(data=st.session_state['new_data'], significance_level=0.05)
            with st.status("Cramer's V Chart", expanded=True) as statuss:
                st.pyplot(fig2)
                statuss.update(label="Cramer's V chart plotted", expanded=False)
        except Exception as e:
            st.write(f"Error plotting Cramers V: {e}")

    for i, column in enumerate(st.session_state['col_names']):
        if st.session_state['data_processed']:
            with st.status(f"Show clusters {column}", expanded=True) as stats:
                fig3 = st.session_state['analyzers'][i].plot_embeddings4(title=f"{column} clusters", cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'], cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'], reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'], column=f'Analyzer_{column}', data=st.session_state['new_data'])
                stats.update(label=f"Show clusters {column} complete", expanded=False)

    if st.session_state['data_processed']:
        parsed2 = st.session_state.get('new_data', pd.DataFrame())
        parsed2 = filter_dataframe(parsed2)
        col1, col2 = st.columns(2)
        st.dataframe(parsed2)
        with col1:
            col_parsed2 = st.selectbox("Which column do you want to query?", parsed2.columns)
        with col2:
            GEMINI_KEY = st.text_input('Gemini API Key', GEMINI_KEY, type='password', help="Enter your Gemini API key")
        if col_parsed2 and GEMINI_KEY:
            selected_column_data2 = parsed2[col_parsed2].tolist()
            question2 = st.text_input("Ask a question or leave empty for summarization")
            if st.button("Generate Query") and selected_column_data2:
                st.write(gemini_query(question2, selected_column_data2, GEMINI_KEY))


if __name__ == '__main__':
    main()

# streamlit run app.py --server.enableXsrfProtection=false --theme.primaryColor=#FFA500 --theme.base=dark