Maria Tsilimos committed
Commit 190b8c6 · unverified · 1 Parent(s): 2ec60f4

Update app.py

Files changed (1)
  1. app.py +111 -84
app.py CHANGED
@@ -11,11 +11,11 @@ from comet_ml import Experiment
 import re
 import numpy as np
 import json
-from cryptography.fernet import Fernet
+from cryptography.fernet import Fernet
 
 st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
 
-
+# --- Configuration ---
 COMET_API_KEY = os.environ.get("COMET_API_KEY")
 COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
@@ -24,35 +24,51 @@ comet_initialized = False
 if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
     comet_initialized = True
 
-
+# --- Initialize session state ---
 if 'file_upload_attempts' not in st.session_state:
     st.session_state['file_upload_attempts'] = 0
 
-
 if 'encrypted_extracted_text' not in st.session_state:
     st.session_state['encrypted_extracted_text'] = None
 
-
 if 'json_dataframe' not in st.session_state:
     st.session_state['json_dataframe'] = None
 
-max_attempts = 10
+max_attempts = 10
+
+# Define the categories and their associated entity labels
+ENTITY_LABELS_CATEGORIZED = {
+    "Persons": ["PER"],
+    "Locations": ["LOC"],
+    "Organizations": ["ORG"],
+    "Miscellaneous": ["MISC"],
+}
+
+# Create a mapping from each specific entity label to its category
+LABEL_TO_CATEGORY_MAP = {
+    label: category for category, labels in ENTITY_LABELS_CATEGORIZED.items() for label in labels
+}
 
 
 @st.cache_resource
 def load_ner_model():
-
+    """
+    Loads the pre-trained NER model ("saattrupdan/nbailab-base-ner-scandi") and caches it.
+    This model is specifically trained for Scandinavian languages.
+    """
     try:
-        return pipeline("token-classification",
-                        model="saattrupdan/nbailab-base-ner-scandi",
-                        aggregation_strategy="max", ignore_labels=["O"],
-                        stride=128)
+        return pipeline(
+            "token-classification",
+            model="saattrupdan/nbailab-base-ner-scandi",
+            aggregation_strategy="max",
+            ignore_labels=["O"],
+            stride=128
+        )
     except Exception as e:
         st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
         st.stop()
 
 
-
 @st.cache_resource
 def load_encryption_key():
     """
@@ -65,20 +81,25 @@ def load_encryption_key():
         key_str = os.environ.get("FERNET_KEY")
         if not key_str:
             raise ValueError("FERNET_KEY environment variable not set. Cannot perform encryption/decryption.")
-
+
         # Fernet key must be bytes, so encode the string
         key_bytes = key_str.encode('utf-8')
         return Fernet(key_bytes)
     except ValueError as ve:
-        st.error(f"Configuration Error: {ve}. Please ensure the 'FERNET_KEY' environment variable is set securely in your deployment environment (e.g., Hugging Face Spaces secrets, Render environment variables) or in a local .env file for development.")
-        st.stop() # Stop the app if the key is not found, as security is compromised
+        st.error(
+            f"Configuration Error: {ve}. Please ensure the 'FERNET_KEY' environment variable is set securely "
+            "in your deployment environment (e.g., Hugging Face Spaces secrets, Render environment variables) "
+            "or in a local .env file for development."
+        )
+        st.stop() # Stop the app if the key is not found, as security is compromised
     except Exception as e:
         st.error(f"An unexpected error occurred while loading encryption key: {e}. Please check your key format and environment settings.")
         st.stop()
 
-# Initialize the Fernet cipher instance
+# Initialize the Fernet cipher instance globally (cached)
 fernet = load_encryption_key()
 
+
 def encrypt_text(text_content: str) -> bytes:
     """
     Encrypts a string using the loaded Fernet cipher.
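load_encryption_key only reads FERNET_KEY from the environment; the key itself has to be generated once, out of band. A sketch of that one-off step using the cryptography package's API:

```python
# One-off, run locally: generate a Fernet key to store as the FERNET_KEY
# environment variable (e.g. a Hugging Face Spaces secret). Keep it out of git.
from cryptography.fernet import Fernet

key = Fernet.generate_key()   # url-safe base64-encoded 32-byte key
print(key.decode("utf-8"))    # copy this value into FERNET_KEY
```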
@@ -86,6 +107,7 @@ def encrypt_text(text_content: str) -> bytes:
     """
     return fernet.encrypt(text_content.encode('utf-8'))
 
+
 def decrypt_text(encrypted_bytes: bytes) -> str | None:
     """
     Decrypts bytes using the loaded Fernet cipher.
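Assuming decrypt_text returns the plaintext on success, as its `str | None` signature suggests, the two helpers round-trip like this (requires FERNET_KEY to be set):

```python
# Round-trip sketch of the helpers above (decrypt_text's body is not shown in
# this hunk; this assumes it returns the decoded plaintext on success).
token = encrypt_text("Anna bor i København.")   # Fernet token, bytes
assert decrypt_text(token) == "Anna bor i København."
```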
@@ -108,23 +130,17 @@ expander.write('''
     miscellaneous”). Results are presented in an easy-to-read table, visualized in
     an interactive tree map, pie chart, and bar chart, and are available for
     download along with a Glossary of tags.
-
     **How to Use:** Upload your JSON file. Then, click the 'Results' button
     to extract and tag entities in your text data.
-
     **Usage Limits:** You can request results up to 10 times.
-
     **Language settings:** Please check and adjust the language settings in
     your computer, so the Danish, Swedish, Norwegian, Icelandic and Faroese
     characters are handled properly in your downloaded file.
-
     **Customization:** To change the app's background color to white or
     black, click the three-dot menu on the right-hand side of your app, go to
     Settings and then Choose app theme, colors and fonts.
-
     **Technical issues:** If your connection times out, please refresh the
     page or reopen the app's URL.
-
     For any errors or inquiries, please contact us at [email protected]
 ''')
 
@@ -134,11 +150,10 @@ with st.sidebar:
                "extracting and tagging entities in text data. Entities can be persons, "
                "organizations, locations, countries, products, events etc.")
     st.subheader("Related NER Web Apps", divider="orange")
-    st.link_button("MediDoc Entity Finder",
-                   " https://nlpblogs.com/shop/named-entity-recognition-ner/medidoc-entity-finder/",
+    st.link_button("Multilingual PDF & DOCX Entity Finder",
+                   "https://nlpblogs.com/shop/named-entity-recognition-ner/multilingual-pdf-docx-entity-finder/",
                    type="primary")
 
-
 uploaded_file = st.file_uploader("Choose a JSON file", type=["json"])
 
 # Initialize text for the current run outside the if uploaded_file block
@@ -149,7 +164,7 @@ if uploaded_file is not None:
     try:
         # Read the content as bytes first, then decode for JSON parsing
         file_contents_bytes = uploaded_file.read()
-
+
         # Reset the file pointer after reading, so json.load can read from the beginning
         uploaded_file.seek(0)
         dados = json.load(uploaded_file)
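The uploaded_file.seek(0) here is load-bearing: read() leaves the stream at EOF, so json.load would otherwise see an empty stream and raise JSONDecodeError. A quick demonstration with io.BytesIO standing in for Streamlit's UploadedFile (sample data is hypothetical):

```python
import io
import json

# io.BytesIO stands in for Streamlit's UploadedFile in this sketch.
fake_upload = io.BytesIO(b'{"title": "Nordiske nyheder"}')
contents = fake_upload.read()   # consumes the stream; pointer is now at EOF
fake_upload.seek(0)             # rewind; without this, json.load() raises JSONDecodeError
dados = json.load(fake_upload)
print(dados["title"])           # Nordiske nyheder
```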
@@ -157,19 +172,17 @@ if uploaded_file is not None:
             # Attempt to convert JSON to DataFrame and extract text
             try:
                 st.session_state['json_dataframe'] = pd.DataFrame(dados)
-
+
                 # Concatenate all content into a single string for NER
                 df_string_representation = st.session_state['json_dataframe'].to_string(index=False, header=False)
                 # Simple regex to remove non-alphanumeric characters but keep spaces and periods
                 text_content = re.sub(r'[^\w\s.]', '', df_string_representation)
                 # Remove the specific string "Empty DataFrame Columns" if it appears due to conversion
                 text_content = text_content.replace("Empty DataFrame Columns", "").strip()
-                current_run_text = text_content # Set text for current run
-
-                if not current_run_text.strip(): # Check if text is effectively empty
+                current_run_text = text_content # Set text for current run
+                if not current_run_text.strip(): # Check if text is effectively empty
                     st.warning("No meaningful text could be extracted from the JSON DataFrame for analysis.")
-                    current_run_text = None # Reset to None if empty
-
+                    current_run_text = None # Reset to None if empty
             except ValueError:
                 # If direct conversion to DataFrame fails, try to extract strings directly from JSON structure
                 st.info("JSON data could not be directly converted to a simple DataFrame for display. Attempting to extract text directly.")
@@ -196,7 +209,6 @@ if uploaded_file is not None:
                         for sub_val in value:
                             if isinstance(sub_val, str):
                                 extracted_texts_list.append(sub_val)
-
                 if extracted_texts_list:
                     current_run_text = " ".join(extracted_texts_list).strip()
                 else:
@@ -208,15 +220,13 @@ if uploaded_file is not None:
             encrypted_text_bytes = encrypt_text(current_run_text)
             st.session_state['encrypted_extracted_text'] = encrypted_text_bytes
             # Optionally clear the unencrypted version from session state if you only want the encrypted one
-            # st.session_state['extracted_text_for_ner'] = None
-
+            # st.session_state['extracted_text_for_ner'] = None
             st.success("JSON file uploaded successfully. File content encrypted and secured. Due to security protocols, the file content is hidden.")
             st.divider()
         else:
             st.session_state['encrypted_extracted_text'] = None
             # st.session_state['extracted_text_for_ner'] = None
             st.error("Could not extract meaningful text from the uploaded JSON file.")
-
     except json.JSONDecodeError as e:
         st.error(f"JSON Decode Error: {e}")
         st.error("Please ensure the uploaded file contains valid JSON data.")
@@ -227,10 +237,9 @@ if uploaded_file is not None:
         st.session_state['encrypted_extracted_text'] = None
         st.session_state['json_dataframe'] = None
 
-
 # --- Results Button and Processing Logic ---
 if st.button("Results"):
-    start_time = time.time()
+    start_time_overall = time.time() # Start time for overall processing
     if not comet_initialized:
         st.warning("Comet ML not initialized. Check environment variables if you wish to log data.")
 
@@ -242,7 +251,7 @@ if st.button("Results"):
     text_for_ner = None
     if st.session_state['encrypted_extracted_text'] is not None:
         text_for_ner = decrypt_text(st.session_state['encrypted_extracted_text'])
-
+
     if text_for_ner is None or not text_for_ner.strip():
         st.warning("No extractable text content available for analysis. Please upload a valid JSON file.")
         st.stop()
@@ -251,19 +260,25 @@ if st.button("Results"):
 
     with st.spinner("Analyzing text...", show_time=True):
         model = load_ner_model()
-        text_entities = model(text_for_ner) # Use the decrypted text
+
+        # Measure NER model processing time
+        start_time_ner = time.time()
+        text_entities = model(text_for_ner) # Use the decrypted text
+        end_time_ner = time.time()
+        ner_processing_time = end_time_ner - start_time_ner
+
         df = pd.DataFrame(text_entities)
 
         if 'word' in df.columns:
             # Ensure 'word' column is string type before applying regex
-            if df['word'].dtype == 'object':
-                pattern = r'[^\w\s]' # Regex to remove non-alphanumeric characters but keep spaces and periods
+            if df['word'].dtype == 'object':
+                pattern = r'[^\w\s]' # Regex to remove non-alphanumeric characters but keep spaces and periods
                 df['word'] = df['word'].astype(str).replace(pattern, '', regex=True)
             else:
                 st.warning("The 'word' column is not of string type; skipping character cleaning.")
         else:
             st.error("The 'word' column does not exist in the DataFrame. Cannot perform cleaning.")
-            st.stop() # Stop execution if the column is missing
+            st.stop() # Stop execution if the column is missing
 
         # Replace empty strings with 'Unknown' and drop rows with NaN after cleaning
         df = df.replace('', 'Unknown').dropna()
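With aggregation_strategy="max", the token-classification pipeline returns one dict per merged entity span, which is exactly what pd.DataFrame consumes. The values below are illustrative, not real model output:

```python
import pandas as pd

# Illustrative shape of `text_entities` (made-up values, not model output):
text_entities = [
    {"entity_group": "PER", "score": 0.998, "word": "Kari Nordmann", "start": 0, "end": 13},
    {"entity_group": "LOC", "score": 0.995, "word": "Oslo", "start": 22, "end": 26},
]
df = pd.DataFrame(text_entities)   # columns: entity_group, score, word, start, end
print(df)
```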
@@ -272,6 +287,11 @@ if st.button("Results"):
             st.warning("No entities were extracted from the uploaded text.")
             st.stop()
 
+        # --- Add 'category' column to the DataFrame based on the grouped labels ---
+        df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
+        # Handle cases where an entity_group might not have a category
+        df['category'] = df['category'].fillna('Uncategorized')
+
         if comet_initialized:
             experiment = Experiment(
                 api_key=COMET_API_KEY,
@@ -280,8 +300,11 @@ if st.button("Results"):
             )
             experiment.log_parameter("input_text_length", len(text_for_ner))
             experiment.log_table("predicted_entities", df)
+            experiment.log_metric("ner_processing_time_seconds", ner_processing_time)
+
 
         # --- Display Results ---
+        st.subheader("Extracted Entities", divider="rainbow")
         properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
         df_styled = df.style.set_properties(**properties)
         st.dataframe(df_styled, use_container_width=True)
@@ -298,89 +321,90 @@ if st.button("Results"):
         '**start**': ['index of the start of the corresponding entity']
 
         '**end**': ['index of the end of the corresponding entity']
+
+        '**category**': ['the broader category the entity belongs to']
         ''')
 
-        entity_groups = {"PER": "person",
-                         "LOC": "location",
-                         "ORG": "organization",
-                         "MISC": "miscellaneous",
-                         }
-
-        st.subheader("Grouped entities", divider = "orange")
-
-        # Convert entity_groups dictionary to a list of (key, title) tuples
-        entity_items = list(entity_groups.items())
-        # Define how many tabs per row
-        tabs_per_row = 5
-        # Loop through the entity items in chunks
-        for i in range(0, len(entity_items), tabs_per_row):
-            current_row_entities = entity_items[i : i + tabs_per_row]
-            tab_titles = [item[1] for item in current_row_entities]
-
-            tabs = st.tabs(tab_titles)
-            for j, (entity_group_key, tab_title) in enumerate(current_row_entities):
+        st.subheader("Grouped entities", divider="orange")
+
+        # Get unique categories and sort them for consistent tab order
+        unique_categories = sorted(df['category'].unique())
+        tabs_per_row = 4 # Adjust as needed for better layout
+
+        # Loop through categories in chunks to create rows of tabs
+        for i in range(0, len(unique_categories), tabs_per_row):
+            current_row_categories = unique_categories[i : i + tabs_per_row]
+            tabs = st.tabs(current_row_categories)
+
+            for j, category in enumerate(current_row_categories):
                 with tabs[j]:
-                    if entity_group_key in df["entity_group"].unique():
-                        df_filtered = df[df["entity_group"] == entity_group_key]
+                    df_filtered = df[df["category"] == category]
+                    if not df_filtered.empty:
                         st.dataframe(df_filtered, use_container_width=True)
                     else:
-                        st.info(f"No '{tab_title}' entities found in the text.")
+                        st.info(f"No '{category}' entities found in the text.")
                         # Display an empty DataFrame for consistency if no entities are found
                         st.dataframe(pd.DataFrame({
-                            'entity_group': [entity_group_key],
+                            'entity_group': [np.nan],
                             'score': [np.nan],
                             'word': [np.nan],
                             'start': [np.nan],
-                            'end': [np.nan]
+                            'end': [np.nan],
+                            'category': [category]
                         }), hide_index=True)
-
         st.divider()
 
         # --- Visualizations ---
         st.subheader("Tree map", divider="orange")
-        fig_treemap = px.treemap(df, path=[px.Constant("all"), 'word',
-                                           'entity_group'],
-                                 values='score', color='entity_group')
+        fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'entity_group', 'word'],
+                                 values='score', color='category',
+                                 color_discrete_map={
+                                     'Persons': 'blue',
+                                     'Locations': 'green',
+                                     'Organizations': 'red',
+                                     'Miscellaneous': 'purple',
+                                     'Uncategorized': 'gray'
+                                 })
         fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
         st.plotly_chart(fig_treemap)
         if comet_initialized:
             experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")
 
-        value_counts1 = df['entity_group'].value_counts()
-        final_df_counts = value_counts1.reset_index().rename(columns={"index": "entity_group"})
+        # Group by category and entity_group to get counts for pie and bar charts
+        grouped_counts = df.groupby('category').size().reset_index(name='count')
 
         col1, col2 = st.columns(2)
         with col1:
             st.subheader("Pie Chart", divider="orange")
-            fig_pie = px.pie(final_df_counts, values='count', names='entity_group',
-                             hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted labels')
+            fig_pie = px.pie(grouped_counts, values='count', names='category',
+                             hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
            fig_pie.update_traces(textposition='inside', textinfo='percent+label')
            st.plotly_chart(fig_pie)
            if comet_initialized:
-                experiment.log_figure(figure=fig_pie, figure_name="label_pie_chart")
+                experiment.log_figure(figure=fig_pie, figure_name="category_pie_chart")
 
         with col2:
             st.subheader("Bar Chart", divider="orange")
-            fig_bar = px.bar(final_df_counts, x="count", y="entity_group", color="entity_group", text_auto=True,
-                             title='Occurrences of predicted labels')
+            fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True,
+                             title='Occurrences of predicted categories')
             st.plotly_chart(fig_bar)
             if comet_initialized:
-                experiment.log_figure(figure=fig_bar, figure_name="label_bar_chart")
+                experiment.log_figure(figure=fig_bar, figure_name="category_bar_chart")
 
         # --- Downloadable Content ---
         dfa = pd.DataFrame(
             data={
-                'Column Name': ['word', 'entity_group','score', 'start', 'end'],
+                'Column Name': ['word', 'entity_group', 'score', 'start', 'end', 'category'],
                 'Description': [
                     'entity extracted from your text data',
                     'label (tag) assigned to a given extracted entity',
                     'accuracy score; how accurately a tag has been assigned to a given entity',
                     'index of the start of the corresponding entity',
                     'index of the end of the corresponding entity',
+                    'the broader category the entity belongs to',
                 ]
             }
         )
-
         buf = io.BytesIO()
         with zipfile.ZipFile(buf, "w") as myzip:
             myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
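Both charts now consume per-category counts from the new groupby line rather than raw entity_group value counts; a toy run showing its output shape:

```python
import pandas as pd

df = pd.DataFrame({"category": ["Persons", "Persons", "Locations"]})
grouped_counts = df.groupby("category").size().reset_index(name="count")
print(grouped_counts)
#     category  count
# 0  Locations      1
# 1    Persons      2
```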
@@ -402,7 +426,10 @@ if st.button("Results"):
     st.divider()
     if comet_initialized:
         experiment.end()
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    st.info(f"Results processed in **{elapsed_time:.2f} seconds**.")
+
+    end_time_overall = time.time()
+    elapsed_time_overall = end_time_overall - start_time_overall
+    st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
+
     st.write(f"Number of times you requested results: **{st.session_state['file_upload_attempts']}/{max_attempts}**")
+