Maria Tsilimos committed on
Commit 408ba90 · unverified · Parent: 30e6219

Update app.py

Files changed (1): app.py (+174 −92)

app.py CHANGED
@@ -12,30 +12,91 @@ import re
 import numpy as np
 import json
 from cryptography.fernet import Fernet
-
-st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
-
+
+st.set_page_config(layout="wide",
+                   page_title="Named Entity Recognition App")
+
 # --- Configuration ---
 COMET_API_KEY = os.environ.get("COMET_API_KEY")
 COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
-
+
 comet_initialized = False
 if COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME:
     comet_initialized = True
-
-# --- Initialize session state ---
+
+# --- Persistent Counter and History Configuration ---
+COUNTER_FILE = "counter_json_finder.json"
+HISTORY_FILE = "file_history_json_finder.json"
+max_attempts = 10
+
+# --- Functions to manage persistent data ---
+def load_attempts():
+    """
+    Loads the attempts count from a persistent JSON file.
+    Returns 0 if the file doesn't exist or is invalid.
+    """
+    if os.path.exists(COUNTER_FILE):
+        try:
+            with open(COUNTER_FILE, "r") as f:
+                data = json.load(f)
+            return data.get('file_upload_attempts', 0)
+        except (json.JSONDecodeError, KeyError):
+            return 0
+    return 0
+
+def save_attempts(attempts):
+    """
+    Saves the current attempts count to the persistent JSON file.
+    """
+    with open(COUNTER_FILE, "w") as f:
+        json.dump({'file_upload_attempts': attempts}, f)
+
+def load_history():
+    """
+    Loads the file upload history from a persistent JSON file.
+    Returns an empty list if the file doesn't exist or is invalid.
+    """
+    if os.path.exists(HISTORY_FILE):
+        try:
+            with open(HISTORY_FILE, "r") as f:
+                data = json.load(f)
+            return data.get('uploaded_files', [])
+        except (json.JSONDecodeError, KeyError):
+            return []
+    return []
+
+def save_history(history):
+    """
+    Saves the current file upload history to the persistent JSON file.
+    """
+    with open(HISTORY_FILE, "w") as f:
+        json.dump({'uploaded_files': history}, f)
+
+def clear_history_data():
+    """Clears the file history from session state and deletes the persistent file."""
+    if os.path.exists(HISTORY_FILE):
+        os.remove(HISTORY_FILE)
+    st.session_state['uploaded_files_history'] = []
+    st.rerun()
+
+# --- Initialize session state with persistent data ---
 if 'file_upload_attempts' not in st.session_state:
-    st.session_state['file_upload_attempts'] = 0
-
+    st.session_state['file_upload_attempts'] = load_attempts()
+    # Save to ensure the file exists on first run
+    save_attempts(st.session_state['file_upload_attempts'])
+
+if 'uploaded_files_history' not in st.session_state:
+    st.session_state['uploaded_files_history'] = load_history()
+    # Save to ensure the file exists on first run
+    save_history(st.session_state['uploaded_files_history'])
+
 if 'encrypted_extracted_text' not in st.session_state:
-    st.session_state['encrypted_extracted_text'] = None
-
+    st.session_state['encrypted_extracted_text'] = None
+
 if 'json_dataframe' not in st.session_state:
     st.session_state['json_dataframe'] = None
-
-max_attempts = 10
-
+
 # Define the categories and their associated entity labels
 ENTITY_LABELS_CATEGORIZED = {
     "Persons": ["PER"],
@@ -43,13 +104,12 @@ ENTITY_LABELS_CATEGORIZED = {
     "Organizations": ["ORG"],
     "Miscellaneous": ["MISC"],
 }
-
+
 # Create a mapping from each specific entity label to its category
 LABEL_TO_CATEGORY_MAP = {
     label: category for category, labels in ENTITY_LABELS_CATEGORIZED.items() for label in labels
 }
-
-
+
 @st.cache_resource
 def load_ner_model():
     """
@@ -67,8 +127,7 @@ def load_ner_model():
     except Exception as e:
         st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
         st.stop()
-
-
+
 @st.cache_resource
 def load_encryption_key():
     """
@@ -81,7 +140,7 @@ def load_encryption_key():
         key_str = os.environ.get("FERNET_KEY")
         if not key_str:
             raise ValueError("FERNET_KEY environment variable not set. Cannot perform encryption/decryption.")
-
+
         # Fernet key must be bytes, so encode the string
         key_bytes = key_str.encode('utf-8')
         return Fernet(key_bytes)
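load_encryption_key() expects FERNET_KEY to already hold a valid key. A one-off sketch for producing one, using the cryptography library's own generator (Fernet keys are url-safe base64-encoded 32-byte values):

    from cryptography.fernet import Fernet

    key = Fernet.generate_key()   # bytes
    print(key.decode("utf-8"))    # paste this value into the FERNET_KEY env var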
@@ -95,19 +154,17 @@ def load_encryption_key():
     except Exception as e:
         st.error(f"An unexpected error occurred while loading encryption key: {e}. Please check your key format and environment settings.")
         st.stop()
-
+
 # Initialize the Fernet cipher instance globally (cached)
 fernet = load_encryption_key()
-
-
+
 def encrypt_text(text_content: str) -> bytes:
     """
     Encrypts a string using the loaded Fernet cipher.
     The input string is first encoded to UTF-8 bytes.
     """
     return fernet.encrypt(text_content.encode('utf-8'))
-
-
+
 def decrypt_text(encrypted_bytes: bytes) -> str | None:
     """
     Decrypts bytes using the loaded Fernet cipher.
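A self-contained round-trip through the two helpers above, assuming a freshly generated key in place of the cached load_encryption_key():

    from cryptography.fernet import Fernet

    fernet = Fernet(Fernet.generate_key())  # stand-in for load_encryption_key()
    token = fernet.encrypt("Søren works at Ørsted.".encode("utf-8"))
    assert fernet.decrypt(token).decode("utf-8") == "Søren works at Ørsted."

Fernet tokens are authenticated, which is why decrypt_text() can reasonably treat any decryption failure as possible tampering or a wrong key.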
@@ -118,11 +175,11 @@ def decrypt_text(encrypted_bytes: bytes) -> str | None:
     except Exception as e:
         st.error(f"Decryption failed. This might indicate data tampering or an incorrect encryption key. Error: {e}")
         return None
-
+
 # --- UI Elements ---
 st.subheader("Scandinavian JSON Entity Finder", divider="orange")
 st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
-
+
 expander = st.expander("**Important notes on the Scandinavian JSON Entity Finder**")
 expander.write('''
 **Named Entities:** This Scandinavian JSON Entity Finder predicts four
@@ -130,55 +187,66 @@ expander.write('''
 miscellaneous”). Results are presented in an easy-to-read table, visualized in
 an interactive tree map, pie chart, and bar chart, and are available for
 download along with a Glossary of tags.
-
+
 **How to Use:** Upload your JSON file. Then, click the 'Results' button
 to extract and tag entities in your text data.
-
+
 **Usage Limits:** You can request results up to 10 times.
-
+
 **Language settings:** Please check and adjust the language settings in
 your computer, so the Danish, Swedish, Norwegian, Icelandic and Faroese
 characters are handled properly in your downloaded file.
-
+
 **Customization:** To change the app's background color to white or
 black, click the three-dot menu on the right-hand side of your app, go to
 Settings and then Choose app theme, colors and fonts.
-
+
 **Technical issues:** If your connection times out, please refresh the
 page or reopen the app's URL.
-
+
 For any errors or inquiries, please contact us at [email protected]
 ''')
-
+
 with st.sidebar:
-    container = st.container(border=True)
-    container.write("**Named Entity Recognition (NER)** is the task of "
-                    "extracting and tagging entities in text data. Entities can be persons, "
-                    "organizations, locations, countries, products, events etc.")
-    st.subheader("Related NER Web Apps", divider="orange")
-    st.link_button("Multilingual PDF & DOCX Entity Finder",
-                   "https://nlpblogs.com/shop/named-entity-recognition-ner/multilingual-pdf-docx-entity-finder/",
+
+
+    # --- Added Persistent History Display ---
+    st.subheader("Your File Upload History", divider="orange")
+    if st.session_state['uploaded_files_history']:
+        history_to_display = st.session_state['uploaded_files_history']
+        history_df = pd.DataFrame(history_to_display)
+        st.dataframe(history_df, use_container_width=True, hide_index=True)
+        # Add a clear history button
+        if st.button("Clear File History", help="This will permanently delete the file history from the application."):
+            clear_history_data()
+    else:
+        st.info("You have not uploaded any files yet.")
+
+
+    st.subheader("Build your own NER Web App in a minute without writing a single line of code.", divider="orange")
+    st.link_button("NER File Builder",
+                   "https://nlpblogs.com/shop/named-entity-recognition-ner/ner-file-builder/",
                    type="primary")
-
+
 uploaded_file = st.file_uploader("Choose a JSON file", type=["json"])
-
+
 # Initialize text for the current run outside the if uploaded_file block
 # This will be populated if a file is uploaded, otherwise it remains None
 current_run_text = None
-
+
 if uploaded_file is not None:
     try:
         # Read the content as bytes first, then decode for JSON parsing
         file_contents_bytes = uploaded_file.read()
-
+
         # Reset the file pointer after reading, so json.load can read from the beginning
         uploaded_file.seek(0)
         dados = json.load(uploaded_file)
-
+
         # Attempt to convert JSON to DataFrame and extract text
         try:
             st.session_state['json_dataframe'] = pd.DataFrame(dados)
-
+
             # Concatenate all content into a single string for NER
             df_string_representation = st.session_state['json_dataframe'].to_string(index=False, header=False)
             # Simple regex to remove non-alphanumeric characters but keep spaces and periods
@@ -196,32 +264,42 @@ if uploaded_file is not None:
             if isinstance(dados, list):
                 for item in dados:
                     if isinstance(item, str):
-                        extracted_texts_list.append(item)
+                        extracted_texts_list.append(item)
                     elif isinstance(item, dict):
                         # Recursively get string values from dicts in a list
                         for val in item.values():
                             if isinstance(val, str):
-                                extracted_texts_list.append(val)
+                                extracted_texts_list.append(val)
                             elif isinstance(val, list):
                                 for sub_val in val:
                                     if isinstance(sub_val, str):
-                                        extracted_texts_list.append(sub_val)
+                                        extracted_texts_list.append(sub_val)
             elif isinstance(dados, dict):
                 # Get string values from a dictionary
                 for value in dados.values():
                     if isinstance(value, str):
-                        extracted_texts_list.append(value)
+                        extracted_texts_list.append(value)
                     elif isinstance(value, list):
                         for sub_val in value:
                             if isinstance(sub_val, str):
-                                extracted_texts_list.append(sub_val)
+                                extracted_texts_list.append(sub_val)
             if extracted_texts_list:
                 current_run_text = " ".join(extracted_texts_list).strip()
             else:
                 st.warning("No string text could be extracted from the JSON for analysis.")
                 current_run_text = None
-
+
             if current_run_text:
+                # --- ADDING TO UPLOAD HISTORY ---
+                new_upload_entry = {
+                    "filename": uploaded_file.name,
+                    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                }
+                # Append the new file to the session state history
+                st.session_state['uploaded_files_history'].append(new_upload_entry)
+                # Save the updated history to the persistent file
+                save_history(st.session_state['uploaded_files_history'])
+                # --- END OF HISTORY ADDITION ---
                 # --- ENCRYPT THE EXTRACTED TEXT BEFORE STORING IN SESSION STATE ---
                 encrypted_text_bytes = encrypt_text(current_run_text)
                 st.session_state['encrypted_extracted_text'] = encrypted_text_bytes
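The extraction above only descends two levels (lists of dicts, dicts of lists). If deeper nesting mattered, a hypothetical fully recursive walk would cover every case in a few lines:

    def collect_strings(node, out):
        # Depth-first walk that keeps only str leaves
        if isinstance(node, str):
            out.append(node)
        elif isinstance(node, list):
            for item in node:
                collect_strings(item, out)
        elif isinstance(node, dict):
            for value in node.values():
                collect_strings(value, out)
        return out

    print(" ".join(collect_strings({"a": ["Oslo", {"b": "Reykjavík"}]}, [])))
    # -> Oslo Reykjavík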
@@ -242,39 +320,42 @@ if uploaded_file is not None:
         st.error(f"An unexpected error occurred during file processing: {e}")
         st.session_state['encrypted_extracted_text'] = None
         st.session_state['json_dataframe'] = None
-
+
 # --- Results Button and Processing Logic ---
 if st.button("Results"):
     start_time_overall = time.time()  # Start time for overall processing
     if not comet_initialized:
         st.warning("Comet ML not initialized. Check environment variables if you wish to log data.")
-
+
+    # Check attempts limit BEFORE running the model
     if st.session_state['file_upload_attempts'] >= max_attempts:
         st.error(f"You have requested results {max_attempts} times. You have reached your daily request limit.")
         st.stop()
-
+
     # --- DECRYPT THE TEXT BEFORE PASSING TO NER MODEL ---
     text_for_ner = None
     if st.session_state['encrypted_extracted_text'] is not None:
         text_for_ner = decrypt_text(st.session_state['encrypted_extracted_text'])
-
+
     if text_for_ner is None or not text_for_ner.strip():
         st.warning("No extractable text content available for analysis. Please upload a valid JSON file.")
         st.stop()
-
+
+    # Increment the attempts counter and save it to the persistent file
     st.session_state['file_upload_attempts'] += 1
-
+    save_attempts(st.session_state['file_upload_attempts'])
+
     with st.spinner("Analyzing text...", show_time=True):
         model = load_ner_model()
-
+
         # Measure NER model processing time
         start_time_ner = time.time()
         text_entities = model(text_for_ner)  # Use the decrypted text
         end_time_ner = time.time()
         ner_processing_time = end_time_ner - start_time_ner
-
+
         df = pd.DataFrame(text_entities)
-
+
         if 'word' in df.columns:
             # Ensure 'word' column is string type before applying regex
             if df['word'].dtype == 'object':
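pd.DataFrame(text_entities) relies on the model returning one dict per entity. For a transformers token-classification pipeline with aggregation enabled, the shape looks roughly like this (the values below are illustrative, not real model output):

    import pandas as pd

    text_entities = [
        {"entity_group": "PER", "score": 0.998, "word": "Astrid Lindgren", "start": 0, "end": 15},
        {"entity_group": "LOC", "score": 0.991, "word": "Stockholm", "start": 26, "end": 35},
    ]
    df = pd.DataFrame(text_entities)
    print(list(df.columns))  # ['entity_group', 'score', 'word', 'start', 'end']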
@@ -285,19 +366,19 @@ if st.button("Results"):
         else:
             st.error("The 'word' column does not exist in the DataFrame. Cannot perform cleaning.")
             st.stop()  # Stop execution if the column is missing
-
+
         # Replace empty strings with 'Unknown' and drop rows with NaN after cleaning
         df = df.replace('', 'Unknown').dropna()
-
+
         if df.empty:
             st.warning("No entities were extracted from the uploaded text.")
             st.stop()
-
+
         # --- Add 'category' column to the DataFrame based on the grouped labels ---
         df['category'] = df['entity_group'].map(LABEL_TO_CATEGORY_MAP)
         # Handle cases where an entity_group might not have a category
         df['category'] = df['category'].fillna('Uncategorized')
-
+
         if comet_initialized:
             experiment = Experiment(
                 api_key=COMET_API_KEY,
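The .map()/.fillna() pair above is what routes labels missing from the lookup into 'Uncategorized'. A compact demonstration with a toy frame:

    import pandas as pd

    LABEL_TO_CATEGORY_MAP = {"PER": "Persons", "LOC": "Locations"}
    df = pd.DataFrame({"entity_group": ["PER", "LOC", "DATE"]})
    df["category"] = df["entity_group"].map(LABEL_TO_CATEGORY_MAP).fillna("Uncategorized")
    print(df["category"].tolist())  # ['Persons', 'Locations', 'Uncategorized']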
@@ -307,41 +388,41 @@ if st.button("Results"):
             experiment.log_parameter("input_text_length", len(text_for_ner))
             experiment.log_table("predicted_entities", df)
             experiment.log_metric("ner_processing_time_seconds", ner_processing_time)
-
-
+
+
         # --- Display Results ---
         st.subheader("Extracted Entities", divider="rainbow")
         properties = {"border": "2px solid gray", "color": "blue", "font-size": "16px"}
         df_styled = df.style.set_properties(**properties)
         st.dataframe(df_styled, use_container_width=True)
-
+
         with st.expander("See Glossary of tags"):
             st.write('''
             '**word**': ['entity extracted from your text data']
-
+
             '**score**': ['accuracy score; how accurately a tag has been assigned to
             a given entity']
-
+
             '**entity_group**': ['label (tag) assigned to a given extracted entity']
-
+
             '**start**': ['index of the start of the corresponding entity']
-
+
             '**end**': ['index of the end of the corresponding entity']
-
+
             '**category**': ['the broader category the entity belongs to']
             ''')
-
+
         st.subheader("Grouped entities", divider="orange")
-
+
         # Get unique categories and sort them for consistent tab order
         unique_categories = sorted(df['category'].unique())
        tabs_per_row = 4  # Adjust as needed for better layout
-
+
         # Loop through categories in chunks to create rows of tabs
         for i in range(0, len(unique_categories), tabs_per_row):
             current_row_categories = unique_categories[i : i + tabs_per_row]
             tabs = st.tabs(current_row_categories)
-
+
             for j, category in enumerate(current_row_categories):
                 with tabs[j]:
                     df_filtered = df[df["category"] == category]
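The row-of-tabs loop above is ordinary list slicing; stripped of Streamlit it reduces to:

    unique_categories = ["Locations", "Miscellaneous", "Organizations", "Persons", "Uncategorized"]
    tabs_per_row = 4
    for i in range(0, len(unique_categories), tabs_per_row):
        print(unique_categories[i : i + tabs_per_row])
    # ['Locations', 'Miscellaneous', 'Organizations', 'Persons']
    # ['Uncategorized']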
@@ -359,26 +440,27 @@ if st.button("Results"):
                         'category': [category]
                     }), hide_index=True)
                     st.divider()
-
+
         # --- Visualizations ---
         st.subheader("Tree map", divider="orange")
-        fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'entity_group', 'word'],
+        fig_treemap = px.treemap(df,
+                                 path=[px.Constant("all"), 'category', 'entity_group', 'word'],
                                  values='score', color='category',
                                  color_discrete_map={
-                                     'Persons': 'blue',
-                                     'Locations': 'green',
-                                     'Organizations': 'red',
-                                     'Miscellaneous': 'purple',
-                                     'Uncategorized': 'gray'
+                                     'Persons': 'blue',
+                                     'Locations': 'green',
+                                     'Organizations': 'red',
+                                     'Miscellaneous': 'purple',
+                                     'Uncategorized': 'gray'
                                  })
         fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
         st.plotly_chart(fig_treemap)
         if comet_initialized:
             experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap")
-
+
         # Group by category and entity_group to get counts for pie and bar charts
         grouped_counts = df.groupby('category').size().reset_index(name='count')
-
+
         col1, col2 = st.columns(2)
         with col1:
             st.subheader("Pie Chart", divider="orange")
@@ -388,7 +470,7 @@ if st.button("Results"):
             st.plotly_chart(fig_pie)
             if comet_initialized:
                 experiment.log_figure(figure=fig_pie, figure_name="category_pie_chart")
-
+
         with col2:
             st.subheader("Bar Chart", divider="orange")
             fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True,
@@ -396,7 +478,7 @@ if st.button("Results"):
             st.plotly_chart(fig_bar)
             if comet_initialized:
                 experiment.log_figure(figure=fig_bar, figure_name="category_bar_chart")
-
+
         # --- Downloadable Content ---
         dfa = pd.DataFrame(
             data={
@@ -415,7 +497,7 @@ if st.button("Results"):
         with zipfile.ZipFile(buf, "w") as myzip:
             myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
             myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))
-
+
         with stylable_container(
             key="download_button",
             css_styles="""button { background-color: yellow; border: 1px solid black; padding: 5px; color: black; }""",
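The download bundle never touches disk: the CSVs are written into a zip backed by an in-memory buffer, and the buffer's bytes go to the download button. The pattern in isolation:

    import io
    import zipfile
    import pandas as pd

    df = pd.DataFrame({"word": ["Oslo"], "entity_group": ["LOC"]})
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as myzip:
        myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
    data = buf.getvalue()  # bytes ready for st.download_button(..., data=data)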
@@ -428,13 +510,13 @@ if st.button("Results"):
             )
             if comet_initialized:
                 experiment.log_asset(buf.getvalue(), file_name="downloadable_results.zip")
-
+
         st.divider()
         if comet_initialized:
             experiment.end()
-
+
         end_time_overall = time.time()
         elapsed_time_overall = end_time_overall - start_time_overall
         st.info(f"Results processed in **{elapsed_time_overall:.2f} seconds**.")
-
+
         st.write(f"Number of times you requested results: **{st.session_state['file_upload_attempts']}/{max_attempts}**")