m7n committed
Commit 2b3a335 · 1 Parent(s): 1743cbd

Refactor data processing in app.py to improve handling of dictionary-like strings in DataFrame and update file upload options to support both CSV and Pickle formats.

Files changed (1)
  1. app.py +25 -38
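For context: pandas writes list- and dict-valued cells to CSV as their Python `repr`, so an uploaded CSV hands them back as plain strings. A minimal sketch of the round-trip this commit has to undo (the values and temp file are illustrative, not taken from app.py):

```python
import ast
import pandas as pd

# A list-valued column written to CSV comes back as its string repr.
df = pd.DataFrame({'referenced_works': [['W123', 'W456']]})
df.to_csv('tmp.csv', index=False)
cell = pd.read_csv('tmp.csv')['referenced_works'].iloc[0]

print(type(cell))              # <class 'str'>
print(ast.literal_eval(cell))  # ['W123', 'W456'] -- parsed back into a list
```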
app.py CHANGED
@@ -24,13 +24,11 @@ import os
 # print("Updated Numba threads:", numba.get_num_threads())
 
 # import datamapplot.medoids
-#tensorflow==2.19.0
-#keras<3
+
 
 # print(help(datamapplot.medoids))
 
-# Numba used to be set to: ==0.58.1. Does it work with later versions?
-# Move pynndescent up from 0.5.12 to 0.5.13
+
 
 from pathlib import Path
 from datetime import datetime
@@ -49,9 +47,6 @@ import gradio as gr
 
 print(f"Gradio version: {gr.__version__}")
 
-# import torch
-# torch.set_num_threads(1)
-
 import subprocess
 
 def print_datamapplot_version():
@@ -188,7 +183,6 @@ def no_op_decorator(func):
 
 
 if is_running_in_hf_space():
-    print("Running in HF Space")
     @spaces.GPU(duration=30)
     def create_embeddings_30(texts_to_embedd):
         """Create embeddings for the input texts using the loaded model."""
@@ -211,7 +205,6 @@ if is_running_in_hf_space():
 
 
 else:
-    print("Running locally")
     def create_embeddings(texts_to_embedd):
         """Create embeddings for the input texts using the loaded model."""
         return model.encode(texts_to_embedd, show_progress_bar=True, batch_size=192)
@@ -287,26 +280,26 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
         records_df = pd.read_csv(csv_upload)
         filename = os.path.splitext(os.path.basename(csv_upload))[0]
 
-        # Convert *every* cell that looks like a serialized list/dict
-        def _try_parse_obj(cell):
-            if isinstance(cell, str):
-                txt = cell.strip()
-                if (txt.startswith('{') and txt.endswith('}')) or (txt.startswith('[') and txt.endswith(']')):
-                    # Try JSON first
-                    try:
-                        return json.loads(txt)
-                    except Exception:
-                        pass
-                    # Fallback to Python-repr (single quotes etc.)
-                    try:
-                        return ast.literal_eval(txt)
-                    except Exception:
-                        pass
-            return cell
-
-        records_df = records_df.map(_try_parse_obj)
-        print(records_df.head())
-
+        # Process dictionary-like strings in the DataFrame
+        for column in records_df.columns:
+            # Check if the column contains dictionary-like strings
+            if records_df[column].dtype == 'object':
+                try:
+                    # Use a sample value to check if it looks like a dictionary or list
+                    sample_value = records_df[column].dropna().iloc[0] if not records_df[column].dropna().empty else None
+                    # Add type checking before using startswith
+                    if isinstance(sample_value, str) and (sample_value.startswith('{') or sample_value.startswith('[')):
+                        # Try to convert strings to Python objects using ast.literal_eval
+                        records_df[column] = records_df[column].apply(
+                            lambda x: ast.literal_eval(x) if isinstance(x, str) and (
+                                (x.startswith('{') and x.endswith('}')) or
+                                (x.startswith('[') and x.endswith(']'))
+                            ) else x
+                        )
+                except (ValueError, SyntaxError, TypeError) as e:
+                    # If conversion fails, keep as string
+                    print(f"Could not convert column {column} to Python objects: {e}")
+
     elif file_extension == '.pkl':
         # Read the pickle file
         with open(csv_upload, 'rb') as f:
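One behavioral note on this hunk: the removed `_try_parse_obj` tried `json.loads` first and fell back to `ast.literal_eval`, while the new per-column loop uses `ast.literal_eval` alone. Cells containing JSON-only literals therefore no longer parse and stay as strings; a small illustration (values are made up):

```python
import ast
import json

python_repr = "{'id': 'W123', 'year': 2020}"  # Python-style repr, single quotes
json_text = '{"id": "W123", "open": true}'    # JSON-style, lowercase true

print(ast.literal_eval(python_repr))  # {'id': 'W123', 'year': 2020}
print(json.loads(json_text))          # {'id': 'W123', 'open': True}

try:
    ast.literal_eval(json_text)  # 'true' is not a Python literal
except ValueError as err:
    print("literal_eval rejects JSON booleans:", err)
```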
@@ -442,7 +435,6 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
 
     basedata_df['color'] = '#ced4d211'
 
-
     if not plot_time_checkbox:
         records_df['color'] = '#5e2784'
     else:
@@ -532,8 +524,7 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
     # Export relevant columns
     export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
     export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
-    export_df['referenced_works'] = [x if isinstance(x, str) else ', '.join(x) if isinstance(x, (list, tuple)) and not pd.isna(x) else '' for x in records_df['referenced_works']]
-
+    export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
     if locally_approximate_publication_date_checkbox and plot_time_checkbox:
         export_df['approximate_publication_year'] = local_years
     export_df.to_csv(csv_file_path, index=False)
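Also worth flagging: the simplified `', '.join(x)` assumes every `referenced_works` cell is already a list of strings, which the new parsing loop provides for CSV uploads. If that invariant breaks, it raises `TypeError` on a `NaN` float and silently splits an already-flat string into characters, cases the removed expression guarded against. A defensive sketch if the invariant can't be trusted (the helper name is ours, not from app.py):

```python
def join_refs(cell):
    """Render a referenced_works cell as a comma-separated string."""
    if isinstance(cell, str):
        return cell                # already flat; joining would split characters
    if isinstance(cell, (list, tuple)):
        return ', '.join(cell)     # the normal, parsed case
    return ''                      # NaN or anything unexpected
```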
@@ -790,8 +781,8 @@ with gr.Blocks(theme=theme, css="""
             gr.Markdown("### Upload Your Own Data")
             csv_upload = gr.File(
                 file_count="single",
-                label="Upload your own CSV file downloaded via pyalex.",
-                file_types=[".csv"],
+                label="Upload your own CSV or Pickle file downloaded via pyalex.",
+                file_types=[".csv", ".pkl"],
             )
 
 
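The upload widget now advertises both formats; the `predict` hunks above dispatch on the file extension, opening `.pkl` uploads in binary mode. A self-contained sketch of that pattern (the helper is hypothetical and assumes the pickle contains a DataFrame):

```python
import os
import pickle
import pandas as pd

def load_records(upload_path: str) -> pd.DataFrame:
    # Dispatch on extension, mirroring the .csv / .pkl branches in predict.
    ext = os.path.splitext(upload_path)[1].lower()
    if ext == '.csv':
        return pd.read_csv(upload_path)
    if ext == '.pkl':
        with open(upload_path, 'rb') as f:
            return pickle.load(f)  # assumed: a pickled DataFrame
    raise ValueError(f"Unsupported upload type: {ext}")
```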
@@ -835,10 +826,6 @@ with gr.Blocks(theme=theme, css="""
 
     3. Finally, the labels we're using for the regions of this plot are created from OpenAlex's own labels of sub-disciplines. They give a rough indication of the papers that could be expected in this broad area of the map, but they are not necessarily the perfect label for the articles that are precisely below them. They are just located at the median point of a usually much larger, much broader, and fuzzier category, so they should always be taken with quite a big grain of salt.
 
-    ## I want to use my own data!
-
-    Sure! You can upload csv-files produced by downloading things from OpenAlex using the pyalex package. You will need to provide at least the columns `id`, `title`, `publication_year`, `doi`, `abstract` or `abstract_inverted_index`, `referenced_works` and `primary_topic`.
-
     </div>
     """)
 
 