m7n committed
Commit c3c41c1 · 1 Parent(s): a236b7f

Refactor app.py to improve handling of serialized data in DataFrame uploads, replacing previous parsing logic with a more robust JSON and Python-repr parsing function. Update file upload instructions to clarify supported formats.
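In short, the new helper tries `json.loads` first and falls back to `ast.literal_eval` for Python-repr strings (single quotes, tuples, etc.), leaving everything else untouched. A minimal, self-contained sketch of that behaviour on illustrative inputs (the sample DataFrame below is made up; `DataFrame.map` requires pandas ≥ 2.1):

```python
import ast
import json

import pandas as pd


def _try_parse_obj(cell):
    """Return cell parsed as JSON, then as a Python literal; otherwise unchanged."""
    if isinstance(cell, str):
        txt = cell.strip()
        if (txt.startswith('{') and txt.endswith('}')) or (txt.startswith('[') and txt.endswith(']')):
            try:
                return json.loads(txt)        # double-quoted JSON, e.g. '{"id": "W1"}'
            except Exception:
                pass
            try:
                return ast.literal_eval(txt)  # Python repr, e.g. "{'id': 'W1'}"
            except Exception:
                pass
    return cell


# Made-up cells: one JSON dict, one Python-repr dict, one plain string.
df = pd.DataFrame({"primary_topic": ['{"display_name": "NLP"}',
                                     "{'display_name': 'NLP'}",
                                     "not serialized"]})
print(df.map(_try_parse_obj)["primary_topic"].map(type).tolist())
# [<class 'dict'>, <class 'dict'>, <class 'str'>]
```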

Files changed (2)
  1. app.py +27 -27
  2. openalex_utils.py +3 -2
app.py CHANGED
@@ -306,31 +306,25 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
         records_df = pd.read_csv(csv_upload)
         filename = os.path.splitext(os.path.basename(csv_upload))[0]
 
-        # Process dictionary-like strings in the DataFrame
-        for column in records_df.columns:
-            # Check if the column contains dictionary-like strings
-            if records_df[column].dtype == 'object':
-                try:
-                    # Use a sample value to check if it looks like a dictionary or list
-                    sample_value = records_df[column].dropna().iloc[0] if not records_df[column].dropna().empty else None
-                    # Add type checking before using startswith
-                    if isinstance(sample_value, str) and (sample_value.startswith('{') or sample_value.startswith('[')):
-                        # Try to convert strings to Python objects using ast.literal_eval
-                        records_df[column] = records_df[column].apply(
-                            lambda x: ast.literal_eval(x) if isinstance(x, str) and (
-                                (x.startswith('{') and x.endswith('}')) or
-                                (x.startswith('[') and x.endswith(']'))
-                            ) else x
-                        )
-                except (ValueError, SyntaxError, TypeError) as e:
-                    # If conversion fails, keep as string
-                    print(f"Could not convert column {column} to Python objects: {e}")
-
-    elif file_extension == '.pkl':
-        # Read the pickle file
-        with open(csv_upload, 'rb') as f:
-            records_df = pickle.load(f)
-        filename = os.path.splitext(os.path.basename(csv_upload))[0]
+        # Convert *every* cell that looks like a serialized list/dict
+        def _try_parse_obj(cell):
+            if isinstance(cell, str):
+                txt = cell.strip()
+                if (txt.startswith('{') and txt.endswith('}')) or (txt.startswith('[') and txt.endswith(']')):
+                    # Try JSON first
+                    try:
+                        return json.loads(txt)
+                    except Exception:
+                        pass
+                    # Fallback to Python-repr (single quotes etc.)
+                    try:
+                        return ast.literal_eval(txt)
+                    except Exception:
+                        pass
+            return cell
+
+        records_df = records_df.map(_try_parse_obj)
+        print(records_df.head())
 
     else:
         error_message = f"Error: Unsupported file type. Please upload a CSV or PKL file."
@@ -807,10 +801,12 @@ with gr.Blocks(theme=theme, css="""
            gr.Markdown("### Upload Your Own Data")
            csv_upload = gr.File(
                file_count="single",
-               label="Upload your own CSV or Pickle file downloaded via pyalex.",
-               file_types=[".csv", ".pkl"],
+               label="Upload your own CSV file downloaded via pyalex.",
+               file_types=[".csv"],
            )
 
+
+
 
        with gr.Column(scale=2):
            html = gr.HTML(
@@ -852,6 +848,10 @@ with gr.Blocks(theme=theme, css="""
 
            3. Finally, the labels we're using for the regions of this plot are created from OpenAlex's own labels of sub-disciplines. They give a rough indication of the papers that could be expected in this broad area of the map, but they are not necessarily the perfect label for the articles that are precisely below them. They are just located at the median point of a usually much larger, much broader, and fuzzier category, so they should always be taken with quite a big grain of salt.
 
+           ## I want to use my own data!
+
+           Sure! You can upload csv-files produced by downloading things from OpenAlex using the pyalex package. You will need to provide at least the columns `id`, `title`, `publication_year`, `doi`, `abstract` or `abstract_inverted_index`, `referenced_works` and `primary_topic`.
+
            </div>
            """)
 
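The documentation added in the last hunk lists the columns an uploaded CSV needs. A hedged sketch of producing such a file with pyalex and pandas; the search query and output filename are placeholders, and the column names follow the OpenAlex Works schema (individual records may lack optional fields):

```python
# Sketch only: export OpenAlex works to a CSV the app can ingest.
import pandas as pd
from pyalex import Works

records = Works().search("cultural evolution").get()  # first page of results; placeholder query

cols = ["id", "title", "publication_year", "doi",
        "abstract_inverted_index", "referenced_works", "primary_topic"]
records_df = pd.DataFrame(records)[cols]

# to_csv serializes nested dicts/lists as strings; the new _try_parse_obj
# helper in app.py parses them back after upload.
records_df.to_csv("my_openalex_sample.csv", index=False)
```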
openalex_utils.py CHANGED
@@ -70,8 +70,9 @@ def invert_abstract(inv_index):
        l_inv = [(w, p) for w, pos in inv_index.items() for p in pos]
        return " ".join(w for w, _ in sorted(l_inv, key=lambda x: x[1]))
    else:
-       return " " # fallback
-
+       return " "
+
+
 def get_pub(x):
    """Extract publication name from record."""
    try:
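For reference, `invert_abstract` rebuilds an abstract from OpenAlex's `abstract_inverted_index` (word → positions). A standalone sketch of the logic in the hunk above; the `if` guard is assumed, since the diff only shows the body and the `else` branch:

```python
def invert_abstract(inv_index):
    # Assumed guard: the hunk only shows the body and the else branch.
    if isinstance(inv_index, dict):
        # Flatten {word: [positions]} into (word, position) pairs, then sort by position.
        l_inv = [(w, p) for w, pos in inv_index.items() for p in pos]
        return " ".join(w for w, _ in sorted(l_inv, key=lambda x: x[1]))
    else:
        return " "  # fallback for missing abstracts


print(invert_abstract({"Maps": [0], "of": [1], "science": [2]}))  # -> "Maps of science"
```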