Spaces:
Running
on
Zero
Running
on
Zero
Refactor app.py to improve handling of serialized data in DataFrame uploads, replacing previous parsing logic with a more robust JSON and Python-repr parsing function. Update file upload instructions to clarify supported formats.
Browse files
- app.py +27 -27
- openalex_utils.py +3 -2
app.py
CHANGED
@@ -306,31 +306,25 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
306 |
records_df = pd.read_csv(csv_upload)
|
307 |
filename = os.path.splitext(os.path.basename(csv_upload))[0]
|
308 |
|
309 |
-
#
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
#
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
elif file_extension == '.pkl':
|
330 |
-
# Read the pickle file
|
331 |
-
with open(csv_upload, 'rb') as f:
|
332 |
-
records_df = pickle.load(f)
|
333 |
-
filename = os.path.splitext(os.path.basename(csv_upload))[0]
|
334 |
|
335 |
else:
|
336 |
error_message = f"Error: Unsupported file type. Please upload a CSV or PKL file."
|
@@ -807,10 +801,12 @@ with gr.Blocks(theme=theme, css="""
|
|
807 |
gr.Markdown("### Upload Your Own Data")
|
808 |
csv_upload = gr.File(
|
809 |
file_count="single",
|
810 |
-
label="Upload your own CSV
|
811 |
-
file_types=[".csv"
|
812 |
)
|
813 |
|
|
|
|
|
814 |
|
815 |
with gr.Column(scale=2):
|
816 |
html = gr.HTML(
|
@@ -852,6 +848,10 @@ with gr.Blocks(theme=theme, css="""
|
|
852 |
|
853 |
3. Finally, the labels we're using for the regions of this plot are created from OpenAlex's own labels of sub-disciplines. They give a rough indication of the papers that could be expected in this broad area of the map, but they are not necessarily the perfect label for the articles that are precisely below them. They are just located at the median point of a usually much larger, much broader, and fuzzier category, so they should always be taken with quite a big grain of salt.
|
854 |
|
|
|
|
|
|
|
|
|
855 |
</div>
|
856 |
""")
|
857 |
|
|
|
306 |
records_df = pd.read_csv(csv_upload)
|
307 |
filename = os.path.splitext(os.path.basename(csv_upload))[0]
|
308 |
|
309 |
+
# Convert *every* cell that looks like a serialized list/dict
|
310 |
+
def _try_parse_obj(cell):
|
311 |
+
if isinstance(cell, str):
|
312 |
+
txt = cell.strip()
|
313 |
+
if (txt.startswith('{') and txt.endswith('}')) or (txt.startswith('[') and txt.endswith(']')):
|
314 |
+
# Try JSON first
|
315 |
+
try:
|
316 |
+
return json.loads(txt)
|
317 |
+
except Exception:
|
318 |
+
pass
|
319 |
+
# Fallback to Python-repr (single quotes etc.)
|
320 |
+
try:
|
321 |
+
return ast.literal_eval(txt)
|
322 |
+
except Exception:
|
323 |
+
pass
|
324 |
+
return cell
|
325 |
+
|
326 |
+
records_df = records_df.map(_try_parse_obj)
|
327 |
+
print(records_df.head())
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
|
329 |
else:
|
330 |
error_message = f"Error: Unsupported file type. Please upload a CSV or PKL file."
|
|
|
801 |
gr.Markdown("### Upload Your Own Data")
|
802 |
csv_upload = gr.File(
|
803 |
file_count="single",
|
804 |
+
label="Upload your own CSV file downloaded via pyalex.",
|
805 |
+
file_types=[".csv"],
|
806 |
)
|
807 |
|
808 |
+
|
809 |
+
|
810 |
|
811 |
with gr.Column(scale=2):
|
812 |
html = gr.HTML(
|
|
|
848 |
|
849 |
3. Finally, the labels we're using for the regions of this plot are created from OpenAlex's own labels of sub-disciplines. They give a rough indication of the papers that could be expected in this broad area of the map, but they are not necessarily the perfect label for the articles that are precisely below them. They are just located at the median point of a usually much larger, much broader, and fuzzier category, so they should always be taken with quite a big grain of salt.
|
850 |
|
851 |
+
## I want to use my own data!
|
852 |
+
|
853 |
+
Sure! You can upload csv-files produced by downloading things from OpenAlex using the pyalex package. You will need to provide at least the columns `id`, `title`, `publication_year`, `doi`, `abstract` or `abstract_inverted_index`, `referenced_works` and `primary_topic`.
|
854 |
+
|
855 |
</div>
|
856 |
""")
|
857 |
|
openalex_utils.py
CHANGED
@@ -70,8 +70,9 @@ def invert_abstract(inv_index):
|
|
70 |
l_inv = [(w, p) for w, pos in inv_index.items() for p in pos]
|
71 |
return " ".join(w for w, _ in sorted(l_inv, key=lambda x: x[1]))
|
72 |
else:
|
73 |
-
return " "
|
74 |
-
|
|
|
75 |
def get_pub(x):
|
76 |
"""Extract publication name from record."""
|
77 |
try:
|
|
|
70 |
l_inv = [(w, p) for w, pos in inv_index.items() for p in pos]
|
71 |
return " ".join(w for w, _ in sorted(l_inv, key=lambda x: x[1]))
|
72 |
else:
|
73 |
+
return " "
|
74 |
+
|
75 |
+
|
76 |
def get_pub(x):
|
77 |
"""Extract publication name from record."""
|
78 |
try:
|