Refactor data processing in app.py to improve handling of dictionary-like strings in the DataFrame, and update the file upload options to support both CSV and Pickle formats.
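Background for the first change: when a DataFrame with list- or dict-valued columns (e.g. referenced_works or primary_topic) is written to CSV and read back with pandas, those values come back as their string representations, so they have to be parsed before use. A minimal self-contained sketch of the problem and the ast.literal_eval fix (illustrative only, not code from app.py):

import ast
import io

import pandas as pd

# A CSV round-trip turns the list into its string repr
csv_data = io.StringIO('id,referenced_works\nW1,"[\'W2\', \'W3\']"\n')
df = pd.read_csv(csv_data)
print(type(df.loc[0, 'referenced_works']))  # <class 'str'>

# ast.literal_eval parses Python literals safely (no arbitrary code execution)
df['referenced_works'] = df['referenced_works'].apply(ast.literal_eval)
print(type(df.loc[0, 'referenced_works']))  # <class 'list'>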
app.py CHANGED

@@ -24,13 +24,11 @@ import os
 # print("Updated Numba threads:", numba.get_num_threads())
 
 # import datamapplot.medoids
 
-#keras<3
 
 # print(help(datamapplot.medoids))
 
 
-# Move pynndescent up from 0.5.12 to 0.5.13
 
 from pathlib import Path
 from datetime import datetime
@@ -49,9 +47,6 @@ import gradio as gr
 
 print(f"Gradio version: {gr.__version__}")
 
-# import torch
-# torch.set_num_threads(1)
-
 import subprocess
 
 def print_datamapplot_version():
@@ -188,7 +183,6 @@ def no_op_decorator(func):
 
 
 if is_running_in_hf_space():
-    print("Running in HF Space")
     @spaces.GPU(duration=30)
     def create_embeddings_30(texts_to_embedd):
         """Create embeddings for the input texts using the loaded model."""
@@ -211,7 +205,6 @@ if is_running_in_hf_space():
 
 
 else:
-    print("Running locally")
     def create_embeddings(texts_to_embedd):
         """Create embeddings for the input texts using the loaded model."""
         return model.encode(texts_to_embedd, show_progress_bar=True, batch_size=192)
@@ -287,26 +280,26 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
         records_df = pd.read_csv(csv_upload)
         filename = os.path.splitext(os.path.basename(csv_upload))[0]
 
-        #
-
-        if
-
-
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Process dictionary-like strings in the DataFrame
+        for column in records_df.columns:
+            # Check if the column contains dictionary-like strings
+            if records_df[column].dtype == 'object':
+                try:
+                    # Use a sample value to check if it looks like a dictionary or list
+                    sample_value = records_df[column].dropna().iloc[0] if not records_df[column].dropna().empty else None
+                    # Add type checking before using startswith
+                    if isinstance(sample_value, str) and (sample_value.startswith('{') or sample_value.startswith('[')):
+                        # Try to convert strings to Python objects using ast.literal_eval
+                        records_df[column] = records_df[column].apply(
+                            lambda x: ast.literal_eval(x) if isinstance(x, str) and (
+                                (x.startswith('{') and x.endswith('}')) or
+                                (x.startswith('[') and x.endswith(']'))
+                            ) else x
+                        )
+                except (ValueError, SyntaxError, TypeError) as e:
+                    # If conversion fails, keep as string
+                    print(f"Could not convert column {column} to Python objects: {e}")
+
     elif file_extension == '.pkl':
         # Read the pickle file
         with open(csv_upload, 'rb') as f:
@@ -442,7 +435,6 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
 
     basedata_df['color'] = '#ced4d211'
 
-
    if not plot_time_checkbox:
        records_df['color'] = '#5e2784'
    else:
@@ -532,8 +524,7 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
     # Export relevant column
     export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
     export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
-    export_df['referenced_works'] = [
-
+    export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
     if locally_approximate_publication_date_checkbox and plot_time_checkbox:
         export_df['approximate_publication_year'] = local_years
     export_df.to_csv(csv_file_path, index=False)
@@ -790,8 +781,8 @@ with gr.Blocks(theme=theme, css="""
             gr.Markdown("### Upload Your Own Data")
             csv_upload = gr.File(
                 file_count="single",
-                label="Upload your own CSV file downloaded via pyalex.",
-                file_types=[".csv"],
+                label="Upload your own CSV or Pickle file downloaded via pyalex.",
+                file_types=[".csv", ".pkl"],
             )
 
 
@@ -835,10 +826,6 @@ with gr.Blocks(theme=theme, css="""
 
     3. Finally, the labels we're using for the regions of this plot are created from OpenAlex's own labels of sub-disciplines. They give a rough indication of the papers that could be expected in this broad area of the map, but they are not necessarily the perfect label for the articles that are precisely below them. They are just located at the median point of a usually much larger, much broader, and fuzzier category, so they should always be taken with quite a big grain of salt.
 
-    ## I want to use my own data!
-
-    Sure! You can upload csv-files produced by downloading things from OpenAlex using the pyalex package. You will need to provide at least the columns `id`, `title`, `publication_year`, `doi`, `abstract` or `abstract_inverted_index`, `referenced_works` and `primary_topic`.
-
     </div>
     """)
 
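The pickle branch is cut off in the diff above. For orientation, a rough sketch of what the two upload branches amount to; load_records is a hypothetical helper for illustration only, since in app.py this logic lives inline in predict():

import os
import pickle

import pandas as pd

def load_records(upload_path):
    # Dispatch on file extension, mirroring the if/elif in the diff
    ext = os.path.splitext(upload_path)[1].lower()
    if ext == '.csv':
        # The CSV path needs the literal_eval pass shown above
        # to restore stringified lists/dicts
        return pd.read_csv(upload_path)
    if ext == '.pkl':
        # A pickled DataFrame keeps lists/dicts as real Python objects,
        # so no string parsing is needed on this branch
        with open(upload_path, 'rb') as f:
            return pickle.load(f)
    raise ValueError(f'Unsupported file type: {ext}')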