medrax.org / data /figures.py
oldcai's picture
Upload folder using huggingface_hub
d7a7846 verified
import json
import os
from pathlib import Path
import requests
from tqdm import tqdm
def download_eurorad_figures(metadata_path: str, output_dir: str) -> None:
    """
    Download figures from Eurorad dataset and save them organized by case_id.

    The metadata is read as a JSON object whose values are cases. Each case
    is accessed through its "case_id" and "figures" keys, each figure through
    its "subfigures" key, and each subfigure through its "number" and "url"
    keys.

    Args:
        metadata_path: Path to the eurorad_metadata.json file
        output_dir: Base directory where figures will be saved. It is
            created, together with any missing parent directories, if it
            does not exist yet.

    The figures will be saved as:
        {output_dir}/{case_id}/{figure_number}.jpg

    where figure_number is the subfigure's "number" value with surrounding
    whitespace stripped, inner spaces replaced by underscores, and all
    letters lowercased.

    Example:
        figures/189/figure_1a.jpg
    """
    # Create the output directory (and any missing parents) if needed
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Load metadata; read as UTF-8 explicitly so the result does not depend
    # on the platform's default locale encoding
    with open(metadata_path, encoding="utf-8") as f:
        metadata = json.load(f)

    # Iterate through all cases with progress bar
    for case in tqdm(metadata.values(), desc="Downloading cases", unit="case"):
        # The directory is named after the case's own "case_id" field,
        # not after the key it is stored under in the metadata mapping
        case_dir = output_path / str(case["case_id"])
        case_dir.mkdir(exist_ok=True)

        # Process all figures and their subfigures
        for figure in case["figures"]:
            for subfig in figure["subfigures"]:
                # Normalize the label into a file name: strip surrounding
                # whitespace, replace spaces with underscores, lowercase
                label = subfig["number"].strip().replace(" ", "_").lower()
                save_figure(
                    url=subfig["url"],
                    output_path=case_dir / f"{label}.jpg",
                )
def save_figure(url: str, output_path: Path) -> None:
    """
    Download and save a single figure.

    If output_path already exists nothing is downloaded. Otherwise the
    response body is written to a temporary sibling file and then renamed to
    output_path, so a failed or interrupted write never leaves a truncated
    file that a later run would mistake for a finished download.

    Any error raised while downloading or saving is reported with print()
    and not re-raised, so one bad figure does not stop a bulk download.

    Args:
        url: URL of the figure to download
        output_path: Path where the figure should be saved
    """
    if output_path.exists():
        return

    # Temporary sibling in the same directory, so the final rename stays on
    # one filesystem
    partial_path = output_path.with_name(output_path.name + ".part")
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        output_path.parent.mkdir(parents=True, exist_ok=True)
        partial_path.write_bytes(response.content)
        # Only a completely written file ever appears under the final name
        partial_path.replace(output_path)
    except Exception as e:
        # Deliberately broad: this is the best-effort boundary for one figure
        print(f"Error downloading {url}: {e}")
        # Remove a leftover partial file; it may not exist if the failure
        # happened before anything was written
        try:
            partial_path.unlink()
        except OSError:
            pass
if __name__ == "__main__":
    # Both the metadata file and the figures directory are located next to
    # this script, independent of the current working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    metadata_file = os.path.join(script_dir, "eurorad_metadata.json")
    figures_dir = os.path.join(script_dir, "figures")
    download_eurorad_figures(metadata_path=metadata_file, output_dir=figures_dir)