File size: 2,242 Bytes
d7a7846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
import os
from pathlib import Path
import requests
from tqdm import tqdm


def download_eurorad_figures(metadata_path: str, output_dir: str) -> None:
    """
    Download every subfigure listed in the Eurorad metadata, grouped by case.

    Args:
        metadata_path: Path to the eurorad_metadata.json file. The file is read
            as UTF-8 JSON and must be a mapping whose values are case records
            with a "case_id" and a "figures" list; each figure has a
            "subfigures" list whose entries carry "number" and "url".
        output_dir: Base directory where figures will be saved. It is created
            (including missing parent directories) if it doesn't exist.

    The figures will be saved as:
        {output_dir}/{case_id}/{normalized_number}.jpg
    where normalized_number is the subfigure's "number" stripped of
    surrounding whitespace, with spaces replaced by underscores, lowercased.
    Example (for a subfigure numbered "Figure 1a" in case 189):
        figures/189/figure_1a.jpg
    """
    # parents=True so a nested output_dir works even when its parent is missing
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Read as UTF-8 explicitly rather than relying on the platform's default encoding
    with open(metadata_path, encoding="utf-8") as f:
        metadata = json.load(f)

    # Iterate through all cases with progress bar
    for case in tqdm(metadata.values(), desc="Downloading cases", unit="case"):
        # The directory is named after the record's own case_id field
        case_dir = output_path / str(case["case_id"])
        case_dir.mkdir(exist_ok=True)

        # Process all figures and their subfigures
        for figure in case["figures"]:
            for subfig in figure["subfigures"]:
                # Normalize the label into a file name:
                # trim whitespace, spaces -> underscores, lowercase
                normalized = subfig["number"].strip().replace(" ", "_").lower()
                subfig_path = case_dir / f"{normalized}.jpg"

                save_figure(
                    url=subfig["url"],
                    output_path=subfig_path,
                )


def save_figure(url: str, output_path: Path) -> None:
    """
    Download and save a single figure.

    The download is skipped when output_path already exists. The response body
    is first written to a sibling "<name>.part" file and only renamed to
    output_path once the write has completed, so a failed or interrupted
    download never leaves a truncated file that a later run would mistake for
    a finished one.

    Any error (HTTP, network, or file system) is reported on stdout and
    swallowed so that one bad figure does not abort a bulk download.

    Args:
        url: URL of the figure to download
        output_path: Path where the figure should be saved
    """
    if output_path.exists():
        return

    # Temporary file lives in the same directory so the final rename stays on one file system
    tmp_path = output_path.with_name(output_path.name + ".part")

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        with open(tmp_path, "wb") as f:
            f.write(response.content)
        # Publish the finished file under its real name in one step
        tmp_path.replace(output_path)
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        # Best-effort cleanup of the partial file; the download error was already reported
        try:
            if tmp_path.exists():
                tmp_path.unlink()
        except OSError:
            pass


if __name__ == "__main__":
    # Resolve the metadata file and the figures folder relative to this script's location.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    metadata_file = os.path.join(script_dir, "eurorad_metadata.json")
    figures_dir = os.path.join(script_dir, "figures")
    download_eurorad_figures(metadata_path=metadata_file, output_dir=figures_dir)