File size: 4,897 Bytes
bdafe83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53709ed
bdafe83
 
 
 
 
 
53709ed
bdafe83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53709ed
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
Download all papers from one year of ICLR conference using OpenReview API.

This script downloads all paper PDFs and their corresponding metadata
from the ICLR 2023 conference using the OpenReview API.

Alternative methods to download can be found in this
[colab notebook](https://colab.research.google.com/drive/1vXXNxn8lnO3j1dgoidjybbKIN0DW0Bt2),
though it's not used here.
"""

import glob
import json
import os
import time
import requests

from agentreview.arguments import parse_args

try:
    import openreview
except ImportError:
    raise ImportError("Please install openreview package using `pip install openreview-py`")

def download_papers(args):
    """Download all paper PDFs and metadata for one ICLR year via the OpenReview API.

    Authenticates with the OpenReview API using the OPENREVIEW_USERNAME and
    OPENREVIEW_PASSWORD environment variables, then pages through all blind
    submissions for the conference year encoded in ``args.conference``
    (e.g. ``"ICLR2023"``), downloading each paper's PDF and saving its
    metadata (which contains the reviews, rebuttals, and decisions) as JSON.
    Papers whose PDF already exists on disk are skipped, so the script is
    safe to re-run after an interruption.

    Args:
        args: Parsed command-line arguments; must provide ``data_dir`` (output
            root) and ``conference`` (a string containing "ICLR" followed by
            the year).

    Raises:
        AssertionError: If the OPENREVIEW_USERNAME or OPENREVIEW_PASSWORD
            environment variables are not set, or if ``args.conference`` is
            not an ICLR conference.
    """

    openreview_username = os.environ.get("OPENREVIEW_USERNAME")
    openreview_password = os.environ.get("OPENREVIEW_PASSWORD")

    assert openreview_username is not None, (
        "Please set your OpenReview username through the OPENREVIEW_USERNAME environment variable."
    )
    assert openreview_password is not None, (
        "Please set your OpenReview password through the OPENREVIEW_PASSWORD environment variable."
    )

    client = openreview.Client(
        baseurl='https://api.openreview.net',
        username=openreview_username,
        password=openreview_password
    )

    page_size = 1000
    offset = 0
    papers_directory = os.path.join(args.data_dir, args.conference, "paper")
    notes_directory = os.path.join(args.data_dir, args.conference, "notes")

    assert "ICLR" in args.conference, "Only works for ICLR conferences!"
    year = int(args.conference.split("ICLR")[-1])  # Only works for ICLR currently

    # Create directories if they don't exist
    for path in [papers_directory, notes_directory]:
        os.makedirs(path, exist_ok=True)

    # Collect IDs of already-downloaded papers once (not once per page) so
    # reruns skip them. Non-numeric basenames are ignored instead of letting
    # int() crash on a stray file in the directory.
    existing_paper_ids = set()
    for paper in glob.glob(f"{papers_directory}/*.pdf"):
        stem = os.path.basename(paper).split(".pdf")[0]
        if stem.isdigit():
            existing_paper_ids.add(int(stem))

    while True:
        # Fetch submissions with pagination
        notes = client.get_notes(
            invitation=f'ICLR.cc/{year}/Conference/-/Blind_Submission',
            details='all',
            offset=offset,
            limit=page_size
        )

        if not notes:
            break  # Exit if no more notes are available

        for note in notes:
            paper_id = note.number
            paper_path = os.path.join(papers_directory, f"{paper_id}.pdf")
            note_path = os.path.join(notes_directory, f"{paper_id}.json")

            # Skip existing papers
            if paper_id in existing_paper_ids:
                print(f"Paper {paper_id} already downloaded.")
                continue

            print(f"Title: {note.content.get('title', 'N/A')}")
            print(f"Abstract: {note.content.get('abstract', 'N/A')}")
            print(f"TL;DR: {note.content.get('TL;DR', 'N/A')}")
            pdf_link = f"https://openreview.net/pdf?id={note.id}"
            print(f"PDF Link: {pdf_link}")

            # Attempt to download the paper PDF, retrying up to 10 times.
            for tries in range(10):
                try:
                    # Timeout prevents one stalled request from hanging the whole run.
                    response = requests.get(pdf_link, timeout=60)

                    if response.status_code == 200:
                        with open(paper_path, "wb") as pdf_file:
                            pdf_file.write(response.content)

                        print(f"PDF downloaded successfully as {paper_path}")

                        # Save metadata as JSON, which contains the reviews, rebuttals, and decisions.
                        with open(note_path, "w") as note_file:
                            json.dump(note.to_json(), note_file, indent=2)

                        # Record the new download so later pages never re-fetch it.
                        existing_paper_ids.add(paper_id)
                        break

                    print(f"Attempt {tries} failed. Status code: {response.status_code}")
                    if response.status_code == 429:  # Too many requests
                        print("Too many requests. Sleeping for 10 seconds.")
                        time.sleep(10)

                except Exception as e:
                    print(f"Attempt {tries} failed with error: {e}")
            else:
                # for/else: every retry was exhausted without a successful download.
                print(f"Failed to download paper {paper_id} after 10 attempts.")

        offset += page_size


if __name__ == "__main__":
    args = parse_args()
    download_papers(args)