File size: 4,222 Bytes
7b40c73
348017a
 
 
 
 
 
 
 
 
 
 
 
0841c28
 
 
348017a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0841c28
 
348017a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0841c28
 
348017a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0841c28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import logging
import pathlib
import random
import re
import time

import requests
from tqdm import trange

from src.engine import SearchAPI
from src.interfaces import Paper
from src.utils import dump_json, load_json

logger = logging.getLogger("uvicorn.default")


class DblpPaperList(SearchAPI):
    """DBLP paper list

    Downloads paged search results from the DBLP publication API and
    converts every returned hit into a `Paper` appended to `self.papers`.

    Inputs:
        cache_filepath: Filepath to save cached file
        use_cache: will use cached file if `True`, otherwise download again
        query: Query string, basically the title
            you wanna search in a search box.
            Special logical grammars refer to the reference.
        max_results: Maximal returned papers
        request_time_inteval: Seconds to sleep between DBLP API calls.
            NOTE: the parameter name keeps its historical typo
            ("inteval") for backward compatibility with existing callers.

    References:
        https://dblp.org/faq/How+to+use+the+dblp+search+API.html
    """

    API_URL = "https://dblp.org/search/publ/api"
    # DBLP caps each request at 1000 records (`h` parameter).
    PAGE_SIZE = 1000
    # Seconds before a single HTTP request is aborted; without this a
    # stalled connection would hang the download loop indefinitely.
    REQUEST_TIMEOUT = 30.0

    def __init__(
        self,
        cache_filepath: pathlib.Path,
        use_cache: bool = False,
        query: str = "",
        max_results: int = 5000,
        request_time_inteval: float = 3,
    ) -> None:
        super().__init__()

        if isinstance(cache_filepath, str):
            cache_filepath = pathlib.Path(cache_filepath)
        if (not cache_filepath.exists()) or (not use_cache):
            searched_results = self._download(
                query, max_results, request_time_inteval
            )
            dump_json(searched_results, cache_filepath)

        for d in load_json(cache_filepath):
            self.papers.append(self._hit_to_paper(d))

    @classmethod
    def _download(
        cls, query: str, max_results: int, request_time_inteval: float
    ) -> list:
        """Fetch up to `max_results` raw hit dicts from the DBLP API."""
        query = query.strip()
        # Collapse " | " into "|" (OR operator), then encode remaining
        # whitespace as "+" (AND operator) for the URL query string.
        query = re.sub(r"\s+?\|\s+?", "|", query)
        query = re.sub(r"\s+", "+", query)

        searched_results: list = []
        h = cls.PAGE_SIZE
        for f in trange(0, max_results, h, desc="DBLP Downloading"):
            url = f"{cls.API_URL}?q={query}&format=json&c=0&f={f}&h={h}"
            try:
                response = requests.get(url, timeout=cls.REQUEST_TIMEOUT)
                response.raise_for_status()
                page = response.json()
                # "hit" is absent once the result set is exhausted; stop
                # paging cleanly instead of relying on a swallowed KeyError.
                page_data = page["result"]["hits"].get("hit", [])
                if not page_data:
                    break
                searched_results.extend(page_data)
            except KeyboardInterrupt:
                # Bare `raise` preserves the original traceback.
                raise
            except Exception as err:
                # Best-effort download: log and keep whatever was fetched.
                logger.info(err)
                break
            # Jittered sleep (0.5x-1.5x interval) to stay polite to the API.
            time.sleep((random.random() + 0.5) * request_time_inteval)
        return searched_results

    @staticmethod
    def _hit_to_paper(d: dict) -> Paper:
        """Convert one raw DBLP hit dict into a `Paper`.

        DBLP does not provide abstract and month data, so those fields
        are filled with "" and the sentinel "99" respectively; a missing
        year becomes the sentinel "9999".
        """
        info = d["info"]

        authors: list = []
        if "authors" in info:
            author = info["authors"]["author"]
            # A single author arrives as a dict, multiple as a list of dicts.
            if isinstance(author, dict):
                authors.append(author["text"])
            else:
                authors = [a["text"] for a in author]

        venues: list = []
        if "venue" in info:
            venue = info["venue"]
            # Venue is either a single string or a list of strings.
            if isinstance(venue, str):
                venues.append(venue)
            else:
                venues.extend(venue)

        return Paper(
            info["title"],
            " , ".join(authors),
            "",  # abstract is unavailable from DBLP
            info.get("ee", info.get("url", "")),
            info.get("doi", ""),
            " , ".join(venues),
            info.get("year", "9999"),
            "99",  # month is unavailable from DBLP
        )

    @classmethod
    def build_paper_list(
        cls, cache_filepath: str, query: dict, max_results: int = 1000
    ):
        """Build a `DblpPaperList` from a structured query dict.

        `query["title"]` / `query["abstract"]` are lists of keyword
        groups, each group itself an iterable of words.
        """
        title = query.get("title", [])
        abstract = query.get("abstract", [])

        # BUGFIX: groups used to be concatenated with no separator,
        # fusing the last word of one group with the first word of the
        # next. Join the words of each group, then join groups with
        # spaces (which later become "+" AND operators in the URL).
        parts = [" ".join(t) for t in title] + [" ".join(a) for a in abstract]
        cls_q = " ".join(parts)
        return cls(
            cache_filepath,
            use_cache=False,
            query=cls_q,
            max_results=max_results,
        )

    @classmethod
    def build_and_search(
        cls, cache_filepath: str, query: dict, max_results: int = 1000
    ) -> list[Paper]:
        """Download (ignoring any cache) then run `search` over the result."""
        obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
        return obj.search(query)[:max_results]