import logging
import pathlib
import random
import re
import time
import requests
from tqdm import trange
from src.engine import SearchAPI
from src.interfaces import Paper
from src.utils import dump_json, load_json
logger = logging.getLogger("uvicorn.default")
class DblpPaperList(SearchAPI):
    """DBLP paper list.

    Downloads publication metadata from the DBLP search API (paginated),
    caches the raw JSON hits to ``cache_filepath``, then converts every
    hit into a ``Paper`` appended to ``self.papers``.

    Inputs:
        cache_filepath: Filepath to save cached file
        use_cache: will use cached file if `True`, otherwise download again
        query: Query string, basically the title
            you wanna search in a search box.
            Special logical grammars refer to the reference.
        max_results: Maximal returned papers
        request_time_inteval: Base seconds to sleep between DBLP API calls
            (jittered by a random factor in [0.5, 1.5)).

    References:
        https://dblp.org/faq/How+to+use+the+dblp+search+API.html
    """

    API_URL = "https://dblp.org/search/publ/api"

    def __init__(
        self,
        cache_filepath: pathlib.Path,
        use_cache: bool = False,
        query: str = "",
        max_results: int = 5000,
        request_time_inteval: float = 3,
    ) -> None:
        super().__init__()
        if isinstance(cache_filepath, str):
            cache_filepath = pathlib.Path(cache_filepath)
        if (not cache_filepath.exists()) or (not use_cache):
            # Normalize the query for the URL: tighten "a | b" to "a|b"
            # (DBLP's OR operator), then turn remaining whitespace into "+".
            query = query.strip()
            query = re.sub(r"\s+?\|\s+?", "|", query)
            query = re.sub(r"\s+", "+", query)
            searched_results = []
            # DBLP caps one page at 1000 hits, so paginate via f (offset)
            # and h (page size); c=0 disables completion suggestions.
            h = 1000
            for f in trange(0, max_results, h, desc="DBLP Downloading"):
                url = f"{self.API_URL}?q={query}&format=json&c=0&f={f}&h={h}"
                try:
                    # Fix: pass a timeout so a stalled connection cannot
                    # hang the download loop indefinitely.
                    response = requests.get(url, timeout=30)
                    response.raise_for_status()
                    page = response.json()
                    page_data = page["result"]["hits"]["hit"]
                    if page_data:
                        searched_results.extend(page_data)
                    else:
                        break
                except KeyboardInterrupt:
                    # Bare raise preserves the original traceback.
                    raise
                except Exception as err:
                    # A missing "hit" key (past the last page) also lands
                    # here; treat any failure as end-of-results.
                    logger.info(err)
                    break
                # Randomized politeness delay between consecutive requests.
                time.sleep((random.random() + 0.5) * request_time_inteval)
            dump_json(searched_results, cache_filepath)
        data = load_json(cache_filepath)
        for d in data:
            # dblp does not provide abstract and month data
            authors = []
            if "authors" in d["info"]:
                # DBLP returns a dict for a single author, a list otherwise.
                if isinstance(d["info"]["authors"]["author"], dict):
                    authors.append(d["info"]["authors"]["author"]["text"])
                else:
                    authors = [a["text"] for a in d["info"]["authors"]["author"]]
            venues = []
            if "venue" in d["info"]:
                # Likewise: a plain string for one venue, a list otherwise.
                if isinstance(d["info"]["venue"], str):
                    venues.append(d["info"]["venue"])
                else:
                    venues.extend(d["info"]["venue"])
            paper = Paper(
                d["info"]["title"],
                " , ".join(authors),
                "",  # abstract: not provided by DBLP
                d["info"].get("ee", d["info"].get("url", "")),
                d["info"].get("doi", ""),
                " , ".join(venues),
                d["info"].get("year", "9999"),
                "99",  # month: not provided by DBLP
            )
            self.papers.append(paper)

    @classmethod
    def build_paper_list(
        cls, cache_filepath: str, query: dict, max_results: int = 1000
    ):
        """Construct an instance from a structured query dict.

        ``query`` may carry "title" and "abstract" entries, each an
        iterable of keyword groups; every group is space-joined into the
        flat DBLP query string.
        """
        title = query.get("title", [])
        abstract = query.get("abstract", [])
        cls_q = ""
        # NOTE(review): groups are concatenated with no separator between
        # them — confirm that is the intended query semantics.
        for t in title:
            cls_q += " ".join(t)
        for a in abstract:
            cls_q += " ".join(a)
        return cls(
            cache_filepath,
            use_cache=False,
            query=cls_q,
            max_results=max_results,
        )

    @classmethod
    def build_and_search(
        cls, cache_filepath: str, query: dict, max_results: int = 1000
    ) -> list[Paper]:
        """Download (fresh) and search in one step, truncated to max_results."""
        obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
        return obj.search(query)[:max_results]