Spaces:
Paused
Paused
import requests | |
import random | |
from functools import lru_cache | |
class GROBID_OFFLINE_EXCEPTION(Exception):
    """Raised to signal that the GROBID service cannot be used."""
def get_avail_grobid_url():
    """Pick one configured GROBID server at random and return it if it is alive.

    Reads ``GROBID_URLS`` from the project configuration, chooses one entry
    at random, strips trailing slashes, and probes ``<url>/api/isalive``.

    Returns:
        The chosen base URL (without trailing slashes) when the probe's
        response body is exactly ``'true'``. ``None`` when no URLs are
        configured, the server answers anything else, or the probe fails
        or times out.
    """
    from toolbox import get_conf

    GROBID_URLS, = get_conf('GROBID_URLS')
    if not GROBID_URLS:
        return None

    # Bound the health probe so an unresponsive server cannot block the
    # caller forever (requests waits indefinitely when no timeout is given).
    probe_timeout = 10  # seconds

    try:
        # Random choice acts as a simple load balancer across servers.
        base_url = random.choice(GROBID_URLS).rstrip('/')
        res = requests.get(base_url + '/api/isalive', timeout=probe_timeout)
        if res.text == 'true':
            return base_url
        return None
    except Exception:
        # Best-effort probe: any failure means "no server available".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return None
def parse_pdf(pdf_path, grobid_url):
    """Parse a PDF through a GROBID server using ``scipdf``.

    Args:
        pdf_path: Location of the PDF, passed unchanged to
            ``scipdf.parse_pdf_to_dict``.
        grobid_url: Base URL of the GROBID service; trailing slashes are
            stripped before use.

    Returns:
        Whatever ``scipdf.parse_pdf_to_dict`` returns (presumably a dict
        describing the article, judging by the function name; verify
        against the scipdf documentation).

    Raises:
        GROBID_OFFLINE_EXCEPTION: The GROBID service could not be reached.
        RuntimeError: Parsing failed for any other reason.
    """
    import scipdf  # pip install scipdf_parser

    grobid_url = grobid_url.rstrip('/')
    try:
        article_dict = scipdf.parse_pdf_to_dict(pdf_path, grobid_url=grobid_url)
    except (
        GROBID_OFFLINE_EXCEPTION,
        # NOTE(review): assumes scipdf talks to GROBID via `requests`, so an
        # unreachable server surfaces as one of these; confirm against the
        # installed scipdf version. Without this mapping a network failure
        # was reported as a corrupted PDF.
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
    ) as err:
        raise GROBID_OFFLINE_EXCEPTION("GROBID服务不可用,请修改config中的GROBID_URL,可修改成本地GROBID服务。") from err
    except Exception as err:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit are
        # not converted into RuntimeError; the cause is chained for debugging.
        raise RuntimeError("解析PDF失败,请检查PDF是否损坏。") from err
    return article_dict