# Spaces:
# Sleeping
# Sleeping
from fastapi import FastAPI | |
from fastapi.middleware.cors import CORSMiddleware | |
from contextlib import asynccontextmanager | |
import xml.etree.ElementTree as xmlparser | |
import requests | |
import sys | |
import random | |
import fitz | |
import re | |
from io import BytesIO | |
from datetime import datetime | |
def receive_signal(signalNumber, frame):
    """Signal handler: report the received signal number, then exit.

    Args:
        signalNumber: Number of the signal that was delivered.
        frame: Stack frame supplied to signal handlers (unused here).

    Raises:
        SystemExit: Always, through ``sys.exit()``.
    """
    print("Received:", signalNumber)
    sys.exit()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan hook: install the SIGINT handler before the app starts serving.

    Wrapped with ``asynccontextmanager`` (already imported at the top of the
    file) so that ``FastAPI(lifespan=...)`` receives an async context manager
    instead of a bare async generator function.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Yields:
        None. Code before the ``yield`` runs at startup; nothing runs after it.
    """
    # Local import, as in the original: ``signal`` is only needed here.
    import signal

    signal.signal(signal.SIGINT, receive_signal)
    yield
# Application instance; ``lifespan`` is registered as its startup/shutdown hook.
app = FastAPI(lifespan=lifespan)

# Origins allowed to issue cross-origin requests ("*" is the wildcard entry).
origins = ["*"]

# NOTE(review): FastAPI's CORS documentation states that wildcard
# origins/methods/headers should not be combined with allow_credentials=True.
# Behaviour is kept as-is here -- confirm whether credentials are needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
async def root():
    """Return a fixed payload signalling that the API is up.

    Returns:
        dict: ``{"message": "API started successfully"}``.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file -- confirm how it is registered with ``app``.
    payload = {"message": "API started successfully"}
    return payload
async def get_articles(keyword: str, limit: int):
    """Search arXiv for *keyword* and return metadata for the matching papers.

    Args:
        keyword: Search term, sent to the arXiv API as ``all:<keyword>``.
        limit: Value sent as ``max_results``.

    Returns:
        dict: ``{"error": False, "message": articles}`` on success, where
        ``articles`` maps the last path segment of each entry's Atom ``id``
        to a dict with ``title``, ``authors`` (names joined with " and "),
        ``date`` (``dd/mm/YYYY``) and ``abstract``.
        On any failure: ``{"error": True, "message": <exception text>}``.
    """
    XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
    content = {}
    try:
        # Pass the query through ``params`` so the keyword is URL-encoded;
        # interpolating it into the URL broke on spaces, "&", "#", etc.
        # NOTE(review): verify=False is kept from the original; it disables
        # TLS certificate checks if the request ends up on HTTPS -- confirm.
        # NOTE(review): requests.get is a blocking call inside a coroutine.
        arxiv_search_result = requests.get(
            "http://export.arxiv.org/api/query",
            params={"search_query": f"all:{keyword}", "max_results": limit},
            timeout=30,  # never hang forever on an unresponsive server
            verify=False,
        )
        # Fail early on HTTP errors instead of parsing an error body.
        arxiv_search_result.raise_for_status()

        response = xmlparser.fromstring(arxiv_search_result.text)
        for pub in response.findall(f"{XML_NAMESPACE}entry"):
            id_pub = pub.find(f"{XML_NAMESPACE}id").text.split("/")[-1]
            authors = " and ".join(
                author.find(f"{XML_NAMESPACE}name").text
                for author in pub.findall(f"{XML_NAMESPACE}author")
            )
            published = pub.find(f"{XML_NAMESPACE}published").text
            pub_date = datetime.strptime(
                published, "%Y-%m-%dT%H:%M:%SZ"
            ).strftime("%d/%m/%Y")
            content[id_pub] = {
                "title": pub.find(f"{XML_NAMESPACE}title").text,
                "authors": authors,
                "date": pub_date,
                "abstract": pub.find(f"{XML_NAMESPACE}summary").text,
            }
        return {"error": False, "message": content}
    except Exception as e:
        # Endpoint boundary: report every failure to the client as a payload.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}
def _extract_section_titles(pdf_text):
    """Return the numbered upper-case headings found in *pdf_text*.

    The text is cut at the first case-insensitive occurrence of "REFERENCES"
    (the whole text is used when there is none), bracketed/parenthesised
    spans and punctuation are blanked out, and lines matching a Roman- or
    Arabic-numbered upper-case heading are collected.
    """
    ref_match = re.search(r"REFERENCES", pdf_text, re.IGNORECASE)
    # Previously the no-match case left the working text unassigned.
    body_text = pdf_text[:ref_match.start()] if ref_match else pdf_text

    # Blank out [...] then (...) spans, then the remaining punctuation.
    body_text = re.sub(r'\[.*?\]', ' ', body_text)
    body_text = re.sub(r'\(.*?\)', ' ', body_text)
    body_text = re.sub(r"[\,\;\:\?\!\'\β\"\(\)\{\}\[\]\/\\\*\-]", ' ', body_text)

    regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
    return re.findall(regex_titles, body_text, flags=re.MULTILINE)


async def extract_text_pdf(id_doc: str):
    """Download an arXiv PDF and return its table of contents or headings.

    Args:
        id_doc: Identifier appended to ``http://arxiv.org/pdf/``.

    Returns:
        dict: ``{"error": False, "message": titles}`` where ``titles`` is the
        document's embedded table of contents (``doc.get_toc()``) when it is
        non-empty, otherwise the list of heading strings found by regex in
        the page text. When the download does not return HTTP 200:
        ``{"error": True, "message": <reason>}``.
    """
    # NOTE(review): verify=False is kept from the original -- confirm.
    pdf_req = requests.get(
        f"http://arxiv.org/pdf/{id_doc}",
        timeout=60,  # never hang forever on an unresponsive server
        verify=False,
    )
    if pdf_req.status_code != 200:
        return {
            "error": True,
            "message": f"Unable to download PDF (HTTP {pdf_req.status_code})",
        }

    # Context manager guarantees the document is closed on every path.
    with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
        toc = doc.get_toc()  # fetched once instead of twice
        if toc:
            return {"error": False, "message": toc}
        # Only pay for text extraction when there is no embedded TOC.
        pdf_text = " ".join(page.get_text("text") for page in doc)

    return {"error": False, "message": _extract_section_titles(pdf_text)}
async def extract_random_pdf(keyword: str, limit: int):
    """Pick a random article matching *keyword* and extract its headings.

    Args:
        keyword: Search term forwarded to ``get_articles``.
        limit: Maximum number of search results forwarded to ``get_articles``.

    Returns:
        dict: The result of ``extract_text_pdf`` for the chosen article, the
        error payload from ``get_articles`` if the search failed, or
        ``{"error": True, "message": ...}`` when the search returned nothing.
    """
    pubs = await get_articles(keyword, limit)
    if pubs["error"]:
        # Propagate the search failure unchanged.
        return pubs

    # Article ids are the keys of the "message" mapping, not of the wrapper
    # dict; random.choice also needs a sequence, not a dict view.
    article_ids = list(pubs["message"])
    if not article_ids:
        return {
            "error": True,
            "message": f"No article found for keyword '{keyword}'",
        }
    return await extract_text_pdf(random.choice(article_ids))