# NOTE: "Spaces: Sleeping" -- page-status banner captured together with this
# file; it is not part of the program.
from fastapi import FastAPI | |
from fastapi.middleware.cors import CORSMiddleware | |
from contextlib import asynccontextmanager | |
import xml.etree.ElementTree as xmlparser | |
import requests | |
import sys | |
import random | |
import fitz | |
import re | |
from io import BytesIO | |
from datetime import datetime | |
def receive_signal(signalNumber, frame):
    """Print the received signal number, then terminate the process.

    The ``(signalNumber, frame)`` signature is the one ``signal.signal``
    handlers are called with; ``frame`` is accepted but never used.

    Raises:
        SystemExit: always, once the signal number has been printed.
    """
    print('Received:', signalNumber)
    # Same effect as ``sys.exit()`` with no argument: exit code is None.
    raise SystemExit
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context for the application: install the SIGINT handler.

    The code before ``yield`` runs at startup and registers
    ``receive_signal`` as the handler for ``SIGINT``. Nothing follows the
    ``yield``, so no work is done at shutdown.

    The function is wrapped with ``asynccontextmanager`` because the
    ``lifespan`` argument expects an async context manager; a bare async
    generator function is only accepted through a deprecated fallback.

    Args:
        app: The application instance this lifespan is attached to (unused).
    """
    import signal

    signal.signal(signal.SIGINT, receive_signal)
    yield
# Application instance; ``lifespan`` supplies the startup hook.
app = FastAPI(lifespan=lifespan)

# Every origin is allowed to call this API.
origins = [
    "*",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    # Credentialed requests (cookies, Authorization headers) must not be
    # combined with wildcard origins/methods/headers, so credentials stay
    # disabled while "*" is used. To enable them, list the origins, methods
    # and headers explicitly instead.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
async def root():
    """Return a fixed payload confirming that the API is up."""
    status = {"message": "API started successfully"}
    return status
def _parse_arxiv_entry(entry, ns):
    """Convert one Atom ``<entry>`` element into ``(publication_id, metadata)``."""
    raw_id = entry.find(f"{ns}id").text
    # Old-style arXiv identifiers contain a slash (e.g. "hep-th/9901001v1"),
    # so keep everything after "/abs/" instead of only the last path segment.
    if "/abs/" in raw_id:
        pub_id = raw_id.split("/abs/", 1)[1]
    else:
        pub_id = raw_id.split("/")[-1]

    authors = " and ".join(
        author.find(f"{ns}name").text for author in entry.findall(f"{ns}author")
    )
    published = datetime.strptime(
        entry.find(f"{ns}published").text, "%Y-%m-%dT%H:%M:%SZ"
    )
    return pub_id, {
        "title": entry.find(f"{ns}title").text,
        "authors": authors,
        "date": published.strftime("%d/%m/%Y"),
        "abstract": entry.find(f"{ns}summary").text,
    }


async def get_articles(keyword: str, limit: int):
    """Search arXiv for ``keyword`` and return metadata for up to ``limit`` entries.

    Args:
        keyword: Search term, sent to the arXiv API as ``all:<keyword>``.
        limit: Value passed as ``max_results``.

    Returns:
        ``{"error": False, "message": {pub_id: {...}}}`` on success, where
        each value holds ``title``, ``authors`` (names joined with " and "),
        ``date`` (``dd/mm/YYYY``) and ``abstract``.
        ``{"error": True, "message": <exception text>}`` if the request or
        the parsing fails; the error is also printed.
    """
    XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
    try:
        # NOTE(review): ``requests`` is blocking, so the event loop is held
        # for the duration of the call; the timeout bounds that wait.
        arxiv_search_result = requests.get(
            "https://export.arxiv.org/api/query",
            # ``params`` URL-encodes the keyword instead of splicing raw user
            # input into the query string.
            params={"search_query": f"all:{keyword}", "max_results": limit},
            timeout=30,
        )
        # Fail on HTTP errors instead of trying to parse an error page as Atom.
        arxiv_search_result.raise_for_status()
        # Parse the raw bytes so the XML parser honours the declared encoding.
        response = xmlparser.fromstring(arxiv_search_result.content)
        content = dict(
            _parse_arxiv_entry(pub, XML_NAMESPACE)
            for pub in response.findall(f"{XML_NAMESPACE}entry")
        )
        return {"error": False, "message": content}
    except Exception as e:
        # Deliberate catch-all: callers expect an error payload, not a raise.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}
def _clean_pdf_text(raw_text):
    """Drop the reference section, bracketed spans and punctuation from ``raw_text``."""
    # Everything from the first case-insensitive "references" onwards is cut.
    ref_match = re.search(r"REFERENCES", raw_text, re.IGNORECASE)
    if ref_match:
        raw_text = raw_text[:ref_match.start()]

    # Remove [...] then (...) spans, replacing each with a single space.
    text = re.sub(r'\[.*?\]', ' ', raw_text)
    text = re.sub(r'\(.*?\)', ' ', text)
    # Strip the remaining punctuation characters.
    text = re.sub(r"[\,\;\:\?\!\'\β\"\(\)\{\}\[\]\/\\\*\-]", '', text)
    # Collapse runs of spaces (newlines are kept for the title regex).
    return re.sub(r"\ +", " ", text)


def _collect_main_titles(toc, text):
    """Return level-1 TOC titles, or regex-detected headings when ``toc`` is empty."""
    if toc:
        return [entry[1] for entry in toc if entry[0] == 1]
    # Fallback: lines ending in "<roman or arabic number>. UPPERCASE HEADING".
    return re.findall(
        r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$", text, flags=re.MULTILINE
    )


async def extract_text_pdf(id_doc: str):
    """Download an arXiv PDF and return its cleaned text and main titles.

    Args:
        id_doc: arXiv identifier appended to the PDF URL.

    Returns:
        On success: ``{"pub_id", "titles", "text", "error": False}`` where
        ``titles`` is a list of main titles, or the string
        ``"No titles found !"`` when none were detected.
        On failure: ``{"error": True, "message": ...}`` describing either the
        network error or the non-200 HTTP status.
    """
    pdf_url = f"https://arxiv.org/pdf/{id_doc}"
    try:
        # NOTE(review): ``requests`` is blocking; the timeout bounds how long
        # the event loop can be held by this call.
        pdf_req = requests.get(pdf_url, timeout=60)
    except requests.RequestException as e:
        # Report network failures as an error payload, like get_articles does.
        print("ID: " + id_doc)
        print("URL: " + pdf_url)
        print(f"Error while downloading PDF: {e}")
        return {"error": True, "message": f"Error while downloading PDF: {e}"}

    if pdf_req.status_code != 200:
        print("ID: " + id_doc)
        print("URL: " + pdf_url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    doc = fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf")
    try:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        titles = doc.get_toc()
    finally:
        # Release the document once the text and TOC have been read.
        doc.close()

    postprocess_text = _clean_pdf_text(pdf_text)
    main_titles = _collect_main_titles(titles, postprocess_text)
    return {
        "pub_id": id_doc,
        "titles": main_titles if main_titles else "No titles found !",
        "text": postprocess_text,
        "error": False,
    }
async def extract_random_pdf(keyword: str, limit: int):
    """Search arXiv for ``keyword`` and extract the text of one random result.

    Args:
        keyword: Search term forwarded to ``get_articles``.
        limit: Maximum number of search results to choose from.

    Returns:
        The payload of ``extract_text_pdf`` for the chosen publication, or an
        ``{"error": True, "message": ...}`` payload when the search failed or
        returned no publication.
    """
    pubs = await get_articles(keyword, limit)
    if pubs["error"]:
        # On failure "message" is an error string, not a dict of publications.
        return pubs

    pub_ids = list(pubs["message"])
    if not pub_ids:
        # random.choice raises IndexError on an empty sequence.
        return {"error": True, "message": f"No publication found for keyword: {keyword}"}

    return await extract_text_pdf(random.choice(pub_ids))