Spaces:
Sleeping
Sleeping
from fastapi import FastAPI | |
from fastapi.middleware.cors import CORSMiddleware | |
from fastapi.responses import FileResponse | |
from fastapi.staticfiles import StaticFiles | |
from contextlib import asynccontextmanager | |
import xml.etree.ElementTree as xmlparser | |
import requests | |
from pydantic import BaseModel | |
import sys | |
import random | |
import fitz | |
import re,os | |
from io import BytesIO | |
from datetime import datetime | |
def receive_signal(signalNumber, frame):
    """Signal handler: report the received signal on stdout, then exit.

    Args:
        signalNumber: Number of the signal that was delivered.
        frame: Stack frame that was active when the signal arrived (unused).

    Raises:
        SystemExit: Always, with no exit status, to terminate the process.
    """
    print("Received:", signalNumber)
    raise SystemExit
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: install the SIGINT handler at startup.

    The function is wrapped with ``asynccontextmanager`` because the
    ``lifespan`` argument of ``FastAPI`` is documented as an async context
    manager; a bare async generator is not one.

    Args:
        app: The application this lifespan is attached to (unused).

    Yields:
        None. Code before the ``yield`` runs at startup; nothing runs at
        shutdown.
    """
    import signal

    # Route Ctrl+C (SIGINT) to receive_signal, which prints and exits.
    signal.signal(signal.SIGINT, receive_signal)
    yield
# Application instance; startup behaviour is delegated to ``lifespan``.
app = FastAPI(lifespan=lifespan)

# Serve the local "static" directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")

# CORS policy: every origin, method and header is accepted.
# NOTE(review): FastAPI's CORS documentation advises against combining
# wildcard origins with allow_credentials=True - confirm this is intended.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# NOTE(review): no route decorator is visible on this handler in this view of
# the file - confirm how it is registered on ``app``.
async def root():
    """Return the front-end entry page, ``templates/index.html``, as a file response."""
    index_page = os.path.join("templates", "index.html")
    return FileResponse(index_page)
class Query(BaseModel):
    """Search request consumed by the arXiv search handlers."""

    # Free-text term; sent to arXiv as ``all:<keyword>``.
    keyword: str
    # Forwarded to arXiv as ``max_results``.
    limit: int
class DocumentID(BaseModel):
    """Identifies a single arXiv document to download and extract."""

    # Appended to ``http://arxiv.org/pdf/`` to build the download URL.
    doc_id: str
class WebPDF(BaseModel):
    """Points at a PDF to download from an arbitrary web location."""

    # Address the PDF is fetched from.
    url: str
async def get_articles(query: Query):
    """Search the arXiv API and summarise every matching publication.

    Args:
        query: Search request. ``keyword`` is searched in all arXiv fields and
            ``limit`` is sent as the maximum number of results.

    Returns:
        On success ``{"error": False, "message": content}`` where ``content``
        maps each arXiv id to a dict with the keys ``title``, ``authors``
        (names joined with " and "), ``date`` (DD/MM/YYYY), ``abstract`` and
        ``pdf`` (download URL). On any failure
        ``{"error": True, "message": "<exception text>"}``.
    """
    XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
    content = {}
    try:
        # Let requests build the query string so reserved characters in the
        # keyword ("&", "#", ...) are percent-encoded instead of corrupting the
        # URL or injecting extra parameters. The timeout keeps a stalled
        # connection from hanging the request forever.
        # NOTE(review): verify=False disables TLS certificate checks if the
        # request is redirected to HTTPS - confirm this is still required.
        arxiv_search_result = requests.get(
            "http://export.arxiv.org/api/query",
            params={
                "search_query": f"all:{query.keyword}",
                "max_results": query.limit,
            },
            verify=False,
            timeout=30,
        )
        # Surface HTTP failures explicitly rather than as an XML parse error.
        arxiv_search_result.raise_for_status()

        response = xmlparser.fromstring(arxiv_search_result.text)
        for pub in response.findall(f"{XML_NAMESPACE}entry"):
            # Entry ids look like "http://arxiv.org/abs/<id>". Old-style ids
            # contain a slash ("hep-th/9901001v1"), so keep everything after
            # "/abs/" instead of only the last path segment.
            id_url = pub.find(f"{XML_NAMESPACE}id").text
            _, abs_marker, id_tail = id_url.partition("/abs/")
            id_pub = id_tail if abs_marker else id_url.split("/")[-1]

            title_pub = pub.find(f"{XML_NAMESPACE}title").text
            authors = " and ".join(
                author.find(f"{XML_NAMESPACE}name").text
                for author in pub.findall(f"{XML_NAMESPACE}author")
            )
            published = pub.find(f"{XML_NAMESPACE}published").text
            pub_date = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ").strftime("%d/%m/%Y")
            abstract = pub.find(f"{XML_NAMESPACE}summary").text

            content[id_pub] = {
                "title": title_pub,
                "authors": authors,
                "date": pub_date,
                "abstract": abstract,
                "pdf": f"http://arxiv.org/pdf/{id_pub}",
            }
        return {"error": False, "message": content}
    except Exception as e:
        # Boundary handler: every failure (network, HTTP status, malformed
        # feed) is reported to the caller as an error payload.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}
async def extract_arxiv_pdf(document: DocumentID):
    """Download an arXiv PDF and return its cleaned text and section titles.

    The text is cut at the first case-insensitive occurrence of "REFERENCES",
    stripped of bracketed/parenthesised spans and of punctuation, and has its
    whitespace collapsed. Titles come from the PDF outline (levels 1 and 2);
    when the PDF has no outline they are guessed with a regular expression
    that looks for numbered upper-case headings at the end of a line.

    Args:
        document: Carries the arXiv id (``doc_id``) of the paper to fetch.

    Returns:
        On success a dict with ``pub_id``, ``titles`` (a list of
        ``(level, title)`` pairs, level ``-1`` for regex-guessed titles, or the
        string "No titles found !"), ``text`` and ``error: False``.
        When the download does not answer HTTP 200:
        ``{"error": True, "message": "Error while downloading PDF: HTTP/<code>"}``.
    """

    def remove_in_betweens(text):
        """Replace [...] and (...) spans (non-greedy, single line) with a space."""
        removed_brackets = re.sub(r'\[.*?\]', ' ', text)
        return re.sub(r'\(.*?\)', ' ', removed_brackets)

    def remove_punctuations(text):
        """Delete the punctuation characters listed in the pattern."""
        return re.sub(r"[\,\;\:\?\!\'\β\"\(\)\{\}\[\]\/\\\*]", '', text)

    def collapse_whitespace(text):
        """Squash every whitespace run to one space and trim both ends."""
        return re.sub(r"\s+", " ", text).strip()

    pdf_url = f"http://arxiv.org/pdf/{document.doc_id}"
    # The timeout keeps a stalled download from hanging the request forever.
    # NOTE(review): verify=False disables TLS certificate checks if the
    # request is redirected to HTTPS - confirm this is still required.
    pdf_req = requests.get(pdf_url, verify=False, timeout=30)
    if pdf_req.status_code != 200:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    # Read everything needed while the document is open, then release it.
    with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        titles = doc.get_toc()

    # Drop the bibliography: keep only what precedes the first "references".
    ref_match = re.search(r"REFERENCES", pdf_text, re.IGNORECASE)
    if ref_match:
        pdf_text = pdf_text[:ref_match.start()]

    # Line breaks are still present in cleaned_text; the title fallback below
    # needs them, so whitespace is collapsed only for the returned text.
    cleaned_text = remove_punctuations(remove_in_betweens(pdf_text))
    postprocess_text = collapse_whitespace(cleaned_text)

    if titles:
        # Outline entries start with [level, title, ...]; keep the top two levels.
        main_titles = [(entry[0], entry[1]) for entry in titles if entry[0] in (1, 2)]
    else:
        # The pattern anchors on end-of-line, so it must run on text that
        # still has its newlines; on the collapsed text it could only ever
        # match at the very end of the document.
        regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
        found = re.findall(regex_titles, cleaned_text, flags=re.MULTILINE)
        main_titles = [(-1, t) for t in found]

    if not main_titles:
        return {"pub_id": document.doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}

    clean_titles = [
        (level, collapse_whitespace(remove_punctuations(remove_in_betweens(title))))
        for level, title in main_titles
    ]
    return {"pub_id": document.doc_id, "titles": clean_titles, "text": postprocess_text, "error": False}
async def extract_random_arxiv_pdf(query: Query):
    """Run an arXiv search and extract one randomly chosen result.

    Args:
        query: Search request forwarded to ``get_articles``.

    Returns:
        The payload of ``extract_arxiv_pdf`` for the chosen publication, or an
        ``{"error": True, "message": ...}`` dict when the search failed or
        matched nothing.
    """
    pubs = await get_articles(query)
    if pubs["error"]:
        # On failure "message" is an error string, not a dict of results.
        return pubs
    if not pubs["message"]:
        # random.choice raises IndexError on an empty sequence.
        return {"error": True, "message": "No publication found for this query"}

    random_id = random.choice(list(pubs["message"]))
    # extract_arxiv_pdf reads ``document.doc_id``, so wrap the bare id string.
    return await extract_arxiv_pdf(DocumentID(doc_id=random_id))
async def extract_pdf(url: WebPDF):
    """Download a PDF from a web address and return its title and raw text.

    Args:
        url: Request model whose ``url`` field is the address to fetch.

    Returns:
        On success ``{"error": False, "title": <metadata title or "">,
        "text": <text of all pages joined with spaces>}``.
        When the download does not answer HTTP 200:
        ``{"error": True, "message": "Error while downloading PDF: HTTP/<code>"}``.
    """
    # NOTE(review): the address comes straight from the request body, so this
    # handler will fetch any URL the caller names - consider restricting the
    # allowed hosts/schemes.
    target = url.url  # the model wraps the address; requests needs the string
    pdf_req = requests.get(target, timeout=30)
    if pdf_req.status_code != 200:
        print("URL: " + target)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    # Read everything needed while the document is open, then release it.
    with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        pdf_metadata = doc.metadata or {}

    # The title entry may be missing or None; fall back to an empty string.
    title = (pdf_metadata.get("title") or "").strip()
    return {"error": False, "title": title, "text": pdf_text}