arXiv / app.py
Omar ID EL MOUMEN
Fix attempt #1
d588a4d
raw
history blame
5.56 kB
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from contextlib import asynccontextmanager
import xml.etree.ElementTree as xmlparser
import requests
from pydantic import BaseModel
import sys
import random
import fitz
import re,os
from io import BytesIO
from datetime import datetime
def receive_signal(signalNumber, frame):
    """Signal handler: report the signal number on stdout, then exit.

    Args:
        signalNumber: Number of the signal that was delivered.
        frame: Stack frame active when the signal arrived (unused).

    Raises:
        SystemExit: Always, with no exit code, to stop the process.
    """
    print('Received:', signalNumber)
    # Raising SystemExit directly is what sys.exit() does under the hood.
    raise SystemExit
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: install the SIGINT handler, then hand over.

    Everything before the ``yield`` runs when the context is entered; there
    is no teardown work after it.
    """
    # Imported here rather than at module level, as in the original code.
    import signal as signal_module

    sigint = signal_module.SIGINT
    signal_module.signal(sigint, receive_signal)
    yield
# Application object; `lifespan` registers the SIGINT handler on startup.
app = FastAPI(lifespan=lifespan)

# Files under ./static are served beneath the /static URL prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Every origin is accepted.
origins = ["*"]

# NOTE(review): a wildcard origin together with allow_credentials=True is a
# combination the FastAPI CORS documentation advises against -- confirm
# whether credentialed cross-origin requests are actually required.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.get("/")
async def root():
    """Serve ``templates/index.html`` as the landing page."""
    index_page = os.path.join("templates", "index.html")
    return FileResponse(index_page)
class Query(BaseModel):
    """Request body for an arXiv search."""
    # Search term; get_articles sends it to arXiv as the `all:` search query.
    keyword: str
    # Maximum number of results; forwarded to arXiv as `max_results`.
    limit: int
class DocumentID(BaseModel):
    """Request body identifying one arXiv document."""
    # Identifier appended to http://arxiv.org/pdf/ by extract_arxiv_pdf.
    doc_id: str
class WebPDF(BaseModel):
    """Request body for the /extract_pdf/url endpoint."""
    # Address of the PDF that endpoint is meant to download.
    url: str
@app.post("/search")
async def get_articles(query: Query):
    """Search arXiv for ``query.keyword`` and return metadata for each hit.

    Args:
        query: Search keyword and maximum number of results.

    Returns:
        ``{"error": False, "message": {arxiv_id: info}}`` on success, where
        each ``info`` holds ``title``, ``authors`` (names joined with
        " and "), ``date`` (``dd/mm/YYYY``), ``abstract`` and ``pdf`` (the
        download URL). On any failure (network, HTTP status, malformed
        feed) returns ``{"error": True, "message": <error text>}``.
    """
    XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
    content = {}
    try:
        # Let requests build the query string so characters such as '&', '#'
        # or '%' in the keyword are percent-encoded instead of corrupting the
        # URL or injecting extra parameters.
        # The timeout keeps a stalled arXiv connection from hanging the
        # request forever; the resulting exception is reported below.
        # NOTE(review): verify=False is kept from the original code; it only
        # matters if the request ends up on HTTPS -- confirm it is needed.
        # NOTE(review): requests is blocking and runs on the event loop here.
        arxiv_search_result = requests.get(
            "http://export.arxiv.org/api/query",
            params={
                "search_query": f"all:{query.keyword}",
                "max_results": query.limit,
            },
            timeout=30,
            verify=False,
        )
        # Surface HTTP errors explicitly instead of failing later with an
        # obscure parsing error on arXiv's error feed.
        arxiv_search_result.raise_for_status()
        response = xmlparser.fromstring(arxiv_search_result.text)
        for pub in response.findall(f"{XML_NAMESPACE}entry"):
            # Entry ids look like http://arxiv.org/abs/<id>. Old-style ids
            # contain a slash (e.g. hep-th/9901001v1), so keep everything
            # after "/abs/" rather than only the last path segment.
            id_pub = pub.find(f"{XML_NAMESPACE}id").text.split("/abs/")[-1]
            title_pub = pub.find(f"{XML_NAMESPACE}title").text
            authors = " and ".join(
                author.find(f"{XML_NAMESPACE}name").text
                for author in pub.findall(f"{XML_NAMESPACE}author")
            )
            pub_date = datetime.strptime(
                pub.find(f"{XML_NAMESPACE}published").text,
                "%Y-%m-%dT%H:%M:%SZ",
            ).strftime("%d/%m/%Y")
            abstract = pub.find(f"{XML_NAMESPACE}summary").text
            content[id_pub] = {
                "title": title_pub,
                "authors": authors,
                "date": pub_date,
                "abstract": abstract,
                "pdf": f"http://arxiv.org/pdf/{id_pub}"
            }
        return {"error": False, "message": content}
    except Exception as e:
        # Endpoint boundary: report any failure to the client as an error
        # payload rather than an HTTP 500.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}
# A line holding only the word "References", optionally numbered ("7." / "VII").
_ARXIV_REFERENCES_HEADING = re.compile(
    r"^[ \t]*(?:(?:[IVX]+|[0-9]+)\.?[ \t]+)?references[ \t]*$",
    re.IGNORECASE | re.MULTILINE,
)
# A numbered, upper-case heading on a line of its own, e.g. "II. RELATED WORK".
_ARXIV_SECTION_HEADING = re.compile(
    r"^[ \t]*((?:[IVX]+|[0-9]+)\.[ \t][A-Z0-9 \t]+)$",
    re.MULTILINE,
)


def _arxiv_cut_before_references(text):
    """Return *text* truncated just before its bibliography, if one is found."""
    # Prefer a stand-alone heading line; cutting at the first mention of the
    # word anywhere (e.g. "see references therein" in the introduction) would
    # throw away most of the paper.
    matches = list(_ARXIV_REFERENCES_HEADING.finditer(text))
    if not matches:
        matches = list(re.finditer(r"references", text, re.IGNORECASE))
    if not matches:
        return text
    # The bibliography sits at the end, so the last candidate is the best one.
    return text[:matches[-1].start()]


def _arxiv_clean_text(text):
    """Drop bracketed/parenthesised spans and punctuation, collapse whitespace."""
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'\(.*?\)', ' ', text)
    text = re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*]", '', text)
    return re.sub(r"\s+", " ", text).strip()


@app.post("/extract_pdf/arxiv_id")
async def extract_arxiv_pdf(document: DocumentID):
    """Download an arXiv PDF and return its cleaned text and section titles.

    Args:
        document: Carries the arXiv identifier appended to
            ``http://arxiv.org/pdf/``.

    Returns:
        On success ``{"pub_id", "titles", "text", "error": False}``:
        ``text`` is the body text cut before the references and cleaned;
        ``titles`` is a list of ``(level, title)`` pairs, taken from the PDF
        outline (levels 1 and 2) or, when the PDF has no outline, from
        numbered upper-case heading lines (level ``-1``). ``titles`` is the
        string ``"No titles found !"`` when nothing was detected.
        On download failure ``{"error": True, "message": <text>}``.
    """
    pdf_url = f"http://arxiv.org/pdf/{document.doc_id}"
    try:
        # The timeout prevents a stalled download from hanging the request.
        # NOTE(review): verify=False is kept from the original code; confirm
        # it is still needed. requests is blocking and runs on the event loop.
        pdf_req = requests.get(pdf_url, timeout=60, verify=False)
    except requests.RequestException as e:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print(f"Error while downloading PDF : {str(e)}")
        return {"error": True, "message": "Error while downloading PDF: " + str(e)}

    if pdf_req.status_code != 200:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    # Read everything needed inside the context manager so the document is
    # closed even if extraction raises.
    with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
        pdf_text = " ".join([page.get_text("text") for page in doc])
        toc = doc.get_toc()

    pdf_text = _arxiv_cut_before_references(pdf_text)
    postprocess_text = _arxiv_clean_text(pdf_text)

    if toc:
        # Outline entries start with (level, title); keep the top two levels.
        main_titles = [(entry[0], entry[1]) for entry in toc if entry[0] in (1, 2)]
    else:
        # Search the text that still has its line breaks: the heading pattern
        # is line-anchored and can never match once whitespace is collapsed.
        main_titles = [(-1, heading) for heading in _ARXIV_SECTION_HEADING.findall(pdf_text)]

    result = {
        "pub_id": document.doc_id,
        "titles": "No titles found !",
        "text": postprocess_text,
        "error": False,
    }
    if main_titles:
        result["titles"] = [(level, _arxiv_clean_text(title)) for level, title in main_titles]
    return result
@app.post("/extract_pdf/arxiv_id/random")
async def extract_random_arxiv_pdf(query: Query):
    """Search arXiv and extract the PDF of one randomly chosen result.

    Args:
        query: Search keyword and maximum number of results to pick from.

    Returns:
        The payload of ``extract_arxiv_pdf`` for the chosen article, or an
        ``{"error": True, "message": ...}`` payload when the search failed
        or returned no article.
    """
    pubs = await get_articles(query)
    if pubs["error"]:
        # On failure "message" is an error string, not a dict of articles.
        return pubs

    candidates = list(pubs["message"])
    if not candidates:
        # random.choice raises IndexError on an empty sequence.
        return {"error": True, "message": "No article found for this query"}

    # extract_arxiv_pdf reads `.doc_id`, so wrap the id in the request model
    # instead of passing the bare string.
    return await extract_arxiv_pdf(DocumentID(doc_id=random.choice(candidates)))
@app.post("/extract_pdf/url")
async def extract_pdf(url: WebPDF):
    """Download the PDF at ``url.url`` and return its title and raw text.

    Args:
        url: Request body whose ``url`` field is the address of the PDF.

    Returns:
        ``{"error": False, "title": <metadata title or "">, "text": <text>}``
        on success, or ``{"error": True, "message": <text>}`` when the
        download fails.
    """
    # The request body is a model; the address itself lives in its `url` field.
    pdf_url = url.url
    try:
        # NOTE(review): this fetches an arbitrary client-supplied address from
        # the server side (SSRF surface) -- consider restricting allowed hosts.
        # The timeout prevents a stalled download from hanging the request.
        pdf_req = requests.get(pdf_url, timeout=60)
    except requests.RequestException as e:
        print("URL: " + pdf_url)
        print(f"Error while downloading PDF : {str(e)}")
        return {"error": True, "message": "Error while downloading PDF: " + str(e)}

    if pdf_req.status_code != 200:
        print("URL: " + pdf_url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    # The context manager closes the document even if extraction raises.
    with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
        pdf_text = " ".join([page.get_text("text") for page in doc])
        pdf_metadata = doc.metadata or {}

    # The title entry may be missing or None; fall back to an empty string.
    title = (pdf_metadata.get("title") or "").strip()
    return {"error": False, "title": title, "text": pdf_text}