"""HTTP API that searches arXiv and extracts cleaned text from arXiv PDFs."""

from contextlib import asynccontextmanager
from datetime import datetime
from io import BytesIO
import random
import re
import signal
import sys
import xml.etree.ElementTree as xmlparser

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import fitz
import requests

# Namespace prefix of every element in the Atom feed returned by arXiv.
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
ARXIV_QUERY_URL = "http://export.arxiv.org/api/query"
# Seconds to wait for arXiv before giving up; without a timeout `requests`
# can block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# NOTE(review): both outgoing requests pass verify=False, which disables TLS
# certificate verification if the request is redirected to HTTPS. Kept as-is
# to avoid breaking an existing deployment - confirm whether it is required.
# NOTE(review): `requests.get` is a blocking call made from async handlers;
# consider moving it to a worker thread if concurrency matters.


def receive_signal(signalNumber, frame):
    """Signal handler: print the received signal number and exit the process."""
    print('Received:', signalNumber)
    sys.exit()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Register `receive_signal` as the SIGINT handler on lifespan entry."""
    signal.signal(signal.SIGINT, receive_signal)
    yield


app = FastAPI(lifespan=lifespan)

origins = [
    "*",
]

# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive - confirm this is intended outside of development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/")
async def root():
    """Return a static message showing that the API is up."""
    return {"message": "API started successfully"}


@app.get("/search/{keyword}/{limit}")
async def get_articles(keyword: str, limit: int):
    """Search arXiv for *keyword* and return at most *limit* publications.

    Returns:
        On success ``{"error": False, "message": {...}}`` where ``message``
        maps each publication id (last path segment of the Atom ``id``) to a
        dict with ``title``, ``authors`` (names joined by ``" and "``),
        ``date`` (``dd/mm/YYYY``) and ``abstract``.
        On any failure ``{"error": True, "message": <error text>}``.
    """
    content = {}
    try:
        # `params=` URL-encodes the keyword, so characters such as '&', '#'
        # or spaces can no longer corrupt or inject query parameters.
        arxiv_search_result = requests.get(
            ARXIV_QUERY_URL,
            params={"search_query": f"all:{keyword}", "max_results": limit},
            verify=False,
            timeout=REQUEST_TIMEOUT,
        )
        # Report HTTP errors explicitly instead of trying to parse an error page.
        arxiv_search_result.raise_for_status()
        response = xmlparser.fromstring(arxiv_search_result.text)

        for pub in response.findall(f"{XML_NAMESPACE}entry"):
            id_pub = pub.find(f"{XML_NAMESPACE}id").text.split("/")[-1]
            title_pub = pub.find(f"{XML_NAMESPACE}title").text
            authors = " and ".join(
                author.find(f"{XML_NAMESPACE}name").text
                for author in pub.findall(f"{XML_NAMESPACE}author")
            )
            pub_date = datetime.strptime(
                pub.find(f"{XML_NAMESPACE}published").text,
                "%Y-%m-%dT%H:%M:%SZ",
            ).strftime("%d/%m/%Y")
            abstract = pub.find(f"{XML_NAMESPACE}summary").text

            content[id_pub] = {
                "title": title_pub,
                "authors": authors,
                "date": pub_date,
                "abstract": abstract,
            }
        return {"error": False, "message": content}
    except Exception as e:
        # Endpoint boundary: log and turn any failure into an error payload.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}


def _strip_references(text):
    """Cut *text* at the first case-insensitive occurrence of "references"."""
    match = re.search(r"REFERENCES", text, re.IGNORECASE)
    return text[:match.start()] if match else text


def _clean_text(text):
    """Drop bracketed/parenthesised spans and punctuation, then squeeze spaces."""
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'\(.*?\)', ' ', text)
    text = re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*\-]", '', text)
    return re.sub(r"\ +", " ", text)


@app.get("/extract/{id_doc}")
async def extract_text_pdf(id_doc: str):
    """Download the arXiv PDF *id_doc* and return its cleaned text and titles.

    The text is truncated at the first occurrence of "references", stripped
    of bracketed/parenthesised spans and punctuation, and has runs of spaces
    collapsed. Titles are the level-1 entries of the PDF table of contents,
    or, when the PDF has no table of contents, regex matches of numbered
    upper-case headings in the cleaned text.

    Returns:
        On success ``{"pub_id", "titles", "text", "error": False}`` where
        ``titles`` is a list, or the string ``"No titles found !"`` when
        nothing was found.
        On failure ``{"error": True, "message": <error text>}``.
    """
    pdf_url = f"http://arxiv.org/pdf/{id_doc}"
    try:
        pdf_req = requests.get(pdf_url, verify=False, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # Network failures become an error payload, as in get_articles,
        # instead of an unhandled exception.
        print(f"Error while downloading PDF : {str(e)}")
        return {"error": True, "message": str(e)}

    if pdf_req.status_code != 200:
        print("ID: " + id_doc)
        print("URL: " + pdf_url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    try:
        # The context manager closes the document once text and TOC are read.
        with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
            pdf_text = " ".join(page.get_text("text") for page in doc)
            titles = doc.get_toc()
    except Exception as e:
        # Endpoint boundary: a body that cannot be parsed as a PDF is
        # reported to the caller rather than crashing the request.
        print(f"Error while parsing PDF : {str(e)}")
        return {"error": True, "message": str(e)}

    postprocess_text = _clean_text(_strip_references(pdf_text))

    if titles:
        # Each TOC entry starts with [level, title, ...]; keep top-level ones.
        main_titles = [title[1] for title in titles if title[0] == 1]
    else:
        regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
        main_titles = re.findall(regex_titles, postprocess_text, flags=re.MULTILINE)

    return {
        "pub_id": id_doc,
        "titles": main_titles if main_titles else "No titles found !",
        "text": postprocess_text,
        "error": False,
    }


@app.get("/extract/random/{keyword}/{limit}")
async def extract_random_pdf(keyword: str, limit: int):
    """Search arXiv for *keyword* and extract the text of one random result.

    Returns the payload of `extract_text_pdf` for the chosen publication, or
    ``{"error": True, "message": ...}`` when the search failed or returned
    no publication.
    """
    pubs = await get_articles(keyword, limit)
    if pubs["error"]:
        # On failure "message" is an error string, not a dict of publications.
        return pubs
    if not pubs["message"]:
        # random.choice raises IndexError on an empty sequence.
        return {"error": True, "message": "No publication found for keyword: " + keyword}
    return await extract_text_pdf(random.choice(list(pubs["message"])))