Spaces:
Sleeping
Sleeping
File size: 5,798 Bytes
61b2353 9764706 2e329bd 61b2353 9513d18 61b2353 2e329bd 61b2353 a13fabc 61b2353 2e329bd 61b2353 2e329bd 61b2353 9513d18 ca2c7e8 d588a4d 577d055 9513d18 61b2353 9513d18 61b2353 2ca42fd 61b2353 9513d18 577d055 ca2c7e8 61b2353 07e2819 61b2353 07e2819 61b2353 5e9984e d8045d1 c2b2088 ad1e294 c2b2088 db5cf0a bceef6c 61b2353 ca2c7e8 88cff0c a5f46a9 61b2353 577d055 9513d18 577d055 0bf43b3 577d055 a13fabc 577d055 0bf43b3 577d055 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from contextlib import asynccontextmanager
import xml.etree.ElementTree as xmlparser
import requests
from pydantic import BaseModel
import sys
import random
import fitz
import re,os
from io import BytesIO
from datetime import datetime
def remove_in_betweens(text):
    """Replace bracketed and parenthesised spans of *text* with a space.

    Square-bracket spans (``[...]``) are handled first, then round-bracket
    spans (``(...)``) in the result.  Matching is non-greedy and, because
    ``.`` does not match a newline, a span is only removed when it opens
    and closes on the same line.
    """
    for enclosed_span in (r'\[.*?\]', r'\(.*?\)'):
        text = re.sub(enclosed_span, ' ', text)
    return text
def remove_punctuations(text):
    """Return *text* with a fixed set of punctuation characters deleted.

    The characters removed are: , ; : ? ! ' ’ " ( ) { } [ ] / \\ and *.
    Periods and hyphens are left in place.
    """
    punctuation_class = r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*]"
    return re.sub(punctuation_class, '', text)
def receive_signal(signalNumber, frame):
    """Signal handler: report the signal number on stdout, then exit.

    The parameters follow the handler signature used by ``signal.signal``;
    ``frame`` is accepted but not used.  Exiting is done through
    ``sys.exit()``, i.e. by raising ``SystemExit`` with no exit code.
    """
    print('Received:', signalNumber)
    sys.exit()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context for the application: install the SIGINT handler.

    Before yielding, ``receive_signal`` is registered as the handler for
    ``SIGINT``.  Nothing runs after the ``yield``, so there is no
    shutdown-time work.
    """
    # Imported locally, as in the original, so the module-level namespace
    # is not touched.
    from signal import SIGINT, signal as install_handler

    install_handler(SIGINT, receive_signal)
    yield
# Application instance; the `lifespan` context registers the SIGINT handler.
app = FastAPI(lifespan=lifespan)

# Serve the local "static" directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")

# CORS policy: every origin, method and header is accepted, with credentials.
# NOTE(review): a wildcard origin combined with allow_credentials=True is very
# permissive — confirm this is intended before exposing the API publicly.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.get("/")
async def root():
    """Serve the page stored at ``templates/index.html``."""
    index_page = os.path.join("templates", "index.html")
    return FileResponse(index_page)
class Query(BaseModel):
    """Request body describing an arXiv search."""

    # Search term; used as the value of arXiv's `all:` search field.
    keyword: str
    # Number of results requested from arXiv (sent as `max_results`).
    limit: int
class DocumentID(BaseModel):
    """Request body identifying a single arXiv document."""

    # arXiv identifier; appended to the arXiv PDF base URL to download the file.
    doc_id: str
class WebPDF(BaseModel):
    """Request body pointing at a PDF hosted at an arbitrary URL."""

    # Address the server downloads the PDF from.
    url: str
def _parse_arxiv_feed(feed_xml):
    """Turn an arXiv Atom feed into a dict of publications keyed by arXiv id."""
    ns = "{http://www.w3.org/2005/Atom}"
    publications = {}
    for entry in xmlparser.fromstring(feed_xml).findall(f"{ns}entry"):
        id_url = entry.find(f"{ns}id").text
        # Keep everything after "/abs/" so that old-style identifiers such as
        # "hep-th/9901001v1" keep their category prefix (taking only the last
        # path segment produced a PDF link that does not exist).  Fall back to
        # the last segment when the URL has no "/abs/" part.
        _, marker, tail = id_url.partition("/abs/")
        id_pub = tail if marker else id_url.rsplit("/", 1)[-1]

        author_names = [
            author.find(f"{ns}name").text
            for author in entry.findall(f"{ns}author")
        ]
        published = datetime.strptime(
            entry.find(f"{ns}published").text, "%Y-%m-%dT%H:%M:%SZ"
        )
        publications[id_pub] = {
            "title": entry.find(f"{ns}title").text,
            "authors": " and ".join(author_names),
            "date": published.strftime("%d/%m/%Y"),
            "abstract": entry.find(f"{ns}summary").text,
            "pdf": f"http://arxiv.org/pdf/{id_pub}",
        }
    return publications


@app.post("/search")
async def get_articles(query: Query):
    """Search arXiv for ``query.keyword`` and return the matching publications.

    Args:
        query: search term and maximum number of results.

    Returns:
        ``{"error": False, "message": {arxiv_id: {...}}}`` on success, where
        each value holds ``title``, ``authors``, ``date`` (dd/mm/YYYY),
        ``abstract`` and ``pdf``; ``{"error": True, "message": <reason>}``
        when the request or the parsing fails.
    """
    try:
        # The query goes through `params` so the keyword is URL-encoded
        # (spaces, '&', '#', ... no longer corrupt or extend the query string).
        # HTTPS with default certificate verification replaces verify=False,
        # and a timeout keeps a stalled connection from hanging the request.
        # NOTE(review): requests is a blocking client called from an
        # `async def` endpoint — consider off-loading it to a worker thread.
        arxiv_search_result = requests.get(
            "https://export.arxiv.org/api/query",
            params={
                "search_query": f"all:{query.keyword}",
                "max_results": query.limit,
            },
            timeout=30,
        )
        arxiv_search_result.raise_for_status()
        # Parse the raw bytes: the XML parser then honours the encoding
        # declared by the feed instead of the one guessed for `.text`.
        content = _parse_arxiv_feed(arxiv_search_result.content)
    except Exception as e:
        # Endpoint boundary: any failure is reported to the client as an
        # error payload rather than an unhandled server error.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}
    return {"error": False, "message": content}
@app.post("/extract_pdf/arxiv_id")
async def extract_arxiv_pdf(document: DocumentID):
    """Download an arXiv PDF and return its cleaned text and section titles.

    Args:
        document: carries the arXiv identifier of the paper.

    Returns:
        On success ``{"pub_id", "titles", "text", "error": False}``.
        ``titles`` is a list of ``(level, title)`` pairs — level 1 or 2 when
        taken from the PDF outline, ``-1`` when found by the regex fallback —
        or the string ``"No titles found !"`` when none were detected.
        On a download failure ``{"error": True, "message": <reason>}``.
    """
    # HTTPS with default certificate verification replaces verify=False.
    pdf_url = f"https://arxiv.org/pdf/{document.doc_id}"
    try:
        # NOTE(review): requests is a blocking client called from an
        # `async def` endpoint — consider off-loading it to a worker thread.
        pdf_req = requests.get(pdf_url, timeout=60)
    except requests.RequestException as e:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print(f"Error while downloading PDF: {e}")
        return {"error": True, "message": f"Error while downloading PDF: {e}"}

    if pdf_req.status_code != 200:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    # The context manager closes the document once everything needed has
    # been read from it.
    with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        toc = doc.get_toc()

    # Drop everything from the first occurrence of "references" (any case).
    # NOTE(review): this also triggers on the word inside running text —
    # confirm that cutting at the first occurrence is acceptable.
    ref_match = re.search(r"REFERENCES", pdf_text, re.IGNORECASE)
    if ref_match:
        pdf_text = pdf_text[:ref_match.start()]

    cleaned_text = remove_punctuations(remove_in_betweens(pdf_text))
    postprocess_text = re.sub(r"\s+", " ", cleaned_text).strip()

    if toc:
        # Outline entries are [level, title, ...]; keep the two top levels.
        main_titles = [(entry[0], entry[1]) for entry in toc if entry[0] in (1, 2)]
    else:
        # No outline: look for numbered upper-case headings.  The pattern
        # relies on `$` matching at line ends, so it must run on the text
        # that still has its line breaks — on the whitespace-collapsed text
        # it could only ever match at the very end of the document.
        regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
        main_titles = [
            (-1, heading)
            for heading in re.findall(regex_titles, cleaned_text, flags=re.MULTILINE)
        ]

    if not main_titles:
        return {"pub_id": document.doc_id, "titles": "No titles found !", "text": postprocess_text, "error": False}

    titles = [
        (level, re.sub(r"\s+", " ", remove_punctuations(remove_in_betweens(title))).strip())
        for level, title in main_titles
    ]
    return {"pub_id": document.doc_id, "titles": titles, "text": postprocess_text, "error": False}
@app.post("/extract_pdf/arxiv_id/random")
async def extract_random_arxiv_pdf(query: Query):
    """Search arXiv, pick one result at random and extract its PDF.

    Args:
        query: search term and maximum number of results.

    Returns:
        The payload of ``extract_arxiv_pdf`` for the chosen publication, or
        ``{"error": True, "message": <reason>}`` when the search failed or
        returned nothing.
    """
    pubs = await get_articles(query)
    # A failed search carries a string in "message"; forward it unchanged
    # instead of trying to read publication ids from it.
    if pubs.get("error"):
        return pubs

    candidates = list(pubs["message"])
    if not candidates:
        # random.choice raises IndexError on an empty sequence.
        return {"error": True, "message": "No publication found for this query"}

    # extract_arxiv_pdf reads `.doc_id`, so the bare id has to be wrapped in
    # the request model; passing the string raised AttributeError.
    return await extract_arxiv_pdf(DocumentID(doc_id=random.choice(candidates)))
@app.post("/extract_pdf/url")
async def extract_pdf(pdf: WebPDF):
    """Download a PDF from an arbitrary URL and return its title and text.

    Args:
        pdf: carries the URL to download.

    Returns:
        ``{"error": False, "title": <metadata title>, "text": <cleaned text>}``
        on success, ``{"error": True, "message": <reason>}`` when the download
        fails, answers with a non-200 status, or the payload cannot be read
        as a PDF.
    """
    # NOTE(review): the server fetches a caller-supplied URL, which allows
    # requests to internal addresses — consider restricting allowed hosts.
    # NOTE(review): requests is a blocking client called from an `async def`
    # endpoint — consider off-loading it to a worker thread.
    try:
        pdf_req = requests.get(pdf.url, timeout=60)
    except requests.RequestException as e:
        print("URL: " + pdf.url)
        print(f"Error while downloading PDF: {e}")
        return {"error": True, "message": f"Error while downloading PDF: {e}"}

    if pdf_req.status_code != 200:
        print("URL: " + pdf.url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    try:
        # The context manager closes the document once it has been read.
        with fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf") as doc:
            pdf_text = " ".join(page.get_text("text") for page in doc)
            pdf_metadata = doc.metadata
    except (RuntimeError, ValueError) as e:
        # A 200 response is not necessarily a PDF (HTML page, empty body...).
        print("URL: " + pdf.url)
        print(f"Error while reading PDF: {e}")
        return {"error": True, "message": f"Error while reading PDF: {e}"}
    print(pdf_metadata)

    postprocess_text = remove_punctuations(remove_in_betweens(pdf_text))
    postprocess_text = re.sub(r"\s+", " ", postprocess_text).strip()

    # The metadata dict, or its "title" entry, may be missing or None.
    title = ((pdf_metadata or {}).get("title") or "").strip()
    return {"error": False, "title": title, "text": postprocess_text}