File size: 5,798 Bytes
61b2353
 
9764706
2e329bd
61b2353
 
 
9513d18
61b2353
 
 
2e329bd
61b2353
 
 
a13fabc
 
 
 
 
 
 
61b2353
 
 
 
 
 
 
 
 
 
 
 
 
 
2e329bd
 
61b2353
 
 
 
 
 
 
 
 
 
 
 
 
 
2e329bd
61b2353
9513d18
 
 
 
ca2c7e8
 
 
d588a4d
577d055
 
9513d18
 
61b2353
 
 
9513d18
61b2353
 
 
 
 
 
 
 
 
 
 
 
2ca42fd
 
61b2353
 
 
 
 
9513d18
577d055
 
ca2c7e8
61b2353
 
 
 
 
 
 
 
 
07e2819
61b2353
07e2819
61b2353
5e9984e
 
d8045d1
c2b2088
 
 
 
ad1e294
c2b2088
 
db5cf0a
bceef6c
 
61b2353
ca2c7e8
 
88cff0c
a5f46a9
61b2353
577d055
 
9513d18
577d055
 
 
0bf43b3
 
577d055
 
 
 
 
a13fabc
 
 
 
 
 
 
577d055
0bf43b3
577d055
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from contextlib import asynccontextmanager
import xml.etree.ElementTree as xmlparser
import requests
from pydantic import BaseModel
import sys
import random
import fitz
import re,os
from io import BytesIO
from datetime import datetime

def remove_in_betweens(text):
    """Replace every ``[...]`` group, then every ``(...)`` group, with a space.

    Matching is non-greedy and does not cross line breaks. Square brackets
    are handled first, so parentheses nested inside them disappear with them.
    """
    for enclosed in (r'\[.*?\]', r'\(.*?\)'):
        text = re.sub(enclosed, ' ', text)
    return text

def remove_punctuations(text):
    """Delete common punctuation characters from *text*.

    Removed characters: , ; : ? ! ' ’ " ( ) { } [ ] / \\ and *.
    Everything else (including periods and hyphens) is kept untouched.
    """
    unwanted = ",;:?!'’\"(){}[]/\\*"
    return text.translate(str.maketrans("", "", unwanted))

def receive_signal(signalNumber, frame):
    """Signal handler: print the received signal number, then exit.

    ``frame`` is accepted to match the handler signature but is not used.
    Exiting is done by raising ``SystemExit`` with no exit code.
    """
    print('Received:', signalNumber)
    raise SystemExit


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan hook: register ``receive_signal`` as the SIGINT handler.

    The handler is installed before control is yielded back; nothing is
    done after the ``yield``.
    """
    from signal import SIGINT, signal as install_handler
    install_handler(SIGINT, receive_signal)
    yield

# Application instance, created with the `lifespan` context manager defined above.
app = FastAPI(lifespan=lifespan)

# Expose the files of the local "static" directory under the /static URL prefix.
app.mount("/static", StaticFiles(directory="static"), name="static")

# Origins accepted by the CORS middleware; "*" is the wildcard entry.
origins = [
    "*",
]

# NOTE(review): wildcard origins, methods and headers are combined with
# allow_credentials=True — confirm this level of openness is intended before
# exposing the service publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

@app.get("/")
async def root():
    """Serve the landing page stored at ``templates/index.html``."""
    index_page = os.path.join("templates", "index.html")
    return FileResponse(index_page)

# Request body describing an arXiv search.
class Query(BaseModel):
    # Search term; sent to the arXiv API as ``search_query=all:<keyword>``.
    keyword: str
    # Maximum number of results; sent to the arXiv API as ``max_results``.
    limit: int

# Request body identifying a single arXiv publication.
class DocumentID(BaseModel):
    # Identifier appended to ``http://arxiv.org/pdf/`` to download the PDF.
    doc_id: str

# Request body pointing at a PDF hosted anywhere on the web.
class WebPDF(BaseModel):
    # Address the server downloads the PDF from.
    url: str

@app.post("/search")
async def get_articles(query: Query):
    """Search arXiv for ``query.keyword`` and describe the matching papers.

    Args:
        query: Search term (``keyword``) and maximum number of results
            (``limit``).

    Returns:
        ``{"error": False, "message": {arxiv_id: {...}}}`` on success, where
        each value holds ``title``, ``authors`` (names joined by " and "),
        ``date`` (``dd/mm/YYYY``), ``abstract`` and ``pdf`` (download URL).
        ``{"error": True, "message": <reason>}`` when the request or the
        parsing of the answer fails.
    """
    XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
    content = {}
    try:
        # Let requests build the query string so that characters such as "&",
        # "#" or spaces in the keyword are URL-encoded instead of silently
        # changing the request.
        # NOTE(review): verify=False is kept from the original code; it
        # disables TLS certificate checks if the request ends up on HTTPS.
        # NOTE(review): requests is synchronous, so this call blocks the
        # event loop for the duration of the download.
        arxiv_search_result = requests.get(
            "http://export.arxiv.org/api/query",
            params={
                "search_query": f"all:{query.keyword}",
                "max_results": query.limit,
            },
            verify=False,
            timeout=30,
        )
        # Report HTTP failures as such instead of trying to parse an error page.
        arxiv_search_result.raise_for_status()
        # Parse the raw bytes so the encoding declared by the feed is used
        # rather than the one requests guesses for ``.text``.
        response = xmlparser.fromstring(arxiv_search_result.content)
        for pub in response.findall(f"{XML_NAMESPACE}entry"):
            entry_url = pub.find(f"{XML_NAMESPACE}id").text
            # Keep everything after "/abs/" so old-style identifiers such as
            # "hep-th/9901001v1" keep their archive prefix; fall back to the
            # last path segment for any other URL shape.
            _, marker, id_pub = entry_url.rpartition("/abs/")
            if not marker:
                id_pub = entry_url.split("/")[-1]
            title_pub = pub.find(f"{XML_NAMESPACE}title").text
            authors = " and ".join(
                author.find(f"{XML_NAMESPACE}name").text
                for author in pub.findall(f"{XML_NAMESPACE}author")
            )
            published = pub.find(f"{XML_NAMESPACE}published").text
            pub_date = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ").strftime("%d/%m/%Y")
            abstract = pub.find(f"{XML_NAMESPACE}summary").text
            content[id_pub] = {
                "title": title_pub,
                "authors": authors,
                "date": pub_date,
                "abstract": abstract,
                "pdf": f"http://arxiv.org/pdf/{id_pub}",
            }
        return {"error": False, "message": content}
    except Exception as e:
        # Endpoint boundary: every failure is reported to the client as an
        # error payload rather than an unhandled server error.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}
    
@app.post("/extract_pdf/arxiv_id")
async def extract_arxiv_pdf(document: DocumentID):
    """Download an arXiv PDF and return its cleaned text and section titles.

    Args:
        document: Carries the arXiv identifier (``doc_id``) of the paper.

    Returns:
        On success ``{"pub_id", "titles", "text", "error": False}``.
        ``titles`` is a list of ``(level, title)`` pairs: level 1/2 entries
        of the PDF outline when it has one, otherwise ``(-1, title)`` pairs
        guessed from the text. It is the string ``"No titles found !"`` when
        nothing was found. ``text`` is the body text cut before the
        reference list, without bracketed/parenthesised groups and
        punctuation, with whitespace collapsed.
        On failure ``{"error": True, "message": <reason>}``.
    """
    pdf_url = f"http://arxiv.org/pdf/{document.doc_id}"
    try:
        # NOTE(review): verify=False is kept from the original code; it
        # disables TLS certificate checks if the request ends up on HTTPS.
        pdf_req = requests.get(pdf_url, verify=False, timeout=60)
    except requests.RequestException as e:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print(f"Request failed: {e}")
        return {"error": True, "message": f"Error while downloading PDF: {e}"}

    if pdf_req.status_code != 200:
        print("ID: " + document.doc_id)
        print("URL: " + pdf_url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    try:
        doc = fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf")
    except Exception as e:
        # The payload may not be a readable PDF; report it instead of
        # failing the whole request.
        print(f"Error while reading PDF : {e}")
        return {"error": True, "message": f"Error while reading PDF: {e}"}
    try:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        outline = doc.get_toc()
    finally:
        # Release the document as soon as the text and outline are read.
        doc.close()

    pdf_text = _cut_before_references(pdf_text)

    postprocess_text = remove_in_betweens(pdf_text)
    postprocess_text = remove_punctuations(postprocess_text)
    postprocess_text = re.sub(r"\s+", " ", postprocess_text).strip()

    if outline:
        # Outline entries start with (level, title); keep the top two levels.
        main_titles = [(entry[0], entry[1]) for entry in outline if entry[0] in (1, 2)]
    else:
        # Titles are searched in the text that still has its line breaks:
        # the pattern is anchored on line boundaries, which no longer exist
        # once whitespace has been collapsed.
        main_titles = _guess_section_titles(pdf_text)

    cleaned_titles = [
        (level, re.sub(r"\s+", " ", remove_punctuations(remove_in_betweens(title))).strip())
        for level, title in main_titles
    ]
    return {
        "pub_id": document.doc_id,
        "titles": cleaned_titles if cleaned_titles else "No titles found !",
        "text": postprocess_text,
        "error": False,
    }


def _cut_before_references(text):
    """Return *text* truncated right before its reference list, if any."""
    # Prefer the last line made only of an optionally numbered "References"
    # heading: a bare substring search also hits words such as
    # "preferences" or an early "see references" and would drop the body.
    headings = list(re.finditer(
        r"^[ \t]*(?:(?:[IVXLC]+|\d+)\.?[ \t]+)?references[ \t]*$",
        text,
        flags=re.IGNORECASE | re.MULTILINE,
    ))
    if headings:
        return text[:headings[-1].start()]
    # No stand-alone heading: fall back to the first occurrence of the word.
    first_mention = re.search(r"REFERENCES", text, re.IGNORECASE)
    return text if first_mention is None else text[:first_mention.start()]


def _guess_section_titles(text):
    """Return ``(-1, title)`` for each line shaped like "II. RELATED WORK"."""
    numbered_heading = r"^[ \t]*((?:[IVX]+|[0-9]+)\.[ \t][A-Z0-9 \t]+)$"
    return [(-1, found) for found in re.findall(numbered_heading, text, flags=re.MULTILINE)]

@app.post("/extract_pdf/arxiv_id/random")
async def extract_random_arxiv_pdf(query: Query):
    """Run a search and extract the text of one randomly chosen result.

    Args:
        query: Search term and maximum number of results to pick from.

    Returns:
        The payload of ``extract_arxiv_pdf`` for the chosen publication, or
        ``{"error": True, "message": <reason>}`` when the search failed or
        returned no publication.
    """
    pubs = await get_articles(query)
    if pubs["error"]:
        # On failure "message" is the error text, not a dict of results.
        return pubs
    if not pubs["message"]:
        return {"error": True, "message": "No publication found for this query"}
    chosen_id = random.choice(list(pubs["message"]))
    # extract_arxiv_pdf reads ``document.doc_id``, so wrap the identifier in
    # the request model instead of passing the bare string.
    return await extract_arxiv_pdf(DocumentID(doc_id=chosen_id))

@app.post("/extract_pdf/url")
async def extract_pdf(pdf: WebPDF):
    """Download the PDF at ``pdf.url`` and return its title and cleaned text.

    Args:
        pdf: Carries the address of the PDF to download.

    Returns:
        ``{"error": False, "title", "text"}`` on success, where ``title``
        comes from the PDF metadata (empty string when absent) and ``text``
        is the document text without bracketed/parenthesised groups and
        punctuation, with whitespace collapsed.
        ``{"error": True, "message": <reason>}`` on failure.
    """
    # NOTE(review): the URL comes straight from the request body, so the
    # server can be made to fetch arbitrary addresses (including internal
    # ones) — consider restricting the allowed hosts.
    try:
        pdf_req = requests.get(pdf.url, timeout=60)
    except requests.RequestException as e:
        print("URL: " + pdf.url)
        print(f"Request failed: {e}")
        return {"error": True, "message": f"Error while downloading PDF: {e}"}

    if pdf_req.status_code != 200:
        print("URL: " + pdf.url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    try:
        doc = fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf")
    except Exception as e:
        # The payload may not be a readable PDF; report it instead of
        # failing the whole request.
        print(f"Error while reading PDF : {e}")
        return {"error": True, "message": f"Error while reading PDF: {e}"}
    try:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        # Tolerate a missing metadata dictionary.
        pdf_metadata = doc.metadata or {}
    finally:
        # Release the document as soon as text and metadata are read.
        doc.close()
    print(pdf_metadata)

    postprocess_text = remove_in_betweens(pdf_text)
    postprocess_text = remove_punctuations(postprocess_text)
    postprocess_text = re.sub(r"\s+", " ", postprocess_text).strip()
    # The title entry may be absent or None; normalise both to "".
    title = (pdf_metadata.get("title") or "").strip()
    return {"error": False, "title": title, "text": postprocess_text}