File size: 4,260 Bytes
61b2353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07e2819
61b2353
 
 
 
 
 
 
07e2819
61b2353
07e2819
61b2353
a5f46a9
61b2353
c2b2088
 
 
 
 
 
 
 
a5f46a9
61b2353
743537e
 
88cff0c
a5f46a9
61b2353
 
 
 
88cff0c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import xml.etree.ElementTree as xmlparser
import requests
import sys
import random
import fitz
import re
from io import BytesIO
from datetime import datetime


def receive_signal(signalNumber, frame):
    """Signal handler: report the received signal number, then stop the process.

    Args:
        signalNumber: Number of the signal that was delivered.
        frame: Stack frame active when the signal arrived (unused).

    Raises:
        SystemExit: Always, to terminate the interpreter.
    """
    print('Received:', signalNumber)
    # Equivalent to calling sys.exit() with no argument.
    raise SystemExit


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context for the application.

    Before yielding control to the application, registers ``receive_signal``
    as the handler for SIGINT. Nothing is done after the yield.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    from signal import SIGINT, signal as register_handler

    register_handler(SIGINT, receive_signal)
    yield

# Application instance, wired to the `lifespan` context manager defined above.
app = FastAPI(lifespan=lifespan)

# Origins accepted by the CORS middleware; "*" is the wildcard entry.
origins = ["*"]

# NOTE(review): wildcard origins are combined with allow_credentials=True here.
# Confirm against the FastAPI/Starlette CORS documentation that this pairing is
# intended; if no client relies on cookies/credentials, consider disabling it.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=origins,
)

@app.get("/")
async def root():
    """Return a static message confirming that the API is running."""
    payload = {"message": "API started successfully"}
    return payload

@app.get("/search/{keyword}/{limit}")
async def get_articles(keyword: str, limit: int):
    """Search arXiv for ``keyword`` and return at most ``limit`` publications.

    Args:
        keyword: Free-text search term, matched against all arXiv fields.
        limit: Maximum number of results requested from the arXiv API.

    Returns:
        On success, ``{"error": False, "message": {pub_id: {...}}}`` where each
        entry holds the ``title``, ``authors`` (names joined with " and "),
        ``date`` (formatted ``dd/mm/YYYY``) and ``abstract`` of a publication.
        On any failure, ``{"error": True, "message": "<error description>"}``.
    """
    # Local import: keeps this handler self-contained while moving the blocking
    # HTTP call off the event loop.
    from fastapi.concurrency import run_in_threadpool

    XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
    content = {}
    try:
        # Passing the query through `params` URL-encodes the keyword, so
        # characters such as "&" or "#" cannot alter the request. TLS
        # verification stays enabled and a timeout prevents hanging forever.
        arxiv_search_result = await run_in_threadpool(
            requests.get,
            "https://export.arxiv.org/api/query",
            params={"search_query": f"all:{keyword}", "max_results": limit},
            timeout=30,
        )
        # Surface HTTP errors instead of trying to parse an error page as XML.
        arxiv_search_result.raise_for_status()
        # Parse the raw bytes so the XML parser applies the declared encoding.
        response = xmlparser.fromstring(arxiv_search_result.content)
        for pub in response.findall(f"{XML_NAMESPACE}entry"):
            # The identifier is the last path segment of the entry URL.
            id_pub = pub.find(f"{XML_NAMESPACE}id").text.split("/")[-1]
            authors = " and ".join(
                author.find(f"{XML_NAMESPACE}name").text
                for author in pub.findall(f"{XML_NAMESPACE}author")
            )
            published = pub.find(f"{XML_NAMESPACE}published").text
            pub_date = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ").strftime("%d/%m/%Y")
            content[id_pub] = {
                "title": pub.find(f"{XML_NAMESPACE}title").text,
                "authors": authors,
                "date": pub_date,
                "abstract": pub.find(f"{XML_NAMESPACE}summary").text,
            }
        return {"error": False, "message": content}
    except Exception as e:
        # Endpoint boundary: report every failure to the caller as a payload.
        print(f"Error while downloading data : {str(e)}")
        return {"error": True, "message": str(e)}

# Matches a line consisting only of an optionally numbered "References" heading.
_REFERENCES_HEADING = re.compile(
    r"^[ \t]*(?:(?:[IVX]+|[0-9]+)\.?[ \t]+)?references[ \t]*$",
    re.IGNORECASE | re.MULTILINE,
)


def _strip_reference_section(text):
    """Return ``text`` truncated where the reference section starts."""
    # Prefer a standalone heading line; the bare word may appear earlier in the
    # body and would otherwise cut the text short. Fall back to the first
    # occurrence of the word when no heading line exists.
    match = _REFERENCES_HEADING.search(text) or re.search(r"REFERENCES", text, re.IGNORECASE)
    return text[:match.start()] if match else text


def _clean_pdf_text(text):
    """Drop bracketed/parenthesised spans, punctuation and repeated spaces."""
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'\(.*?\)', ' ', text)
    text = re.sub(r"[\,\;\:\?\!\'\’\"\(\)\{\}\[\]\/\\\*\-]", '', text)
    return re.sub(r"\ +", " ", text)


def _parse_pdf(pdf_bytes):
    """Return ``(cleaned_text, main_titles)`` extracted from raw PDF bytes."""
    # The context manager closes the document even if extraction fails.
    with fitz.open(stream=BytesIO(pdf_bytes), filetype="pdf") as doc:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        toc = doc.get_toc()

    postprocess_text = _clean_pdf_text(_strip_reference_section(pdf_text))
    if toc:
        # Keep only the top-level entries of the table of contents.
        main_titles = [entry[1] for entry in toc if entry[0] == 1]
    else:
        # No embedded outline: look for numbered upper-case headings instead.
        regex_titles = r"(?:[IVX]+|[0-9]+)\.\s[A-Z0-9\s]+$"
        main_titles = re.findall(regex_titles, postprocess_text, flags=re.MULTILINE)
    return postprocess_text, main_titles


@app.get("/extract/{id_doc}")
async def extract_text_pdf(id_doc: str):
    """Download an arXiv PDF and return its cleaned text and main titles.

    Args:
        id_doc: arXiv identifier of the publication to download.

    Returns:
        On success, ``{"pub_id", "titles", "text", "error": False}`` where
        ``titles`` is the list of main titles, or the string
        ``"No titles found !"`` when none could be determined.
        On failure, ``{"error": True, "message": "<error description>"}``.
    """
    # Local imports keep this block self-contained.
    from urllib.parse import quote

    from fastapi.concurrency import run_in_threadpool

    url = f"https://arxiv.org/pdf/{quote(id_doc)}"
    try:
        # Run the blocking download in a worker thread, with TLS verification
        # enabled and a timeout so a stalled connection cannot hang the request.
        pdf_req = await run_in_threadpool(requests.get, url, timeout=60)
    except requests.RequestException as e:
        print(f"Error while downloading PDF : {str(e)}")
        return {"error": True, "message": "Error while downloading PDF: " + str(e)}

    if pdf_req.status_code != 200:
        print("ID: " + id_doc)
        print("URL: " + url)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    try:
        # PDF parsing is CPU-bound; keep it off the event loop as well.
        postprocess_text, main_titles = await run_in_threadpool(_parse_pdf, pdf_req.content)
    except Exception as e:
        # Endpoint boundary: e.g. the server answered 200 with a non-PDF body.
        print(f"Error while parsing PDF : {str(e)}")
        return {"error": True, "message": "Error while parsing PDF: " + str(e)}

    return {
        "pub_id": id_doc,
        "titles": main_titles if main_titles else "No titles found !",
        "text": postprocess_text,
        "error": False,
    }

@app.get("/extract/random/{keyword}/{limit}")
async def extract_random_pdf(keyword: str, limit: int):
    """Extract the text of one randomly chosen search result.

    Args:
        keyword: Search term forwarded to ``get_articles``.
        limit: Maximum number of search results to choose from.

    Returns:
        The payload of ``extract_text_pdf`` for the chosen publication, or an
        ``{"error": True, "message": ...}`` payload when the search failed or
        returned no publication.
    """
    pubs = await get_articles(keyword, limit)
    # A failed search carries a string message, not a dict of publications.
    if pubs["error"]:
        return pubs

    pub_ids = list(pubs["message"])
    # random.choice raises IndexError on an empty sequence.
    if not pub_ids:
        return {"error": True, "message": f"No publication found for keyword: {keyword}"}

    return await extract_text_pdf(random.choice(pub_ids))