Spaces:
Sleeping
Sleeping
Omar ID EL MOUMEN
committed on
Commit
·
577d055
1
Parent(s):
959f2b1
Add POST extract PDF
Browse files
app.py
CHANGED
@@ -52,6 +52,9 @@ class Query(BaseModel):
|
|
52 |
class DocumentID(BaseModel):
|
53 |
doc_id: str
|
54 |
|
|
|
|
|
|
|
55 |
@app.post("/search")
|
56 |
async def get_articles(query: Query):
|
57 |
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
|
@@ -78,8 +81,8 @@ async def get_articles(query: Query):
|
|
78 |
print(f"Error while downloading data : {str(e)}")
|
79 |
return {"error": True, "message": str(e)}
|
80 |
|
81 |
-
@app.post("/
|
82 |
-
async def
|
83 |
pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
|
84 |
if pdf_req.status_code == 200:
|
85 |
pdf_data = BytesIO(pdf_req.content)
|
@@ -121,7 +124,21 @@ async def extract_text_pdf(document: DocumentID):
|
|
121 |
print("Status code: " + str(pdf_req.status_code))
|
122 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
123 |
|
124 |
-
@app.post("/
|
125 |
-
async def
|
126 |
pubs = await get_articles(query)
|
127 |
-
return await
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
class DocumentID(BaseModel):
    """Request body carrying the identifier of a document to fetch.

    The ``doc_id`` value is interpolated into ``http://arxiv.org/pdf/{doc_id}``
    by the arXiv extraction endpoint.
    """

    doc_id: str
|
54 |
|
55 |
+
class URL(BaseModel):
    """Request body carrying the address of a PDF to download."""

    # Address handed to the HTTP client by the /extract_pdf/url endpoint.
    url: str
|
57 |
+
|
58 |
@app.post("/search")
|
59 |
async def get_articles(query: Query):
|
60 |
XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
|
|
|
81 |
print(f"Error while downloading data : {str(e)}")
|
82 |
return {"error": True, "message": str(e)}
|
83 |
|
84 |
+
@app.post("/extract_pdf/arxiv_id")
|
85 |
+
async def extract_arxiv_pdf(document: DocumentID):
|
86 |
pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
|
87 |
if pdf_req.status_code == 200:
|
88 |
pdf_data = BytesIO(pdf_req.content)
|
|
|
124 |
print("Status code: " + str(pdf_req.status_code))
|
125 |
return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
|
126 |
|
127 |
+
@app.post("/extract_pdf/arxiv_id/random")
async def extract_random_arxiv_pdf(query: Query):
    """Run a search and extract the PDF text of one randomly chosen result.

    Args:
        query: Search request forwarded unchanged to ``get_articles``.

    Returns:
        The payload produced by ``extract_arxiv_pdf`` for the chosen result,
        or an ``{"error": True, "message": ...}`` dict when the search failed
        or returned nothing to choose from.
    """
    pubs = await get_articles(query)

    # get_articles reports failures as {"error": True, "message": <str>};
    # forward that payload instead of calling dict methods on a string.
    if pubs.get("error"):
        return pubs

    # NOTE(review): the keys of pubs["message"] are assumed to be arXiv IDs,
    # as the original call already treated them that way - confirm.
    doc_ids = list(pubs["message"])
    if not doc_ids:
        # random.choice raises IndexError on an empty sequence.
        return {"error": True, "message": "No article found for this query"}

    # extract_arxiv_pdf reads `document.doc_id`, so the bare key must be
    # wrapped in the request model rather than passed as a plain string.
    return await extract_arxiv_pdf(DocumentID(doc_id=random.choice(doc_ids)))
|
131 |
+
|
132 |
+
@app.post("/extract_pdf/url")
async def extract_pdf(url: URL):
    """Download the PDF located at ``url.url`` and return its title and text.

    Args:
        url: Request body whose ``url`` field holds the address of the PDF.

    Returns:
        ``{"error": False, "title": ..., "text": ...}`` when the download
        answers HTTP 200, otherwise ``{"error": True, "message": ...}``
        naming the HTTP status code received.
    """
    # `url` is the request model; the address itself lives in its `url` field.
    target = url.url

    # NOTE(review): verify=False disables TLS certificate checks and the
    # address is caller-supplied - confirm this is acceptable for deployment.
    pdf_req = requests.get(target, verify=False)
    if pdf_req.status_code != 200:
        print("URL: " + target)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    doc = fitz.open(stream=BytesIO(pdf_req.content), filetype="pdf")
    try:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        # Guard against missing metadata or a missing title so .strip() below
        # cannot be called on None.
        pdf_metadata = doc.metadata or {}
    finally:
        # Release the parsed document even if text extraction fails.
        doc.close()

    return {"error": False, "title": (pdf_metadata.get("title") or "").strip(), "text": pdf_text}
|