Spaces:

OrganizedProgrammers
/

arXiv

Sleeping

App Files Community

Omar ID EL MOUMEN committed on Mar 28

Commit

577d055

1 Parent(s): 959f2b1

Add POST extract PDF

Browse files

Files changed (1) hide show

app.py +22 -5

app.py CHANGED Viewed

@@ -52,6 +52,9 @@ class Query(BaseModel):
 class DocumentID(BaseModel):
     doc_id: str
 @app.post("/search")
 async def get_articles(query: Query):
     XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
@@ -78,8 +81,8 @@ async def get_articles(query: Query):
         print(f"Error while downloading data : {str(e)}")
         return {"error": True, "message": str(e)}
-@app.post("/extract")
-async def extract_text_pdf(document: DocumentID):
     pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
     if pdf_req.status_code == 200:
         pdf_data = BytesIO(pdf_req.content)
@@ -121,7 +124,21 @@ async def extract_text_pdf(document: DocumentID):
         print("Status code: " + str(pdf_req.status_code))
         return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
-@app.post("/extract/random")
-async def extract_random_pdf(query: Query):
     pubs = await get_articles(query)
-    return await extract_text_pdf(random.choice(list(pubs["message"].keys())))

 class DocumentID(BaseModel):
     doc_id: str
+class URL(BaseModel):
+    url: str
 @app.post("/search")
 async def get_articles(query: Query):
     XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
         print(f"Error while downloading data : {str(e)}")
         return {"error": True, "message": str(e)}
+@app.post("/extract_pdf/arxiv_id")
+async def extract_arxiv_pdf(document: DocumentID):
     pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
     if pdf_req.status_code == 200:
         pdf_data = BytesIO(pdf_req.content)
         print("Status code: " + str(pdf_req.status_code))
         return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
+@app.post("/extract_pdf/arxiv_id/random")
+async def extract_random_arxiv_pdf(query: Query):
+    """Extract the text of one randomly chosen arXiv paper matching ``query``.
+
+    Runs the search, picks one of the returned document IDs at random and
+    delegates to ``extract_arxiv_pdf``. Search failures are returned unchanged.
+    """
     pubs = await get_articles(query)
+    # A failed search returns {"error": True, "message": <str>}; pass it through
+    # instead of calling .keys() on the error string.
+    if pubs.get("error"):
+        return pubs
+    doc_ids = list(pubs["message"].keys())
+    # random.choice raises IndexError on an empty sequence.
+    if not doc_ids:
+        return {"error": True, "message": "No articles found for this query"}
+    # extract_arxiv_pdf reads `document.doc_id`, so wrap the bare ID in the model.
+    return await extract_arxiv_pdf(DocumentID(doc_id=random.choice(doc_ids)))
+@app.post("/extract_pdf/url")
+async def extract_pdf(url: URL):
+    """Download the PDF at ``url.url`` and return its title and plain text.
+
+    Returns ``{"error": False, "title": ..., "text": ...}`` on HTTP 200, otherwise
+    ``{"error": True, "message": ...}`` carrying the HTTP status code.
+    """
+    # `url` is the request model, not a string: the address is its `url` field.
+    pdf_url = url.url
+    # NOTE(review): verify=False disables TLS certificate checks and the URL is
+    # caller-supplied; kept as-is to match the arXiv endpoint, but worth revisiting.
+    pdf_req = requests.get(pdf_url, verify=False)
+    if pdf_req.status_code == 200:
+        pdf_data = BytesIO(pdf_req.content)
+        doc = fitz.open(stream=pdf_data, filetype="pdf")
+        pdf_text = " ".join([page.get_text("text") for page in doc])
+        pdf_metadata = doc.metadata
+        # `or ""` guards against a title entry that is present but None.
+        return {"error": False, "title": (pdf_metadata.get("title") or "").strip(), "text": pdf_text}
+    else:
+        print("URL: " + pdf_url)
+        print("Status code: " + str(pdf_req.status_code))
+        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}