Omar ID EL MOUMEN committed on
Commit
577d055
·
1 Parent(s): 959f2b1

Add POST extract PDF

Browse files
Files changed (1) hide show
  1. app.py +22 -5
app.py CHANGED
@@ -52,6 +52,9 @@ class Query(BaseModel):
52
  class DocumentID(BaseModel):
53
  doc_id: str
54
 
 
 
 
55
  @app.post("/search")
56
  async def get_articles(query: Query):
57
  XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
@@ -78,8 +81,8 @@ async def get_articles(query: Query):
78
  print(f"Error while downloading data : {str(e)}")
79
  return {"error": True, "message": str(e)}
80
 
81
- @app.post("/extract")
82
- async def extract_text_pdf(document: DocumentID):
83
  pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
84
  if pdf_req.status_code == 200:
85
  pdf_data = BytesIO(pdf_req.content)
@@ -121,7 +124,21 @@ async def extract_text_pdf(document: DocumentID):
121
  print("Status code: " + str(pdf_req.status_code))
122
  return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
123
 
124
- @app.post("/extract/random")
125
- async def extract_random_pdf(query: Query):
126
  pubs = await get_articles(query)
127
- return await extract_text_pdf(random.choice(list(pubs["message"].keys())))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  class DocumentID(BaseModel):
53
  doc_id: str
54
 
55
+ class URL(BaseModel):
56
+ url: str
57
+
58
  @app.post("/search")
59
  async def get_articles(query: Query):
60
  XML_NAMESPACE = "{http://www.w3.org/2005/Atom}"
 
81
  print(f"Error while downloading data : {str(e)}")
82
  return {"error": True, "message": str(e)}
83
 
84
+ @app.post("/extract_pdf/arxiv_id")
85
+ async def extract_arxiv_pdf(document: DocumentID):
86
  pdf_req = requests.get(f"http://arxiv.org/pdf/{document.doc_id}", verify=False)
87
  if pdf_req.status_code == 200:
88
  pdf_data = BytesIO(pdf_req.content)
 
124
  print("Status code: " + str(pdf_req.status_code))
125
  return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}
126
 
127
@app.post("/extract_pdf/arxiv_id/random")
async def extract_random_arxiv_pdf(query: Query):
    """Run a search and extract the PDF of one randomly chosen result.

    Args:
        query: Search request forwarded unchanged to ``get_articles``.

    Returns:
        The payload produced by ``extract_arxiv_pdf`` for the chosen
        document, or an ``{"error": True, "message": ...}`` payload when the
        search failed or returned nothing.
    """
    pubs = await get_articles(query)

    # get_articles reports failures as {"error": True, "message": <str>};
    # hand that back as-is rather than calling .keys() on the error string.
    if pubs.get("error"):
        return pubs

    # presumably the keys of pubs["message"] are arXiv identifiers -- verify
    # against get_articles.
    doc_ids = list(pubs["message"])
    if not doc_ids:
        # random.choice() raises IndexError on an empty sequence.
        return {"error": True, "message": "No articles found for this query"}

    # extract_arxiv_pdf reads ``document.doc_id``, so the bare key has to be
    # wrapped in the request model before the direct call.
    return await extract_arxiv_pdf(DocumentID(doc_id=random.choice(doc_ids)))
131
+
132
@app.post("/extract_pdf/url")
async def extract_pdf(url: URL):
    """Download the PDF at ``url.url`` and return its title and plain text.

    Args:
        url: Request body; its ``url`` field is the address to download.

    Returns:
        On HTTP 200, ``{"error": False, "title": <str>, "text": <str>}`` where
        ``text`` is the text of every page joined with single spaces.
        On any other status, ``{"error": True, "message": <str>}``.
    """
    # ``url`` is the request model, not a string: both requests.get() and the
    # log line below need the wrapped field.
    target = url.url

    # NOTE(review): the address is caller-supplied and fetched server-side
    # with certificate verification disabled and no timeout -- consider an
    # allow-list, verify=True and a timeout before exposing this publicly.
    pdf_req = requests.get(target, verify=False)

    if pdf_req.status_code != 200:
        print("URL: " + target)
        print("Status code: " + str(pdf_req.status_code))
        return {"error": True, "message": "Error while downloading PDF: HTTP/" + str(pdf_req.status_code)}

    pdf_data = BytesIO(pdf_req.content)
    # The context manager closes the document once the text has been read.
    with fitz.open(stream=pdf_data, filetype="pdf") as doc:
        pdf_text = " ".join(page.get_text("text") for page in doc)
        pdf_metadata = doc.metadata or {}

    # ``or ""`` covers a title that is present in the metadata but unset.
    title = (pdf_metadata.get("title") or "").strip()
    return {"error": False, "title": title, "text": pdf_text}