akshayp committed on
Commit
eff991c
·
1 Parent(s): 02c8ce6

Add application file

Browse files
Files changed (3) hide show
  1. Dockerfile +17 -0
  2. app.py +13 -0
  3. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.11

# System packages must be installed while the build still runs as root:
# once `USER user` is in effect, apt cannot write to the package database.
# poppler-utils / tesseract-ocr are presumably the native binaries behind
# pdf2image (requirements.txt) and the Tesseract OCR config used in app.py.
RUN apt-get update \
    && apt-get install -y poppler-utils tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

# Drop privileges: everything below runs as an unprivileged user (uid 1000).
RUN useradd -m -u 1000 user
USER user
# `pip install` as a non-root user puts console scripts (e.g. uvicorn) here.
ENV PATH="/home/user/.local/bin:$PATH"

WORKDIR /app

# Copy requirements first so the dependency layer is cached across code edits.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import logging
from urllib.parse import urlsplit

from fastapi import FastAPI, Query, HTTPException
from extractous import Extractor, TesseractOcrConfig

logger = logging.getLogger(__name__)

app = FastAPI()


def _has_host(link: str) -> bool:
    """Return True when *link* parses as a URL that names a host."""
    try:
        return bool(urlsplit(link).hostname)
    except ValueError:
        # urlsplit rejects malformed authorities such as "http://[bad".
        return False


@app.get("/")
def accepts_pdf_link(link: str = Query(..., description="The URL to pdf file")):
    """Extract the text of the document at *link* and return it as JSON.

    Args:
        link: Absolute ``http://`` or ``https://`` URL of the file to process.

    Returns:
        A dict with ``received_link`` (the URL as given) and ``content``
        (the string produced by the extractor).

    Raises:
        HTTPException: 400 when *link* is not an http(s) URL with a host;
            502 when fetching or extracting the document fails.
    """
    # The prefix test alone would accept host-less input such as "http://".
    if not link.startswith(("http://", "https://")) or not _has_host(link):
        raise HTTPException(status_code=400, detail="Invalid URL format")

    # NOTE(review): this makes the server fetch a caller-supplied URL, so it
    # can be pointed at internal addresses (SSRF). Consider an allow-list or
    # rejecting private/loopback targets before exposing this publicly.
    extractor = (
        Extractor()
        .set_ocr_config(TesseractOcrConfig())
        .set_xml_output(False)
    )

    try:
        content, _metadata = extractor.extract_url_to_string(link)
    except Exception as exc:
        # Endpoint boundary: the concrete exception types raised by the
        # extractor are not visible here, so log the failure and return a
        # clean gateway error instead of an unhandled 500.
        logger.exception("Extraction failed for %s", link)
        raise HTTPException(
            status_code=502,
            detail="Could not fetch or extract text from the given URL",
        ) from exc

    return {"received_link": link, "content": content}
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ google-genai
4
+ vecs
5
+ extractous
6
+ beautifulsoup4
7
+ markdownify
8
+ pdf2image
9
+ firebase-admin