Spaces:
Sleeping
Sleeping
Commit
·
6946525
1
Parent(s):
51b2216
PDF to images
Browse files
- Dockerfile +1 -0
- app.py +32 -16
- requirements.txt +3 -1
Dockerfile
CHANGED
@@ -4,6 +4,7 @@ WORKDIR /code
|
|
4 |
|
5 |
COPY ./requirements.txt /code/requirements.txt
|
6 |
|
|
|
7 |
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
8 |
|
9 |
COPY . .
|
|
|
4 |
|
5 |
COPY ./requirements.txt /code/requirements.txt
|
6 |
|
7 |
+
# poppler-utils supplies the PDF rendering binaries that pdf2image shells out to.
# The apt package index is removed in the same layer so it does not bloat the image.
RUN apt-get update && apt-get install -y poppler-utils && rm -rf /var/lib/apt/lists/*
|
8 |
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
9 |
|
10 |
COPY . .
|
app.py
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
import base64
|
|
|
|
|
|
|
2 |
|
3 |
from llama_index.core.schema import BaseNode
|
4 |
from shiny import ui, render, App
|
@@ -37,16 +40,24 @@ data_dir = Path(__file__).parent / "data"
|
|
37 |
docstore = SimpleDocumentStore.from_persist_path(str(data_dir / "storage_metadata" / "processed_docstore_storage.json"))
|
38 |
|
39 |
|
40 |
-
def
|
41 |
-
"""Convert PDF file to
|
42 |
try:
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
except Exception as e:
|
48 |
-
logger.error(f"Error
|
49 |
-
return
|
50 |
|
51 |
|
52 |
def get_str_structure(doc: BaseNode) -> str:
|
@@ -127,7 +138,15 @@ def server(input, output, session):
|
|
127 |
logger.error(f"Document does not have a filename: {input.selected_doc_id()}")
|
128 |
return ui.p("Error: Document does not have a filename")
|
129 |
pdf_path = data_dir / "pdfs" / filename
|
130 |
-
pdf_data_url = get_pdf_data_url(pdf_path) if pdf_path else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
return ui.div(
|
133 |
ui.h3(f"Display panel for: {doc.metadata.get('filename', 'Unknown')}"),
|
@@ -135,13 +154,10 @@ def server(input, output, session):
|
|
135 |
ui.column(
|
136 |
6, # Left column (PDF)
|
137 |
ui.h4("PDF View"),
|
138 |
-
ui.
|
139 |
-
|
140 |
-
style="
|
141 |
-
|
142 |
-
)
|
143 |
-
if pdf_data_url
|
144 |
-
else ui.p("No PDF available"),
|
145 |
),
|
146 |
ui.column(
|
147 |
6, # Right column (Markdown)
|
|
|
1 |
import base64
|
2 |
+
from pdf2image import convert_from_path
|
3 |
+
from io import BytesIO
|
4 |
+
from PIL import Image
|
5 |
|
6 |
from llama_index.core.schema import BaseNode
|
7 |
from shiny import ui, render, App
|
|
|
40 |
docstore = SimpleDocumentStore.from_persist_path(str(data_dir / "storage_metadata" / "processed_docstore_storage.json"))
|
41 |
|
42 |
|
43 |
+
def get_pdf_as_images(pdf_path, dpi=200):
    """Render every page of a PDF as a base64-encoded PNG data URL.

    Args:
        pdf_path: Location of the PDF file to render (``str`` or
            ``pathlib.Path``).
        dpi: Rendering resolution forwarded to ``convert_from_path``. The
            default of 200 is pdf2image's own default, so callers that do
            not pass it get the same output as before; a lower value
            reduces the size of the returned data URLs.

    Returns:
        A list of ``data:image/png;base64,...`` strings, one per page, in
        page order. An empty list is returned when the conversion fails;
        the failure is logged rather than raised.
    """
    try:
        # Convert PDF to a list of PIL images, one per page
        pages = convert_from_path(pdf_path, dpi=dpi)
        image_urls = []

        for page in pages:
            # Encode the page as PNG in memory; the context manager releases
            # the buffer as soon as the bytes have been base64-encoded.
            with BytesIO() as buffered:
                page.save(buffered, format="PNG")
                img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_urls.append(f"data:image/png;base64,{img_base64}")

        return image_urls
    except Exception as e:
        # Best-effort boundary: report the failure with its traceback and
        # hand back an empty list instead of propagating the error.
        logger.exception(f"Error converting PDF to images: {e}")
        return []
|
61 |
|
62 |
|
63 |
def get_str_structure(doc: BaseNode) -> str:
|
|
|
138 |
logger.error(f"Document does not have a filename: {input.selected_doc_id()}")
|
139 |
return ui.p("Error: Document does not have a filename")
|
140 |
pdf_path = data_dir / "pdfs" / filename
|
141 |
+
# pdf_data_url = get_pdf_data_url(pdf_path) if pdf_path else None
|
142 |
+
image_urls = get_pdf_as_images(pdf_path) if pdf_path.exists() else []
|
143 |
+
image_elements = [
|
144 |
+
ui.tags.img(
|
145 |
+
src=img_url,
|
146 |
+
style="width: 100%; margin-bottom: 10px; border: 1px solid #ddd;"
|
147 |
+
)
|
148 |
+
for img_url in image_urls
|
149 |
+
]
|
150 |
|
151 |
return ui.div(
|
152 |
ui.h3(f"Display panel for: {doc.metadata.get('filename', 'Unknown')}"),
|
|
|
154 |
ui.column(
|
155 |
6, # Left column (PDF)
|
156 |
ui.h4("PDF View"),
|
157 |
+
ui.div(
|
158 |
+
ui.div(image_elements) if image_elements else ui.p("No PDF available"),
|
159 |
+
style="height: 800px; overflow-y: auto;"
|
160 |
+
),
|
|
|
|
|
|
|
161 |
),
|
162 |
ui.column(
|
163 |
6, # Right column (Markdown)
|
requirements.txt
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
shiny
|
2 |
-
llama-index-core
|
|
|
|
|
|
1 |
shiny
|
2 |
+
llama-index-core
|
3 |
+
pdf2image
|
4 |
+
Pillow
|