nicolasb92 committed on
Commit
6946525
·
1 Parent(s): 51b2216

PDF to images

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -0
  2. app.py +32 -16
  3. requirements.txt +3 -1
Dockerfile CHANGED
@@ -4,6 +4,7 @@ WORKDIR /code
4
 
5
  COPY ./requirements.txt /code/requirements.txt
6
 
 
7
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
8
 
9
  COPY . .
 
4
 
5
  COPY ./requirements.txt /code/requirements.txt
6
 
7
+ RUN apt-get update && apt-get install -y poppler-utils
8
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
9
 
10
  COPY . .
app.py CHANGED
@@ -1,4 +1,7 @@
1
  import base64
 
 
 
2
 
3
  from llama_index.core.schema import BaseNode
4
  from shiny import ui, render, App
@@ -37,16 +40,24 @@ data_dir = Path(__file__).parent / "data"
37
  docstore = SimpleDocumentStore.from_persist_path(str(data_dir / "storage_metadata" / "processed_docstore_storage.json"))
38
 
39
 
40
- def get_pdf_data_url(file_path):
41
- """Convert PDF file to data URL for embedding"""
42
  try:
43
- with open(file_path, "rb") as f:
44
- pdf_data = f.read()
45
- b64_data = base64.b64encode(pdf_data).decode()
46
- return f"data:application/pdf;base64,{b64_data}"
 
 
 
 
 
 
 
 
47
  except Exception as e:
48
- logger.error(f"Error reading PDF file: {e}")
49
- return None
50
 
51
 
52
  def get_str_structure(doc: BaseNode) -> str:
@@ -127,7 +138,15 @@ def server(input, output, session):
127
  logger.error(f"Document does not have a filename: {input.selected_doc_id()}")
128
  return ui.p("Error: Document does not have a filename")
129
  pdf_path = data_dir / "pdfs" / filename
130
- pdf_data_url = get_pdf_data_url(pdf_path) if pdf_path else None
 
 
 
 
 
 
 
 
131
 
132
  return ui.div(
133
  ui.h3(f"Display panel for: {doc.metadata.get('filename', 'Unknown')}"),
@@ -135,13 +154,10 @@ def server(input, output, session):
135
  ui.column(
136
  6, # Left column (PDF)
137
  ui.h4("PDF View"),
138
- ui.tags.iframe(
139
- src=pdf_data_url,
140
- style="width: 100%; height: 800px; border: 1px solid #ddd;",
141
- type="application/pdf",
142
- )
143
- if pdf_data_url
144
- else ui.p("No PDF available"),
145
  ),
146
  ui.column(
147
  6, # Right column (Markdown)
 
1
  import base64
2
+ from pdf2image import convert_from_path
3
+ from io import BytesIO
4
+ from PIL import Image
5
 
6
  from llama_index.core.schema import BaseNode
7
  from shiny import ui, render, App
 
40
  docstore = SimpleDocumentStore.from_persist_path(str(data_dir / "storage_metadata" / "processed_docstore_storage.json"))
41
 
42
 
43
+ def get_pdf_as_images(pdf_path):
44
+ """Convert PDF file to a list of base64 encoded images"""
45
  try:
46
+ # Convert PDF to list of images
47
+ images = convert_from_path(pdf_path)
48
+ image_urls = []
49
+
50
+ for img in images:
51
+ # Convert PIL image to base64
52
+ buffered = BytesIO()
53
+ img.save(buffered, format="PNG")
54
+ img_base64 = base64.b64encode(buffered.getvalue()).decode()
55
+ image_urls.append(f"data:image/png;base64,{img_base64}")
56
+
57
+ return image_urls
58
  except Exception as e:
59
+ logger.error(f"Error converting PDF to images: {e}")
60
+ return []
61
 
62
 
63
  def get_str_structure(doc: BaseNode) -> str:
 
138
  logger.error(f"Document does not have a filename: {input.selected_doc_id()}")
139
  return ui.p("Error: Document does not have a filename")
140
  pdf_path = data_dir / "pdfs" / filename
141
+ # pdf_data_url = get_pdf_data_url(pdf_path) if pdf_path else None
142
+ image_urls = get_pdf_as_images(pdf_path) if pdf_path.exists() else []
143
+ image_elements = [
144
+ ui.tags.img(
145
+ src=img_url,
146
+ style="width: 100%; margin-bottom: 10px; border: 1px solid #ddd;"
147
+ )
148
+ for img_url in image_urls
149
+ ]
150
 
151
  return ui.div(
152
  ui.h3(f"Display panel for: {doc.metadata.get('filename', 'Unknown')}"),
 
154
  ui.column(
155
  6, # Left column (PDF)
156
  ui.h4("PDF View"),
157
+ ui.div(
158
+ ui.div(image_elements) if image_elements else ui.p("No PDF available"),
159
+ style="height: 800px; overflow-y: auto;"
160
+ ),
 
 
 
161
  ),
162
  ui.column(
163
  6, # Right column (Markdown)
requirements.txt CHANGED
@@ -1,2 +1,4 @@
1
  shiny
2
- llama-index-core
 
 
 
1
  shiny
2
+ llama-index-core
3
+ pdf2image
4
+ Pillow