Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -13,6 +13,7 @@ from io import BytesIO
|
|
13 |
from fastapi import FastAPI, File, UploadFile
|
14 |
from fastapi.responses import JSONResponse
|
15 |
import uvicorn
|
|
|
16 |
|
17 |
# Initialize FastAPI app
|
18 |
app = FastAPI()
|
@@ -235,7 +236,7 @@ async def process_document(
|
|
235 |
content = await file.read()
|
236 |
buffer.write(content)
|
237 |
|
238 |
-
#
|
239 |
md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
|
240 |
temp_path,
|
241 |
end_pages=end_pages,
|
@@ -245,18 +246,25 @@ async def process_document(
|
|
245 |
table_enable=table_enable,
|
246 |
language=language
|
247 |
)
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
252 |
|
253 |
# Clean up
|
254 |
os.remove(temp_path)
|
255 |
|
256 |
return JSONResponse({
|
257 |
-
"
|
258 |
-
"
|
259 |
-
"zip_file_base64": zip_content
|
260 |
})
|
261 |
|
262 |
except Exception as e:
|
|
|
13 |
from fastapi import FastAPI, File, UploadFile
|
14 |
from fastapi.responses import JSONResponse
|
15 |
import uvicorn
|
16 |
+
from PyPDF2 import PdfReader
|
17 |
|
18 |
# Initialize FastAPI app
|
19 |
app = FastAPI()
|
|
|
236 |
content = await file.read()
|
237 |
buffer.write(content)
|
238 |
|
239 |
+
# Source 1: Using magic-pdf processing
|
240 |
md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
|
241 |
temp_path,
|
242 |
end_pages=end_pages,
|
|
|
246 |
table_enable=table_enable,
|
247 |
language=language
|
248 |
)
|
249 |
+
source_1 = txt_content
|
250 |
+
|
251 |
+
# Source 2: Using PyPDF2
|
252 |
+
def extract_text_from_pdf(doc_path):
    """Extract plain text from a PDF with PyPDF2.

    Reads the pages selected by ``reader.pages[:end_pages]`` (``end_pages``
    is taken from the enclosing scope), skips pages whose extracted text
    is empty or ``None``, and joins the rest with newlines.

    Args:
        doc_path: Path of the PDF, passed straight to ``PdfReader``.

    Returns:
        The joined page text. If any exception is raised while reading or
        extracting, the exception message (``str(e)``) is returned instead
        of text, so this helper never raises to its caller.
    """
    try:
        reader = PdfReader(doc_path)
        # Extract each page exactly once; the previous form called
        # extract_text() twice per page (once to filter, once for the
        # value), doubling the most expensive step.
        page_texts = (page.extract_text() for page in reader.pages[:end_pages])
        return "\n".join(text for text in page_texts if text)
    except Exception as e:
        # Deliberate best-effort: report the failure as the text itself
        # rather than failing the whole request.
        return str(e)
|
259 |
+
|
260 |
+
source_2 = extract_text_from_pdf(temp_path)
|
261 |
|
262 |
# Clean up
|
263 |
os.remove(temp_path)
|
264 |
|
265 |
return JSONResponse({
|
266 |
+
"source_1": source_1,
|
267 |
+
"source_2": source_2
|
|
|
268 |
})
|
269 |
|
270 |
except Exception as e:
|