dmitrynovikov2121 committed on
Commit
4bb1eb6
·
verified ·
1 Parent(s): 38b4e43

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -8
app.py CHANGED
@@ -13,6 +13,7 @@ from io import BytesIO
13
  from fastapi import FastAPI, File, UploadFile
14
  from fastapi.responses import JSONResponse
15
  import uvicorn
 
16
 
17
  # Initialize FastAPI app
18
  app = FastAPI()
@@ -235,7 +236,7 @@ async def process_document(
235
  content = await file.read()
236
  buffer.write(content)
237
 
238
- # Process file
239
  md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
240
  temp_path,
241
  end_pages=end_pages,
@@ -245,18 +246,25 @@ async def process_document(
245
  table_enable=table_enable,
246
  language=language
247
  )
248
-
249
- # Read the zip file as base64
250
- with open(archive_zip_path, "rb") as zip_file:
251
- zip_content = base64.b64encode(zip_file.read()).decode()
 
 
 
 
 
 
 
 
252
 
253
  # Clean up
254
  os.remove(temp_path)
255
 
256
  return JSONResponse({
257
- "markdown_content": md_content,
258
- "text_content": txt_content,
259
- "zip_file_base64": zip_content
260
  })
261
 
262
  except Exception as e:
 
13
  from fastapi import FastAPI, File, UploadFile
14
  from fastapi.responses import JSONResponse
15
  import uvicorn
16
+ from PyPDF2 import PdfReader
17
 
18
  # Initialize FastAPI app
19
  app = FastAPI()
 
236
  content = await file.read()
237
  buffer.write(content)
238
 
239
+ # Source 1: Using magic-pdf processing
240
  md_content, txt_content, archive_zip_path, new_pdf_path = to_markdown(
241
  temp_path,
242
  end_pages=end_pages,
 
246
  table_enable=table_enable,
247
  language=language
248
  )
249
+ source_1 = txt_content
250
+
251
+ # Source 2: Using PyPDF2
252
+ def extract_text_from_pdf(doc_path):
253
+ try:
254
+ reader = PdfReader(doc_path)
255
+ text = "\n".join(page.extract_text() for page in reader.pages[:end_pages] if page.extract_text())
256
+ return text
257
+ except Exception as e:
258
+ return str(e)
259
+
260
+ source_2 = extract_text_from_pdf(temp_path)
261
 
262
  # Clean up
263
  os.remove(temp_path)
264
 
265
  return JSONResponse({
266
+ "source_1": source_1,
267
+ "source_2": source_2
 
268
  })
269
 
270
  except Exception as e: