Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
from fastapi import FastAPI
|
2 |
import os
|
3 |
-
import
|
4 |
from pptx import Presentation # PowerPoint
|
5 |
from sentence_transformers import SentenceTransformer # Text embeddings
|
6 |
import torch
|
@@ -29,7 +29,7 @@ os.makedirs(IMAGE_FOLDER, exist_ok=True)
|
|
29 |
|
30 |
# Extract text from PDF
|
31 |
def extract_text_from_pdf(pdf_path):
|
32 |
-
return " ".join([page.get_text() for page in
|
33 |
|
34 |
# Extract text from PowerPoint
|
35 |
def extract_text_from_pptx(pptx_path):
|
@@ -38,7 +38,7 @@ def extract_text_from_pptx(pptx_path):
|
|
38 |
# Extract images from PDF
|
39 |
def extract_images_from_pdf(pdf_path):
|
40 |
images = []
|
41 |
-
doc =
|
42 |
for i, page in enumerate(doc):
|
43 |
for img_index, img in enumerate(page.get_images(full=True)):
|
44 |
xref = img[0]
|
|
|
1 |
from fastapi import FastAPI
|
2 |
import os
|
3 |
+
import pymupdf
|
4 |
from pptx import Presentation # PowerPoint
|
5 |
from sentence_transformers import SentenceTransformer # Text embeddings
|
6 |
import torch
|
|
|
29 |
|
30 |
# Extract text from PDF
|
31 |
def extract_text_from_pdf(pdf_path):
|
32 |
+
return " ".join([page.get_text() for page in pymupdf.open(pdf_path)]).strip()
|
33 |
|
34 |
# Extract text from PowerPoint
|
35 |
def extract_text_from_pptx(pptx_path):
|
|
|
38 |
# Extract images from PDF
|
39 |
def extract_images_from_pdf(pdf_path):
|
40 |
images = []
|
41 |
+
doc = pymupdf.open(pdf_path)
|
42 |
for i, page in enumerate(doc):
|
43 |
for img_index, img in enumerate(page.get_images(full=True)):
|
44 |
xref = img[0]
|