Redmind committed on
Commit
6bbcff4
·
verified ·
1 Parent(s): a244d5b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -1,6 +1,6 @@
1
  from fastapi import FastAPI
2
  import os
3
- import fitz # pymupdf
4
  from pptx import Presentation # PowerPoint
5
  from sentence_transformers import SentenceTransformer # Text embeddings
6
  import torch
@@ -29,7 +29,7 @@ os.makedirs(IMAGE_FOLDER, exist_ok=True)
29
 
30
  # Extract text from PDF
31
  def extract_text_from_pdf(pdf_path):
32
- return " ".join([page.get_text() for page in fitz.open(pdf_path)]).strip()
33
 
34
  # Extract text from PowerPoint
35
  def extract_text_from_pptx(pptx_path):
@@ -38,7 +38,7 @@ def extract_text_from_pptx(pptx_path):
38
  # Extract images from PDF
39
  def extract_images_from_pdf(pdf_path):
40
  images = []
41
- doc = fitz.open(pdf_path)
42
  for i, page in enumerate(doc):
43
  for img_index, img in enumerate(page.get_images(full=True)):
44
  xref = img[0]
 
1
  from fastapi import FastAPI
2
  import os
3
+ import pymupdf
4
  from pptx import Presentation # PowerPoint
5
  from sentence_transformers import SentenceTransformer # Text embeddings
6
  import torch
 
29
 
30
  # Extract text from PDF
31
  def extract_text_from_pdf(pdf_path):
32
+ return " ".join([page.get_text() for page in pymupdf.open(pdf_path)]).strip()
33
 
34
  # Extract text from PowerPoint
35
  def extract_text_from_pptx(pptx_path):
 
38
  # Extract images from PDF
39
  def extract_images_from_pdf(pdf_path):
40
  images = []
41
+ doc = pymupdf.open(pdf_path)
42
  for i, page in enumerate(doc):
43
  for img_index, img in enumerate(page.get_images(full=True)):
44
  xref = img[0]