louiecerv committed on
Commit
4a47ee5
·
1 Parent(s): 59e2122

extracted the images from the PDF

Browse files
Files changed (1) hide show
  1. app.py +22 -3
app.py CHANGED
@@ -7,6 +7,7 @@ from reportlab.lib.pagesizes import letter
7
  from io import BytesIO
8
  from PyPDF2 import PdfReader
9
  from PIL import Image
 
10
 
11
  # OpenAI API Key
12
  api_key = os.getenv("OPENAI_API_KEY") # Ensure this environment variable is set
@@ -17,6 +18,15 @@ headers = {
17
  }
18
 
19
  def extract_text_and_images_from_pdf(pdf_file):
 
 
 
 
 
 
 
 
 
20
  text_content = ""
21
  images = []
22
 
@@ -28,9 +38,18 @@ def extract_text_and_images_from_pdf(pdf_file):
28
  for page in pdf_reader.pages:
29
  text_content += page.extract_text()
30
 
31
- # Extract images (This part requires creating images using ReportLab for demonstration)
32
- # In a real scenario, extracting images from PDF is more complex and usually done with specialized libraries.
33
- # Add image extraction logic if needed
 
 
 
 
 
 
 
 
 
34
 
35
  return text_content, images
36
 
 
7
  from io import BytesIO
8
  from PyPDF2 import PdfReader
9
  from PIL import Image
10
+ import fitz # PyMuPDF
11
 
12
  # OpenAI API Key
13
  api_key = os.getenv("OPENAI_API_KEY") # Ensure this environment variable is set
 
18
  }
19
 
20
  def extract_text_and_images_from_pdf(pdf_file):
21
+ """
22
+ Extracts text and images from a PDF file.
23
+
24
+ Args:
25
+ pdf_file (UploadedFile): The uploaded PDF file.
26
+
27
+ Returns:
28
+ tuple: A tuple containing the extracted text and images.
29
+ """
30
  text_content = ""
31
  images = []
32
 
 
38
  for page in pdf_reader.pages:
39
  text_content += page.extract_text()
40
 
41
+ # Extract images using PyMuPDF
42
+ doc = fitz.open(stream=pdf_stream)
43
+ for page_index in range(len(doc)):
44
+ page = doc.load_page(page_index)
45
+ image_list = page.get_images()
46
+ for img_index, img in enumerate(image_list):
47
+ xref = img[0]
48
+ base_image = doc.extract_image(xref)
49
+ image_bytes = base_image["image"]
50
+ # Convert image bytes to a PIL Image object
51
+ image = Image.open(BytesIO(image_bytes))
52
+ images.append(image)
53
 
54
  return text_content, images
55