Spaces:
Runtime error
Runtime error
File size: 2,293 Bytes
b0a8ef0 2ab7abe b0a8ef0 fe3c6cf e289392 b0a8ef0 2a4396e aa708c8 2ab7abe b0a8ef0 fe3c6cf b0a8ef0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import fitz
import io
import base64
from PIL import Image
import gradio as gr
import cv2
import tempfile
import os
def pdf_to_img(pdf_path):
    """Extract every embedded image of a PDF and save each one to disk.

    Each image is decoded with Pillow and written to the current working
    directory as ``image<N>.<ext>``, where ``N`` counts up from 1 across the
    whole document and ``<ext>`` is the extension reported by PyMuPDF for
    that image.

    NOTE: numbering restarts at 1 on every call, so files written by an
    earlier call are overwritten when the names coincide.

    Args:
        pdf_path: Path of the PDF file to read. A falsy value (e.g. ``None``
            when the callback fires before a file was uploaded) yields an
            empty result.

    Returns:
        List of the file names that were written, in extraction order.
    """
    if not pdf_path:
        return []

    img_list = []
    # The context manager closes the document even if an image fails to decode.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            for image_info in page.get_images():
                # The first item of each entry is the image's xref number.
                base_img = pdf_document.extract_image(image_info[0])
                img = Image.open(io.BytesIO(base_img["image"]))
                extension = base_img["ext"]
                file_name = f"image{len(img_list) + 1}.{extension}"
                # Saving by path lets Pillow open *and close* the file itself;
                # the format is still chosen from the file extension.
                img.save(file_name)
                img_list.append(file_name)
    return img_list
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Path of the PDF file to read. A falsy value (e.g. ``None``
            when the callback fires before a file was uploaded) yields an
            empty string.

    Returns:
        A single string holding the text of all pages, with no extra
        separator inserted between pages.
    """
    if not pdf_file:
        return ""

    # The context manager closes the document even if text extraction raises.
    with fitz.open(pdf_file) as doc:
        return "".join(page.get_text() for page in doc)
# Heading text, rendered as a centred <h1> at the top of the page.
title = "Extract Image and Text"

# NOTE(review): the row/column nesting below was reconstructed from a copy of
# this file whose indentation had been flattened -- confirm the layout against
# the running app.
with gr.Blocks(theme=gr.themes.Glass(primary_hue=gr.themes.colors.slate)) as demo:
    gr.Markdown(f'<h1 style="text-align: center;">{title}</h1>')

    # Top area: upload controls next to the gallery of extracted images.
    with gr.Row():
        with gr.Row():
            with gr.Column():
                file_input = gr.File(type="filepath", label="Upload .pdf file")
                upload_button = gr.Button(value="Show Images")
            img_gallery = gr.Gallery(
                label="Generated images",
                show_label=True,
                elem_id="gallery",
                object_fit="contain",
                height="auto",
                allow_preview=True,
            )

    # Bottom area: text extracted from the PDF.
    with gr.Row():
        with gr.Column():
            output_text = gr.Textbox(label="Output", lines=4, autoscroll=False)

    # Two listeners on the same button: one fills the gallery, the other the
    # text box, both from the uploaded file.
    upload_button.click(pdf_to_img, inputs=file_input, outputs=[img_gallery])
    upload_button.click(extract_text_from_pdf, inputs=file_input, outputs=[output_text])

demo.launch()