Spaces:
Running
Running
import streamlit as st | |
import fitz # PyMuPDF | |
from PIL import Image | |
import pytesseract | |
import numpy as np | |
from streamlit_drawable_canvas import st_canvas | |
import io | |
def pdf_page_to_image(doc, page_number=0, scale=1.0):
    """Rasterize a single page of an open PDF document as a grayscale image.

    Args:
        doc: Open PyMuPDF document; only ``load_page`` is called on it.
        page_number: Index of the page handed to ``doc.load_page``.
        scale: Zoom factor applied equally to the x and y axes.

    Returns:
        A PIL image in mode "L" holding the rendered page.
    """
    page = doc.load_page(page_number)
    zoom = fitz.Matrix(scale, scale)
    pixmap = page.get_pixmap(matrix=zoom)
    # NOTE(review): the sample buffer is interpreted as packed RGB, which
    # presumes the pixmap was rendered without an alpha channel.
    size = (pixmap.width, pixmap.height)
    rgb_image = Image.frombytes("RGB", size, pixmap.samples)
    return rgb_image.convert("L")
def extract_text_tesseract(image):
    """Run Tesseract OCR over an image and hand back the recognized text.

    Args:
        image: Image forwarded unchanged to ``pytesseract.image_to_string``.

    Returns:
        The result of ``pytesseract.image_to_string`` called with its
        default settings.
    """
    recognized = pytesseract.image_to_string(image)
    return recognized
def _crop_boxes(json_data, width, height):
    """Convert the rectangles drawn on the canvas into PIL crop boxes.

    Args:
        json_data: The canvas component's ``json_data`` payload. It can be
            ``None`` (or lack objects) when nothing has been drawn yet.
        width: Width in pixels of the image the boxes will be applied to.
        height: Height in pixels of that image.

    Returns:
        A list of ``(left, top, right, bottom)`` integer tuples, clamped to
        the image bounds. Rectangles with no area after clamping are dropped
        because there is nothing in them to OCR.
    """
    if not json_data:
        return []

    boxes = []
    for shape in json_data.get("objects") or []:
        left = max(0, int(shape["left"]))
        top = max(0, int(shape["top"]))
        right = min(width, int(shape["left"] + shape["width"]))
        bottom = min(height, int(shape["top"] + shape["height"]))
        if right > left and bottom > top:
            boxes.append((left, top, right, bottom))
    return boxes


def main():
    """Streamlit entry point: pick a PDF page, mark regions, OCR them.

    The sidebar holds the file uploader plus page and zoom controls. The
    selected page is rendered in grayscale, shown once as a plain image and
    once as the background of a drawable canvas in rectangle mode. Pressing
    the button runs Tesseract on every rectangle drawn on the canvas and
    writes the extracted text to the page.
    """
    st.sidebar.title("PDF Navigation")
    pdf_file = st.sidebar.file_uploader("Upload a PDF file", type=["pdf"])
    if not pdf_file:
        return

    doc = fitz.open(stream=pdf_file.getvalue(), filetype="pdf")
    try:
        total_pages = doc.page_count
        if total_pages < 1:
            st.warning("The uploaded PDF has no pages.")
            return

        if total_pages > 1:
            selected_page = (
                st.sidebar.slider("Select a Page", 1, total_pages, 1) - 1
            )
        else:
            # A one-page document leaves no range to choose from
            # (min == max), so the slider is skipped.
            selected_page = 0

        zoom_factor = st.sidebar.slider("Zoom Factor", 0.5, 3.0, 1.0, 0.1)
        img = pdf_page_to_image(
            doc, page_number=selected_page, scale=zoom_factor
        )
    finally:
        # The rendered page is a standalone PIL image, so the document can be
        # released right away, including when rendering raises.
        doc.close()

    with st.container():
        st.image(
            img,
            use_column_width=True,
            caption=f"Page {selected_page + 1}",
        )
        canvas_result = st_canvas(
            fill_color="rgba(255, 165, 0, 0.3)",
            stroke_width=0,
            stroke_color="#ffffff",
            background_image=img,
            update_streamlit=True,
            height=img.height,
            width=img.width,
            drawing_mode="rect",
            # The key changes with page and zoom so stale rectangles from a
            # different rendering are not carried over.
            key=f"canvas{selected_page}{zoom_factor}",
        )

    if st.button("Extract Text from Selected Region"):
        boxes = _crop_boxes(canvas_result.json_data, img.width, img.height)
        if not boxes:
            st.info(
                "Draw at least one rectangle on the page before extracting text."
            )
            return

        for index, box in enumerate(boxes):
            st.write(f"Extracted Text from selection {index}:")
            st.write(extract_text_tesseract(img.crop(box)))
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()