Spaces:

polygraf-ai
/

copyright_checker

Running

App Files Files Community

copyright_checker / pdf_supporter /demo.py

aliasgerovs

Updated

9c71743 6 months ago

raw

history blame

2.56 kB

	import streamlit as st
	import fitz # PyMuPDF
	from PIL import Image
	import pytesseract
	import numpy as np
	from streamlit_drawable_canvas import st_canvas
	import io

	def pdf_page_to_image(doc, page_number=0, scale=1.0):
	    """Render one page of an open document as a grayscale PIL image.

	    Args:
	        doc: Document object exposing ``load_page`` (opened with ``fitz``).
	        page_number: Index handed to ``doc.load_page``; defaults to 0.
	        scale: Factor used for both axes of the ``fitz.Matrix`` that
	            controls the render resolution.

	    Returns:
	        The rendered page as a PIL image converted to mode ``"L"``.
	    """
	    pixmap = doc.load_page(page_number).get_pixmap(
	        matrix=fitz.Matrix(scale, scale)
	    )
	    # The raw samples are decoded as packed RGB before the grayscale step.
	    size = (pixmap.width, pixmap.height)
	    rgb_image = Image.frombytes("RGB", size, pixmap.samples)
	    return rgb_image.convert("L")

	def extract_text_tesseract(image):
	    """Run Tesseract OCR on ``image`` and return the recognised text.

	    The image is passed straight to ``pytesseract.image_to_string`` with
	    that function's default settings.
	    """
	    recognised_text = pytesseract.image_to_string(image)
	    return recognised_text

	def _selection_boxes(json_data, image_width, image_height):
	    """Turn the canvas rectangles into crop boxes that fit inside the image.

	    Args:
	        json_data: The ``json_data`` value returned by ``st_canvas``. It may
	            be ``None`` when the canvas has not reported any state yet.
	        image_width: Width in pixels of the image that will be cropped.
	        image_height: Height in pixels of the image that will be cropped.

	    Returns:
	        A list of ``(left, top, right, bottom)`` integer tuples, one per
	        drawn object that still has a positive area after clamping.
	    """
	    if not json_data:
	        return []

	    boxes = []
	    for obj in json_data.get("objects") or []:
	        left = int(obj["left"])
	        top = int(obj["top"])
	        right = int(obj["left"] + obj["width"])
	        bottom = int(obj["top"] + obj["height"])

	        # Keep the box inside the page so the crop never includes padding
	        # from outside the image.
	        left, top = max(0, left), max(0, top)
	        right, bottom = min(image_width, right), min(image_height, bottom)

	        # A click without a drag yields an empty box; there is nothing to OCR.
	        if right > left and bottom > top:
	            boxes.append((left, top, right, bottom))
	    return boxes


	def main():
	    """Streamlit app: browse a PDF, draw rectangles on a page and OCR them.

	    The sidebar hosts the file uploader, the page selector and the zoom
	    slider. The chosen page is rendered to a grayscale image, shown once as
	    a plain image and once as the background of a drawable canvas. Pressing
	    the button runs Tesseract on every rectangle drawn on the canvas and
	    writes the extracted text to the page.
	    """
	    st.sidebar.title("PDF Navigation")
	    pdf_file = st.sidebar.file_uploader("Upload a PDF file", type=["pdf"])
	    if not pdf_file:
	        return

	    doc = fitz.open("pdf", pdf_file.getvalue())
	    try:
	        total_pages = doc.page_count
	        if total_pages < 1:
	            st.error("The uploaded PDF has no pages.")
	            return

	        # Streamlit rejects a slider whose minimum equals its maximum, so a
	        # single-page document skips the page selector altogether.
	        if total_pages > 1:
	            selected_page = (
	                st.sidebar.slider("Select a Page", 1, total_pages, 1) - 1
	            )
	        else:
	            selected_page = 0
	        zoom_factor = st.sidebar.slider("Zoom Factor", 0.5, 3.0, 1.0, 0.1)

	        img = pdf_page_to_image(
	            doc, page_number=selected_page, scale=zoom_factor
	        )
	        img_array = np.array(img)

	        # Container to add scrollbars
	        container = st.container()
	        with container:
	            st.image(
	                img_array,
	                use_column_width=True,
	                caption=f"Page {selected_page + 1}",
	            )

	        canvas_result = st_canvas(
	            fill_color="rgba(255, 165, 0, 0.3)",
	            stroke_width=0,
	            stroke_color="#ffffff",
	            background_image=Image.fromarray(img_array),
	            update_streamlit=True,
	            height=int(img.height),
	            width=int(img.width),
	            drawing_mode="rect",
	            # A distinct key per page/zoom gives each view its own canvas.
	            key="canvas" + str(selected_page) + str(zoom_factor),
	        )

	        if st.button("Extract Text from Selected Region"):
	            # json_data can be None before anything has been drawn; the
	            # helper treats that the same as "no rectangles".
	            boxes = _selection_boxes(
	                canvas_result.json_data, img.width, img.height
	            )
	            if not boxes:
	                st.warning(
	                    "No region selected. Draw a rectangle on the page first."
	                )

	            texts = [extract_text_tesseract(img.crop(box)) for box in boxes]
	            for index, text in enumerate(texts):
	                st.write(f"Extracted Text from selection {index}:")
	                st.write(text)
	    finally:
	        # Release the document even when rendering or OCR raises.
	        doc.close()

	# Start the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	main()