# video-to-pdf — app.py
# Author: zjrwtxtechstudio (commit 2a08f3d, verified)
import os
import time
import cv2
import imutils
import shutil
import img2pdf
import glob
from skimage.metrics import structural_similarity
import gradio as gr
import tempfile
import whisper
from moviepy.editor import VideoFileClip
from PIL import Image, ImageDraw, ImageFont
############# Define constants
OUTPUT_SLIDES_DIR = "./output"   # root folder for per-video screenshot folders and output PDFs (plain string; the original used a pointless f-string)
FRAME_RATE = 3                   # no. of frames per second that needs to be processed; fewer = faster
WARMUP = FRAME_RATE              # initial number of frames to be skipped before capturing
FGBG_HISTORY = FRAME_RATE * 15   # no. of frames kept in the background model
VAR_THRESHOLD = 16               # threshold on the squared Mahalanobis distance between pixel and model to decide whether a pixel is well described by the background model
DETECT_SHADOWS = False           # if True, the MOG2 algorithm will detect shadows and mark them
MIN_PERCENT = 0.1                # min % of diff between foreground and background to detect if motion has stopped
MAX_PERCENT = 3                  # max % of diff between foreground and background to detect if frame is still in motion
SSIM_THRESHOLD = 0.9             # SSIM threshold between two consecutive saved screenshots
def get_frames(video_path):
    """Yield ``(frame_count, frame_time, frame)`` tuples sampled from the video.

    Frames are sampled FRAME_RATE times per video-second by seeking with
    CAP_PROP_POS_MSEC instead of decoding every frame, which is what makes
    the skipping fast. (Fixes: removed an unused ``total_frames`` read and
    the "fucntion" docstring typo of the original.)

    Raises:
        Exception: if the video file cannot be opened.
    """
    # Open a pointer to the video file.
    vs = cv2.VideoCapture(video_path)
    if not vs.isOpened():
        raise Exception(f'unable to open file {video_path}')

    frame_time = 0
    frame_count = 0
    # Loop until read() yields no frame — the end of the video file.
    while True:
        vs.set(cv2.CAP_PROP_POS_MSEC, frame_time * 1000)  # seek to timestamp
        frame_time += 1 / FRAME_RATE
        (_, frame) = vs.read()
        if frame is None:
            break
        frame_count += 1
        yield frame_count, frame_time, frame
    vs.release()
def detect_unique_screenshots(video_path, output_folder_screenshot_path, progress=gr.Progress()):
    '''Extract unique slide screenshots from the video.

    Uses MOG2 background subtraction to detect when on-screen motion has
    stopped (the slide is stable), then saves the frame only if it differs
    enough from the previously saved screenshot (SSIM below SSIM_THRESHOLD).

    Returns the list of saved screenshot file paths.

    Fixes over the original: progress values are clamped so many screenshots
    cannot push the bar past its 0.7–0.8 budget, and a zero frame count no
    longer causes a ZeroDivisionError.
    '''
    fgbg = cv2.createBackgroundSubtractorMOG2(
        history=FGBG_HISTORY, varThreshold=VAR_THRESHOLD, detectShadows=DETECT_SHADOWS)
    captured = False
    start_time = time.time()
    (W, H) = (None, None)

    # Total decoded frames, used only to scale the progress bar.
    # NOTE(review): get_frames samples by time, so frame_count stays well
    # below total_frames and this estimate is conservative, never overshooting.
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.release()

    screenshot_count = 0
    last_screenshot = None
    saved_files = []

    progress(0, desc="初始化视频处理...")

    for frame_count, frame_time, frame in get_frames(video_path):
        # Clamp to the 0.7 budget for this phase; guard against total_frames == 0.
        frac = (frame_count / total_frames) if total_frames > 0 else 0.0
        progress(min(frac * 0.7, 0.7), desc=f"处理视频帧 {frame_count}/{total_frames}")

        orig = frame.copy()
        frame = imutils.resize(frame, width=600)
        mask = fgbg.apply(frame)

        if W is None or H is None:
            (H, W) = mask.shape[:2]

        # Percentage of foreground (changed) pixels in the frame.
        p_diff = (cv2.countNonZero(mask) / float(W * H)) * 100

        if p_diff < MIN_PERCENT and not captured and frame_count > WARMUP:
            captured = True
            # Filename encodes the screenshot index and the minute mark, e.g. 000_1.23.png.
            filename = f"{screenshot_count:03}_{round(frame_time/60, 2)}.png"
            path = os.path.join(output_folder_screenshot_path, filename)

            image_ssim = 0.0
            if last_screenshot is not None:
                image_ssim = structural_similarity(last_screenshot, orig, channel_axis=2, data_range=255)

            # Only save when the frame is sufficiently different from the last saved one.
            if image_ssim < SSIM_THRESHOLD:
                try:
                    # Clamp so many screenshots cannot push the bar past this phase.
                    progress(min(0.7 + (screenshot_count * 0.1), 0.79),
                             desc=f"保存截图 {screenshot_count + 1}")
                    print("saving {}".format(path))
                    cv2.imwrite(str(path), orig)
                    last_screenshot = orig
                    saved_files.append(path)
                    screenshot_count += 1
                except Exception as e:
                    # Best-effort: a single failed write should not abort the run.
                    print(f"Error saving image: {str(e)}")
                    continue
        elif captured and p_diff >= MAX_PERCENT:
            # Motion resumed; arm capture for the next stable slide.
            captured = False

    progress(0.8, desc="截图提取完成")
    print(f'{screenshot_count} screenshots Captured!')
    print(f'Time taken {time.time()-start_time}s')
    return saved_files
def initialize_output_folder(video_path):
    '''Create (or reset) the per-video screenshot output folder and return its path.'''
    # Derive a filesystem-safe folder name from the video's base name.
    stem = os.path.splitext(os.path.basename(video_path))[0]
    safe_name = "".join(ch for ch in stem if ch.isalnum() or ch in (' ', '-', '_'))
    folder = os.path.join(OUTPUT_SLIDES_DIR, safe_name)

    # Start from a clean slate: drop any stale screenshots from a previous run.
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.makedirs(folder, exist_ok=True)

    print('initialized output folder', folder)
    return folder
def convert_screenshots_to_pdf(video_path, output_folder_screenshot_path):
    """Bundle every PNG screenshot in the folder into one PDF and return its path."""
    # PDF is named after a sanitized version of the video's base name.
    stem = os.path.splitext(os.path.basename(video_path))[0]
    safe_name = "".join(ch for ch in stem if ch.isalnum() or ch in (' ', '-', '_'))
    output_pdf_path = os.path.join(OUTPUT_SLIDES_DIR, f"{safe_name}.pdf")
    try:
        print('output_folder_screenshot_path', output_folder_screenshot_path)
        print('output_pdf_path', output_pdf_path)
        print('converting images to pdf..')

        # Sort so pages follow the zero-padded screenshot numbering.
        png_files = sorted(glob.glob(os.path.join(output_folder_screenshot_path, "*.png")))
        if not png_files:
            raise Exception("No PNG files found to convert to PDF")

        with open(output_pdf_path, "wb") as f:
            f.write(img2pdf.convert(png_files))

        print('Pdf Created!')
        print('pdf saved at', output_pdf_path)
        return output_pdf_path
    except Exception as e:
        # Log, then let the caller decide how to surface the failure.
        print(f"Error creating PDF: {str(e)}")
        raise
def video_to_slides(video_path, progress=gr.Progress()):
    """Prepare the output folder and capture unique slide screenshots from the video."""
    progress(0.1, desc="准备处理视频...")
    folder = initialize_output_folder(video_path)
    files = detect_unique_screenshots(video_path, folder, progress)
    return folder, files
def slides_to_pdf(video_path, output_folder_screenshot_path, saved_files, progress=gr.Progress()):
    """Turn the list of saved screenshot files into a PDF named after the video."""
    stem = os.path.splitext(os.path.basename(video_path))[0]
    safe_name = "".join(ch for ch in stem if ch.isalnum() or ch in (' ', '-', '_'))
    output_pdf_path = os.path.join(OUTPUT_SLIDES_DIR, f"{safe_name}.pdf")
    try:
        progress(0.9, desc="正在生成PDF...")
        print('output_folder_screenshot_path', output_folder_screenshot_path)
        print('output_pdf_path', output_pdf_path)

        if not saved_files:
            raise Exception("未从视频中捕获到截图")

        # Screenshots may have vanished since capture; keep only those still on disk.
        existing_files = [p for p in saved_files if os.path.exists(p)]
        if not existing_files:
            raise Exception("未找到保存的截图文件")

        with open(output_pdf_path, "wb") as f:
            f.write(img2pdf.convert(existing_files))

        progress(1.0, desc="处理完成!")
        print('PDF创建成功!')
        print('PDF保存位置:', output_pdf_path)
        return output_pdf_path
    except Exception as e:
        print(f"创建PDF时出错: {str(e)}")
        raise
def run_app(video_path, progress=gr.Progress()):
    """End-to-end pipeline: video path in, PDF path out."""
    try:
        if not video_path:
            raise gr.Error("请选择要处理的视频文件")
        progress(0, desc="开始处理...")
        folder, files = video_to_slides(video_path, progress)
        return slides_to_pdf(video_path, folder, files, progress)
    except Exception as e:
        # Surface any failure to the UI as a Gradio error.
        raise gr.Error(f"处理失败: {str(e)}")
def process_video_file(video_file):
    """Handle an uploaded video (path string, file object, or raw bytes) and return the PDF path.

    Fixes over the original: when the upload is a file-like object,
    ``shutil.copyfile`` is now given its ``.name`` path rather than the
    object itself (copyfile requires a path and raised TypeError before),
    and gr.Error is no longer double-wrapped by the outer handler.
    """
    try:
        # A plain string is treated as a filesystem path and used directly.
        if isinstance(video_file, str):
            if video_file.strip() == "":
                return None
            return run_app(video_file)

        if video_file is not None:
            # Copy the upload to a uniquely-named temp file we control.
            temp_filename = f"temp_video_{int(time.time())}.mp4"
            temp_path = os.path.join(tempfile.gettempdir(), temp_filename)
            try:
                if hasattr(video_file, 'name'):
                    # BUG FIX: copy from the upload's on-disk path, not the object.
                    shutil.copyfile(video_file.name, temp_path)
                else:
                    # Raw bytes content.
                    with open(temp_path, 'wb') as f:
                        f.write(video_file)

                # Process the video end to end.
                output_folder_screenshot_path, saved_files = video_to_slides(temp_path)
                pdf_path = slides_to_pdf(temp_path, output_folder_screenshot_path, saved_files)

                # Cleanup the temp copy on success.
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                return pdf_path
            except Exception as e:
                # Cleanup on failure too, then surface the error to the UI.
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                raise gr.Error(f"处理视频时出错: {str(e)}")
        return None
    except gr.Error:
        # Already a user-facing error; do not wrap the message twice.
        raise
    except Exception as e:
        raise gr.Error(f"处理视频时出错: {str(e)}")
def extract_audio_and_transcribe(video_path, progress=gr.Progress()):
    """Extract the audio track from the video and transcribe it with Whisper.

    Returns a list of dicts with "start"/"end" (seconds, per Whisper) and
    stripped "text" keys, one per transcription segment.

    Fixes over the original: printing ``"..." + result`` concatenated a str
    with Whisper's result dict and raised TypeError — now prints
    ``result["text"]``; the deprecated, race-prone ``tempfile.mktemp`` was
    replaced by NamedTemporaryFile.
    """
    progress(0, desc="正在提取音频...")
    # Load the video and pull out its audio track.
    video = VideoFileClip(video_path)
    audio = video.audio

    # Create a safe temporary WAV path (mktemp is deprecated and racy).
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
        temp_audio = tmp.name
    audio.write_audiofile(temp_audio)

    progress(0.3, desc="正在转录音频...")
    # Load the small "base" Whisper model and transcribe the audio file.
    model = whisper.load_model("base")
    result = model.transcribe(temp_audio)
    # BUG FIX: result is a dict; print its text field, not the dict itself.
    print("完成的转录文本结果如下:" + result["text"])

    # Clean up temp audio and release the video handle.
    os.remove(temp_audio)
    video.close()

    # Normalize segments to just the fields the captioning step needs.
    segments = []
    for segment in result["segments"]:
        segments.append({
            "start": segment["start"],
            "end": segment["end"],
            "text": segment["text"].strip()
        })
    return segments
def add_text_to_image(image_path, text):
    """Append a white strip below the image, draw *text* into it, and save in place.

    Fix over the original: ``ImageFont.truetype("arial.ttf", ...)`` raises
    OSError on hosts without Arial (typical Linux servers); we now fall back
    to PIL's built-in default font instead of crashing.
    """
    img = Image.open(image_path)
    width, height = img.size

    font_size = 30
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except OSError:
        # arial.ttf is usually unavailable on Linux; use the bundled bitmap font.
        font = ImageFont.load_default()

    # Reserve roughly one extra line's height as padding below the text.
    text_height = font_size * (text.count('\n') + 2)
    new_img = Image.new('RGB', (width, height + text_height), 'white')
    new_img.paste(img, (0, 0))

    draw = ImageDraw.Draw(new_img)
    draw.text((10, height + 10), text, font=font, fill='black')

    # Overwrite the original screenshot with the captioned version.
    new_img.save(image_path)
def process_video_with_transcription(video_path, output_folder_screenshot_path, progress=gr.Progress()):
    """Capture unique screenshots and overlay matching transcript text onto each one."""
    # Transcribe first so segment timestamps are ready for matching.
    segments = extract_audio_and_transcribe(video_path, progress)
    # Then capture the slide screenshots as in the basic pipeline.
    saved_files = detect_unique_screenshots(video_path, output_folder_screenshot_path, progress)

    progress(0.8, desc="正在添加字幕...")
    for image_path in saved_files:
        # Filenames look like 000_1.23.png, where 1.23 is minutes into the video.
        base = os.path.basename(image_path)
        minutes = float(base.split('_')[1].split('.png')[0])
        seconds = minutes * 60
        # Collect every segment whose time span covers this screenshot.
        matched = [seg["text"] for seg in segments if seg["start"] <= seconds <= seg["end"]]
        if matched:
            add_text_to_image(image_path, "\n".join(matched))

    progress(0.9, desc="处理完成...")
    return saved_files
def run_app_with_transcription(video_path, progress=gr.Progress()):
    """End-to-end with captions: video path in, captioned-slide PDF path out."""
    try:
        if not video_path:
            raise gr.Error("请选择要处理的视频文件")
        progress(0, desc="开始处理...")
        folder = initialize_output_folder(video_path)
        files = process_video_with_transcription(video_path, folder, progress)
        return slides_to_pdf(video_path, folder, files, progress)
    except Exception as e:
        # Surface any failure to the UI as a Gradio error.
        raise gr.Error(f"处理失败: {str(e)}")
def process_video_file_with_transcription(video_file):
    """Handle an uploaded video and return a PDF whose slides carry transcript text.

    Fixes over the original: (1) it called ``video_to_slides`` (which runs
    screenshot extraction) and then ``process_video_with_transcription``
    (which extracts again) — the expensive extraction ran twice; now the
    folder is initialized directly and extraction runs once. (2) a file-like
    upload is copied via its ``.name`` path instead of passing the object to
    ``shutil.copyfile``. (3) gr.Error is no longer double-wrapped.
    """
    try:
        # A plain string is treated as a filesystem path and used directly.
        if isinstance(video_file, str):
            if video_file.strip() == "":
                return None
            return run_app_with_transcription(video_file)

        if video_file is not None:
            # Copy the upload to a uniquely-named temp file we control.
            temp_filename = f"temp_video_{int(time.time())}.mp4"
            temp_path = os.path.join(tempfile.gettempdir(), temp_filename)
            try:
                if hasattr(video_file, 'name'):
                    # BUG FIX: copy from the upload's on-disk path, not the object.
                    shutil.copyfile(video_file.name, temp_path)
                else:
                    # Raw bytes content.
                    with open(temp_path, 'wb') as f:
                        f.write(video_file)

                # BUG FIX: extract screenshots once, with captions, instead of
                # running video_to_slides() first and extracting a second time.
                output_folder_screenshot_path = initialize_output_folder(temp_path)
                saved_files = process_video_with_transcription(temp_path, output_folder_screenshot_path)
                pdf_path = slides_to_pdf(temp_path, output_folder_screenshot_path, saved_files)

                # Cleanup the temp copy on success.
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                return pdf_path
            except Exception as e:
                # Cleanup on failure too, then surface the error to the UI.
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                raise gr.Error(f"处理视频时出错: {str(e)}")
        return None
    except gr.Error:
        # Already a user-facing error; do not wrap the message twice.
        raise
    except Exception as e:
        raise gr.Error(f"处理视频时出错: {str(e)}")
def process_video(video, path):
    """Dispatch to run_app using the uploaded video if present, else the typed path."""
    source = video if video else path
    if not source:
        raise gr.Error("请上传视频或输入视频路径")
    return run_app(source)
def handle_video_with_transcription(video, path):
    """Dispatch to the captioned pipeline using the upload if present, else the typed path."""
    source = video if video else path
    if not source:
        raise gr.Error("请上传视频或输入视频路径")
    return run_app_with_transcription(source)
# Create a modern interface with custom CSS
css = """
.gradio-container {
font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
}
.container {
max-width: 900px;
margin: auto;
padding: 20px;
}
.gr-button {
background: linear-gradient(90deg, #2563eb, #3b82f6);
border: none;
color: white;
}
.gr-button:hover {
background: linear-gradient(90deg, #1d4ed8, #2563eb);
transform: translateY(-1px);
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
}
.status-info {
margin-top: 10px;
padding: 10px;
border-radius: 4px;
background-color: #f3f4f6;
}
"""
# Build the two-tab Gradio UI (basic conversion / conversion with speech-to-text
# captions) and launch it when the file is run as a script.
if __name__ == "__main__":
    with gr.Blocks(css=css) as iface:
        gr.Markdown("# 视频转PDF工具")
        # Tab 1: plain video -> PDF.
        with gr.Tab("基础转换"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="上传视频")
                    video_path = gr.Textbox(label="或输入视频路径", placeholder="例如: ./input/video.mp4")
                    convert_btn = gr.Button("开始转换", variant="primary")
            with gr.Row():
                output_file = gr.File(label="下载PDF")
        # Tab 2: video -> PDF with transcript text drawn under each slide.
        with gr.Tab("带语音转文字"):
            with gr.Row():
                with gr.Column():
                    video_input_with_transcription = gr.Video(label="上传视频")
                    video_path_with_transcription = gr.Textbox(label="或输入视频路径", placeholder="例如: ./input/video.mp4")
                    convert_btn_with_transcription = gr.Button("开始转换(带字幕)", variant="primary")
            with gr.Row():
                output_file_with_transcription = gr.File(label="下载PDF(带字幕)")
        # Wire each button to its handler; each handler accepts both an upload
        # and a typed path and prefers the upload.
        convert_btn.click(
            fn=process_video,
            inputs=[video_input, video_path],
            outputs=[output_file],
        )
        convert_btn_with_transcription.click(
            fn=handle_video_with_transcription,
            inputs=[video_input_with_transcription, video_path_with_transcription],
            outputs=[output_file_with_transcription],
        )
    iface.launch()