Spaces:

zjrwtxtechstudio
/

video-to-pdf

Running

File size: 17,866 Bytes

2a08f3d

import os
import time
import cv2
import imutils
import shutil
import img2pdf
import glob
from skimage.metrics import structural_similarity
import gradio as gr
import tempfile
import whisper
from moviepy.editor import VideoFileClip
from PIL import Image, ImageDraw, ImageFont

############# Define constants

# Root directory under which the per-video screenshot folders and the PDFs are written.
OUTPUT_SLIDES_DIR = "./output"

FRAME_RATE = 3                   # no. of frames per second of video that are processed; the fewer, the faster
WARMUP = FRAME_RATE              # initial number of sampled frames to skip before a screenshot may be captured
FGBG_HISTORY = FRAME_RATE * 15   # no. of frames kept in the background model
VAR_THRESHOLD = 16               # Threshold on the squared Mahalanobis distance between the pixel and the model to decide whether a pixel is well described by the background model.
DETECT_SHADOWS = False           # If true, the algorithm will detect shadows and mark them.
MIN_PERCENT = 0.1                # min % of diff between foreground and background to detect if motion has stopped
MAX_PERCENT = 3                  # max % of diff between foreground and background to detect if frame is still in motion
SSIM_THRESHOLD = 0.9             # a candidate whose SSIM against the last saved screenshot reaches this value is treated as a duplicate


def get_frames(video_path):
    '''Yield frames sampled from the video located at video_path.

    The video is sampled FRAME_RATE times per second of playback by seeking
    to successive timestamps instead of decoding every frame.

    Yields:
        A tuple (frame_count, frame_time, frame). frame_count is the 1-based
        index of the sampled frame and frame is the image read by OpenCV.
        frame_time is in seconds and has already been advanced by
        1/FRAME_RATE past the position the frame was read from.

    Raises:
        Exception: if the video file cannot be opened. Because this is a
        generator, the error surfaces on the first iteration.
    '''
    vs = cv2.VideoCapture(video_path)
    if not vs.isOpened():
        vs.release()
        raise Exception(f'unable to open file {video_path}')

    frame_time = 0
    frame_count = 0

    # try/finally guarantees the capture is released even when the consumer
    # stops iterating early or an exception propagates through the generator.
    try:
        while True:
            vs.set(cv2.CAP_PROP_POS_MSEC, frame_time * 1000)    # move to the next sample timestamp
            frame_time += 1/FRAME_RATE

            (_, frame) = vs.read()
            # if the frame is None, then we have reached the end of the video file
            if frame is None:
                break

            frame_count += 1
            yield frame_count, frame_time, frame
    finally:
        vs.release()
 


def detect_unique_screenshots(video_path, output_folder_screenshot_path, progress=gr.Progress()):
    '''Extract unique screenshots from a video and save them as PNG files.

    Each sampled frame is fed to a MOG2 background subtractor. When the share
    of foreground pixels drops below MIN_PERCENT (motion has stopped) the
    original full-size frame becomes a screenshot candidate; it is saved only
    if its SSIM against the previously saved screenshot is below
    SSIM_THRESHOLD. Capturing is re-armed once the foreground share reaches
    MAX_PERCENT again.

    Args:
        video_path: path of the video to analyse.
        output_folder_screenshot_path: existing folder the PNG files are written to.
        progress: Gradio progress callback.

    Returns:
        List of paths of the screenshots that were written, in capture order.
    '''
    fgbg = cv2.createBackgroundSubtractorMOG2(history=FGBG_HISTORY, varThreshold=VAR_THRESHOLD, detectShadows=DETECT_SHADOWS)

    captured = False
    start_time = time.time()
    (W, H) = (None, None)

    # Estimate how many frames get_frames() will yield so the progress ratio
    # compares like with like: it samples FRAME_RATE frames per second, not
    # every native frame of the video.
    cap = cv2.VideoCapture(video_path)
    native_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    native_fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()

    if native_frames > 0 and native_fps > 0:
        total_frames = max(1, int(native_frames / native_fps * FRAME_RATE))
    else:
        # Frame count or fps unavailable; never let the denominator be zero.
        total_frames = max(1, int(native_frames))

    screenshoots_count = 0
    last_screenshot = None
    saved_files = []

    progress(0, desc="初始化视频处理...")

    for frame_count, frame_time, frame in get_frames(video_path):
        # Frame processing owns the 0..0.7 band of the progress bar; the ratio
        # is clamped because the estimate above can be slightly too small.
        fraction = min(frame_count / total_frames, 1.0) * 0.7
        progress(fraction, desc=f"处理视频帧 {frame_count}/{total_frames}")

        orig = frame.copy()
        frame = imutils.resize(frame, width=600)    # motion detection runs on a downscaled copy
        mask = fgbg.apply(frame)

        if W is None or H is None:
            (H, W) = mask.shape[:2]

        # percentage of pixels flagged as foreground (i.e. moving)
        p_diff = (cv2.countNonZero(mask) / float(W * H)) * 100

        if p_diff < MIN_PERCENT and not captured and frame_count > WARMUP:
            captured = True
            filename = f"{screenshoots_count:03}_{round(frame_time/60, 2)}.png"
            path = os.path.join(output_folder_screenshot_path, filename)

            image_ssim = 0.0
            if last_screenshot is not None:
                image_ssim = structural_similarity(last_screenshot, orig, channel_axis=2, data_range=255)

            if image_ssim < SSIM_THRESHOLD:
                progress(fraction, desc=f"保存截图 {screenshoots_count + 1}")
                print("saving {}".format(path))
                try:
                    written = cv2.imwrite(str(path), orig)
                except Exception as e:
                    print(f"Error saving image: {str(e)}")
                    continue
                # imwrite reports most failures through its return value
                # rather than by raising, so check it before recording the file.
                if not written:
                    print(f"Error saving image: could not write {path}")
                    continue
                last_screenshot = orig
                saved_files.append(path)
                screenshoots_count += 1

        elif captured and p_diff >= MAX_PERCENT:
            captured = False

    progress(0.8, desc="截图提取完成")
    print(f'{screenshoots_count} screenshots Captured!')
    print(f'Time taken {time.time()-start_time}s')
    return saved_files


def initialize_output_folder(video_path):
    '''Create an empty screenshot folder for the given video.

    The folder lives under OUTPUT_SLIDES_DIR and is named after the video
    file with every character other than alphanumerics, space, '-' and '_'
    removed. An existing folder of that name is deleted first.

    Args:
        video_path: path of the video being processed.

    Returns:
        Path of the freshly created folder.
    '''
    # Create a safe folder name from video filename
    video_filename = os.path.splitext(os.path.basename(video_path))[0]
    # Replace potentially problematic characters
    safe_filename = "".join(x for x in video_filename if x.isalnum() or x in (' ', '-', '_'))
    # With an empty name the join below would yield OUTPUT_SLIDES_DIR itself
    # and the rmtree would wipe every earlier result, so fall back to a fixed name.
    if not safe_filename.strip():
        safe_filename = "video"
    output_folder_screenshot_path = os.path.join(OUTPUT_SLIDES_DIR, safe_filename)

    if os.path.exists(output_folder_screenshot_path):
        shutil.rmtree(output_folder_screenshot_path)

    os.makedirs(output_folder_screenshot_path, exist_ok=True)
    print('initialized output folder', output_folder_screenshot_path)
    return output_folder_screenshot_path


def convert_screenshots_to_pdf(video_path, output_folder_screenshot_path):
    '''Bundle every PNG in the screenshot folder into a single PDF.

    The PDF is written to OUTPUT_SLIDES_DIR and named after the video file,
    keeping only alphanumerics, space, '-' and '_' from its name.

    Args:
        video_path: path of the source video (used only for naming).
        output_folder_screenshot_path: folder that holds the PNG screenshots.

    Returns:
        Path of the written PDF.

    Raises:
        Exception: if the folder has no PNG files; any error is logged and re-raised.
    '''
    stem, _extension = os.path.splitext(os.path.basename(video_path))
    keep_as_is = (' ', '-', '_')
    cleaned_stem = "".join(filter(lambda ch: ch.isalnum() or ch in keep_as_is, stem))
    output_pdf_path = os.path.join(OUTPUT_SLIDES_DIR, f"{cleaned_stem}.pdf")

    try:
        print('output_folder_screenshot_path', output_folder_screenshot_path)
        print('output_pdf_path', output_pdf_path)
        print('converting images to pdf..')

        # Collect the screenshots in name order, which is also capture order.
        png_pattern = os.path.join(output_folder_screenshot_path, "*.png")
        png_files = glob.glob(png_pattern)
        png_files.sort()
        if len(png_files) == 0:
            raise Exception("No PNG files found to convert to PDF")

        with open(output_pdf_path, "wb") as pdf_handle:
            pdf_bytes = img2pdf.convert(png_files)
            pdf_handle.write(pdf_bytes)

        print('Pdf Created!')
        print('pdf saved at', output_pdf_path)
    except Exception as err:
        print(f"Error creating PDF: {str(err)}")
        raise
    else:
        return output_pdf_path


def video_to_slides(video_path, progress=gr.Progress()):
    '''Prepare the output folder for a video and extract its unique screenshots.

    Returns:
        A tuple (screenshot folder path, list of saved screenshot paths).
    '''
    progress(0.1, desc="准备处理视频...")
    screenshot_dir = initialize_output_folder(video_path)
    screenshots = detect_unique_screenshots(video_path, screenshot_dir, progress)
    return screenshot_dir, screenshots


def slides_to_pdf(video_path, output_folder_screenshot_path, saved_files, progress=gr.Progress()):
    '''Combine the saved screenshots into one PDF under OUTPUT_SLIDES_DIR.

    Args:
        video_path: path of the source video (used only to name the PDF).
        output_folder_screenshot_path: screenshot folder, logged for reference.
        saved_files: screenshot paths in the order they should appear.
        progress: Gradio progress callback.

    Returns:
        Path of the written PDF.

    Raises:
        Exception: if saved_files is empty or none of the listed files exist;
        any error is logged and re-raised.
    '''
    video_filename = os.path.splitext(os.path.basename(video_path))[0]
    safe_filename = "".join(x for x in video_filename if x.isalnum() or x in (' ', '-', '_'))
    # Avoid producing a nameless ".pdf" when nothing survives sanitising.
    if not safe_filename.strip():
        safe_filename = "video"
    output_pdf_path = os.path.join(OUTPUT_SLIDES_DIR, f"{safe_filename}.pdf")

    try:
        progress(0.9, desc="正在生成PDF...")
        print('output_folder_screenshot_path', output_folder_screenshot_path)
        print('output_pdf_path', output_pdf_path)

        if not saved_files:
            raise Exception("未从视频中捕获到截图")

        existing_files = [f for f in saved_files if os.path.exists(f)]
        if not existing_files:
            raise Exception("未找到保存的截图文件")

        # Convert before opening the target so a failed conversion does not
        # leave an empty or truncated PDF behind.
        pdf_bytes = img2pdf.convert(existing_files)
        os.makedirs(OUTPUT_SLIDES_DIR, exist_ok=True)
        with open(output_pdf_path, "wb") as f:
            f.write(pdf_bytes)

        progress(1.0, desc="处理完成！")
        print('PDF创建成功！')
        print('PDF保存位置:', output_pdf_path)
        return output_pdf_path
    except Exception as e:
        print(f"创建PDF时出错: {str(e)}")
        raise


def run_app(video_path, progress=gr.Progress()):
    '''Convert a video into a PDF of its unique slides.

    Args:
        video_path: path of the video to process.
        progress: Gradio progress callback.

    Returns:
        Path of the generated PDF.

    Raises:
        gr.Error: if no video path is given or any processing step fails.
    '''
    # Validate outside the try block so this message is not re-wrapped below.
    if not video_path:
        raise gr.Error("请选择要处理的视频文件")

    try:
        progress(0, desc="开始处理...")
        output_folder_screenshot_path, saved_files = video_to_slides(video_path, progress)
        return slides_to_pdf(video_path, output_folder_screenshot_path, saved_files, progress)
    except gr.Error:
        # Already user-facing; pass it through unchanged.
        raise
    except Exception as e:
        raise gr.Error(f"处理失败: {str(e)}") from e


def process_video_file(video_file):
    """Handle an uploaded video and return the path of the generated PDF.

    Args:
        video_file: a path string, a path-like object, an object exposing the
            upload's location through ``.name``, raw file content (bytes), or None.

    Returns:
        Path of the generated PDF, or None when no video was supplied.

    Raises:
        gr.Error: if processing fails.
    """
    if video_file is None:
        return None

    # If video_file is a string (path), use it directly
    if isinstance(video_file, str):
        if video_file.strip() == "":
            return None
        return run_app(video_file)

    # mkstemp creates a uniquely named file, so two requests arriving within
    # the same second can no longer overwrite each other's temporary video.
    fd, temp_path = tempfile.mkstemp(prefix="temp_video_", suffix=".mp4")
    os.close(fd)

    try:
        if isinstance(video_file, os.PathLike):
            shutil.copyfile(video_file, temp_path)
        elif hasattr(video_file, 'name'):
            # A file wrapper: copy from the path it points to, not the object itself.
            shutil.copyfile(video_file.name, temp_path)
        else:  # If it's file content
            with open(temp_path, 'wb') as f:
                f.write(video_file)

        # Process the video
        output_folder_screenshot_path, saved_files = video_to_slides(temp_path)
        return slides_to_pdf(temp_path, output_folder_screenshot_path, saved_files)
    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"处理视频时出错: {str(e)}") from e
    finally:
        # Single cleanup point for both the success and the failure path.
        if os.path.exists(temp_path):
            os.unlink(temp_path)


def extract_audio_and_transcribe(video_path, progress=gr.Progress()):
    """Extract the audio track of a video and transcribe it with Whisper.

    Args:
        video_path: path of the video to transcribe.
        progress: Gradio progress callback.

    Returns:
        A list of dicts with keys "start", "end" and "text" (whitespace
        stripped), one per entry of the transcription result's "segments".
        Empty when the video has no audio track.
    """
    progress(0, desc="正在提取音频...")

    # mkstemp creates the file atomically, unlike the deprecated, race-prone mktemp.
    fd, temp_audio = tempfile.mkstemp(suffix='.wav')
    os.close(fd)

    video = None
    try:
        # Load the video and extract audio
        video = VideoFileClip(video_path)
        audio = video.audio
        if audio is None:
            # Nothing to transcribe; callers simply get no captions.
            print("视频不包含音轨，跳过转录")
            return []

        audio.write_audiofile(temp_audio)

        progress(0.3, desc="正在转录音频...")

        # Load Whisper model and transcribe
        model = whisper.load_model("base")
        result = model.transcribe(temp_audio)
    finally:
        # Release the clip and the temporary audio even when a step above fails.
        if video is not None:
            video.close()
        if os.path.exists(temp_audio):
            os.remove(temp_audio)

    # result is a dict, so log its text field instead of concatenating the dict itself.
    print("完成的转录文本结果如下：" + str(result.get("text", "")))

    # Keep only the timing and text of each segment
    return [
        {
            "start": segment["start"],
            "end": segment["end"],
            "text": segment["text"].strip(),
        }
        for segment in result["segments"]
    ]

def add_text_to_image(image_path, text):
    """Append a white strip with the given text below an image, in place.

    Args:
        image_path: path of the image; the file is overwritten with the result.
        text: caption to draw; may contain newlines.
    """
    font_size = 30

    # "arial.ttf" is not installed on every system; try a second common font
    # and finally Pillow's built-in one instead of failing with OSError.
    font = None
    for candidate in ("arial.ttf", "DejaVuSans.ttf"):
        try:
            font = ImageFont.truetype(candidate, font_size)
            break
        except OSError:
            continue
    if font is None:
        font = ImageFont.load_default()

    # Close the source image once it has been pasted onto the new canvas.
    with Image.open(image_path) as img:
        width, height = img.size

        # Create new image with space for text: one line per text line plus padding
        text_height = font_size * (text.count('\n') + 2)
        new_img = Image.new('RGB', (width, height + text_height), 'white')
        new_img.paste(img, (0, 0))

    # Add text
    draw = ImageDraw.Draw(new_img)
    draw.text((10, height + 10), text, font=font, fill='black')

    # Save the modified image
    new_img.save(image_path)

def process_video_with_transcription(video_path, output_folder_screenshot_path, progress=gr.Progress()):
    """Extract screenshots from a video and caption them with its transcript.

    A screenshot is captioned with every transcript segment whose time span
    contains the timestamp encoded in the screenshot's filename.

    Returns:
        List of the saved screenshot paths.
    """
    # Transcribe first, then extract the screenshots
    segments = extract_audio_and_transcribe(video_path, progress)
    saved_files = detect_unique_screenshots(video_path, output_folder_screenshot_path, progress)

    progress(0.8, desc="正在添加字幕...")

    for image_path in saved_files:
        # Filenames look like 000_1.23.png, where 1.23 is the time in minutes
        file_name = os.path.basename(image_path)
        minutes = float(file_name.split('_')[1].split('.png')[0])
        seconds = minutes * 60

        captions = [
            piece["text"]
            for piece in segments
            if piece["start"] <= seconds <= piece["end"]
        ]

        # Leave the image untouched when nothing was spoken at that moment
        if not captions:
            continue
        add_text_to_image(image_path, "\n".join(captions))

    progress(0.9, desc="处理完成...")
    return saved_files

def run_app_with_transcription(video_path, progress=gr.Progress()):
    """Convert a video into a PDF of its unique slides, captioned with its transcript.

    Args:
        video_path: path of the video to process.
        progress: Gradio progress callback.

    Returns:
        Path of the generated PDF.

    Raises:
        gr.Error: if no video path is given or any processing step fails.
    """
    # Validate outside the try block so this message is not re-wrapped below.
    if not video_path:
        raise gr.Error("请选择要处理的视频文件")

    try:
        progress(0, desc="开始处理...")
        output_folder_screenshot_path = initialize_output_folder(video_path)
        saved_files = process_video_with_transcription(video_path, output_folder_screenshot_path, progress)
        return slides_to_pdf(video_path, output_folder_screenshot_path, saved_files, progress)
    except gr.Error:
        # Already user-facing; pass it through unchanged.
        raise
    except Exception as e:
        raise gr.Error(f"处理失败: {str(e)}") from e

def process_video_file_with_transcription(video_file):
    """Handle an uploaded video and return the path of a captioned PDF.

    Args:
        video_file: a path string, a path-like object, an object exposing the
            upload's location through ``.name``, raw file content (bytes), or None.

    Returns:
        Path of the generated PDF, or None when no video was supplied.

    Raises:
        gr.Error: if processing fails.
    """
    if video_file is None:
        return None

    # If video_file is a string (path), use it directly
    if isinstance(video_file, str):
        if video_file.strip() == "":
            return None
        return run_app_with_transcription(video_file)

    # mkstemp creates a uniquely named file, so two requests arriving within
    # the same second can no longer overwrite each other's temporary video.
    fd, temp_path = tempfile.mkstemp(prefix="temp_video_", suffix=".mp4")
    os.close(fd)

    try:
        if isinstance(video_file, os.PathLike):
            shutil.copyfile(video_file, temp_path)
        elif hasattr(video_file, 'name'):
            # A file wrapper: copy from the path it points to, not the object itself.
            shutil.copyfile(video_file.name, temp_path)
        else:  # If it's file content
            with open(temp_path, 'wb') as f:
                f.write(video_file)

        # process_video_with_transcription extracts the screenshots itself, so
        # only the folder is prepared here; extracting them beforehand as well
        # would process the whole video twice.
        output_folder_screenshot_path = initialize_output_folder(temp_path)
        saved_files = process_video_with_transcription(temp_path, output_folder_screenshot_path)
        return slides_to_pdf(temp_path, output_folder_screenshot_path, saved_files)
    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"处理视频时出错: {str(e)}") from e
    finally:
        # Single cleanup point for both the success and the failure path.
        if os.path.exists(temp_path):
            os.unlink(temp_path)


def process_video(video, path):
    """Convert the uploaded video, or else the video at the typed path, to a PDF.

    The upload takes precedence when both are given.

    Raises:
        gr.Error: if neither an upload nor a path was provided.
    """
    source = video or path
    if not source:
        raise gr.Error("请上传视频或输入视频路径")
    return run_app(source)

def handle_video_with_transcription(video, path):
    """Convert the uploaded video, or else the video at the typed path, to a captioned PDF.

    The upload takes precedence when both are given.

    Raises:
        gr.Error: if neither an upload nor a path was provided.
    """
    source = video or path
    if not source:
        raise gr.Error("请上传视频或输入视频路径")
    return run_app_with_transcription(source)

# Custom CSS handed to gr.Blocks(css=...) in the __main__ block below: sets the
# font stack, a centred container capped at 900px, gradient-styled buttons with
# a hover effect, and a grey box style for status information.
css = """
.gradio-container {
    font-family: 'SF Pro Display', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
}
.container {
    max-width: 900px;
    margin: auto;
    padding: 20px;
}
.gr-button {
    background: linear-gradient(90deg, #2563eb, #3b82f6);
    border: none;
    color: white;
}
.gr-button:hover {
    background: linear-gradient(90deg, #1d4ed8, #2563eb);
    transform: translateY(-1px);
    box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
}
.status-info {
    margin-top: 10px;
    padding: 10px;
    border-radius: 4px;
    background-color: #f3f4f6;
}
"""

# Build and launch the Gradio UI only when the file is run as a script.
if __name__ == "__main__":
    with gr.Blocks(css=css) as iface:
        gr.Markdown("# 视频转PDF工具")
        
        # Tab 1: plain conversion (screenshots -> PDF)
        with gr.Tab("基础转换"):
            with gr.Row():
                with gr.Column():
                    # The video can be given as an upload or as a typed path
                    video_input = gr.Video(label="上传视频")
                    video_path = gr.Textbox(label="或输入视频路径", placeholder="例如: ./input/video.mp4")
                    convert_btn = gr.Button("开始转换", variant="primary")
                
            with gr.Row():
                output_file = gr.File(label="下载PDF")
        
        # Tab 2: conversion with the transcript added below each screenshot
        with gr.Tab("带语音转文字"):
            with gr.Row():
                with gr.Column():
                    video_input_with_transcription = gr.Video(label="上传视频")
                    video_path_with_transcription = gr.Textbox(label="或输入视频路径", placeholder="例如: ./input/video.mp4")
                    convert_btn_with_transcription = gr.Button("开始转换（带字幕）", variant="primary")
                
            with gr.Row():
                output_file_with_transcription = gr.File(label="下载PDF（带字幕）")
        
        # Each button passes its tab's upload and path to the handler and
        # shows the returned PDF path in that tab's file component.
        convert_btn.click(
            fn=process_video,
            inputs=[video_input, video_path],
            outputs=[output_file],
        )
        
        convert_btn_with_transcription.click(
            fn=handle_video_with_transcription,
            inputs=[video_input_with_transcription, video_path_with_transcription],
            outputs=[output_file_with_transcription],
        )
        
    # Start the web server once the layout and event wiring are complete.
    iface.launch()