# requirements.txt


# app.py
import streamlit as st
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import tempfile
import os
from moviepy.editor import VideoFileClip
import datetime

def _format_srt_timestamp(seconds):
    """Format a non-negative number of seconds as an SRT ``HH:MM:SS,mmm`` stamp."""
    # Work in whole milliseconds so rounding carries correctly into the
    # seconds/minutes/hours fields (e.g. 0.9996 s -> 00:00:01,000).
    total_ms = max(0, round(float(seconds) * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def create_srt(chunks) -> str:
    """Render transcription chunks as the text of a SubRip (.srt) file.

    Args:
        chunks: Iterable of dicts, each with a ``'timestamp'`` pair
            ``(start_seconds, end_seconds)`` and a ``'text'`` string.

    Returns:
        The SRT document as a single string: numbered cues (starting at 1),
        each followed by a blank line. Empty input yields ``""``.
    """
    entries = []
    for index, chunk in enumerate(chunks, start=1):
        start, end = chunk['timestamp']
        # A chunk boundary may be reported as None (presumably when the
        # recogniser could not determine it, e.g. audio cut off mid-segment).
        # Fall back to the nearest known value instead of crashing.
        if start is None:
            start = 0.0
        if end is None:
            end = start

        start_time = _format_srt_timestamp(start)
        end_time = _format_srt_timestamp(end)
        # Strip surrounding whitespace so cues do not start with a stray space.
        text = chunk['text'].strip()

        entries.append(f"{index}\n{start_time} --> {end_time}\n{text}\n\n")
    return "".join(entries)

def extract_audio(video_path):
    """Extract the audio track of a video into a temporary MP3 file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        Path of the newly created ``.mp3`` file. The caller is responsible
        for deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    with VideoFileClip(video_path) as video:
        audio = video.audio
        if audio is None:
            raise ValueError("The video file does not contain an audio track.")

        fd, temp_audio_path = tempfile.mkstemp(suffix='.mp3')
        # mkstemp returns an open OS-level descriptor; close it right away so
        # it is not leaked and does not block the writer from opening the path.
        os.close(fd)
        try:
            audio.write_audiofile(temp_audio_path)
        except BaseException:
            # Do not leave a partial/empty temp file behind on failure.
            try:
                os.remove(temp_audio_path)
            except OSError:
                pass
            raise
    return temp_audio_path

def setup_model():
    """Build the speech-recognition pipeline used by the app.

    Loads the ``openai/whisper-tiny`` model and its processor, moves the
    model to the CPU with float32 weights, and wraps them in a
    ``transformers`` "automatic-speech-recognition" pipeline.

    Returns:
        The pipeline object produced by ``transformers.pipeline``.
    """
    checkpoint = "openai/whisper-tiny"
    target_device = "cpu"
    dtype = torch.float32

    # Options forwarded to from_pretrained when loading the model weights.
    load_options = {
        "torch_dtype": dtype,
        "low_cpu_mem_usage": True,
        "use_safetensors": True,
    }
    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint, **load_options)
    whisper.to(target_device)

    # The processor supplies both the tokenizer and the feature extractor.
    whisper_processor = AutoProcessor.from_pretrained(checkpoint)

    return pipeline(
        "automatic-speech-recognition",
        model=whisper,
        tokenizer=whisper_processor.tokenizer,
        feature_extractor=whisper_processor.feature_extractor,
        torch_dtype=dtype,
        device=target_device,
    )

def main():
    """Run the Streamlit page: upload a file, transcribe it, offer an SRT.

    The model pipeline is created once and kept in ``st.session_state``.
    For each uploaded file the bytes are written to a temporary directory,
    audio is extracted first when the upload is a video, and the
    transcription plus a downloadable SRT file are shown. Temporary files
    are removed even when extraction or transcription fails.
    """
    st.title("Audio/Video Transcription App")

    # Initialize session state for model
    if 'pipe' not in st.session_state:
        with st.spinner("Loading model... This might take a few minutes."):
            st.session_state.pipe = setup_model()

    uploaded_file = st.file_uploader("Upload an audio or video file", type=['mp3', 'wav', 'mp4', 'avi', 'mov'])

    if uploaded_file is None:
        return

    with st.spinner("Processing file..."):
        is_video = uploaded_file.type.startswith('video')
        extracted_audio_path = None

        # TemporaryDirectory removes the directory and the saved upload on
        # exit, whether or not an exception was raised inside the block.
        with tempfile.TemporaryDirectory() as temp_dir:
            # The file name comes from the client; keep only its final
            # component so it cannot point outside the temp directory.
            safe_name = os.path.basename(uploaded_file.name) or "upload"
            temp_path = os.path.join(temp_dir, safe_name)

            with open(temp_path, 'wb') as f:
                f.write(uploaded_file.getvalue())

            try:
                # Extract audio if it's a video file
                if is_video:
                    extracted_audio_path = extract_audio(temp_path)
                    audio_path = extracted_audio_path
                else:
                    audio_path = temp_path

                # Transcribe
                generate_kwargs = {
                    "return_timestamps": True
                }

                result = st.session_state.pipe(
                    audio_path,
                    generate_kwargs=generate_kwargs,
                    chunk_length_s=30,
                    batch_size=8
                )
            finally:
                # The extracted audio lives outside temp_dir, so it must be
                # removed explicitly.
                if extracted_audio_path is not None and os.path.exists(extracted_audio_path):
                    os.remove(extracted_audio_path)

        # Display results
        st.subheader("Transcription:")
        st.write(result["text"])

        # Create and offer SRT download
        srt_content = create_srt(result["chunks"])
        st.download_button(
            label="Download SRT file",
            data=srt_content,
            file_name="transcription.srt",
            mime="text/plain"
        )

# Entry point: run the app when this file is executed as a script rather
# than imported as a module.
if __name__ == "__main__":
    main()