import os
import streamlit as st
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import torch
import librosa
import srt
from datetime import timedelta
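# Assumed dependencies (versions are not pinned in the original):
#   pip install streamlit transformers torch librosa srt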

# ๋ชจ๋ธ ๋ฐ ํ”„๋กœ์„ธ์„œ ๋กœ๋“œ
@st.cache_resource
def load_model():
    model = WhisperForConditionalGeneration.from_pretrained("lcjln/AIME_Project_The_Final")
    processor = WhisperProcessor.from_pretrained("lcjln/AIME_The_Final")
    return model, processor

model, processor = load_model()

# Streamlit web application interface
st.title("Whisper Subtitle Generator")

# Upload multiple WAV files
uploaded_files = st.file_uploader("Drag and drop WAV files here", type=["wav"], accept_multiple_files=True)

# ํŒŒ์ผ ๋ชฉ๋ก์„ ๋ณด์—ฌ์คŒ
if uploaded_files:
    st.write("์—…๋กœ๋“œ๋œ ํŒŒ์ผ ๋ชฉ๋ก:")
    for uploaded_file in uploaded_files:
        st.write(uploaded_file.name)

    # ์‹คํ–‰ ๋ฒ„ํŠผ
    if st.button("์‹คํ–‰"):
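        # All cues share one running timeline: each cue starts where the previous
        # one ended, so subtitles from multiple files run back-to-back rather
        # than restarting at 00:00 for each file.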
        combined_subs = []
        last_end_time = timedelta(0)
        subtitle_index = 1

        for uploaded_file in uploaded_files:
            st.write(f"Processing: {uploaded_file.name}")

            # Initialize the progress bar
            progress_bar = st.progress(0)

            # Load the WAV file, resampling to the 16 kHz rate Whisper expects
            st.write("Processing the audio file...")
            audio, sr = librosa.load(uploaded_file, sr=16000)

            progress_bar.progress(50)

            # Transcribe with the Whisper model
            st.write("Generating subtitles with the model...")
            segments = split_audio(audio, sr, segment_duration=5)
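            # Fixed 5-second chunks keep each cue short but can cut words at
            # chunk boundaries; silence- or VAD-based splitting would be a more
            # precise (and heavier) alternative.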

            for i, segment in enumerate(segments):
                inputs = processor(segment, return_tensors="pt", sampling_rate=16000)
                with torch.no_grad():
                    # max_length is capped at 448, the Whisper decoder's maximum
                    # number of target positions; scores are kept for filtering below
                    outputs = model.generate(inputs["input_features"], max_length=448, return_dict_in_generate=True, output_scores=True)

                # ํ…์ŠคํŠธ ๋””์ฝ”๋”ฉ
                transcription = processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0].strip()

                # ์‹ ๋ขฐ๋„ ์ ์ˆ˜ ๊ณ„์‚ฐ (์ถ”๊ฐ€์ ์ธ ์‹ ๋ขฐ๋„ ํ•„ํ„ฐ๋ง ์ ์šฉ)
                avg_logit_score = torch.mean(outputs.scores[-1]).item()

                # ์‹ ๋ขฐ๋„ ์ ์ˆ˜๊ฐ€ ๋‚ฎ๊ฑฐ๋‚˜ ํ…์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ๋Š” ๊ฒฝ์šฐ ๋ฌด์‹œ
                if transcription and avg_logit_score > -5.0:
                    segment_duration = librosa.get_duration(y=segment, sr=sr)
                    end_time = last_end_time + timedelta(seconds=segment_duration)

                    combined_subs.append(
                        srt.Subtitle(
                            index=subtitle_index,
                            start=last_end_time,
                            end=end_time,
                            content=transcription
                        )
                    )
                    last_end_time = end_time
                    subtitle_index += 1

            progress_bar.progress(100)
            st.success(f"{uploaded_file.name}์˜ ์ž๋ง‰์ด ์„ฑ๊ณต์ ์œผ๋กœ ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค!")

        # ๋ชจ๋“  ์ž๋ง‰์„ ํ•˜๋‚˜์˜ SRT ํŒŒ์ผ๋กœ ์ €์žฅ
        st.write("์ตœ์ข… SRT ํŒŒ์ผ์„ ์ƒ์„ฑํ•˜๋Š” ์ค‘์ž…๋‹ˆ๋‹ค...")
        srt_content = srt.compose(combined_subs)
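        # srt.compose renders the cues in standard SubRip form, e.g.:
        #   1
        #   00:00:00,000 --> 00:00:05,000
        #   First transcribed segment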

        final_srt_file_path = "combined_output.srt"
        with open(final_srt_file_path, "w", encoding="utf-8") as f:
            f.write(srt_content)

        st.success("์ตœ์ข… SRT ํŒŒ์ผ์ด ์„ฑ๊ณต์ ์œผ๋กœ ์ƒ์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค!")

        # ์ตœ์ข… SRT ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ ๋ฒ„ํŠผ
        with open(final_srt_file_path, "rb") as srt_file:
            st.download_button(label="SRT ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ", data=srt_file, file_name=final_srt_file_path, mime="text/srt")

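# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py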