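"""Streamlit app that transcribes uploaded WAV files with a Whisper model
checkpoint and exports the combined result as a single SRT subtitle file."""
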
import streamlit as st
from transformers import WhisperForConditionalGeneration, WhisperProcessor
import torch
import librosa
import srt
from datetime import timedelta

# ๋ชจ๋ธ ๋ฐ ํ”„๋กœ์„ธ์„œ ๋กœ๋“œ
@st.cache_resource
def load_model():
model = WhisperForConditionalGeneration.from_pretrained("lcjln/AIME_Project_The_Final")
processor = WhisperProcessor.from_pretrained("lcjln/AIME_The_Final")
return model, processor
model, processor = load_model()
# Streamlit ์›น ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜ ์ธํ„ฐํŽ˜์ด์Šค
st.title("Whisper ์ž๋ง‰ ์ƒ์„ฑ๊ธฐ")
# Upload one or more WAV files
uploaded_files = st.file_uploader("Drag and drop WAV files here", type=["wav"], accept_multiple_files=True)

# Show the list of uploaded files
if uploaded_files:
    st.write("Uploaded files:")
    for uploaded_file in uploaded_files:
        st.write(uploaded_file.name)
    # Run button
    if st.button("Run"):
        combined_subs = []
        last_end_time = timedelta(0)
        subtitle_index = 1
        for uploaded_file in uploaded_files:
            st.write(f"Processing: {uploaded_file.name}")
            # Initialize the progress bar
            progress_bar = st.progress(0)
            # Load the WAV file, resampling to the 16 kHz rate Whisper expects
            st.write("Processing the audio file...")
            audio, sr = librosa.load(uploaded_file, sr=16000)
            progress_bar.progress(50)
            # Generate subtitles with the Whisper model, 5 seconds at a time
            st.write("Generating subtitles with the model...")
            segments = split_audio(audio, sr, segment_duration=5)
            for segment in segments:
                inputs = processor(segment, return_tensors="pt", sampling_rate=16000)
                with torch.no_grad():
                    outputs = model.generate(inputs["input_features"], max_length=2048, return_dict_in_generate=True, output_scores=True)
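                # With output_scores=True, outputs.scores is a tuple of logit tensors,
                # one per generated token, each of shape (batch_size, vocab_size)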
                # Decode the generated token IDs into text
                transcription = processor.batch_decode(outputs.sequences, skip_special_tokens=True)[0].strip()
                # Rough confidence estimate: mean logit of the last generation step
                avg_logit_score = torch.mean(outputs.scores[-1]).item()
                # Skip the segment if the text is empty or the confidence is low
                if transcription and avg_logit_score > -5.0:
                    segment_duration = librosa.get_duration(y=segment, sr=sr)
                    end_time = last_end_time + timedelta(seconds=segment_duration)
                    combined_subs.append(
                        srt.Subtitle(
                            index=subtitle_index,
                            start=last_end_time,
                            end=end_time,
                            content=transcription,
                        )
                    )
                    last_end_time = end_time
                    subtitle_index += 1
            progress_bar.progress(100)
            st.success(f"Subtitles for {uploaded_file.name} were generated successfully!")

        # Combine all subtitles into a single SRT file
        st.write("Generating the final SRT file...")
        srt_content = srt.compose(combined_subs)
        final_srt_file_path = "combined_output.srt"
        with open(final_srt_file_path, "w", encoding="utf-8") as f:
            f.write(srt_content)
        st.success("The final SRT file was generated successfully!")

        # Download button for the final SRT file
        with open(final_srt_file_path, "rb") as srt_file:
            st.download_button(label="Download SRT file", data=srt_file, file_name=final_srt_file_path, mime="text/srt")
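
# To try this app locally (assuming the Streamlit CLI is installed):
#   streamlit run app.py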