vericudebuget committed
Commit b45ed63 · verified · 1 Parent(s): 7f4ce18

Update app.py

Files changed (1)
  1. app.py +72 -65
app.py CHANGED
@@ -9,79 +9,86 @@ import os
 device = "cpu"
 torch_dtype = torch.float32
 
-# Load the Whisper model and processor
-model_id = "openai/whisper-large-v3-turbo"
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-).to(device)
-processor = AutoProcessor.from_pretrained(model_id)
-
-# Create the pipeline for transcription
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    torch_dtype=torch_dtype,
-    device=device,
-)
+# Initialize session state
+if 'transcription_text' not in st.session_state:
+    st.session_state.transcription_text = None
+if 'srt_content' not in st.session_state:
+    st.session_state.srt_content = None
+
+@st.cache_resource
+def load_model():
+    model_id = "openai/whisper-large-v3-turbo"
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+    ).to(device)
+    processor = AutoProcessor.from_pretrained(model_id)
+    pipe = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        torch_dtype=torch_dtype,
+        device=device,
+    )
+    return pipe
+
+def format_srt_time(seconds):
+    hours, remainder = divmod(seconds, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    milliseconds = int((seconds % 1) * 1000)
+    seconds = int(seconds)
+    return f"{int(hours):02}:{int(minutes):02}:{seconds:02},{milliseconds:03}"
 
 st.title("Audio/Video Transcription App")
 
+# Load model
+pipe = load_model()
+
 # File upload
 uploaded_file = st.file_uploader("Upload an audio or video file", type=["mp3", "wav", "mp4", "m4a"])
 
 if uploaded_file is not None:
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
-        # If it's a video, extract audio
-        if uploaded_file.name.endswith(("mp4", "m4a")):
-            audio = AudioSegment.from_file(uploaded_file)
-            audio.export(temp_audio.name, format="wav")
-        else:
-            audio = AudioSegment.from_file(uploaded_file)
-            audio.export(temp_audio.name, format="wav")
-
-        # Run the transcription
-        transcription_result = pipe(temp_audio.name, return_timestamps="word")
-
-        # Extract text and timestamps
-        transcription_text = transcription_result['text']
-        transcription_chunks = transcription_result['chunks']
-
-        # Display transcription
-        st.subheader("Transcription")
-        st.write(transcription_text)
-
-        # Generate SRT file
-        srt_content = ""
-        for i, chunk in enumerate(transcription_chunks, start=1):
-            start_time = chunk["timestamp"][0]
-            end_time = chunk["timestamp"][1]
-            text = chunk["text"]
-
-            # Format time for SRT (hours, minutes, seconds, milliseconds)
-            def format_srt_time(seconds):
-                hours, remainder = divmod(seconds, 3600)
-                minutes, seconds = divmod(remainder, 60)
-                milliseconds = int((seconds % 1) * 1000)
-                seconds = int(seconds)
-                return f"{int(hours):02}:{int(minutes):02}:{seconds:02},{milliseconds:03}"
-
-            srt_content += f"{i}\n"
-            srt_content += f"{format_srt_time(start_time)} --> {format_srt_time(end_time)}\n"
-            srt_content += f"{text}\n\n"
-
-        # Save the SRT file
-        srt_path = tempfile.mktemp(suffix=".srt")
-        with open(srt_path, "w") as srt_file:
-            srt_file.write(srt_content)
-
-    # Provide download for SRT file
+    with st.spinner("Processing audio..."):
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
+            # If it's a video, extract audio
+            if uploaded_file.name.endswith(("mp4", "m4a")):
+                audio = AudioSegment.from_file(uploaded_file)
+                audio.export(temp_audio.name, format="wav")
+            else:
+                audio = AudioSegment.from_file(uploaded_file)
+                audio.export(temp_audio.name, format="wav")
+
+            # Run the transcription
+            transcription_result = pipe(temp_audio.name, return_timestamps="word")
+
+            # Extract text and timestamps
+            st.session_state.transcription_text = transcription_result['text']
+            transcription_chunks = transcription_result['chunks']
+
+            # Generate SRT content
+            srt_content = ""
+            for i, chunk in enumerate(transcription_chunks, start=1):
+                start_time = chunk["timestamp"][0]
+                end_time = chunk["timestamp"][1]
+                text = chunk["text"]
+
+                srt_content += f"{i}\n"
+                srt_content += f"{format_srt_time(start_time)} --> {format_srt_time(end_time)}\n"
+                srt_content += f"{text}\n\n"
+
+            st.session_state.srt_content = srt_content
+
+# Display transcription
+if st.session_state.transcription_text:
+    st.subheader("Transcription")
+    st.write(st.session_state.transcription_text)
+
+# Provide download for SRT file
+if st.session_state.srt_content:
     st.subheader("Download SRT File")
-    with open(srt_path, "rb") as file:
-        st.download_button(
-            label="Download SRT",
-            data=file,
-            file_name="transcription.srt",
-            mime="text/plain"
-        )
+    st.download_button(
+        label="Download SRT",
+        data=st.session_state.srt_content,
+        file_name="transcription.srt",
+        mime="text/plain"
+    )
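
The pattern this commit adopts, reduced to a minimal standalone sketch (the button, strings, and function name below are illustrative stand-ins, not code from app.py): heavy resources move into an @st.cache_resource function so they are built once per process, and computed results move into st.session_state so they survive the rerun Streamlit performs on every widget interaction.

import streamlit as st

@st.cache_resource
def load_expensive_resource():
    # Built once and reused across reruns; in app.py this is the Whisper pipeline.
    return object()

if 'result' not in st.session_state:
    st.session_state.result = None

resource = load_expensive_resource()

if st.button("Compute"):  # hypothetical widget, for illustration only
    st.session_state.result = "expensive output"

if st.session_state.result:
    st.write(st.session_state.result)  # still shown after later reruns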
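
The SRT assembly is easy to sanity-check outside the app. Below is a standalone copy of format_srt_time from app.py, driven by invented chunks that mimic the shape pipe(..., return_timestamps="word") returns; the words and timestamps are made up for illustration.

def format_srt_time(seconds):
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    milliseconds = int((seconds % 1) * 1000)
    seconds = int(seconds)
    return f"{int(hours):02}:{int(minutes):02}:{seconds:02},{milliseconds:03}"

# Invented word-level chunks; real ones come from the transcription pipeline.
chunks = [
    {"text": " Hello", "timestamp": (0.0, 0.42)},
    {"text": " world", "timestamp": (0.42, 0.9)},
]

srt_content = ""
for i, chunk in enumerate(chunks, start=1):
    start_time, end_time = chunk["timestamp"]
    srt_content += f"{i}\n"
    srt_content += f"{format_srt_time(start_time)} --> {format_srt_time(end_time)}\n"
    srt_content += f"{chunk['text']}\n\n"

print(srt_content)
# 1
# 00:00:00,000 --> 00:00:00,420
#  Hello
#
# 2
# 00:00:00,420 --> 00:00:00,900
#  world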