File size: 2,723 Bytes
ccabf63
6b694b5
dfb92f3
 
163138e
dfb92f3
5b29361
6b694b5
dfb92f3
1620753
ccabf63
163138e
 
 
1620753
dfb92f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163138e
ccabf63
dfb92f3
ccabf63
dfb92f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163138e
 
dfb92f3
163138e
 
 
 
 
 
 
 
 
 
dfb92f3
 
 
163138e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import io
import os
import tempfile
import wave

import pyaudio
import requests
import streamlit as st

# Hugging Face inference endpoint for the openai/whisper-large-v3-turbo model.
API_URL = (
    "https://api-inference.huggingface.co/models/"
    "openai/whisper-large-v3-turbo"
)
# Bearer-token header; the token is read from Streamlit secrets under "hf_token".
headers = {"Authorization": "Bearer {}".format(st.secrets["hf_token"])}

def query(file, timeout=120):
    """Send audio to the Whisper inference endpoint and return its reply.

    Args:
        file: Binary file-like object; its remaining content is read with
            ``file.read()`` and sent as the raw request body.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON payload of the response. If the request fails at
        the transport level, or the body is not valid JSON, a dict with an
        ``"error"`` key is returned instead so callers that check for the
        ``"text"`` key fall through to their error path.
    """
    data = file.read()
    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.post(
            API_URL, headers=headers, data=data, timeout=timeout
        )
    except requests.exceptions.RequestException as exc:
        return {"error": f"Request to the inference API failed: {exc}"}

    try:
        return response.json()
    except ValueError:
        # Every JSON decode error raised by requests is a ValueError subclass;
        # this happens when the server answers with a non-JSON body.
        return {
            "error": "The inference API returned a non-JSON response.",
            "status_code": response.status_code,
            "body": response.text[:500],
        }

def record_audio(duration=5, sample_rate=44100, chunk=1024, channels=1):
    """Record audio from the default input device.

    Args:
        duration: Recording length in seconds.
        sample_rate: Sampling rate in Hz passed to the input stream.
        chunk: Number of frames read per ``stream.read`` call.
        channels: Number of input channels.

    Returns:
        A ``(frames, sample_rate)`` tuple, where ``frames`` is a list of the
        raw byte buffers read from the stream (16-bit samples, as the stream
        is opened with ``pyaudio.paInt16``).
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16,
                        channels=channels,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=chunk)
        try:
            st.info(f"Recording for {duration} seconds...")
            # Whole chunks only: any trailing partial chunk is dropped.
            total_chunks = int(sample_rate / chunk * duration)
            frames = [stream.read(chunk) for _ in range(total_chunks)]
            st.info("Recording finished.")
        finally:
            # Release the stream even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
    finally:
        # Release the PyAudio instance even if opening the stream failed.
        p.terminate()

    return frames, sample_rate

st.title("Speech Recognition with Whisper")

option = st.radio("Choose input method:", ('Upload File', 'Record from Microphone'))

# File-like object handed to query(); stays None until audio is available.
file_to_transcribe = None

if option == 'Upload File':
    uploaded_file = st.file_uploader("Choose an audio file", type=['wav', 'mp3', 'flac'])
    if uploaded_file is not None:
        # Use the MIME type reported for the upload (mp3/flac are not WAV);
        # fall back to the previous default when none is reported.
        st.audio(uploaded_file, format=uploaded_file.type or 'audio/wav')
        file_to_transcribe = uploaded_file
else:
    duration = st.slider("Recording duration (seconds)", 1, 30, 5)
    if st.button('Start Recording'):
        frames, sample_rate = record_audio(duration=duration)

        # Encode the recording as WAV in memory; no temp file to clean up.
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wf:
            wf.setnchannels(1)
            # Module-level helper: avoids creating a PyAudio instance that
            # would never be terminated.
            wf.setsampwidth(pyaudio.get_sample_size(pyaudio.paInt16))
            wf.setframerate(sample_rate)
            wf.writeframes(b''.join(frames))

        # Streamlit re-executes the script on every interaction, so the
        # recording is kept in session state to survive the rerun triggered
        # by the 'Transcribe' button.
        st.session_state['recorded_wav'] = wav_buffer.getvalue()

    recorded_wav = st.session_state.get('recorded_wav')
    if recorded_wav is not None:
        st.audio(recorded_wav, format='audio/wav')
        file_to_transcribe = io.BytesIO(recorded_wav)

if file_to_transcribe is not None:
    if st.button('Transcribe'):
        with st.spinner('Transcribing...'):
            result = query(file_to_transcribe)

            if 'text' in result:
                st.success("Transcription completed!")
                st.write("Transcribed text:")
                st.write(result['text'])
            else:
                st.error("An error occurred during transcription.")
                st.write("Error details:")
                st.write(result)

st.markdown("---")
st.write("Note: This app uses the Whisper API from Hugging Face.")