VDNT11 committed on
Commit
1252e4e
·
verified ·
1 Parent(s): b5bdd0a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
3
+ import streamlit as st
4
+ from pydub import AudioSegment
5
+ import os
6
+ import soundfile as sf
7
+ import uuid
8
+
9
# Set device and dtype.
# Half precision is only used when a CUDA GPU is available; on CPU the
# model runs in full float32 precision.
if torch.cuda.is_available():
    device = "cuda:0"
    torch_dtype = torch.float16
else:
    device = "cpu"
    torch_dtype = torch.float32

# Load Whisper model from Hugging Face
@st.cache_resource
def load_model():
    """Build the Whisper speech-recognition pipeline and its processor.

    The model weights are loaded with the module-level ``torch_dtype`` and
    moved to the module-level ``device``. The function is wrapped in
    ``st.cache_resource`` so the load is not repeated on every call.

    Returns:
        tuple: ``(pipe, processor)`` where ``pipe`` is the
        ``automatic-speech-recognition`` pipeline wrapping the model and
        ``processor`` is the matching ``AutoProcessor`` (tokenizer plus
        feature extractor).
    """
    model_id = "openai/whisper-large-v2"

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )
    return pipe, processor

# Load model and processor
pipe, processor = load_model()

# Sample rate the audio is converted to before feature extraction; this is
# the rate the original code declared to the processor.
WHISPER_SAMPLE_RATE = 16000

# Streamlit UI
st.title("Hindi Audio to Text Transcription")

uploaded_file = st.file_uploader(
    "Upload a .wav audio file for transcription", type=["wav"]
)

if uploaded_file is not None:
    st.info("Processing uploaded file...")

    # A unique name keeps concurrent sessions from overwriting each other.
    temp_filename = f"temp_audio_{uuid.uuid4()}.wav"
    try:
        with open(temp_filename, "wb") as f:
            f.write(uploaded_file.read())

        # Preprocess the audio: convert to mono AND resample, so the rate
        # declared to the processor below matches the real data instead of
        # silently mislabelling e.g. 44.1 kHz input as 16 kHz.
        sound = AudioSegment.from_file(temp_filename)
        sound = sound.set_channels(1).set_frame_rate(WHISPER_SAMPLE_RATE)
        sound.export(temp_filename, format="wav")  # Save the processed file

        # Keep the sample rate actually stored in the file rather than
        # discarding it.
        audio, sample_rate = sf.read(temp_filename)

        # Preprocess the audio for the model.
        # NOTE(review): a single generate() call presumably covers only one
        # 30-second Whisper window, so longer clips are likely truncated —
        # confirm, and consider the chunked `pipe(...)` call for long audio.
        inputs = processor(
            audio, sampling_rate=sample_rate, return_tensors="pt"
        )
        # Floating-point features must match the model's dtype (float16 on
        # CUDA); otherwise generate() fails with a dtype mismatch.
        inputs = {
            key: (
                value.to(device=device, dtype=torch_dtype)
                if torch.is_floating_point(value)
                else value.to(device)
            )
            for key, value in inputs.items()
        }

        # Perform transcription
        with torch.no_grad():
            outputs = pipe.model.generate(**inputs)
        transcription = processor.batch_decode(
            outputs, skip_special_tokens=True
        )[0]
    finally:
        # Clean up the temporary file even when processing fails.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)

    # Display the transcription
    st.success("Transcription complete!")
    st.markdown(f"### Transcription:\n\n{transcription}")
else:
    st.warning("Please upload a .wav file to start transcription.")