thak123 committed on
Commit 70a53fa · verified · 1 Parent(s): 0d95a29

Update app.py

Files changed (1)
  1. app.py +66 -23
app.py CHANGED
@@ -19,31 +19,74 @@ pipe = pipeline(model="thak123/gom-stt-v3", #"thak123/whisper-small-LDC-V1", #"t
 # )
 # )
 
-def transcribe_speech(filepath):
-    # waveform, sample_rate = torchaudio.load(filepath)
-
-    # Resample the audio signal to 16k sampling rate
-    # resampler = torchaudio.transforms.Resample(sample_rate, 16000)
-    # waveform_16k = resampler(waveform)
-
-    # Save the resampled audio signal to a new file
-    # torchaudio.save(filepath, waveform_16k, 16000)
-    output = pipe(
-        filepath,
-        max_new_tokens=3,
-        generate_kwargs={
-            "task": "transcribe",
-            # "language": "konkani",
-        },  # update with the language you've fine-tuned on
-        chunk_length_s=30,
-        batch_size=8,
-        # sampling_rate=16000,
-        # padding=True
-    )
-    print(output)
-    return output["text"]
+# def transcribe_speech(filepath):
+#     # waveform, sample_rate = torchaudio.load(filepath)
+
+#     # Resample the audio signal to 16k sampling rate
+#     # resampler = torchaudio.transforms.Resample(sample_rate, 16000)
+#     # waveform_16k = resampler(waveform)
+
+#     # Save the resampled audio signal to a new file
+#     # torchaudio.save(filepath, waveform_16k, 16000)
+#     output = pipe(
+#         filepath,
+#         max_new_tokens=3,
+#         generate_kwargs={
+#             "task": "transcribe",
+#             # "language": "konkani",
+#         },  # update with the language you've fine-tuned on
+#         chunk_length_s=30,
+#         batch_size=8,
+#         # sampling_rate=16000,
+#         # padding=True
+#     )
+#     print(output)
+#     return output["text"]
 
 
+def transcribe_speech(filepath):
+    from transformers import WhisperProcessor, WhisperTokenizer, WhisperForConditionalGeneration
+    import torch
+    import librosa
+
+    # Load model and processor
+    model = WhisperForConditionalGeneration.from_pretrained("thak123/gom-stt-v3")
+    tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")
+    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+    output = ""
+    # Load and preprocess audio
+    audio_path = filepath
+    audio, sr = librosa.load(audio_path, sr=16000)
+    input_features = processor(audio, sampling_rate=16000, return_tensors="pt", truncation=False, padding="max_length").input_features
+
+    # Check length and process (3000 mel frames == 30 s of audio)
+    if input_features.shape[-1] > 3000:
+        print("Splitting audio required")
+        from pydub import AudioSegment
+
+        def split_audio(file_path, chunk_length_ms=30000):  # 30 sec chunks
+            audio = AudioSegment.from_file(file_path)
+            chunks = [audio[i:i+chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
+            return chunks
+
+        # Split and transcribe
+        audio_chunks = split_audio(audio_path)
+
+        for i, chunk in enumerate(audio_chunks):
+            print(i)
+            chunk.export(f"chunk_{i}.wav", format="wav")
+            result = pipe(f"chunk_{i}.wav")
+            output += result['text'] + " "
+            print(f"Chunk {i}: {result['text']}")
+    else:
+        predicted_ids = model.generate(input_features)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        output = transcription[0]  # batch_decode returns a list of strings
+        print(transcription)
+
+    return output
+
 demo = gr.Blocks()
 
 mic_transcribe = gr.Interface(
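
The `input_features.shape[-1] > 3000` test in the new function is Whisper's 30-second window: the feature extractor emits log-mel frames at 100 per second and pads short clips up to the encoder's fixed size of 3000 frames, so anything above 3000 means the clip exceeds 30 s. A minimal sketch of that check in isolation, assuming a placeholder file sample.wav that is not part of this repo:

import librosa
from transformers import WhisperProcessor

# Processor for the base checkpoint the Space pairs with the fine-tuned model.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")

# "sample.wav" is a placeholder path, not a file from this repo.
audio, sr = librosa.load("sample.wav", sr=16000)
features = processor(
    audio,
    sampling_rate=16000,
    return_tensors="pt",
    truncation=False,      # keep all frames instead of clipping to 30 s
    padding="max_length",  # short clips are padded up to exactly 3000 frames
).input_features

# 100 frames per second * 30 s = 3000 frames, the encoder's fixed input size.
print(features.shape)             # e.g. torch.Size([1, 80, 3000]) for <= 30 s audio
print(features.shape[-1] > 3000)  # True means the chunking branch above would run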
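
For clips over that limit, the function falls back to pydub chunking and stitches the pipeline outputs together. A standalone sketch of the same chunk-and-concatenate pattern, assuming the thak123/gom-stt-v3 checkpoint and a hypothetical long_sample.wav; the chunk_{i}.wav temp-file naming mirrors the diff:

from pydub import AudioSegment
from transformers import pipeline

# Same checkpoint the Space loads at module level.
pipe = pipeline("automatic-speech-recognition", model="thak123/gom-stt-v3")

def split_audio(file_path, chunk_length_ms=30000):
    # Cut the recording into 30-second pieces, as in the committed function.
    audio = AudioSegment.from_file(file_path)
    return [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]

# "long_sample.wav" is a placeholder for any recording over 30 seconds.
text = ""
for i, chunk in enumerate(split_audio("long_sample.wav")):
    chunk.export(f"chunk_{i}.wav", format="wav")  # the pipeline reads from disk
    text += pipe(f"chunk_{i}.wav")["text"] + " "

print(text.strip())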