VDNT11 committed on
Commit b10f48a · verified · 1 parent: c3e8af7

Update app.py

Files changed (1)
  1. app.py +197 -64
app.py CHANGED
@@ -1,82 +1,215 @@
- import torch
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
  import streamlit as st
- from pydub import AudioSegment
  import os
- import soundfile as sf
- import uuid
-
- # Set device and dtype
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

-
- @st.cache_resource
- def load_model():
-     # Use a specific Hindi-optimized Whisper model
-     model_id = "openai/whisper-large-v2"  # or consider a multilingual model
-
-     # For Hindi, you might want to specify additional parameters
      model = AutoModelForSpeechSeq2Seq.from_pretrained(
-         model_id,
-         torch_dtype=torch_dtype,
-         low_cpu_mem_usage=True,
-         use_safetensors=True,
      )
      model.to(device)

-     # Use the processor from the same model
      processor = AutoProcessor.from_pretrained(model_id)

-     # Create pipeline with language specification
-     pipe = pipeline(
          "automatic-speech-recognition",
          model=model,
          tokenizer=processor.tokenizer,
          feature_extractor=processor.feature_extractor,
          torch_dtype=torch_dtype,
          device=device,
-         generate_kwargs={"language": "hi"}  # Specify Hindi language
      )
-     return pipe, processor
-
- # Load model and processor
- pipe, processor = load_model()
-
- # Streamlit UI
- st.title("Hindi Audio to Text Transcription")
-
- uploaded_file = st.file_uploader(
-     "Upload a .wav audio file for transcription", type=["wav"]
- )

- if uploaded_file is not None:
-     st.info("Processing uploaded file...")
-
-     temp_filename = f"temp_audio_{uuid.uuid4()}.wav"
-     with open(temp_filename, "wb") as f:
-         f.write(uploaded_file.read())
-
-     # Preprocess the audio
-     sound = AudioSegment.from_file(temp_filename)
-     sound = sound.set_channels(1)  # Convert to mono
-     sound.export(temp_filename, format="wav")  # Save the processed file
-
-     audio, _ = sf.read(temp_filename)  # Read audio data
-
-     # Preprocess the audio for the model
-     inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
-     inputs = {k: v.to(device) for k, v in inputs.items()}
-
-     # Perform transcription
-     with torch.no_grad():
-         outputs = pipe.model.generate(**inputs)
-     transcription = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-
-     # Display the transcription
-     st.success("Transcription complete!")
-     st.markdown(f"### Transcription:\n\n{transcription}")

-     os.remove(temp_filename)  # Clean up temporary file
- else:
-     st.warning("Please upload a .wav file to start transcription.")
  import streamlit as st
+ import torch
+ import librosa
+ import matplotlib.pyplot as plt
+ from PIL import Image
  import os

+ # Import the required functions and classes from your previous code
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+ import torchaudio
+ import torch
+ from transformers import (
+     AutoModelForSeq2SeqLM,
+     AutoTokenizer,
+ )
+ from IndicTransToolkit import IndicProcessor
+ from transformers import BitsAndBytesConfig
+ from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
+ from diffusers import StableDiffusionImg2ImgPipeline
+ import stanza
+
+ # Ensure you have the same TransGen class and other supporting functions from your previous implementation
+ class TransGen:
+     def __init__(self, translation_model="ai4bharat/indictrans2-indic-en-1B",
+                  stable_diff_model="stabilityai/stable-diffusion-2-base",
+                  src_lang='hin_Deva', tgt_lang='eng_Latn'):
+         # Same implementation as in your previous code
+         self.bnb_config = BitsAndBytesConfig(load_in_4bit=True)
+         self.tokenizer = AutoTokenizer.from_pretrained(translation_model, trust_remote_code=True)
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(translation_model, trust_remote_code=True, quantization_config=self.bnb_config)
+         self.ip = IndicProcessor(inference=True)
+         self.src_lang = src_lang
+         self.tgt_lang = tgt_lang
+
+         scheduler = EulerDiscreteScheduler.from_pretrained(stable_diff_model, subfolder="scheduler")
+         self.pipe = StableDiffusionPipeline.from_pretrained(stable_diff_model, scheduler=scheduler, torch_dtype=torch.bfloat16)
+         self.pipe = self.pipe.to("cuda")
+
+         self.img2img_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(stable_diff_model, torch_dtype=torch.float16)
+         self.img2img_pipe = self.img2img_pipe.to('cuda')
+
+     def translate(self, input_sentences):
+         # Same implementation as in your previous code
+         batch = self.ip.preprocess_batch(
+             input_sentences,
+             src_lang=self.src_lang,
+             tgt_lang=self.tgt_lang,
+         )
+         inputs = self.tokenizer(
+             batch,
+             truncation=True,
+             padding="longest",
+             return_tensors="pt",
+             return_attention_mask=True,
+         )
+
+         with torch.no_grad():
+             generated_tokens = self.model.generate(
+                 **inputs,
+                 use_cache=True,
+                 min_length=0,
+                 max_length=256,
+                 num_beams=5,
+                 num_return_sequences=1,
+             )
+
+         with self.tokenizer.as_target_tokenizer():
+             generated_tokens = self.tokenizer.batch_decode(
+                 generated_tokens.detach().cpu().tolist(),
+                 skip_special_tokens=True,
+                 clean_up_tokenization_spaces=True,
+             )
+
+         translations = self.ip.postprocess_batch(generated_tokens, lang=self.tgt_lang)
+
+         return translations
+
+     def generate_image(self, prompt, prev_image, strength=1.0, guidance_scale=7.5):
+         # Same implementation as in your previous code
+         strength = float(strength) if strength is not None else 1.0
+         guidance_scale = float(guidance_scale) if guidance_scale is not None else 7.5
+
+         strength = max(0.0, min(1.0, strength))
+
+         if prev_image is not None:
+             image = self.img2img_pipe(
+                 prompt,
+                 image=prev_image,
+                 strength=strength,
+                 guidance_scale=guidance_scale,
+                 negative_prompt='generate text in image'
+             ).images[0]
+             return image
+
+         image = self.pipe(prompt)
+         return image.images[0]
+
+     def run(self, input_sentences, strength, guidance_scale, prev_image=None):
+         # Same implementation as in your previous code
+         translations = self.translate(input_sentences)
+         sentence = translations[0]
+         image = self.generate_image(sentence, prev_image, strength, guidance_scale)
+         return sentence, image
+
+ # Initialize global variables
+ stanza.download('hi')
+ transgen = TransGen()
+
+ def transcribe_audio_to_hindi(audio_path: str) -> str:
+     # Same implementation as in your previous code
+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+     model_id = "openai/whisper-large-v3"
      model = AutoModelForSpeechSeq2Seq.from_pretrained(
+         model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
      )
      model.to(device)

      processor = AutoProcessor.from_pretrained(model_id)

+     whisper_pipe = pipeline(
          "automatic-speech-recognition",
          model=model,
          tokenizer=processor.tokenizer,
          feature_extractor=processor.feature_extractor,
          torch_dtype=torch_dtype,
          device=device,
+         model_kwargs={"language": "hi"}
      )

+     waveform, sample_rate = torchaudio.load(audio_path)
+
+     if sample_rate != 16000:
+         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+         waveform = resampler(waveform)
+
+     result = whisper_pipe(waveform.squeeze(0).cpu().numpy(), return_timestamps=True)
+     return result["text"]
+
+ nlp = stanza.Pipeline(lang='hi', processors='tokenize,pos')
+
+ def POS_policy(input):
+     # Same implementation as in your previous code
+     lst = input
+     doc = nlp(lst)
+     words = doc.sentences[-1].words
+     n = len(words)
+     i = n-1
+     while(i):
+         if words[i].upos == 'NOUN' or words[i].upos == 'VERB':
+             return i
+         else:
+             pass
+         i -= 1
+     return 0
+
+ def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
+     # Similar implementation with modifications for Streamlit
+     text_tot = transcribe_audio_to_hindi(audio_path)
+
+     st.write(f'Transcripted sentence: {text_tot}')
+
+     cur_sent = ''
+     prev_idx = 0
+     generated_images = []
+
+     for word in text_tot.split():
+         cur_sent += word + ' '
+
+         str_idx = POS_policy(cur_sent)
+
+         if str_idx != 0 and str_idx != prev_idx:
+             prev_idx = str_idx
+
+             sent, image = transgen.run(
+                 [cur_sent],
+                 base_strength,
+                 base_guidance_scale,
+                 image if 'image' in locals() else None
+             )
+
+             generated_images.append({
+                 'sentence': cur_sent,
+                 'image': image
+             })
+
+     return generated_images

+ def main():
+     st.title("Audio to Image Generation App")
+
+     # File uploader
+     uploaded_file = st.file_uploader("Choose a WAV audio file", type="wav")
+
+     # Strength and Guidance Scale sliders
+     base_strength = st.slider("Image Generation Strength", min_value=0.0, max_value=1.0, value=0.8, step=0.1)
+     base_guidance_scale = st.slider("Guidance Scale", min_value=1.0, max_value=20.0, value=12.0, step=0.5)
+
+     if uploaded_file is not None:
+         # Save the uploaded file temporarily
+         with open("temp_audio.wav", "wb") as f:
+             f.write(uploaded_file.getvalue())
+
+         # Generate images
+         st.write("Generating Images...")
+         generated_images = generate_images_from_audio("temp_audio.wav", base_strength, base_guidance_scale)
+
+         # Display generated images
+         st.write("Generated Images:")
+         for img_data in generated_images:
+             st.image(img_data['image'], caption=img_data['sentence'])
+
+ if __name__ == "__main__":
+     main()