Update app.py
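Bootstraps the IndicTrans2 and IndicTransToolkit repositories at startup (git clone plus sys.path append) so their modules can be imported, consolidates the transformers and diffusers imports, fills in TransGen.__init__'s model and language defaults, implements the NOUN/VERB scan in POS_policy, and defers stanza.download('hi') and TransGen construction to the point of use.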
app.py CHANGED
@@ -1,30 +1,50 @@
+import os
+import subprocess
+import sys
+
+# Clone required repositories
+def clone_repositories():
+    repos = [
+        ('https://github.com/AI4Bharat/IndicTrans2.git', 'indictrans2'),
+        ('https://github.com/VarunGumma/IndicTransToolkit.git', 'indictranstoolkit')
+    ]
+
+    for repo_url, repo_dir in repos:
+        if not os.path.exists(repo_dir):
+            subprocess.check_call(['git', 'clone', repo_url, repo_dir])
+        sys.path.append(os.path.abspath(repo_dir))
+
+# Clone repositories before importing
+clone_repositories()
+
 import streamlit as st
 import torch
 import librosa
 import matplotlib.pyplot as plt
 from PIL import Image
-import os
-
-# Import the required functions and classes from your previous code
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import torchaudio
-import torch
 from transformers import (
+    AutoModelForSpeechSeq2Seq,
+    AutoProcessor,
+    pipeline,
     AutoModelForSeq2SeqLM,
     AutoTokenizer,
+    BitsAndBytesConfig
 )
-from
-from transformers import BitsAndBytesConfig
-from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
-from diffusers import StableDiffusionImg2ImgPipeline
+from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler, StableDiffusionImg2ImgPipeline
 import stanza
+import numpy as np
+
+from indictranstoolkit import IndicProcessor

-# Ensure you have the same TransGen class and other supporting functions from your previous implementation
 class TransGen:
-    def __init__(
-
-
-
+    def __init__(
+        self,
+        translation_model="ai4bharat/indictrans2-indic-en-1B",
+        stable_diff_model="stabilityai/stable-diffusion-2-base",
+        src_lang='hin_Deva',
+        tgt_lang='eng_Latn'
+    ):
         self.bnb_config = BitsAndBytesConfig(load_in_4bit=True)
         self.tokenizer = AutoTokenizer.from_pretrained(translation_model, trust_remote_code=True)
         self.model = AutoModelForSeq2SeqLM.from_pretrained(translation_model, trust_remote_code=True, quantization_config=self.bnb_config)
@@ -40,7 +60,6 @@ class TransGen:
         self.img2img_pipe = self.img2img_pipe.to('cuda')

     def translate(self, input_sentences):
-        # Same implementation as in your previous code
         batch = self.ip.preprocess_batch(
             input_sentences,
             src_lang=self.src_lang,
@@ -72,11 +91,9 @@ class TransGen:
         )

         translations = self.ip.postprocess_batch(generated_tokens, lang=self.tgt_lang)
-
         return translations

     def generate_image(self, prompt, prev_image, strength=1.0, guidance_scale=7.5):
-        # Same implementation as in your previous code
         strength = float(strength) if strength is not None else 1.0
         guidance_scale = float(guidance_scale) if guidance_scale is not None else 7.5

@@ -96,18 +113,12 @@ class TransGen:
         return image.images[0]

     def run(self, input_sentences, strength, guidance_scale, prev_image=None):
-        # Same implementation as in your previous code
         translations = self.translate(input_sentences)
         sentence = translations[0]
         image = self.generate_image(sentence, prev_image, strength, guidance_scale)
         return sentence, image

-# Initialize global variables
-stanza.download('hi')
-transgen = TransGen()
-
 def transcribe_audio_to_hindi(audio_path: str) -> str:
-    # Same implementation as in your previous code
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

@@ -138,25 +149,23 @@ def transcribe_audio_to_hindi(audio_path: str) -> str:
     result = whisper_pipe(waveform.squeeze(0).cpu().numpy(), return_timestamps=True)
     return result["text"]

+# Download Stanza resources
+stanza.download('hi')
 nlp = stanza.Pipeline(lang='hi', processors='tokenize,pos')

-def POS_policy(
-
-    lst = input
-    doc = nlp(lst)
+def POS_policy(input_text):
+    doc = nlp(input_text)
     words = doc.sentences[-1].words
     n = len(words)
     i = n-1
-
-
+
+    while i >= 0:
+        if words[i].upos in ['NOUN', 'VERB']:
             return i
-        else:
-            pass
         i -= 1
     return 0

 def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
-    # Similar implementation with modifications for Streamlit
     text_tot = transcribe_audio_to_hindi(audio_path)

     st.write(f'Transcripted sentence: {text_tot}')
@@ -164,6 +173,7 @@ def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
     cur_sent = ''
     prev_idx = 0
     generated_images = []
+    transgen = TransGen()

     for word in text_tot.split():
         cur_sent += word + ' '
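Note: the hunks above only show the edges of generate_images_from_audio, so the loop body that ties POS_policy to image generation is not part of the diff. A minimal sketch of how the visible pieces plug together — the regeneration condition, the prev_image threading, and the st.image call are assumptions, not code from this commit:

    def generate_images_from_audio(audio_path, base_strength=0.8, base_guidance_scale=12):
        text_tot = transcribe_audio_to_hindi(audio_path)
        st.write(f'Transcripted sentence: {text_tot}')

        cur_sent = ''
        prev_idx = 0
        generated_images = []
        transgen = TransGen()  # per this commit: constructed here, not at import time
        prev_image = None

        for word in text_tot.split():
            cur_sent += word + ' '
            idx = POS_policy(cur_sent)      # index of the last NOUN/VERB, 0 if none
            if idx and idx != prev_idx:     # assumption: regenerate only when a new content word appears
                prev_idx = idx
                sentence, prev_image = transgen.run(
                    [cur_sent], base_strength, base_guidance_scale, prev_image
                )
                generated_images.append(prev_image)
                st.image(prev_image, caption=sentence)  # assumption: show each frame as it is produced
        return generated_images

One import worth double-checking: the repository is cloned into indictranstoolkit, but the package inside VarunGumma/IndicTransToolkit is conventionally imported as IndicTransToolkit, so the lowercase `from indictranstoolkit import IndicProcessor` may need to be adjusted to match the actual package name.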