histlearn committed on
Commit
75e6a72
·
verified ·
1 Parent(s): b5b5620

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +259 -0
app.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re

import gradio as gr
import nltk.tree
import requests
import spacy
import torch
from gtts import gTTS
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, MarianMTModel, MarianTokenizer
10
+
11
# Load spaCy's Portuguese model
nlp = spacy.load("pt_core_news_sm")

# LX-Parser API key. The LX_PARSER_KEY environment variable takes precedence
# so the credential can be changed without editing the code; the literal is
# the previously committed key, kept only as a fallback so existing
# deployments keep working.
# NOTE(review): a key committed to version control should be treated as
# exposed - rotate it and drop the fallback once the variable is configured.
key = os.environ.get("LX_PARSER_KEY", "eb159d39469d84f0ff47167a4d89cada")
16
+
17
+ # Funções de manipulação gramatical
18
def invert_adj_n(doc, tags):
    """Return the token texts of *doc* with adjective+noun pairs swapped.

    A token tagged ``"A"`` that is immediately followed by a token tagged
    ``"N"`` is emitted after that noun instead of before it. Tokens whose
    ``tag_`` is ``"PUNCT"`` are never treated as the adjective of a pair.

    Args:
        doc: Indexable sequence of tokens exposing ``.text`` and ``.tag_``.
        tags: Sequence of tag strings, positionally aligned with *doc*.
            It may be shorter or longer than *doc*; positions without a
            matching tag or token are simply not swapped.

    Returns:
        A new list with the token texts in the adjusted order.
    """
    frase = []
    n_tokens = len(doc)
    n_tags = len(tags)
    skip_next = False
    for i, token in enumerate(doc):
        if skip_next:
            # This token was already emitted as the noun of the previous pair.
            skip_next = False
            continue
        # Both sequences must have a successor: the tags come from a
        # different tokenizer than the tokens, so their lengths can differ.
        is_pair = (
            token.tag_ != "PUNCT"
            and i + 1 < n_tags
            and i + 1 < n_tokens
            and tags[i] == "A"
            and tags[i + 1] == "N"
        )
        if is_pair:
            frase.append(doc[i + 1].text)
            frase.append(token.text)
            skip_next = True
        else:
            frase.append(token.text)
    return frase
38
+
39
def adjust_adj(doc, tags):
    """Return the token texts of *doc* with "e" inserted between adjectives.

    Whenever two consecutive positions are both tagged ``"A"``, the
    conjunction ``"e"`` is inserted after the first one.

    Args:
        doc: Iterable of tokens exposing ``.text``.
        tags: Sequence of tag strings, positionally aligned with *doc*.
            If it is shorter than *doc*, the untagged tail is copied as is.

    Returns:
        A new list of token texts, including any inserted ``"e"``.
    """
    frase = []
    n_tags = len(tags)
    for i, token in enumerate(doc):
        frase.append(token.text)
        # i + 1 < n_tags also guarantees tags[i] exists.
        if i + 1 < n_tags and tags[i] == "A" and tags[i + 1] == "A":
            frase.append("e")
    return frase
47
+
48
def adjust_art(doc, tags):
    """Return the token texts of *doc* with the article "a" made to agree.

    A token tagged ``"ART"`` whose text is "a" (case-insensitive) is replaced
    according to the ``Gender``/``Number`` morphology of the token that
    follows it:

    * masculine singular -> ``"um"``
    * feminine singular  -> ``"uma"``
    * masculine, not singular -> ``"os"``
    * anything else -> ``"as"``

    The article is kept unchanged when it is the last token or when the next
    token has no gender or no number information.

    Args:
        doc: Indexable sequence of tokens exposing ``.text`` and
            ``.morph.get(name)`` (returning a possibly empty list).
        tags: Sequence of tag strings, positionally aligned with *doc*.
            Tokens beyond the end of *tags* are copied unchanged.

    Returns:
        A new list of token texts.
    """
    frase = []
    n_tokens = len(doc)
    n_tags = len(tags)
    for i, token in enumerate(doc):
        text = token.text
        is_article_a = i < n_tags and tags[i] == "ART" and text.lower() == "a"
        if not is_article_a or i + 1 >= n_tokens:
            frase.append(text)
            continue

        next_morph = doc[i + 1].morph
        gender = next_morph.get("Gender")
        number = next_morph.get("Number")
        if not (gender and number):
            # Not enough information to pick an agreeing article.
            frase.append(text)
            continue

        singular = number[0] == "Sing"
        if gender[0] == "Masc":
            frase.append("um" if singular else "os")
        elif gender[0] == "Fem" and singular:
            frase.append("uma")
        else:
            frase.append("as")
    return frase
76
+
77
def create_sentence(doc, tags, frase):
    """Capitalize sentence starts and attach punctuation to the previous word.

    Walks *frase* in parallel with *doc*. A word whose token has a truthy
    ``is_sent_start`` is capitalized. A token whose ``tag_`` is ``"PUNCT"``
    has its text appended to the previously emitted word and is not emitted
    as a separate element; a punctuation token with no preceding word is
    kept as is.

    Args:
        doc: Indexable sequence of tokens exposing ``.text``, ``.tag_`` and
            ``.is_sent_start``, positionally aligned with *frase*.
        tags: Unused; kept so existing callers do not have to change.
        frase: List of word strings. It is not modified.

    Returns:
        A new list of strings with capitalization and punctuation applied.
        Words beyond the end of *doc* are copied unchanged.
    """
    n_tokens = len(doc)
    resultado = []
    for i, palavra in enumerate(frase):
        if i >= n_tokens:
            # No token information for this word; keep it untouched.
            resultado.append(palavra)
            continue
        token = doc[i]
        if token.is_sent_start:
            palavra = palavra.capitalize()
        if token.tag_ == "PUNCT" and resultado:
            # Glue the mark to the previous word instead of emitting it as a
            # separate element, which would duplicate it in the output.
            resultado[-1] += token.text
        else:
            resultado.append(palavra)
    return resultado
86
+
87
def get_productions(texto, timeout=30):
    """Parse *texto* with the LX-Parser web API and return its lexical productions.

    Sends a JSON-RPC 2.0 ``parse`` request (output format ``parentheses``,
    authenticated with the module-level ``key``) and reads the returned
    bracketed tree with ``nltk.tree.Tree.fromstring``.

    Args:
        texto: The text to parse.
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        A list with the string form of every production of the tree that
        contains a single-quoted span. Returns ``[]`` (after printing the
        error) when the response carries an ``"error"`` member.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    url = "https://portulanclarin.net/workbench/lx-parser/api/"
    request_data = {
        'method': 'parse',
        'jsonrpc': '2.0',
        'id': 0,
        'params': {
            'text': texto,
            'format': 'parentheses',
            'key': key,
        },
    }
    response = requests.post(url, json=request_data, timeout=timeout)
    response_data = response.json()
    if "error" in response_data:
        print("Error:", response_data["error"])
        return []

    tree = nltk.tree.Tree.fromstring(response_data["result"])
    # Keep only the productions whose text contains a quoted span.
    quoted = re.compile(r"'.*'")
    return [
        str(production)
        for production in tree.productions()
        if quoted.search(str(production))
    ]
113
+
114
def get_tags(productions):
    """Extract the left-hand-side tag of each production.

    For a string such as ``"N -> 'carro'"`` the tag is the text before
    ``" ->"``; a string without an arrow is used whole. Non-string items are
    passed through unchanged. Any resulting tag that contains an apostrophe
    is discarded.

    Args:
        productions: Iterable of production strings (or other items).

    Returns:
        A new list of tags, in the order of *productions*.
    """
    tags = []
    for item in productions:
        if isinstance(item, str):
            # partition() returns the whole string when the arrow is missing,
            # unlike slicing with find(), which would drop the last character.
            item = item.partition(' ->')[0]
        # Filter while building: removing from a list during iteration over
        # it skips the element that follows each removal.
        if "'" not in item:
            tags.append(item)
    return tags
125
+
126
def _analisar(texto):
    """Return ``(tags, doc)`` for *texto*: LX-Parser tags and the spaCy doc."""
    return get_tags(get_productions(texto)), nlp(texto)


def _estabilizar(transformar, sentenca, doc, tags, max_rodadas=10):
    """Apply *transformar* repeatedly until the sentence stops changing.

    *transformar* is called as ``transformar(doc, tags)`` and must return a
    list of words. Returns ``(sentenca, doc, tags)`` for the last sentence
    produced. The number of rounds is capped because every round that
    changes the sentence costs one parser request, and a transformation
    that keeps changing its own output would otherwise never terminate.
    """
    for _ in range(max_rodadas):
        nova_sentenca = ' '.join(transformar(doc, tags))
        if nova_sentenca == sentenca:
            break
        sentenca = nova_sentenca
        tags, doc = _analisar(sentenca)
    return sentenca, doc, tags


def reordenar_sentenca(sentenca):
    """Rewrite a sentence into a more natural Portuguese word order.

    Steps, in order:

    1. Lowercase the sentence and parse it; if the first tag is not
       ``"ART"``, prefix the sentence with ``"A "`` and parse again.
    2. If the second and third tags are both ``"N"``, swap those two words
       and put ``"de"`` between them.
    3. Apply ``invert_adj_n`` until the sentence is stable.
    4. Apply ``adjust_adj`` until the sentence is stable.
    5. Apply ``adjust_art`` once, then ``create_sentence``.

    Args:
        sentenca: The sentence to rewrite.

    Returns:
        The rewritten sentence. A blank input is returned unchanged. If the
        parser yields no tags for the initial sentence, the lowercased,
        stripped sentence (with the ``"A "`` prefix if it had already been
        added) is returned instead of raising ``IndexError``.
    """
    if not sentenca.strip():
        return sentenca

    sentenca = sentenca.lower()
    tags, doc = _analisar(sentenca)
    if not tags:
        # The parser reported an error or produced nothing usable.
        return sentenca.strip()

    if tags[0] != "ART":
        sentenca = "A " + sentenca.strip()
        tags, doc = _analisar(sentenca)
        if not tags:
            return sentenca.strip()

    palavras = sentenca.split()
    if len(tags) > 2 and tags[1] == "N" and tags[2] == "N" and len(palavras) > 2:
        # [art, n1, n2, ...] -> [art, n2, "de", n1, ...]
        palavras[1:3] = [palavras[2], "de", palavras[1]]
        sentenca = " ".join(palavras)
        tags, doc = _analisar(sentenca)

    sentenca, doc, tags = _estabilizar(invert_adj_n, sentenca, doc, tags)
    sentenca, doc, tags = _estabilizar(adjust_adj, sentenca, doc, tags)

    frase = adjust_art(doc, tags)
    # Re-analyse so the tokens passed on line up with the adjusted words.
    tags, doc = _analisar(' '.join(frase))
    frase = create_sentence(doc, tags, frase)
    return ' '.join(frase).strip()
191
+
192
# Select the device (GPU when available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the image-to-text model with its processor and move it to the device
processor = AutoProcessor.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic")
model = AutoModelForCausalLM.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic")
model.to(device)

# Load the Marian translation model with its tokenizer and move it to the device
translation_model_name = 'Helsinki-NLP/opus-mt-tc-big-en-pt'
translation_tokenizer = MarianTokenizer.from_pretrained(translation_model_name)
translation_model = MarianMTModel.from_pretrained(translation_model_name)
translation_model.to(device)
203
+
204
+ # Funções auxiliares
205
def prepare_image(image_path):
    """Load an image file and preprocess it with the module-level processor.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A tuple ``(image, pixel_values)``: the RGB ``PIL.Image`` and the
        ``pixel_values`` produced by ``processor``, moved to ``device``.
    """
    # convert() returns a separate image, so the source file can be closed
    # as soon as the conversion is done instead of waiting for garbage
    # collection.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    return image, inputs.pixel_values
209
+
210
def generate_caption(pixel_values):
    """Generate a caption for preprocessed image data.

    Puts the module-level ``model`` in eval mode, runs beam-search
    generation with gradients disabled and decodes the result with
    ``processor``.

    Args:
        pixel_values: Preprocessed image input, as returned by
            ``prepare_image``.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    generation_options = {
        "max_length": 50,
        "num_beams": 4,
        "early_stopping": True,
        "no_repeat_ngram_size": 2,
    }
    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(pixel_values=pixel_values, **generation_options)
    captions = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return captions[0]
221
+
222
def translate_to_portuguese(text):
    """Translate *text* with the module-level Marian translation model.

    Args:
        text: The text to translate; it is truncated by the tokenizer if it
            is too long.

    Returns:
        The first decoded translation, with special tokens removed.
    """
    encoded = translation_tokenizer(text, return_tensors="pt", truncation=True)
    encoded = encoded.to(device)
    translated_ids = translation_model.generate(
        encoded["input_ids"],
        max_length=50,
        num_beams=4,
        early_stopping=True,
    )
    translations = translation_tokenizer.batch_decode(
        translated_ids, skip_special_tokens=True
    )
    return translations[0]
226
+
227
def text_to_speech_gtts(text, lang='pt'):
    """Synthesize *text* with gTTS and save it as an MP3 file.

    Args:
        text: The text to speak.
        lang: Language code passed to gTTS.

    Returns:
        The path of the written file, always ``"output.mp3"``.
    """
    output_path = "output.mp3"
    gTTS(text=text, lang=lang).save(output_path)
    return output_path
231
+
232
+ # Função principal para processar a imagem e gerar a voz
233
def process_image(image):
    """Caption an image, translate and reorder the caption, and speak it.

    Args:
        image: Path of the image file (the Gradio input is configured with
            ``type="filepath"``).

    Returns:
        A tuple ``(caption_pt, audio_file)``: the final caption and the path
        of the generated audio file.
    """
    pixel_values = prepare_image(image)[1]
    english_caption = generate_caption(pixel_values)
    translated_caption = translate_to_portuguese(english_caption)
    final_caption = reordenar_sentenca(translated_caption)
    return final_caption, text_to_speech_gtts(final_caption)
240
+
241
# Paths of the example images offered in the interface
example_image_paths = [
    "main/example1.png",
    "main/example2.png",
    "main/example3.png",
]

# Gradio interface: one image in, caption text and audio file out
iface = gr.Interface(
    title="Image to Voice",
    description="Gera uma descrição em português e a converte em voz a partir de uma imagem.",
    fn=process_image,
    inputs=gr.Image(type="filepath"),
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
    examples=example_image_paths,
)

# Start the app only when this file is run as a script
if __name__ == "__main__":
    iface.launch()