histlearn commited on
Commit
6a562d1
·
verified ·
1 Parent(s): 6b05787

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +253 -0
app.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import re
import tempfile

import gradio as gr
import nltk.tree
import requests
import spacy
import torch
from gtts import gTTS
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
11
+
12
# Download the Portuguese spaCy model at startup.
# NOTE(review): os.system at import time is best-effort; spacy.load below
# will raise if the download failed.
os.system("python -m spacy download pt_core_news_sm")

# Load the Portuguese spaCy pipeline used for tagging/morphology.
nlp = spacy.load("pt_core_news_sm")

# API key for the remote LX-Parser service.
# SECURITY: hard-coded credential checked into source — move to an env var.
key = "eb159d39469d84f0ff47167a4d89cada"
20
+
21
+ # Funções de manipulação gramatical
22
+ def invert_adj_n(doc, tags):
23
+ frase = []
24
+ already = False
25
+ for i in range(len(doc)):
26
+ if already:
27
+ already = False
28
+ continue
29
+ if doc[i].tag_ != "PUNCT":
30
+ if tags[i] == "A":
31
+ if i + 1 < len(tags) and tags[i + 1] == "N":
32
+ frase.append(doc[i + 1].text)
33
+ frase.append(doc[i].text)
34
+ already = True
35
+ else:
36
+ frase.append(doc[i].text)
37
+ else:
38
+ frase.append(doc[i].text)
39
+ else:
40
+ frase.append(doc[i].text)
41
+ return frase
42
+
43
def adjust_adj(doc, tags):
    """Insert the conjunction "e" between two consecutive adjectives.

    Args:
        doc: sequence of spaCy-like tokens (``.text``).
        tags: constituency tags from the LX-Parser, aligned with ``doc``.

    Returns:
        List of token texts with "e" inserted after the first of each
        adjacent adjective pair.

    Indexing into ``tags`` is bounds-checked because the remote parser's
    tag list can be shorter than the token list (the original raised
    IndexError in that case).
    """
    frase = []
    for i in range(len(doc)):
        frase.append(doc[i].text)
        if i < len(tags) and tags[i] == "A" and i + 1 < len(tags) and tags[i + 1] == "A":
            frase.append("e")
    return frase
51
+
52
def adjust_art(doc, tags):
    """Replace the placeholder article "a" with one agreeing in gender/number.

    Args:
        doc: sequence of spaCy-like tokens (``.text``, ``.morph``).
        tags: constituency tags from the LX-Parser, aligned with ``doc``.

    Returns:
        List of token texts, with each article "a" rewritten to
        "um"/"uma"/"os"/"as" based on the NEXT token's Gender/Number morphs.

    Changes vs. the original: the ``already``-skip machinery was dead code
    (the flag was never set), so it is removed; ``tags[i]`` is now
    bounds-checked against parser/token length mismatches.
    """
    frase = []
    for i in range(len(doc)):
        text = doc[i].text
        is_placeholder_art = i < len(tags) and tags[i] == "ART" and text.lower() == "a"
        if is_placeholder_art and i + 1 < len(doc):
            gender = doc[i + 1].morph.get("Gender")
            number = doc[i + 1].morph.get("Number")
            if gender and number:
                if gender[0] == "Masc" and number[0] == "Sing":
                    frase.append("um")
                elif gender[0] == "Fem" and number[0] == "Sing":
                    frase.append("uma")
                elif gender[0] == "Masc" and number[0] != "Sing":
                    frase.append("os")
                else:
                    frase.append("as")
            else:
                # Morphology unavailable: keep the article unchanged.
                frase.append(text)
        else:
            frase.append(text)
    return frase
80
+
81
def create_sentence(doc, tags, frase):
    """Capitalize sentence starts and glue punctuation onto the previous word.

    Args:
        doc: sequence of spaCy-like tokens (``.text``, ``.tag_``,
            ``.is_sent_start``), index-aligned with ``frase``.
        tags: unused; kept for interface compatibility with its callers.
        frase: list of word strings to finalize.

    Returns:
        A NEW list (``frase`` is no longer mutated in place).

    Fixes vs. the original: punctuation was appended to the previous word
    but ALSO kept as its own element, duplicating it in the joined output;
    ``tmp[i - 1]`` underflowed to the last element when the first token was
    punctuation; indexing is now guarded against doc/frase length mismatch.
    """
    tmp = list(frase)  # copy so the caller's list is left untouched
    for i in range(len(doc)):
        if i >= len(tmp):
            break  # parser pipeline can make the lists diverge in length
        text = doc[i].text
        if doc[i].is_sent_start:
            tmp[i] = tmp[i].capitalize()
        if doc[i].tag_ == "PUNCT" and i > 0:
            # Attach punctuation to the preceding word and blank its slot.
            tmp[i - 1] += text
            tmp[i] = ""
    return [w for w in tmp if w]
90
+
91
def get_productions(texto):
    """Parse ``texto`` with the remote LX-Parser and return lexical productions.

    Args:
        texto: sentence to parse (Portuguese).

    Returns:
        List of production strings whose right-hand side is a quoted
        terminal (e.g. ``"N -> 'gato'"``); ``[]`` on any API/network error.

    Fixes vs. the original: no longer shadows the builtin ``format``; the
    POST has a timeout so a dead service cannot hang the app; network and
    non-JSON failures degrade to ``[]`` like the API-level error branch.
    """
    url = "https://portulanclarin.net/workbench/lx-parser/api/"
    request_data = {
        'method': 'parse',
        'jsonrpc': '2.0',
        'id': 0,
        'params': {
            'text': texto,
            'format': 'parentheses',  # bracketed tree, readable by nltk
            'key': key,
        },
    }
    try:
        response = requests.post(url, json=request_data, timeout=30)
        response_data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print("Error:", exc)
        return []
    if "error" in response_data:
        print("Error:", response_data["error"])
        return []
    result = response_data["result"]
    productions = []
    tree = nltk.tree.Tree.fromstring(result)
    for tag in tree.productions():
        # Keep only lexical productions, i.e. those with a quoted terminal.
        if re.findall(r"'.*'", str(tag)):
            productions.append(str(tag))
    return productions
117
+
118
def get_tags(productions):
    """Extract the left-hand-side tag from each production string.

    Args:
        productions: items like ``"N -> 'gato'"`` (non-strings pass through).

    Returns:
        List of tags (text before ``" ->"``), with any entry still
        containing a quote character dropped.

    Fixes vs. the original: it called ``tags.remove(item)`` while iterating
    ``tags``, which skips the element after each removal; it also applied
    ``"'" in item`` to non-string entries (TypeError). Filtering is now
    done with a comprehension over the finished list.
    """
    tags = []
    for item in productions:
        if isinstance(item, str):
            # "N -> 'gato'" -> "N"
            tags.append(item[:item.find(' ->')])
        else:
            tags.append(item)
    return [t for t in tags if not (isinstance(t, str) and "'" in t)]
129
+
130
def reordenar_sentenca(sentenca):
    """Normalize a model-generated caption into grammatical Portuguese.

    Pipeline: lowercase -> ensure a leading article -> "N N" to "N de N" ->
    adjective/noun inversion to a fixed point -> adjective coordination to a
    fixed point -> article agreement -> capitalization/punctuation.

    Args:
        sentenca: raw caption text.

    Returns:
        The reordered sentence, or the (stripped) input when the remote
        parser yields nothing.

    Fix vs. the original: ``tags[0]`` was indexed BEFORE the emptiness check
    (``if not sentence`` came later), so any parser/API failure raised
    IndexError; the guard now runs first. Unused locals removed.
    """
    if not sentenca.strip():
        return sentenca
    sentenca = sentenca.lower()
    sentence = get_productions(sentenca)
    tags = get_tags(sentence)
    # Bail out before indexing tags: the parser API can fail and return [].
    if not sentence or not tags:
        return sentenca.strip()
    doc = nlp(sentenca)
    if tags[0] != "ART":
        # Captions are expected to start with an article; prepend and re-parse.
        sentenca = "A " + sentenca.strip()
        sentence = get_productions(sentenca)
        tags = get_tags(sentence)
        doc = nlp(sentenca)
    if len(tags) > 2 and tags[1] == "N" and tags[2] == "N":
        # "ART N1 N2" -> "ART N2 de N1" (possessive-style rewrite).
        aux = sentenca.split()
        tmp = aux[1]
        aux[1] = aux[2]
        aux.insert(2, "de")
        aux[3] = tmp
        sentenca = " ".join(aux)
        sentence = get_productions(sentenca)
        tags = get_tags(sentence)
        doc = nlp(sentenca)
    # Adjective/noun inversion, iterated until the sentence stops changing.
    frase = invert_adj_n(list(doc), tags)
    nova_sentenca = ' '.join(frase)
    productions = get_productions(nova_sentenca)
    tags = get_tags(productions)
    doc = nlp(nova_sentenca)
    while nova_sentenca != sentenca:
        frase = invert_adj_n(doc, tags)
        sentenca = nova_sentenca
        nova_sentenca = ' '.join(frase)
        productions = get_productions(nova_sentenca)
        tags = get_tags(productions)
        doc = nlp(nova_sentenca)
    # Adjective coordination ("bonito grande" -> "bonito e grande"), iterated.
    frase = adjust_adj(doc, tags)
    nova_sentenca = ' '.join(frase)
    productions = get_productions(nova_sentenca)
    tags = get_tags(productions)
    doc = nlp(nova_sentenca)
    while nova_sentenca != sentenca:
        frase = adjust_adj(doc, tags)
        sentenca = nova_sentenca
        nova_sentenca = ' '.join(frase)
        productions = get_productions(nova_sentenca)
        tags = get_tags(productions)
        doc = nlp(nova_sentenca)
    # Article agreement, then capitalization and punctuation attachment.
    frase = adjust_art(doc, tags)
    sentenca = ' '.join(frase)
    productions = get_productions(sentenca)
    tags = get_tags(productions)
    doc = nlp(sentenca)
    frase = create_sentence(doc, tags, frase)
    return " ".join(frase).strip()
195
+
196
# Load the image-captioning model and its processor from the Hub.
processor = AutoProcessor.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic")
model = AutoModelForCausalLM.from_pretrained("histlearn/microsoft-git-portuguese-neuro-simbolic")

# Select the device (GPU if available, otherwise CPU) and move the model.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
203
+
204
# Helper functions
def prepare_image(image_path):
    """Load an image from disk and preprocess it for the captioning model.

    Args:
        image_path: filesystem path to the image.

    Returns:
        Tuple of (RGB PIL image, ``pixel_values`` tensor on the global
        ``device``).

    Fix vs. the original: ``Image.open`` leaked its file handle until
    garbage collection; the context manager closes it deterministically
    (``convert`` materializes a new, independent image first).
    """
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    return image, inputs.pixel_values
209
+
210
def generate_caption(pixel_values):
    """Generate a caption for preprocessed image pixels.

    Args:
        pixel_values: processor output tensor for one image.

    Returns:
        The decoded caption string (first beam-search hypothesis).
    """
    model.eval()
    # Beam-search settings kept identical to the deployed configuration.
    generation_kwargs = dict(
        pixel_values=pixel_values,
        max_length=50,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )
    with torch.no_grad():
        generated_ids = model.generate(**generation_kwargs)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]
221
+
222
def text_to_speech_gtts(text, lang='pt'):
    """Synthesize ``text`` to speech and return the path of the saved MP3.

    Args:
        text: text to speak.
        lang: gTTS language code (default Portuguese).

    Returns:
        Path to a uniquely named MP3 file.

    Fix vs. the original: it always wrote to the fixed path "output.mp3",
    so concurrent Gradio requests clobbered each other's audio; a unique
    temp file removes the race.
    """
    tts = gTTS(text=text, lang=lang)
    fd, path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)  # gTTS reopens the file by path; close our handle first
    tts.save(path)
    return path
226
+
227
# Main pipeline: image -> caption -> grammatical reordering -> speech
def process_image(image):
    """Caption ``image`` in Portuguese and synthesize the caption as audio.

    Args:
        image: filesystem path to the uploaded image (Gradio ``filepath``).

    Returns:
        Tuple of (final caption text, path to the generated audio file).
    """
    _, pixel_values = prepare_image(image)
    raw_caption = generate_caption(pixel_values)
    final_caption = reordenar_sentenca(raw_caption)
    audio_path = text_to_speech_gtts(final_caption)
    return final_caption, audio_path
234
+
235
# Paths to the example images shown in the Gradio UI
example_image_paths = [
    "main/example1.jpeg",
    "main/example2.jpeg",
    "main/example3.jpeg"
]

# Gradio interface: image in, caption text + synthesized audio out
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="filepath"),
    outputs=[gr.Textbox(), gr.Audio(type="filepath")],
    examples=example_image_paths,
    title="Image to Voice",
    description="Gera uma descrição em português e a converte em voz a partir de uma imagem."
)

if __name__ == "__main__":
    iface.launch()