nktssk committed (verified)
Commit 2d45773 · Parent(s): 06ac09f

Update app.py

Files changed (1): app.py +215 -33

app.py CHANGED
@@ -1,45 +1,227 @@
 import os
-import gradio as gr
 import torch
-import torch.nn.functional as F
-import numpy as np
 from PIL import Image
 from transformers import pipeline
 
-depth_estimator = pipeline(task="depth-estimation", model="Intel/dpt-hybrid-midas")
 
-def launch(input_image):
-    out = depth_estimator(input_image)
 
-    predicted_depth = torch.tensor(out["predicted_depth"])
-
-    if len(predicted_depth.shape) == 2:  # if the prediction is 2-D, add batch and channel axes
-        predicted_depth = predicted_depth.unsqueeze(0).unsqueeze(0)
-
-    prediction = F.interpolate(
-        predicted_depth,
-        size=input_image.size[::-1],  # PIL size is (width, height); reversed to (height, width)
-        mode="bicubic",
-        align_corners=False,
-    )
-
-    output = prediction.squeeze().numpy()
-    formatted = (output * 255 / np.max(output)).astype("uint8")
-    depth = Image.fromarray(formatted)
-    return depth
-
-iface = gr.Interface(
-    launch,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Image(type="pil"),
 )
 
-demo = gr.Blocks()
 
-with demo:
-    gr.TabbedInterface(
-        [iface],
-        ["Depth Estimation Interface"],
-    )
 
 demo.launch(debug=True)
 
 import os
+import io
 import torch
+import gradio as gr
+import wikipediaapi
+import re
+import inflect
+import soundfile as sf
+import unicodedata
+import num2words
 from PIL import Image
+from datasets import load_dataset
+from scipy.io.wavfile import write
+
+from transformers import VitsModel, AutoTokenizer
 from transformers import pipeline
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
+from google.cloud import vision
 
+from transformers import CLIPProcessor, CLIPModel
 
+########################################
+# (Optional) Set the Google Cloud credentials environment variable:
+# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service_account.json"
+########################################
+
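+# Text-cleaning helpers, applied to Wikipedia summaries before they are
+# summarized and passed to the TTS model.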
+def clean_text(text):
+    # Strip some special characters, references, and diacritics
+    text = re.sub(r'МФА:?\s?\[.*?\]', '', text)
+    text = re.sub(r'\[.*?\]', '', text)
+
+    def remove_diacritics(char):
+        if unicodedata.category(char) == 'Mn':
+            return ''
+        return char
+
+    text = unicodedata.normalize('NFD', text)
+    text = ''.join(remove_diacritics(char) for char in text)
+    text = unicodedata.normalize('NFC', text)
+
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'[^\w\s.,!?-]', '', text)
+
+    return text.strip()
+
+from num2words import num2words
+
+def number_to_russian_text(number):
+    try:
+        return num2words(number, lang='ru')
+    except NotImplementedError:
+        return "Ошибка: Не поддерживается преобразование для этого числа."
+
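+# Models and data loaded at startup: BART summarizer, English Wikipedia client,
+# CMU-Arctic x-vector dataset, Russian MMS-TTS pipeline, and an EN->RU translator.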
+summarization_model = pipeline("summarization", model="facebook/bart-large-cnn")
+
+wiki = wikipediaapi.Wikipedia("Nikita", "en")
+
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+
+t2s_pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus")
+
+translator = pipeline("translation_en_to_ru", model="Helsinki-NLP/opus-mt-en-ru")
+
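+# Synthesize speech with the Russian MMS-TTS (VITS) model and save it as a WAV file.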
+def text_to_speech(text, output_path="speech.wav"):
+    model = VitsModel.from_pretrained("facebook/mms-tts-rus")
+    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")
+
+    inputs = tokenizer(text, return_tensors="pt")
+
+    with torch.no_grad():
+        output = model(**inputs).waveform.squeeze().numpy()
+
+    sf.write(output_path, output, samplerate=model.config.sampling_rate)
+
+    return output_path
+
+def fetch_wikipedia_summary(landmark):
+    page = wiki.page(landmark)
+    if page.exists():
+        return clean_text(page.summary)
+    else:
+        return "Found error!"
+
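+# Google Cloud Vision path: detect a landmark in the photo, fetch and summarize
+# its Wikipedia page, and convert the summary to speech.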
+def recognize_landmark_google_cloud(image):
+    client = vision.ImageAnnotatorClient()
+
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(image)
+
+    img_bytes = io.BytesIO()
+    image.save(img_bytes, format='PNG')
+    content = img_bytes.getvalue()
+    vision_image = vision.Image(content=content)
+
+    response = client.landmark_detection(image=vision_image)
+    landmarks = response.landmark_annotations
+    if landmarks:
+        return landmarks[0].description
+    else:
+        return "Unknown"
+
+def tourist_helper_english(landmark):
+    wiki_text = fetch_wikipedia_summary(landmark)
+    if wiki_text == "Found error!":
+        return None
+
+    summarized_text = summarization_model(wiki_text, min_length=20, max_length=210)[0]["summary_text"]
+    audio_path = text_to_speech(summarized_text)
+    return audio_path
+
+def process_image_google_cloud(image):
+    recognized = recognize_landmark_google_cloud(image)
+    print(f"[GoogleVision] Recognized: {recognized}")
+    audio_path = tourist_helper_english(recognized)
+    return audio_path
+
+def process_text_google_cloud(landmark):
+    return tourist_helper_english(landmark)
+
+
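+# CLIP path: zero-shot landmark recognition by comparing the image embedding
+# with precomputed text embeddings of the candidate landmark names.
+# NOTE: landmark_titles is assumed to be defined with the candidate names;
+# the short list below is only an illustrative placeholder.
+landmark_titles = ["Eiffel Tower", "Statue of Liberty", "Big Ben"]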
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+text_inputs = clip_processor(
+    text=landmark_titles,
+    images=None,
+    return_tensors="pt",
+    padding=True
 )
+with torch.no_grad():
+    text_embeds = clip_model.get_text_features(**text_inputs)
+    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+def recognize_landmark_clip(image):
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(image)
+
+    image_inputs = clip_processor(images=image, return_tensors="pt")
+    with torch.no_grad():
+        image_embed = clip_model.get_image_features(**image_inputs)
+        image_embed = image_embed / image_embed.norm(p=2, dim=-1, keepdim=True)
+
+    similarity = (image_embed @ text_embeds.T).squeeze(0)
+    best_idx = similarity.argmax().item()
+    best_score = similarity[best_idx].item()
+    recognized_landmark = landmark_titles[best_idx]
+    return recognized_landmark, best_score
+
+def tourist_helper_with_russian(landmark):
+    wiki_text = fetch_wikipedia_summary(landmark)
+    if wiki_text == "Found error!":
+        return None
+
+    print(wiki_text)
+    summarized_text = summarization_model(wiki_text, min_length=20, max_length=210)[0]["summary_text"]
+    print(summarized_text)
+
+    translated = translator(summarized_text, max_length=1000)[0]["translation_text"]
+    print(translated)
+
+    audio_path = text_to_speech(translated)
+    return audio_path
+
+def process_image_clip(image):
+    recognized, score = recognize_landmark_clip(image)
+    print(f"[CLIP] Recognized: {recognized}, score={score:.2f}")
+    audio_path = tourist_helper_with_russian(recognized)
+    return audio_path
+
+def process_text_clip(landmark):
+    return tourist_helper_with_russian(landmark)
+
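+# Gradio UI: two tabs, one for the CLIP + translation pipeline and one for the
+# Google Cloud Vision pipeline; each accepts an uploaded photo or a typed landmark name.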
+with gr.Blocks() as demo:
+    gr.Markdown("## Две демки: Google Cloud Vision и CLIP (с переводом на русский)")
+
+    with gr.Tabs():
+        with gr.Tab("CLIP + Sum + Translate + T2S"):
+            gr.Markdown("### Распознавание (CLIP) и перевод на русский")
+
+            with gr.Row():
+                image_input_c = gr.Image(label="Загрузите фото", type="pil")
+                text_input_c = gr.Textbox(label="Или введите название")
+
+            audio_output_c = gr.Audio(label="Результат")
+
+            with gr.Row():
+                btn_recognize_c = gr.Button("Распознать и перевести на русский")
+                btn_text_c = gr.Button("Поиск по тексту")
+
+            btn_recognize_c.click(
+                fn=process_image_clip,
+                inputs=image_input_c,
+                outputs=audio_output_c
+            )
+            btn_text_c.click(
+                fn=process_text_clip,
+                inputs=text_input_c,
+                outputs=audio_output_c
+            )
+
+        with gr.Tab("Google + Sum + T2S"):
+            gr.Markdown("### Распознавание достопримечательности (Google)")
+
+            with gr.Row():
+                image_input_g = gr.Image(label="Загрузите фото", type="pil")
+                text_input_g = gr.Textbox(label="Или введите название вручную")
+
+            audio_output_g = gr.Audio(label="Результат")
 
+            with gr.Row():
+                btn_recognize_g = gr.Button("Распознать и озвучить")
+                btn_text_g = gr.Button("Распознать по тексту и озвучить")
 
+            btn_recognize_g.click(
+                fn=process_image_google_cloud,
+                inputs=image_input_g,
+                outputs=audio_output_g
+            )
+            btn_text_g.click(
+                fn=process_text_google_cloud,
+                inputs=text_input_g,
+                outputs=audio_output_g
+            )
 
 demo.launch(debug=True)