nktssk committed on
Commit
297d1e5
·
verified ·
1 Parent(s): d96e864

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -402
app.py CHANGED
@@ -8,338 +8,32 @@ import inflect
8
  import soundfile as sf
9
  import unicodedata
10
  import num2words
 
 
11
  from PIL import Image
 
 
12
  from datasets import load_dataset
13
  from scipy.io.wavfile import write
14
-
15
  from transformers import VitsModel, AutoTokenizer
16
  from transformers import pipeline
 
17
  from transformers import T5ForConditionalGeneration, T5Tokenizer
18
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
19
 
20
- from google.cloud import vision
 
 
 
 
 
21
 
22
- from transformers import CLIPProcessor, CLIPModel
23
-
24
- ########################################
25
- # (Опционально) Установите переменную окружения для Google Cloud:
26
- # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service_account.json"
27
- ########################################
28
-
29
- landmark_titles = [
30
- "Eiffel Tower",
31
- "Louvre Museum",
32
- "Cathédrale Notre-Dame de Paris",
33
- "Arc de Triomphe",
34
- "Palace of Versailles",
35
-
36
- "Big Ben",
37
- "London Eye",
38
- "Tower of London",
39
- "Buckingham Palace",
40
- "Stonehenge",
41
-
42
- "Colosseum",
43
- "Pantheon",
44
- "Trevi Fountain",
45
- "Leaning Tower of Pisa",
46
- "Doge's Palace",
47
-
48
- "Sagrada Família",
49
- "Alhambra",
50
- "Park Güell",
51
- "Cathedral of Santiago de Compostela",
52
- "La Rambla",
53
-
54
- "Neuschwanstein Castle",
55
- "Brandenburg Gate",
56
- "Reichstag Building",
57
- "Cologne Cathedral",
58
- "Hofbräuhaus München",
59
-
60
- "Acropolis of Athens",
61
- "Parthenon",
62
- "Temple of Olympian Zeus",
63
- "Delphi Ruins",
64
- "Meteora Monasteries",
65
-
66
- "Charles Bridge",
67
- "Prague Castle",
68
- "Old Town Square in Prague",
69
- "St. Vitus Cathedral",
70
- "Cesky Krumlov Castle",
71
-
72
- "Schönbrunn Palace",
73
- "Hofburg Palace",
74
- "St. Stephen's Cathedral in Vienna",
75
- "Belvedere Palace",
76
- "Mozarthaus Vienna",
77
-
78
- "Rijksmuseum",
79
- "Van Gogh Museum",
80
- "Anne Frank House",
81
- "Kinderdijk Windmills",
82
- "Keukenhof Gardens",
83
-
84
- "Bruges' Historic Centre",
85
- "Grand Place in Brussels",
86
- "Atomium",
87
- "Ghent Belfry",
88
- "Basilica of the Holy Blood",
89
-
90
- "Great Wall of China",
91
- "Forbidden City",
92
- "Terracotta Army",
93
- "Potala Palace",
94
- "Temple of Heaven",
95
-
96
- "Taj Mahal",
97
- "Red Fort",
98
- "Gateway of India",
99
- "Hawa Mahal",
100
- "Golden Temple",
101
-
102
- "Tokyo Tower",
103
- "Fushimi Inari-taisha",
104
- "Kiyomizu-dera",
105
- "Himeji Castle",
106
- "Itsukushima Shrine",
107
-
108
- "Petronas Twin Towers",
109
- "Batu Caves",
110
- "Langkawi Sky Bridge",
111
- "Kek Lok Si Temple",
112
- "Mount Kinabalu",
113
-
114
- "Gardens by the Bay",
115
- "Marina Bay Sands",
116
- "Merlion Park",
117
- "Buddha Tooth Relic Temple",
118
- "Chinatown Singapore",
119
-
120
- "Angkor Wat",
121
- "Ta Prohm",
122
- "Bayon Temple",
123
- "Preah Khan",
124
- "Banteay Srei",
125
-
126
- "Borobudur Temple",
127
- "Prambanan Temple",
128
- "Tanah Lot",
129
- "Uluwatu Temple",
130
- "Mount Bromo",
131
-
132
- "Ha Long Bay",
133
- "Imperial City of Huế",
134
- "My Son Sanctuary",
135
- "Hoi An Ancient Town",
136
- "Phong Nha-Ke Bang National Park",
137
-
138
- "Gyeongbokgung Palace",
139
- "Bukchon Hanok Village",
140
- "N Seoul Tower",
141
- "Jeju Island",
142
- "Changdeokgung Palace",
143
-
144
- "Shwedagon Pagoda",
145
- "Bagan Temples",
146
- "Inle Lake",
147
- "Kyaiktiyo Pagoda",
148
- "Mandalay Palace",
149
-
150
- "Pyramids of Giza",
151
- "Great Sphinx of Giza",
152
- "Karnak Temple",
153
- "Valley of the Kings",
154
- "Abu Simbel Temples",
155
-
156
- "Victoria Falls",
157
- "Hwange National Park",
158
- "Matobo Hills",
159
- "Great Zimbabwe Ruins",
160
- "Lake Kariba",
161
-
162
- "Serengeti National Park",
163
- "Mount Kilimanjaro",
164
- "Ngorongoro Crater",
165
- "Zanzibar Stone Town",
166
- "Lake Manyara",
167
-
168
- "Table Mountain",
169
- "Robben Island",
170
- "Kruger National Park",
171
- "Cape of Good Hope",
172
- "Blyde River Canyon",
173
-
174
- "Djmaa el Fna",
175
- "Koutoubia Mosque",
176
- "Hassan II Mosque",
177
- "Chefchaouen",
178
- "Aït Benhaddou",
179
-
180
- "Gorée Island",
181
- "Lake Retba (Lac Rose)",
182
- "Saloum Delta",
183
- "Saint-Louis Island",
184
- "Niokolo-Koba National Park",
185
-
186
- "Sossusvlei",
187
- "Etosha National Park",
188
- "Fish River Canyon",
189
- "Skeleton Coast",
190
- "Twyfelfontein",
191
-
192
- "Lalibela Churches",
193
- "Simien Mountains",
194
- "Blue Nile Falls",
195
- "Aksum Obelisks",
196
- "Harar Jugol",
197
-
198
- "Carthage Archaeological Site",
199
- "El Jem Amphitheatre",
200
- "Medina of Tunis",
201
- "Sidi Bou Said",
202
- "Dougga",
203
-
204
- "Mount Kenya",
205
- "Maasai Mara National Reserve",
206
- "Lake Nakuru",
207
- "Amboseli National Park",
208
- "Tsavo National Park",
209
-
210
- "Statue of Liberty",
211
- "Grand Canyon",
212
- "Yellowstone National Park",
213
- "Mount Rushmore",
214
- "Golden Gate Bridge",
215
-
216
- "Niagara Falls",
217
- "CN Tower",
218
- "Banff National Park",
219
- "Notre-Dame Basilica in Montreal",
220
- "Stanley Park",
221
-
222
- "Chichén Itzá",
223
- "Teotihuacan",
224
- "Tulum Ruins",
225
- "Copper Canyon",
226
- "Palenque",
227
-
228
- "Havana Old Town",
229
- "Varadero Beach",
230
- "Trinidad Historic Center",
231
- "Viñales Valley",
232
- "Castillo de San Pedro de la Roca",
233
-
234
- "Altos de Chavón",
235
- "Santo Domingo Colonial Zone",
236
- "Punta Cana Beaches",
237
- "Los Haitises National Park",
238
- "Bahía de las Águilas",
239
-
240
- "Blue Mountains",
241
- "Dunn's River Falls",
242
- "Bob Marley Museum",
243
- "Negril Seven Mile Beach",
244
- "Port Royal",
245
-
246
- "Belize Barrier Reef",
247
- "Caracol",
248
- "Lamanai",
249
- "Xunantunich",
250
- "Caye Caulker",
251
-
252
- "Tikal",
253
- "Antigua Guatemala",
254
- "Lake Atitlán",
255
- "Semuc Champey",
256
- "Pacaya Volcano",
257
-
258
- "Panama Canal",
259
- "Casco Viejo",
260
- "San Blas Islands",
261
- "Bocas del Toro",
262
- "Coiba National Park",
263
-
264
- "Arenal Volcano",
265
- "Manuel Antonio National Park",
266
- "Monteverde Cloud Forest",
267
- "Tortuguero National Park",
268
- "Corcovado National Park",
269
-
270
- "Machu Picchu",
271
- "Nazca Lines",
272
- "Lake Titicaca",
273
- "Colca Canyon",
274
- "Cusco Historic Center",
275
-
276
- "Red Square",
277
- "Saint Basil's Cathedral",
278
- "Kremlin (Moscow)",
279
- "Hermitage Museum",
280
- "Peterhof",
281
- "Catherine Palace",
282
- "Lake Baikal",
283
- "Valley of Geysers (Kamchatka)",
284
- "Kazan Kremlin",
285
- "Kizhi Pogost",
286
-
287
- "Christ the Redeemer",
288
- "Sugarloaf Mountain",
289
- "Iguazu Falls",
290
- "Amazon Rainforest",
291
- "Copacabana Beach",
292
-
293
- "Easter Island (Rapa Nui)",
294
- "Torres del Paine",
295
- "Valle de la Luna",
296
- "Atacama Desert",
297
- "San Cristobal Hill",
298
-
299
- "Perito Moreno Glacier",
300
- "Mount Fitz Roy",
301
- "Buenos Aires Obelisk",
302
- "La Boca Neighborhood",
303
- "Talampaya National Park",
304
-
305
- "Salt Cathedral of Zipaquirá",
306
- "Cartagena Old Town",
307
- "Tayrona National Park",
308
- "Cocora Valley",
309
- "Monserrate",
310
-
311
- "Galápagos Islands",
312
- "Cotopaxi Volcano",
313
- "Quilotoa Lake",
314
- "Middle of the World City",
315
- "Historic Center of Quito",
316
-
317
- "Los Roques Archipelago",
318
- "Angel Falls",
319
- "Margarita Island",
320
- "Canaima National Park",
321
- "Roraima",
322
-
323
- "Salar de Uyuni",
324
- "Laguna Colorada",
325
- "Tiwanaku",
326
- "La Paz Cable Car",
327
- "Death Road (Yungas Road)",
328
-
329
- "Casapueblo",
330
- "Punta del Este",
331
- "Colonia del Sacramento",
332
- "Plaza Independencia in Montevideo",
333
- "Teatro Solís",
334
-
335
- "Itaipu Dam",
336
- "Asunción Historic Centre",
337
- "Jesuit Missions of La Santísima Trinidad",
338
- "Ybycuí National Park",
339
- "Cerro Tobatí"
340
- ]
341
 
 
342
 
 
343
  def clean_text(text):
344
  text = re.sub(r'МФА:?\s?\[.*?\]', '', text)
345
  text = re.sub(r'\[.*?\]', '', text)
@@ -358,8 +52,6 @@ def clean_text(text):
358
 
359
  return text.strip()
360
 
361
- from num2words import num2words
362
-
363
  def replace_numbers_with_text(input_string):
364
  def convert_number(match):
365
  number = match.group(0)
@@ -369,16 +61,27 @@ def replace_numbers_with_text(input_string):
369
  return number
370
  return re.sub(r'\d+(\.\d+)?', convert_number, input_string)
371
 
 
372
  summarization_model = pipeline("summarization", model="facebook/bart-large-cnn")
373
-
374
  wiki = wikipediaapi.Wikipedia("Nikita", "en")
375
-
376
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
377
-
378
  t2s_pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus")
379
-
380
  translator = pipeline("translation_en_to_ru", model="Helsinki-NLP/opus-mt-en-ru")
381
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
  def text_to_speech(text, output_path="speech.wav"):
383
  text = replace_numbers_with_text(text)
384
  model = VitsModel.from_pretrained("facebook/mms-tts-rus")
@@ -393,6 +96,7 @@ def text_to_speech(text, output_path="speech.wav"):
393
 
394
  return output_path
395
 
 
396
  def fetch_wikipedia_summary(landmark):
397
  page = wiki.page(landmark)
398
  if page.exists():
@@ -400,56 +104,7 @@ def fetch_wikipedia_summary(landmark):
400
  else:
401
  return "Found error!"
402
 
403
- def recognize_landmark_google_cloud(image):
404
- client = vision.ImageAnnotatorClient()
405
-
406
- if not isinstance(image, Image.Image):
407
- image = Image.fromarray(image)
408
-
409
- img_bytes = io.BytesIO()
410
- image.save(img_bytes, format='PNG')
411
- content = img_bytes.getvalue()
412
- vision_image = vision.Image(content=content)
413
-
414
- response = client.landmark_detection(image=vision_image)
415
- landmarks = response.landmark_annotations
416
- if landmarks:
417
- return landmarks[0].description
418
- else:
419
- return "Unknown"
420
-
421
- def tourist_helper_english(landmark):
422
- wiki_text = fetch_wikipedia_summary(landmark)
423
- if wiki_text == "Found error!":
424
- return None
425
-
426
- summarized_text = summarization_model(wiki_text, min_length=20, max_length=210)[0]["summary_text"]
427
- audio_path = text_to_speech(summarized_text)
428
- return audio_path
429
-
430
- def process_image_google_cloud(image):
431
- recognized = recognize_landmark_google_cloud(image)
432
- print(f"[GoogleVision] Распознано: {recognized}")
433
- audio_path = tourist_helper_english(recognized)
434
- return audio_path
435
-
436
- def process_text_google_cloud(landmark):
437
- return tourist_helper_english(landmark)
438
-
439
-
440
- clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
441
- clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
442
-
443
- text_inputs = clip_processor(
444
- text=landmark_titles,
445
- images=None,
446
- return_tensors="pt",
447
- padding=True
448
- )
449
- with torch.no_grad():
450
- text_embeds = clip_model.get_text_features(**text_inputs)
451
- text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
452
-
453
  def recognize_landmark_clip(image):
454
  if not isinstance(image, Image.Image):
455
  image = Image.fromarray(image)
@@ -465,6 +120,7 @@ def recognize_landmark_clip(image):
465
  recognized_landmark = landmark_titles[best_idx]
466
  return recognized_landmark, best_score
467
 
 
468
  def tourist_helper_with_russian(landmark):
469
  wiki_text = fetch_wikipedia_summary(landmark)
470
  if wiki_text == "Found error!":
@@ -490,33 +146,9 @@ def process_text_clip(landmark):
490
  return tourist_helper_with_russian(landmark)
491
 
492
  with gr.Blocks() as demo:
493
- gr.Markdown("## Две демки: Google Cloud Vision и CLIP (с переводом на русский)")
494
 
495
  with gr.Tabs():
496
- with gr.Tab("Google + Sum + T2S"):
497
- gr.Markdown("### Распознавание достопримечательности (Google)")
498
-
499
- with gr.Row():
500
- image_input_g = gr.Image(label="Загрузите фото", type="pil")
501
- text_input_g = gr.Textbox(label="Или введите название вручную")
502
-
503
- audio_output_g = gr.Audio(label="Результат")
504
-
505
- with gr.Row():
506
- btn_recognize_g = gr.Button("Распознать и озвучить")
507
- btn_text_g = gr.Button("Распознать по тексту и озвучить")
508
-
509
- btn_recognize_g.click(
510
- fn=process_image_google_cloud,
511
- inputs=image_input_g,
512
- outputs=audio_output_g
513
- )
514
- btn_text_g.click(
515
- fn=process_text_google_cloud,
516
- inputs=text_input_g,
517
- outputs=audio_output_g
518
- )
519
-
520
  with gr.Tab("CLIP + Sum + Translate + T2S"):
521
  gr.Markdown("### Распознавание (CLIP) и перевод на русский")
522
 
 
8
  import soundfile as sf
9
  import unicodedata
10
  import num2words
11
+ import requests
12
+ import json
13
  from PIL import Image
14
+ from num2words import num2words
15
+ from google.cloud import vision
16
  from datasets import load_dataset
17
  from scipy.io.wavfile import write
 
18
  from transformers import VitsModel, AutoTokenizer
19
  from transformers import pipeline
20
+ from transformers import CLIPProcessor, CLIPModel
21
  from transformers import T5ForConditionalGeneration, T5Tokenizer
22
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
23
 
24
def load_attractions_json(url, timeout=10):
    """Download a JSON document from *url* and return the parsed data.

    Args:
        url: HTTP(S) address of the JSON resource.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block forever, which would hang
            application start-up because this runs at import time.

    Returns:
        The deserialized JSON payload.
        NOTE(review): the caller passes the result to the CLIP processor as
        ``text=``, so it is presumably a list of landmark names -- confirm
        against the hosted file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Response.json() decodes from the raw bytes (UTF-8/16/32 detection)
    # rather than from the header-guessed charset used by ``.text``, so
    # non-ASCII landmark names are not garbled.
    return response.json()
30
 
31
+ url = "https://raw.githubusercontent.com/nktssk/tourist-helper/refs/heads/main/landmarks.json"
32
+ landmark_titles = load_attractions_json(url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ print(landmark_titles)
35
 
36
+ # HELPERS
37
  def clean_text(text):
38
  text = re.sub(r'МФА:?\s?\[.*?\]', '', text)
39
  text = re.sub(r'\[.*?\]', '', text)
 
52
 
53
  return text.strip()
54
 
 
 
55
  def replace_numbers_with_text(input_string):
56
  def convert_number(match):
57
  number = match.group(0)
 
61
  return number
62
  return re.sub(r'\d+(\.\d+)?', convert_number, input_string)
63
 
64
+ # MODELS
65
  summarization_model = pipeline("summarization", model="facebook/bart-large-cnn")
 
66
  wiki = wikipediaapi.Wikipedia("Nikita", "en")
 
67
  embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 
68
  t2s_pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus")
 
69
  translator = pipeline("translation_en_to_ru", model="Helsinki-NLP/opus-mt-en-ru")
70
 
71
+ clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
72
+ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
73
+
74
+ text_inputs = clip_processor(
75
+ text=landmark_titles,
76
+ images=None,
77
+ return_tensors="pt",
78
+ padding=True
79
+ )
80
+ with torch.no_grad():
81
+ text_embeds = clip_model.get_text_features(**text_inputs)
82
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
83
+
84
+ # TEXT-TO-SPEECH
85
  def text_to_speech(text, output_path="speech.wav"):
86
  text = replace_numbers_with_text(text)
87
  model = VitsModel.from_pretrained("facebook/mms-tts-rus")
 
96
 
97
  return output_path
98
 
99
+ # WIKI
100
  def fetch_wikipedia_summary(landmark):
101
  page = wiki.page(landmark)
102
  if page.exists():
 
104
  else:
105
  return "Found error!"
106
 
107
+ # CLIP
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  def recognize_landmark_clip(image):
109
  if not isinstance(image, Image.Image):
110
  image = Image.fromarray(image)
 
120
  recognized_landmark = landmark_titles[best_idx]
121
  return recognized_landmark, best_score
122
 
123
+ # DEMO
124
  def tourist_helper_with_russian(landmark):
125
  wiki_text = fetch_wikipedia_summary(landmark)
126
  if wiki_text == "Found error!":
 
146
  return tourist_helper_with_russian(landmark)
147
 
148
  with gr.Blocks() as demo:
149
+ gr.Markdown("## Помощь туристу")
150
 
151
  with gr.Tabs():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  with gr.Tab("CLIP + Sum + Translate + T2S"):
153
  gr.Markdown("### Распознавание (CLIP) и перевод на русский")
154