Spaces:

nktssk
/

itis

Running

App Files Community

nktssk committed on Jan 29

Commit

297d1e5

verified ·

1 Parent(s): d96e864

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -402

app.py CHANGED Viewed

@@ -8,338 +8,32 @@ import inflect
 import soundfile as sf
 import unicodedata
 import num2words
 from PIL import Image
 from datasets import load_dataset
 from scipy.io.wavfile import write
 from transformers import VitsModel, AutoTokenizer
 from transformers import pipeline
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from google.cloud import vision
-from transformers import CLIPProcessor, CLIPModel
-########################################
-# (Опционально) Установите переменную окружения для Google Cloud:
-# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service_account.json"
-########################################
-landmark_titles = [
-    "Eiffel Tower",
-    "Louvre Museum",
-    "Cathédrale Notre-Dame de Paris",
-    "Arc de Triomphe",
-    "Palace of Versailles",
-    "Big Ben",
-    "London Eye",
-    "Tower of London",
-    "Buckingham Palace",
-    "Stonehenge",
-    "Colosseum",
-    "Pantheon",
-    "Trevi Fountain",
-    "Leaning Tower of Pisa",
-    "Doge's Palace",
-    "Sagrada Família",
-    "Alhambra",
-    "Park Güell",
-    "Cathedral of Santiago de Compostela",
-    "La Rambla",
-    "Neuschwanstein Castle",
-    "Brandenburg Gate",
-    "Reichstag Building",
-    "Cologne Cathedral",
-    "Hofbräuhaus München",
-    "Acropolis of Athens",
-    "Parthenon",
-    "Temple of Olympian Zeus",
-    "Delphi Ruins",
-    "Meteora Monasteries",
-    "Charles Bridge",
-    "Prague Castle",
-    "Old Town Square in Prague",
-    "St. Vitus Cathedral",
-    "Cesky Krumlov Castle",
-    "Schönbrunn Palace",
-    "Hofburg Palace",
-    "St. Stephen's Cathedral in Vienna",
-    "Belvedere Palace",
-    "Mozarthaus Vienna",
-    "Rijksmuseum",
-    "Van Gogh Museum",
-    "Anne Frank House",
-    "Kinderdijk Windmills",
-    "Keukenhof Gardens",
-    "Bruges' Historic Centre",
-    "Grand Place in Brussels",
-    "Atomium",
-    "Ghent Belfry",
-    "Basilica of the Holy Blood",
-    "Great Wall of China",
-    "Forbidden City",
-    "Terracotta Army",
-    "Potala Palace",
-    "Temple of Heaven",
-    "Taj Mahal",
-    "Red Fort",
-    "Gateway of India",
-    "Hawa Mahal",
-    "Golden Temple",
-    "Tokyo Tower",
-    "Fushimi Inari-taisha",
-    "Kiyomizu-dera",
-    "Himeji Castle",
-    "Itsukushima Shrine",
-    "Petronas Twin Towers",
-    "Batu Caves",
-    "Langkawi Sky Bridge",
-    "Kek Lok Si Temple",
-    "Mount Kinabalu",
-    "Gardens by the Bay",
-    "Marina Bay Sands",
-    "Merlion Park",
-    "Buddha Tooth Relic Temple",
-    "Chinatown Singapore",
-    "Angkor Wat",
-    "Ta Prohm",
-    "Bayon Temple",
-    "Preah Khan",
-    "Banteay Srei",
-    "Borobudur Temple",
-    "Prambanan Temple",
-    "Tanah Lot",
-    "Uluwatu Temple",
-    "Mount Bromo",
-    "Ha Long Bay",
-    "Imperial City of Huế",
-    "My Son Sanctuary",
-    "Hoi An Ancient Town",
-    "Phong Nha-Ke Bang National Park",
-    "Gyeongbokgung Palace",
-    "Bukchon Hanok Village",
-    "N Seoul Tower",
-    "Jeju Island",
-    "Changdeokgung Palace",
-    "Shwedagon Pagoda",
-    "Bagan Temples",
-    "Inle Lake",
-    "Kyaiktiyo Pagoda",
-    "Mandalay Palace",
-    "Pyramids of Giza",
-    "Great Sphinx of Giza",
-    "Karnak Temple",
-    "Valley of the Kings",
-    "Abu Simbel Temples",
-    "Victoria Falls",
-    "Hwange National Park",
-    "Matobo Hills",
-    "Great Zimbabwe Ruins",
-    "Lake Kariba",
-    "Serengeti National Park",
-    "Mount Kilimanjaro",
-    "Ngorongoro Crater",
-    "Zanzibar Stone Town",
-    "Lake Manyara",
-    "Table Mountain",
-    "Robben Island",
-    "Kruger National Park",
-    "Cape of Good Hope",
-    "Blyde River Canyon",
-    "Djmaa el Fna",
-    "Koutoubia Mosque",
-    "Hassan II Mosque",
-    "Chefchaouen",
-    "Aït Benhaddou",
-    "Gorée Island",
-    "Lake Retba (Lac Rose)",
-    "Saloum Delta",
-    "Saint-Louis Island",
-    "Niokolo-Koba National Park",
-    "Sossusvlei",
-    "Etosha National Park",
-    "Fish River Canyon",
-    "Skeleton Coast",
-    "Twyfelfontein",
-    "Lalibela Churches",
-    "Simien Mountains",
-    "Blue Nile Falls",
-    "Aksum Obelisks",
-    "Harar Jugol",
-    "Carthage Archaeological Site",
-    "El Jem Amphitheatre",
-    "Medina of Tunis",
-    "Sidi Bou Said",
-    "Dougga",
-    "Mount Kenya",
-    "Maasai Mara National Reserve",
-    "Lake Nakuru",
-    "Amboseli National Park",
-    "Tsavo National Park",
-    "Statue of Liberty",
-    "Grand Canyon",
-    "Yellowstone National Park",
-    "Mount Rushmore",
-    "Golden Gate Bridge",
-    "Niagara Falls",
-    "CN Tower",
-    "Banff National Park",
-    "Notre-Dame Basilica in Montreal",
-    "Stanley Park",
-    "Chichén Itzá",
-    "Teotihuacan",
-    "Tulum Ruins",
-    "Copper Canyon",
-    "Palenque",
-    "Havana Old Town",
-    "Varadero Beach",
-    "Trinidad Historic Center",
-    "Viñales Valley",
-    "Castillo de San Pedro de la Roca",
-    "Altos de Chavón",
-    "Santo Domingo Colonial Zone",
-    "Punta Cana Beaches",
-    "Los Haitises National Park",
-    "Bahía de las Águilas",
-    "Blue Mountains",
-    "Dunn's River Falls",
-    "Bob Marley Museum",
-    "Negril Seven Mile Beach",
-    "Port Royal",
-    "Belize Barrier Reef",
-    "Caracol",
-    "Lamanai",
-    "Xunantunich",
-    "Caye Caulker",
-    "Tikal",
-    "Antigua Guatemala",
-    "Lake Atitlán",
-    "Semuc Champey",
-    "Pacaya Volcano",
-    "Panama Canal",
-    "Casco Viejo",
-    "San Blas Islands",
-    "Bocas del Toro",
-    "Coiba National Park",
-    "Arenal Volcano",
-    "Manuel Antonio National Park",
-    "Monteverde Cloud Forest",
-    "Tortuguero National Park",
-    "Corcovado National Park",
-    "Machu Picchu",
-    "Nazca Lines",
-    "Lake Titicaca",
-    "Colca Canyon",
-    "Cusco Historic Center",
-    "Red Square",
-    "Saint Basil's Cathedral",
-    "Kremlin (Moscow)",
-    "Hermitage Museum",
-    "Peterhof",
-    "Catherine Palace",
-    "Lake Baikal",
-    "Valley of Geysers (Kamchatka)",
-    "Kazan Kremlin",
-    "Kizhi Pogost",
-    "Christ the Redeemer",
-    "Sugarloaf Mountain",
-    "Iguazu Falls",
-    "Amazon Rainforest",
-    "Copacabana Beach",
-    "Easter Island (Rapa Nui)",
-    "Torres del Paine",
-    "Valle de la Luna",
-    "Atacama Desert",
-    "San Cristobal Hill",
-    "Perito Moreno Glacier",
-    "Mount Fitz Roy",
-    "Buenos Aires Obelisk",
-    "La Boca Neighborhood",
-    "Talampaya National Park",
-    "Salt Cathedral of Zipaquirá",
-    "Cartagena Old Town",
-    "Tayrona National Park",
-    "Cocora Valley",
-    "Monserrate",
-    "Galápagos Islands",
-    "Cotopaxi Volcano",
-    "Quilotoa Lake",
-    "Middle of the World City",
-    "Historic Center of Quito",
-    "Los Roques Archipelago",
-    "Angel Falls",
-    "Margarita Island",
-    "Canaima National Park",
-    "Roraima",
-    "Salar de Uyuni",
-    "Laguna Colorada",
-    "Tiwanaku",
-    "La Paz Cable Car",
-    "Death Road (Yungas Road)",
-    "Casapueblo",
-    "Punta del Este",
-    "Colonia del Sacramento",
-    "Plaza Independencia in Montevideo",
-    "Teatro Solís",
-    "Itaipu Dam",
-    "Asunción Historic Centre",
-    "Jesuit Missions of La Santísima Trinidad",
-    "Ybycuí National Park",
-    "Cerro Tobatí"
-]
 def clean_text(text):
     text = re.sub(r'МФА:?\s?\[.*?\]', '', text)
     text = re.sub(r'\[.*?\]', '', text)
@@ -358,8 +52,6 @@ def clean_text(text):
     return text.strip()
-from num2words import num2words
 def replace_numbers_with_text(input_string):
     def convert_number(match):
         number = match.group(0)
@@ -369,16 +61,27 @@ def replace_numbers_with_text(input_string):
             return number
     return re.sub(r'\d+(\.\d+)?', convert_number, input_string)
 summarization_model = pipeline("summarization", model="facebook/bart-large-cnn")
 wiki = wikipediaapi.Wikipedia("Nikita", "en")
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 t2s_pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus")
 translator = pipeline("translation_en_to_ru", model="Helsinki-NLP/opus-mt-en-ru")
 def text_to_speech(text, output_path="speech.wav"):
   text = replace_numbers_with_text(text)
   model = VitsModel.from_pretrained("facebook/mms-tts-rus")
@@ -393,6 +96,7 @@ def text_to_speech(text, output_path="speech.wav"):
   return output_path
 def fetch_wikipedia_summary(landmark):
     page = wiki.page(landmark)
     if page.exists():
@@ -400,56 +104,7 @@ def fetch_wikipedia_summary(landmark):
     else:
         return "Found error!"
-def recognize_landmark_google_cloud(image):
-    client = vision.ImageAnnotatorClient()
-    if not isinstance(image, Image.Image):
-        image = Image.fromarray(image)
-    img_bytes = io.BytesIO()
-    image.save(img_bytes, format='PNG')
-    content = img_bytes.getvalue()
-    vision_image = vision.Image(content=content)
-    response = client.landmark_detection(image=vision_image)
-    landmarks = response.landmark_annotations
-    if landmarks:
-        return landmarks[0].description
-    else:
-        return "Unknown"
-def tourist_helper_english(landmark):
-    wiki_text = fetch_wikipedia_summary(landmark)
-    if wiki_text == "Found error!":
-        return None
-    summarized_text = summarization_model(wiki_text, min_length=20, max_length=210)[0]["summary_text"]
-    audio_path = text_to_speech(summarized_text)
-    return audio_path
-def process_image_google_cloud(image):
-    recognized = recognize_landmark_google_cloud(image)
-    print(f"[GoogleVision] Распознано: {recognized}")
-    audio_path = tourist_helper_english(recognized)
-    return audio_path
-def process_text_google_cloud(landmark):
-    return tourist_helper_english(landmark)
-clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-text_inputs = clip_processor(
-    text=landmark_titles,
-    images=None,
-    return_tensors="pt",
-    padding=True
-)
-with torch.no_grad():
-    text_embeds = clip_model.get_text_features(**text_inputs)
-    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
 def recognize_landmark_clip(image):
     if not isinstance(image, Image.Image):
         image = Image.fromarray(image)
@@ -465,6 +120,7 @@ def recognize_landmark_clip(image):
     recognized_landmark = landmark_titles[best_idx]
     return recognized_landmark, best_score
 def tourist_helper_with_russian(landmark):
     wiki_text = fetch_wikipedia_summary(landmark)
     if wiki_text == "Found error!":
@@ -490,33 +146,9 @@ def process_text_clip(landmark):
     return tourist_helper_with_russian(landmark)
 with gr.Blocks() as demo:
-    gr.Markdown("## Две демки: Google Cloud Vision и CLIP (с переводом на русский)")
     with gr.Tabs():
-        with gr.Tab("Google + Sum + T2S"):
-            gr.Markdown("### Распознавание достопримечательности (Google)")
-            with gr.Row():
-                image_input_g = gr.Image(label="Загрузите фото", type="pil")
-                text_input_g = gr.Textbox(label="Или введите название вручную")
-            audio_output_g = gr.Audio(label="Результат")
-            with gr.Row():
-                btn_recognize_g = gr.Button("Распознать и озвучить")
-                btn_text_g = gr.Button("Распознать по тексту и озвучить")
-            btn_recognize_g.click(
-                fn=process_image_google_cloud,
-                inputs=image_input_g,
-                outputs=audio_output_g
-            )
-            btn_text_g.click(
-                fn=process_text_google_cloud,
-                inputs=text_input_g,
-                outputs=audio_output_g
-            )
         with gr.Tab("CLIP + Sum + Translate + T2S"):
             gr.Markdown("### Распознавание (CLIP) и перевод на русский")

 import soundfile as sf
 import unicodedata
 import num2words
+import requests
+import json
 from PIL import Image
+from num2words import num2words
+from google.cloud import vision
 from datasets import load_dataset
 from scipy.io.wavfile import write
 from transformers import VitsModel, AutoTokenizer
 from transformers import pipeline
+from transformers import CLIPProcessor, CLIPModel
 from transformers import T5ForConditionalGeneration, T5Tokenizer
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+def load_attractions_json(url):
+    response = requests.get(url)
+    response.raise_for_status()
+    json_text = response.text
+    data = json.loads(json_text)
+    return data
+url = "https://raw.githubusercontent.com/nktssk/tourist-helper/refs/heads/main/landmarks.json"
+landmark_titles = load_attractions_json(url)
+print(landmark_titles)
+# HELPERS
 def clean_text(text):
     text = re.sub(r'МФА:?\s?\[.*?\]', '', text)
     text = re.sub(r'\[.*?\]', '', text)
     return text.strip()
 def replace_numbers_with_text(input_string):
     def convert_number(match):
         number = match.group(0)
             return number
     return re.sub(r'\d+(\.\d+)?', convert_number, input_string)
+# MODELS
 summarization_model = pipeline("summarization", model="facebook/bart-large-cnn")
 wiki = wikipediaapi.Wikipedia("Nikita", "en")
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 t2s_pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus")
 translator = pipeline("translation_en_to_ru", model="Helsinki-NLP/opus-mt-en-ru")
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+text_inputs = clip_processor(
+    text=landmark_titles,
+    images=None,
+    return_tensors="pt",
+    padding=True
+)
+with torch.no_grad():
+    text_embeds = clip_model.get_text_features(**text_inputs)
+    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+# TEXT-TO-SPEECH
 def text_to_speech(text, output_path="speech.wav"):
   text = replace_numbers_with_text(text)
   model = VitsModel.from_pretrained("facebook/mms-tts-rus")
   return output_path
+# WIKI
 def fetch_wikipedia_summary(landmark):
     page = wiki.page(landmark)
     if page.exists():
     else:
         return "Found error!"
+# CLIP
 def recognize_landmark_clip(image):
     if not isinstance(image, Image.Image):
         image = Image.fromarray(image)
     recognized_landmark = landmark_titles[best_idx]
     return recognized_landmark, best_score
+# DEMO
 def tourist_helper_with_russian(landmark):
     wiki_text = fetch_wikipedia_summary(landmark)
     if wiki_text == "Found error!":
     return tourist_helper_with_russian(landmark)
 with gr.Blocks() as demo:
+    gr.Markdown("## Помощь туристу")
     with gr.Tabs():
         with gr.Tab("CLIP + Sum + Translate + T2S"):
             gr.Markdown("### Распознавание (CLIP) и перевод на русский")