nktssk committed (verified)
Commit 2d45773 · Parent(s): 06ac09f

Update app.py

Files changed (1): app.py +215 -33

app.py CHANGED
@@ -1,45 +1,227 @@
 import os
-import gradio as gr
 import torch
-import torch.nn.functional as F
-import numpy as np
 from PIL import Image
 from transformers import pipeline
 
-depth_estimator = pipeline(task="depth-estimation", model="Intel/dpt-hybrid-midas")
 
-def launch(input_image):
-    out = depth_estimator(input_image)
 
-    predicted_depth = torch.tensor(out["predicted_depth"])
-
-    if len(predicted_depth.shape) == 2:  # if the prediction is 2-D, add batch and channel axes
-        predicted_depth = predicted_depth.unsqueeze(0).unsqueeze(0)
-
-    prediction = F.interpolate(
-        predicted_depth,
-        size=input_image.size[::-1],  # PIL size is (width, height); reversed to (height, width)
-        mode="bicubic",
-        align_corners=False,
-    )
-
-    output = prediction.squeeze().numpy()
-    formatted = (output * 255 / np.max(output)).astype("uint8")
-    depth = Image.fromarray(formatted)
-    return depth
-
-iface = gr.Interface(
-    launch,
-    inputs=gr.Image(type="pil"),
-    outputs=gr.Image(type="pil"),
 )
 
-demo = gr.Blocks()
 
-with demo:
-    gr.TabbedInterface(
-        [iface],
-        ["Depth Estimation Interface"],
-    )
 
 demo.launch(debug=True)
 
 import os
+import io
 import torch
+import gradio as gr
+import wikipediaapi
+import re
+import inflect
+import soundfile as sf
+import unicodedata
+import num2words
 from PIL import Image
+from datasets import load_dataset
+from scipy.io.wavfile import write
+
+from transformers import VitsModel, AutoTokenizer
 from transformers import pipeline
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
+from google.cloud import vision
 
+from transformers import CLIPProcessor, CLIPModel
 
+########################################
+# (Optional) Set the Google Cloud credentials environment variable:
+# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/your/service_account.json"
+########################################
+
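+# Text-cleaning helpers, applied to Wikipedia summaries before they are
+# summarized and passed to the TTS model.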
+def clean_text(text):
+    # Strip some special characters, references, and diacritics
+    text = re.sub(r'МФА:?\s?\[.*?\]', '', text)
+    text = re.sub(r'\[.*?\]', '', text)
+
+    def remove_diacritics(char):
+        if unicodedata.category(char) == 'Mn':
+            return ''
+        return char
+
+    text = unicodedata.normalize('NFD', text)
+    text = ''.join(remove_diacritics(char) for char in text)
+    text = unicodedata.normalize('NFC', text)
+
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'[^\w\s.,!?-]', '', text)
+
+    return text.strip()
+
+from num2words import num2words
+
+def number_to_russian_text(number):
+    try:
+        return num2words(number, lang='ru')
+    except NotImplementedError:
+        return "Ошибка: Не поддерживается преобразование для этого числа."
+
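+# Models and data loaded at startup: BART summarizer, English Wikipedia client,
+# CMU-Arctic x-vector dataset, Russian MMS-TTS pipeline, and an EN->RU translator.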
+summarization_model = pipeline("summarization", model="facebook/bart-large-cnn")
+
+wiki = wikipediaapi.Wikipedia("Nikita", "en")
+
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+
+t2s_pipe = pipeline("text-to-speech", model="facebook/mms-tts-rus")
+
+translator = pipeline("translation_en_to_ru", model="Helsinki-NLP/opus-mt-en-ru")
+
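+# Synthesize speech with the Russian MMS-TTS (VITS) model and save it as a WAV file.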
+def text_to_speech(text, output_path="speech.wav"):
+    model = VitsModel.from_pretrained("facebook/mms-tts-rus")
+    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")
+
+    inputs = tokenizer(text, return_tensors="pt")
+
+    with torch.no_grad():
+        output = model(**inputs).waveform.squeeze().numpy()
+
+    sf.write(output_path, output, samplerate=model.config.sampling_rate)
+
+    return output_path
+
+def fetch_wikipedia_summary(landmark):
+    page = wiki.page(landmark)
+    if page.exists():
+        return clean_text(page.summary)
+    else:
+        return "Found error!"
+
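+# Google Cloud Vision path: detect a landmark in the photo, fetch and summarize
+# its Wikipedia page, and convert the summary to speech.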
+def recognize_landmark_google_cloud(image):
+    client = vision.ImageAnnotatorClient()
+
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(image)
+
+    img_bytes = io.BytesIO()
+    image.save(img_bytes, format='PNG')
+    content = img_bytes.getvalue()
+    vision_image = vision.Image(content=content)
+
+    response = client.landmark_detection(image=vision_image)
+    landmarks = response.landmark_annotations
+    if landmarks:
+        return landmarks[0].description
+    else:
+        return "Unknown"
+
+def tourist_helper_english(landmark):
+    wiki_text = fetch_wikipedia_summary(landmark)
+    if wiki_text == "Found error!":
+        return None
+
+    summarized_text = summarization_model(wiki_text, min_length=20, max_length=210)[0]["summary_text"]
+    audio_path = text_to_speech(summarized_text)
+    return audio_path
+
+def process_image_google_cloud(image):
+    recognized = recognize_landmark_google_cloud(image)
+    print(f"[GoogleVision] Recognized: {recognized}")
+    audio_path = tourist_helper_english(recognized)
+    return audio_path
+
+def process_text_google_cloud(landmark):
+    return tourist_helper_english(landmark)
+
+
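+# CLIP path: zero-shot landmark recognition by comparing the image embedding
+# with precomputed text embeddings of the candidate landmark names.
+# NOTE: landmark_titles is assumed to be defined with the candidate names;
+# the short list below is only an illustrative placeholder.
+landmark_titles = ["Eiffel Tower", "Statue of Liberty", "Big Ben"]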
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+text_inputs = clip_processor(
+    text=landmark_titles,
+    images=None,
+    return_tensors="pt",
+    padding=True
 )
+with torch.no_grad():
+    text_embeds = clip_model.get_text_features(**text_inputs)
+    text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+def recognize_landmark_clip(image):
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(image)
+
+    image_inputs = clip_processor(images=image, return_tensors="pt")
+    with torch.no_grad():
+        image_embed = clip_model.get_image_features(**image_inputs)
+        image_embed = image_embed / image_embed.norm(p=2, dim=-1, keepdim=True)
+
+    similarity = (image_embed @ text_embeds.T).squeeze(0)
+    best_idx = similarity.argmax().item()
+    best_score = similarity[best_idx].item()
+    recognized_landmark = landmark_titles[best_idx]
+    return recognized_landmark, best_score
+
+def tourist_helper_with_russian(landmark):
+    wiki_text = fetch_wikipedia_summary(landmark)
+    if wiki_text == "Found error!":
+        return None
+
+    print(wiki_text)
+    summarized_text = summarization_model(wiki_text, min_length=20, max_length=210)[0]["summary_text"]
+    print(summarized_text)
+
+    translated = translator(summarized_text, max_length=1000)[0]["translation_text"]
+    print(translated)
+
+    audio_path = text_to_speech(translated)
+    return audio_path
+
+def process_image_clip(image):
+    recognized, score = recognize_landmark_clip(image)
+    print(f"[CLIP] Recognized: {recognized}, score={score:.2f}")
+    audio_path = tourist_helper_with_russian(recognized)
+    return audio_path
+
+def process_text_clip(landmark):
+    return tourist_helper_with_russian(landmark)
+
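+# Gradio UI: two tabs, one for the CLIP + translation pipeline and one for the
+# Google Cloud Vision pipeline; each accepts an uploaded photo or a typed landmark name.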
+with gr.Blocks() as demo:
+    gr.Markdown("## Две демки: Google Cloud Vision и CLIP (с переводом на русский)")
+
+    with gr.Tabs():
+        with gr.Tab("CLIP + Sum + Translate + T2S"):
+            gr.Markdown("### Распознавание (CLIP) и перевод на русский")
+
+            with gr.Row():
+                image_input_c = gr.Image(label="Загрузите фото", type="pil")
+                text_input_c = gr.Textbox(label="Или введите название")
+
+            audio_output_c = gr.Audio(label="Результат")
+
+            with gr.Row():
+                btn_recognize_c = gr.Button("Распознать и перевести на русский")
+                btn_text_c = gr.Button("Поиск по тексту")
+
+            btn_recognize_c.click(
+                fn=process_image_clip,
+                inputs=image_input_c,
+                outputs=audio_output_c
+            )
+            btn_text_c.click(
+                fn=process_text_clip,
+                inputs=text_input_c,
+                outputs=audio_output_c
+            )
+
+        with gr.Tab("Google + Sum + T2S"):
+            gr.Markdown("### Распознавание достопримечательности (Google)")
+
+            with gr.Row():
+                image_input_g = gr.Image(label="Загрузите фото", type="pil")
+                text_input_g = gr.Textbox(label="Или введите название вручную")
+
+            audio_output_g = gr.Audio(label="Результат")
 
+            with gr.Row():
+                btn_recognize_g = gr.Button("Распознать и озвучить")
+                btn_text_g = gr.Button("Распознать по тексту и озвучить")
 
+            btn_recognize_g.click(
+                fn=process_image_google_cloud,
+                inputs=image_input_g,
+                outputs=audio_output_g
+            )
+            btn_text_g.click(
+                fn=process_text_google_cloud,
+                inputs=text_input_g,
+                outputs=audio_output_g
+            )
 
 demo.launch(debug=True)