|
import os |
|
import re |
|
import json |
|
import torch |
|
import requests |
|
import unicodedata |
|
import soundfile as sf |
|
import pymorphy2 |
|
|
|
import gradio as gr |
|
import wikipediaapi |
|
from PIL import Image |
|
from transformers import pipeline, CLIPProcessor, CLIPModel |
|
|
|
import inspect |
|
|
|
if not hasattr(inspect, 'getargspec'):
    def getargspec(func):
        """Stand-in for ``inspect.getargspec`` on interpreters that lack it.

        Built on ``inspect.signature``. Returns a 4-tuple
        ``(args, varargs, varkw, defaults)`` where ``args`` is the list of
        every parameter name that is neither ``*args`` nor ``**kwargs``,
        ``varargs`` / ``varkw`` are the names of those two catch-all
        parameters (or ``None``), and ``defaults`` is a tuple of all declared
        default values, or ``None`` when no parameter has a default.
        """
        args = []
        defaults = []
        varargs = None
        varkw = None
        for name, param in inspect.signature(func).parameters.items():
            # Identity check against the sentinel: ``!=`` would invoke the
            # default value's own comparison, which may be overloaded
            # (always-equal objects, array-likes that return non-booleans).
            if param.default is not param.empty:
                defaults.append(param.default)
            if param.kind == param.VAR_POSITIONAL:
                varargs = name
            elif param.kind == param.VAR_KEYWORD:
                varkw = name
            else:
                args.append(name)
        return args, varargs, varkw, tuple(defaults) if defaults else None

    # Expose the shim under the legacy name for libraries that still call it.
    inspect.getargspec = getargspec
|
|
|
# Shared morphological analyzer, created once at import time and reused by
# the number-to-words inflection code in this file.
morph = pymorphy2.MorphAnalyzer()
|
|
|
def load_attractions_json(url, timeout=10):
    """Download *url* and return its body decoded as JSON.

    Args:
        url: Address of the JSON document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would block
            start-up (and the reload button) on a stalled connection.

    Returns:
        The decoded JSON value.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return json.loads(r.text)
|
|
|
# Location of the landmark list, fetched once at import time.
# NOTE(review): the payload is presumably a JSON array of title strings (it
# is later passed as CLIP text input and indexed by position) — confirm.
url = "https://raw.githubusercontent.com/nktssk/tourist-helper/refs/heads/main/landmarks.json"
landmark_titles = load_attractions_json(url=url)
|
|
|
def clean_text(text):
    """Normalize encyclopedia-style text before further processing.

    Steps, in order:
      1. Drop ``МФА: [...]`` pronunciation notes and any other ``[...]`` span.
      2. Strip combining marks (accents, stress marks) from every character
         except the Cyrillic letters й and ё, which are distinct letters
         rather than accented variants and must not degrade to и / е.
      3. Remove every character that is not a word character, whitespace or
         one of ``. , ! ? -``.
      4. Collapse whitespace runs to a single space and trim the ends. This
         runs after step 3 so that removing a symbol surrounded by spaces
         does not leave a double space behind.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    keep_composed = 'йЙёЁ'

    text = re.sub(r'МФА:?\s?\[.*?\]', '', text)
    text = re.sub(r'\[.*?\]', '', text)

    # Compose first so that "и" + U+0306 is recognized as the letter й.
    text = unicodedata.normalize('NFC', text)
    pieces = []
    for ch in text:
        if ch in keep_composed:
            pieces.append(ch)
            continue
        decomposed = unicodedata.normalize('NFD', ch)
        pieces.append(
            ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
        )
    text = unicodedata.normalize('NFC', ''.join(pieces))

    text = re.sub(r'[^\w\s.,!?-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
|
|
|
|
|
def get_case_for_preposition(prep):
    """Return the short case code associated with a Russian preposition.

    The lookup is case-insensitive. Known prepositions map to one of
    ``'loc'``, ``'dat'``, ``'ins'`` or ``'gen'``; anything else yields
    ``'nomn'``.
    """
    groups = (
        ('loc', ('в', 'на', 'о', 'об', 'обо')),
        ('dat', ('к',)),
        ('ins', ('с', 'со', 'над', 'под')),
        ('gen', ('из', 'от', 'у', 'до', 'для')),
    )
    needle = prep.lower()
    for case_code, prepositions in groups:
        if needle in prepositions:
            return case_code
    return 'nomn'
|
|
|
def replace_numbers_with_text_in_context(text):
    """Spell out bare numbers in *text* as Russian words.

    The text is split on whitespace. Every token that consists only of
    digits (optionally with one decimal point) is converted to words with
    ``num2words`` and each resulting word is inflected into the grammatical
    case suggested by the preceding token (via ``get_case_for_preposition``).
    A number with no preceding token stays in the nominative. All other
    tokens pass through untouched, and the tokens are re-joined with single
    spaces.

    Args:
        text: Input string.

    Returns:
        The text with numeric tokens replaced by words.
    """
    # The case codes used in this file are short forms, while pymorphy2
    # works with OpenCorpora grammemes; translate before inflecting.
    # Unknown codes are passed through unchanged.
    grammeme_for = {
        'nom': 'nomn', 'nomn': 'nomn',
        'gen': 'gent', 'dat': 'datv', 'ins': 'ablt', 'loc': 'loct',
    }
    number_re = re.compile(r'^\d+(\.\d+)?$')

    tokens = text.split()
    result = []
    for i, token in enumerate(tokens):
        if not number_re.match(token):
            result.append(token)
            continue

        # Imported on demand so text without numbers does not need num2words.
        from num2words import num2words

        code = get_case_for_preposition(tokens[i - 1]) if i > 0 else 'nomn'
        case = grammeme_for.get(code, code)

        value = float(token) if '.' in token else int(token)
        words = num2words(value, lang='ru').replace('-', ' ').split()

        inflected = []
        for word in words:
            parses = morph.parse(word)
            best = parses[0] if parses else None
            form = None
            # Only words that carry a case can be put into another one;
            # ``tag.case`` is None for words without that category.
            if best is not None and best.tag.case is not None:
                form = best.inflect({case})
            # Keep the original word when no inflected form is available.
            inflected.append(form.word if form else word)
        result.append(' '.join(inflected))
    return ' '.join(result)
|
|
|
# Text pipeline: summarization -> English-to-Russian translation, plus the
# Wikipedia client used to fetch the source text.
_SUMMARIZER_CHECKPOINT = "sshleifer/distilbart-cnn-12-6"
summarizer = pipeline(
    "summarization",
    model=_SUMMARIZER_CHECKPOINT,
    tokenizer=_SUMMARIZER_CHECKPOINT,
)

translator = pipeline(
    "translation_en_to_ru",
    model="Helsinki-NLP/opus-mt-en-ru",
)

# NOTE(review): positional arguments are presumably (user agent, language) —
# confirm against the installed wikipediaapi version.
wiki = wikipediaapi.Wikipedia("Nikita", "en")
|
|
|
# CLIP model and processor share one checkpoint.
_CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)

# Embed every landmark title once at start-up and scale each embedding to
# unit L2 norm, so a dot product with a unit-norm image embedding can be
# used directly as the similarity score.
text_inputs = clip_processor(
    text=landmark_titles,
    images=None,
    return_tensors="pt",
    padding=True,
)
with torch.no_grad():
    text_embeds = clip_model.get_text_features(**text_inputs)
    _text_norms = text_embeds.norm(p=2, dim=-1, keepdim=True)
    text_embeds = text_embeds / _text_norms
|
|
|
# Silero text-to-speech configuration; these names are read by
# ``text_to_speech`` below.
language = 'ru'
model_id = 'v3_1_ru'
sample_rate = 48000
speaker = 'eugene'

# The hub entry point receives the model id through its ``speaker`` keyword;
# the voice name in ``speaker`` above is only used at synthesis time.
_silero_hub_args = {
    'repo_or_dir': 'snakers4/silero-models',
    'model': 'silero_tts',
    'language': language,
    'speaker': model_id,
}
# The second returned value is not used by this script.
silero_model, _ = torch.hub.load(**_silero_hub_args)
|
|
|
def text_to_speech(text, out_path="speech.wav"):
    """Synthesize *text* to a WAV file and return the file path.

    Numbers in the text are spelled out first; the audio is then generated
    with the module-level Silero model, voice and sample rate, and written
    to *out_path* (overwriting any existing file there).
    """
    spoken_text = replace_numbers_with_text_in_context(text)
    waveform = silero_model.apply_tts(
        text=spoken_text,
        speaker=speaker,
        sample_rate=sample_rate,
    )
    sf.write(out_path, waveform, sample_rate)
    return out_path
|
|
|
def fetch_wikipedia_summary(landmark):
    """Return the cleaned Wikipedia summary for *landmark*.

    When no page with that title exists, the sentinel string
    ``"Found error!"`` is returned instead.
    """
    page = wiki.page(landmark)
    if not page.exists():
        return "Found error!"
    return clean_text(page.summary)
|
|
|
def recognize_landmark_clip(image):
    """Pick the landmark title whose CLIP text embedding best matches *image*.

    Args:
        image: A PIL image, or any object accepted by ``Image.fromarray``.

    Returns:
        ``(title, score)`` — the best-matching entry of ``landmark_titles``
        and its similarity to the image as a Python float.
    """
    pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)
    pixel_inputs = clip_processor(images=pil_image, return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**pixel_inputs)
        # Unit-normalize so the product with the (already normalized) text
        # embeddings is a cosine similarity.
        features = features / features.norm(p=2, dim=-1, keepdim=True)
        scores = (features @ text_embeds.T).squeeze(0)
    top = scores.argmax().item()
    return landmark_titles[top], scores[top].item()
|
|
|
def process_landmark(landmark):
    """Turn a landmark title into a spoken Russian description.

    Pipeline: fetch the Wikipedia summary, shorten it with the summarizer
    (texts under 210 characters are used as they are), translate the result
    to Russian and synthesize speech. Intermediate texts are printed for
    debugging.

    Args:
        landmark: Wikipedia page title to look up.

    Returns:
        Path of the generated audio file, or ``None`` when no page was found
        or the page summary is empty after cleaning.
    """
    txt = fetch_wikipedia_summary(landmark)
    # An empty summary would otherwise travel through translation into the
    # speech synthesizer with nothing to say.
    if not txt or txt == "Found error!":
        return None
    print('Wiki text: ')
    print(txt)
    if len(txt) < 210:
        summary = txt
    else:
        # ``truncation=True`` clips inputs longer than the model's maximum
        # sequence length instead of failing on very long summaries.
        summary = summarizer(
            txt, min_length=10, max_length=200, truncation=True
        )[0]["summary_text"]
    print('Summarized text: ')
    print(summary)
    tr = translator(summary, max_length=1000)[0]["translation_text"]
    print('Translated text: ')
    print(tr)
    return text_to_speech(tr)
|
|
|
def process_image_clip(image):
    """Recognize the landmark in *image* and return spoken audio about it.

    Args:
        image: The uploaded picture, or ``None`` when the handler is invoked
            without an image.

    Returns:
        Path of the generated audio file, or ``None`` when there is no image
        or no description could be produced.
    """
    # Without this guard a missing upload would crash inside the image
    # conversion instead of simply producing no output.
    if image is None:
        return None
    recognized, _score = recognize_landmark_clip(image)
    print('Recognized: ')
    print(recognized)
    return process_landmark(recognized)
|
|
|
def process_text_clip(landmark):
    """Return spoken audio for a landmark title typed by the user.

    Surrounding whitespace is ignored. An empty or missing title yields
    ``None`` without performing any lookup.

    Args:
        landmark: Title entered by the user (may be empty or ``None``).

    Returns:
        Path of the generated audio file, or ``None``.
    """
    title = (landmark or "").strip()
    if not title:
        return None
    return process_landmark(title)
|
|
|
def reload_landmarks():
    """Re-download the landmark list and rebuild its CLIP text embeddings.

    ``landmark_titles`` and ``text_embeds`` are indexed in parallel during
    recognition, so both are recomputed here and swapped in together only
    after every step has succeeded. If the download or the embedding fails,
    the previous pair stays in place.
    """
    global landmark_titles, text_embeds
    url = "https://raw.githubusercontent.com/nktssk/tourist-helper/refs/heads/main/landmarks.json"
    titles = load_attractions_json(url)

    inputs = clip_processor(text=titles, images=None, return_tensors="pt", padding=True)
    with torch.no_grad():
        embeds = clip_model.get_text_features(**inputs)
        embeds = embeds / embeds.norm(p=2, dim=-1, keepdim=True)

    landmark_titles, text_embeds = titles, embeds
|
|
|
# Gradio front end: one tab with an image upload and a title box as inputs,
# an audio player as output, and three action buttons.
# NOTE(review): the nesting below was reconstructed from a paste that had
# lost its indentation — confirm the layout matches the intended one.
with gr.Blocks() as demo:
    gr.Markdown("## Помощь туристу")
    with gr.Tabs():
        with gr.Tab("CLIP + Sum + Translate + T2S"):
            with gr.Row():
                image_input = gr.Image(label="Загрузите фото", type="pil")
                text_input = gr.Textbox(label="Или введите название")
            audio_output = gr.Audio(label="Результат")
            with gr.Row():
                btn_img = gr.Button("Распознать и перевести")
                btn_txt = gr.Button("Поиск по названию")
                btn_reload = gr.Button("Обновить список (Техническое)")

            # Wire each button to its handler: (button, handler, input, output).
            for _button, _handler, _source, _target in (
                (btn_img, process_image_clip, image_input, audio_output),
                (btn_txt, process_text_clip, text_input, audio_output),
                (btn_reload, reload_landmarks, None, None),
            ):
                _button.click(fn=_handler, inputs=_source, outputs=_target)

demo.launch(debug=True)