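"""Streamlit multilingual chat app with speech input.

Each spoken message is translated into all supported languages with two
engines (NLLB-200 and Google Translate); the candidate whose MuRIL embedding
is closer to the original text is kept. Chat history is shared between
sessions through a JSON file on disk.
"""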
import streamlit as st
from deep_translator import GoogleTranslator
from streamlit_mic_recorder import speech_to_text
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer, util
import json
import time
st.set_page_config(layout="wide")
# Language dictionaries
# ISO 639-1 codes used by Google Translate and the speech recognizer
language_dict = {
    'English': 'en', 'Hindi': 'hi', 'Bengali': 'bn', 'Gujarati': 'gu', 'Marathi': 'mr',
    'Telugu': 'te', 'Tamil': 'ta', 'Punjabi': 'pa', 'Odia': 'or', 'Nepali': 'ne', 'Malayalam': 'ml'
}

# FLORES-200 codes used by the NLLB translation model
nllb_langs = {
    'English': 'eng_Latn', 'Hindi': 'hin_Deva', 'Punjabi': 'pan_Guru', 'Odia': 'ory_Orya',
    'Bengali': 'ben_Beng', 'Telugu': 'tel_Telu', 'Tamil': 'tam_Taml', 'Nepali': 'npi_Deva',
    'Marathi': 'mar_Deva', 'Malayalam': 'mal_Mlym', 'Gujarati': 'guj_Gujr'
}

CHAT_FILE = "chat_data.json"
@st.cache_resource
def load_nllb_model():
    """Load the NLLB-200 translation pipeline once and cache it across reruns."""
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
    translator = pipeline('translation', model=model, tokenizer=tokenizer)
    return translator

@st.cache_resource
def load_sentence_model():
    """Load the multilingual sentence encoder used to score translation candidates."""
    return SentenceTransformer("google/muril-base-cased")

translator_nllb = load_nllb_model()
sentence_model = load_sentence_model()

def load_messages():
    """Read the shared chat history from disk; start empty if the file is missing or corrupt."""
    try:
        with open(CHAT_FILE, "r") as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        return []

def save_messages(messages):
    """Persist the chat history so every user session sees the same messages."""
    with open(CHAT_FILE, "w") as file:
        json.dump(messages, file)

def translate_text_multimodel(text, source_lang_name, target_lang_name):
    """Translate with both NLLB and Google Translate, then return the candidate
    whose MuRIL embedding is most similar to the original text."""
    source_nllb = nllb_langs[source_lang_name]
    target_nllb = nllb_langs[target_lang_name]

    # NLLB translation
    translation_nllb = translator_nllb(text, src_lang=source_nllb, tgt_lang=target_nllb)[0]['translation_text']
    print(translation_nllb)

    # Google translation
    translation_google = GoogleTranslator(source='auto', target=language_dict[target_lang_name]).translate(text)

    # Cosine-similarity comparison of each candidate against the source sentence
    embedding_original = sentence_model.encode(text, convert_to_tensor=True)
    embedding_nllb = sentence_model.encode(translation_nllb, convert_to_tensor=True)
    embedding_google = sentence_model.encode(translation_google, convert_to_tensor=True)
    cosine_score_nllb = util.cos_sim(embedding_original, embedding_nllb).item()
    cosine_score_google = util.cos_sim(embedding_original, embedding_google).item()

    # Select the more accurate translation
    if cosine_score_nllb >= cosine_score_google:
        print('nllb')
        return translation_nllb
    else:
        print('gt')
        return translation_google

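# Illustrative call (not part of the app flow): for a Hindi source sentence and
# Tamil target, the helper above produces an NLLB candidate and a Google
# candidate and returns whichever sits closer to the source in embedding space.
# The sentence below is only an example input.
#   best = translate_text_multimodel("नमस्ते, आप कैसे हैं?", "Hindi", "Tamil")
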
def main():
    st.title("Multilingual Chat Application with Speech Input")

    # Sidebar for user setup
    st.sidebar.header("User Setup")
    username = st.sidebar.text_input("Enter your name:")
    language = st.sidebar.selectbox("Choose your language:", list(language_dict.keys()))

    if not username:
        st.warning("Please enter your name to start chatting.")
        return

    user_lang_code = language_dict[language]

    if "messages" not in st.session_state:
        st.session_state["messages"] = load_messages()

    # Display chat history, showing each message in the reader's own language
    st.subheader("Chat Room")
    for msg in st.session_state["messages"]:
        with st.chat_message(msg['name']):
            st.write(f"{msg['name']} ({msg['lang']}): {msg['translations'][language]}")

    # Speech input integration
    st.subheader("Speak your message")
    spoken_text = speech_to_text(language=user_lang_code, use_container_width=True, just_once=True, key='speech_input')

    if spoken_text:
        st.write(f"You said: {spoken_text}")
        # Pre-translate the message into every supported language so each
        # reader can be shown their own version without re-running the models.
        translations = {}
        for lang in nllb_langs:
            translations[lang] = translate_text_multimodel(spoken_text, language, lang)
        new_message = {"user": username, "name": username, "lang": language,
                       "text": spoken_text, "translations": translations}
        st.session_state["messages"].append(new_message)
        save_messages(st.session_state["messages"])
        st.rerun()

    # Poll for new messages from other users
    time.sleep(1)
    st.rerun()

if __name__ == "__main__":
    main()
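# To try this locally (assuming the imports above are installed, e.g. via
# `pip install streamlit deep-translator streamlit-mic-recorder transformers
# sentence-transformers torch`), run:  streamlit run <this file>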