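"""Multilingual chat app built with Streamlit.

Each user picks a display name and language in the sidebar and speaks a message.
The message is translated into every supported language with both NLLB-200 and
Google Translate, and for each target language the candidate whose multilingual
sentence embedding is closest to the original text is kept. Messages are stored
in a shared JSON file so separate sessions can read them, and each viewer sees
the chat in their own language.
"""
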
import json
import time

import streamlit as st
from deep_translator import GoogleTranslator
from streamlit_mic_recorder import speech_to_text
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer, util

st.set_page_config(layout="wide")

# Display name -> ISO 639-1 code, used by Google Translate and the speech-to-text widget
language_dict = {
    'English': 'en', 'Hindi': 'hi', 'Bengali': 'bn', 'Gujarati': 'gu', 'Marathi': 'mr',
    'Telugu': 'te', 'Tamil': 'ta', 'Punjabi': 'pa', 'Odia': 'or', 'Nepali': 'ne', 'Malayalam': 'ml'
}

# Display name -> NLLB-200 language code
nllb_langs = {
    'English':'eng_Latn','Hindi':'hin_Deva','Punjabi':'pan_Guru','Odia':'ory_Orya',
    'Bengali':'ben_Beng','Telugu':'tel_Telu','Tamil':'tam_Taml','Nepali':'npi_Deva',
    'Marathi':'mar_Deva','Malayalam':'mal_Mlym','Gujarati':'guj_Gujr'
}
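
# Note: language_dict and nllb_langs are expected to cover the same languages,
# since the sidebar offers language_dict keys while every message is translated
# into all keys of nllb_langs.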

# Shared on-disk chat history so that separate Streamlit sessions see the same messages
CHAT_FILE = "chat_data.json"

@st.cache_resource
def load_nllb_model():
    """Load the NLLB-200 distilled 600M translation pipeline once per server process."""
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
    return pipeline('translation', model=model, tokenizer=tokenizer)

@st.cache_resource
def load_sentence_model():
    """Load a multilingual sentence-embedding model (MuRIL) used to score translations."""
    return SentenceTransformer("google/muril-base-cased")

translator_nllb = load_nllb_model()
sentence_model = load_sentence_model()

def load_messages():
    """Read the shared chat history; return an empty list if the file is missing or corrupt."""
    try:
        with open(CHAT_FILE, "r") as file:
            return json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        return []

def save_messages(messages):
    """Persist the full chat history to the shared JSON file."""
    with open(CHAT_FILE, "w") as file:
        json.dump(messages, file)
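
# Illustrative shape of chat_data.json (values are made-up examples, not real data):
# [
#   {
#     "user": "alice",
#     "name": "alice",
#     "lang": "English",
#     "text": "Hello everyone",
#     "translations": {"English": "Hello everyone", "Hindi": "...", ...}
#   }
# ]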

def translate_text_multimodel(text, source_lang_name, target_lang_name):
    """Translate `text` with both NLLB-200 and Google Translate, then return the
    candidate whose multilingual sentence embedding is closest to the original."""
    source_nllb = nllb_langs[source_lang_name]
    target_nllb = nllb_langs[target_lang_name]

    # NLLB translation
    translation_nllb = translator_nllb(text, src_lang=source_nllb, tgt_lang=target_nllb)[0]['translation_text']

    # Google Translate translation
    translation_google = GoogleTranslator(source='auto', target=language_dict[target_lang_name]).translate(text)

    # Cosine similarity of each candidate against the original text
    embedding_original = sentence_model.encode(text, convert_to_tensor=True)
    embedding_nllb = sentence_model.encode(translation_nllb, convert_to_tensor=True)
    embedding_google = sentence_model.encode(translation_google, convert_to_tensor=True)

    cosine_score_nllb = util.cos_sim(embedding_original, embedding_nllb).item()
    cosine_score_google = util.cos_sim(embedding_original, embedding_google).item()

    # Keep whichever translation stays closer to the original meaning
    if cosine_score_nllb >= cosine_score_google:
        return translation_nllb
    return translation_google

def main():
    """Render the chat UI: sidebar setup, message history, and speech input."""
    st.title("Multilingual Chat Application with Speech Input")

    # Sidebar for user setup
    st.sidebar.header("User Setup")
    username = st.sidebar.text_input("Enter your name:")
    language = st.sidebar.selectbox("Choose your language:", list(language_dict.keys()))
    
    if not username:
        st.warning("Please enter your name to start chatting.")
        return

    user_lang_code = language_dict[language]

    if "messages" not in st.session_state:
        st.session_state["messages"] = load_messages()

    # Display chat history, rendering each message in the viewer's chosen language
    st.subheader("Chat Room")

    for msg in st.session_state["messages"]:
        with st.chat_message(msg['name']):
            # Fall back to the original text if a stored message lacks this language
            st.write(f"{msg['name']} ({msg['lang']}): {msg['translations'].get(language, msg['text'])}")

    # Speech input
    st.subheader("Speak your message")

    spoken_text = speech_to_text(language=user_lang_code, use_container_width=True, just_once=True, key='speech_input')

    if spoken_text:
        st.write(f"You said: {spoken_text}")

        # Translate the message into every supported language once at send time,
        # so each viewer can read it in their own language without re-translating.
        translations = {}
        for lang in nllb_langs:
            translations[lang] = translate_text_multimodel(spoken_text, language, lang)

        new_message = {
            "user": username,
            "name": username,
            "lang": language,
            "text": spoken_text,
            "translations": translations,
        }
        st.session_state["messages"].append(new_message)
        save_messages(st.session_state["messages"])
        st.rerun()

    # Poll for new messages from other sessions roughly once per second
    time.sleep(1)
    st.rerun()


if __name__ == "__main__":
    main()
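
# To try the app locally (assuming this file is saved as app.py; the package names
# below are the standard PyPI names for the imports above -- adjust as needed):
#
#   pip install streamlit deep-translator streamlit-mic-recorder transformers sentence-transformers torch
#   streamlit run app.py
#
# The first run downloads facebook/nllb-200-distilled-600M and google/muril-base-cased
# from the Hugging Face Hub, which can take a while.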