import itertools
import os
from pathlib import Path

import streamlit as st
import yaml
from hazm import SentenceTokenizer

import utils
from download_utils import download_dataset
from formality_transformer import FormalityTransformer


def translate_short_sent(model, sent):
    """Translate a single informal sentence to formal Persian."""
    out_dict = {}
    txt = utils.cleanify(sent)
    # a token split is acceptable only if the one-shot transformer knows the piece
    is_valid = lambda w: model.oneshot_transformer.transform(w, None)
    cnd_tokens = model.informal_tokenizer.tokenize(txt, is_valid)
    for tokens in cnd_tokens:
        tokens = [t for t in tokens if t != '']
        new_tokens = []
        for t in tokens:
            new_tokens.extend(t.split())
        txt = ' '.join(new_tokens)
    tokens = txt.split()

    # build the set of formal candidates for every token
    candidates = []
    for tok in tokens:
        cnd = set()
        pos = 'VERB' if model.verb_handler.informal_to_formal(tok) else None
        f_words_lemma = list(model.oneshot_transformer.transform(tok, pos))
        for i, (word, lemma) in enumerate(f_words_lemma):
            if pos != 'VERB' and tok not in model.mapper and model.should_filtered_by_one_bigram(lemma, word, tok):
                # implausible candidate: fall back to the original token
                f_words_lemma[i] = (tok, tok)
            else:
                word_repr = ' '.join(word.split())
                word_repr = model.repalce_for_gpt2(word_repr)
                f_words_lemma[i] = (word, word_repr)
        if f_words_lemma:
            cnd.update(f_words_lemma)
        else:
            cnd = {(tok, tok)}
        candidates.append(cnd)

    # enumerate every combination of candidates; note that out_dict is built
    # but not consumed below -- the language model ranks the lattice directly
    for idx, cnd in enumerate(itertools.product(*candidates)):
        normal_seq = ' '.join([c[0] for c in cnd])
        lemma_seq = ' '.join([c[1] for c in cnd])
        lemma_seq = utils.clean_text_for_lm(lemma_seq)
        out_dict[idx] = (normal_seq, lemma_seq)

    # let the language model pick the most fluent candidate sequence
    candidates = [[item[0] for item in candidate_phrases] for candidate_phrases in candidates]
    return model.lm_obj.get_best(candidates)


def translate(model, sentence_tokenizer, txt):
    """Sentence-tokenize the input and translate each sentence."""
    sents = sentence_tokenizer.tokenize(txt)
    formal_sentences = [translate_short_sent(model, sentence) for sentence in sents]
    return ' '.join(formal_sentences)


class Informal2Formal:
    def __init__(self) -> None:
        # download (or load cached) asset files
        DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
        config = load_config('dadmatools/informal2formal/config.yml')
        file_urls = config['files'].values()
        download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)

        # asset file paths
        verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
        irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
        lm_addr = os.path.join(DEFAULT_CACHE_DIR, '3gram.bin')
        assets_file_addr = os.path.join(DEFAULT_CACHE_DIR, 'assets.pkl')

        self.sentence_tokenizer = SentenceTokenizer()
        self.model = FormalityTransformer(
            asset_file_addr=assets_file_addr,
            irregular_verbs_mapper_addr=irregular_verbs_mapper,
            verbs_csv_addr=verbs_csv_addr,
            lm_addr=lm_addr,
        )


def load_config(config_file):
    with open(config_file, "r") as file:
        config = yaml.safe_load(file)
    return config
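# The layout of config.yml is inferred from the `config['files'].values()`
# access above: a `files` mapping whose values are download URLs. A minimal
# sketch with placeholder (hypothetical) keys and URLs:
#
#   files:
#     verbs: https://example.com/informal2formal/verbs.csv
#     irregular_verbs: https://example.com/informal2formal/irregular_verb_mapper.csv
#     lm: https://example.com/informal2formal/3gram.bin
#     assets: https://example.com/informal2formal/assets.pkl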
st.set_page_config(page_title="Persian Informal to Formal Translator")


# cache the loaded model across Streamlit reruns
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def load_model():
    # download (or load cached) asset files
    DEFAULT_CACHE_DIR = os.path.join(str(Path.home()), '.dadmatools', 'informal2formal')
    config = load_config('config.yml')
    file_urls = config['files'].values()
    download_dataset(file_urls, DEFAULT_CACHE_DIR, filename=None)

    # asset file paths
    verbs_csv_addr = os.path.join(DEFAULT_CACHE_DIR, 'verbs.csv')
    irregular_verbs_mapper = os.path.join(DEFAULT_CACHE_DIR, 'irregular_verb_mapper.csv')
    lm_addr = os.path.join(DEFAULT_CACHE_DIR, '3gram.bin')
    assets_file_addr = os.path.join(DEFAULT_CACHE_DIR, 'assets.pkl')
    model = FormalityTransformer(
        asset_file_addr=assets_file_addr,
        irregular_verbs_mapper_addr=irregular_verbs_mapper,
        verbs_csv_addr=verbs_csv_addr,
        lm_addr=lm_addr,
    )
    return model


st.title("Persian/Farsi Formality Transformer")
st.write("Translate informal Persian text to formal")

user_input: str = st.text_area(
    "Input text",
    height=200,
    max_chars=5120,
)
if st.button("Run"):
    model = load_model()
    sentence_tokenizer = SentenceTokenizer()
    translated_text = translate(model, sentence_tokenizer, user_input)
    st.success(translated_text)
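# A minimal sketch of running the pipeline programmatically, without the
# Streamlit UI, via the (otherwise unused) Informal2Formal wrapper defined
# above. It assumes dadmatools/informal2formal/config.yml is reachable from
# the working directory; the sample input is a hypothetical placeholder.
# Kept as a comment so it does not execute on every Streamlit rerun:
#
#   i2f = Informal2Formal()
#   formal = translate(i2f.model, i2f.sentence_tokenizer, 'informal Persian text here')
#   print(formal)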