import streamlit as st
import pandas as pd
import pickle
from tqdm import tqdm
from Levenshtein import distance as lev
import joblib
from googletrans import Translator
from indictrans import Transliterator
from pyphonetics import RefinedSoundex
import enchant
from bs4 import BeautifulSoup
import re
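
# Pipeline overview (as implemented below): clean the raw tweet text, normalize
# tokens against pickled English and Hinglish vocabularies, repair the remaining
# tokens with phonetic (Refined Soundex) + Levenshtein matching plus enchant
# suggestions, tag each token's language, translate English tokens to Hindi,
# transliterate Roman-script tokens to Devanagari, and finally translate the
# rebuilt sentence back to English.
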
def main():
    st.title('Text Processing App')
    # resources: US-English dictionary, Refined Soundex encoder, translator,
    # and an eng->hin transliterator
    dictn = enchant.Dict("en_US")
    rs = RefinedSoundex()
    normalized_string_final = []
    translator = Translator()
    trn = Transliterator(source='eng', target='hin')
    # pickled vocabularies of the form {normalized word: [spelling variants]}
    with open(r'./english_vocab.pkl', "rb") as fp:
        english = pickle.load(fp)
    english_vocab = english
    with open(r'./hinglish_vocab.pkl', "rb") as fp:
        hinglish = pickle.load(fp)
    hinglish_vocab = hinglish
    english_vocab['and'] = ['and']
    english_vocab['is'] = ['is']
    def clean_tweet(tweet):
        # strip @mentions, HTML markup, URLs, non-alphabetic characters,
        # retweet markers, and stray "nan" tokens
        text = re.sub(r'@[A-Za-z0-9\']+', '', tweet)
        text = BeautifulSoup(text, 'lxml').get_text()
        text = re.sub(r'https:(//)[A-Za-z0-9.]*(/)[A-Za-z0-9]+', '', text)
        text = re.sub(r'https[A-Za-z0-9/.]*', '', text)
        text = re.sub("[^a-zA-Z]", " ", text)
        text = re.sub(r'\bRT\b', ' ', text)
        text = re.sub(r'\bnan\b', ' ', text)
        return text
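
    # For illustration (hypothetical input): clean_tweet("RT @user loving it https://t.co/xyz 100%")
    # returns roughly " loving it " -- the mention and URL are removed, the RT marker
    # and non-alphabetic characters become spaces, and the extra whitespace is
    # dropped later by str.split().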
    input_text = st.text_area("Enter the text:")
    total_translated = []

    if st.button('Process'):
        # Create a DataFrame with the user input text
        data = {'Text': [input_text]}
        df1 = pd.DataFrame(data)
        # Apply the clean_tweet function to the user input text
        df1['Text'] = df1['Text'].apply(clean_tweet)
        # Extract the cleaned text
        cleaned_text = df1['Text'].tolist()[0]
        # Process the cleaned text further if needed
        total_text = [cleaned_text]
        st.write("Input Text:", total_text)
        for sent in tqdm(total_text):
            test_text = sent.split()
            # english word change from vocab
            not_changed_idx = [0] * len(test_text)
            changed_text = []
            changed_idx = []
            for i in range(len(test_text)):
                for key in english_vocab:
                    done = 0
                    for val in english_vocab[key]:
                        if test_text[i] == val:
                            changed_text.append(key)
                            changed_idx.append(i)
                            not_changed_idx[i] = 1
                            done = 1
                            break
                    if done == 1:
                        break
            # map changed positions to their canonical words, keep the rest as-is
            normalized_string = []
            res = dict(zip(changed_idx, changed_text))
            for i in range(len(test_text)):
                try:
                    normalized_string.append(res[i])
                except KeyError:
                    normalized_string.append(test_text[i])
            print("English Normalized String : ", normalized_string)
            # hinglish word change
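            # Only the positions the English vocab did not cover (changed_hing_idx)
            # are matched against the Hinglish vocabulary; the English-normalized
            # words are appended back afterwards so normalized_string2 covers
            # every token.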
            test_list = [i for i in range(len(test_text))]
            changed_hing_idx = [i for i in test_list if i not in changed_idx]
            hinglish_text_part = []
            for i in changed_hing_idx:
                try:
                    hinglish_text_part.append(test_text[i])
                except IndexError:
                    pass
            changed_text2 = []
            changed_idx2 = []
            for i in range(len(hinglish_text_part)):
                for key in hinglish_vocab:
                    done = 0
                    for val in hinglish_vocab[key]:
                        if hinglish_text_part[i] == val:
                            changed_text2.append(key)
                            changed_idx2.append(i)
                            # mark the original token position as handled
                            not_changed_idx[changed_hing_idx[i]] = 1
                            done = 1
                            break
                    if done == 1:
                        break
            # map changed positions to their canonical words, keep the rest as-is
            normalized_string2 = []
            res2 = dict(zip(changed_idx2, changed_text2))
            for i in range(len(hinglish_text_part)):
                try:
                    normalized_string2.append(res2[i])
                except KeyError:
                    normalized_string2.append(hinglish_text_part[i])
            # re-append the words already normalized by the English pass
            changed_idx = sorted(set(changed_idx))
            for i in changed_idx:
                normalized_string2.append(res[i])
            print("Hinglish Normalized String : ", normalized_string2)
            # phoneme + Levenshtein correction for tokens not matched in either vocab
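
            # For each token still unmatched, candidates are gathered from both
            # vocabularies whose Refined Soundex distance to the token is <= 1,
            # filtered to those within Levenshtein distance 2, and the closest
            # candidate is passed to enchant's suggest(); the suggestion nearest
            # by Levenshtein distance replaces the token. For illustration
            # (hypothetical values), a token like "gud" can be mapped to "good"
            # when both distances fall under the thresholds.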
            for i in range(len(not_changed_idx)):
                try:
                    if not_changed_idx[i] == 0:
                        # candidates from the English vocab within phonetic distance 1
                        eng_phoneme_correction = []
                        for j in english_vocab:
                            try:
                                phoneme = rs.distance(normalized_string2[i], j)
                                if phoneme <= 1:
                                    eng_phoneme_correction.append(j)
                            except:
                                pass
                        # keep only candidates within Levenshtein distance 2
                        eng_lev_correction = []
                        for k in eng_phoneme_correction:
                            dist = lev(normalized_string2[i], k)
                            if dist <= 2:
                                eng_lev_correction.append(k)
                        # the same two filters over the Hinglish vocab
                        hing_phoneme_correction = []
                        for j in hinglish_vocab:
                            try:
                                phoneme = rs.distance(normalized_string2[i], j)
                                if phoneme <= 1:
                                    hing_phoneme_correction.append(j)
                            except:
                                pass
                        hing_lev_correction = []
                        for k in hing_phoneme_correction:
                            dist = lev(normalized_string2[i], k)
                            if dist <= 2:
                                hing_lev_correction.append(k)
                        # pick the candidate closest to the token ...
                        eng_lev_correction.extend(hing_lev_correction)
                        new_correction = eng_lev_correction
                        eng_lev_correction = []
                        for l in new_correction:
                            dist = lev(normalized_string2[i], l)
                            eng_lev_correction.append(dist)
                        min_val = min(eng_lev_correction)
                        min_idx = eng_lev_correction.index(min_val)
                        # ... then take the enchant suggestion nearest to that candidate
                        suggestion = dictn.suggest(new_correction[min_idx])
                        suggestion_lit = []
                        for t in suggestion:
                            dist = lev(new_correction[min_idx], t)
                            suggestion_lit.append(dist)
                        min_suggestion_val = min(suggestion_lit)
                        min_suggestion_idx = suggestion_lit.index(min_suggestion_val)
                        normalized_string2[i] = suggestion[min_suggestion_idx]
                except:
                    pass
            normalized_string = normalized_string2
            normalized_string_final = normalized_string2
            print("Phoneme Levenshtein dictionary suggestion Normalized String : ", normalized_string_final)
            # sentence tagging
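            # Token-level language tagging: the joblib-loaded classifier is
            # assumed to be a callable (e.g. a saved transformers pipeline) that
            # returns predictions like [{'label': 'en', ...}]; tokens tagged
            # 'en' are translated to Hindi below.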
            classifier = joblib.load(r"./classifer.joblib")
            classify = []
            for i in normalized_string:
                test_classify = classifier(i)
                classify.append(test_classify[0].get("label"))
            # translate tokens tagged as English into Hindi; failed calls are marked "delete"
            for i in range(len(classify)):
                if classify[i] == 'en':
                    try:
                        normalized_string[i] = translator.translate(normalized_string[i], src='en', dest='hi').text
                    except:
                        normalized_string[i] = "delete"
            print("English -> Hindi Translated String : ", normalized_string)
            conversion_list = []
            for i in tqdm(normalized_string):
                conversion_list.append(trn.transform(i))
            print("Hinglish -> Hindi Transliterated String : ", conversion_list)
            # this reassignment keeps the normalized (pre-transliteration) tokens
            # for sentence building
            conversion_list = normalized_string
            # rebuild the sentence from the tokens, then translate it to English
            string = ""
            sentence = []
            for tok in conversion_list:
                string = string + tok + ' '
            sentence.append(string.strip())
            translated = []
            for i in tqdm(sentence):
                try:
                    translated_text = translator.translate(i, src='hi', dest='en')
                    translated.append(translated_text.text)
                except:
                    translated.append("delete")
            print("Hindi -> English Translated String : ", translated)
            total_translated.append(translated[0])
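
        # normalized_string, normalized_string_final, and conversion_list all alias
        # the same list by this point, so the st.write() calls below show its final
        # (translated) state rather than the intermediate strings printed above.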
        total_translated = pd.DataFrame(total_translated)
        st.write("English Normalized String:", normalized_string)
        st.write("Hinglish Normalized String:", normalized_string)
        st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:", normalized_string_final)
        st.write("English -> Hindi Translated String:", normalized_string)
        st.write("Hinglish -> Hindi Transliterated String:", conversion_list)
        st.write("Hindi -> English Translated String:", translated)

if __name__ == '__main__':
    main()