import streamlit as st
import pandas as pd
import pickle
from tqdm import tqdm
from Levenshtein import distance as lev
import joblib
from googletrans import Translator
from indictrans import Transliterator
from pyphonetics import RefinedSoundex
import enchant
from bs4 import BeautifulSoup
import re
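
# Pipeline overview: clean the input tweet, normalize tokens against pickled
# English and Hinglish vocabularies, repair the remaining tokens with Refined
# Soundex + Levenshtein matching and enchant spelling suggestions, tag each
# token's language, translate English tokens to Hindi, transliterate to
# Devanagari, and translate the assembled Hindi sentence back to English.
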
def main():
    st.title('Text Processing App')
    dictn = enchant.Dict("en_US")
    rs = RefinedSoundex()
    normalized_string_final = []
    translator = Translator()
    trn = Transliterator(source='eng', target='hin')

    # Load the English and Hinglish normalization vocabularies: dictionaries
    # mapping a canonical word to the list of spelling variants it covers.
    with open(r'./english_vocab.pkl', "rb") as fp:
        english_vocab = pickle.load(fp)
    with open(r'./hinglish_vocab.pkl', "rb") as fp:
        hinglish_vocab = pickle.load(fp)
    english_vocab['and'] = ['and']
    english_vocab['is'] = ['is']
    def clean_tweet(tweet):
        text = re.sub(r"@[A-Za-z0-9']+", '', tweet)          # strip @mentions
        text = BeautifulSoup(text, 'lxml').get_text()        # strip HTML markup
        text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)  # strip URLs
        text = re.sub('[^a-zA-Z]', ' ', text)                # keep letters only
        text = re.sub(r'\bRT\b', ' ', text)                  # drop retweet markers
        text = re.sub(r'\bnan\b', ' ', text)                 # drop stray 'nan' strings
        return text
    input_text = st.text_area("Enter the text:")
    total_translated = []

    if st.button('Process'):
        # Wrap the user input in a one-row DataFrame, clean it, and pull the
        # cleaned text back out.
        df1 = pd.DataFrame({'Text': [input_text]})
        df1['Text'] = df1['Text'].apply(clean_tweet)
        cleaned_text = df1['Text'].tolist()[0]
        total_text = [cleaned_text]
        st.write("Input Text:", total_text)

        for text in tqdm(total_text):
            test_text = text.split()
            # Pass 1: replace tokens that appear in the English vocabulary
            # with their canonical key. not_changed_idx marks, per token,
            # whether any pass has normalized it yet.
            not_changed_idx = [0] * len(test_text)
            changed_text = []
            changed_idx = []
            for i in range(len(test_text)):
                done = 0
                for key in english_vocab:
                    for val in english_vocab[key]:
                        if test_text[i] == val:
                            changed_text.append(key)
                            changed_idx.append(i)
                            not_changed_idx[i] = 1
                            done = 1
                            break
                    if done == 1:
                        break
            # Rebuild the token list, substituting canonical English forms
            # where a match was found.
            res = dict(zip(changed_idx, changed_text))
            normalized_string = [res.get(i, test_text[i]) for i in range(len(test_text))]
            print("English Normalized String : ", normalized_string)
            # Pass 2: run the tokens the English pass skipped against the
            # Hinglish vocabulary in the same way.
            changed_hing_idx = [i for i in range(len(test_text)) if i not in changed_idx]
            hinglish_text_part = [test_text[i] for i in changed_hing_idx]
            changed_text2 = []
            changed_idx2 = []
            for i in range(len(hinglish_text_part)):
                done = 0
                for key in hinglish_vocab:
                    for val in hinglish_vocab[key]:
                        if hinglish_text_part[i] == val:
                            changed_text2.append(key)
                            changed_idx2.append(i)
                            # Mark the token by its position in the full
                            # sentence, not its position in this sub-list.
                            not_changed_idx[changed_hing_idx[i]] = 1
                            done = 1
                            break
                    if done == 1:
                        break
            # Rebuild the Hinglish part from the matches, then re-append the
            # canonical English words found in pass 1.
            res2 = dict(zip(changed_idx2, changed_text2))
            normalized_string2 = [res2.get(i, hinglish_text_part[i])
                                  for i in range(len(hinglish_text_part))]
            changed_idx = sorted(set(changed_idx))
            for i in changed_idx:
                normalized_string2.append(res[i])
            print("Hinglish Normalized String : ", normalized_string2)
            # Pass 3: for each token no pass has touched, collect vocabulary
            # words within a Refined Soundex distance of 1, keep those within
            # a Levenshtein distance of 2, pick the closest candidate, and
            # replace the token with the nearest enchant spelling suggestion.
            for i in range(len(not_changed_idx)):
                try:
                    if not_changed_idx[i] == 0:
                        eng_phoneme_correction = []
                        for j in english_vocab:
                            try:
                                phoneme = rs.distance(normalized_string2[i], j)
                            except Exception:
                                continue
                            if phoneme <= 1:
                                eng_phoneme_correction.append(j)
                        eng_lev_correction = [k for k in eng_phoneme_correction
                                              if lev(normalized_string2[i], k) <= 2]

                        hing_phoneme_correction = []
                        for j in hinglish_vocab:
                            try:
                                phoneme = rs.distance(normalized_string2[i], j)
                            except Exception:
                                continue
                            if phoneme <= 1:
                                hing_phoneme_correction.append(j)
                        hing_lev_correction = [k for k in hing_phoneme_correction
                                               if lev(normalized_string2[i], k) <= 2]

                        # Choose the candidate with the smallest Levenshtein
                        # distance across both vocabularies.
                        new_correction = eng_lev_correction + hing_lev_correction
                        distances = [lev(normalized_string2[i], l) for l in new_correction]
                        min_idx = distances.index(min(distances))

                        # Keep the enchant suggestion closest to that candidate.
                        suggestion = dictn.suggest(new_correction[min_idx])
                        suggestion_dist = [lev(new_correction[min_idx], t) for t in suggestion]
                        min_suggestion_idx = suggestion_dist.index(min(suggestion_dist))
                        normalized_string2[i] = suggestion[min_suggestion_idx]
                except Exception:
                    pass
            normalized_string = normalized_string2
            normalized_string_final = normalized_string2
            print("Phoneme Levenshtein Dictionary suggestion Normalized String : ", normalized_string_final)
            # Tag each token's language with the saved classifier, then
            # translate the English-tagged tokens to Hindi.
            classifier = joblib.load(r"./classifer.joblib")
            classify = []
            for word in normalized_string:
                test_classify = classifier(word)
                classify.append(test_classify[0].get("label"))
            for i in range(len(classify)):
                if classify[i] == 'en':
                    try:
                        normalized_string[i] = translator.translate(
                            normalized_string[i], src='en', dest='hi').text
                    except Exception:
                        normalized_string[i] = "delete"
            print("English -> Hindi Translated String : ", normalized_string)
            # Transliterate the romanized tokens to Devanagari. Note that the
            # result is printed but then discarded: conversion_list is reset
            # to the translated tokens before the sentence is assembled.
            conversion_list = []
            for word in tqdm(normalized_string):
                conversion_list.append(trn.transform(word))
            print("Hinglish -> Hindi Transliterated String : ", conversion_list)
            conversion_list = normalized_string

            # Build cumulative sentences by prepending each new word, then
            # translate every prefix back to English; only the first entry
            # (the translation of the first word alone) is carried forward.
            string = ""
            sentence = []
            for word in conversion_list:
                string = word + ' ' + string
                sentence.append(string)
            translated = []
            for s in tqdm(sentence):
                try:
                    translated_text = translator.translate(s, src='hi', dest='en')
                    translated.append(translated_text.text)
                except Exception:
                    translated.append("delete")
            print("Hindi -> English Translated String : ", translated)
            total_translated.append(translated[0])
        total_translated = pd.DataFrame(total_translated)
        st.write("English Normalized String:", normalized_string)
        st.write("Hinglish Normalized String:", normalized_string)
        st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:", normalized_string_final)
        st.write("English -> Hindi Translated String:", normalized_string)
        st.write("Hinglish -> Hindi Transliterated String:", conversion_list)
        st.write("Hindi -> English Translated String:", translated)
if __name__ == '__main__':
    main()
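
# To run locally (assuming this file is saved as app.py and the pickled
# vocabularies plus classifer.joblib sit alongside it):
#   streamlit run app.py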