Tihsrah-CD committed on
Commit f756937 · 1 Parent(s): 6237b12
Files changed (3):
  1. app.py +34 -275
  2. app_test.py +283 -0
  3. requirements.txt +3 -2
app.py CHANGED
@@ -1,283 +1,42 @@
 import streamlit as st
-import pandas as pd
-import pickle
-from tqdm import tqdm
-from Levenshtein import distance as lev
-import joblib
-from googletrans import Translator
-from indictrans import Transliterator
-from pyphonetics import RefinedSoundex
-import enchant
-from bs4 import BeautifulSoup
-import re

 def main():
-    st.title('Text Processing App')
-
-    dictn = enchant.Dict("en_US")
-    rs = RefinedSoundex()
-    normalized_string_final = []
-    translator = Translator()
-    trn = Transliterator(source='eng', target='hin')
-
-    with open(r'./english_vocab.pkl', "rb") as fp:
-        english = pickle.load(fp)
-    english_vocab = english
-    with open(r'./hinglish_vocab.pkl', "rb") as fp:
-        hinglish = pickle.load(fp)
-    hinglish_vocab = hinglish
-
-    english_vocab['and'] = ['and']
-    english_vocab['is'] = ['is']
-
-    def clean_tweet(tweet):
-        text = re.sub(r'@ [A-Za-z0-9\']+', '', tweet)
-        text = BeautifulSoup(text, 'lxml').get_text()
-        text = re.sub(r'https (//)[A-Za-z0-9. ]*(/) [A-Za-z0-9]+', '', text)
-        text = re.sub(r'https[A-Za-z0-9/. ]*', '', text)
-        text = re.sub("[^a-zA-Z]", " ", text)
-        text = re.sub(r'\bRT\b', ' ', text)
-        text = re.sub(r'\bnan\b', ' ', text)
-        return text
-
-    input_text = st.text_area("Enter the text:")
-    total_translated = []
-    if st.button('Process'):
-        # Create a DataFrame with the user input text
-        data = {'Text': [input_text]}
-        df1 = pd.DataFrame(data)
-
-        # Apply the clean_tweet function to the user input text
-        df1['Text'] = df1['Text'].apply(clean_tweet)
-
-        # Extract the cleaned text
-        cleaned_text = df1['Text'].tolist()[0]
-
-        # Process the cleaned text further if needed
-        total_text = [cleaned_text]
-        st.write("Input Text:", total_text)
-
-        for i in tqdm(total_text):
-            test_text = i.split()
-
-            # english word change from vocab
-            not_changed_idx = []
-            for i in range(len(test_text)):
-                not_changed_idx.append(0)
-
-            changed_text = []
-            changed_idx = []
-            # print("1st", changed_text)
-            for i in range(len(test_text)):
-                for key in english_vocab:
-                    done = 0
-                    for val in english_vocab[key]:
-                        if test_text[i] == val:
-                            # print("KEY = ", key, "VAL =", val, "i =", test_text[i], "ADJENCENCY_DATA =", adjacency_data[key])
-                            # print("yahan par", key, val, test_text[i])
-                            changed_text.append(key)
-                            changed_idx.append(i)
-                            not_changed_idx[i] = 1
-                            done = 1
-                            # print("breaking")
-                            break
-                    if done == 1:
-                        # print("breaking again")
-                        break
-
-            normalized_string = []
-
-            # making changed text and idx to a dictionary with two lists
-            res = dict(zip(changed_idx, changed_text))
-            # print(res)
-            for i in range(len(test_text)):
-                try:
-                    normalized_string.append(res[i])
-                except:
-                    normalized_string.append(test_text[i])
-            print("English Normalized String : ", normalized_string)
-
-            # hinglish word change
-            test_list = [i for i in range(len(test_text))]
-            changed_hing_idx = [i for i in test_list if i not in changed_idx]
-            # print(changed_hing_idx)
-            hinglish_text_part = []
-            for i in changed_hing_idx:
-                try:
-                    hinglish_text_part.append(test_text[i])
-                except:
-                    pass
-            # print(hinglish_text_part)
-
-            changed_text2 = []
-            changed_idx2 = []
-            # print("1st hing", changed_text2)
-            for i in range(len(hinglish_text_part)):
-                for key in hinglish_vocab:
-                    done = 0
-                    for val in hinglish_vocab[key]:
-                        if hinglish_text_part[i] == val:
-                            # print("KEY = ", key, "VAL =", val, "i =", test_text[i], "ADJENCENCY_DATA =", adjacency_data[key])
-                            # print(key, val, hinglish_text_part[i])
-                            changed_text2.append(key)
-                            changed_idx2.append(i)
-                            not_changed_idx[i] = 1
-                            done = 1
-                            # print("breaking")
-                            break
-                    if done == 1:
-                        # print("breaking again")
-                        break
-
-            # making changed text and idx to a dictionary with two lists
-            normalized_string2 = []
-            # print("changed_text 2 ", changed_text2)
-            res2 = dict(zip(changed_idx2, changed_text2))
-            # print(res2)
-            for i in range(len(hinglish_text_part)):
-                try:
-                    normalized_string2.append(res2[i])
-                except:
-                    normalized_string2.append(hinglish_text_part[i])
-            # print("normalised string 2 :", normalized_string2)
-
-            changed_idx = list(set(changed_idx))
-            changed_idx.sort()
-            # print("changed idx", changed_idx)
-            for i in changed_idx:
-                normalized_string2.append(res[i])
-
-            print("Hinglish Normalized String : ", normalized_string)
-            # print(not_changed_idx)
-
-            # finding phoneme and levenshtein distance for unchanged words
-            for i in range(len(not_changed_idx)):
-                try:
-                    if not_changed_idx[i] == 0:
-                        eng_phoneme_correction = []
-                        for j in english_vocab:
-                            # print(normalized_string2[i], j)
-                            try:
-                                phoneme = rs.distance(normalized_string2[i], j)
-                            except:
-                                pass
-                            if phoneme <= 1:
-                                eng_phoneme_correction.append(j)
-                        eng_lev_correction = []
-                        for k in eng_phoneme_correction:
-                            dist = lev(normalized_string2[i], k)
-                            if dist <= 2:
-                                eng_lev_correction.append(k)
-                        # print(eng_phoneme_correction)
-                        # print(eng_lev_correction)
-
-                        hing_phoneme_correction = []
-                        for j in hinglish_vocab:
-                            try:
-                                phoneme = rs.distance(normalized_string2[i], j)
-                            except:
-                                pass
-                            if phoneme <= 1:
-                                hing_phoneme_correction.append(j)
-                        hing_lev_correction = []
-                        for k in hing_phoneme_correction:
-                            dist = lev(normalized_string2[i], k)
-                            if dist <= 2:
-                                hing_lev_correction.append(k)
-                        # print(hing_phoneme_correction)
-                        # print(hing_lev_correction)
-
-                        eng_lev_correction.extend(hing_lev_correction)
-                        new_correction = eng_lev_correction
-                        eng_lev_correction = []
-                        # hing_lev_correction = []
-                        # print(eng_lev_correction)
-
-                        for l in new_correction:
-                            dist = lev(normalized_string2[i], l)
-                            eng_lev_correction.append(dist)
-                        min_val = min(eng_lev_correction)
-                        min_idx = eng_lev_correction.index(min_val)
-
-                        suggestion = dictn.suggest(new_correction[min_idx])
-                        suggestion_lit = []
-                        for t in suggestion:
-                            dist = lev(new_correction[min_idx], t)
-                            suggestion_lit.append(dist)
-                        min_suggestion_val = min(suggestion_lit)
-                        min_suggestion_idx = suggestion_lit.index(min_suggestion_val)
-                        # print("Suggestions : ", min_suggestion_val)
-                        # print(suggestion[min_suggestion_idx])
-
-                        normalized_string2[i] = suggestion[min_suggestion_idx]
-                except:
-                    pass
-            normalized_string = normalized_string2
-            normalized_string_final = normalized_string2
-            print("Phoneme Levenshtein Dictionary suggestion Normalized String : ", normalized_string_final)
-            # sentence tagging
-            classifier = joblib.load(r"./classifer.joblib")
-            classify = []
-            for i in normalized_string:
-                test_classify = classifier(i)
-                classify.append(test_classify[0].get("label"))
-
-            # print(normalized_string)
-            # print(classify)
-
-            for i in range(len(classify)):
-                if classify[i] == 'en':
-                    try:
-                        normalized_string[i] = translator.translate(normalized_string[i], src='en', dest='hi').text
-                    except:
-                        normalized_string[i] = "delete"
-            print("English -> Hindi Translated String : ", normalized_string)
-
-            conversion_list = []
-            for i in tqdm(normalized_string):
-                conversion_list.append(trn.transform(i))
-
-            print("Hinglish -> Hindi Transliterated String : ", conversion_list)
-            conversion_list = normalized_string
-            string = ""
-            sentence = []
-            for i in conversion_list:
-                string = i + ' ' + string
-            sentence.append(string)
-            translated = []
-            for i in tqdm(sentence):
-                try:
-                    translated_text = translator.translate(i, src='hi', dest='en')
-                    translated.append(translated_text.text)
-                except:
-                    translated.append("delete")
-            print("Hindi -> English Translated String : ", translated)
-            total_translated.append(translated[0])
-
-        total_translated = pd.DataFrame(total_translated)
-
-        st.write("English Normalized String:", normalized_string)
-        st.write("Hinglish Normalized String:", normalized_string)
-        st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:", normalized_string_final)
-        st.write("English -> Hindi Translated String:", normalized_string)
-        st.write("Hinglish -> Hindi Transliterated String:", conversion_list)
-        st.write("Hindi -> English Translated String:", translated)

 if __name__ == '__main__':
     main()
 
 import streamlit as st
+import requests
+import os

 def main():
+    st.title('Download File from OneDrive')
+
+    download_link = "https://upesstd-my.sharepoint.com/:u:/g/personal/500082340_stu_upes_ac_in/EYwRTq9dcTJHppgydRR-8BMBYY2BehA6jxri5rKehcSZig?e=fjAYDf"
+
+    if st.button('Download File'):
+        response = requests.get(download_link, allow_redirects=True)
+        if response.status_code == 200:
+            filename = "downloaded_file.ext"  # You can customize the filename
+            with open(filename, 'wb') as file:
+                file.write(response.content)
+            st.success(f"File downloaded successfully and saved as {filename}")
+        else:
+            st.error(f"Failed to download the file. Status code: {response.status_code}")
+
+if __name__ == '__main__':
+    main()
+import streamlit as st
+import requests
+import os
+
+def main():
+    st.title('Download File from OneDrive')
+
+    download_link = "https://upesstd-my.sharepoint.com/:u:/g/personal/500082340_stu_upes_ac_in/EYwRTq9dcTJHppgydRR-8BMBYY2BehA6jxri5rKehcSZig?e=fjAYDf"
+
+    if st.button('Download File'):
+        response = requests.get(download_link, allow_redirects=True)
+        if response.status_code == 200:
+            filename = "classifer.joblib"  # You can customize the filename
+            with open(filename, 'wb') as file:
+                file.write(response.content)
+            st.success(f"File downloaded successfully and saved as {filename}")
+        else:
+            st.error(f"Failed to download the file. Status code: {response.status_code}")

 if __name__ == '__main__':
     main()
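
Note on the new download flow: a OneDrive/SharePoint sharing URL like the one above often resolves to an HTML preview page rather than the raw file, in which case the bytes saved to classifer.joblib would not be a valid joblib archive. Below is a minimal sketch of a more robust fetch, assuming the common (but not guaranteed) SharePoint behavior that appending download=1 to a sharing link forces a direct download; download_shared_file is a hypothetical helper, not part of this commit:

import requests

def download_shared_file(share_url: str, filename: str) -> str:
    # Assumption: "download=1" asks SharePoint/OneDrive for the file itself
    # instead of the HTML preview page.
    sep = '&' if '?' in share_url else '?'
    direct_url = f"{share_url}{sep}download=1"
    with requests.get(direct_url, allow_redirects=True, stream=True, timeout=60) as resp:
        resp.raise_for_status()  # surface HTTP errors instead of saving an error page
        with open(filename, 'wb') as fh:
            for chunk in resp.iter_content(chunk_size=1 << 20):  # 1 MiB chunks
                fh.write(chunk)
    return filename

# Hypothetical usage with the link from app.py:
# download_shared_file(download_link, "classifer.joblib")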
app_test.py ADDED
@@ -0,0 +1,283 @@
+import streamlit as st
+import pandas as pd
+import pickle
+from tqdm import tqdm
+from Levenshtein import distance as lev
+import joblib
+from googletrans import Translator
+from indictrans import Transliterator
+from pyphonetics import RefinedSoundex
+import enchant
+from bs4 import BeautifulSoup
+import re
+
+def main():
+    st.title('Text Processing App')
+
+    dictn = enchant.Dict("en_US")
+    rs = RefinedSoundex()
+    normalized_string_final = []
+    translator = Translator()
+    trn = Transliterator(source='eng', target='hin')
+
+    with open(r'./english_vocab.pkl', "rb") as fp:
+        english = pickle.load(fp)
+    english_vocab = english
+    with open(r'./hinglish_vocab.pkl', "rb") as fp:
+        hinglish = pickle.load(fp)
+    hinglish_vocab = hinglish
+
+    english_vocab['and'] = ['and']
+    english_vocab['is'] = ['is']
+
+    def clean_tweet(tweet):
+        text = re.sub(r'@ [A-Za-z0-9\']+', '', tweet)
+        text = BeautifulSoup(text, 'lxml').get_text()
+        text = re.sub(r'https (//)[A-Za-z0-9. ]*(/) [A-Za-z0-9]+', '', text)
+        text = re.sub(r'https[A-Za-z0-9/. ]*', '', text)
+        text = re.sub("[^a-zA-Z]", " ", text)
+        text = re.sub(r'\bRT\b', ' ', text)
+        text = re.sub(r'\bnan\b', ' ', text)
+        return text
+
+    input_text = st.text_area("Enter the text:")
+    total_translated = []
+    if st.button('Process'):
+        # Create a DataFrame with the user input text
+        data = {'Text': [input_text]}
+        df1 = pd.DataFrame(data)
+
+        # Apply the clean_tweet function to the user input text
+        df1['Text'] = df1['Text'].apply(clean_tweet)
+
+        # Extract the cleaned text
+        cleaned_text = df1['Text'].tolist()[0]
+
+        # Process the cleaned text further if needed
+        total_text = [cleaned_text]
+        st.write("Input Text:", total_text)
+
+        for i in tqdm(total_text):
+            test_text = i.split()
+
+            # english word change from vocab
+            not_changed_idx = []
+            for i in range(len(test_text)):
+                not_changed_idx.append(0)
+
+            changed_text = []
+            changed_idx = []
+            # print("1st", changed_text)
+            for i in range(len(test_text)):
+                for key in english_vocab:
+                    done = 0
+                    for val in english_vocab[key]:
+                        if test_text[i] == val:
+                            # print("KEY = ", key, "VAL =", val, "i =", test_text[i], "ADJENCENCY_DATA =", adjacency_data[key])
+                            # print("yahan par", key, val, test_text[i])
+                            changed_text.append(key)
+                            changed_idx.append(i)
+                            not_changed_idx[i] = 1
+                            done = 1
+                            # print("breaking")
+                            break
+                    if done == 1:
+                        # print("breaking again")
+                        break
+
+            normalized_string = []
+
+            # making changed text and idx to a dictionary with two lists
+            res = dict(zip(changed_idx, changed_text))
+            # print(res)
+            for i in range(len(test_text)):
+                try:
+                    normalized_string.append(res[i])
+                except:
+                    normalized_string.append(test_text[i])
+            print("English Normalized String : ", normalized_string)
+
+            # hinglish word change
+            test_list = [i for i in range(len(test_text))]
+            changed_hing_idx = [i for i in test_list if i not in changed_idx]
+            # print(changed_hing_idx)
+            hinglish_text_part = []
+            for i in changed_hing_idx:
+                try:
+                    hinglish_text_part.append(test_text[i])
+                except:
+                    pass
+            # print(hinglish_text_part)
+
+            changed_text2 = []
+            changed_idx2 = []
+            # print("1st hing", changed_text2)
+            for i in range(len(hinglish_text_part)):
+                for key in hinglish_vocab:
+                    done = 0
+                    for val in hinglish_vocab[key]:
+                        if hinglish_text_part[i] == val:
+                            # print("KEY = ", key, "VAL =", val, "i =", test_text[i], "ADJENCENCY_DATA =", adjacency_data[key])
+                            # print(key, val, hinglish_text_part[i])
+                            changed_text2.append(key)
+                            changed_idx2.append(i)
+                            not_changed_idx[i] = 1
+                            done = 1
+                            # print("breaking")
+                            break
+                    if done == 1:
+                        # print("breaking again")
+                        break
+
+            # making changed text and idx to a dictionary with two lists
+            normalized_string2 = []
+            # print("changed_text 2 ", changed_text2)
+            res2 = dict(zip(changed_idx2, changed_text2))
+            # print(res2)
+            for i in range(len(hinglish_text_part)):
+                try:
+                    normalized_string2.append(res2[i])
+                except:
+                    normalized_string2.append(hinglish_text_part[i])
+            # print("normalised string 2 :", normalized_string2)
+
+            changed_idx = list(set(changed_idx))
+            changed_idx.sort()
+            # print("changed idx", changed_idx)
+            for i in changed_idx:
+                normalized_string2.append(res[i])
+
+            print("Hinglish Normalized String : ", normalized_string)
+            # print(not_changed_idx)
+
+            # finding phoneme and levenshtein distance for unchanged words
+            for i in range(len(not_changed_idx)):
+                try:
+                    if not_changed_idx[i] == 0:
+                        eng_phoneme_correction = []
+                        for j in english_vocab:
+                            # print(normalized_string2[i], j)
+                            try:
+                                phoneme = rs.distance(normalized_string2[i], j)
+                            except:
+                                pass
+                            if phoneme <= 1:
+                                eng_phoneme_correction.append(j)
+                        eng_lev_correction = []
+                        for k in eng_phoneme_correction:
+                            dist = lev(normalized_string2[i], k)
+                            if dist <= 2:
+                                eng_lev_correction.append(k)
+                        # print(eng_phoneme_correction)
+                        # print(eng_lev_correction)
+
+                        hing_phoneme_correction = []
+                        for j in hinglish_vocab:
+                            try:
+                                phoneme = rs.distance(normalized_string2[i], j)
+                            except:
+                                pass
+                            if phoneme <= 1:
+                                hing_phoneme_correction.append(j)
+                        hing_lev_correction = []
+                        for k in hing_phoneme_correction:
+                            dist = lev(normalized_string2[i], k)
+                            if dist <= 2:
+                                hing_lev_correction.append(k)
+                        # print(hing_phoneme_correction)
+                        # print(hing_lev_correction)
+
+                        eng_lev_correction.extend(hing_lev_correction)
+                        new_correction = eng_lev_correction
+                        eng_lev_correction = []
+                        # hing_lev_correction = []
+                        # print(eng_lev_correction)
+
+                        for l in new_correction:
+                            dist = lev(normalized_string2[i], l)
+                            eng_lev_correction.append(dist)
+                        min_val = min(eng_lev_correction)
+                        min_idx = eng_lev_correction.index(min_val)
+
+                        suggestion = dictn.suggest(new_correction[min_idx])
+                        suggestion_lit = []
+                        for t in suggestion:
+                            dist = lev(new_correction[min_idx], t)
+                            suggestion_lit.append(dist)
+                        min_suggestion_val = min(suggestion_lit)
+                        min_suggestion_idx = suggestion_lit.index(min_suggestion_val)
+                        # print("Suggestions : ", min_suggestion_val)
+                        # print(suggestion[min_suggestion_idx])
+
+                        normalized_string2[i] = suggestion[min_suggestion_idx]
+                except:
+                    pass
+            normalized_string = normalized_string2
+            normalized_string_final = normalized_string2
+            print("Phoneme Levenshtein Dictionary suggestion Normalized String : ", normalized_string_final)
+            # sentence tagging
+            classifier = joblib.load(r"./classifer.joblib")
+            classify = []
+            for i in normalized_string:
+                test_classify = classifier(i)
+                classify.append(test_classify[0].get("label"))
+
+            # print(normalized_string)
+            # print(classify)
+
+            for i in range(len(classify)):
+                if classify[i] == 'en':
+                    try:
+                        normalized_string[i] = translator.translate(normalized_string[i], src='en', dest='hi').text
+                    except:
+                        normalized_string[i] = "delete"
+            print("English -> Hindi Translated String : ", normalized_string)
+
+            conversion_list = []
+            for i in tqdm(normalized_string):
+                conversion_list.append(trn.transform(i))
+
+            print("Hinglish -> Hindi Transliterated String : ", conversion_list)
+            conversion_list = normalized_string
+            string = ""
+            sentence = []
+            for i in conversion_list:
+                string = i + ' ' + string
+            sentence.append(string)
+            translated = []
+            for i in tqdm(sentence):
+                try:
+                    translated_text = translator.translate(i, src='hi', dest='en')
+                    translated.append(translated_text.text)
+                except:
+                    translated.append("delete")
+            print("Hindi -> English Translated String : ", translated)
+            total_translated.append(translated[0])
+
+        total_translated = pd.DataFrame(total_translated)
+
+        st.write("English Normalized String:", normalized_string)
+        st.write("Hinglish Normalized String:", normalized_string)
+        st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:", normalized_string_final)
+        st.write("English -> Hindi Translated String:", normalized_string)
+        st.write("Hinglish -> Hindi Transliterated String:", conversion_list)
+        st.write("Hindi -> English Translated String:", translated)
+
+if __name__ == '__main__':
+    main()
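
The unmatched-token correction is the densest part of app_test.py, so here is a minimal sketch of the idea under the same assumptions the app makes (pyphonetics' RefinedSoundex and python-Levenshtein installed): a token that survives both vocab passes unmatched is replaced by the vocabulary candidate that is phonetically close (RefinedSoundex distance <= 1) and within edit distance 2, preferring the smallest edit distance. best_correction is an illustrative helper, not a function from the commit:

from Levenshtein import distance as lev
from pyphonetics import RefinedSoundex

rs = RefinedSoundex()

def best_correction(word, vocab):
    candidates = []
    for cand in vocab:
        try:
            # Phonetic gate first, then an edit-distance gate, as in the app.
            if rs.distance(word, cand) <= 1 and lev(word, cand) <= 2:
                candidates.append(cand)
        except Exception:
            pass  # RefinedSoundex can fail on non-alphabetic tokens
    # Keep the candidate with the smallest edit distance; fall back to the word.
    return min(candidates, key=lambda c: lev(word, c), default=word)

# Hypothetical usage:
# best_correction("gud", ["good", "god", "guide"])  # -> "god" (edit distance 1)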
requirements.txt CHANGED
@@ -1,4 +1,3 @@
-git+https://github.com/libindic/indic-trans.git
 streamlit
 pandas
 pickle5
@@ -8,5 +7,7 @@ joblib
 googletrans==4.0.0-rc1
 pyphonetics
 pyenchant
+enchant
 beautifulsoup4
-lxml
+lxml
+
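
One packaging note: the enchant module imported by app_test.py is provided by the pyenchant distribution, which was already listed, so the newly added enchant line is most likely redundant (and on some platforms pyenchant also needs the system Enchant C library). Note also that the commit drops the git+.../indic-trans.git requirement while app_test.py still imports indictrans, so that package presumably needs to be installed separately. A quick sanity check after installing:

import enchant  # provided by the "pyenchant" package on PyPI

d = enchant.Dict("en_US")
print(d.check("hello"))       # True for a correctly spelled word
print(d.suggest("helo")[:3])  # a few spelling suggestions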