Tihsrah-CD committed on
Commit
9c2faf5
·
1 Parent(s): 5d99706
Files changed (1) hide show
  1. app.py +66 -168
app.py CHANGED
@@ -1,17 +1,3 @@
1
- import subprocess
2
-
3
- # Run the pip install command for pyenchant
4
- subprocess.run(["pip", "install", "pyenchant"], check=True)
5
-
6
- # # Run the first command
7
- # subprocess.run(["apt", "install", "enchant", "--fix-missing", "-y"], check=True)
8
-
9
- # # Run the second command
10
- # subprocess.run(["apt", "install", "-qq", "enchant", "-y"], check=True)
11
-
12
- # # Run the pip install command for pyenchant
13
- # subprocess.run(["pip", "install", "pyenchant"], check=True)
14
-
15
  import streamlit as st
16
  import pandas as pd
17
  import pickle
@@ -21,24 +7,31 @@ import joblib
21
  from googletrans import Translator
22
  from indictrans import Transliterator
23
  from pyphonetics import RefinedSoundex
24
- import enchant
25
  from bs4 import BeautifulSoup
26
  import re
27
 
 
 
 
 
 
 
 
 
 
 
28
  def main():
29
  st.title('Text Processing App')
30
-
31
- dictn = enchant.Dict("en_US")
32
  rs = RefinedSoundex()
33
  normalized_string_final=[]
34
  translator = Translator()
35
  trn = Transliterator(source='eng', target='hin')
36
 
37
  with open(r'./english_vocab.pkl', "rb") as fp:
38
- english = pickle.load(fp)
39
  english_vocab=english
40
  with open(r'./hinglish_vocab.pkl', "rb") as fp:
41
- hinglish = pickle.load(fp)
42
  hinglish_vocab=hinglish
43
 
44
  english_vocab['and'] = ['and']
@@ -57,241 +50,146 @@ def main():
57
  input_text = st.text_area("Enter the text:")
58
  total_translated = []
59
  if st.button('Process'):
60
- # Create a DataFrame with the user input text
61
  data = {'Text': [input_text]}
62
  df1 = pd.DataFrame(data)
63
-
64
- # Apply the clean_tweet function to the user input text
65
  df1['Text'] = df1['Text'].apply(clean_tweet)
66
-
67
- # Extract the cleaned text
68
  cleaned_text = df1['Text'].tolist()[0]
69
-
70
- # Process the cleaned text further if needed
71
  total_text = [cleaned_text]
72
  st.write("Input Text:", total_text)
73
-
74
  for i in tqdm(total_text):
75
  test_text=i.split()
76
-
77
- # english word change from vocab
78
  not_changed_idx=[]
79
  for i in range(len(test_text)):
80
  not_changed_idx.append(0)
81
-
82
  changed_text=[]
83
  changed_idx=[]
84
- # print("1st",changed_text)
85
- for i in range(len(test_text)):
86
 
 
87
  for key in english_vocab:
88
  done=0
89
  for val in english_vocab[key]:
90
  if(test_text[i]==val):
91
- # print("KEY = ",key,"VAL =",val,"i =",test_text[i],"ADJENCENCY_DATA =",adjacency_data[key])
92
- # print("yahan par",key,val,test_text[i])
93
  changed_text.append(key)
94
  changed_idx.append(i)
95
  not_changed_idx[i]=1
96
  done=1
97
- # print("breaking")
98
  break
99
  if done==1:
100
- # print("breaking again")
101
  break
102
-
103
- normalized_string=[]
104
 
105
- # making changed text and idx to a dictionary with two lists
 
106
  res = dict(zip(changed_idx, changed_text))
107
- # print(res)
108
  for i in range(len(test_text)):
109
  try:
110
  normalized_string.append(res[i])
111
  except:
112
  normalized_string.append(test_text[i])
113
- print("English Normalized String : ",normalized_string)
114
-
115
 
116
  # hinglish word change
117
  test_list = [i for i in range(len(test_text))]
118
  changed_hing_idx = [i for i in test_list if i not in changed_idx]
119
- # print(changed_hing_idx)
120
- hinglish_text_part=[]
121
- for i in changed_hing_idx:
122
- try:
123
- hinglish_text_part.append(test_text[i])
124
- except:
125
- pass
126
- # print(hinglish_text_part)
127
 
128
- changed_text2=[]
129
- changed_idx2=[]
130
- # print("1st hing",changed_text2)
131
  for i in range(len(hinglish_text_part)):
132
-
133
  for key in hinglish_vocab:
134
- done=0
135
- for val in hinglish_vocab[key]:
136
- if(hinglish_text_part[i]==val):
137
- # print("KEY = ",key,"VAL =",val,"i =",test_text[i],"ADJENCENCY_DATA =",adjacency_data[key])
138
- # print(key,val,hinglish_text_part[i])
139
  changed_text2.append(key)
140
  changed_idx2.append(i)
141
- not_changed_idx[i]=1
142
- done=1
143
- # print("breaking")
144
  break
145
- if done==1:
146
- # print("breaking again")
147
  break
148
 
149
-
150
- # making changed text and idx to a dictionary with two lists
151
- normalized_string2=[]
152
- # print("changed_text 2 ",changed_text2)
153
  res2 = dict(zip(changed_idx2, changed_text2))
154
- # print(res2)
155
  for i in range(len(hinglish_text_part)):
156
  try:
157
  normalized_string2.append(res2[i])
158
  except:
159
  normalized_string2.append(hinglish_text_part[i])
160
- # print("normalised string 2 :",normalized_string2)
161
-
162
 
163
- changed_idx=list(set(changed_idx))
164
- changed_idx.sort()
165
- # print("changed idx",changed_idx)
166
  for i in changed_idx:
167
  normalized_string2.append(res[i])
168
 
169
- print("Hinglish Normalized String : ",normalized_string)
170
- # print(not_changed_idx)
171
-
172
 
173
  # finding phoneme and Levenshtein distance for unchanged words
174
-
175
  for i in range(len(not_changed_idx)):
176
  try:
177
- if not_changed_idx[i]==0:
178
- eng_phoneme_correction=[]
179
  for j in english_vocab:
180
- # print(normalized_string2[i],j)
181
  try:
182
- phoneme=rs.distance(normalized_string2[i],j)
183
  except:
184
  pass
185
- if phoneme<=1:
186
  eng_phoneme_correction.append(j)
187
- eng_lev_correction=[]
188
  for k in eng_phoneme_correction:
189
- dist=lev(normalized_string2[i],k)
190
- if dist <=2:
191
  eng_lev_correction.append(k)
192
- # print(eng_phoneme_correction)
193
- # print(eng_lev_correction)
194
-
195
-
196
- hing_phoneme_correction=[]
197
- for j in hinglish_vocab:
198
- try:
199
- phoneme=rs.distance(normalized_string2[i],j)
200
- except:
201
- pass
202
- if phoneme<=1:
203
- hing_phoneme_correction.append(j)
204
- hing_lev_correction=[]
205
- for k in hing_phoneme_correction:
206
- dist=lev(normalized_string2[i],k)
207
- if dist <=2:
208
- hing_lev_correction.append(k)
209
- # print(hing_phoneme_correction)
210
- # print(hing_lev_correction)
211
 
212
  eng_lev_correction.extend(hing_lev_correction)
213
- new_correction=eng_lev_correction
214
- eng_lev_correction=[]
215
- # hing_lev_correction=[]
216
- # print(eng_lev_correction)
217
-
218
  for l in new_correction:
219
- dist=lev(normalized_string2[i],l)
220
  eng_lev_correction.append(dist)
221
- min_val=min(eng_lev_correction)
222
- min_idx=eng_lev_correction.index(min_val)
223
-
224
-
225
- suggestion=dictn.suggest(new_correction[min_idx])
226
- suggestion_lit=[]
227
- for t in suggestion:
228
- dist=lev(new_correction[min_idx],t)
229
- suggestion_lit.append(dist)
230
- min_suggestion_val=min(suggestion_lit)
231
- min_suggestion_idx=suggestion_lit.index(min_suggestion_val)
232
- # print("Suggestions : ",min_suggestion_val)
233
- # print(suggestion[min_suggestion_idx])
234
 
235
-
236
-
237
- normalized_string2[i]=suggestion[min_suggestion_idx]
238
  except:
239
  pass
240
- normalized_string=normalized_string2
241
- normalized_string_final=normalized_string2
242
- print("Phoneme levenshtein Distionary suggestion Normalized String : ",normalized_string_final)
 
243
  # sentence tagging
244
- classifier=joblib.load(r"./classifer.joblib")
245
- classify=[]
246
  for i in normalized_string:
247
- test_classify=classifier(i)
248
  classify.append(test_classify[0].get("label"))
249
 
250
- # print(normalized_string)
251
- # print(classify)
252
-
253
  for i in range(len(classify)):
254
- if classify[i]=='en':
255
  try:
256
- normalized_string[i]=translator.translate(normalized_string[i] ,src='en',dest='hi').text
257
  except:
258
- normalized_string[i]="delete"
259
- print("English -> Hindi Translated String : ",normalized_string)
260
-
261
 
262
- conversion_list=[]
 
263
 
264
- for i in tqdm(normalized_string):
265
- conversion_list.append(trn.transform(i))
266
-
267
- print("Hinglish -> Hindi Transliterated String : ",conversion_list)
268
- conversion_list=normalized_string
269
- string=""
270
- sentence=[]
271
- for i in conversion_list:
272
- string=i+' '+string
273
- sentence.append(string)
274
- translated=[]
275
- for i in tqdm(sentence):
276
  try:
277
- translated_text = translator.translate(i ,src='hi',dest='en')
278
  translated.append(translated_text.text)
279
  except:
280
  translated.append("delete")
281
- print("Hindi -> English Translated String : ",translated)
282
  total_translated.append(translated[0])
283
 
284
- total_translated=pd.DataFrame(total_translated)
285
-
286
-
287
-
288
-
289
- st.write("English Normalized String:", normalized_string)
290
- st.write("Hinglish Normalized String:", normalized_string)
291
- st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:", normalized_string_final)
292
- st.write("English -> Hindi Translated String:", normalized_string)
293
- st.write("Hinglish -> Hindi Transliterated String:", conversion_list)
294
- st.write("Hindi -> English Translated String:", translated)
295
 
296
  if __name__ == '__main__':
297
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import pickle
 
7
  from googletrans import Translator
8
  from indictrans import Transliterator
9
  from pyphonetics import RefinedSoundex
 
10
  from bs4 import BeautifulSoup
11
  import re
12
 
13
def closest_match(word, vocabulary):
    """Return the entry of ``vocabulary`` nearest to ``word`` under ``lev``.

    Every entry is scored with ``lev(word, entry)`` and the lowest-scoring
    entry is returned. When several entries share the lowest score, the one
    met first while iterating ``vocabulary`` is kept. An empty ``vocabulary``
    yields ``None``.

    NOTE(review): ``lev`` is bound outside the lines visible here; it is
    presumably an edit-distance function returning a number -- confirm
    against the file's imports.
    """
    # min() keeps the first of several equal minima, matching a strict
    # "less than" scan; default=None covers the empty-vocabulary case.
    return min(
        vocabulary,
        key=lambda candidate: lev(word, candidate),
        default=None,
    )
22
+
23
  def main():
24
  st.title('Text Processing App')
 
 
25
  rs = RefinedSoundex()
26
  normalized_string_final=[]
27
  translator = Translator()
28
  trn = Transliterator(source='eng', target='hin')
29
 
30
  with open(r'./english_vocab.pkl', "rb") as fp:
31
+ english = pickle.load(fp)
32
  english_vocab=english
33
  with open(r'./hinglish_vocab.pkl', "rb") as fp:
34
+ hinglish = pickle.load(fp)
35
  hinglish_vocab=hinglish
36
 
37
  english_vocab['and'] = ['and']
 
50
  input_text = st.text_area("Enter the text:")
51
  total_translated = []
52
  if st.button('Process'):
 
53
  data = {'Text': [input_text]}
54
  df1 = pd.DataFrame(data)
 
 
55
  df1['Text'] = df1['Text'].apply(clean_tweet)
 
 
56
  cleaned_text = df1['Text'].tolist()[0]
 
 
57
  total_text = [cleaned_text]
58
  st.write("Input Text:", total_text)
59
+
60
  for i in tqdm(total_text):
61
  test_text=i.split()
 
 
62
  not_changed_idx=[]
63
  for i in range(len(test_text)):
64
  not_changed_idx.append(0)
 
65
  changed_text=[]
66
  changed_idx=[]
 
 
67
 
68
+ for i in range(len(test_text)):
69
  for key in english_vocab:
70
  done=0
71
  for val in english_vocab[key]:
72
  if(test_text[i]==val):
 
 
73
  changed_text.append(key)
74
  changed_idx.append(i)
75
  not_changed_idx[i]=1
76
  done=1
 
77
  break
78
  if done==1:
 
79
  break
 
 
80
 
81
+
82
+ normalized_string=[]
83
  res = dict(zip(changed_idx, changed_text))
 
84
  for i in range(len(test_text)):
85
  try:
86
  normalized_string.append(res[i])
87
  except:
88
  normalized_string.append(test_text[i])
89
+ print("English Normalized String:", normalized_string)
 
90
 
91
  # hinglish word change
92
  test_list = [i for i in range(len(test_text))]
93
  changed_hing_idx = [i for i in test_list if i not in changed_idx]
94
+ hinglish_text_part = [test_text[i] for i in changed_hing_idx]
95
+ changed_text2 = []
96
+ changed_idx2 = []
 
 
 
 
 
97
 
 
 
 
98
  for i in range(len(hinglish_text_part)):
 
99
  for key in hinglish_vocab:
100
+ done = 0
101
+ for val in hinglish_vocab[key]:
102
+ if hinglish_text_part[i] == val:
 
 
103
  changed_text2.append(key)
104
  changed_idx2.append(i)
105
+ done = 1
 
 
106
  break
107
+ if done == 1:
 
108
  break
109
 
110
+ normalized_string2 = []
 
 
 
111
  res2 = dict(zip(changed_idx2, changed_text2))
 
112
  for i in range(len(hinglish_text_part)):
113
  try:
114
  normalized_string2.append(res2[i])
115
  except:
116
  normalized_string2.append(hinglish_text_part[i])
 
 
117
 
 
 
 
118
  for i in changed_idx:
119
  normalized_string2.append(res[i])
120
 
121
+ print("Hinglish Normalized String:", normalized_string)
 
 
122
 
123
  # finding phoneme and Levenshtein distance for unchanged words
 
124
  for i in range(len(not_changed_idx)):
125
  try:
126
+ if not_changed_idx[i] == 0:
127
+ eng_phoneme_correction = []
128
  for j in english_vocab:
 
129
  try:
130
+ phoneme = rs.distance(normalized_string2[i], j)
131
  except:
132
  pass
133
+ if phoneme <= 1:
134
  eng_phoneme_correction.append(j)
135
+ eng_lev_correction = []
136
  for k in eng_phoneme_correction:
137
+ dist = lev(normalized_string2[i], k)
138
+ if dist <= 2:
139
  eng_lev_correction.append(k)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  eng_lev_correction.extend(hing_lev_correction)
142
+ new_correction = eng_lev_correction
143
+ eng_lev_correction = []
 
 
 
144
  for l in new_correction:
145
+ dist = lev(normalized_string2[i], l)
146
  eng_lev_correction.append(dist)
147
+ min_val = min(eng_lev_correction)
148
+ min_idx = eng_lev_correction.index(min_val)
 
 
 
 
 
 
 
 
 
 
 
149
 
150
+ suggestion = closest_match(new_correction[min_idx], english_vocab.keys())
151
+ normalized_string2[i] = suggestion
 
152
  except:
153
  pass
154
+
155
+ normalized_string_final = normalized_string2
156
+ print("Phoneme levenshtein Distionary suggestion Normalized String:", normalized_string_final)
157
+
158
  # sentence tagging
159
+ classifier = joblib.load(r"./classifer.joblib")
160
+ classify = []
161
  for i in normalized_string:
162
+ test_classify = classifier(i)
163
  classify.append(test_classify[0].get("label"))
164
 
 
 
 
165
  for i in range(len(classify)):
166
+ if classify[i] == 'en':
167
  try:
168
+ normalized_string[i] = translator.translate(normalized_string[i], src='en', dest='hi').text
169
  except:
170
+ normalized_string[i] = "delete"
171
+ print("English -> Hindi Translated String:", normalized_string)
 
172
 
173
+ conversion_list = [trn.transform(i) for i in normalized_string]
174
+ print("Hinglish -> Hindi Transliterated String:", conversion_list)
175
 
176
+ sentence = [" ".join(conversion_list)]
177
+ translated = []
178
+ for i in sentence:
 
 
 
 
 
 
 
 
 
179
  try:
180
+ translated_text = translator.translate(i, src='hi', dest='en')
181
  translated.append(translated_text.text)
182
  except:
183
  translated.append("delete")
184
+ print("Hindi -> English Translated String:", translated)
185
  total_translated.append(translated[0])
186
 
187
+ st.write("English Normalized String:", normalized_string)
188
+ st.write("Hinglish Normalized String:", normalized_string)
189
+ st.write("Phoneme Levenshtein Dictionary Suggestion Normalized String:", normalized_string_final)
190
+ st.write("English -> Hindi Translated String:", normalized_string)
191
+ st.write("Hinglish -> Hindi Transliterated String:", conversion_list)
192
+ st.write("Hindi -> English Translated String:", translated)
 
 
 
 
 
193
 
194
  if __name__ == '__main__':
195
+ main()