Demosthene-OR committed on
Commit
580d952
·
1 Parent(s): da4bffa
Files changed (1) hide show
  1. tabs/exploration_tab.py +135 -135
tabs/exploration_tab.py CHANGED
@@ -94,169 +94,169 @@ full_txt_fr = load_data(dataPath+'/small_vocab_fr')
94
  if not st.session_state.reCalcule:
95
  full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
96
  full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
 
97
 
 
 
 
 
 
 
 
 
98
 
99
- def remove_stopwords(text, lang):
100
- stop_words = set(stopwords.words(lang))
101
- # stop_words will contain set all english stopwords
102
- filtered_sentence = []
103
- for word in text.split():
104
- if word not in stop_words:
105
- filtered_sentence.append(word)
106
- return " ".join(filtered_sentence)
107
 
108
- def clean_undesirable_from_text(sentence, lang):
109
-
110
- # Removing URLs
111
- sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence )
112
 
113
- # Removing Punctuations (we keep the . character)
114
- REPLACEMENTS = [("..", "."),
115
- (",", ""),
116
- (";", ""),
117
- (":", ""),
118
- ("?", ""),
119
- ('"', ""),
120
- ("-", " "),
121
- ("it's", "it is"),
122
- ("isn't","is not"),
123
- ("'", " ")
124
- ]
125
- for old, new in REPLACEMENTS:
126
- sentence = sentence.replace(old, new)
127
 
128
- # Removing Digits
129
- sentence= re.sub(r'[0-9]','',sentence)
130
 
131
- # Removing Additional Spaces
132
- sentence = re.sub(' +', ' ', sentence)
133
 
134
- return sentence
135
 
136
- def clean_untranslated_sentence(data1, data2):
137
- i=0
138
- while i<len(data1):
139
- if data1[i]==data2[i]:
140
- data1.pop(i)
141
- data2.pop(i)
142
- else: i+=1
143
- return data1,data2
144
 
145
- import spacy
146
 
147
- nlp_en = spacy.load('en_core_web_sm')
148
- nlp_fr = spacy.load('fr_core_news_sm')
149
 
150
 
151
- def lemmatize(sentence,lang):
152
- # Create a Doc object
153
- if lang=='en':
154
- nlp=nlp_en
155
- elif lang=='fr':
156
- nlp=nlp_fr
157
- else: return
158
- doc = nlp(sentence)
159
 
160
- # Create list of tokens from given string
161
- tokens = []
162
- for token in doc:
163
- tokens.append(token)
164
 
165
- lemmatized_sentence = " ".join([token.lemma_ for token in doc])
166
 
167
- return lemmatized_sentence
168
 
169
 
170
- def preprocess_txt (data, lang):
171
 
172
- word_count = collections.Counter()
173
- word_lem_count = collections.Counter()
174
- word_wosw_count = collections.Counter()
175
- corpus = []
176
- data_split = []
177
- sentence_length = []
178
- data_split_wo_stopwords = []
179
- data_length_wo_stopwords = []
180
- data_lem = []
181
- data_lem_length = []
182
 
183
- txt_en_one_string= ". ".join([s for s in data])
184
- txt_en_one_string = txt_en_one_string.replace('..', '.')
185
- txt_en_one_string = " "+clean_undesirable_from_text(txt_en_one_string, 'lang')
186
- data = txt_en_one_string.split('.')
187
- if data[-1]=="":
188
- data.pop(-1)
189
- for i in range(len(data)): # On enleve les ' ' qui commencent et finissent les phrases
190
- if data[i][0] == ' ':
191
- data[i]=data[i][1:]
192
- if data[i][-1] == ' ':
193
- data[i]=data[i][:-1]
194
- nb_phrases = len(data)
195
 
196
- # Création d'un tableau de mots (sentence_split)
197
- for i,sentence in enumerate(data):
198
- sentence_split = word_tokenize(sentence)
199
- word_count.update(sentence_split)
200
- data_split.append(sentence_split)
201
- sentence_length.append(len(sentence_split))
202
-
203
- # La lemmatisation et le nettoyage des stopword va se faire en batch pour des raisons de vitesse
204
- # (au lieu de le faire phrase par phrase)
205
- # Ces 2 processus nécéssitent de connaitre la langue du corpus
206
- if lang == 'en': l='english'
207
- elif lang=='fr': l='french'
208
- else: l="unknown"
209
-
210
- if l!="unknown":
211
- # Lemmatisation en 12 lots (On ne peut lemmatiser + de 1 M de caractères à la fois)
212
- data_lemmatized=""
213
- if lemmatize_to_do:
214
- n_batch = 12
215
- batch_size = round((nb_phrases/ n_batch)+0.5)
216
- for i in range(n_batch):
217
- to_lem = ".".join([s for s in data[i*batch_size:(i+1)*batch_size]])
218
- data_lemmatized = data_lemmatized+"."+lemmatize(to_lem,lang).lower()
219
-
220
- data_lem_for_sw = data_lemmatized[1:]
221
- data_lemmatized = data_lem_for_sw.split('.')
222
- for i in range(nb_phrases):
223
- data_lem.append(data_lemmatized[i].split())
224
- data_lem_length.append(len(data_lemmatized[i].split()))
225
- word_lem_count.update(data_lem[-1])
226
-
227
- # Elimination des StopWords en un lot
228
- # On élimine les Stopwords des phrases lémmatisés, si cette phase a eu lieu
229
- # (wosw signifie "WithOut Stop Words")
230
- if stopwords_to_do:
231
  if lemmatize_to_do:
232
- data_wosw = remove_stopwords(data_lem_for_sw,l)
233
- else:
234
- data_wosw = remove_stopwords(txt_en_one_string,l)
 
 
 
 
 
 
 
 
 
235
 
236
- data_wosw = data_wosw.split('.')
237
- for i in range(nb_phrases):
238
- data_split_wo_stopwords.append(data_wosw[i].split())
239
- data_length_wo_stopwords.append(len(data_wosw[i].split()))
240
- word_wosw_count.update(data_split_wo_stopwords[-1])
 
 
 
 
 
 
 
 
 
241
 
242
- corpus = list(word_count.keys())
243
 
244
- # Création d'un DataFrame txt_n_unique_val :
245
- # colonnes = mots
246
- # lignes = phases
247
- # valeur de la cellule = nombre d'occurence du mot dans la phrase
248
 
249
- ## BOW
250
- from sklearn.feature_extraction.text import CountVectorizer
251
- count_vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=r"[^' ']+" )
252
 
253
- # Calcul du nombre d'apparition de chaque mot dans la phrases
254
- countvectors = count_vectorizer.fit_transform(data)
255
- corpus = count_vectorizer.get_feature_names_out()
256
 
257
- txt_n_unique_val= pd.DataFrame(columns=corpus,index=range(nb_phrases), data=countvectors.todense()).astype(float)
258
 
259
- return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length
260
 
261
 
262
  def count_world(data):
 
94
  if not st.session_state.reCalcule:
95
  full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
96
  full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
97
+ else:
98
 
99
+ def remove_stopwords(text, lang):
100
+ stop_words = set(stopwords.words(lang))
101
+ # stop_words will contain set all english stopwords
102
+ filtered_sentence = []
103
+ for word in text.split():
104
+ if word not in stop_words:
105
+ filtered_sentence.append(word)
106
+ return " ".join(filtered_sentence)
107
 
108
+ def clean_undesirable_from_text(sentence, lang):
 
 
 
 
 
 
 
109
 
110
+ # Removing URLs
111
+ sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence )
 
 
112
 
113
+ # Removing Punctuations (we keep the . character)
114
+ REPLACEMENTS = [("..", "."),
115
+ (",", ""),
116
+ (";", ""),
117
+ (":", ""),
118
+ ("?", ""),
119
+ ('"', ""),
120
+ ("-", " "),
121
+ ("it's", "it is"),
122
+ ("isn't","is not"),
123
+ ("'", " ")
124
+ ]
125
+ for old, new in REPLACEMENTS:
126
+ sentence = sentence.replace(old, new)
127
 
128
+ # Removing Digits
129
+ sentence= re.sub(r'[0-9]','',sentence)
130
 
131
+ # Removing Additional Spaces
132
+ sentence = re.sub(' +', ' ', sentence)
133
 
134
+ return sentence
135
 
136
+ def clean_untranslated_sentence(data1, data2):
137
+ i=0
138
+ while i<len(data1):
139
+ if data1[i]==data2[i]:
140
+ data1.pop(i)
141
+ data2.pop(i)
142
+ else: i+=1
143
+ return data1,data2
144
 
145
+ import spacy
146
 
147
+ nlp_en = spacy.load('en_core_web_sm')
148
+ nlp_fr = spacy.load('fr_core_news_sm')
149
 
150
 
151
+ def lemmatize(sentence,lang):
152
+ # Create a Doc object
153
+ if lang=='en':
154
+ nlp=nlp_en
155
+ elif lang=='fr':
156
+ nlp=nlp_fr
157
+ else: return
158
+ doc = nlp(sentence)
159
 
160
+ # Create list of tokens from given string
161
+ tokens = []
162
+ for token in doc:
163
+ tokens.append(token)
164
 
165
+ lemmatized_sentence = " ".join([token.lemma_ for token in doc])
166
 
167
+ return lemmatized_sentence
168
 
169
 
170
+ def preprocess_txt (data, lang):
171
 
172
+ word_count = collections.Counter()
173
+ word_lem_count = collections.Counter()
174
+ word_wosw_count = collections.Counter()
175
+ corpus = []
176
+ data_split = []
177
+ sentence_length = []
178
+ data_split_wo_stopwords = []
179
+ data_length_wo_stopwords = []
180
+ data_lem = []
181
+ data_lem_length = []
182
 
183
+ txt_en_one_string= ". ".join([s for s in data])
184
+ txt_en_one_string = txt_en_one_string.replace('..', '.')
185
+ txt_en_one_string = " "+clean_undesirable_from_text(txt_en_one_string, 'lang')
186
+ data = txt_en_one_string.split('.')
187
+ if data[-1]=="":
188
+ data.pop(-1)
189
+ for i in range(len(data)): # On enleve les ' ' qui commencent et finissent les phrases
190
+ if data[i][0] == ' ':
191
+ data[i]=data[i][1:]
192
+ if data[i][-1] == ' ':
193
+ data[i]=data[i][:-1]
194
+ nb_phrases = len(data)
195
 
196
+ # Création d'un tableau de mots (sentence_split)
197
+ for i,sentence in enumerate(data):
198
+ sentence_split = word_tokenize(sentence)
199
+ word_count.update(sentence_split)
200
+ data_split.append(sentence_split)
201
+ sentence_length.append(len(sentence_split))
202
+
203
+ # La lemmatisation et le nettoyage des stopword va se faire en batch pour des raisons de vitesse
204
+ # (au lieu de le faire phrase par phrase)
205
+ # Ces 2 processus nécéssitent de connaitre la langue du corpus
206
+ if lang == 'en': l='english'
207
+ elif lang=='fr': l='french'
208
+ else: l="unknown"
209
+
210
+ if l!="unknown":
211
+ # Lemmatisation en 12 lots (On ne peut lemmatiser + de 1 M de caractères à la fois)
212
+ data_lemmatized=""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  if lemmatize_to_do:
214
+ n_batch = 12
215
+ batch_size = round((nb_phrases/ n_batch)+0.5)
216
+ for i in range(n_batch):
217
+ to_lem = ".".join([s for s in data[i*batch_size:(i+1)*batch_size]])
218
+ data_lemmatized = data_lemmatized+"."+lemmatize(to_lem,lang).lower()
219
+
220
+ data_lem_for_sw = data_lemmatized[1:]
221
+ data_lemmatized = data_lem_for_sw.split('.')
222
+ for i in range(nb_phrases):
223
+ data_lem.append(data_lemmatized[i].split())
224
+ data_lem_length.append(len(data_lemmatized[i].split()))
225
+ word_lem_count.update(data_lem[-1])
226
 
227
+ # Elimination des StopWords en un lot
228
+ # On élimine les Stopwords des phrases lémmatisés, si cette phase a eu lieu
229
+ # (wosw signifie "WithOut Stop Words")
230
+ if stopwords_to_do:
231
+ if lemmatize_to_do:
232
+ data_wosw = remove_stopwords(data_lem_for_sw,l)
233
+ else:
234
+ data_wosw = remove_stopwords(txt_en_one_string,l)
235
+
236
+ data_wosw = data_wosw.split('.')
237
+ for i in range(nb_phrases):
238
+ data_split_wo_stopwords.append(data_wosw[i].split())
239
+ data_length_wo_stopwords.append(len(data_wosw[i].split()))
240
+ word_wosw_count.update(data_split_wo_stopwords[-1])
241
 
242
+ corpus = list(word_count.keys())
243
 
244
+ # Création d'un DataFrame txt_n_unique_val :
245
+ # colonnes = mots
246
+ # lignes = phases
247
+ # valeur de la cellule = nombre d'occurence du mot dans la phrase
248
 
249
+ ## BOW
250
+ from sklearn.feature_extraction.text import CountVectorizer
251
+ count_vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=r"[^' ']+" )
252
 
253
+ # Calcul du nombre d'apparition de chaque mot dans la phrases
254
+ countvectors = count_vectorizer.fit_transform(data)
255
+ corpus = count_vectorizer.get_feature_names_out()
256
 
257
+ txt_n_unique_val= pd.DataFrame(columns=corpus,index=range(nb_phrases), data=countvectors.todense()).astype(float)
258
 
259
+ return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length
260
 
261
 
262
  def count_world(data):