Demosthene-OR
committed on
Commit
·
580d952
1
Parent(s):
da4bffa
Add
Browse files- tabs/exploration_tab.py +135 -135
tabs/exploration_tab.py
CHANGED
@@ -94,169 +94,169 @@ full_txt_fr = load_data(dataPath+'/small_vocab_fr')
|
|
94 |
if not st.session_state.reCalcule:
|
95 |
full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
|
96 |
full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
|
|
|
97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
|
99 |
-
def
|
100 |
-
stop_words = set(stopwords.words(lang))
|
101 |
-
# stop_words will contain set all english stopwords
|
102 |
-
filtered_sentence = []
|
103 |
-
for word in text.split():
|
104 |
-
if word not in stop_words:
|
105 |
-
filtered_sentence.append(word)
|
106 |
-
return " ".join(filtered_sentence)
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
# Removing URLs
|
111 |
-
sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence )
|
112 |
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
|
128 |
-
|
129 |
-
|
130 |
|
131 |
-
|
132 |
-
|
133 |
|
134 |
-
|
135 |
|
136 |
-
def clean_untranslated_sentence(data1, data2):
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
|
145 |
-
import spacy
|
146 |
|
147 |
-
nlp_en = spacy.load('en_core_web_sm')
|
148 |
-
nlp_fr = spacy.load('fr_core_news_sm')
|
149 |
|
150 |
|
151 |
-
def lemmatize(sentence,lang):
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
|
165 |
-
|
166 |
|
167 |
-
|
168 |
|
169 |
|
170 |
-
def preprocess_txt (data, lang):
|
171 |
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
if lemmatize_to_do:
|
214 |
-
n_batch = 12
|
215 |
-
batch_size = round((nb_phrases/ n_batch)+0.5)
|
216 |
-
for i in range(n_batch):
|
217 |
-
to_lem = ".".join([s for s in data[i*batch_size:(i+1)*batch_size]])
|
218 |
-
data_lemmatized = data_lemmatized+"."+lemmatize(to_lem,lang).lower()
|
219 |
-
|
220 |
-
data_lem_for_sw = data_lemmatized[1:]
|
221 |
-
data_lemmatized = data_lem_for_sw.split('.')
|
222 |
-
for i in range(nb_phrases):
|
223 |
-
data_lem.append(data_lemmatized[i].split())
|
224 |
-
data_lem_length.append(len(data_lemmatized[i].split()))
|
225 |
-
word_lem_count.update(data_lem[-1])
|
226 |
-
|
227 |
-
# Elimination des StopWords en un lot
|
228 |
-
# On élimine les Stopwords des phrases lémmatisés, si cette phase a eu lieu
|
229 |
-
# (wosw signifie "WithOut Stop Words")
|
230 |
-
if stopwords_to_do:
|
231 |
if lemmatize_to_do:
|
232 |
-
|
233 |
-
|
234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
235 |
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
|
242 |
-
|
243 |
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
|
257 |
-
|
258 |
|
259 |
-
|
260 |
|
261 |
|
262 |
def count_world(data):
|
|
|
94 |
if not st.session_state.reCalcule:
|
95 |
full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
|
96 |
full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
|
97 |
+
else:
|
98 |
|
99 |
+
def remove_stopwords(text, lang):
    """Return *text* without the stop words of language *lang*.

    The text is split on whitespace, every token present in
    ``stopwords.words(lang)`` is dropped, and the remaining tokens are
    joined back with single spaces. Matching is an exact string
    comparison, so it is case-sensitive and a token with punctuation
    attached (e.g. ``"the."``) is not considered a stop word.

    Args:
        text: The text to filter.
        lang: Language name passed straight to ``stopwords.words``
            (presumably NLTK's stop-word corpus, imported elsewhere in
            the file - confirm), e.g. ``'english'`` or ``'french'``.

    Returns:
        The filtered text as a single space-separated string.
    """
    # A set gives constant-time membership tests for the filter below.
    excluded = set(stopwords.words(lang))
    return " ".join(token for token in text.split() if token not in excluded)
|
107 |
|
108 |
+
def clean_undesirable_from_text(sentence, lang):
    """Strip URLs, most punctuation and digits from *sentence*.

    The ``.`` character is deliberately kept because callers use it as
    the sentence separator. Steps, in order:

    1. remove URLs (``http://``, ``https://`` and ``www.`` forms);
    2. delete ``, ; : ? "``, turn ``-`` and ``'`` into a space, and
       expand the contractions ``it's`` / ``isn't`` (lower-case only)
       before the apostrophe is replaced;
    3. remove ASCII digits;
    4. collapse every run of dots into a single dot;
    5. collapse every run of spaces into a single space.

    Step 4 runs after digit removal so that ellipses (``"..."``) and
    runs created by removing digits (``"1.2.3"`` -> ``".."``) cannot
    leave empty sentences behind when the caller splits on ``.``.

    Args:
        sentence: The text to clean.
        lang: Currently unused; kept so existing callers keep working.

    Returns:
        The cleaned text.
    """
    # Removing URLs
    sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence)

    # Removing punctuation (the "." character is kept). Order matters:
    # the contractions must be expanded before "'" is replaced.
    replacements = [
        (",", ""),
        (";", ""),
        (":", ""),
        ("?", ""),
        ('"', ""),
        ("-", " "),
        ("it's", "it is"),
        ("isn't", "is not"),
        ("'", " "),
    ]
    for old, new in replacements:
        sentence = sentence.replace(old, new)

    # Removing digits
    sentence = re.sub(r"[0-9]", "", sentence)

    # Collapsing runs of dots ("..", "...", ...) into a single "."
    sentence = re.sub(r"\.{2,}", ".", sentence)

    # Removing additional spaces
    sentence = re.sub(r" +", " ", sentence)

    return sentence
|
135 |
|
136 |
+
def clean_untranslated_sentence(data1, data2):
    """Drop every position where both lists hold the same sentence.

    A pair ``data1[i] == data2[i]`` is treated as an untranslated
    sentence and removed from both lists. Both lists are modified in
    place (callers holding a reference see the change) and are also
    returned.

    The filtering is done in a single pass instead of repeated
    ``pop(i)`` calls, which were quadratic on large corpora.

    Args:
        data1: List of sentences; it drives the comparison.
        data2: List of sentences aligned with *data1*. Items beyond
            ``len(data1)`` are never compared and are kept as they are.

    Returns:
        The tuple ``(data1, data2)`` - the same two list objects.

    Raises:
        IndexError: If *data2* is shorter than *data1*. The check
            happens before anything is modified, so the inputs are
            left untouched in that case.
    """
    compared = len(data1)
    # Indexing data2 directly (rather than zip) keeps the IndexError
    # for a too-short data2.
    kept = [i for i in range(compared) if data1[i] != data2[i]]
    uncompared_tail = data2[compared:]

    # Slice assignment keeps the in-place contract of the original.
    data1[:] = [data1[i] for i in kept]
    data2[:] = [data2[i] for i in kept] + uncompared_tail
    return data1, data2
|
144 |
|
145 |
+
import spacy
|
146 |
|
147 |
+
# Load the English and French spaCy pipelines once, so that lemmatize()
# can reuse them instead of reloading a model on every call.
nlp_en, nlp_fr = spacy.load('en_core_web_sm'), spacy.load('fr_core_news_sm')
|
149 |
|
150 |
|
151 |
+
def lemmatize(sentence, lang):
    """Return *sentence* with every token replaced by its lemma.

    The text is run through the spaCy pipeline matching *lang*
    (``nlp_en`` for ``'en'``, ``nlp_fr`` for ``'fr'``) and the lemmas
    of all tokens are joined with single spaces.

    Args:
        sentence: The text to lemmatize.
        lang: Language code, ``'en'`` or ``'fr'``.

    Returns:
        The space-joined lemmas, or ``None`` when *lang* is neither
        ``'en'`` nor ``'fr'``.
    """
    if lang == 'en':
        nlp = nlp_en
    elif lang == 'fr':
        nlp = nlp_fr
    else:
        # Unsupported language: keep the historical "return nothing".
        return None

    doc = nlp(sentence)
    return " ".join(token.lemma_ for token in doc)
|
168 |
|
169 |
|
170 |
+
def preprocess_txt (data, lang):
    """Clean a list of sentences and derive its word-level views.

    The sentences are joined into one string (``.`` is the sentence
    separator), cleaned with ``clean_undesirable_from_text`` and split
    back on ``.``. From the cleaned sentences the function builds the
    tokenized sentences, optionally the lemmatized and stop-word-free
    versions, and a bag-of-words count matrix.

    Lemmatization and stop-word removal only run when *lang* is
    ``'en'`` or ``'fr'`` and when the flags ``lemmatize_to_do`` /
    ``stopwords_to_do`` are true. Both flags are read from the
    enclosing scope; they are not defined in this function.

    Args:
        data: Iterable of sentences (strings).
        lang: Language code of the corpus, ``'en'`` or ``'fr'``.

    Returns:
        A 9-tuple:
        ``data`` (cleaned sentences),
        ``corpus`` (vocabulary from the ``CountVectorizer``),
        ``data_split`` (tokens per sentence),
        ``data_lemmatized`` (lemmatized sentences split on ``.``, or
        ``""`` when lemmatization did not run),
        ``data_wosw`` (sentences without stop words, split on ``.``, or
        ``""`` when that phase did not run),
        ``txt_n_unique_val`` (DataFrame: one row per sentence, one
        column per word, cell = number of occurrences),
        ``sentence_length``, ``data_length_wo_stopwords`` and
        ``data_lem_length`` (word counts per sentence for each view).
    """
    data_split = []
    sentence_length = []
    data_split_wo_stopwords = []
    data_length_wo_stopwords = []
    data_lem = []
    data_lem_length = []
    # Placeholders returned when a phase is skipped. Without them the
    # return statement raised UnboundLocalError for an unknown language
    # or when the stop-word phase was disabled. "" mirrors the value
    # data_lemmatized already had when lemmatization was off.
    data_lemmatized = ""
    data_wosw = ""

    full_text = ". ".join(data)
    full_text = full_text.replace('..', '.')
    full_text = " " + clean_undesirable_from_text(full_text, lang)
    data = full_text.split('.')
    if data[-1] == "":
        data.pop(-1)

    # Remove the single ' ' that may start and end each sentence.
    # startswith/endswith are used instead of indexing so an empty
    # segment (e.g. produced by "..") no longer raises IndexError.
    # Empty segments are kept so indices stay aligned with the
    # '.'-split texts used below.
    for i, sentence in enumerate(data):
        if sentence.startswith(' '):
            sentence = sentence[1:]
        if sentence.endswith(' '):
            sentence = sentence[:-1]
        data[i] = sentence
    nb_phrases = len(data)

    # Build the list of words of each sentence
    for sentence in data:
        sentence_split = word_tokenize(sentence)
        data_split.append(sentence_split)
        sentence_length.append(len(sentence_split))

    # Lemmatization and stop-word removal are done in batches rather
    # than sentence by sentence, for speed. Both need the language of
    # the corpus.
    nltk_lang = {'en': 'english', 'fr': 'french'}.get(lang)

    if nltk_lang is not None:
        if lemmatize_to_do:
            # Lemmatization in 12 batches (no more than 1 M characters
            # can be lemmatized at once).
            n_batch = 12
            batch_size = round((nb_phrases / n_batch) + 0.5)
            lemmatized_batches = [
                lemmatize(".".join(data[i * batch_size:(i + 1) * batch_size]), lang).lower()
                for i in range(n_batch)
            ]
            data_lem_for_sw = ".".join(lemmatized_batches)
            data_lemmatized = data_lem_for_sw.split('.')
            for i in range(nb_phrases):
                lemmas = data_lemmatized[i].split()
                data_lem.append(lemmas)
                data_lem_length.append(len(lemmas))

        # Stop-word removal in one batch: applied to the lemmatized
        # text when lemmatization ran, otherwise to the cleaned text
        # ("wosw" stands for "WithOut Stop Words").
        if stopwords_to_do:
            if lemmatize_to_do:
                data_wosw = remove_stopwords(data_lem_for_sw, nltk_lang)
            else:
                data_wosw = remove_stopwords(full_text, nltk_lang)

            data_wosw = data_wosw.split('.')
            for i in range(nb_phrases):
                kept_words = data_wosw[i].split()
                data_split_wo_stopwords.append(kept_words)
                data_length_wo_stopwords.append(len(kept_words))

    # Build the DataFrame txt_n_unique_val:
    #   columns = words
    #   rows    = sentences
    #   cell    = number of occurrences of the word in the sentence

    ## BOW
    from sklearn.feature_extraction.text import CountVectorizer
    count_vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=r"[^' ']+" )

    # Count the occurrences of each word in each sentence
    countvectors = count_vectorizer.fit_transform(data)
    corpus = count_vectorizer.get_feature_names_out()

    txt_n_unique_val = pd.DataFrame(columns=corpus, index=range(nb_phrases), data=countvectors.todense()).astype(float)

    return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length
|
260 |
|
261 |
|
262 |
def count_world(data):
|