Spaces:
Sleeping
Sleeping
Commit 0819f8a · 1 Parent(s): 9206937
Update Cleaning Text
Browse files
app.py
CHANGED
@@ -42,7 +42,16 @@ def preprocess_text(text, slank_formal_df):
|
|
42 |
text = text.lower()
|
43 |
text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
|
44 |
text = re.sub(r'\@\w+|\#', '', text)
|
45 |
-
text = re.sub(r'[^\w\s]', '', text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
text = replace_slank_to_formal(text, slank_formal_df)
|
47 |
tokens = word_tokenize(text)
|
48 |
preprocessed_text = ' '.join(tokens)
|
|
|
42 |
text = text.lower()
|
43 |
text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
|
44 |
text = re.sub(r'\@\w+|\#', '', text)
|
45 |
+
text = re.sub(r'([^\w\s\U0001F000-\U0001F9FF])\1+', r'\1', text)
|
46 |
+
text = re.sub(r'([\U0001F600-\U0001F64F\U0001F900-\U0001F9FF\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F])', r' \1 ', text)
|
47 |
+
text = re.sub(r'([.,])', r' \1 ', text)
|
48 |
+
text = re.sub(r'[&%]', lambda x: f' {x.group()} ', text)
|
49 |
+
text = re.sub(r'(\w)\1{1,}', r'\1\1', text)
|
50 |
+
text = re.sub(r'\s+', ' ', text).strip()
|
51 |
+
text = re.sub(r'\s*-\s*', '-', text)
|
52 |
+
text = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', text)
|
53 |
+
text = re.sub(r'(?<=\d)\s*,\s*(?=\d)', ',', text)
|
54 |
+
text = re.sub(r'\s+', ' ', text).strip()
|
55 |
text = replace_slank_to_formal(text, slank_formal_df)
|
56 |
tokens = word_tokenize(text)
|
57 |
preprocessed_text = ' '.join(tokens)
|