dhanikitkat committed on
Commit
0819f8a
·
1 Parent(s): 9206937

Update Cleaning Text

Browse files
Files changed (1) hide show
  1. app.py +10 -1
app.py CHANGED
@@ -42,7 +42,16 @@ def preprocess_text(text, slank_formal_df):
42
  text = text.lower()
43
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
44
  text = re.sub(r'\@\w+|\#', '', text)
45
- text = re.sub(r'[^\w\s]', '', text)
 
 
 
 
 
 
 
 
 
46
  text = replace_slank_to_formal(text, slank_formal_df)
47
  tokens = word_tokenize(text)
48
  preprocessed_text = ' '.join(tokens)
 
42
  text = text.lower()
43
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
44
  text = re.sub(r'\@\w+|\#', '', text)
45
+ text = re.sub(r'([^\w\s\U0001F000-\U0001F9FF])\1+', r'\1', text)
46
+ text = re.sub(r'([\U0001F600-\U0001F64F\U0001F900-\U0001F9FF\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F])', r' \1 ', text)
47
+ text = re.sub(r'([.,])', r' \1 ', text)
48
+ text = re.sub(r'[&%]', lambda x: f' {x.group()} ', text)
49
+ text = re.sub(r'(\w)\1{1,}', r'\1\1', text)
50
+ text = re.sub(r'\s+', ' ', text).strip()
51
+ text = re.sub(r'\s*-\s*', '-', text)
52
+ text = re.sub(r'(?<=\d)\s*\.\s*(?=\d)', '.', text)
53
+ text = re.sub(r'(?<=\d)\s*,\s*(?=\d)', ',', text)
54
+ text = re.sub(r'\s+', ' ', text).strip()
55
  text = replace_slank_to_formal(text, slank_formal_df)
56
  tokens = word_tokenize(text)
57
  preprocessed_text = ' '.join(tokens)