mihalykiss committed on
Commit
835c83e
·
1 Parent(s): 55cbf6a

formatted text cleaning

Browse files
Files changed (1) hide show
  1. app.py +8 -2
app.py CHANGED
@@ -38,12 +38,18 @@ label_mapping = {
38
  def clean_text(text):
39
 
40
  text = text.replace("\r\n", "\n").replace("\r", "\n")
 
 
 
 
 
41
 
42
- text = re.sub(r"\n\s*\n+", "\n\n", text)
43
 
44
- text = re.sub(r"[ \t]+", " ", text)
45
 
46
  text = text.strip()
 
47
  return text
48
 
49
  def classify_text(text):
 
38
  def clean_text(text):
39
 
40
  text = text.replace("\r\n", "\n").replace("\r", "\n")
41
+
42
+
43
+ text = re.sub(r"\n\s*\n+", "\n\n", text)
44
+
45
+ text = re.sub(r"[ \t]+", " ", text)
46
 
47
+ text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
48
 
49
+ text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
50
 
51
  text = text.strip()
52
+
53
  return text
54
 
55
  def classify_text(text):