Spaces:
Running
Running
Commit
·
835c83e
1
Parent(s):
55cbf6a
formatted text cleaning
Browse files
app.py
CHANGED
@@ -38,12 +38,18 @@ label_mapping = {
|
|
38 |
def clean_text(text):
|
39 |
|
40 |
text = text.replace("\r\n", "\n").replace("\r", "\n")
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
-
text = re.sub(r"\n\
|
43 |
|
44 |
-
text = re.sub(r"
|
45 |
|
46 |
text = text.strip()
|
|
|
47 |
return text
|
48 |
|
49 |
def classify_text(text):
|
|
|
38 |
def clean_text(text):
    """Normalize whitespace and line wrapping in *text*.

    The steps, in order:

    1. Convert Windows (CRLF) and old Mac (CR) line endings to LF.
    2. Collapse any run of blank or whitespace-only lines into a single
       paragraph break (exactly two newlines).
    3. Re-join words that were split across lines with a trailing hyphen
       ("exam-" + newline + "ple" becomes "example").
    4. Replace every remaining single newline with a space, so hard-wrapped
       lines flow together while paragraph breaks are kept.
    5. Collapse runs of spaces and tabs into one space.
    6. Strip leading and trailing whitespace.

    Returns the cleaned string.
    """
    # Normalize line endings first so every later pattern only deals with "\n".
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Two or more newlines (optionally separated by whitespace) -> one
    # paragraph break.
    text = re.sub(r"\n\s*\n+", "\n\n", text)

    # Undo end-of-line hyphenation. NOTE: this also removes the hyphen of a
    # genuinely hyphenated compound that happens to wrap at the hyphen.
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)

    # A newline that is neither preceded nor followed by another newline is a
    # soft wrap inside a paragraph: turn it into a space.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    # Collapse spaces/tabs AFTER joining wrapped lines; doing it before would
    # leave a double space wherever a wrapped line had trailing or leading
    # whitespace ("foo \nbar" -> "foo  bar").
    text = re.sub(r"[ \t]+", " ", text)

    return text.strip()
|
54 |
|
55 |
def classify_text(text):
|