Update clean text function
Browse files - src/utils.py +10 -4
src/utils.py
CHANGED
@@ -8,7 +8,7 @@ from PIL import Image
|
|
8 |
def clean_text(text):
|
9 |
"""
|
10 |
Cleans the given text by removing unwanted tokens, extra spaces,
|
11 |
-
and ensures proper spacing between words and after
|
12 |
|
13 |
Args:
|
14 |
text (str): The input text to be cleaned.
|
@@ -27,13 +27,19 @@ def clean_text(text):
|
|
27 |
# Join the cleaned lines into a single string with a space between each line
|
28 |
cleaned_text = " ".join(cleaned_lines)
|
29 |
|
30 |
-
# Ensure proper spacing
|
31 |
cleaned_text = re.sub(
|
32 |
r"\s+", " ", cleaned_text
|
33 |
) # Replace multiple spaces with a single space
|
34 |
cleaned_text = re.sub(
|
35 |
-
r"(?<=[
|
36 |
-
) # Add space after
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
|
38 |
# Return the cleaned text
|
39 |
return cleaned_text
|
|
|
8 |
def clean_text(text):
|
9 |
"""
|
10 |
Cleans the given text by removing unwanted tokens, extra spaces,
|
11 |
+
and ensures proper spacing between words and after punctuation marks.
|
12 |
|
13 |
Args:
|
14 |
text (str): The input text to be cleaned.
|
|
|
27 |
# Join the cleaned lines into a single string with a space between each line
|
28 |
cleaned_text = " ".join(cleaned_lines)
|
29 |
|
30 |
+
# Ensure proper spacing using regex
|
31 |
cleaned_text = re.sub(
|
32 |
r"\s+", " ", cleaned_text
|
33 |
) # Replace multiple spaces with a single space
|
34 |
cleaned_text = re.sub(
|
35 |
+
r"(?<=[.,!?])(?=[^\s])", r" ", cleaned_text
|
36 |
+
) # Add space after punctuation if not followed by a space
|
37 |
+
cleaned_text = re.sub(
|
38 |
+
r"(?<=[a-z])(?=[A-Z])", r" ", cleaned_text
|
39 |
+
) # Add space between joined words where a lowercase letter is followed by an uppercase letter
|
40 |
+
cleaned_text = re.sub(
|
41 |
+
r"(\w)([A-Z][a-z])", r"\1 \2", cleaned_text
|
42 |
+
) # Add space between camel case words
|
43 |
|
44 |
# Return the cleaned text
|
45 |
return cleaned_text
|