sitammeur committed on
Commit
19c5277
1 Parent(s): 5033bf7

Update clean text function

Browse files
Files changed (1) hide show
  1. src/utils.py +10 -4
src/utils.py CHANGED
@@ -8,7 +8,7 @@ from PIL import Image
8
  def clean_text(text):
9
  """
10
  Cleans the given text by removing unwanted tokens, extra spaces,
11
- and ensures proper spacing between words and after periods.
12
 
13
  Args:
14
  text (str): The input text to be cleaned.
@@ -27,13 +27,19 @@ def clean_text(text):
27
  # Join the cleaned lines into a single string with a space between each line
28
  cleaned_text = " ".join(cleaned_lines)
29
 
30
- # Ensure proper spacing between words and after periods using regex
31
  cleaned_text = re.sub(
32
  r"\s+", " ", cleaned_text
33
  ) # Replace multiple spaces with a single space
34
  cleaned_text = re.sub(
35
- r"(?<=[.])(?=[^\s])", r" ", cleaned_text
36
- ) # Add space after a period if not followed by a space
 
 
 
 
 
 
37
 
38
  # Return the cleaned text
39
  return cleaned_text
 
8
  def clean_text(text):
9
  """
10
  Cleans the given text by removing unwanted tokens, extra spaces,
11
+ and ensures proper spacing between words and after punctuation marks.
12
 
13
  Args:
14
  text (str): The input text to be cleaned.
 
27
  # Join the cleaned lines into a single string with a space between each line
28
  cleaned_text = " ".join(cleaned_lines)
29
 
30
+ # Ensure proper spacing using regex
31
  cleaned_text = re.sub(
32
  r"\s+", " ", cleaned_text
33
  ) # Replace multiple spaces with a single space
34
  cleaned_text = re.sub(
35
+ r"(?<=[.,!?])(?=[^\s])", r" ", cleaned_text
36
+ ) # Add space after punctuation if not followed by a space
37
+ cleaned_text = re.sub(
38
+ r"(?<=[a-z])(?=[A-Z])", r" ", cleaned_text
39
+ ) # Add space between joined words where a lowercase letter is followed by an uppercase letter
40
+ cleaned_text = re.sub(
41
+ r"(\w)([A-Z][a-z])", r"\1 \2", cleaned_text
42
+ ) # Add space between camel case words
43
 
44
  # Return the cleaned text
45
  return cleaned_text