seanpedrickcase committed on
Commit
28347d9
·
1 Parent(s): 773f8b8

Updated requirements packages and Dockerfile

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -1
  2. requirements.txt +7 -7
  3. tools/clean_funcs.py +18 -8
Dockerfile CHANGED
@@ -16,7 +16,7 @@ COPY requirements.txt .
16
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- RUN pip install --no-cache-dir gradio==4.31.5
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
 
16
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
+ # RUN pip install --no-cache-dir gradio==5.9.1
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
requirements.txt CHANGED
@@ -1,9 +1,9 @@
1
- pandas==2.2.2
2
- spacy # Not specified as latest versions create a conflict with latest versions of gradio
3
- en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
4
- gradio # Not specified as latest versions create a conflict with latest versions of spacy
5
- boto3==1.34.103
6
- unstructured
7
  unstructured[pdf]
8
  unstructured[docx]
9
  unstructured[pptx]
@@ -16,5 +16,5 @@ unstructured[msg]
16
  Faker==22.2.0
17
  presidio_analyzer==2.2.351
18
  presidio_anonymizer==2.2.351
19
- polars==0.20.6
20
 
 
1
+ pandas==2.2.3
2
+ spacy==3.8.3
3
+ en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
4
+ gradio==5.9.1
5
+ boto3==1.35.92
6
+ unstructured==0.16.12
7
  unstructured[pdf]
8
  unstructured[docx]
9
  unstructured[pptx]
 
16
  Faker==22.2.0
17
  presidio_analyzer==2.2.351
18
  presidio_anonymizer==2.2.351
19
+ polars==1.19.0
20
 
tools/clean_funcs.py CHANGED
@@ -26,7 +26,7 @@ html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbs
26
  html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
27
  email_pattern_regex = r'\S*@\S*\s?'
28
  num_pattern_regex = r'[0-9]+'
29
- nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
30
  postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
31
  multiple_spaces_regex = r'\s{2,}'
32
 
@@ -57,13 +57,17 @@ def pre_clean(data:List[Element], in_colnames:str, custom_regex:List[str], clean
57
  print("Starting data clean.")
58
 
59
  for element in data:
60
- if not custom_regex.empty:
61
- cleaned_data = initial_clean([element.text], custom_regex.iloc[:, 0].to_list())
 
 
 
 
 
 
 
62
  else:
63
- cleaned_data = initial_clean([element.text], [])
64
-
65
- element.text = cleaned_data[0]
66
- print(element.text)
67
 
68
  clean_toc = time.perf_counter()
69
  clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
@@ -110,6 +114,12 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
110
  #text = text.str.replace_all(nums_two_more_regex, ' ')
111
  #text = text.str.replace_all(postcode_pattern_regex, ' ')
112
 
 
 
 
 
 
 
113
  texts = pl.Series(texts)
114
 
115
  # Allow for custom regex patterns to be removed
@@ -117,7 +127,7 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
117
  for pattern in custom_regex:
118
  raw_string_pattern = rf"{pattern}" # Case-insensitive regex
119
  #print(f"Removing regex pattern: {raw_string_pattern}")
120
- text = text.str.replace_all(raw_string_pattern, " ")
121
  #print("Text without pattern: ", text[0])
122
 
123
 
 
26
  html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
27
  email_pattern_regex = r'\S*@\S*\s?'
28
  num_pattern_regex = r'[0-9]+'
29
+ nums_two_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Should match two digit numbers or more, and also if there are full stops or commas in between
30
  postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
31
  multiple_spaces_regex = r'\s{2,}'
32
 
 
57
  print("Starting data clean.")
58
 
59
  for element in data:
60
+ if hasattr(element, 'text'): # Check if 'element' has 'text' attribute
61
+ if len(element.text) > 0:
62
+ if not custom_regex.empty:
63
+ cleaned_data = initial_clean([element.text], custom_regex.iloc[:, 0].to_list())
64
+ else:
65
+ cleaned_data = initial_clean([element.text], [])
66
+
67
+ element.text = cleaned_data[0]
68
+ print(element.text)
69
  else:
70
+ continue # Skip elements without 'text'
 
 
 
71
 
72
  clean_toc = time.perf_counter()
73
  clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
 
114
  #text = text.str.replace_all(nums_two_more_regex, ' ')
115
  #text = text.str.replace_all(postcode_pattern_regex, ' ')
116
 
117
+ print(texts)
118
+
119
+ if not texts:
120
+ print("The 'texts' argument is empty.")
121
+ return texts
122
+
123
  texts = pl.Series(texts)
124
 
125
  # Allow for custom regex patterns to be removed
 
127
  for pattern in custom_regex:
128
  raw_string_pattern = rf"{pattern}" # Case-insensitive regex
129
  #print(f"Removing regex pattern: {raw_string_pattern}")
130
+ text = texts.str.replace_all(raw_string_pattern, " ")
131
  #print("Text without pattern: ", text[0])
132
 
133