Commit
·
28347d9
1
Parent(s):
773f8b8
Updated requirements packages and Dockerfile
Browse files
- Dockerfile +1 -1
- requirements.txt +7 -7
- tools/clean_funcs.py +18 -8
Dockerfile
CHANGED
@@ -16,7 +16,7 @@ COPY requirements.txt .
|
|
16 |
|
17 |
RUN pip install --no-cache-dir -r requirements.txt
|
18 |
|
19 |
-
RUN pip install --no-cache-dir gradio==
|
20 |
|
21 |
# Set up a new user named "user" with user ID 1000
|
22 |
RUN useradd -m -u 1000 user
|
|
|
16 |
|
17 |
RUN pip install --no-cache-dir -r requirements.txt
|
18 |
|
19 |
+
# RUN pip install --no-cache-dir gradio==5.9.1
|
20 |
|
21 |
# Set up a new user named "user" with user ID 1000
|
22 |
RUN useradd -m -u 1000 user
|
requirements.txt
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
-
pandas==2.2.
|
2 |
-
spacy
|
3 |
-
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.
|
4 |
-
gradio
|
5 |
-
boto3==1.
|
6 |
-
unstructured
|
7 |
unstructured[pdf]
|
8 |
unstructured[docx]
|
9 |
unstructured[pptx]
|
@@ -16,5 +16,5 @@ unstructured[msg]
|
|
16 |
Faker==22.2.0
|
17 |
presidio_analyzer==2.2.351
|
18 |
presidio_anonymizer==2.2.351
|
19 |
-
polars==
|
20 |
|
|
|
1 |
+
pandas==2.2.3
|
2 |
+
spacy==3.8.3
|
3 |
+
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
4 |
+
gradio==5.9.1
|
5 |
+
boto3==1.35.92
|
6 |
+
unstructured==0.16.12
|
7 |
unstructured[pdf]
|
8 |
unstructured[docx]
|
9 |
unstructured[pptx]
|
|
|
16 |
Faker==22.2.0
|
17 |
presidio_analyzer==2.2.351
|
18 |
presidio_anonymizer==2.2.351
|
19 |
+
polars==1.19.0
|
20 |
|
tools/clean_funcs.py
CHANGED
@@ -26,7 +26,7 @@ html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbs
|
|
26 |
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
27 |
email_pattern_regex = r'\S*@\S*\s?'
|
28 |
num_pattern_regex = r'[0-9]+'
|
29 |
-
nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
|
30 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
31 |
multiple_spaces_regex = r'\s{2,}'
|
32 |
|
@@ -57,13 +57,17 @@ def pre_clean(data:List[Element], in_colnames:str, custom_regex:List[str], clean
|
|
57 |
print("Starting data clean.")
|
58 |
|
59 |
for element in data:
|
60 |
-
if
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
else:
|
63 |
-
|
64 |
-
|
65 |
-
element.text = cleaned_data[0]
|
66 |
-
print(element.text)
|
67 |
|
68 |
clean_toc = time.perf_counter()
|
69 |
clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
|
@@ -110,6 +114,12 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
|
110 |
#text = text.str.replace_all(nums_two_more_regex, ' ')
|
111 |
#text = text.str.replace_all(postcode_pattern_regex, ' ')
|
112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
texts = pl.Series(texts)
|
114 |
|
115 |
# Allow for custom regex patterns to be removed
|
@@ -117,7 +127,7 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
|
117 |
for pattern in custom_regex:
|
118 |
raw_string_pattern = rf"{pattern}" # Case-insensitive regex
|
119 |
#print(f"Removing regex pattern: {raw_string_pattern}")
|
120 |
-
text =
|
121 |
#print("Text without pattern: ", text[0])
|
122 |
|
123 |
|
|
|
26 |
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
27 |
email_pattern_regex = r'\S*@\S*\s?'
|
28 |
num_pattern_regex = r'[0-9]+'
|
29 |
+
nums_two_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Should match two digit numbers or more, and also if there are full stops or commas in between
|
30 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
31 |
multiple_spaces_regex = r'\s{2,}'
|
32 |
|
|
|
57 |
print("Starting data clean.")
|
58 |
|
59 |
for element in data:
|
60 |
+
if hasattr(element, 'text'): # Check if 'element' has 'text' attribute
|
61 |
+
if len(element.text) > 0:
|
62 |
+
if not custom_regex.empty:
|
63 |
+
cleaned_data = initial_clean([element.text], custom_regex.iloc[:, 0].to_list())
|
64 |
+
else:
|
65 |
+
cleaned_data = initial_clean([element.text], [])
|
66 |
+
|
67 |
+
element.text = cleaned_data[0]
|
68 |
+
print(element.text)
|
69 |
else:
|
70 |
+
continue # Skip elements without 'text'
|
|
|
|
|
|
|
71 |
|
72 |
clean_toc = time.perf_counter()
|
73 |
clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
|
|
|
114 |
#text = text.str.replace_all(nums_two_more_regex, ' ')
|
115 |
#text = text.str.replace_all(postcode_pattern_regex, ' ')
|
116 |
|
117 |
+
print(texts)
|
118 |
+
|
119 |
+
if not texts:
|
120 |
+
print("The 'texts' argument is empty.")
|
121 |
+
return texts
|
122 |
+
|
123 |
texts = pl.Series(texts)
|
124 |
|
125 |
# Allow for custom regex patterns to be removed
|
|
|
127 |
for pattern in custom_regex:
|
128 |
raw_string_pattern = rf"{pattern}" # Case-insensitive regex
|
129 |
#print(f"Removing regex pattern: {raw_string_pattern}")
|
130 |
+
text = texts.str.replace_all(raw_string_pattern, " ")
|
131 |
#print("Text without pattern: ", text[0])
|
132 |
|
133 |
|