seanpedrickcase committed on
Commit
28347d9
·
1 Parent(s): 773f8b8

Updated requirements packages and Dockerfile

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -1
  2. requirements.txt +7 -7
  3. tools/clean_funcs.py +18 -8
Dockerfile CHANGED
@@ -16,7 +16,7 @@ COPY requirements.txt .
16
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- RUN pip install --no-cache-dir gradio==4.31.5
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
 
16
 
17
  RUN pip install --no-cache-dir -r requirements.txt
18
 
19
+ # RUN pip install --no-cache-dir gradio==5.9.1
20
 
21
  # Set up a new user named "user" with user ID 1000
22
  RUN useradd -m -u 1000 user
requirements.txt CHANGED
@@ -1,9 +1,9 @@
1
- pandas==2.2.2
2
- spacy # Not specified as latest versions create a conflict with latest versions of gradio
3
- en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1.tar.gz
4
- gradio # Not specified as latest versions create a conflict with latest versions of spacy
5
- boto3==1.34.103
6
- unstructured
7
  unstructured[pdf]
8
  unstructured[docx]
9
  unstructured[pptx]
@@ -16,5 +16,5 @@ unstructured[msg]
16
  Faker==22.2.0
17
  presidio_analyzer==2.2.351
18
  presidio_anonymizer==2.2.351
19
- polars==0.20.6
20
 
 
1
+ pandas==2.2.3
2
+ spacy==3.8.3
3
+ en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
4
+ gradio==5.9.1
5
+ boto3==1.35.92
6
+ unstructured==0.16.12
7
  unstructured[pdf]
8
  unstructured[docx]
9
  unstructured[pptx]
 
16
  Faker==22.2.0
17
  presidio_analyzer==2.2.351
18
  presidio_anonymizer==2.2.351
19
+ polars==1.19.0
20
 
tools/clean_funcs.py CHANGED
@@ -26,7 +26,7 @@ html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbs
26
  html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
27
  email_pattern_regex = r'\S*@\S*\s?'
28
  num_pattern_regex = r'[0-9]+'
29
- nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
30
  postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
31
  multiple_spaces_regex = r'\s{2,}'
32
 
@@ -57,13 +57,17 @@ def pre_clean(data:List[Element], in_colnames:str, custom_regex:List[str], clean
57
  print("Starting data clean.")
58
 
59
  for element in data:
60
- if not custom_regex.empty:
61
- cleaned_data = initial_clean([element.text], custom_regex.iloc[:, 0].to_list())
 
 
 
 
 
 
 
62
  else:
63
- cleaned_data = initial_clean([element.text], [])
64
-
65
- element.text = cleaned_data[0]
66
- print(element.text)
67
 
68
  clean_toc = time.perf_counter()
69
  clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
@@ -110,6 +114,12 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
110
  #text = text.str.replace_all(nums_two_more_regex, ' ')
111
  #text = text.str.replace_all(postcode_pattern_regex, ' ')
112
 
 
 
 
 
 
 
113
  texts = pl.Series(texts)
114
 
115
  # Allow for custom regex patterns to be removed
@@ -117,7 +127,7 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
117
  for pattern in custom_regex:
118
  raw_string_pattern = rf"{pattern}" # Case-insensitive regex
119
  #print(f"Removing regex pattern: {raw_string_pattern}")
120
- text = text.str.replace_all(raw_string_pattern, " ")
121
  #print("Text without pattern: ", text[0])
122
 
123
 
 
26
  html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
27
  email_pattern_regex = r'\S*@\S*\s?'
28
  num_pattern_regex = r'[0-9]+'
29
+ nums_two_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Should match two digit numbers or more, and also if there are full stops or commas in between
30
  postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
31
  multiple_spaces_regex = r'\s{2,}'
32
 
 
57
  print("Starting data clean.")
58
 
59
  for element in data:
60
+ if hasattr(element, 'text'): # Check if 'element' has 'text' attribute
61
+ if len(element.text) > 0:
62
+ if not custom_regex.empty:
63
+ cleaned_data = initial_clean([element.text], custom_regex.iloc[:, 0].to_list())
64
+ else:
65
+ cleaned_data = initial_clean([element.text], [])
66
+
67
+ element.text = cleaned_data[0]
68
+ print(element.text)
69
  else:
70
+ continue # Skip elements without 'text'
 
 
 
71
 
72
  clean_toc = time.perf_counter()
73
  clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
 
114
  #text = text.str.replace_all(nums_two_more_regex, ' ')
115
  #text = text.str.replace_all(postcode_pattern_regex, ' ')
116
 
117
+ print(texts)
118
+
119
+ if not texts:
120
+ print("The 'texts' argument is empty.")
121
+ return texts
122
+
123
  texts = pl.Series(texts)
124
 
125
  # Allow for custom regex patterns to be removed
 
127
  for pattern in custom_regex:
128
  raw_string_pattern = rf"{pattern}" # Case-insensitive regex
129
  #print(f"Removing regex pattern: {raw_string_pattern}")
130
+ text = texts.str.replace_all(raw_string_pattern, " ")
131
  #print("Text without pattern: ", text[0])
132
 
133