Delete file_processing.py
Browse files- file_processing.py +0 -41
file_processing.py
DELETED
@@ -1,41 +0,0 @@
|
|
1 |
-
from utils.preprocessing import processingpipeline


def get_paragraphs(file_path_input):
    """Convert a document file into a list of paragraphs.

    Runs the project's preprocessing pipeline (file conversion + text
    cleaning + splitting) over a single input file and returns the
    resulting paragraph list.

    Args:
        file_path_input: Path to the input file to convert and split.
            (Assumed to be a single path string — TODO confirm whether the
            pipeline also accepts a list of paths.)

    Returns:
        The pipeline's ``'paraList'`` output: the list of paragraphs
        extracted from the document.
    """
    import os

    # ---- Splitting parameters -------------------------------------------
    SPLIT_BY = 'word'

    # Usually models have a max length of 384/512 tokens.
    SPLIT_LENGTH = 100

    # Too much overlap can lead to repetitive text, but as a rule of
    # thumb we keep ~20% of the split length.
    SPLIT_OVERLAP = 10

    # The text is cleaned of HTML and other noise by the pipeline; set
    # this True to also strip punctuation like ,.; etc. — useful for
    # non-Transformer-based models.
    REMOVE_PUNC = False

    # This param is used only when split_by == 'word'.
    RESPECT_SENTENCE_BOUNDARY = True

    # BUG FIX: the original code passed `file_path` and `file_name` to the
    # FileConverter, but neither name was ever defined (NameError at
    # runtime). Derive both from the single input argument instead.
    file_path = file_path_input
    file_name = os.path.basename(file_path_input)

    # Initialize the preprocessing pipeline; params are passed per-node
    # via the `params` mapping below.
    prep_pipeline = processingpipeline()

    output_pre = prep_pipeline.run(
        file_paths=file_path_input,
        params={
            "FileConverter": {
                "file_path": file_path,
                "file_name": file_name,
            },
            "UdfPreProcessor": {
                "remove_punc": REMOVE_PUNC,
                "split_by": SPLIT_BY,
                "split_length": SPLIT_LENGTH,
                "split_overlap": SPLIT_OVERLAP,
                "split_respect_sentence_boundary": RESPECT_SENTENCE_BOUNDARY,
            },
        },
    )

    # NOTE: the original called `output_pre.keys()` here as a bare
    # expression — a leftover notebook debug statement with no effect;
    # removed.
    return output_pre['paraList']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|