leavoigt committed on
Commit
8bf9da8
·
1 Parent(s): 97ac771

Delete file_processing.py

Browse files
Files changed (1) hide show
  1. file_processing.py +0 -41
file_processing.py DELETED
@@ -1,41 +0,0 @@
1
- from utils.preprocessing import processingpipeline
2
-
3
def get_paragraphs(file_path_input):
    """Run the preprocessing pipeline on a file and return its paragraphs.

    Args:
        file_path_input: Path of the document to process. It is forwarded to
            the pipeline as ``file_paths`` and to the ``FileConverter`` node
            as ``file_path``; the ``file_name`` given to that node is the
            final component of this path. Assumed to be a single
            ``str``/``os.PathLike`` path -- TODO confirm against callers.

    Returns:
        The value stored under the ``'paraList'`` key of the pipeline output.
    """
    # Imported locally so the function is self-contained.
    import os

    # Unit used when splitting the document into chunks.
    split_by = 'word'

    # Usually models have a max length of 384/512.
    split_length = 100

    # Too much overlap can lead to repetitive text.
    # NOTE(review): the original comment cited a rule of thumb of 20% of the
    # split length, but 10 is only 10% of split_length -- confirm which one
    # is intended.
    split_overlap = 10

    # The text is already cleaned of HTML and other noise; set this to True
    # to additionally strip all punctuation (,.; etc.), which is good for
    # non-Transformer based models.
    remove_punc = False

    # This param is only used when split_by == 'word'.
    respect_sentence_boundary = True

    # The original code referenced undefined names `file_path` and
    # `file_name` here (NameError on every call); derive both from the
    # function argument instead.
    file_path = file_path_input
    file_name = os.path.basename(file_path_input)

    # Initialize the preprocessing pipeline and pass the params for each node
    # as per the variables declared above.
    prep_pipeline = processingpipeline()

    output_pre = prep_pipeline.run(
        file_paths=file_path_input,
        params={
            "FileConverter": {
                "file_path": file_path,
                "file_name": file_name,
            },
            "UdfPreProcessor": {
                "remove_punc": remove_punc,
                "split_by": split_by,
                "split_length": split_length,
                "split_overlap": split_overlap,
                "split_respect_sentence_boundary": respect_sentence_boundary,
            },
        },
    )

    return output_pre['paraList']