File size: 1,740 Bytes
22738ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# -*- coding: utf-8 -*-


from App.bin import constants
from App.bin.InputHandler import InputHandler
from App.bin.PatentHandler import PatentHandler
from App.bin.CorpusProcessor import CorpusProcessor
import time

# Wall-clock timestamp (seconds since the epoch) captured once, at the moment
# this module-level statement runs, i.e. before any user prompt is shown.
start_time = time.time()

def _prompt_non_empty(prompt):
    """Ask the user *prompt* repeatedly until a non-blank answer is typed.

    Leading/trailing whitespace is stripped, so an answer consisting only of
    spaces is rejected exactly like an empty one.

    Args:
        prompt: Text displayed to the user by ``input``.

    Returns:
        The stripped, non-empty answer.
    """
    while True:
        answer = input(prompt).strip()
        if answer:
            return answer
        print("We didn't understand you.")


def main():
    """Interactively run the corpus pipeline on a user-chosen folder.

    Steps, in order:
      1. Ask for the corpus folder name and the file extension; each question
         is repeated on its own until a non-blank answer is given.
      2. Build the folder path (``constants.DATA_INPUT`` + folder name) and a
         glob-style pattern (``"*." + extension``).
      3. Feed them through ``InputHandler.get_input``,
         ``PatentHandler.pretreat_data`` and ``CorpusProcessor.process_corpus``.
      4. Print how long step 3 took, rounded to two decimals.

    The elapsed time covers only the processing stage, not the time the user
    spent answering the prompts.
    """
    print("Starting process!")

    # Name of the corpus folder and the extension of the files to read.
    folder_name = _prompt_non_empty(
        "Please Enter your input folder name and press 'ENTER': "
    )
    extension = _prompt_non_empty(
        "Please Enter your files extensions(txt,xml or * for all): "
    )

    # Plain concatenation is kept on purpose: DATA_INPUT presumably already
    # ends with a path separator -- confirm against App.bin.constants.
    input_folder = constants.DATA_INPUT + folder_name
    files_extension = "*." + extension

    # Start the clock only once all interactive input has been collected.
    processing_started = time.perf_counter()

    input_data = InputHandler(input_folder, files_extension).get_input()
    clean_patent_data = PatentHandler(input_data).pretreat_data()

    # The return value of process_corpus() was never used by this script, so
    # it is not kept here either.
    CorpusProcessor(clean_patent_data, input_folder, files_extension).process_corpus()

    elapsed = round(time.perf_counter() - processing_started, 2)
    print("Process is finished within %s seconds" % elapsed)



# Run the pipeline only when this file is executed as a script; importing the
# module does not call main().
if __name__ == "__main__":
    main()