ocr / augmentA.py
washeed's picture
Upload 18 files
b692870 verified
raw
history blame contribute delete
932 Bytes
import abc_1
import time
import sys
from docx import Document # Assuming DOCX support is desired
from pdfminer.high_level import extract_text # Import for PDF text extraction
import json
# Fallback category -> keyword-list mapping, used when no JSON payload is
# passed on the command line.
DEFAULT_CATEGORIES = {
    'AI': ['Artificial', 'Intelligence'],
    'Automata': ['finite', 'state', 'machines'],
    'DT': ['game', 'theory'],
}


def load_categories(argv):
    """Return the category -> keywords mapping selected by *argv*.

    Args:
        argv: Command-line argument list in ``sys.argv`` layout, i.e.
            ``argv[0]`` is the program name and ``argv[1]``, when present,
            is a JSON object mapping category names to keyword lists.

    Returns:
        The mapping decoded from ``argv[1]`` when it is supplied; otherwise a
        copy of ``DEFAULT_CATEGORIES`` (after printing "No data provided.").

    Raises:
        ValueError: If ``argv[1]`` is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or does not decode to a JSON object.
    """
    if len(argv) <= 1:
        print("No data provided.")
        # Copy so callers can never mutate the module-level defaults.
        return dict(DEFAULT_CATEGORIES)

    parsed = json.loads(argv[1])
    if not isinstance(parsed, dict):
        raise ValueError(
            f"expected a JSON object of categories, got {type(parsed).__name__}"
        )
    return parsed


def main(argv=None):
    """Run the categorizer over the configured paths and report elapsed time.

    Args:
        argv: Optional argument list in ``sys.argv`` layout; defaults to
            ``sys.argv`` itself.

    The previous version parsed the command-line JSON but then always handed
    the hard-coded defaults to ``abc_1.compile_keywords``; the parsed mapping
    is now the one that is actually used.
    """
    argv = sys.argv if argv is None else argv
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    try:
        categories = load_categories(argv)
    except ValueError as err:
        # Exit with a readable message (non-zero status) instead of a traceback.
        sys.exit(f"Invalid category data: {err}")

    # Named *_path rather than input/output to avoid shadowing the builtins.
    input_path = 'input'  # file path here
    output_path = 'output'  # and here

    # NOTE(review): abc_1 is a project module not visible here; presumably
    # compile_keywords prepares matchers from the mapping and
    # multi_process_categorizer fans the work out over processes -- confirm.
    compiled_keywords = abc_1.compile_keywords(categories)
    abc_1.multi_process_categorizer(
        input_path, output_path, compiled_keywords, num_processes=8
    )  # Adjust processes as needed

    elapsed = time.perf_counter() - start
    print(f"Categorization completed in {elapsed:.2f} seconds")


# The guard keeps worker processes that re-import this module from re-running
# the script body.
if __name__ == '__main__':
    main()