|
import os
|
|
import abc_1
|
|
import shutil
|
|
|
|
def get_subfolder_names(folder_path):
    """Return the names of the immediate subdirectories of ``folder_path``.

    Args:
        folder_path: Path of the directory to inspect.

    Returns:
        A list of subdirectory names (not full paths), in the order the
        operating system reports them. An empty list is returned, after an
        error message is printed, when ``folder_path`` does not exist or is
        not a directory.
    """
    try:
        # scandir exposes the entry type directly, so no separate
        # os.path.isdir() lookup is needed for every entry.
        with os.scandir(folder_path) as entries:
            return [entry.name for entry in entries if entry.is_dir()]
    except FileNotFoundError:
        print(f"Error: Folder not found: {folder_path}")
    except NotADirectoryError:
        # The path exists but is a regular file; treat it like a missing
        # folder instead of letting the exception escape.
        print(f"Error: Not a folder: {folder_path}")
    return []
|
|
|
|
|
|
def create_folder(folder_path):
    """Create ``folder_path`` (including missing parents) if it doesn't exist.

    Failures are printed rather than raised, so the function always returns
    ``None``.

    Args:
        folder_path: Path of the directory to create.
    """
    try:
        # exist_ok=True replaces the separate existence check, so there is no
        # window in which another process can create the folder first. It
        # still raises (and is reported below) when the path exists but is
        # not a directory.
        os.makedirs(folder_path, exist_ok=True)
    except OSError as e:
        print(f"Error creating folder {folder_path}: {e}")
|
|
|
|
|
|
def move_file(source_path, destination_path):
    """Move a file from ``source_path`` to ``destination_path``.

    Failures are printed rather than raised, so the function always returns
    ``None``.

    Args:
        source_path: Path of the file to move.
        destination_path: Target path. If it names an existing directory,
            the file is moved into that directory.
    """
    try:
        # shutil.move falls back to copy-then-delete when the two paths are on
        # different filesystems, where a plain os.rename would fail.
        shutil.move(source_path, destination_path)
    except OSError as e:
        # shutil.Error is a subclass of OSError, so it is reported here too.
        print(f"Error moving file {source_path} to {destination_path}: {e}")
|
|
|
|
|
|
def process_file(folder_path, name, *, keywords=None, output_root=None):
    """Categorize one document via OCR and move its files to the category folder.

    The path ``folder_path/name`` is passed to ``abc_1.use_ocr`` and the
    result to ``abc_1.categorize_text_chunk``. The returned category names a
    subfolder of the output root, which is created if needed. The files
    ``name.pdf`` and ``name.docx`` in ``folder_path``, when present, are then
    moved into it.

    Args:
        folder_path: Folder holding both the OCR input ``name`` and the
            ``name.pdf`` / ``name.docx`` files.
        name: Base name (no extension) of the document.
        keywords: Compiled keywords passed to ``abc_1.categorize_text_chunk``.
            Defaults to the module-level ``compiled_keywords``.
        output_root: Folder under which category folders are created.
            Defaults to the module-level ``folder_output``.
    """
    # Fall back to the module-level configuration so existing two-argument
    # callers keep working unchanged.
    if keywords is None:
        keywords = compiled_keywords
    if output_root is None:
        output_root = folder_output

    text = abc_1.use_ocr(os.path.join(folder_path, name))
    category = abc_1.categorize_text_chunk(text, keywords)

    category_folder = os.path.join(output_root, category)
    create_folder(category_folder)

    has_pdf, has_docx = check_file_existence(folder_path, name)
    for present, extension in ((has_pdf, '.pdf'), (has_docx, '.docx')):
        if not present:
            continue
        source_file = os.path.join(folder_path, name + extension)
        destination_file = os.path.join(category_folder, name + extension)
        move_file(source_file, destination_file)
        # move_file reports its own failures; only announce success when the
        # source file is really gone.
        if os.path.exists(source_file):
            continue
        print(f"File '{name}' categorized as '{category}' and moved to '{category_folder}'.")
|
|
|
|
|
|
def check_file_existence(folder_path, filename):
    """Report whether ``filename`` has a PDF and/or DOCX file in ``folder_path``.

    Args:
        folder_path: Folder to look in.
        filename: Base name of the document, without extension.

    Returns:
        A ``(has_pdf, has_docx)`` tuple of booleans. Both are ``False`` when
        ``folder_path`` does not exist.
    """
    # Probe the two candidate paths directly instead of scanning the whole
    # folder. isfile() also ignores directories that merely carry a matching
    # name.
    has_pdf = os.path.isfile(os.path.join(folder_path, filename + '.pdf'))
    has_docx = os.path.isfile(os.path.join(folder_path, filename + '.docx'))
    return has_pdf, has_docx
|
|
|
|
def runOCR(subfolder_names, base_path=None):
    """Process each named subfolder, then delete it.

    For every name, ``process_file`` is called first. Afterwards the
    directory ``base_path/name`` is removed if it exists. A failure to remove
    it is printed and the loop continues with the next name.

    Args:
        subfolder_names: Iterable of subfolder names located in ``base_path``.
        base_path: Folder containing the subfolders. Defaults to the
            module-level ``folder_path``.
    """
    for name in subfolder_names:
        # Resolved inside the loop so that an empty input never touches the
        # module-level fallback.
        root = folder_path if base_path is None else base_path
        process_file(root, name)

        subfolder = os.path.join(root, name)
        if not os.path.isdir(subfolder):
            continue
        try:
            shutil.rmtree(subfolder)
        except OSError as e:
            # Report and carry on, as the other file helpers in this module do.
            print(f"Error removing folder {subfolder}: {e}")
|
|
|
|
|
|
if __name__ == '__main__':
    # Input and output roots. process_file and runOCR read folder_path,
    # folder_output and compiled_keywords as module-level names.
    folder_output = 'output'
    folder_path = 'input'

    # Category name -> keywords handed to abc_1.compile_keywords.
    categories_keywords_dict = {
        'AI': ['Artificial', 'Intelligence'],
        'Automata': ['finite', 'state', 'machines'],
        'DT': ['game', 'theory'],
    }
    compiled_keywords = abc_1.compile_keywords(categories_keywords_dict)

    # Categorize every subfolder found under the input root.
    subfolder_names = get_subfolder_names(folder_path)
    runOCR(subfolder_names)
|
|
|
|
|
|
|