import os
import shutil

import abc_1


def get_subfolder_names(folder_path):
    """Return the names of the immediate subdirectories of *folder_path*.

    Only entries for which ``os.path.isdir`` is true are returned; plain
    files are skipped. If *folder_path* does not exist, an error is printed
    and an empty list is returned.
    """
    try:
        return [
            entry
            for entry in os.listdir(folder_path)
            if os.path.isdir(os.path.join(folder_path, entry))
        ]
    except FileNotFoundError:
        print(f"Error: Folder not found: {folder_path}")
        return []


def create_folder(folder_path):
    """Create *folder_path* (and any missing parents) if it doesn't exist.

    Errors are printed rather than raised, matching the rest of this script.
    """
    try:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(folder_path, exist_ok=True)
    except OSError as e:
        print(f"Error creating folder {folder_path}: {e}")


def move_file(source_path, destination_path):
    """Move a file from *source_path* to *destination_path*.

    Returns:
        True when the move succeeded, False when it failed (the error is
        printed, not raised).
    """
    try:
        # shutil.move falls back to copy + delete when source and destination
        # are on different filesystems, where a bare os.rename would fail.
        shutil.move(source_path, destination_path)
    except OSError as e:
        print(f"Error moving file {source_path} to {destination_path}: {e}")
        return False
    return True


def process_file(folder_path, name, keywords=None, output_dir=None):
    """OCR one document, categorize it, and move its files into place.

    The subfolder ``folder_path/name`` is passed to ``abc_1.use_ocr``; the
    resulting text is categorized with ``abc_1.categorize_text_chunk``; then
    ``name.pdf`` and/or ``name.docx`` (whichever exist directly inside
    *folder_path*) are moved into ``output_dir/<category>``.

    Args:
        folder_path: Directory holding both the per-document subfolder and
            the ``.pdf`` / ``.docx`` files.
        name: Base name of the document (subfolder name, no extension).
        keywords: Compiled keywords handed to ``abc_1.categorize_text_chunk``.
            Defaults to the module-level ``compiled_keywords``.
        output_dir: Root directory for the category folders. Defaults to the
            module-level ``folder_output``.
    """
    # Fall back to the globals set by the script entry point so existing
    # two-argument callers keep working.
    if keywords is None:
        keywords = compiled_keywords
    if output_dir is None:
        output_dir = folder_output

    text = abc_1.use_ocr(os.path.join(folder_path, name))
    category = abc_1.categorize_text_chunk(text, keywords)
    category_folder = os.path.join(output_dir, category)
    create_folder(category_folder)

    has_pdf, has_docx = check_file_existence(folder_path, name)
    for present, extension in ((has_pdf, '.pdf'), (has_docx, '.docx')):
        if not present:
            continue
        file_name = name + extension
        moved = move_file(
            os.path.join(folder_path, file_name),
            os.path.join(category_folder, file_name),
        )
        # Only report success when the move actually happened; move_file has
        # already printed the error otherwise.
        if moved:
            print(f"File '{name}' categorized as '{category}' and moved to '{category_folder}'.")


def check_file_existence(folder_path, filename):
    """Report whether ``filename.pdf`` / ``filename.docx`` exist in *folder_path*.

    The comparison is an exact, case-sensitive match on the directory
    listing's base name and extension.

    Returns:
        A ``(has_pdf, has_docx)`` tuple of booleans.
    """
    has_pdf = False
    has_docx = False
    for entry in os.listdir(folder_path):
        base, extension = os.path.splitext(entry)
        if base != filename:
            continue
        if extension == '.pdf':
            has_pdf = True
        elif extension == '.docx':
            has_docx = True
    return has_pdf, has_docx


def runOCR(subfolder_names, input_dir=None, keywords=None, output_dir=None):
    """Process every named subfolder, then delete it.

    Args:
        subfolder_names: Names of the per-document subfolders to process.
        input_dir: Directory containing those subfolders. Defaults to the
            module-level ``folder_path``.
        keywords: Forwarded to ``process_file``.
        output_dir: Forwarded to ``process_file``.
    """
    if input_dir is None:
        # Legacy behaviour: read the directory from the script-level global.
        input_dir = folder_path

    for name in subfolder_names:
        process_file(input_dir, name, keywords=keywords, output_dir=output_dir)

        # The subfolder is only a working buffer; remove it once processed.
        buffer_dir = os.path.join(input_dir, name)
        if os.path.exists(buffer_dir):
            try:
                shutil.rmtree(buffer_dir)
            except OSError as e:
                # Keep going with the remaining documents instead of aborting.
                print(f"Error deleting folder {buffer_dir}: {e}")


if __name__ == '__main__':
    categories_keywords_dict = {
        'AI': ['Artificial', 'Intelligence'],
        'Automata': ['finite', 'state', 'machines'],
        'DT': ['game', 'theory'],
    }
    # NOTE(review): appears to be the output folder of the pdf-to-image
    # step — confirm against that step's configuration.
    folder_path = 'input'
    folder_output = 'output'

    compiled_keywords = abc_1.compile_keywords(categories_keywords_dict)
    subfolder_names = get_subfolder_names(folder_path)
    runOCR(
        subfolder_names,
        input_dir=folder_path,
        keywords=compiled_keywords,
        output_dir=folder_output,
    )