ocr / inputPDFToOutputOCR.py
washeed's picture
Upload 18 files
b692870 verified
import os
import abc_1
import shutil
def get_subfolder_names(folder_path):
    """Return the names of the immediate subdirectories of ``folder_path``.

    Only directory entries are returned (regular files are skipped); the
    names are sorted so callers process them in a deterministic order.

    Args:
        folder_path: Directory to scan.

    Returns:
        A sorted list of subdirectory names (not full paths). An empty list
        is returned, after printing an error, when ``folder_path`` does not
        exist or is not a directory.
    """
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        print(f"Error: Folder not found: {folder_path}")
        return []
    except NotADirectoryError:
        # The path exists but points at a regular file; treat it like a
        # missing folder instead of crashing the whole run.
        print(f"Error: Not a folder: {folder_path}")
        return []
    return sorted(
        entry for entry in entries
        if os.path.isdir(os.path.join(folder_path, entry))
    )
def create_folder(folder_path):
    """Create ``folder_path`` (and any missing parents) if it does not exist.

    Args:
        folder_path: Directory path to create.

    Returns:
        None. Failures are reported on stdout rather than raised, so a
        single bad path does not abort the caller.
    """
    try:
        # exist_ok=True avoids the check-then-create race and still raises
        # (FileExistsError, an OSError) when the path is an existing file.
        os.makedirs(folder_path, exist_ok=True)
    except OSError as e:
        print(f"Error creating folder {folder_path}: {e}")
def move_file(source_path, destination_path):
    """Move a file from ``source_path`` to ``destination_path``.

    Uses ``shutil.move`` so the move also works when source and destination
    live on different filesystems (where a plain rename fails).

    Args:
        source_path: Path of the file to move.
        destination_path: Path the file should end up at.

    Returns:
        None. Failures are reported on stdout rather than raised.
    """
    try:
        shutil.move(source_path, destination_path)
    except OSError as e:  # shutil.Error is an OSError subclass as well
        print(f"Error moving file {source_path} to {destination_path}: {e}")
def process_file(folder_path, name):
    """Run OCR on one entry, categorize it, and move its documents.

    The OCR step is applied to ``folder_path/name``; the resulting category
    selects a subfolder of ``folder_output`` which is created on demand.
    Any ``name.pdf`` / ``name.docx`` found in ``folder_path`` is then moved
    into that category folder.

    Note: reads the module-level globals ``compiled_keywords`` and
    ``folder_output``, which are assigned in the ``__main__`` block.

    Args:
        folder_path: Folder containing both the OCR input entry and the
            source documents.
        name: Entry name, which is also the document base name (no extension).
    """
    # Presumably use_ocr returns the extracted text -- confirm in abc_1.
    ocr_text = abc_1.use_ocr(os.path.join(folder_path, name))
    category = abc_1.categorize_text_chunk(ocr_text, compiled_keywords)

    category_folder = os.path.join(folder_output, category)
    create_folder(category_folder)

    # check_file_existence yields (has_pdf, has_docx); pair each flag with
    # its extension so both document types share one move/report path.
    presence_flags = check_file_existence(folder_path, name)
    for extension, is_present in zip(('.pdf', '.docx'), presence_flags):
        if not is_present:
            continue
        document = name + extension
        move_file(
            os.path.join(folder_path, document),
            os.path.join(category_folder, document),
        )
        print(f"File '{name}' categorized as '{category}' and moved to '{category_folder}'.")
def check_file_existence(folder_path, filename):
    """Report whether ``filename.pdf`` and ``filename.docx`` exist in a folder.

    The two candidate paths are checked directly instead of listing the
    whole folder, so the cost no longer grows with the folder's size, and a
    missing folder simply yields ``(False, False)``.

    Args:
        folder_path: Folder expected to contain the documents.
        filename: Document base name, without extension.

    Returns:
        A ``(has_pdf, has_docx)`` tuple of booleans. Only regular files
        count; a directory that happens to be named ``filename.pdf`` does not.
    """
    base_path = os.path.join(folder_path, filename)
    has_pdf = os.path.isfile(base_path + '.pdf')
    has_docx = os.path.isfile(base_path + '.docx')
    return has_pdf, has_docx
def runOCR(subfolder_names):
    """Process every named entry, then delete its buffer folder.

    For each name, ``process_file`` is run against the module-level
    ``folder_path`` (assigned in the ``__main__`` block); afterwards the
    directory ``folder_path/name`` is removed if it is still present.

    Args:
        subfolder_names: Iterable of entry names located under ``folder_path``.

    Returns:
        None.
    """
    for name in subfolder_names:
        process_file(folder_path, name)
        # Build the path once with the platform separator; only directories
        # are passed to rmtree, which cannot delete a regular file.
        buffer_folder = os.path.join(folder_path, name)
        if os.path.isdir(buffer_folder):  # buffer folder delete
            shutil.rmtree(buffer_folder)
if __name__ == '__main__':
    # Input folder; per the original note this is the output folder of the
    # PDF-to-image step. Read as a global by runOCR.
    folder_path = 'input'
    # Root folder under which process_file creates one folder per category.
    folder_output = 'output'

    # Keywords associated with each category name.
    categories_keywords_dict = {
        'AI': ['Artificial', 'Intelligence'],
        'Automata': ['finite', 'state', 'machines'],
        'DT': ['game', 'theory'],
    }
    # Read as a global by process_file.
    compiled_keywords = abc_1.compile_keywords(categories_keywords_dict)

    subfolder_names = get_subfolder_names(folder_path)
    runOCR(subfolder_names)