import ocr  # Project helper module for OCR extraction
import os
import shutil
import concurrent.futures
from multiprocessing import Pool  # Import for multiprocessing
import re
from pdfminer.high_level import extract_text  # Import for PDF text extraction
import pdftoimage  # Project helper: converts PDF pages to images
import docxtoimage  # Project helper: converts DOCX pages to images
# Additional libraries for new file types
#import openpyxl  # For basic XLSX handling (consider pandas for structured data)
#from pptx import Presentation  # For PPTX presentations (install with: pip install python-pptx)

try:
    from docx import Document  # Optional DOCX support
except ImportError:
    Document = None  # Checked in categorize_file before parsing DOCX files
    print("To enable DOCX support, install python-docx: pip install python-docx")


class DecodingError(Exception):
    pass


def compile_keywords(categories_keywords_dict):
    """Pre-compiles keyword lists for faster matching"""
    compiled_keywords = {category: [re.compile(keyword, re.IGNORECASE) for keyword in keywords]
                         for category, keywords in categories_keywords_dict.items()}
    return compiled_keywords
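
# Illustrative shape of the input dict (category names and patterns below are
# hypothetical, for demonstration only):
#
#   categories = {
#       'Invoices': [r'invoice', r'amount due'],
#       'Resumes': [r'work experience', r'education'],
#   }
#   compiled = compile_keywords(categories)
#   compiled['Invoices'][0].search('INVOICE #42')  # matches, thanks to re.IGNORECASE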


def categorize_text_chunk(text_chunk, compiled_keywords):
    """Categorizes a chunk of text using compiled keywords"""
    for category, keyword_list in compiled_keywords.items():
        if all(keyword.search(text_chunk) for keyword in keyword_list):
            return category
    return 'Uncategorized'
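
# Note the all(): a chunk is assigned to a category only when *every* pattern
# for that category matches. With the hypothetical dict above, text containing
# 'invoice' but not 'amount due' would fall through to 'Uncategorized'.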

def use_ocr(folder_path):
    """When called, extracts all the text from the images in the buffer folder."""
    all_extracted_text = ""
    for filename in os.listdir(folder_path):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            image_path = os.path.join(folder_path, filename)
            extracted_text = ocr.extract_text_from_image(image_path)  # Assumed to return an iterable of text lines
            all_extracted_text += "\n".join(extracted_text) + "\n\n"  # Add double newlines for separation

    return all_extracted_text



    
def convert_pages(folder_path, output_format, max_pages):
    pdf_converted = docx_converted = False
    for root, directories, files in os.walk(folder_path):
        for filename in files:
            # Get the file extension (including the dot)
            extension = os.path.splitext(filename)[1].lower()
            # The converters operate on the whole folder, so call each at most once
            if extension == '.pdf' and not pdf_converted:
                pdftoimage.convert_pdfs(folder_path, output_format, max_pages)
                pdf_converted = True
            if extension == '.docx' and not docx_converted:
                docxtoimage.process(folder_path, max_pages)
                docx_converted = True
    

def categorize_file(file_path, compiled_keywords):
    try:
        if file_path.endswith('.pdf'):
            text = extract_text(file_path)  # Use pdfminer to extract text (CPU-bound)
            return file_path, categorize_text_chunk(text, compiled_keywords)
        elif file_path.endswith('.docx') and Document is not None:
            # DOCX parsing (potentially I/O bound)
            try:
                doc = Document(file_path)
                text = '\n'.join(paragraph.text for paragraph in doc.paragraphs)  # Combine all paragraphs
                return file_path, categorize_text_chunk(text, compiled_keywords)
            except Exception as e:
                print(f"Error processing DOCX '{file_path}': {e}")
                return file_path, 'Uncategorized (Error)'
        elif file_path.endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                text = f.read()
            return file_path, categorize_text_chunk(text, compiled_keywords)
        else:
            print(f"Unsupported file type: {file_path}")
            return None, 'Unsupported File Type'
    except Exception as e:
        print(f"Error processing '{file_path}': {e}")
        return file_path, 'Uncategorized (Error)'


def threaded_worker(file_paths_categories, output_dir):
    for file_path, category in file_paths_categories:
        if file_path is not None:  # Skip unsupported files (categorize_file returns a None path)
            category_dir = os.path.join(output_dir, category)
            os.makedirs(category_dir, exist_ok=True)
            # shutil.move also works across filesystems, unlike os.rename
            shutil.move(file_path, os.path.join(category_dir, os.path.basename(file_path)))


def multi_process_categorizer(input_dir, output_dir, categories_keywords_dict, num_processes):
    files = [os.path.join(input_dir, f) for f in os.listdir(input_dir)
             if os.path.isfile(os.path.join(input_dir, f))]

    # Compile the keyword patterns once up front; compiled re.Pattern objects
    # are picklable, so they can be passed to worker processes
    compiled_keywords = compile_keywords(categories_keywords_dict)

    # Use multiprocessing pool for CPU-bound text processing
    with Pool(processes=num_processes) as pool:
        results = pool.starmap(categorize_file, [(file_path, compiled_keywords) for file_path in files])

    # Use concurrent.futures for potentially I/O-bound tasks like moving files,
    # splitting the results into batches so several threads move files at once
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for batch in chunks(results, max(1, len(results) // 4)):
            executor.submit(threaded_worker, batch, output_dir)


def chunks(lst, chunk_size):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]
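

# Minimal usage sketch. The directory names and keyword lists below are
# hypothetical examples; num_processes should be tuned to the machine. The
# __main__ guard is required because multiprocessing may spawn workers that
# re-import this module.
if __name__ == '__main__':
    example_keywords = {
        'Financial': [r'invoice', r'payment'],
        'Legal': [r'contract', r'agreement'],
    }
    # Categorizes every supported file in ./input and moves it into
    # ./sorted/<category>/ using four worker processes
    multi_process_categorizer('input', 'sorted', example_keywords, num_processes=4)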