File size: 6,717 Bytes
87c3140
e91ac58
9d06861
87c3140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e91ac58
567930d
87c3140
 
 
 
 
 
 
 
e91ac58
87c3140
 
 
 
e91ac58
87c3140
e91ac58
 
 
 
87c3140
 
 
 
 
 
 
e91ac58
 
87c3140
567930d
 
 
87c3140
 
 
 
 
 
 
 
e91ac58
 
87c3140
567930d
87c3140
e91ac58
87c3140
e91ac58
567930d
e91ac58
87c3140
567930d
 
9d06861
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87c3140
 
 
 
 
 
 
 
 
e91ac58
87c3140
e91ac58
 
 
87c3140
 
567930d
 
87c3140
567930d
 
 
e91ac58
567930d
 
e91ac58
567930d
 
87c3140
567930d
 
 
 
 
 
 
 
e91ac58
567930d
 
 
e91ac58
87c3140
 
 
 
 
 
 
 
e91ac58
87c3140
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from dataclasses import dataclass
from langchain_core.pydantic_v1 import Field, create_model
import yaml, json, os, shutil

@dataclass
class PromptCatalog:
    """Build LLM prompts that refactor OCR text into a JSON dictionary with n user-defined keys.

    The keys and their per-key instructions are read from a YAML rules file
    that must provide ``instructions``, ``json_formatting_instructions`` and
    ``rules`` entries, where ``rules`` is a mapping of output key to
    instruction text.

    ``prompt_SLTP`` is the entry point. It populates the working attributes
    (``rules_config``, ``rules_list``, ``rules``, ``structure``,
    ``dictionary_structure``, ...) that the other methods read, so those
    methods are only usable after ``rules_list`` has been set.
    """

    domain_knowledge_example: str = ""
    similarity: str = ""
    OCR: str = ""
    n_fields: int = 0

    # ------------------------------------------------------------------
    # These are for dynamically creating your own prompts with n-columns
    # ------------------------------------------------------------------

    def prompt_SLTP(self, rules_config_path, OCR=None, is_palm=False):
        """Create the parsing prompt for ``OCR`` from the rules file.

        Args:
            rules_config_path: Path to the YAML rules file.
            OCR: Raw OCR text. Colons and double quotes are stripped before
                it is embedded in the prompt; ``None`` is treated as empty text.
            is_palm: When true, build the PaLM variant of the prompt, which
                repeats the empty JSON template three times.

        Returns:
            A ``(prompt, dictionary_structure)`` tuple: the prompt string and
            a dict holding every rule key mapped to an empty string.

        Raises:
            ValueError: If the rules file did not yield a mapping (invalid or
                empty YAML).
            KeyError: If the rules file lacks one of the required entries.
        """
        self.OCR = self.remove_colons_and_double_apostrophes(OCR)

        self.rules_config_path = rules_config_path
        self.rules_config = self.load_rules_config()
        if not isinstance(self.rules_config, dict):
            # load_rules_config returns None on a YAML error and an empty file
            # also parses to None; fail with a message that names the file.
            raise ValueError(
                f"Could not load a rules mapping from [{rules_config_path}]"
            )

        self.instructions = self.rules_config['instructions']
        self.json_formatting_instructions = self.rules_config['json_formatting_instructions']

        self.rules_list = self.rules_config['rules']
        self.n_fields = len(self.rules_list)

        # Set the rules for processing OCR into JSON format
        self.rules = self.create_rules(is_palm)

        self.structure, self.dictionary_structure = self.create_structure(is_palm)

        # Original author's note: a block with the OCR text placed between
        # instructions and json_formatting_instructions "made the prompt too
        # long. Better performance without it".
        prompt = self._build_prompt(is_palm)

        return prompt, self.dictionary_structure

    def _build_prompt(self, is_palm=False):
        """Assemble the prompt text from the attributes set by ``prompt_SLTP``.

        The first line is flush left; every following line is prefixed with
        16 spaces and the text ends with a newline plus 16 spaces. Values that
        contain newlines are inserted as-is, so only their first line gets
        the prefix.
        """
        opening = (
            "Please help me complete this text parsing task given the following rules and unstructured OCR text. "
            "Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure "
            "specified in the following rules. Please follow the rules strictly."
        )
        indent = " " * 16
        indented_lines = [
            "The rules are:",
            f"{self.instructions}",
            f"{self.json_formatting_instructions}",
            "This is the JSON template that includes instructions for each key:",
            f"{self.rules}",
            "The unstructured OCR text is:",
            f"{self.OCR}",
            "Please populate the following JSON dictionary based on the rules and the unformatted OCR text:",
        ]
        # NOTE(review): the PaLM prompt lists the empty template three times;
        # presumably deliberate emphasis - confirm before changing.
        template_repeats = 3 if is_palm else 1
        indented_lines.extend([f"{self.dictionary_structure}"] * template_repeats)

        body = "".join(f"{indent}{line}\n" for line in indented_lines)
        return f"{opening}\n{body}{indent}"

    def remove_colons_and_double_apostrophes(self, text):
        """Return ``text`` without ``:`` and ``"`` characters (``None`` -> ``""``)."""
        if text is None:
            return ""
        return text.translate(str.maketrans("", "", ':"'))

    def copy_prompt_template_to_new_dir(self, new_directory_path, rules_config_path):
        """Copy the rules file into ``new_directory_path``, creating it if needed.

        The outcome is reported with ``print``; a failed copy is reported and
        not re-raised.
        """
        # Ensure the target directory exists; exist_ok covers the case where
        # it is created by someone else between the check and the call.
        if not os.path.exists(new_directory_path):
            os.makedirs(new_directory_path, exist_ok=True)

        file_name = os.path.basename(rules_config_path)
        new_file_path = os.path.join(new_directory_path, file_name)

        try:
            shutil.copy(rules_config_path, new_file_path)
        except OSError as exc:
            print(f"Error copying [{file_name}] file: {exc}")
        else:
            print(f"Prompt [{file_name}] copied successfully to {new_file_path}")

    def load_rules_config(self):
        """Parse the YAML file at ``self.rules_config_path``.

        Returns:
            The parsed content, or ``None`` when the YAML is invalid (the
            parser error is printed).
        """
        # Read as UTF-8 so the result does not depend on the platform's
        # default encoding.
        with open(self.rules_config_path, 'r', encoding='utf-8') as stream:
            try:
                return yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
                return None

    def create_rules(self, is_palm=False):
        """Return ``self.rules_list`` serialized as a compact JSON string.

        ``is_palm`` is accepted for signature compatibility and is not used.
        """
        return json.dumps(dict(self.rules_list), sort_keys=False)

    def create_structure(self, is_palm=False):
        """Build the empty output template from the keys of ``self.rules_list``.

        ``is_palm`` is accepted for signature compatibility and is not used.

        Returns:
            A ``(structure_json_str, dictionary_structure)`` tuple: the
            template as a 4-space indented JSON string, and the same template
            as a dict with every key mapped to ``''``.
        """
        dictionary_structure = {key: '' for key in self.rules_list}
        structure_json_str = json.dumps(dictionary_structure, sort_keys=False, indent=4)
        return structure_json_str, dictionary_structure

    def generate_xlsx_headers(self, is_palm):
        """Return the rule keys as a list of column headers.

        The result is the same for both values of ``is_palm``.
        """
        return list(self.rules_list)