"""Prompt catalog: build LLM prompts that convert OCR text into structured JSON.

The output fields of the prompt are defined dynamically by a YAML rules file.
"""
from dataclasses import dataclass
from langchain_core.pydantic_v1 import Field, create_model
import yaml, json, os, shutil
@dataclass
class PromptCatalog:
    """Build LLM prompts that turn unstructured OCR text into a JSON dictionary.

    The output fields are not hard-coded. They are read from a YAML rules file
    whose top level provides ``instructions``, ``json_formatting_instructions``
    and ``rules`` (a mapping of field name to the instruction for that field),
    so a prompt with any number of columns can be created.
    """

    domain_knowledge_example: str = ""
    similarity: str = ""
    OCR: str = ""
    n_fields: int = 0

    # ------------------------------------------------------------------
    # These are for dynamically creating your own prompts with n-columns
    # ------------------------------------------------------------------
    def prompt_SLTP(self, rules_config_path, OCR=None, is_palm=False):
        """Assemble the parsing prompt from a rules file and the OCR text.

        Besides returning the prompt, this stores the loaded configuration on
        the instance (``rules_config``, ``instructions``,
        ``json_formatting_instructions``, ``rules_list``, ``n_fields``,
        ``rules``, ``structure``, ``dictionary_structure``).

        Args:
            rules_config_path: Path of the YAML rules file to load.
            OCR: Raw OCR text. Colons and double quotes are stripped before it
                is placed in the prompt. ``None`` is treated as empty text.
            is_palm: When true, the empty template is written three times at
                the end of the prompt instead of once.

        Returns:
            A ``(prompt, dictionary_structure)`` tuple, where
            ``dictionary_structure`` maps every field name to ``''``.

        Raises:
            TypeError: If the rules file did not load as a mapping (for
                example because it was empty or was not valid YAML).
            KeyError: If a required top-level key is missing from the file.
        """
        self.OCR = self.remove_colons_and_double_apostrophes(OCR)

        self.rules_config_path = rules_config_path
        self.rules_config = self.load_rules_config()
        if not isinstance(self.rules_config, dict):
            # load_rules_config() returns None on a YAML error; fail here with
            # a readable message instead of "'NoneType' is not subscriptable".
            raise TypeError(
                f"Rules config [{rules_config_path}] did not load as a mapping "
                f"(got {type(self.rules_config).__name__}); cannot build the prompt."
            )

        self.instructions = self.rules_config['instructions']
        self.json_formatting_instructions = self.rules_config['json_formatting_instructions']
        self.rules_list = self.rules_config['rules']
        self.n_fields = len(self.rules_list)

        # Set the rules for processing OCR into JSON format
        self.rules = self.create_rules(is_palm)
        self.structure, self.dictionary_structure = self.create_structure(is_palm)

        # The OCR text used to be repeated between the instructions and the
        # JSON formatting instructions as well. That made the prompt too long;
        # performance is better without it.
        #
        # NOTE(review): the template is interpolated from the dict, so it is
        # rendered as a Python repr (single quotes), not as the JSON string
        # held in self.structure -- confirm this is intended before changing.
        prompt = f"""Please help me complete this text parsing task given the following rules and unstructured OCR text. Your task is to refactor the OCR text into a structured JSON dictionary that matches the structure specified in the following rules. Please follow the rules strictly.
The rules are:
{self.instructions}
{self.json_formatting_instructions}
This is the JSON template that includes instructions for each key:
{self.rules}
The unstructured OCR text is:
{self.OCR}
Please populate the following JSON dictionary based on the rules and the unformatted OCR text:
{self.dictionary_structure}
"""
        if is_palm:
            # The PaLM variant ends with three copies of the empty template.
            prompt += f"{self.dictionary_structure}\n" * 2

        return prompt, self.dictionary_structure

    def remove_colons_and_double_apostrophes(self, text):
        """Return ``text`` with every ``:`` and ``"`` removed.

        ``None`` yields an empty string so that ``prompt_SLTP`` can be called
        with its default ``OCR=None``.
        """
        if text is None:
            return ""
        return text.replace(":", "").replace("\"", "")

    def copy_prompt_template_to_new_dir(self, new_directory_path, rules_config_path):
        """Copy the rules file into ``new_directory_path``, keeping its file name.

        The target directory is created when missing. A failed copy is
        reported on stdout and does not raise.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(new_directory_path, exist_ok=True)

        file_name = os.path.basename(rules_config_path)
        new_file_path = os.path.join(new_directory_path, file_name)

        try:
            shutil.copy(rules_config_path, new_file_path)
            print(f"Prompt [{file_name}] copied successfully to {new_file_path}")
        except OSError as exc:  # includes shutil.Error / SameFileError
            print(f"Error copying [{file_name}] file: {exc}")

    def load_rules_config(self):
        """Parse the YAML file at ``self.rules_config_path``.

        Returns:
            The parsed document, or ``None`` when the file is not valid YAML
            (the parser error is printed in that case).
        """
        # Read as UTF-8 explicitly rather than with the platform's locale
        # encoding, so the same rules file loads identically everywhere.
        with open(self.rules_config_path, 'r', encoding='utf-8') as stream:
            try:
                return yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
                return None

    def create_rules(self, is_palm=False):
        """Return ``self.rules_list`` serialised as a compact JSON string.

        Key order is preserved. ``is_palm`` is accepted but not used.
        """
        return json.dumps(dict(self.rules_list), sort_keys=False)

    def create_structure(self, is_palm=False):
        """Build the empty template that the model is asked to fill in.

        The template is created directly from the field names, with empty
        strings as the default values. ``is_palm`` is accepted but not used.

        Returns:
            A ``(structure_json_str, dictionary_structure)`` tuple: the
            template as a JSON string indented by four spaces, and the same
            template as a dict mapping every field name to ``''``.
        """
        dictionary_structure = {key: '' for key in self.rules_list}
        structure_json_str = json.dumps(dictionary_structure, sort_keys=False, indent=4)
        return structure_json_str, dictionary_structure

    def generate_xlsx_headers(self, is_palm):
        """Return the field names of the rules as a list of column headers.

        ``is_palm`` does not affect the result.
        """
        return list(self.rules_list)
|