prodigy-ecfr-textcat / python_Code /finalStep-formatLabel.py
ManjinderUNCC's picture
Update python_Code/finalStep-formatLabel.py
3715140 verified
import jsonlines
# Input file containing classified data
input_file = "data/thirdStep_file.jsonl"
# Output file to store transformed data
output_file = "data/Full-Labeled-Data-Final-4465.jsonl"
# Threshold for considering a label
threshold = 0.21
# Options for different categories
options = [
{"id": "CapitalRequirements", "text": "Capital Requirements", "meta": "0.00"},
{"id": "ConsumerProtection", "text": "Consumer Protection", "meta": "0.00"},
{"id": "RiskManagement", "text": "Risk Management", "meta": "0.00"},
{"id": "ReportingAndCompliance", "text": "Reporting And Compliance", "meta": "0.00"},
{"id": "CorporateGovernance", "text": "Corporate Governance", "meta": "0.00"}
]
# Function to process each record
def process_record(record):
# Extract text and predicted labels
text = record["text"]
predicted_labels = record["predicted_labels"]
# Determine accepted categories based on threshold
accepted_categories = [label for label, score in predicted_labels.items() if score > threshold]
# Determine answer based on accepted categories
answer = "accept" if accepted_categories else "reject"
# Prepare options with meta
options_with_meta = [
{"id": option["id"], "text": option["text"], "meta": option["meta"]} for option in options
]
# Construct the output record
output_record = {
"text": text,
"cats": predicted_labels,
"accept": accepted_categories,
"answer": answer,
"options": options_with_meta
}
return output_record
# Process input file and write transformed data to output file
with jsonlines.open(input_file, "r") as infile, jsonlines.open(output_file, "w") as outfile:
for record in infile:
output_record = process_record(record)
outfile.write(output_record)