ManjinderUNCC committed on
Commit
f0c6f53
1 Parent(s): 2f59df2

Upload 6 files

Browse files
python_Code/evaluate_model.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import jsonlines
3
+ from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
4
+
5
# Evaluate the trained text classifier against the golden evaluation set and
# print accuracy, weighted precision/recall/F1 and a per-label report.

# Load the trained spaCy model
nlp = spacy.load("./my_trained_model")

# Load the golden evaluation data (one JSON object per line)
with jsonlines.open("data/goldenEval.jsonl") as reader:
    golden_eval_data = list(reader)

# Stop early with a clear message when there is nothing to evaluate.
if not golden_eval_data:
    raise SystemExit("No evaluation records found in data/goldenEval.jsonl")

# Predict category scores for every record. nlp.pipe processes the texts as a
# batch stream instead of one pipeline call per document.
predicted_labels = [
    doc.cats for doc in nlp.pipe(record["text"] for record in golden_eval_data)
]

# Extract ground truth labels from the golden evaluation data. A missing or
# null "accept" field is treated the same as an empty list.
true_labels = [record.get("accept") or [] for record in golden_eval_data]

# Convert label format to match sklearn's classification report format:
# one label string per record on both sides.
#   - gold: the first accepted label, or "reject" when nothing was accepted
#   - predicted: the highest-scoring category, or "reject" when the model
#     returned no category scores at all
# NOTE(review): the argmax below never yields "reject" as long as the model
# returns any scores, so every gold "reject" record is counted as an error,
# and multi-label gold records are reduced to their first label. Confirm this
# is the intended evaluation protocol.
true_labels_flat = [label[0] if label else "reject" for label in true_labels]
predicted_labels_flat = [
    max(pred, key=pred.get) if pred else "reject" for pred in predicted_labels
]

# Calculate evaluation metrics. A label that never appears in the predictions
# (or in the gold data) has an undefined precision (or recall); zero_division=0
# scores it as 0 explicitly instead of relying on the warning default.
accuracy = accuracy_score(true_labels_flat, predicted_labels_flat)
precision = precision_score(
    true_labels_flat, predicted_labels_flat, average='weighted', zero_division=0
)
recall = recall_score(
    true_labels_flat, predicted_labels_flat, average='weighted', zero_division=0
)
f1 = f1_score(
    true_labels_flat, predicted_labels_flat, average='weighted', zero_division=0
)

# Additional per-label classification report
report = classification_report(
    true_labels_flat, predicted_labels_flat, zero_division=0
)

# Print the evaluation metrics
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

# Print the detailed classification report
print("Detailed Classification Report:")
print(report)
python_Code/finalStep-formatLabel.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import jsonlines
2
+
3
# Classified records produced by the labelling step (input of this script).
input_file = "data/thirdStep_file.jsonl"

# Destination for the reformatted records.
output_file = "data/Full-Labeled-Data-Final-4465.jsonl"

# A category is accepted only when its score is strictly above this value.
threshold = 0.21

# Choice options attached to every output record, one per category.
options = [
    {"id": category_id, "text": display_text, "meta": "0.00"}
    for category_id, display_text in (
        ("CapitalRequirements", "Capital Requirements"),
        ("ConsumerProtection", "Consumer Protection"),
        ("RiskManagement", "Risk Management"),
        ("ReportingAndCompliance", "Reporting And Compliance"),
        ("CorporateGovernance", "Corporate Governance"),
    )
]


def process_record(record):
    """Convert one classified record into the final labelled format.

    Args:
        record: mapping with a "text" entry and a "predicted_labels" entry
            that maps category names to scores.

    Returns:
        A dict with the keys "text", "cats", "accept", "answer" and
        "options". "accept" lists the categories scoring strictly above
        ``threshold``; "answer" is "accept" when that list is non-empty and
        "reject" otherwise; "options" holds a fresh copy of every entry of
        the module-level ``options`` list.

    Raises:
        KeyError: if "text" or "predicted_labels" is missing from ``record``.
    """
    body = record["text"]
    scores = record["predicted_labels"]

    # Keep the categories in the order the score mapping provides them.
    passing = [name for name in scores if scores[name] > threshold]

    if passing:
        verdict = "accept"
    else:
        verdict = "reject"

    # Copy each option so the output record does not share dicts with the
    # module-level template.
    option_copies = []
    for template in options:
        option_copies.append({field: template[field] for field in ("id", "text", "meta")})

    return {
        "text": body,
        "cats": scores,
        "accept": passing,
        "answer": verdict,
        "options": option_copies,
    }
48
+
49
# Read the classified records one by one, convert each with process_record,
# and write the results to the output file in the same order.
with jsonlines.open(input_file, "r") as infile, jsonlines.open(output_file, "w") as outfile:
    outfile.write_all(process_record(record) for record in infile)
python_Code/firstStep-format.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import jsonlines
2
+
3
# Path to your dataset file
dataset_file = "data/train200.jsonl"

# Path to the output file
output_file = "data/firstStep_file.jsonl"

# Copy every record that has both a text and at least one accepted label to
# the output file as {"text": ..., "label": <first accepted label>}. Records
# missing either part are reported and skipped.
try:
    with jsonlines.open(dataset_file) as reader, jsonlines.open(output_file, mode='w') as writer:
        for obj in reader:
            text = obj.get("text")
            # "accept" may be absent, null or an empty list; indexing [0]
            # unconditionally would raise IndexError on an empty list and
            # abort the whole conversion at the first unlabeled record.
            accepted = obj.get("accept") or []
            label = accepted[0] if accepted else None
            if text and label:
                writer.write({"text": text, "label": label})
            else:
                print("Warning: Text or label missing in the JSON object.")
    print("Processing completed. Output written to:", output_file)
except Exception as e:
    # Script-level boundary: report the failure (unreadable input, malformed
    # line, unwritable output) instead of showing a traceback.
    print("Error:", e)
python_Code/five_examples_annotated.ipynb ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "Text: Banks that are at risk of failing selling bonds? Absolutely not! No way! The idea of where this money needs to come from should've been a thought that was had before these institutions took on crazy amounts of leverage and debt they couldn't pay. It's an obvious attempt at shifting the massive risk they hold onto unsuspecting investors instead of owning the bag themselves, and admitting they had no real risk management. Free money is becoming a thing of the past, it's time for these institutions to grow up and learn. Failure is always an option. Funds raised by selling off these bonds has a high chance of being similarly mismanaged by these at risk of failing institutions due to the aforementioned lack of real risk management. Actions speak louder than words, and we still live in the shadow of a great financial crisis (hmm, I wonder who could've caused that and why?) And constantly throwing the average Joe under the bus does a pretty bad job of helping maintain public confidence in the finance system.\n",
13
+ "\n",
14
+ "ReportingAndCompliance: 0.3665\n",
15
+ "RiskManagement: 0.0330\n",
16
+ "ConsumerProtection: 0.0310\n",
17
+ "CorporateGovernance: 0.0423\n",
18
+ "CapitalRequirements: 0.0245\n",
19
+ "\n",
20
+ "Text: The Wisconsin Bankers Association (aka the WBA) is the largest financial trade association in Wisconsin, representing over 200 state and nationally chartered banks, savings banks,and savings and loan associations located in communities throughout the State. WBA appreciates the opportunity to comment on the interim final rule. Over the past year, the Board of Governors of the Federal Reserve System (FRB) issued several interim final rules to except certain loans that are guaranteed under the Small Business Administration's (SBA's) Paycheck Protection Program (PPP) from the requirements of the Federal Reserve Act and the corresponding provisions of Regulation O.To reflect the latest program extension by Congress, FRB issued this interim final rule to extend the Regulation O exception to PPP loans through March 31, 2022. WBA filed comment letters in support of FRB's previous interim final rules as the removal of Regulation O obstacles through the exception has helped allow Wisconsin's banks to more efficiently address the needs of their insider-owned small businesses. FRB'spast interim final rules have helped ensuree ligible businesses have timely access to liquidity to help overcome economic hurdles resulting from the effects of COVID-19 and the mitigating efforts in effect throughout Wisconsin. WBA appreciates FRB's actions to provide continued clarity that loans made by a bank to insider-owned businesses that are guaranteed under SBA's PPP remain excepted from the Federal Reserve Act and the corresponding provisions of Regulation O. Without an extension of the exception, WBA fears some auditors and examiners would treat such loans differently than PPP loans made on or before June 30 ,2020. As have been requirements of the program since inception, any PPP loan made during the extended program period must still meet certain eligibility and documentation criteria, and have the same interest rate, payment, and loan term. 
Additionally, all eligibility and documentation criteria and all loan terms and program requirements remain exclusively set by SBA and cannot be altered by the lender. Therefore, FRB should once again extend its exception for PPP loans; this time for PPP loans made through March 31 ,2022. WBA also appreciates FRB's efforts to have promulgated the interim final rules in such a straight-forward manner and for using plain language in its interim final rules. WBA encourages FRB to continue such efforts in future rule makings and for any other regulatory review efforts.\n",
21
+ "\n",
22
+ "ReportingAndCompliance: 0.6879\n",
23
+ "RiskManagement: 0.0000\n",
24
+ "ConsumerProtection: 0.0048\n",
25
+ "CorporateGovernance: 0.0000\n",
26
+ "CapitalRequirements: 0.0000\n",
27
+ "\n",
28
+ "Text: How about you crooks focus on the billions being laundered by banks in plain fucking sight instead of intruding in our lives more. Disgusting. Aweful.\n",
29
+ "\n",
30
+ "ReportingAndCompliance: 0.4072\n",
31
+ "RiskManagement: 0.2440\n",
32
+ "ConsumerProtection: 0.3574\n",
33
+ "CorporateGovernance: 0.3809\n",
34
+ "CapitalRequirements: 0.2414\n",
35
+ "\n",
36
+ "Text: If adopted, this proposal [R-1726], would prove to be an invasion of privacy. In terms of digital assets, crypto exchanges are not held accountable in the same way that other financial institutions are, and have a track record of bad operational security when it comes to securely storing client information.\n",
37
+ "\n",
38
+ "ReportingAndCompliance: 0.4365\n",
39
+ "RiskManagement: 0.5856\n",
40
+ "ConsumerProtection: 0.3847\n",
41
+ "CorporateGovernance: 0.1818\n",
42
+ "CapitalRequirements: 0.1904\n",
43
+ "\n",
44
+ "Text: Amendments to 20402(d)(2) and 204.2(e)(2) and (4) make a savings account without transfer or withdrawal limits transaction accounts. Can a depository institution avoid having a savings account be a transaction account by imposing a transfer/withdrawal restriction? Must such a restriction be absolute, or can it be suggested though the imposition of transaction fees for excess transfers/withdrawals in a stated period? The prefatory text, including the FAQ found there consistently uses the verb 'suspend.' Is 'suspend' used in the dictionary sense of 'temporarily prevent from continuing or being in force or effect'? If so, is that deliberate so as to suggest that it's expected that depository institutions will re-impose transfer/withdrawal limits at some future date (e.g., once the local economy recovers from the present pandemic)? Does the Board anticipate reinstating savings account transfer limits in the future, or believe that they will be reimposed by depository institutions as an account or contract provision? Relationship to Regulation CC A related question regarding the impact of the Reg D changes on the definition of 'account' in Regulation CC (12 CFR Part 229), which appears in the definition to exclude, except for the purposes of subpart D, any savings account described in 12 CFR 204.2(d)(2) 'even though such accounts permit third party transfers.' I note that the Official Interpretations applicable to the 229.2(a)(1) definition of 'account' in Regulation CC suggests that savings deposits are excluded because they :'may have limited third party payment powers,' and the Board believed the 'EFA Act is intended to apply only to accounts that permit UNLIMITED (emphasis added) third party transfers.' 
Will, then, a bank that 'suspends' its limits on savings deposit transfers and withdrawals be perforce (and perhaps unwittingly) making those savings accounts subject to Regulation CC, or does the Regulation CC 'account' definition continue to exclude savings accounts as described in 204.2(d)(2)? Thank you for your consideration of these comments and questions.\n",
45
+ "\n",
46
+ "ReportingAndCompliance: 0.2221\n",
47
+ "RiskManagement: 0.0007\n",
48
+ "ConsumerProtection: 0.0513\n",
49
+ "CorporateGovernance: 0.0031\n",
50
+ "CapitalRequirements: 0.0000\n",
51
+ "\n"
52
+ ]
53
+ }
54
+ ],
55
+ "source": [
56
+ "import spacy\n",
57
+ "\n",
58
+ "# Load the trained model\n",
59
+ "nlp = spacy.load('output/experiment1/model-best')\n",
60
+ "\n",
61
+ "# List of new text examples you want to classify\n",
62
+ "texts = [\n",
63
+ " \"Banks that are at risk of failing selling bonds? Absolutely not! No way! The idea of where this money needs to come from should've been a thought that was had before these institutions took on crazy amounts of leverage and debt they couldn't pay. It's an obvious attempt at shifting the massive risk they hold onto unsuspecting investors instead of owning the bag themselves, and admitting they had no real risk management. Free money is becoming a thing of the past, it's time for these institutions to grow up and learn. Failure is always an option. Funds raised by selling off these bonds has a high chance of being similarly mismanaged by these at risk of failing institutions due to the aforementioned lack of real risk management. Actions speak louder than words, and we still live in the shadow of a great financial crisis (hmm, I wonder who could've caused that and why?) And constantly throwing the average Joe under the bus does a pretty bad job of helping maintain public confidence in the finance system.\",\n",
64
+ " \"The Wisconsin Bankers Association (aka the WBA) is the largest financial trade association in Wisconsin, representing over 200 state and nationally chartered banks, savings banks,and savings and loan associations located in communities throughout the State. WBA appreciates the opportunity to comment on the interim final rule. Over the past year, the Board of Governors of the Federal Reserve System (FRB) issued several interim final rules to except certain loans that are guaranteed under the Small Business Administration's (SBA's) Paycheck Protection Program (PPP) from the requirements of the Federal Reserve Act and the corresponding provisions of Regulation O.To reflect the latest program extension by Congress, FRB issued this interim final rule to extend the Regulation O exception to PPP loans through March 31, 2022. WBA filed comment letters in support of FRB's previous interim final rules as the removal of Regulation O obstacles through the exception has helped allow Wisconsin's banks to more efficiently address the needs of their insider-owned small businesses. FRB'spast interim final rules have helped ensuree ligible businesses have timely access to liquidity to help overcome economic hurdles resulting from the effects of COVID-19 and the mitigating efforts in effect throughout Wisconsin. WBA appreciates FRB's actions to provide continued clarity that loans made by a bank to insider-owned businesses that are guaranteed under SBA's PPP remain excepted from the Federal Reserve Act and the corresponding provisions of Regulation O. Without an extension of the exception, WBA fears some auditors and examiners would treat such loans differently than PPP loans made on or before June 30 ,2020. As have been requirements of the program since inception, any PPP loan made during the extended program period must still meet certain eligibility and documentation criteria, and have the same interest rate, payment, and loan term. 
Additionally, all eligibility and documentation criteria and all loan terms and program requirements remain exclusively set by SBA and cannot be altered by the lender. Therefore, FRB should once again extend its exception for PPP loans; this time for PPP loans made through March 31 ,2022. WBA also appreciates FRB's efforts to have promulgated the interim final rules in such a straight-forward manner and for using plain language in its interim final rules. WBA encourages FRB to continue such efforts in future rule makings and for any other regulatory review efforts.\",\n",
65
+ " \"How about you crooks focus on the billions being laundered by banks in plain fucking sight instead of intruding in our lives more. Disgusting. Aweful.\",\n",
66
+ " \"If adopted, this proposal [R-1726], would prove to be an invasion of privacy. In terms of digital assets, crypto exchanges are not held accountable in the same way that other financial institutions are, and have a track record of bad operational security when it comes to securely storing client information.\",\n",
67
+ " \"Amendments to 20402(d)(2) and 204.2(e)(2) and (4) make a savings account without transfer or withdrawal limits transaction accounts. Can a depository institution avoid having a savings account be a transaction account by imposing a transfer/withdrawal restriction? Must such a restriction be absolute, or can it be suggested though the imposition of transaction fees for excess transfers/withdrawals in a stated period? The prefatory text, including the FAQ found there consistently uses the verb 'suspend.' Is 'suspend' used in the dictionary sense of 'temporarily prevent from continuing or being in force or effect'? If so, is that deliberate so as to suggest that it's expected that depository institutions will re-impose transfer/withdrawal limits at some future date (e.g., once the local economy recovers from the present pandemic)? Does the Board anticipate reinstating savings account transfer limits in the future, or believe that they will be reimposed by depository institutions as an account or contract provision? Relationship to Regulation CC A related question regarding the impact of the Reg D changes on the definition of 'account' in Regulation CC (12 CFR Part 229), which appears in the definition to exclude, except for the purposes of subpart D, any savings account described in 12 CFR 204.2(d)(2) 'even though such accounts permit third party transfers.' I note that the Official Interpretations applicable to the 229.2(a)(1) definition of 'account' in Regulation CC suggests that savings deposits are excluded because they :'may have limited third party payment powers,' and the Board believed the 'EFA Act is intended to apply only to accounts that permit UNLIMITED (emphasis added) third party transfers.' 
Will, then, a bank that 'suspends' its limits on savings deposit transfers and withdrawals be perforce (and perhaps unwittingly) making those savings accounts subject to Regulation CC, or does the Regulation CC 'account' definition continue to exclude savings accounts as described in 204.2(d)(2)? Thank you for your consideration of these comments and questions.\"\n",
68
+ "]\n",
69
+ "\n",
70
+ "for text in texts:\n",
71
+ " doc = nlp(text)\n",
72
+ " print(f\"Text: {text}\\n\")\n",
73
+ " for label, score in doc.cats.items():\n",
74
+ " print(f\"{label}: {score:.4f}\")\n",
75
+ " print()"
76
+ ]
77
+ }
78
+ ],
79
+ "metadata": {
80
+ "kernelspec": {
81
+ "display_name": "venv",
82
+ "language": "python",
83
+ "name": "python3"
84
+ },
85
+ "language_info": {
86
+ "codemirror_mode": {
87
+ "name": "ipython",
88
+ "version": 3
89
+ },
90
+ "file_extension": ".py",
91
+ "mimetype": "text/x-python",
92
+ "name": "python",
93
+ "nbconvert_exporter": "python",
94
+ "pygments_lexer": "ipython3",
95
+ "version": "3.11.4"
96
+ }
97
+ },
98
+ "nbformat": 4,
99
+ "nbformat_minor": 2
100
+ }
python_Code/secondStep-score.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ from spacy.training import Example
3
+ import jsonlines
4
+ import random
5
+
6
# Train a multilabel text classifier on the first-step data and save it.

# Category labels the classifier is trained to predict.
LABELS = (
    "CapitalRequirements",
    "ConsumerProtection",
    "RiskManagement",
    "ReportingAndCompliance",
    "CorporateGovernance",
)

# Fix the random seed once, before the model is created. Reseeding with the
# same value at the start of every epoch (as was done inside the training
# loop) restarts the random stream identically each epoch, while leaving the
# model initialisation unseeded.
spacy.util.fix_random_seed(1)

# Load a blank English model
nlp = spacy.blank("en")

# Add text classification pipeline to the model and register every label
textcat = nlp.add_pipe('textcat_multilabel', last=True)
for name in LABELS:
    textcat.add_label(name)

# Path to the processed data file
processed_data_file = "data/firstStep_file.jsonl"

# Open the JSONL file and read all {"text": ..., "label": ...} records
with jsonlines.open(processed_data_file) as reader:
    processed_data = list(reader)

# Convert processed data to spaCy format: each record becomes an Example whose
# "cats" maps every label to True for the record's label and False otherwise.
spacy_train_data = []
for obj in processed_data:
    text = obj["text"]
    label = {name: obj["label"] == name for name in LABELS}
    spacy_train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": label}))

# Initialize the model and get the optimizer
optimizer = nlp.initialize()

# Train the text classification model
n_iter = 10
for i in range(n_iter):
    # New example order every epoch; reproducible thanks to the seed above.
    random.shuffle(spacy_train_data)
    losses = {}
    for batch in spacy.util.minibatch(spacy_train_data, size=8):
        nlp.update(batch, losses=losses, sgd=optimizer)
    print("Iteration:", i, "Losses:", losses)

# Save the trained model
output_dir = "./my_trained_model"
nlp.to_disk(output_dir)
python_Code/thirdStep-label.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import jsonlines
3
+
4
# Score every record of the unlabeled data set with the trained model and
# store the text together with its category scores.

# Location of the trained model, loaded once for the whole run.
model_path = "./my_trained_model"
nlp = spacy.load(model_path)

# JSONL file with the records to classify; each record needs a "text" field.
unlabeled_data_file = "data/train.jsonl"

# JSONL file that receives one {"text", "predicted_labels"} row per record.
output_file = "data/thirdStep_file.jsonl"


def _classify(text):
    """Return the output row for one text: the text and the model's scores."""
    return {"text": text, "predicted_labels": nlp(text).cats}


# Classify all records first; the output file is only written once every
# record has been processed.
with jsonlines.open(unlabeled_data_file) as reader:
    classified_data = [_classify(record["text"]) for record in reader]

with jsonlines.open(output_file, mode="w") as writer:
    for row in classified_data:
        writer.write(row)