import spacy
from spacy.training import Example
import jsonlines
import random

# Load a blank English model
nlp = spacy.blank("en")

# Add a multi-label text classification pipeline to the model
textcat = nlp.add_pipe("textcat_multilabel", last=True)
textcat.add_label("CapitalRequirements")
textcat.add_label("ConsumerProtection")
textcat.add_label("RiskManagement")
textcat.add_label("ReportingAndCompliance")
textcat.add_label("CorporateGovernance")
# Path to the processed training data file (JSONL, one record per line)
processed_data_file = "data/firstStep_file.jsonl"

# Open the JSONL file and load all records
with jsonlines.open(processed_data_file) as reader:
    processed_data = list(reader)
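
# Assumption about the input format: each JSONL line is expected to carry a
# "text" field and a single "label" field naming one of the five categories,
# e.g. (illustrative record, not taken from the actual dataset):
# {"text": "Institutions shall maintain a minimum capital ratio ...", "label": "CapitalRequirements"}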
# Convert the processed data to spaCy Example objects
spacy_train_data = []
for obj in processed_data:
    text = obj["text"]
    # spaCy expects category scores as floats between 0.0 and 1.0
    label = {
        "CapitalRequirements": float(obj["label"] == "CapitalRequirements"),
        "ConsumerProtection": float(obj["label"] == "ConsumerProtection"),
        "RiskManagement": float(obj["label"] == "RiskManagement"),
        "ReportingAndCompliance": float(obj["label"] == "ReportingAndCompliance"),
        "CorporateGovernance": float(obj["label"] == "CorporateGovernance"),
    }
    spacy_train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": label}))
# Initialize the model with the training examples and get the optimizer
optimizer = nlp.initialize(lambda: spacy_train_data)

# Fix the random seed once for reproducibility; re-seeding inside the loop
# would repeat the same shuffle order every epoch
spacy.util.fix_random_seed(1)

# Train the text classification model
n_iter = 10
for i in range(n_iter):
    random.shuffle(spacy_train_data)
    losses = {}
    for batch in spacy.util.minibatch(spacy_train_data, size=8):
        nlp.update(batch, losses=losses, sgd=optimizer)
    print("Iteration:", i, "Losses:", losses)
# Save the trained model
output_dir = "./my_trained_model"
nlp.to_disk(output_dir)
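
# Minimal usage sketch: load the saved pipeline and score a new text.
# The sample sentence and the 0.5 acceptance threshold are illustrative
# assumptions, not part of the training script above.
trained_nlp = spacy.load(output_dir)
doc = trained_nlp("Firms must submit quarterly risk reports to the regulator.")
# doc.cats maps each label to a score between 0.0 and 1.0; with
# textcat_multilabel, several labels can exceed the threshold at once.
print(doc.cats)
print({label: score for label, score in doc.cats.items() if score > 0.5})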