In [None]:
!pip install transformers datasets seqeval huggingface_hub


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownlo

In [None]:
# Standard library imports
import ast                # Used for safe evaluation of strings to Python objects (e.g., parsing tokens)
import os                 # Provides functions for interacting with the operating system
import warnings           # Used to handle or suppress warnings

# Third-party numerical / deep-learning imports
import numpy as np        # Essential for numerical operations and array manipulation
import torch              # PyTorch library for tensor computations and model handling

# Hugging Face and Transformers imports
from datasets import load_dataset                     # Loads datasets for model training and evaluation
from transformers import (
    AutoTokenizer,                                   # Initializes a tokenizer from a pre-trained model
    DataCollatorForTokenClassification,              # Handles padding and formatting of token classification data
    TrainingArguments,                               # Defines training parameters like batch size and learning rate
    Trainer,                                         # High-level API for managing training and evaluation
    AutoModelForTokenClassification,                 # Loads a pre-trained model for token classification tasks
    get_linear_schedule_with_warmup,                 # Learning rate scheduler for gradual warm-up and linear decay
    EarlyStoppingCallback,                           # Callback to stop training if validation performance plateaus
    pipeline,                                        # Builds the inference pipeline used after training
)

# Hugging Face Hub
from huggingface_hub import login                   # Allows logging in to Hugging Face Hub to upload models

# seqeval metrics for NER evaluation
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
# Provides precision, recall, F1-score, and classification report for evaluating NER model performance


In [None]:
# Log in to Hugging Face Hub.
# SECURITY: an access token must never be committed in a notebook. The token that was
# hardcoded in this cell is exposed and should be revoked in the Hugging Face account
# settings. The token is now read from the HF_TOKEN environment variable; when it is
# unset, login() receives None and falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Disable WandB (Weights & Biases) logging to avoid unwanted log outputs during training
os.environ["WANDB_DISABLED"] = "true"

# Suppress warning messages to keep output clean, especially during training and evaluation
warnings.filterwarnings("ignore")


In [None]:
# Load the Azerbaijani NER dataset from the Hugging Face Hub.
# The recorded output shows a single "train" split (99545 rows) with the columns
# 'index', 'tokens' and 'ner_tags'.
dataset = load_dataset("LocalDoc/azerbaijani-ner-dataset")
print(dataset)  # Display dataset structure (splits, columns, row counts)

# Preprocessing function to format tokens and NER tags correctly
def _parse_list(value):
    """Return ``value`` as a list, parsing it with ``ast.literal_eval`` when it is not one already.

    Raises ``ValueError``/``SyntaxError`` for unparsable input and ``ValueError`` when the
    parsed object is not a list or tuple.
    """
    parsed = value if isinstance(value, (list, tuple)) else ast.literal_eval(value)
    if not isinstance(parsed, (list, tuple)):
        raise ValueError(f"expected a list, got {type(parsed).__name__}")
    return list(parsed)


def preprocess_example(example):
    """Parse the ``tokens`` and ``ner_tags`` fields of one dataset row.

    Args:
        example: Mapping with the keys ``index``, ``tokens`` and ``ner_tags``. The two
            list fields may be stringified Python lists or already-parsed lists.

    Returns:
        The same mapping with ``tokens`` as a list and ``ner_tags`` as a list of ints.
        When either field cannot be parsed, both are set to ``[]`` and a message with the
        row's ``index`` is printed, so callers can drop rows whose ``tokens`` are empty.
    """
    try:
        tokens = _parse_list(example["tokens"])
        ner_tags = [int(tag) for tag in _parse_list(example["ner_tags"])]
    except (ValueError, SyntaxError, TypeError) as e:
        # TypeError covers tags that cannot be converted with int() (e.g. None entries);
        # without it a single bad row would abort the whole dataset.map() run.
        print(f"Skipping malformed example: {example['index']} due to error: {e}")
        tokens, ner_tags = [], []
    # Assign both fields together so a row is never left half-parsed.
    example["tokens"] = tokens
    example["ner_tags"] = ner_tags
    return example

# Apply preprocessing to each dataset entry, then actually drop the rows reported as
# "Skipping malformed example": preprocess_example leaves them with an empty token list,
# and keeping them would feed empty sequences into training and evaluation.
dataset = dataset.map(preprocess_example).filter(lambda example: len(example["tokens"]) > 0)


README.md:   0%|          | 0.00/2.87k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/99545 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'tokens', 'ner_tags'],
        num_rows: 99545
    })
})


Map:   0%|          | 0/99545 [00:00<?, ? examples/s]

Skipping malformed example: 7171f30e-fa1e-49ec-975e-16c88c9b95e9 due to error: malformed node or string: None
Skipping malformed example: 91dfd97b-2997-4080-8054-00cadec14dfc due to error: malformed node or string: None
Skipping malformed example: cfb8beb4-ae7a-4185-9a54-08b0e85d03d3 due to error: malformed node or string: None
Skipping malformed example: 5f0a2991-38b3-435b-9059-a05382e89a62 due to error: malformed node or string: None
Skipping malformed example: 9d705fde-ce09-4bef-9f4a-9ad1fa452cc9 due to error: malformed node or string: None
Skipping malformed example: 182457fb-c648-4fca-a207-af5a00072d4a due to error: malformed node or string: None
Skipping malformed example: d9205ccd-c692-4cf1-8310-181de8f4cdc8 due to error: malformed node or string: None
Skipping malformed example: dac55265-38cd-4c4b-9e56-a48a77e108d4 due to error: malformed node or string: None
Skipping malformed example: f3d38b45-0035-45ab-b0aa-79ae7c63ba7a due to error: malformed node or string: None
Skipping m

In [None]:
# Initialize the tokenizer for multilingual NER using XLM-RoBERTa.
# AutoTokenizer picks the tokenizer implementation from the "xlm-roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Function to tokenize input and align labels with tokenized words
def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and align its word-level NER tags to sub-tokens.

    Args:
        example: Mapping with ``tokens`` (list of words) and ``ner_tags`` (list of
            integer tag ids, one per word).

    Returns:
        The tokenizer output (padded/truncated to 128 positions) with an added
        ``labels`` entry. Only the first sub-token of each word carries the word's tag;
        special/padding positions, continuation sub-tokens, and words with no matching
        tag receive -100 so the loss ignores them.
    """
    # NOTE(review): padding to max_length here means the data collator has nothing left
    # to pad dynamically — confirm this fixed-length padding is intended.
    encoding = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    tags = example["ner_tags"]
    tag_count = len(tags)
    word_ids = encoding.word_ids()

    def label_for(current, previous):
        # Positions without a word (special/padding) and repeated word ids
        # (continuation sub-tokens) are excluded from the loss.
        if current is None or current == previous:
            return -100
        # First sub-token of a word: use its tag when one exists.
        return tags[current] if current < tag_count else -100

    # Pair every word id with the one before it (None before the first position).
    encoding["labels"] = [
        label_for(current, previous)
        for current, previous in zip(word_ids, [None, *word_ids[:-1]])
    ]
    return encoding

# Apply tokenization and label alignment to the dataset, one example per call
# (batched=False), since tokenize_and_align_labels processes a single sentence.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=False)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/99545 [00:00<?, ? examples/s]

In [None]:
# Create a 90-10 split of the dataset for training and validation.
# A fixed seed keeps the split, and therefore the reported metrics, reproducible.
# The guard makes the cell safe to re-run: without it, a second execution would split
# the already reduced "train" split again.
if "test" not in tokenized_datasets:
    tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
print(tokenized_datasets)  # Output structure of split datasets

DatasetDict({
    train: Dataset({
        features: ['index', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 89590
    })
    test: Dataset({
        features: ['index', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9955
    })
})


In [None]:
# Entity types annotated in the dataset, in tag-id order. Each type contributes a
# B- (beginning) and an I- (inside) label, in that order, after the leading "O".
_ENTITY_TYPES = (
    "PERSON",         # Person name (e.g., "John" in "John Doe")
    "LOCATION",       # Geographical location (e.g., "Paris")
    "ORGANISATION",   # Organization name (e.g., "UNICEF")
    "DATE",           # Date entity (e.g., "2024-11-05")
    "TIME",           # Time (e.g., "12:00 PM")
    "MONEY",          # Monetary values (e.g., "$20")
    "PERCENTAGE",     # Percentage values (e.g., "20%")
    "FACILITY",       # Physical facilities (e.g., "Airport")
    "PRODUCT",        # Product names (e.g., "iPhone")
    "EVENT",          # Named events (e.g., "Olympics")
    "ART",            # Works of art (e.g., "Mona Lisa")
    "LAW",            # Laws and legal documents (e.g., "Article 50")
    "LANGUAGE",       # Languages (e.g., "Azerbaijani")
    "GPE",            # Geopolitical entities
    "NORP",           # Nationalities, religious groups, political groups
    "ORDINAL",        # Ordinal indicators (e.g., "first", "second")
    "CARDINAL",       # Cardinal numbers (e.g., "three")
    "DISEASE",        # Diseases (e.g., "COVID-19")
    "CONTACT",        # Contact info (e.g., email or phone number)
    "ADAGE",          # Common sayings or adages
    "QUANTITY",       # Quantities (e.g., "5 km")
    "MISCELLANEOUS",  # Miscellaneous entities not fitting other categories
    "POSITION",       # Job titles or positions (e.g., "CEO")
    "PROJECT",        # Project names (e.g., "Project Apollo")
)

# Full label inventory: index 0 is "O" (outside any entity), followed by the
# B-/I- pair of every entity type. A label's position in this list is its integer id.
label_list = ["O"] + [
    f"{prefix}-{entity}"
    for entity in _ENTITY_TYPES
    for prefix in ("B", "I")
]


In [None]:
# Initialize a data collator to handle padding and formatting for token classification
data_collator = DataCollatorForTokenClassification(tokenizer)

# Load a pre-trained model for token classification, adapted for NER tasks.
# NOTE(review): the tokenizer in this notebook is loaded from "xlm-roberta-base" while this
# call loads "xlm-roberta-large", and the recorded output of this cell mentions the
# "xlm-roberta-base" checkpoint — confirm which checkpoint is intended.
# NOTE(review): only num_labels is passed (no id2label/label2id), so the saved config
# presumably keeps generic "LABEL_<i>" names; the evaluation cell maps those names back.
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-large",               # Checkpoint to fine-tune (multilingual XLM-RoBERTa)
    num_labels=len(label_list)        # One output class per entry in label_list
)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define a function to compute evaluation metrics for the model's predictions
def compute_metrics(p):
    """Compute entity-level precision, recall and F1 for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)`` where ``predictions`` holds per-position class
            scores (argmax is taken over axis 2) and ``labels`` holds the gold label ids,
            with -100 marking positions to ignore.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"`` and ``"f1"`` as computed by
        seqeval. A per-entity classification report is printed as a side effect.
    """
    ignore_id = -100
    logits, gold_ids = p

    # Most likely class per position
    predicted_ids = np.argmax(logits, axis=2)

    # Gold tag names, skipping ignored positions
    gold_tags = [
        [label_list[tag_id] for tag_id in row if tag_id != ignore_id]
        for row in gold_ids
    ]

    # Predicted tag names at exactly the positions that have a gold label
    predicted_tags = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        predicted_tags.append(
            [
                label_list[predicted]
                for predicted, gold in zip(predicted_row, gold_row)
                if gold != ignore_id
            ]
        )

    # Detailed per-entity breakdown for the evaluation log
    print(classification_report(gold_tags, predicted_tags))

    # Precision: share of predicted entities that are correct.
    # Recall: share of gold entities that were found.
    # F1: harmonic mean of the two.
    scorers = {
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(gold_tags, predicted_tags) for name, scorer in scorers.items()}


In [None]:
# Set up training arguments for model training, defining essential training configurations.
# Evaluation and checkpointing both run once per epoch, and the checkpoint with the
# highest F1 is the one selected via load_best_model_at_end / metric_for_best_model.
training_args = TrainingArguments(
    output_dir="./results",               # Directory to save model checkpoints and final outputs
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",          # Evaluate model on the validation set at the end of each epoch
    save_strategy="epoch",                # Save model checkpoints at the end of each epoch
    learning_rate=2e-5,                   # Set a low learning rate to ensure stable training for fine-tuning
    per_device_train_batch_size=128,       # Examples per device per training batch
    per_device_eval_batch_size=128,        # Examples per device per evaluation batch
    num_train_epochs=12,                   # Upper bound on training epochs (early stopping may end training sooner)
    weight_decay=0.005,                    # Regularization term to prevent overfitting by penalizing large weights
    fp16=True,                            # Use 16-bit floating point for faster and memory-efficient training
    logging_dir='./logs',                 # Directory to store training logs
    save_total_limit=2,                   # Cap on how many checkpoints are kept on disk to save storage space
    load_best_model_at_end=True,          # Load the best model based on metrics at the end of training
    metric_for_best_model="f1",           # Use F1-score to determine the best model checkpoint
    report_to="none"                      # Disable reporting to external experiment trackers
)


In [None]:
# Initialize the Trainer class to manage the training loop with all necessary components
trainer = Trainer(
    model=model,                         # The pre-trained model to be fine-tuned
    args=training_args,                  # Training configuration parameters defined in TrainingArguments
    train_dataset=tokenized_datasets["train"],  # Tokenized training dataset
    eval_dataset=tokenized_datasets["test"],    # Tokenized validation dataset (the "test" side of the 90-10 split)
    tokenizer=tokenizer,                 # Tokenizer used for processing input text
    data_collator=data_collator,         # Data collator for padding and batching during training
    compute_metrics=compute_metrics,     # Function to calculate evaluation metrics like precision, recall, F1
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)] # Stop training early if the tracked metric (F1) does not improve for 5 consecutive evaluations (one evaluation per epoch here)
)


In [None]:
# Begin the training process and keep the summary returned by trainer.train()
training_metrics = trainer.train()

# Evaluate the model on the validation set after training.
# Because load_best_model_at_end=True, this is expected to score the best checkpoint
# rather than the last epoch (the recorded eval_f1 matches epoch 5 in the table) — TODO confirm.
eval_results = trainer.evaluate()

# Print evaluation results, including precision, recall, and F1-score
print(eval_results)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.3231,0.275503,0.775799,0.694886,0.733117
2,0.2725,0.262481,0.739266,0.7399,0.739583
3,0.2486,0.252498,0.751478,0.741152,0.74628
4,0.2368,0.249968,0.754882,0.741449,0.748105
5,0.2238,0.252187,0.76439,0.74046,0.752235
6,0.2186,0.249887,0.756352,0.741646,0.748927
7,0.2097,0.250748,0.760696,0.739438,0.749916


              precision    recall  f1-score   support

         ART       0.63      0.15      0.25      1857
        DATE       0.53      0.39      0.45       880
       EVENT       1.00      0.10      0.19        96
    FACILITY       0.71      0.64      0.67      1170
         LAW       0.56      0.59      0.57      1122
    LOCATION       0.80      0.75      0.77      9132
       MONEY       0.63      0.48      0.54       540
ORGANISATION       0.67      0.58      0.62       544
  PERCENTAGE       0.78      0.80      0.79      3591
      PERSON       0.86      0.82      0.84      7037
     PRODUCT       0.82      0.84      0.83      2808
        TIME       0.59      0.35      0.44      1569

   micro avg       0.78      0.69      0.73     30346
   macro avg       0.71      0.54      0.58     30346
weighted avg       0.76      0.69      0.72     30346

              precision    recall  f1-score   support

         ART       0.62      0.17      0.26      1857
        DATE       0.46 

              precision    recall  f1-score   support

         ART       0.54      0.20      0.29      1857
        DATE       0.52      0.47      0.50       880
       EVENT       0.69      0.35      0.47        96
    FACILITY       0.69      0.69      0.69      1170
         LAW       0.60      0.61      0.60      1122
    LOCATION       0.77      0.82      0.80      9132
       MONEY       0.61      0.57      0.59       540
ORGANISATION       0.69      0.68      0.69       544
  PERCENTAGE       0.79      0.82      0.81      3591
      PERSON       0.87      0.83      0.85      7037
     PRODUCT       0.83      0.85      0.84      2808
        TIME       0.55      0.51      0.53      1569

   micro avg       0.76      0.74      0.75     30346
   macro avg       0.68      0.62      0.64     30346
weighted avg       0.75      0.74      0.74     30346

{'eval_loss': 0.2521866261959076, 'eval_precision': 0.7643897128861069, 'eval_recall': 0.7404600276807487, 'eval_f1': 0.7522346088179

In [None]:
# Define the directory where the trained model and tokenizer will be saved
save_directory = "./XLM-RoBERTa"

# Save the trained model (the same `model` object that was handed to the Trainer)
# to the specified directory
model.save_pretrained(save_directory)

# Save the tokenizer to the same directory so both can be reloaded together with
# from_pretrained(save_directory); the returned file paths are shown as the cell output
tokenizer.save_pretrained(save_directory)


('./XLM-RoBERTa/tokenizer_config.json',
 './XLM-RoBERTa/special_tokens_map.json',
 './XLM-RoBERTa/sentencepiece.bpe.model',
 './XLM-RoBERTa/added_tokens.json',
 './XLM-RoBERTa/tokenizer.json')

In [None]:
# Reload the fine-tuned tokenizer and model from disk to verify that the saved
# artifacts are usable on their own. `pipeline` is imported in the import cell at the
# top of the notebook together with the other transformers names.
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForTokenClassification.from_pretrained(save_directory)

# Initialize the NER pipeline on the first GPU when CUDA is available, otherwise on CPU (-1).
# aggregation_strategy="simple" makes the pipeline return grouped entities
# (with an "entity_group" field) instead of one prediction per sub-token.
device = 0 if torch.cuda.is_available() else -1
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=device)


In [None]:
# Translate the generic class names reported by the pipeline ("LABEL_<id>") back to the
# entries of label_list. "O" is left out on purpose so non-entity spans are not mapped.
label_mapping = {
    f"LABEL_{label_id}": label_name
    for label_id, label_name in enumerate(label_list)
    if label_name != "O"
}

def evaluate_model(test_texts, true_labels):
    """Run the NER pipeline on raw texts and print seqeval metrics.

    Predicted entity groups are compared position-by-position with the expected
    labels of each text. When the counts differ, a warning is printed and the
    prediction list is truncated (too many entities) or padded with "O" (too few),
    so the metric functions always receive equal-length sequences.

    NOTE(review): this compares entities by their order of appearance, not by token
    position, so a single spurious or missed entity shifts every later comparison —
    treat the numbers as a rough smoke test.

    Args:
        test_texts: List of input strings.
        true_labels: One list of expected labels (entries of ``label_list``) per text.

    Returns:
        None. Precision, recall, F1 and the classification report are printed.
    """
    predictions = []
    for i, text in enumerate(test_texts):
        expected = true_labels[i]
        # Keep only entity groups that map to a real (non-"O") label.
        pred_labels = [
            label_mapping[entity["entity_group"]]
            for entity in nlp_ner(text)
            if entity["entity_group"] in label_mapping
        ]
        if len(pred_labels) != len(expected):
            print(f"Warning: Inconsistent number of entities in sample {i+1}. Adjusting predicted entities.")
            # Truncation alone cannot fix a prediction list that is too short, which
            # previously aborted the whole evaluation; pad the remainder with "O".
            pred_labels = pred_labels[:len(expected)]
            pred_labels += ["O"] * (len(expected) - len(pred_labels))
        predictions.append(pred_labels)

    # Every sample now has matching lengths; the only remaining mismatch is a
    # different number of texts and label lists.
    if len(predictions) != len(true_labels):
        print("Error: Could not align all samples correctly for evaluation.")
        return

    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print(classification_report(true_labels, predictions))


In [None]:
# Smoke-test the evaluation helper on one hand-labelled Azerbaijani sentence.
# true_labels holds, per text, the expected entity labels in order of appearance.
test_texts = ["Shahla Khuduyeva və Pasha Sığorta şirkəti haqqında məlumat."]
true_labels = [["B-PERSON", "B-ORGANISATION"]]
evaluate_model(test_texts, true_labels)


Precision: 0.5
Recall: 0.5
F1-Score: 0.5
              precision    recall  f1-score   support

    LOCATION       0.00      0.00      0.00         0
ORGANISATION       0.00      0.00      0.00         1
      PERSON       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.33      0.33      0.33         2
weighted avg       0.50      0.50      0.50         2

