"""Evaluate HooshvareLab/albert-fa-zwnj-base-v2-ner on the HaniehPoostchi/persian_ner
test split with seqeval (entity-level P/R/F1) and write the scores to result.csv."""
import pandas as pd
from seqeval.metrics import f1_score, precision_score, recall_score
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

MODEL_NAME = "HooshvareLab/albert-fa-zwnj-base-v2-ner"
DATASET_NAME = "HaniehPoostchi/persian_ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
dataset = load_dataset(DATASET_NAME, split='test', trust_remote_code=True)

# Integer class ids used by the dataset's `ner_tags` column, mapped to the IOB2
# tag strings seqeval expects.
num_to_tags = {0: 'O',
               1: 'I-EVE', 2: 'I-FAC', 3: 'I-LOC', 4: 'I-ORG', 5: 'I-PER', 6: 'I-PRO',
               7: 'B-EVE', 8: 'B-FAC', 9: 'B-LOC', 10: 'B-ORG', 11: 'B-PER', 12: 'B-PRO'}


def add_text(examples):
    """Batched map fn: join each example's token list into one space-separated string.

    Args:
        examples: batch dict with a 'tokens' column (list of word lists).
    Returns:
        dict with a new 'text' column, one sentence string per example.
    """
    return {'text': [' '.join(tokens) for tokens in examples['tokens']]}


dataset = dataset.map(add_text, batched=True)
# Evaluate on a fixed random sample of 100 test sentences for speed.
dataset = dataset.shuffle(seed=42).select(range(100))

pipe = pipeline("token-classification", model=MODEL_NAME)


def predict(example):
    """Run the NER pipeline on one example and return word-level predictions.

    The pipeline emits subword-token-level entities; each entity's token index
    is mapped back to its word index (via the tokenizer's word alignment) so
    that predictions line up with the dataset's word-level gold tags. The first
    non-'O' tag seen for a word wins.

    Args:
        example: dataset row with a 'text' field.
    Returns:
        dict with 'predictions': a list of tag strings, one per word.
    """
    tokenized = tokenizer(example['text'])
    word_ids = set(tokenized.word_ids())
    # Special tokens ([CLS]/[SEP]) map to no word; drop the None marker safely
    # (discard, unlike remove, does not raise when None is absent).
    word_ids.discard(None)
    num_words = len(word_ids)

    entities = pipe(example['text'])
    predictions = ['O'] * num_words
    for entity in entities:
        # NOTE(review): assumes the pipeline's token 'index' aligns with this
        # tokenizer's token positions (same tokenizer, same text) — it does for
        # a fast tokenizer, but verify if the pipeline config changes.
        word_id = tokenized.token_to_word(entity['index'])
        # Guard: special or truncated tokens can map to None / out-of-range ids.
        if word_id is None or word_id >= num_words:
            continue
        if predictions[word_id] == 'O':
            predictions[word_id] = entity['entity']
    return {'predictions': predictions}


dataset = dataset.map(predict)

# Gold tags come as integer ids; convert to IOB strings for seqeval.
true_labels = [[num_to_tags[tag] for tag in example] for example in dataset['ner_tags']]
predicted_labels = dataset['predictions']

result = {'model': MODEL_NAME,
          'evaluation_dataset': DATASET_NAME,
          'Recall': recall_score(true_labels, predicted_labels),
          'Precision': precision_score(true_labels, predicted_labels),
          'F1': f1_score(true_labels, predicted_labels)}
result = pd.DataFrame([result])
result.to_csv('result.csv', index=False)