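"""Evaluate HooshvareLab/albert-fa-zwnj-base-v2-ner on the test split of the
HaniehPoostchi/persian_ner dataset and write entity-level precision, recall
and F1 to result.csv."""
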
import pandas as pd
from seqeval.metrics import f1_score, precision_score, recall_score
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

# Load the tokenizer explicitly so pipeline token indices can be mapped back
# to word positions via token_to_word() when building word-level predictions.
tokenizer = AutoTokenizer.from_pretrained(
    "HooshvareLab/albert-fa-zwnj-base-v2-ner")

# Test split of the Persian NER dataset; its loading script requires trust_remote_code.
dataset = load_dataset('HaniehPoostchi/persian_ner', split='test', trust_remote_code=True)

# Integer class ids used by the dataset, mapped to their IOB tag strings.
num_to_tags = {0: 'O',
               1: 'I-EVE', 2: 'I-FAC', 3: 'I-LOC', 4: 'I-ORG', 5: 'I-PER', 6: 'I-PRO',
               7: 'B-EVE', 8: 'B-FAC', 9: 'B-LOC', 10: 'B-ORG', 11: 'B-PER', 12: 'B-PRO'}

def add_text(examples):
    # Rebuild each raw sentence by joining its tokens with spaces,
    # since the pipeline expects plain strings as input.
    return {'text': [' '.join(tokens) for tokens in examples['tokens']]}

dataset = dataset.map(add_text, batched=True)
# Evaluate on a fixed random sample of 100 test sentences.
dataset = dataset.shuffle(seed=42).select(range(100))

# Token-classification pipeline; with the default aggregation strategy it
# returns one prediction per sub-token, which is aligned back to words below.
pipe = pipeline("token-classification",
                model="HooshvareLab/albert-fa-zwnj-base-v2-ner")


def predict(example):
    # Tokenize with the same tokenizer the pipeline uses so that the token
    # indices returned by the pipeline line up with this encoding.
    tokenized = tokenizer(example['text'])

    # Count pre-tokenized words; this should equal the number of dataset tokens,
    # since the text was built by joining them with single spaces.
    words = set(tokenized.word_ids())
    words.remove(None)  # drop the word id of special tokens
    words_num = len(words)

    result = pipe(example['text'])

    # Default every word to 'O', then assign each word the label of its first
    # predicted sub-token. The model may also emit entity types that do not
    # occur in this dataset; seqeval scores such spans as plain errors.
    predictions = ['O'] * words_num
    for entity in result:
        word_id = tokenized.token_to_word(entity['index'])
        if predictions[word_id] == 'O':
            predictions[word_id] = entity['entity']
    return {'predictions': predictions}

dataset = dataset.map(predict)

# Convert the gold integer tags to strings so both label lists use IOB tags.
true_labels = [[num_to_tags[tag] for tag in example] for example in dataset['ner_tags']]
predicted_labels = dataset['predictions']

# Entity-level (seqeval) scores; by default these are micro-averaged.
result = {'model': "HooshvareLab/albert-fa-zwnj-base-v2-ner",
          'evaluation_dataset': 'HaniehPoostchi/persian_ner',
          'Recall': recall_score(true_labels, predicted_labels),
          'Precision': precision_score(true_labels, predicted_labels),
          'F1': f1_score(true_labels, predicted_labels)}

result = pd.DataFrame([result])

result.to_csv('result.csv', index=False)
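
# Optional: a per-entity-type breakdown could be printed with seqeval's
# classification_report, e.g.:
#   from seqeval.metrics import classification_report
#   print(classification_report(true_labels, predicted_labels))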