# albert-fa-zwnj-base-v2 / evaluate.py
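"""Evaluate HooshvareLab/albert-fa-zwnj-base-v2 on a Persian fill-mask task.

One random token is masked in each sampled summary from the farsi_news
dataset; the model's top-k fill-mask predictions are then compared with the
original token via fastText cosine similarity. A prediction above the
similarity threshold counts as correct, and precision, recall, and F1 are
written to result.csv.
"""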
import numpy as np
import fasttext
import fasttext.util
import pandas as pd
import random
import normalizer
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline
from sklearn.metrics.pairwise import cosine_similarity
from datasets import Dataset, load_dataset
random.seed(42)
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2")
# model = AutoModelForMaskedLM.from_pretrained("HooshvareLab/albert-fa-zwnj-base-v2")
# Load pre-trained word embeddings (fastText)
fasttext.util.download_model('fa', if_exists='ignore')  # Persian; skipped if the file already exists
embeddings = fasttext.load_model(r'cc.fa.300.bin')
# Example sentences with masked tokens
# masked_sentences = [
# ("The capital of France is [MASK].", "Paris"),
# ("The [MASK] is the largest mammal.", "whale"),
# ("The fastest land animal is the [MASK].", "cheetah")
# ]
# df = pd.read_excel('law_excel.xlsx', sheet_name='Sheet1')
# dataset = Dataset.from_pandas(df)
dataset = load_dataset('community-datasets/farsi_news', split='hamshahri')
dataset = dataset.shuffle(seed=42).select(range(100))
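# Mask one randomly chosen non-special token per summary, keeping the original
# token string so it can be compared against the model's predictions later.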
def tokenize_dataset(examples):
    result = tokenizer(examples['summary'])
    temp = {'masked_token': [-1] * len(result['input_ids']), 'input_ids': result['input_ids']}
    for i, example in enumerate(result['input_ids']):
        # Pick a random position, excluding the special tokens at both ends.
        rand = random.randint(1, len(example) - 2)
        temp['masked_token'][i] = tokenizer.decode(example[rand])
        # Replace the chosen token with the mask token (instead of a hard-coded id).
        temp['input_ids'][i][rand] = tokenizer.mask_token_id
    result['input_ids'] = temp['input_ids']
    result['masked_token'] = temp['masked_token']
    return result
dataset = dataset.map(tokenize_dataset, batched=True)
# Initialize the fill-mask pipeline
fill_mask = pipeline("fill-mask", model="HooshvareLab/albert-fa-zwnj-base-v2")
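# Each fill_mask call returns a list of candidate dicts that include, among
# other fields, 'token_str' (the predicted token) and 'score'.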
# Define k for top-k predictions
k = 5
# Define similarity threshold
similarity_threshold = 0.5
# Initialize counters:
#   TPP / FPP count individual predictions accepted / rejected by the
#   similarity check; TPR / FNR count examples with at least one accepted
#   prediction versus examples with none.
TPP = 0
FPP = 0
FNR = 0
TPR = 0
def get_embedding(word):
    # Return the fastText vector for a word, or None if it cannot be looked up.
    try:
        return embeddings[word]
    except KeyError:
        return None
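# Evaluation loop: rebuild each masked sentence, ask the model for its top-k
# fill-mask candidates, and score them against the ground-truth token using
# fastText cosine similarity.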
for data in dataset.iter(1):
    # Decode the masked sequence back to text, dropping the leading [CLS]
    # and trailing [SEP] tokens, then normalize it.
    sentence = tokenizer.decode(data['input_ids'][0][1:-1])
    sentence = normalizer.cleaning(sentence)
    ground_truth = data['masked_token'][0]

    # Get top-k predictions for the [MASK] position
    predictions = fill_mask(sentence, top_k=k)
    predicted_tokens = [pred['token_str'] for pred in predictions]

    ground_truth_emb = get_embedding(ground_truth)
    if ground_truth_emb is None:
        continue  # Skip if the ground truth is not in the embeddings

    flag = False
    for token in predicted_tokens:
        token_emb = get_embedding(token)
        if token_emb is not None:
            similarity = cosine_similarity([ground_truth_emb], [token_emb])[0][0]
            if similarity >= similarity_threshold:
                TPP += 1
                flag = True
            else:
                FPP += 1
    # An example counts as recalled if at least one prediction was accepted.
    if flag:
        TPR += 1
    else:
        FNR += 1
# Compute precision and recall
precision = TPP / (TPP + FPP) if (TPP + FPP) > 0 else 0
recall = TPR / (TPR + FNR) if (TPR + FNR) > 0 else 0
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
result = {'model': "HooshvareLab/albert-fa-zwnj-base-v2",
          'evaluation_dataset': 'community-datasets/farsi_news',
          'Recall': recall,
          'Precision': precision,
          'F1': (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0}
result = pd.DataFrame([result])
result.to_csv('result.csv', index=False)