|
import math
import random
import re

from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import pipeline
|
|
|
|
|
def mask_non_stopword(sentence):
    """Replace one randomly chosen non-stopword of ``sentence`` with ``[MASK]``.

    The sentence is tokenized on whitespace. Every token whose lowercase form
    is not in NLTK's English stopword list is a candidate, and one candidate
    is picked with ``random.choice``. Only that token is replaced; the rest of
    the text, including its original whitespace, is returned untouched.

    Tokens are whitespace-delimited, so punctuation attached to a word
    (``"dog."``) is masked together with it.

    Args:
        sentence: The text to mask.

    Returns:
        The sentence with one token replaced by ``[MASK]``, or the sentence
        unchanged when it contains no non-stopword token.
    """
    stop_words = set(stopwords.words('english'))

    # Keep each candidate's position so that the chosen token itself is
    # masked. ``sentence.replace(word, '[MASK]', 1)`` would hit the first
    # *substring* match instead, which can sit inside an earlier, longer word
    # ("category cat" -> "[MASK]egory cat").
    candidates = [
        token for token in re.finditer(r'\S+', sentence)
        if token.group().lower() not in stop_words
    ]
    if not candidates:
        return sentence

    chosen = random.choice(candidates)
    return sentence[:chosen.start()] + '[MASK]' + sentence[chosen.end():]
|
|
|
def mask_non_stopword_pseudorandom(sentence):
    """Deterministically replace one non-stopword of ``sentence`` with ``[MASK]``.

    Works like a random non-stopword masker, but reseeds the ``random``
    module with the fixed seed ``10`` right before picking, so the same
    sentence always produces the same masked output.

    The sentence is tokenized on whitespace; a token is a candidate when its
    lowercase form is not in NLTK's English stopword list. Only the chosen
    token is replaced and the original whitespace is preserved. Punctuation
    attached to a word is masked together with it.

    Side effect: the module-level ``random`` generator is reseeded on every
    call that reaches the selection step, which also affects any later use
    of ``random`` elsewhere in the process.

    Args:
        sentence: The text to mask.

    Returns:
        The sentence with one token replaced by ``[MASK]``, or the sentence
        unchanged when it contains no non-stopword token.
    """
    stop_words = set(stopwords.words('english'))

    # Track positions rather than bare strings: replacing by string value
    # would mask the first substring match, which may be part of another word.
    candidates = [
        token for token in re.finditer(r'\S+', sentence)
        if token.group().lower() not in stop_words
    ]
    if not candidates:
        return sentence

    # Fixed seed -> reproducible choice. Kept on the global generator so
    # callers that rely on the reseed keep seeing the same random stream.
    random.seed(10)
    chosen = random.choice(candidates)
    return sentence[:chosen.start()] + '[MASK]' + sentence[chosen.end():]
|
|
|
def high_entropy_words(sentence, non_melting_points):
    """Mask the word whose fill-mask predictions are the most uncertain.

    Each candidate token is masked in turn, the masked sentence is passed to
    the module-level ``fill_mask`` pipeline, and the Shannon entropy
    ``-sum(p * log(p))`` of the scores of the first five predictions is
    computed. The masked sentence with the highest entropy is returned; on a
    tie the earliest candidate wins.

    A token is a candidate when its lowercase form is neither an NLTK English
    stopword nor one of the words found in ``non_melting_points``. Tokens are
    whitespace-delimited, so attached punctuation is part of the token.

    Args:
        sentence: The text to mask.
            NOTE(review): presumably contains no ``[MASK]`` of its own; a
            second mask changes the pipeline's output shape - confirm.
        non_melting_points: Iterable of 2-item entries. The second item is a
            string whose whitespace-separated words must never be masked; the
            first item is ignored here.

    Returns:
        The sentence with the highest-entropy candidate replaced by
        ``[MASK]``, or the sentence unchanged when there is no candidate.
    """
    stop_words = set(stopwords.words('english'))

    protected_words = set()
    for _, point in non_melting_points:
        protected_words.update(point.lower().split())

    # Work with token positions, not bare strings, so the scored sentence and
    # the returned sentence mask exactly the candidate token. A plain
    # ``str.replace`` would mask the first substring match instead.
    candidates = [
        token for token in re.finditer(r'\S+', sentence)
        if token.group().lower() not in stop_words
        and token.group().lower() not in protected_words
    ]
    if not candidates:
        return sentence

    best_entropy = -float('inf')
    # Falls back to the untouched sentence if no entropy is comparable
    # (e.g. NaN scores) instead of failing on a missing selection.
    best_masked = sentence

    for token in candidates:
        masked_sentence = (
            sentence[:token.start()] + '[MASK]' + sentence[token.end():]
        )
        predictions = fill_mask(masked_sentence)

        # Zero-probability entries contribute nothing to the entropy and
        # would make math.log raise, so they are skipped.
        entropy = -sum(
            pred['score'] * math.log(pred['score'])
            for pred in predictions[:5]
            if pred['score'] > 0
        )

        if entropy > best_entropy:
            best_entropy = entropy
            best_masked = masked_sentence

    return best_masked
|
|
|
|
|
|
|
# Checkpoint shared by the tokenizer and the masked-language model below.
_MODEL_NAME = "bert-large-cased-whole-word-masking"

# Loaded once at import time; the functions in this module call `fill_mask`.
tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
model = AutoModelForMaskedLM.from_pretrained(_MODEL_NAME)
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
|
|
|
def mask(sentence):
    """Return the sequences the fill-mask pipeline predicts for ``sentence``.

    Args:
        sentence: Text passed as-is to the module-level ``fill_mask``
            pipeline.
            NOTE(review): presumably contains exactly one ``[MASK]`` token,
            so that the pipeline returns a flat list of predictions - confirm.

    Returns:
        A list with the ``'sequence'`` entry of every prediction, in the
        order the pipeline returned them.
    """
    predictions = fill_mask(sentence)
    return [prediction['sequence'] for prediction in predictions]
|
|
|
|