import nltk
import pickle
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
class TrigramBlock:
    """Tracks trigrams seen so far to filter near-duplicate sentences."""
    def __init__(self):
        self.trigrams = set()

    def check_overlap(self, text):
        # True if `text` shares any trigram with previously checked texts;
        # its own trigrams are recorded either way.
        tokens = self._preprocess(text)
        trigrams = set(self._get_trigrams(tokens))
        overlap = bool(self.trigrams & trigrams)
        self.trigrams |= trigrams
        return overlap

    def _preprocess(self, text):
        # Lowercase, keep only letters and whitespace, then tokenize.
        # nltk.word_tokenize requires the 'punkt' data (nltk.download('punkt')).
        text = text.lower()
        text = ''.join([c for c in text if c.isalpha() or c.isspace()])
        tokens = nltk.word_tokenize(text)
        return tokens

    def _get_trigrams(self, tokens):
        trigrams = [' '.join(tokens[i:i+3]) for i in range(len(tokens) - 2)]
        return trigrams
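# Illustrative sketch (not called by the pipeline): check_overlap returns False
# the first time a sentence's trigrams appear, then True for any later sentence
# that repeats one of them, e.g.:
#   block = TrigramBlock()
#   block.check_overlap('the cat sat on the mat')    # False, trigrams recorded
#   block.check_overlap('a cat sat on the mat too')  # True, repeats 'cat sat on'

# Expected sentJson layout, inferred from the accesses below: a dict whose
# 'body' maps each section key 'I', 'M', 'R', 'D' to a list of sentence dicts
# carrying at least a 'text' field, e.g.
#   {'body': {'I': [{'text': '...'}, ...], 'M': [...], 'R': [...], 'D': [...]}}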
def convert_sentence_df(sentJson, pred, true_proba, set_trigram_blocking):
    body = pd.DataFrame([(section, sent['text'].strip()) for section in 'IMRD' for sent in sentJson['body'][section]],
                        columns=['section', 'text']).astype({'section': 'category', 'text': 'string'})
    # Attach the prediction results and probabilities
    body['predict'] = pred.astype('bool')
    body['proba'] = true_proba.astype('float16')
    # Apply trigram blocking to the extracted sentences of each section
    if set_trigram_blocking:
        for section in 'IMRD':
            block = TrigramBlock()
            # Walk the selected sentences from most to least confident and
            # deselect any that repeats a trigram of an already kept sentence
            temp = body.loc[(body['section'] == section) & (body['predict'] == True)].sort_values(by='proba', ascending=False)
            for i, row in temp.iterrows():
                if block.check_overlap(row['text']):
                    body.at[i, 'predict'] = False
    return body
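# The returned frame has one row per body sentence with columns:
#   section (category: I/M/R/D), text (string), predict (bool), proba (float16);
# with blocking enabled, lower-probability near-duplicates within each section
# have predict flipped back to False.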
# Extractive method
def extractive_method(sentJson, sentFeat, model, threshold=0.5, TGB=False):
    # Predict per-sentence probabilities for one section
    def predict(x):
        true_proba = model.predict_proba(x)[:, 1]
        # If no sentence's probability exceeds the threshold, force-select the
        # most probable sentence(s) by setting their probability to 1
        if not np.any(true_proba > threshold):
            true_proba[true_proba == np.max(true_proba)] = 1
        pred = (true_proba > threshold).astype('int')
        return pred, true_proba

    # Note: this relies on groupby yielding sections in the same order as the
    # sentences appear in sentJson['body'] (I, M, R, D), e.g. via an ordered
    # categorical 'section' column in sentFeat.
    grouped = sentFeat.groupby('section')
    preds, probas = [], []
    for group_name, group_data in grouped:
        pred_sec, true_proba_sec = predict(group_data)
        preds.append(pred_sec)
        probas.append(true_proba_sec)
    pred = np.concatenate(preds)
    true_proba = np.concatenate(probas)
    body = convert_sentence_df(sentJson, pred, true_proba, TGB)
    res = body[body['predict'] == True]
    ext = {i: ' '.join(res.groupby('section').get_group(i)['text']) for i in 'IMRD'}
    return ext
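# Usage sketch (hypothetical file name): returns one extractive summary string
# per section, keyed 'I', 'M', 'R', 'D'.
#   ext_model = load_ExtModel('extractive_model.pkl')
#   ext = extractive_method(sentJson, sentFeat, ext_model, threshold=0.5, TGB=True)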
def abstractive_method(ext, tokenizer, model, device='cpu'):
    abstr = {key: '' for key in 'IMRD'}
    for section in 'IMRD':
        text = ext[section]
        # Tokenize (truncated to the model's max input length) and generate
        model_inputs = tokenizer(text, truncation=True, return_tensors='pt').input_ids
        outputs = model.generate(model_inputs.to(device))
        abstr_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        abstr[section] = abstr_text
    return abstr
# Extractive summarizer: unpickle the trained sentence classifier
def load_ExtModel(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
# Abstractive summarizer: load the seq2seq checkpoint and its tokenizer
def load_AbstrModel(path, device='cpu'):
    model_checkpoint = path
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=1024)
    abstrModel = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    abstrModel = abstrModel.to(device)
    # Default generation settings applied to model.generate()
    generation_config = {
        'num_beams': 5,
        'max_length': 512,
        'min_length': 64,
        'length_penalty': 2.0,
        'early_stopping': True,
        'no_repeat_ngram_size': 3
    }
    abstrModel.config.update(generation_config)
    return tokenizer, abstrModel
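# Minimal end-to-end sketch. The file paths and input files below are
# placeholders, not part of the pipeline; substitute whatever the Space
# actually loads for sentJson (parsed sentences) and sentFeat (feature matrix).
if __name__ == '__main__':
    import json

    extModel = load_ExtModel('models/extractive.pkl')                 # hypothetical path
    tokenizer, abstrModel = load_AbstrModel('models/abstractive', device='cpu')  # hypothetical path

    with open('sample_sentences.json') as f:                          # hypothetical input
        sentJson = json.load(f)
    sentFeat = pd.read_csv('sample_features.csv', index_col=0)        # hypothetical input

    ext = extractive_method(sentJson, sentFeat, extModel, threshold=0.5, TGB=True)
    abstr = abstractive_method(ext, tokenizer, abstrModel, device='cpu')
    for section in 'IMRD':
        print(f"{section}: {abstr[section]}")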