fuhsiao's picture
history blame
3.79 kB
import nltk
import pickle
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
class TrigramBlock:
    """Stateful trigram-blocking filter for picking non-redundant sentences.

    Feed candidate sentences to :meth:`check_overlap` one at a time, best
    candidate first. A candidate is reported as overlapping when it shares
    at least one word trigram with a sentence that was accepted earlier.
    """

    def __init__(self):
        # Word trigrams of every sentence accepted so far.
        self.trigrams = set()

    def check_overlap(self, text):
        """Return True if *text* repeats a trigram of an accepted sentence.

        Only a sentence that passes the check (return value ``False``)
        contributes its trigrams to the accepted set. A rejected sentence
        is not part of the summary, so it must not block later candidates.
        """
        tokens = self._preprocess(text)
        candidate = set(self._get_trigrams(tokens))
        if not self.trigrams.isdisjoint(candidate):
            return True
        self.trigrams |= candidate
        return False

    def _preprocess(self, text):
        """Lowercase *text*, drop non-letter/non-space characters, tokenize."""
        text = text.lower()
        text = ''.join(c for c in text if c.isalpha() or c.isspace())
        return nltk.word_tokenize(text)

    def _get_trigrams(self, tokens):
        """Return the space-joined consecutive word triples of *tokens*.

        Fewer than three tokens yields an empty list.
        """
        return [' '.join(gram) for gram in zip(tokens, tokens[1:], tokens[2:])]
def convert_sentence_df(sentJson, pred, true_proba, set_trigram_blocking):
    """Build a per-sentence table with the classifier's verdicts attached.

    Args:
        sentJson: Mapping whose ``'body'`` entry maps each section key
            (``'I'``, ``'M'``, ``'R'``, ``'D'``) to a list of sentence
            dicts, each carrying a ``'text'`` string.
        pred: NumPy array with one entry per sentence, in the row order of
            the table (sections in I, M, R, D order); cast to bool.
        true_proba: NumPy array of per-sentence scores in the same order;
            cast to float16.
        set_trigram_blocking: When truthy, apply trigram blocking to the
            predicted sentences of each section.

    Returns:
        DataFrame with columns ``section`` (category), ``text`` (string),
        ``predict`` (bool) and ``proba`` (float16).
    """
    rows = []
    for sec in 'IMRD':
        for sent in sentJson['body'][sec]:
            rows.append((sec, sent['text'].strip()))
    body = pd.DataFrame(rows, columns=['section', 'text'])
    body = body.astype({'section': 'category', 'text': 'string'})

    # Attach the predicted label and its probability.
    body['predict'] = pred.astype('bool')
    body['proba'] = true_proba.astype('float16')

    if not set_trigram_blocking:
        return body

    # Trigram blocking over the extracted sentences of each section:
    # visit them from highest to lowest probability and un-select any
    # sentence the blocker reports as overlapping.
    for sec in 'IMRD':
        blocker = TrigramBlock()
        picked = body.loc[(body['section'] == sec) & body['predict']]
        ranked = picked.sort_values(by='proba', ascending=False)
        for idx, sentence in ranked['text'].items():
            if blocker.check_overlap(sentence):
                body.at[idx, 'predict'] = False
    return body
# Extractive method
def extractive_method(sentJson, sentFeat, model, threshold=0.5, TGB=False):
    """Pick summary sentences for each section with a probabilistic classifier.

    Args:
        sentJson: Sentence data passed on to ``convert_sentence_df``.
        sentFeat: DataFrame of per-sentence features that can be grouped by
            ``'section'``. Assumes its rows are in the same sentence order
            that ``convert_sentence_df`` builds from ``sentJson`` (sections
            I, M, R, D) -- TODO confirm against the feature builder.
        model: Object exposing ``predict_proba(X)``; column 1 of the result
            is used as the probability that a sentence belongs in the summary.
        threshold: A sentence is selected when its probability exceeds this.
        TGB: When truthy, apply trigram blocking to the selected sentences.

    Returns:
        Dict mapping each of ``'I'``, ``'M'``, ``'R'``, ``'D'`` to the
        selected sentences joined by single spaces, or ``''`` when the
        section has no selected sentence.
    """
    def predict(x):
        proba = model.predict_proba(x)[:, 1]
        chosen = proba > threshold
        # If no sentence clears the threshold, fall back to the sentence(s)
        # with the highest probability so the section is never left empty.
        if not chosen.any():
            chosen = proba == np.max(proba)
            # Keep the fallback picks ranked first for trigram blocking.
            proba[chosen] = 1
        return chosen, proba

    n_rows = len(sentFeat)
    pred = np.zeros(n_rows, dtype=bool)
    true_proba = np.zeros(n_rows, dtype=float)
    # Write each section's results back at the row positions they came
    # from, so the output follows sentFeat's row order instead of the
    # (sorted) order in which groupby happens to iterate the section keys.
    for positions in sentFeat.groupby('section').indices.values():
        if len(positions) == 0:
            # Unobserved categorical level: nothing to score.
            continue
        pred[positions], true_proba[positions] = predict(sentFeat.iloc[positions])

    body = convert_sentence_df(sentJson, pred, true_proba, TGB)
    selected = body[body['predict']]
    # A boolean mask yields '' for a section with no selected sentence,
    # where groupby(...).get_group() would raise KeyError.
    return {
        sec: ' '.join(selected.loc[selected['section'] == sec, 'text'])
        for sec in 'IMRD'
    }
def abstractive_method(ext, tokenizer, model, device='cpu'):
    """Rewrite each section's extracted text with a seq2seq model.

    Args:
        ext: Mapping from each section key (``'I'``, ``'M'``, ``'R'``,
            ``'D'``) to the text to rewrite.
        tokenizer: Callable tokenizer returning an object with ``input_ids``
            and offering ``decode``.
        model: Model offering ``generate``.
        device: Device the input ids are moved to before generation.

    Returns:
        Dict mapping each section key to the decoded first generated sequence.
    """
    def rewrite(text):
        # Tokenize with truncation, generate, then decode the first output.
        input_ids = tokenizer(text, truncation=True, return_tensors='pt').input_ids
        generated = model.generate(input_ids.to(device))
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return {section: rewrite(ext[section]) for section in 'IMRD'}
# extractive summarizer
def load_ExtModel(path):
    """Load and return the pickled extractive model stored at *path*.

    NOTE: unpickling can execute arbitrary code contained in the file, so
    only load model files that come from a trusted source.
    """
    # The context manager closes the file handle even if unpickling fails.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)
# abstractive summarizer
def load_AbstrModel(path, device='cpu'):
    """Load the seq2seq tokenizer and model from *path*.

    Args:
        path: Checkpoint identifier or directory handed to
            ``from_pretrained`` for both the tokenizer and the model.
        device: Device the model is moved to.

    Returns:
        ``(tokenizer, model)``. The model's default generation settings are
        set to the beam-search configuration below.
    """
    tokenizer = AutoTokenizer.from_pretrained(path, model_max_length=1024)
    abstrModel = AutoModelForSeq2SeqLM.from_pretrained(path)
    abstrModel = abstrModel.to(device)
    generation_config = {
        'num_beams': 5,
        'max_length': 512,
        'min_length': 64,
        'length_penalty': 2.0,
        'early_stopping': True,
        'no_repeat_ngram_size': 3,
    }
    # Install the settings as the model's generation defaults so that a
    # bare ``abstrModel.generate(input_ids)`` call picks them up.
    # NOTE(review): assumes a transformers version whose models expose
    # ``generation_config`` -- confirm against the pinned version.
    abstrModel.generation_config.update(**generation_config)
    return tokenizer, abstrModel