Spaces:

fuhsiao
/

Ext-Abs-StructuredSum

Runtime error

File size: 3,871 Bytes

import nltk
import pickle
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



class TrigramBlock:
    """Stateful trigram-blocking filter for greedy sentence selection.

    Feed candidate sentences to :meth:`check_overlap` in priority order.
    A candidate that shares at least one word trigram with the sentences
    accepted so far is reported as overlapping; otherwise it is accepted
    and its trigrams are remembered.
    """

    def __init__(self):
        # Trigrams of every sentence accepted (i.e. not blocked) so far.
        self.trigrams = set()

    def check_overlap(self, text):
        """Return True if *text* repeats a trigram of an accepted sentence.

        Args:
            text: Candidate sentence as a plain string.

        Returns:
            True when the candidate overlaps and should be blocked; False
            when it is accepted, in which case its trigrams are recorded.
        """
        candidate = set(self._get_trigrams(self._preprocess(text)))
        return self._check_trigrams(candidate)

    def _check_trigrams(self, candidate):
        """Apply the blocking rule to an already-extracted trigram set."""
        if not self.trigrams.isdisjoint(candidate):
            # A blocked sentence never enters the summary, so its trigrams
            # must not be recorded: doing so would block later sentences
            # because of text that was itself discarded.
            return True
        self.trigrams |= candidate
        return False

    def _preprocess(self, text):
        """Lower-case *text*, drop non-letter characters, and tokenize it."""
        text = text.lower()
        # Keep only letters and whitespace; digits and punctuation are removed.
        text = ''.join(c for c in text if c.isalpha() or c.isspace())
        return nltk.word_tokenize(text)

    def _get_trigrams(self, tokens):
        """Return the space-joined word trigrams of *tokens*, in order.

        Fewer than three tokens yield an empty list.
        """
        return [' '.join(tokens[i:i + 3]) for i in range(len(tokens) - 2)]



def convert_sentence_df(sentJson, pred, true_proba, set_trigram_blocking):
    """Build a per-sentence table with predictions, optionally de-duplicated.

    Args:
        sentJson: Mapping whose ``['body'][section]`` entry, for each section
            letter in ``'IMRD'``, is a sequence of dicts with a ``'text'`` key.
        pred: Array-like with an ``astype`` method; one prediction per
            sentence, in I, M, R, D order.
        true_proba: Array-like with an ``astype`` method; one probability per
            sentence, aligned with ``pred``.
        set_trigram_blocking: When truthy, apply trigram blocking within each
            section, visiting predicted sentences from highest to lowest
            probability and un-selecting the ones that overlap.

    Returns:
        DataFrame with columns ``section`` (category), ``text`` (string),
        ``predict`` (bool) and ``proba`` (float16).
    """
    rows = []
    for sec in 'IMRD':
        for sentence in sentJson['body'][sec]:
            rows.append((sec, sentence['text'].strip()))

    body = pd.DataFrame(rows, columns=['section', 'text'])
    body = body.astype({'section': 'category', 'text': 'string'})

    # Attach the classifier output to each sentence.
    body['predict'] = pred.astype('bool')
    body['proba'] = true_proba.astype('float16')

    if not set_trigram_blocking:
        return body

    # Trigram blocking is run independently for every section.
    for sec in 'IMRD':
        blocker = TrigramBlock()
        chosen = body[(body['section'] == sec) & body['predict']]
        ranked = chosen.sort_values(by='proba', ascending=False)
        for idx, sentence_text in ranked['text'].items():
            if blocker.check_overlap(sentence_text):
                body.at[idx, 'predict'] = False
    return body

# Extractive method
def extractive_method(sentJson, sentFeat, model, threshold=0.5, TGB=False):
    """Pick the summary sentences of each IMRD section with a classifier.

    Args:
        sentJson: Mapping whose ``['body'][section]`` entry, for each section
            letter in ``'IMRD'``, is a sequence of dicts with a ``'text'`` key.
        sentFeat: Feature table passed unchanged to ``model.predict_proba``.
            Its rows must be in the same I, M, R, D sentence order as
            ``sentJson`` (predictions are matched to sentences by position).
        model: Object exposing ``predict_proba``; column 1 of its result is
            used as the probability that a sentence belongs in the summary.
        threshold: A sentence is selected when its probability is strictly
            greater than this value.
        TGB: When truthy, trigram blocking is applied inside each section.

    Returns:
        Dict mapping each of ``'I'``, ``'M'``, ``'R'``, ``'D'`` to the selected
        sentences joined by single spaces (``''`` for a section that has no
        sentences at all).

    Raises:
        ValueError: If the number of predictions differs from the number of
            sentences in ``sentJson``.
    """
    # A single model call covers the whole document.
    true_proba = np.asarray(model.predict_proba(sentFeat))[:, 1]

    # Section label of every sentence, in the row order used by
    # convert_sentence_df.
    sections = np.array(
        [section for section in 'IMRD' for _ in sentJson['body'][section]],
        dtype=object,
    )
    if len(sections) != len(true_proba):
        raise ValueError(
            'sentFeat yields %d predictions but sentJson contains %d sentences'
            % (len(true_proba), len(sections))
        )

    # If no sentence of a section exceeds the threshold, promote that
    # section's most probable sentence(s) so every section contributes to the
    # summary. The fallback is applied per section: a document-wide fallback
    # can leave whole sections without any selected sentence.
    for section in 'IMRD':
        in_section = sections == section
        if not in_section.any():
            continue
        section_proba = true_proba[in_section]
        if not np.any(section_proba > threshold):
            true_proba[in_section & (true_proba == section_proba.max())] = 1

    pred = (true_proba > threshold).astype('int')
    body = convert_sentence_df(sentJson, pred, true_proba, TGB)
    selected = body[body['predict']]
    # Boolean filtering (rather than groupby/get_group) yields an empty join
    # instead of a KeyError for a section with nothing selected.
    return {
        section: ' '.join(selected.loc[selected['section'] == section, 'text'])
        for section in 'IMRD'
    }

def abstractive_method(ext, tokenizer, model, device='cpu'):
    """Rewrite each section's extractive summary with a seq2seq model.

    Args:
        ext: Dict mapping each of ``'I'``, ``'M'``, ``'R'``, ``'D'`` to the
            extracted text of that section.
        tokenizer: Callable tokenizer returning an object with ``input_ids``
            (called with ``truncation=True, return_tensors='pt'``) and
            providing ``decode``.
        model: Model exposing ``generate``; it is called with the input ids
            only, so its own generation defaults apply.
        device: Device the input ids are moved to before generation.

    Returns:
        Dict with the same four section keys mapped to the generated text.
        Sections whose extracted text is empty or whitespace-only map to
        ``''`` without invoking the model.
    """
    abstr = {}
    for section in 'IMRD':
        text = ext[section]
        if not text.strip():
            # Nothing to summarize: generating from an empty input would
            # only produce text unrelated to the document.
            abstr[section] = ''
            continue
        input_ids = tokenizer(text, truncation=True, return_tensors='pt').input_ids
        outputs = model.generate(input_ids.to(device))
        abstr[section] = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return abstr

# extractive summarizer
def load_ExtModel(path):
    """Load the pickled extractive model stored at *path*.

    Args:
        path: Filesystem path of the pickle file.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(path, 'rb') as model_file:  # closes the handle even on error
        return pickle.load(model_file)

# abstractive summarizer
def load_AbstrModel(path, device='cpu'):
    """Load the seq2seq tokenizer and model and set the generation defaults.

    Args:
        path: Checkpoint identifier or directory handed to ``from_pretrained``.
        device: Device the model is moved to.

    Returns:
        Tuple ``(tokenizer, model)``. The tokenizer is created with
        ``model_max_length=1024``; the model carries the beam-search defaults
        defined below.
    """
    tokenizer = AutoTokenizer.from_pretrained(path, model_max_length=1024)
    abstrModel = AutoModelForSeq2SeqLM.from_pretrained(path)
    abstrModel = abstrModel.to(device)

    generation_config = {
        'num_beams': 5,
        'max_length': 512,
        'min_length': 64,
        'length_penalty': 2.0,
        'early_stopping': True,
        'no_repeat_ngram_size': 3,
    }

    # Recent transformers releases take generate() defaults from
    # model.generation_config; values written only to model.config can be
    # ignored there. Fall back to the model config on releases that predate
    # GenerationConfig.
    model_generation_config = getattr(abstrModel, 'generation_config', None)
    if model_generation_config is not None:
        model_generation_config.update(**generation_config)
    else:
        abstrModel.config.update(generation_config)
    return tokenizer, abstrModel