Spaces:

fuhsiao
/

Ext-Abs-StructuredSum

Runtime error

App Files Files Community

Ext-Abs-StructuredSum / utils /methods.py

fuhsiao

update

ea748ba almost 2 years ago

raw

history blame contribute delete

3.79 kB

	import nltk
	import pickle
	import numpy as np
	import pandas as pd
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



	class TrigramBlock:
	    """Remember the word trigrams seen so far and flag texts that repeat one.

	    Each call to ``check_overlap`` compares the trigrams of the given text
	    with every trigram recorded by earlier calls on the same instance, then
	    records the new ones.
	    """

	    def __init__(self):
	        # Trigrams (space-joined strings) collected by previous checks.
	        self.trigrams = set()

	    def check_overlap(self, text):
	        """Return True if ``text`` shares a trigram with previously seen text.

	        The trigrams of ``text`` are added to the seen set whether or not an
	        overlap was found.
	        """
	        tokens = self._preprocess(text)
	        trigrams = set(self._get_trigrams(tokens))
	        overlap = bool(self.trigrams & trigrams)
	        # In-place union; the source had this operator garbled as ``\|=``.
	        self.trigrams |= trigrams
	        return overlap

	    def _preprocess(self, text):
	        """Lowercase ``text``, drop non-letter/non-space characters, tokenize."""
	        text = text.lower()
	        text = ''.join(c for c in text if c.isalpha() or c.isspace())
	        return nltk.word_tokenize(text)

	    def _get_trigrams(self, tokens):
	        """Return the space-joined consecutive token triples of ``tokens``.

	        Fewer than three tokens yield an empty list.
	        """
	        return [' '.join(tri) for tri in zip(tokens, tokens[1:], tokens[2:])]



	def convert_sentence_df(sentJson, pred, true_proba, set_trigram_blocking):
	    """Build a per-sentence DataFrame carrying prediction labels and scores.

	    Args:
	        sentJson: Mapping whose ``'body'`` entry maps each of the section
	            keys ``'I'``, ``'M'``, ``'R'``, ``'D'`` to an iterable of dicts
	            with a ``'text'`` string.
	        pred: Array with one label per sentence, in the row order built here
	            (all ``'I'`` sentences, then ``'M'``, ``'R'``, ``'D'``).
	        true_proba: Array with one score per sentence, same order as ``pred``.
	        set_trigram_blocking: When truthy, un-select any selected sentence
	            that shares a trigram with a higher-scored selected sentence of
	            the same section.

	    Returns:
	        DataFrame with columns ``section`` (category), ``text`` (string),
	        ``predict`` (bool) and ``proba`` (float16).
	    """
	    rows = [
	        (section, sent['text'].strip())
	        for section in 'IMRD'
	        for sent in sentJson['body'][section]
	    ]
	    body = pd.DataFrame(rows, columns=['section', 'text']).astype(
	        {'section': 'category', 'text': 'string'}
	    )
	    # Attach the predicted labels and probabilities.
	    body['predict'] = pred.astype('bool')
	    body['proba'] = true_proba.astype('float16')
	    # Apply trigram blocking to the extracted sentences of each section.
	    if set_trigram_blocking:
	        for section in 'IMRD':
	            block = TrigramBlock()
	            selected = body.loc[(body['section'] == section) & body['predict']]
	            # Visit highest score first so that, of two overlapping
	            # sentences, the higher-scored one is the one kept.
	            ranked = selected.sort_values(by='proba', ascending=False)
	            for i, row in ranked.iterrows():
	                if block.check_overlap(row['text']):
	                    body.at[i, 'predict'] = False
	    return body

	# Extractive method
	def extractive_method(sentJson, sentFeat, model, threshold=0.5, TGB=False):
	    """Select summary sentences per section and join them into one string each.

	    Args:
	        sentJson: Sentence data passed through to ``convert_sentence_df``.
	        sentFeat: DataFrame with a ``'section'`` column; each section group
	            is passed as-is to ``model.predict_proba``.
	        model: Object whose ``predict_proba`` returns a 2-D array; column 1
	            is used as the score of a sentence being a summary sentence.
	        threshold: A sentence is selected when its score is greater than this.
	        TGB: Forwarded to ``convert_sentence_df`` as the trigram-blocking flag.

	    Returns:
	        Dict mapping each of ``'I'``, ``'M'``, ``'R'``, ``'D'`` to the selected
	        sentences joined by single spaces; ``''`` when none were selected.

	    NOTE(review): per-section results are concatenated in ``groupby`` key
	    order, which is assumed to match the I, M, R, D row order that
	    ``convert_sentence_df`` builds -- confirm against how ``sentFeat`` encodes
	    its ``section`` column.
	    """

	    # Predict one section.
	    def predict(x):
	        true_proba = model.predict_proba(x)[:, 1]
	        # If no sentence scores above the threshold, promote the
	        # highest-scoring sentence(s) so the section still gets a summary.
	        if not np.any(true_proba > threshold):
	            true_proba[true_proba == np.max(true_proba)] = 1
	        pred = (true_proba > threshold).astype('int')
	        return pred, true_proba

	    pred_parts = []
	    proba_parts = []
	    for _, group_data in sentFeat.groupby('section'):
	        pred_sec, true_proba_sec = predict(group_data)
	        pred_parts.append(pred_sec)
	        proba_parts.append(true_proba_sec)

	    # Seeding with an empty float array gives the same float64 result as
	    # repeated np.append, including when there are no groups at all.
	    pred = np.concatenate([np.array([]), *pred_parts])
	    true_proba = np.concatenate([np.array([]), *proba_parts])

	    body = convert_sentence_df(sentJson, pred, true_proba, TGB)
	    res = body[body['predict']]
	    # Boolean filtering (rather than groupby(...).get_group) yields '' for a
	    # section with no selected sentence instead of raising KeyError.
	    ext = {sec: ' '.join(res.loc[res['section'] == sec, 'text']) for sec in 'IMRD'}
	    return ext

	def abstractive_method(ext, tokenizer, model, device='cpu'):
	    """Generate an abstractive summary for each section's extracted text.

	    Args:
	        ext: Mapping with keys ``'I'``, ``'M'``, ``'R'``, ``'D'`` holding the
	            text to summarize for each section.
	        tokenizer: Callable returning an object with ``input_ids`` and
	            offering ``decode``; called with ``truncation=True`` and
	            ``return_tensors='pt'``.
	        model: Object with a ``generate`` method taking the input ids.
	        device: Device the input ids are moved to before generation.

	    Returns:
	        Dict mapping each section key to its generated text; a section whose
	        input text is empty or whitespace-only maps to ``''``.
	    """
	    abstr = {key: '' for key in 'IMRD'}
	    for section in 'IMRD':
	        text = ext[section]
	        # Nothing to summarize: skip generation rather than let the model
	        # produce text from an empty input.
	        if isinstance(text, str) and not text.strip():
	            continue
	        input_ids = tokenizer(text, truncation=True, return_tensors='pt').input_ids
	        outputs = model.generate(input_ids.to(device))
	        abstr[section] = tokenizer.decode(outputs[0], skip_special_tokens=True)
	    return abstr

	# extractive summarizer
	def load_ExtModel(path):
	    """Load and return the pickled extractive model stored at ``path``.

	    Raises whatever ``open`` or ``pickle.load`` raise for a missing or
	    invalid file.
	    """
	    # NOTE: unpickling can execute arbitrary code; only load trusted files.
	    # The context manager closes the file handle, which the bare open() did not.
	    with open(path, 'rb') as model_file:
	        return pickle.load(model_file)

	# abstractive summarizer
	def load_AbstrModel(path, device='cpu'):
	    """Load the seq2seq tokenizer and model from ``path`` and set generation options.

	    Args:
	        path: Checkpoint name or directory passed to ``from_pretrained``.
	        device: Device the model is moved to.

	    Returns:
	        Tuple ``(tokenizer, model)``.
	    """
	    model_checkpoint = path
	    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=1024)
	    abstrModel = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
	    abstrModel = abstrModel.to(device)

	    generation_config = {
	        'num_beams': 5,
	        'max_length': 512,
	        'min_length': 64,
	        'length_penalty': 2.0,
	        'early_stopping': True,
	        'no_repeat_ngram_size': 3,
	    }

	    abstrModel.config.update(generation_config)
	    # NOTE(review): newer transformers releases read generation settings from
	    # ``model.generation_config``; mirror the values there when the attribute
	    # exists so they apply either way -- confirm against the pinned version.
	    gen_cfg = getattr(abstrModel, 'generation_config', None)
	    if gen_cfg is not None:
	        gen_cfg.update(**generation_config)
	    return tokenizer, abstrModel