Spaces:

fuhsiao
/

Ext-Abs-StructuredSum

Runtime error

App Files Files Community

fuhsiao418 committed on Aug 3, 2023

Commit

99d8161

1 Parent(s): 9b6c439

update

Browse files

Files changed (5) hide show

app.py +5 -1
model/LGB_model_F10_S.pkl +3 -0
requirements.txt +2 -0
utils/__init__.py +1 -1
utils/methods.py +87 -0

app.py CHANGED Viewed

@@ -8,7 +8,11 @@ def main(file, ext_threshold, article_type):
         return "invalid_format"
     sentJson = convert_to_sentence_json(paper)
     sentFeat = extract_sentence_features(sentJson)
-    return 'done'

         return "invalid_format"
     sentJson = convert_to_sentence_json(paper)
     sentFeat = extract_sentence_features(sentJson)
+    ExtModel = load_ExtModel('model/LGB_model_F10_S.pkl')
+    ext = extractive_method(sentJson, sentFeat, ExtModel, TGB=False)
+    return ext

model/LGB_model_F10_S.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7c0f2b490f03417f065af6f3419b32c30f73af78f2aa9a846b1c55723d75fae3
+size 1837716

requirements.txt CHANGED Viewed

@@ -1,7 +1,9 @@
 numpy==1.23.3
 pandas==1.5.3
 torch==1.13.1
 scikit-learn==1.2.1
 sentence-transformers==2.2.2
 spacy==3.4.4
 https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

 numpy==1.23.3
 pandas==1.5.3
+nltk==3.7
 torch==1.13.1
 scikit-learn==1.2.1
+transformers==4.27.2
 sentence-transformers==2.2.2
 spacy==3.4.4
 https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

utils/__init__.py CHANGED Viewed

	@@ -1,2 +1,2 @@
1	from utils.preprocess import read_text_to_json, convert_to_sentence_json, extract_sentence_features, is_valid_format
2	-


1	from utils.preprocess import read_text_to_json, convert_to_sentence_json, extract_sentence_features, is_valid_format
2	+ from utils.methods import load_ExtModel, load_AbstrModel

utils/methods.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import nltk
+import pickle
+import numpy as np
+import pandas as pd
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
class TrigramBlock:
    """Trigram blocking: flag a text that repeats a trigram of an accepted text.

    Texts are fed one at a time to :meth:`check_overlap`.  The instance
    remembers the trigrams of every text it has accepted so far.
    """

    def __init__(self):
        # Trigrams (space-joined token triples) of all accepted texts.
        self.trigrams = set()

    def check_overlap(self, text):
        """Return True if *text* shares a trigram with an accepted text.

        A non-overlapping text counts as accepted and its trigrams are
        recorded.  An overlapping text counts as rejected and its trigrams
        are NOT recorded; otherwise a rejected sentence could block later
        sentences that do not collide with anything actually kept.
        """
        candidate = set(self._get_trigrams(self._preprocess(text)))
        if self.trigrams & candidate:
            return True
        self.trigrams |= candidate
        return False

    def _preprocess(self, text):
        """Lower-case *text*, drop non-alphabetic/non-space chars, tokenize."""
        cleaned = ''.join(c for c in text.lower() if c.isalpha() or c.isspace())
        return nltk.word_tokenize(cleaned)

    def _get_trigrams(self, tokens):
        """Return every run of three consecutive tokens, joined by spaces."""
        return [' '.join(tokens[i:i + 3]) for i in range(len(tokens) - 2)]
def convert_sentence_df(sentJson, pred, true_proba, set_trigram_blocking):
    """Build a per-sentence DataFrame of the body text with predictions.

    Args:
        sentJson: Mapping whose ``'body'`` entry maps each of the section
            keys ``'I'``, ``'M'``, ``'R'``, ``'D'`` to an iterable of dicts
            carrying a ``'text'`` string.
        pred: Array with one label per sentence, in I, M, R, D order;
            converted with ``.astype('bool')``.
        true_proba: Array with one probability per sentence, same order;
            stored as ``float16``.
        set_trigram_blocking: When truthy, apply trigram blocking to the
            selected sentences of each section.

    Returns:
        DataFrame with columns ``section`` (category), ``text`` (string,
        stripped), ``predict`` (bool) and ``proba`` (float16).
    """
    rows = [
        (section, sent['text'].strip())
        for section in 'IMRD'
        for sent in sentJson['body'][section]
    ]
    body = pd.DataFrame(rows, columns=['section', 'text']).astype(
        {'section': 'category', 'text': 'string'}
    )
    # Attach the prediction and its probability to every sentence.
    body['predict'] = pred.astype('bool')
    body['proba'] = true_proba.astype('float16')

    # Trigram blocking over the selected sentences, one section at a time.
    if set_trigram_blocking:
        for section in 'IMRD':
            block = TrigramBlock()
            selected = body.loc[(body['section'] == section) & body['predict']]
            # float16 rounding can make probabilities tie; a stable sort keeps
            # tied sentences in document order so the outcome is deterministic.
            ranked = selected.sort_values(by='proba', ascending=False, kind='mergesort')
            for idx, text in ranked['text'].items():
                if block.check_overlap(text):
                    body.at[idx, 'predict'] = False
    return body
+# Extractive method
def extractive_method(sentJson, sentFeat, model, threshold=0.5, TGB=False):
    """Select summary sentences per section with the extractive model.

    Args:
        sentJson: Sentence JSON passed through to ``convert_sentence_df``.
        sentFeat: Feature matrix handed to ``model.predict_proba``.
        model: Object exposing ``predict_proba``; column 1 of its output is
            used as the probability of a sentence being a summary sentence.
        threshold: A sentence is selected when its probability exceeds this.
        TGB: Whether to apply trigram blocking to the selected sentences.

    Returns:
        Dict mapping each of ``'I'``, ``'M'``, ``'R'``, ``'D'`` to the selected
        sentences of that section joined by single spaces.  A section with
        no selected sentence maps to ``''``.
    """
    def predict(x):
        true_proba = model.predict_proba(x)[:, 1]
        pred = true_proba > threshold
        # If no sentence clears the threshold, fall back to the sentence(s)
        # with the highest probability.  Selecting them through the mask
        # (rather than by re-comparing against the threshold) keeps the
        # fallback working even when threshold >= 1.
        if true_proba.size and not pred.any():
            best = true_proba == np.max(true_proba)
            true_proba[best] = 1
            pred = pred | best
        return pred.astype('int'), true_proba

    pred, true_proba = predict(sentFeat)
    body = convert_sentence_df(sentJson, pred, true_proba, TGB)
    selected = body[body['predict']]
    # Filter with a mask instead of groupby().get_group(), which raises
    # KeyError for a section that has no selected sentence.
    return {
        section: ' '.join(selected.loc[selected['section'] == section, 'text'])
        for section in 'IMRD'
    }
def abstractive_method(ext, tokenizer, model, device='cpu'):
    """Rewrite each section's extracted text with the seq2seq model.

    Args:
        ext: Mapping from each section key ``'I'``, ``'M'``, ``'R'``, ``'D'``
            to that section's extracted text.
        tokenizer: Callable tokenizer; called with ``truncation=True`` and
            ``return_tensors='pt'``, and used to decode the generated ids.
        model: Object exposing ``generate``.
        device: Device the input ids are moved to before generation.

    Returns:
        Dict mapping each section key to the decoded generated text, or to
        ``''`` when the section had no extracted text.
    """
    abstr = {key: '' for key in 'IMRD'}
    for section in 'IMRD':
        text = ext[section]
        if not text.strip():
            # Nothing was extracted for this section, so there is no source
            # content for the model to summarize; keep the empty string.
            continue
        input_ids = tokenizer(text, truncation=True, return_tensors='pt').input_ids
        outputs = model.generate(input_ids.to(device))
        abstr[section] = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return abstr
+# extractive summarizer
def load_ExtModel(path):
    """Load and return the pickled extractive model stored at *path*.

    NOTE(security): unpickling can execute arbitrary code, so *path* must
    point to a trusted file.
    """
    # Use a context manager so the file handle is closed after loading.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)
+# abstractive summarizer
def load_AbstrModel(path, device='cpu'):
    """Load the abstractive summarizer's tokenizer and seq2seq model.

    Both are loaded from the checkpoint at *path*; the tokenizer is created
    with ``model_max_length=1024`` and the model is moved to *device*.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(path, model_max_length=1024)
    model = AutoModelForSeq2SeqLM.from_pretrained(path).to(device)
    return tokenizer, model