fuhsiao418 committed
Commit · 4d21bee
Parent(s): fb10f98
app init
Files changed:
- app.py +31 -0
- utils/__init__.py +2 -0
- utils/preprocess.py +191 -0
app.py
ADDED
@@ -0,0 +1,31 @@
from utils import convert_to_sentence_json, extract_sentence_features, read_text_to_json, is_valid_format
import gradio as gr


def main(file, ext_threshold, article_type):
    paper = read_text_to_json(file.name)
    if not is_valid_format(paper):
        return "invalid_format"
    sentJson = convert_to_sentence_json(paper)
    sentFeat = extract_sentence_features(sentJson)
    # ext_threshold and article_type are not used yet in this initial commit
    return 'done'


# Define the Gradio interface
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.inputs.File(),
        gr.inputs.Slider(minimum=0.5, maximum=1, default=0.5, step=0.01, label="Extractive - Threshold"),
        gr.inputs.Dropdown(["non-specialized field", "biomedical field"], default="non-specialized field", label="Abstractive - Field")
    ],
    outputs=gr.outputs.Textbox(label="Output - Structured Abstract"),
    title="Ext-Abs-StructuredSum",
    description="Please upload a .txt file formatted like the example.",
    # examples=[['text.txt']],
    allow_flagging='never'
)

# Launch the Gradio interface
iface.launch(share=False)  # share=False disables the public share link
utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
from utils.preprocess import convert_to_sentence_json, extract_sentence_features, read_text_to_json, is_valid_format
utils/preprocess.py
ADDED
@@ -0,0 +1,191 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import gradio as gr
import torch
import spacy
import re

nlp = spacy.load("en_core_sci_sm")


# ----------------------------------------------
# Step 1. Read the file and convert it to sentence-level JSON
# ----------------------------------------------

def read_text_to_json(path):
    paper = {}
    with open(path, 'r', encoding='utf-8') as txt:
        key = None
        for line in txt:
            line = line.strip()
            if line.startswith('@Paper') or line.startswith('@Section'):
                key = line.split()[1]
                paper[key] = []
            elif key and line:
                paper[key].append(line)
    return paper

def is_valid_format(paper):
    for key in ['title', 'I', 'M', 'R', 'D']:
        if key not in paper or len(paper[key]) == 0:
            return False
    return True

def remove_parentheses_with_useless_tokens(text):
    return re.sub(r'\s*\(\s*(?:table|fig|http|www)[^()]*\)', '', text, flags=re.I)  # re.I: case-insensitive

def segment_sentences(section, pos_para=False):
    sents = []
    sents_break = [".", "?", "!"]
    start = para_i = pre_para_i = 0
    conn = False
    for para in section:
        para = remove_parentheses_with_useless_tokens(para).strip()  # strip trailing whitespace so it is not treated as a token that prevents sentence breaking
        doc = nlp(para)
        for sent in doc.sents:
            if any(t in sents_break for t in sent[-1].text):  # some sentence-final tokens such as "3h." are not split further, so it suffices that the last token contains a break character
                para_i += 1
                text = "".join(t.text_with_ws for t in doc[start:sent.end])  # original string
                tokenize_text = " ".join(t.text for t in doc[start:sent.end])  # tokenized string
                sentence = {"text": text, "tokenize_text": tokenize_text, "pos": pre_para_i + para_i}  # build the sentence object
                if pos_para: sentence['pos_para'] = para_i  # pos: position in the section, pos_para: position within the paragraph
                sents.append(sentence)
                start = sent.end
                conn = False
            else:
                start = start if conn else sent.start  # sent does not end in a break character: record its start and keep it unchanged until a break is reached
                conn = True
        pre_para_i += para_i
        start = para_i = 0
    return sents

def convert_to_sentence_json(paper):
    sentJson = {
        'title': paper['title'],
        'body': {}
    }
    for key in ['I', 'M', 'R', 'D']:
        sentJson['body'][key] = segment_sentences(paper[key], True)
    return sentJson


# ----------------------------------------------
# Step 2. Extract features from each sentence
# ----------------------------------------------

# List of sentence strings
def sent_lst(sents):
    return [sent['text'] for sent in sents]

# Remove stop words and punctuation
def clean_token(doc):
    return [token for token in doc if not (token.is_stop or token.is_punct)]

# Total number of sentences in each paragraph
def add_num_sents_para(sents):
    reset = True
    for index, sent in reversed(list(enumerate(sents))):
        if reset: ptr = sent['pos_para']
        reset = True if sent['pos_para'] == 1 else False
        sents[index]['ns_para'] = ptr
    return sents

# Positional importance
def position_imp(cur, ns):
    imp = 1 if cur == 1 else (ns - cur) / ns
    return imp

# List of title words
def title_wlst(txt):
    doc = nlp(txt)
    wlst = [token.text.lower() for token in clean_token(doc)]
    return list(set(wlst))

# Proportion of title words appearing in a sentence
def title_word_count(doc, wlst):
    titleLen = len(wlst)
    score = 0 if titleLen == 0 else len([token for token in doc if token.text.lower() in wlst]) / titleLen
    return score

# Number of tokens with the given part-of-speech tag
def pos_token(doc, pos_type):
    return len([token for token in doc if token.pos_ == pos_type])

# Custom tokenizer
def custom_toknizer(txt):
    doc = nlp(txt)
    words = [token.lemma_.lower() for token in doc if not (token.is_stop or token.is_punct or token.is_digit)]
    return words

# Term frequency-inverse sentence frequency
def Tfisf(lst):
    tf = TfidfVectorizer(tokenizer=custom_toknizer, lowercase=False)
    tfisf_matrix = tf.fit_transform(lst)
    word_count = (tfisf_matrix != 0).sum(1)
    with np.errstate(divide='ignore', invalid='ignore'):
        mean_score = np.where(word_count == 0, 0, np.divide(tfisf_matrix.sum(1), word_count)).flatten()
    return mean_score

# Cosine similarity
def similarity(lst, ptm):
    model = SentenceTransformer(ptm)
    embeddings = model.encode(lst, convert_to_tensor=True)
    cosine = util.cos_sim(embeddings, embeddings)
    cosine = cosine.sum(1) - 1
    cosine = torch.divide(cosine, torch.max(cosine)).numpy()  # .cpu().numpy()
    return cosine

# Feature extraction
def feature_extraction(title, section, sents):
    lst = sent_lst(sents)
    tfisf = Tfisf(lst)
    cosine = similarity(lst, "pritamdeka/PubMedBERT-mnli-snli-scinli-scitail-mednli-stsb")

    # Number of sentences
    ns = len(sents)
    sents = add_num_sents_para(sents)
    # Extract the features of each sentence
    arr = np.empty((0, 9))
    for index, sent in enumerate(sents):
        doc = nlp(sent["text"])
        doc = clean_token(doc)

        F1 = len(doc)                                         # Sentence Length (normalized below: len / longest sentence len)
        F2 = position_imp(sent["pos"], ns)                    # Sentence Position
        F3 = position_imp(sent["pos_para"], sent["ns_para"])  # Sentence Position (in paragraph)
        F4 = title_word_count(doc, title)                     # Title Word
        F5 = 0 if F1 == 0 else pos_token(doc, "PROPN") / F1   # Proper Noun
        F6 = 0 if F1 == 0 else pos_token(doc, "NUM") / F1     # Numerical Token
        F7 = tfisf[index]                                     # Term Frequency-Inverse Sentence Frequency
        F10 = cosine[index]                                   # Cosine Similarity

        feat = np.array([[section, F1, F2, F3, F4, F5, F6, F7, F10]])
        arr = np.append(arr, feat, axis=0)
    # F1: normalize sentence length by the longest sentence in the section
    maxLen = np.amax(arr[:, 1])
    arr[:, 1] = arr[:, 1] / maxLen
    return arr

# Set column dtypes
def set_dtypes(df):
    df = df.astype({'section': 'int8', 'F1': 'float32', 'F2': 'float32',
                    'F3': 'float32', 'F4': 'float32', 'F5': 'float32',
                    'F6': 'float32', 'F7': 'float32', 'F10': 'float32'})
    return df

# Sentence features for the paper's IMRD sections
def feature_from_imrd(body, title):
    paper = np.empty((0, 9))
    for index, key in enumerate(['I', 'M', 'R', 'D'], start=1):
        paper = np.append(paper, feature_extraction(title, index, body[key]), axis=0)
    df = pd.DataFrame(paper, columns=['section', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F10'])
    return set_dtypes(df)

def extract_sentence_features(sentJson):
    title = title_wlst(sentJson['title'][0])
    sentFeat = feature_from_imrd(sentJson['body'], title)
    return sentFeat