fuhsiao418 committed
Commit 4d21bee · 1 Parent(s): fb10f98
Files changed (3)
  1. app.py +31 -0
  2. utils/__init__.py +2 -0
  3. utils/preprocess.py +191 -0
app.py ADDED
@@ -0,0 +1,31 @@
+ from utils import read_text_to_json, is_valid_format, convert_to_sentence_json, extract_sentence_features
+ import gradio as gr
+
+
+ def main(file, ext_threshold, article_type):
+     paper = read_text_to_json(file.name)
+     if not is_valid_format(paper):
+         return "invalid_format"
+     sentJson = convert_to_sentence_json(paper)
+     sentFeat = extract_sentence_features(sentJson)
+     return 'done'
+
+
+
+ # Define the Gradio interface
+ iface = gr.Interface(
+     fn=main,
+     inputs=[
+         gr.inputs.File(),
+         gr.inputs.Slider(minimum=0.5, maximum=1, default=0.5, step=0.01, label="Extractive - Threshold"),
+         gr.inputs.Dropdown(["non-specialized field", "biomedical field"], default="non-specialized field", label="Abstractive - Field")
+     ],
+     outputs=gr.outputs.Textbox(label="Output - Structured Abstract"),
+     title="Ext-Abs-StructuredSum",
+     description="Please upload a .txt file formatted in the form of the example.",
+     # examples=[['text.txt']],
+     allow_flagging='never'
+ )
+
+ # Launch the Gradio interface
+ iface.launch(share=False)  # share=False disables public sharing
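Note on the expected upload: judging from read_text_to_json and is_valid_format in utils/preprocess.py, the .txt file is parsed by @Paper / @Section marker lines whose second token becomes the section key, and the keys title, I, M, R, and D must all be present and non-empty. A minimal sketch of such an input (titles and sentences below are placeholders, not from this repo) could look like:

    @Paper title
    An example paper title goes here
    @Section I
    First sentence of the Introduction. Second sentence of the Introduction.
    @Section M
    A sentence describing the Methods.
    @Section R
    A sentence reporting the Results.
    @Section D
    A sentence discussing the findings.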
utils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from utils.preprocess import read_text_to_json, is_valid_format, convert_to_sentence_json, extract_sentence_features
+
utils/preprocess.py ADDED
@@ -0,0 +1,191 @@
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sentence_transformers import SentenceTransformer, util
+ import pandas as pd
+ import numpy as np
+ import gradio as gr
+ import torch
+ import spacy
+ import re
+
+ nlp = spacy.load("en_core_sci_sm")
+
+
+
+ # ----------------------------------------------
+ # Step 1. Read the file and convert it to sentence-level JSON
+ # ----------------------------------------------
+
+ def read_text_to_json(path):
+     paper = {}
+     with open(path, 'r', encoding='utf-8') as txt:
+         key = None
+         for line in txt:
+             line = line.strip()
+             if line.startswith('@Paper') or line.startswith('@Section'):
+                 key = line.split()[1]
+                 paper[key] = []
+             elif key and line:
+                 paper[key].append(line)
+     return paper
+
+ def is_valid_format(paper):
+     for key in ['title', 'I', 'M', 'R', 'D']:
+         if key not in paper or len(paper[key]) == 0:
+             return False
+     return True
+
+ def remove_parentheses_with_useless_tokens(text):
+     return re.sub(r'\s*\(\s*(?:table|fig|http|www)[^()]*\)', '', text, flags=re.I)  # re.I: case-insensitive
+
+ def segment_sentences(section, pos_para=False):
+     sents = []
+     sents_break = [".", "?", "!"]
+     start = para_i = pre_para_i = 0
+     conn = False
+     for para in section:
+         para = remove_parentheses_with_useless_tokens(para).strip()  # strip trailing whitespace so it is not treated as a token that prevents a sentence break
+         doc = nlp(para)
+         for sent in doc.sents:
+             if any(t in sents_break for t in sent[-1].text):  # some sentence-final tokens such as "3h." are not split off, so containing a break character is enough
+                 para_i += 1
+                 text = "".join(t.text_with_ws for t in doc[start:sent.end])  # original string
+                 tokenize_text = " ".join(t.text for t in doc[start:sent.end])  # tokenized string
+                 sentence = {"text": text, "tokenize_text": tokenize_text, "pos": pre_para_i + para_i}  # build the sentence object
+                 if pos_para: sentence['pos_para'] = para_i  # pos: position within the section, pos_para: position within the paragraph
+                 sents.append(sentence)
+                 start = sent.end
+                 conn = False
+             else:
+                 start = start if conn else sent.start  # sent.end is not a break character: record this sentence's start and keep it until a break is found
+                 conn = True
+         pre_para_i += para_i
+         start = para_i = 0
+     return sents
+
+ def convert_to_sentence_json(paper):
+     sentJson = {
+         'title': paper['title'],
+         'body': {}
+     }
+     for key in ['I', 'M', 'R', 'D']:
+         sentJson['body'][key] = segment_sentences(paper[key], True)
+     return sentJson
+
+
+
+ # ----------------------------------------------
+ # Step 2. Extract features for each sentence
+ # ----------------------------------------------
+
+ # List of sentence texts
+ def sent_lst(sents):
+     return [sent['text'] for sent in sents]
+
+ # Remove stop words and punctuation
+ def clean_token(doc):
+     return [token for token in doc if not (token.is_stop or token.is_punct)]
+
+ # Total number of sentences in each paragraph
+ def add_num_sents_para(sents):
+     reset = True
+     for index, sent in reversed(list(enumerate(sents))):
+         if reset: ptr = sent['pos_para']
+         reset = True if sent['pos_para'] == 1 else False
+         sents[index]['ns_para'] = ptr
+     return sents
+
+ # Positional importance
+ def position_imp(cur, ns):
+     imp = 1 if cur == 1 else (ns - cur) / ns
+     return imp
+
+ # List of title words
+ def title_wlst(txt):
+     doc = nlp(txt)
+     wlst = [token.text.lower() for token in clean_token(doc)]
+     return list(set(wlst))
+
+ # Fraction of title words that appear in a sentence
+ def title_word_count(doc, wlst):
+     titleLen = len(wlst)
+     score = 0 if titleLen == 0 else len([token for token in doc if token.text.lower() in wlst]) / titleLen
+     return score
+
+ # Number of tokens with the given part-of-speech tag
+ def pos_token(doc, pos_type):
+     return len([token for token in doc if token.pos_ == pos_type])
+
+ # Custom tokenizer
+ def custom_toknizer(txt):
+     doc = nlp(txt)
+     words = [token.lemma_.lower() for token in doc if not (token.is_stop or token.is_punct or token.is_digit)]
+     return words
+
+ # Term frequency-inverse sentence frequency
+ def Tfisf(lst):
+     tf = TfidfVectorizer(tokenizer=custom_toknizer, lowercase=False)
+     tfisf_matrix = tf.fit_transform(lst)
+     word_count = (tfisf_matrix != 0).sum(1)
+     with np.errstate(divide='ignore', invalid='ignore'):
+         mean_score = np.where(word_count == 0, 0, np.divide(tfisf_matrix.sum(1), word_count)).flatten()
+     return mean_score
+
+ # Cosine similarity
+ def similarity(lst, ptm):
+     model = SentenceTransformer(ptm)
+     embeddings = model.encode(lst, convert_to_tensor=True)
+     cosine = util.cos_sim(embeddings, embeddings)
+     cosine = cosine.sum(1) - 1
+     cosine = torch.divide(cosine, torch.max(cosine)).numpy()  # .cpu().numpy()
+     return cosine
+
+ # Feature extraction
+ def feature_extraction(title, section, sents):
+     lst = sent_lst(sents)
+     tfisf = Tfisf(lst)
+     cosine = similarity(lst, "pritamdeka/PubMedBERT-mnli-snli-scinli-scitail-mednli-stsb")
+
+     # Number of sentences
+     ns = len(sents)
+     sents = add_num_sents_para(sents)
+     # Extract the features of each sentence
+     arr = np.empty((0, 9))
+     for index, sent in enumerate(sents):
+         doc = nlp(sent["text"])
+         doc = clean_token(doc)
+
+         F1 = len(doc)  # Sentence Length (normalized below by the longest sentence length)
+         F2 = position_imp(sent["pos"], ns)  # Sentence Position
+         F3 = position_imp(sent["pos_para"], sent["ns_para"])  # Sentence Position (in paragraph)
+         F4 = title_word_count(doc, title)  # Title Word
+         F5 = 0 if F1 == 0 else pos_token(doc, "PROPN") / F1  # Proper Noun
+         F6 = 0 if F1 == 0 else pos_token(doc, "NUM") / F1  # Numerical Token
+         F7 = tfisf[index]  # Term Frequency-Inverse Sentence Frequency
+         F10 = cosine[index]  # Cosine Similarity
+
+         feat = np.array([[section, F1, F2, F3, F4, F5, F6, F7, F10]])
+         arr = np.append(arr, feat, axis=0)
+     # Finish F1: normalize by the longest sentence length
+     maxLen = np.amax(arr[:, 1])
+     arr[:, 1] = arr[:, 1] / maxLen
+     return arr
+
+ # Set column dtypes
+ def set_dtypes(df):
+     df = df.astype({'section': 'int8', 'F1': 'float32', 'F2': 'float32',
+                     'F3': 'float32', 'F4': 'float32', 'F5': 'float32',
+                     'F6': 'float32', 'F7': 'float32', 'F10': 'float32'})
+     return df
+
+ # Sentence features for the article's IMRD sections
+ def feature_from_imrd(body, title):
+     paper = np.empty((0, 9))
+     for index, key in enumerate(['I', 'M', 'R', 'D'], start=1):
+         paper = np.append(paper, feature_extraction(title, index, body[key]), axis=0)
+     df = pd.DataFrame(paper, columns=['section', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F10'])
+     return set_dtypes(df)
+
+ def extract_sentence_features(sentJson):
+     title = title_wlst(sentJson['title'][0])
+     sentFeat = feature_from_imrd(sentJson['body'], title)
+     return sentFeat
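Not part of the commit, but as a sanity check: a minimal sketch of driving the preprocessing pipeline without the Gradio UI, using only functions added in this commit. The input path sample.txt is hypothetical and assumed to follow the @Paper/@Section format shown earlier.

    from utils import read_text_to_json, is_valid_format, convert_to_sentence_json, extract_sentence_features

    paper = read_text_to_json("sample.txt")             # hypothetical input file in the format shown above
    if is_valid_format(paper):                          # requires title, I, M, R, D sections
        sentJson = convert_to_sentence_json(paper)      # sentence-level JSON per IMRD section
        sentFeat = extract_sentence_features(sentJson)  # DataFrame with section, F1-F7, F10 columns
        print(sentFeat.head())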