Runtime error
Runtime error
committed on
app init
Browse files- +31 -0
- utils/ +2 -0
- utils/ +191 -0
@@ -0,0 +1,31 @@
1 |
import gradio as gr

from utils import convert_to_sentence_json, extract_sentence_features
from utils.preprocess import is_valid_format, read_text_to_json
3 |
4 |
5 |
def main(file, ext_threshold, article_type):
    """Run an uploaded paper through parsing, validation and feature extraction.

    Args:
        file: The upload received from the interface. Either a path string
            or an object exposing its path as ``.name`` is accepted.
        ext_threshold: Value of the "Extractive - Threshold" input; not used
            by this function yet.
        article_type: Value of the "Abstractive - Field" input; not used by
            this function yet.

    Returns:
        ``"invalid_format"`` when nothing was uploaded or the parsed paper
        fails ``is_valid_format``; otherwise ``'done'``.
    """
    if file is None:
        # Nothing uploaded: report it the same way as an unusable file
        # instead of failing inside open().
        return "invalid_format"

    # NOTE(review): the argument of this call was truncated in the reviewed
    # copy. Accepting both a plain path and a file-like object with `.name`
    # is an assumption - confirm against the original source.
    path = getattr(file, "name", file)
    paper = read_text_to_json(path)
    if not is_valid_format(paper):
        return "invalid_format"

    sentJson = convert_to_sentence_json(paper)
    # The feature table is computed but not surfaced yet; only a status
    # string is returned.
    sentFeat = extract_sentence_features(sentJson)
    return 'done'
12 |
13 |
14 |
15 |
# 定義Gradio介面
16 |
# Build the Gradio interface object.
# NOTE(review): the lines numbered 17-19, 22, 24 and 27 onward of this call are
# absent from this copy (including the closing parenthesis) - restore them from
# the original file before running.
# NOTE(review): the `gr.inputs` / `gr.outputs` namespaces and the `default=`
# keyword belong to the legacy Gradio component API - confirm they exist in the
# Gradio version this app is deployed with.
iface = gr.Interface(
17 |
18 |
19 |
20 |
# Slider ranging from 0.5 to 1 in steps of 0.01, initially 0.5.
gr.inputs.Slider(minimum=0.5, maximum=1, default=0.5, step=0.01, label="Extractive - Threshold"),
21 |
# Dropdown with two choices; "non-specialized field" is preselected.
gr.inputs.Dropdown(["non-specialized field", "biomedical field"],default="non-specialized field", label="Abstractive - Field")
22 |
23 |
# Single text box that displays the value returned by the handler.
outputs=gr.outputs.Textbox(label="Output - Structured Abstract"),
24 |
25 |
description="please upload a .txt file formatted in the form of the example.",
26 |
# examples=[['text.txt']],
27 |
28 |
29 |
30 |
# 啟動Gradio介面
31 |
# Start serving the interface when this script runs.
iface.launch(share=False) # share=False turns sharing mode off
@@ -0,0 +1,2 @@
1 |
from utils.preprocess import convert_to_sentence_json, extract_sentence_features
2 |
@@ -0,0 +1,191 @@
1 |
from sklearn.feature_extraction.text import TfidfVectorizer
2 |
from sentence_transformers import SentenceTransformer, util
3 |
import pandas as pd
4 |
import numpy as np
5 |
import gradio as gr
6 |
import torch
7 |
import spacy
8 |
import re
9 |
10 |
# Load the "en_core_sci_sm" spaCy pipeline once at import time; the helpers in
# this module call it through the module-level name `nlp`.
nlp = spacy.load("en_core_sci_sm")
11 |
12 |
13 |
14 |
# ----------------------------------------------
15 |
# Step 1. 讀取檔案 轉換 句子單位 JSON
16 |
# ----------------------------------------------
17 |
18 |
def read_text_to_json(path):
    """Parse a tagged plain-text paper into a dict of line lists.

    A line starting with ``@Paper`` or ``@Section`` opens a new entry whose
    key is the second whitespace-separated token of that line. Every
    following non-empty line is appended to that entry until the next
    header. Lines before the first header are ignored.

    Args:
        path: Path of the UTF-8 text file to read.

    Returns:
        Dict mapping each header key to the list of stripped, non-empty
        lines that followed it.
    """
    paper = {}
    key = None
    with open(path, 'r', encoding='utf-8') as txt:
        for line in txt:
            line = line.strip()
            if line.startswith(('@Paper', '@Section')):
                parts = line.split()
                # A header without a key token used to raise IndexError.
                # Treat it as "no open entry" so the lines below it are
                # skipped and validation can reject the file instead.
                key = parts[1] if len(parts) > 1 else None
                if key is not None:
                    paper[key] = []
            elif key and line:
                # NOTE(review): this statement was missing from the reviewed
                # copy; appending the line is what the list-valued entries
                # and their consumers require.
                paper[key].append(line)
    return paper
30 |
31 |
def is_valid_format(paper):
    """Report whether *paper* has a non-empty entry for every required key.

    The required keys are 'title', 'I', 'M', 'R' and 'D'.

    Args:
        paper: Mapping from key to a sized collection.

    Returns:
        True when every required key is present with a length above zero,
        False otherwise.
    """
    required = ('title', 'I', 'M', 'R', 'D')
    return all(name in paper and len(paper[name]) != 0 for name in required)
36 |
37 |
def remove_parentheses_with_useless_tokens(text):
    """Delete parenthesised groups that open with table/fig/http/www.

    Whitespace directly before the opening parenthesis is removed as well.
    Matching ignores case, and groups containing nested parentheses are
    left untouched.

    Args:
        text: String to clean.

    Returns:
        The string with every matching group removed.
    """
    pattern = r'\s*\(\s*(?:table|fig|http|www)[^()]*\)'
    # re.IGNORECASE is the long spelling of re.I.
    return re.sub(pattern, '', text, flags=re.IGNORECASE)
39 |
40 |
def segment_sentences(section, pos_para = False):
    """Split the paragraphs of one section into sentence records.

    Each paragraph is cleaned with ``remove_parentheses_with_useless_tokens``
    and parsed with the module-level spaCy pipeline. Consecutive spaCy
    sentences are merged until one ends in a token containing ".", "?" or
    "!"; the merged span then becomes one record.

    Args:
        section: Iterable of paragraph strings.
        pos_para: When true, each record also gets ``pos_para``, its 1-based
            position inside its own paragraph.

    Returns:
        List of dicts with the keys ``text`` (original string),
        ``tokenize_text`` (tokens joined by single spaces), ``pos`` (1-based
        position inside the section) and, optionally, ``pos_para``.
    """
    sents = []
    sents_break = (".", "?", "!")
    pre_para_i = 0  # sentences produced by the paragraphs already handled
    for para in section:
        # Strip so trailing whitespace cannot become the last token and hide
        # the sentence-ending character.
        para = remove_parentheses_with_useless_tokens(para).strip()
        doc = nlp(para)
        para_i = 0     # sentences produced by this paragraph so far
        start = 0      # token index where the pending sentence begins
        conn = False   # True while an unterminated span is being extended
        for sent in doc.sents:
            # Some final tokens (e.g. "3h.") are not split from their
            # punctuation, so look for a break character inside the token.
            if any(ch in sents_break for ch in sent[-1].text):
                para_i += 1
                span = doc[start:sent.end]
                text = "".join(t.text_with_ws for t in span)   # original string
                tokenize_text = " ".join(t.text for t in span)  # tokenised string
                sentence = {"text": text, "tokenize_text": tokenize_text, "pos": pre_para_i + para_i}
                if pos_para:
                    sentence['pos_para'] = para_i
                # NOTE(review): this append and the `else` below were missing
                # from the reviewed copy and are reconstructed; without them
                # the function could never return a sentence.
                sents.append(sentence)
                start = sent.end
                conn = False
            else:
                # No break character: remember where this run began and keep
                # that start until a terminated sentence closes it.
                start = start if conn else sent.start
                conn = True
        # NOTE(review): a trailing run that never reaches a break character is
        # dropped, as in the original control flow - confirm this is intended.
        pre_para_i += para_i
    return sents
64 |
65 |
def convert_to_sentence_json(paper):
    """Convert a parsed paper into its sentence-level representation.

    Args:
        paper: Mapping holding 'title' plus the sections 'I', 'M', 'R' and
            'D', each a list of paragraph strings.

    Returns:
        Dict with 'title' (passed through unchanged) and 'body', which maps
        each section key to the records from ``segment_sentences`` with
        per-paragraph positions enabled.

    Raises:
        KeyError: If 'title' or one of the four section keys is missing.
    """
    # NOTE(review): the closing brace of this literal was missing from the
    # reviewed copy; it is restored here.
    sentJson = {
        'title': paper['title'],
        'body': {},
    }
    for key in ['I', 'M', 'R', 'D']:
        sentJson['body'][key] = segment_sentences(paper[key], True)
    return sentJson
73 |
74 |
75 |
76 |
# ----------------------------------------------
77 |
# Step 2. 句子單位 進行 特徵萃取
78 |
# ----------------------------------------------
79 |
80 |
# 句子列表
81 |
def sent_lst(sents):
    """Collect the 'text' value of every sentence record, in order.

    Args:
        sents: Iterable of dicts that each contain a 'text' key.

    Returns:
        List of the 'text' values.
    """
    texts = []
    for record in sents:
        texts.append(record['text'])
    return texts
83 |
84 |
# 移除停用詞及標點
85 |
def clean_token(doc):
    """Drop stop-word and punctuation tokens.

    Args:
        doc: Iterable of tokens exposing ``is_stop`` and ``is_punct``.

    Returns:
        List of the tokens for which both flags are false, in order.
    """
    kept = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        kept.append(token)
    return kept
87 |
88 |
# 段落之總句數
89 |
def add_num_sents_para(sents):
    """Annotate every sentence record with the size of its paragraph.

    The list is walked from the end. The first record met for a paragraph is
    its last sentence, whose 'pos_para' equals the paragraph's sentence
    count. That count is stored as 'ns_para' on each record until a record
    with 'pos_para' == 1 closes the paragraph.

    Args:
        sents: List of dicts that each contain 'pos_para'; modified in place.

    Returns:
        The same list object, with 'ns_para' set on every record.
    """
    para_size = None
    at_para_end = True
    for record in reversed(sents):
        if at_para_end:
            para_size = record['pos_para']
        # The record before a paragraph's first sentence ends the previous one.
        at_para_end = record['pos_para'] == 1
        record['ns_para'] = para_size
    return sents
96 |
97 |
# 位置重要性
98 |
def position_imp(cur, ns):
    """Score a sentence by its position.

    Args:
        cur: 1-based position of the sentence.
        ns: Total number of sentences.

    Returns:
        1 for the first position, otherwise ``(ns - cur) / ns``.
    """
    if cur == 1:
        return 1
    return (ns - cur) / ns
101 |
102 |
# 標題詞列表
103 |
def title_wlst(txt):
    """Return the distinct lower-cased content words of a title.

    The text is parsed with the module-level pipeline and stop words and
    punctuation are removed through ``clean_token``.

    Args:
        txt: Title string.

    Returns:
        List of unique lower-cased token texts (order not guaranteed).
    """
    unique_words = {token.text.lower() for token in clean_token(nlp(txt))}
    return list(unique_words)
107 |
108 |
# 句子之標題詞數量
109 |
def title_word_count(doc, wlst):
    """Ratio of tokens found in the title word list to the list's length.

    Args:
        doc: Iterable of tokens exposing ``text``.
        wlst: Collection of lower-cased title words.

    Returns:
        0 when *wlst* is empty; otherwise the number of tokens whose
        lower-cased text is in *wlst*, divided by ``len(wlst)``.
    """
    title_len = len(wlst)
    if title_len == 0:
        return 0
    hits = sum(1 for token in doc if token.text.lower() in wlst)
    return hits / title_len
113 |
114 |
# 標記詞性之數量
115 |
def pos_token(doc, pos_type):
    """Count the tokens whose part-of-speech tag equals *pos_type*.

    Args:
        doc: Iterable of tokens exposing ``pos_``.
        pos_type: Tag to count, e.g. "PROPN" or "NUM".

    Returns:
        Number of matching tokens.
    """
    return sum(1 for token in doc if token.pos_ == pos_type)
117 |
118 |
# 自定分詞器
119 |
def custom_toknizer(txt):
    """Tokenise *txt* into lower-cased lemmas for the TF-IDF vectoriser.

    Tokens flagged as stop word, punctuation or digit are skipped.

    Args:
        txt: String to tokenise with the module-level pipeline.

    Returns:
        List of lower-cased lemma strings.
    """
    lemmas = []
    for token in nlp(txt):
        if token.is_stop or token.is_punct or token.is_digit:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas
123 |
124 |
# 詞頻-逆向句子頻率
125 |
def Tfisf(lst):
    """Mean TF-IDF weight of the non-zero terms of each sentence.

    Every sentence is treated as one document, so the inverse document
    frequency acts as an inverse sentence frequency.

    Args:
        lst: List of sentence strings.

    Returns:
        Flattened array with one mean score per sentence; 0 where a sentence
        has no non-zero term.
    """
    vectorizer = TfidfVectorizer(tokenizer=custom_toknizer, lowercase=False)
    weights = vectorizer.fit_transform(lst)
    row_totals = weights.sum(1)
    nonzero_terms = (weights != 0).sum(1)
    # Rows without terms divide by zero; silence the warning and let
    # np.where substitute 0 for those rows.
    with np.errstate(divide='ignore', invalid='ignore'):
        row_means = np.divide(row_totals, nonzero_terms)
        mean_score = np.where(nonzero_terms == 0, 0, row_means)
    return mean_score.flatten()
132 |
133 |
# 餘弦相似度
134 |
# Models already loaded, keyed by model name, so repeated calls with the same
# name reuse one instance instead of loading it again.
_SENTENCE_MODELS = {}


def _sentence_model(ptm):
    """Return the SentenceTransformer for *ptm*, loading it on first use only."""
    if ptm not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[ptm] = SentenceTransformer(ptm)
    return _SENTENCE_MODELS[ptm]


def similarity(lst, ptm):
    """Score each sentence by its summed cosine similarity to the others.

    Args:
        lst: List of sentence strings.
        ptm: Name or path of the sentence-transformer model to use.

    Returns:
        NumPy array with one score per sentence: the row sum of the pairwise
        cosine-similarity matrix minus 1, divided by the largest such value.
    """
    model = _sentence_model(ptm)
    embeddings = model.encode(lst, convert_to_tensor=True)
    cosine = util.cos_sim(embeddings, embeddings)
    # Subtract 1 from each row sum to discount the sentence's similarity
    # with itself.
    cosine = cosine.sum(1) - 1
    cosine = torch.divide(cosine, torch.max(cosine))
    # Move to host memory first: Tensor.numpy() only works on CPU tensors,
    # and .cpu() is a no-op when the tensor is already there.
    return cosine.cpu().numpy()
141 |
142 |
# 特徵萃取
143 |
def feature_extraction(title, section, sents):
    """Build the per-sentence feature matrix of one section.

    Columns, in order: section id, F1 (cleaned token count divided by the
    largest count in the section), F2 (position score in the section), F3
    (position score in the paragraph), F4 (title-word ratio), F5
    (proper-noun ratio), F6 (numeric-token ratio), F7 (mean TF-ISF weight)
    and F10 (normalised cosine-similarity score).

    Args:
        title: List of lower-cased title words.
        section: Numeric id of the section, stored in the first column.
        sents: List of sentence records with 'text', 'pos' and 'pos_para';
            'ns_para' is added to them in place.

    Returns:
        Float array of shape ``(len(sents), 9)``; shape ``(0, 9)`` when the
        section has no sentences.
    """
    if not sents:
        # Vectoriser, encoder and np.amax all fail on empty input, so return
        # an empty table instead.
        return np.empty((0, 9))

    lst = sent_lst(sents)
    tfisf = Tfisf(lst)
    cosine = similarity(lst, "pritamdeka/PubMedBERT-mnli-snli-scinli-scitail-mednli-stsb")

    ns = len(sents)  # number of sentences in the section
    sents = add_num_sents_para(sents)

    # Collect rows in a list and convert once; np.append would copy the whole
    # array for every sentence.
    rows = []
    for index, sent in enumerate(sents):
        doc = clean_token(nlp(sent["text"]))

        F1 = len(doc)                                          # raw length, normalised below
        F2 = position_imp(sent["pos"], ns)                     # position in section
        F3 = position_imp(sent["pos_para"], sent["ns_para"])   # position in paragraph
        F4 = title_word_count(doc, title)                      # title words
        F5 = 0 if F1 == 0 else pos_token(doc, "PROPN") / F1    # proper nouns
        F6 = 0 if F1 == 0 else pos_token(doc, "NUM") / F1      # numeric tokens
        F7 = tfisf[index]                                      # TF-ISF
        F10 = cosine[index]                                    # cosine similarity

        rows.append([section, F1, F2, F3, F4, F5, F6, F7, F10])

    arr = np.array(rows, dtype=float)
    # Finish F1: scale by the longest sentence. If every sentence is empty
    # after cleaning, leave the zeros rather than dividing by zero.
    maxLen = np.amax(arr[:, 1])
    if maxLen > 0:
        arr[:, 1] = arr[:, 1] / maxLen
    return arr
172 |
173 |
# 設置欄位類型
174 |
def set_dtypes(df):
    """Cast the feature table to compact column types.

    Args:
        df: DataFrame with the columns 'section', 'F1'-'F7' and 'F10'.

    Returns:
        New DataFrame with 'section' as int8 and every feature column as
        float32.
    """
    float_columns = ('F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F10')
    schema = {'section': 'int8'}
    schema.update((name, 'float32') for name in float_columns)
    return df.astype(schema)
179 |
180 |
# 文章 IMRD - 句子特徵
181 |
def feature_from_imrd(body, title):
    """Assemble the sentence-feature table for all four sections.

    The sections 'I', 'M', 'R' and 'D' are processed in that order and
    numbered 1 to 4 in the 'section' column.

    Args:
        body: Mapping from section key to its list of sentence records.
        title: List of lower-cased title words.

    Returns:
        DataFrame with the columns 'section', 'F1'-'F7' and 'F10', typed by
        ``set_dtypes``.
    """
    # Start from an empty 0x9 block so the result keeps that shape and dtype
    # as its base.
    blocks = [np.empty((0, 9))]
    for section_id, key in enumerate(['I', 'M', 'R', 'D'], start=1):
        blocks.append(feature_extraction(title, section_id, body[key]))
    stacked = np.concatenate(blocks, axis=0)
    columns = ['section', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F10']
    return set_dtypes(pd.DataFrame(stacked, columns=columns))
187 |
188 |
def extract_sentence_features(sentJson):
    """Compute the feature table for a sentence-level paper.

    Only the first entry of 'title' is used to build the title word list.

    Args:
        sentJson: Dict with 'title' (a non-empty sequence of strings) and
            'body' (section key to sentence records).

    Returns:
        The DataFrame produced by ``feature_from_imrd``.
    """
    first_title_line = sentJson['title'][0]
    title_words = title_wlst(first_title_line)
    return feature_from_imrd(sentJson['body'], title_words)