from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import gradio as gr
import torch
import spacy
import re
nlp = spacy.load("en_core_sci_sm")
# ----------------------------------------------
# Step 1. Read the file and convert it into sentence-level JSON
# ----------------------------------------------
def read_text_to_json(path):
    paper = {}
    with open(path, 'r', encoding='utf-8') as txt:
        key = None
        for line in txt:
            line = line.strip()
            if line.startswith('@Paper') or line.startswith('@Section'):
                key = line.split()[1]
                paper[key] = []
            elif key and line:
                paper[key].append(line)
    return paper
def is_valid_format(paper):
    for key in ['title', 'I', 'M', 'R', 'D']:
        if key not in paper or len(paper[key]) == 0:
            return False
    return True
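# Sketch of the input layout these two functions assume (an illustration inferred
# from the parsing above, not a file taken from this Space):
#
#   @Paper title
#   A Sample Paper Title
#   @Section I
#   First paragraph of the Introduction ...
#   Second paragraph of the Introduction ...
#   @Section M
#   ...
#
# Each "@Paper"/"@Section" line opens a key ('title', 'I', 'M', 'R', 'D') and every
# following non-empty line is appended to that key as one paragraph.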
def remove_parentheses_with_useless_tokens(text):
    return re.sub(r'\s*\(\s*(?:table|fig|http|www)[^()]*\)', '', text, flags=re.I)  # re.I: case-insensitive match
def segment_sentences(section, pos_para=False):
    sents = []
    sents_break = [".", "?", "!"]
    start = para_i = pre_para_i = 0
    conn = False
    for para in section:
        para = remove_parentheses_with_useless_tokens(para).strip()  # strip trailing whitespace so it is not treated as a token that blocks sentence breaking
        doc = nlp(para)
        for sent in doc.sents:
            if any(t in sents_break for t in sent[-1].text):  # final tokens such as "3h." are not split further, so containing a break character is enough
                para_i += 1
                text = "".join(t.text_with_ws for t in doc[start:sent.end])  # original string
                tokenize_text = " ".join(t.text for t in doc[start:sent.end])  # tokenized string
                sentence = {"text": text, "tokenize_text": tokenize_text, "pos": pre_para_i + para_i}  # build the sentence object
                if pos_para: sentence['pos_para'] = para_i  # pos: position in the section; pos_para: position within its paragraph
                sents.append(sentence)
                start = sent.end
                conn = False
            else:
                start = start if conn else sent.start  # sent does not end with a break character: keep this start until a real break is reached
                conn = True
        pre_para_i += para_i
        start = para_i = 0
    return sents
def convert_to_sentence_json(paper):
    sentJson = {
        'title': paper['title'],
        'body': {}
    }
    for key in ['I', 'M', 'R', 'D']:
        sentJson['body'][key] = segment_sentences(paper[key], True)
    return sentJson
# ----------------------------------------------
# Step 2. Extract features from each sentence
# ----------------------------------------------
# List of sentence strings
def sent_lst(sents):
    return [sent['text'] for sent in sents]
# Remove stop words and punctuation
def clean_token(doc):
    return [token for token in doc if not (token.is_stop or token.is_punct)]
# Total number of sentences in each paragraph
def add_num_sents_para(sents):
    reset = True
    for index, sent in reversed(list(enumerate(sents))):
        if reset: ptr = sent['pos_para']  # the last sentence of a paragraph carries that paragraph's sentence count
        reset = sent['pos_para'] == 1
        sents[index]['ns_para'] = ptr
    return sents
# Positional importance of a sentence
def position_imp(cur, ns):
    imp = 1 if cur == 1 else (ns - cur) / ns
    return imp
# List of unique title words
def title_wlst(txt):
    doc = nlp(txt)
    wlst = [token.text.lower() for token in clean_token(doc)]
    return list(set(wlst))
# Ratio of title words that appear in the sentence
def title_word_count(doc, wlst):
    titleLen = len(wlst)
    score = 0 if titleLen == 0 else len([token for token in doc if token.text.lower() in wlst]) / titleLen
    return score
# Count of tokens with the given part-of-speech tag
def pos_token(doc, pos_type):
    return len([token for token in doc if token.pos_ == pos_type])
# Custom tokenizer
def custom_toknizer(txt):
    doc = nlp(txt)
    words = [token.lemma_.lower() for token in doc if not (token.is_stop or token.is_punct or token.is_digit)]
    return words
# Term frequency-inverse sentence frequency
def Tfisf(lst):
    tf = TfidfVectorizer(tokenizer=custom_toknizer, lowercase=False)
    tfisf_matrix = tf.fit_transform(lst)
    word_count = np.asarray((tfisf_matrix != 0).sum(1))  # number of non-zero terms per sentence
    row_sum = np.asarray(tfisf_matrix.sum(1))            # sum of TF-ISF weights per sentence
    with np.errstate(divide='ignore', invalid='ignore'):
        mean_score = np.where(word_count == 0, 0, np.divide(row_sum, word_count)).flatten()  # mean weight per sentence, 0 for empty sentences
    return mean_score
# Cosine similarity
def similarity(lst, ptm):
    model = SentenceTransformer(ptm)
    embeddings = model.encode(lst, convert_to_tensor=True)
    cosine = util.cos_sim(embeddings, embeddings)
    cosine = cosine.sum(1) - 1  # summed similarity to every other sentence (drop the self-similarity of 1)
    cosine = torch.divide(cosine, torch.max(cosine)).cpu().numpy()  # move to CPU so .numpy() also works when a GPU is used
    return cosine
# Feature extraction
def feature_extraction(title, section, sents):
    lst = sent_lst(sents)
    tfisf = Tfisf(lst)
    cosine = similarity(lst, "sentence-transformers/all-MiniLM-L6-v2")
    # Number of sentences
    ns = len(sents)
    sents = add_num_sents_para(sents)
    # Extract the features of each sentence
    arr = np.empty((0, 9))
    for index, sent in enumerate(sents):
        doc = nlp(sent["text"])
        doc = clean_token(doc)
        F1 = len(doc)  # Sentence length (normalized below by the longest sentence)
        F2 = position_imp(sent["pos"], ns)  # Sentence position in the section
        F3 = position_imp(sent["pos_para"], sent["ns_para"])  # Sentence position in its paragraph
        F4 = title_word_count(doc, title)  # Title words
        F5 = 0 if F1 == 0 else pos_token(doc, "PROPN") / F1  # Proper nouns
        F6 = 0 if F1 == 0 else pos_token(doc, "NUM") / F1  # Numerical tokens
        F7 = tfisf[index]  # Term frequency-inverse sentence frequency
        F10 = cosine[index]  # Cosine similarity
        feat = np.array([[section, F1, F2, F3, F4, F5, F6, F7, F10]])
        arr = np.append(arr, feat, axis=0)
    # Normalize F1 by the length of the longest sentence
    maxLen = np.amax(arr[:, 1])
    arr[:, 1] = arr[:, 1] / maxLen
    return arr
# Set column dtypes
def set_dtypes(df):
    df = df.astype({'section': 'int8', 'F1': 'float32', 'F2': 'float32',
                    'F3': 'float32', 'F4': 'float32', 'F5': 'float32',
                    'F6': 'float32', 'F7': 'float32', 'F10': 'float32'})
    return df
# Sentence features for each IMRD section of the paper
def feature_from_imrd(body, title):
    paper = np.empty((0, 9))
    for index, key in enumerate(['I', 'M', 'R', 'D'], start=1):
        paper = np.append(paper, feature_extraction(title, index, body[key]), axis=0)
    df = pd.DataFrame(paper, columns=['section', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F10'])
    return set_dtypes(df)
def extract_sentence_features(sentJson):
    title = title_wlst(sentJson['title'][0])
    sentFeat = feature_from_imrd(sentJson['body'], title)
    return sentFeat
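# Minimal end-to-end sketch of how these helpers chain together, assuming a local
# "paper.txt" in the layout illustrated after is_valid_format; the file name and the
# __main__ guard are illustrative only, not part of the Space's actual entry point:
if __name__ == "__main__":
    paper = read_text_to_json("paper.txt")              # Step 1: parse the raw text file
    if is_valid_format(paper):
        sentJson = convert_to_sentence_json(paper)       # split sections into sentence objects
        sentFeat = extract_sentence_features(sentJson)   # Step 2: build the feature DataFrame
        print(sentFeat.head())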