from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import gradio as gr
import torch
import spacy
import re

nlp = spacy.load("en_core_sci_sm")  # scispaCy small English model for scientific text



# ----------------------------------------------
# Step 1. Read the input file and convert it to sentence-level JSON
# ----------------------------------------------

def read_text_to_json(path):
    paper = {}
    with open(path, 'r', encoding='utf-8') as txt:
        key = None
        for line in txt:
            line = line.strip()
            if line.startswith('@Paper') or line.startswith('@Section'):
                key = line.split()[1]
                paper[key] = []
            elif key and line:
                paper[key].append(line)
    return paper

def is_valid_format(paper):
    for key in ['title', 'I', 'M', 'R', 'D']:
        if key not in paper or len(paper[key])==0:
            return False
    return True
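
# Expected input layout, inferred from read_text_to_json and is_valid_format above.
# The tag lines follow the parser ("@Paper"/"@Section" plus a key); the text itself
# is purely illustrative:
#
#   @Paper title
#   An Example Paper Title
#   @Section I
#   First paragraph of the introduction ...
#   @Section M
#   ...
#   @Section R
#   ...
#   @Section D
#   ...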

def remove_parentheses_with_useless_tokens(text):
    return re.sub(r'\s*\(\s*(?:table|fig|http|www)[^()]*\)', '', text, flags = re.I)  # re.I: case-insensitive match

def segment_sentences(section, pos_para = False):
    sents = []
    sents_break = [".", "?", "!"]
    start = para_i = pre_para_i =  0
    conn = False
    for para in section:
        para = remove_parentheses_with_useless_tokens(para).strip() # strip so trailing whitespace is not tokenized and the sentence-break check still fires
        doc = nlp(para)
        for sent in doc.sents:
            if any(t in sents_break for t in sent[-1].text): # some sentence-final tokens such as "3h." are not split off, so it is enough that the last token contains a break character
                para_i +=1          
                text = "".join(t.text_with_ws for t in doc[start:sent.end])                         # original string
                tokenize_text = " ".join(t.text for t in doc[start:sent.end])                       # tokenized string
                sentence = {"text":text, "tokenize_text":tokenize_text, "pos":pre_para_i+para_i}    # build the sentence object
                if pos_para: sentence['pos_para'] = para_i                                          # pos: position within the section, pos_para: position within the paragraph
                sents.append(sentence)
                start = sent.end
                conn = False
            else:      
                start = start if conn else sent.start   # the sentence did not end with a break character: remember its start and keep it until a real break is found
                conn = True
        pre_para_i += para_i
        start = para_i =  0
    return sents
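
# Each returned item is a dict of the following form (illustrative example):
#   {"text": "We measured X.", "tokenize_text": "We measured X .", "pos": 5, "pos_para": 2}
# where "pos" counts sentences from the start of the section and "pos_para" from the
# start of the current paragraph (only present when pos_para=True).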

def convert_to_sentence_json(paper):
    sentJson = {
        'title': paper['title'],
        'body': {}
    }
    for key in ['I', 'M', 'R', 'D']:
        sentJson['body'][key] = segment_sentences(paper[key], True)
    return sentJson



# ----------------------------------------------
# Step 2. Sentence-level feature extraction
# ----------------------------------------------

# List of sentence texts
def sent_lst(sents):
    return [sent['text'] for sent in sents]

# Remove stop words and punctuation
def clean_token(doc):
    return [token for token in doc if not (token.is_stop or token.is_punct)]

# Total number of sentences in each paragraph
def add_num_sents_para(sents):
    reset = True
    for index, sent in reversed(list(enumerate(sents))):    
        if reset: ptr = sent['pos_para']
        reset = True if sent['pos_para'] == 1 else False
        sents[index]['ns_para'] = ptr
    return sents
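
# Worked example (hypothetical positions): for pos_para values [1, 2, 3, 1, 2]
# (a 3-sentence paragraph followed by a 2-sentence paragraph), the backward pass
# assigns ns_para = [3, 3, 3, 2, 2].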

# Positional importance
def position_imp(cur, ns):
    imp = 1 if cur == 1 else (ns-cur)/ns
    return imp
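
# Worked example: position_imp(1, 10) == 1 (first sentence), position_imp(3, 10) == 0.7,
# and position_imp(10, 10) == 0.0 (last sentence of a 10-sentence unit).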

# List of title words
def title_wlst(txt):
    doc = nlp(txt)
    wlst = [token.text.lower() for token in clean_token(doc)]
    return list(set(wlst))

# Ratio of title words covered by the sentence
def title_word_count(doc, wlst):
    titleLen = len(wlst)
    score = 0 if titleLen == 0 else len([token for token in doc if token.text.lower() in wlst])/titleLen
    return score

# Number of tokens with the given POS tag
def pos_token(doc, pos_type):
    return len([token for token in doc if token.pos_ == pos_type])

# Custom tokenizer: lemmatized, with stop words, punctuation and digits removed
def custom_toknizer(txt):
    doc = nlp(txt)
    words = [token.lemma_.lower() for token in doc if not (token.is_stop or token.is_punct or token.is_digit)]
    return words

# Term Frequency-Inverse Sentence Frequency (TF-ISF)
def Tfisf(lst):
    tf = TfidfVectorizer(tokenizer=custom_toknizer, lowercase=False)
    tfisf_matrix = tf.fit_transform(lst)
    word_count = (tfisf_matrix!=0).sum(1)
    with np.errstate(divide='ignore', invalid='ignore'):
        mean_score = np.where(word_count == 0, 0, np.divide(tfisf_matrix.sum(1), word_count)).flatten()
    return mean_score
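
# Worked example (illustrative weights only): if a sentence's row of the TF-ISF matrix
# holds the non-zero weights [0.4, 0.6, 0.2], its score is their mean, (0.4 + 0.6 + 0.2) / 3 = 0.4.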

# Cosine similarity: per-sentence sum of similarities to every other sentence, normalized by the maximum
def similarity(lst, ptm):
    model = SentenceTransformer(ptm)
    embeddings = model.encode(lst, convert_to_tensor=True)
    cosine = util.cos_sim(embeddings, embeddings)
    cosine = cosine.sum(1)-1
    cosine = torch.divide(cosine, torch.max(cosine)).numpy() # use .cpu().numpy() if the embeddings are on a GPU
    return cosine

# Feature extraction for one section
def feature_extraction(title, section, sents): 
    lst = sent_lst(sents)
    tfisf = Tfisf(lst)
    cosine = similarity(lst, "sentence-transformers/all-MiniLM-L6-v2")
    
    # Number of sentences
    ns = len(sents)
    sents = add_num_sents_para(sents)
    # Extract the features of each sentence
    arr = np.empty((0,9))
    for index, sent in enumerate(sents):
        doc = nlp(sent["text"])
        doc = clean_token(doc)
        
        F1 = len(doc)                                           # Sentence Length (normalized after the loop by the longest sentence length)
        F2 = position_imp(sent["pos"], ns)                      # Sentence Position
        F3 = position_imp(sent["pos_para"], sent["ns_para"])    # Sentence Position (in paragraph)
        F4 = title_word_count(doc, title)                       # Title Word
        F5 = 0 if F1 == 0 else pos_token(doc, "PROPN")/F1       # Proper Noun
        F6 = 0 if F1 == 0 else pos_token(doc, "NUM")/F1         # Numerical Token
        F7 = tfisf[index]                                       # Term Frequency-Inverse Sentence Frequency
        F10 = cosine[index]                                     # Cosine Similarity

        feat = np.array([[section, F1, F2, F3, F4, F5, F6, F7, F10]])
        arr = np.append(arr, feat, axis=0)
    # Normalize F1 by the longest sentence length in the section
    maxLen = np.amax(arr[:,1])
    arr[:,1] = arr[:,1]/maxLen 
    return arr

# Set the column dtypes
def set_dtypes(df):
    df = df.astype({'section': 'int8', 'F1': 'float32', 'F2': 'float32',
                    'F3': 'float32', 'F4': 'float32', 'F5': 'float32',
                    'F6': 'float32', 'F7': 'float32', 'F10': 'float32'})
    return df

# Sentence features for the paper's IMRD sections
def feature_from_imrd(body, title):
    paper = np.empty((0,9))
    for index, key in enumerate(['I', 'M', 'R', 'D'], start = 1):
        paper = np.append(paper, feature_extraction(title, index, body[key]), axis = 0)
    df = pd.DataFrame(paper, columns = ['section','F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F10'])
    return set_dtypes(df)

def extract_sentence_features(sentJson):
    title = title_wlst(sentJson['title'][0])
    sentFeat = feature_from_imrd(sentJson['body'], title)
    return sentFeat
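

# Minimal end-to-end sketch. The file name "paper.txt" is an assumption for
# illustration; any file in the @Paper/@Section layout shown above should work.
if __name__ == "__main__":
    paper = read_text_to_json("paper.txt")
    if is_valid_format(paper):
        sentJson = convert_to_sentence_json(paper)      # Step 1: sentence-level JSON
        sentFeat = extract_sentence_features(sentJson)  # Step 2: per-sentence feature DataFrame
        print(sentFeat.head())
    else:
        print("Input file is missing one of the required parts: title, I, M, R, D")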