Spaces:

SanctiMoly
/

SanctiMolyDemo1

Runtime error

App Files Files Community

alex6095 committed on Dec 12, 2021

Commit

962c41a

1 Parent(s): 7f9ccdf

Create app.py

Browse files

Files changed (1) hide show

app.py +114 -0

app.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import torch
+import re
+import streamlit as st
+import pandas as pd
+from transformers import PreTrainedTokenizerFast, DistilBertForSequenceClassification, BartForConditionalGeneration
+from tokenization_kobert import KoBertTokenizer
+from tokenizers import SentencePieceBPETokenizer
+@st.cache(allow_output_mutation=True)
+def get_topic():
+    """Load the topic-classification model together with its tokenizer.
+
+    Returns:
+        A ``(model, tokenizer)`` tuple: the DistilBERT sequence classifier
+        loaded from ``alex6095/SanctiMolyTopic`` (configured for 9 labels and
+        switched to eval mode) and the KoBERT tokenizer loaded from
+        ``monologg/distilkobert``.
+    """
+    classifier_checkpoint = 'alex6095/SanctiMolyTopic'
+    tokenizer_checkpoint = 'monologg/distilkobert'
+    classifier = DistilBertForSequenceClassification.from_pretrained(
+        classifier_checkpoint,
+        problem_type="multi_label_classification",
+        num_labels=9,
+    )
+    # Inference only: put the module in evaluation mode.
+    classifier.eval()
+    topic_tokenizer = KoBertTokenizer.from_pretrained(tokenizer_checkpoint)
+    return classifier, topic_tokenizer
+@st.cache(allow_output_mutation=True)
+def get_date():
+    """Load the date-prediction model together with its tokenizer.
+
+    Returns:
+        A ``(model, tokenizer)`` tuple: the BART conditional-generation model
+        loaded from ``alex6095/SanctiMoly-Bart`` (switched to eval mode) and
+        the fast tokenizer loaded from ``gogamza/kobart-summarization``.
+    """
+    generator_checkpoint = 'alex6095/SanctiMoly-Bart'
+    tokenizer_checkpoint = 'gogamza/kobart-summarization'
+    generator = BartForConditionalGeneration.from_pretrained(generator_checkpoint)
+    # Inference only: put the module in evaluation mode.
+    generator.eval()
+    date_tokenizer = PreTrainedTokenizerFast.from_pretrained(tokenizer_checkpoint)
+    return generator, date_tokenizer
+class RegexSubstitution(object):
+    """Callable transform that replaces regex matches in text.
+
+    Every string is run through the substitution twice in a row, so a match
+    that only becomes visible after the first pass removed something is
+    replaced as well.
+    """
+
+    def __init__(self, regex, sub=''):
+        """Store the pattern and its replacement.
+
+        Args:
+            regex: A compiled ``re.Pattern``, or a pattern string to compile.
+            sub: Replacement handed to ``Pattern.sub``; defaults to ``''``.
+        """
+        self.regex = regex if isinstance(regex, re.Pattern) else re.compile(regex)
+        self.sub = sub
+
+    def _substitute_twice(self, string):
+        """Run the substitution two consecutive times on one string."""
+        first_pass = self.regex.sub(self.sub, string)
+        return self.regex.sub(self.sub, first_pass)
+
+    def __call__(self, target):
+        """Apply the substitution to one string or to each string of a list.
+
+        Args:
+            target: A single string, or a ``list`` of strings.
+
+        Returns:
+            A new list of substituted strings when ``target`` is a list,
+            otherwise the substituted single value.
+        """
+        if not isinstance(target, list):
+            return self._substitute_twice(target)
+        return [self._substitute_twice(string) for string in target]
+# Sample Korean news text; used as the initial value of the input text area.
+default_text = '''질병관리청은 23일 지방자치단체가 보건당국과 협의 없이 단독으로 인플루엔자(독감) 백신 접종 중단을 결정해서는 안 된다는 입장을 밝혔다.
+    질병청은 이날 참고자료를 배포하고 “향후 전체 국가 예방접종사업이 차질 없이 진행되도록 지자체가 자체적으로 접종 유보 여부를 결정하지 않도록 안내를 했다”고 설명했다.
+    독감백신을 접종한 후 고령층을 중심으로 전국에서 사망자가 잇따르자 서울 영등포구보건소는 전날, 경북 포항시는 이날 관내 의료기관에 접종을 보류해달라는 공문을 내려보냈다. 이는 예방접종과 사망 간 직접적 연관성이 낮아 접종을 중단할 상황은 아니라는 질병청의 판단과는 다른 것이다.
+    질병청은 지난 21일 전문가 등이 참여한 ‘예방접종 피해조사반’의 분석 결과를 바탕으로 독감 예방접종 사업을 일정대로 진행하기로 했다. 특히 고령 어르신과 어린이, 임신부 등 독감 고위험군은 백신을 접종하지 않았을 때 합병증 피해가 클 수 있다면서 접종을 독려했다. 하지만 접종사업 유지 발표 이후에도 사망 보고가 잇따르자 질병청은 이날 ‘예방접종 피해조사반 회의’와 ‘예방접종 전문위원회’를 개최해 독감백신과 사망 간 관련성, 접종사업 유지 여부 등에 대해 다시 결론 내리기로 했다. 회의 결과는 이날 오후 7시 넘어 발표될 예정이다.
+'''
+# Topic label names; the classifier's predicted index is looked up in this list.
+topics_raw = ['IT/과학', '경제', '문화', '미용/건강', '사회', '생활', '스포츠', '연예', '정치']
+# Sidebar selector that decides which model is loaded and which demo is shown.
+# The Streamlit sidebar container is `st.sidebar`; the former spelling
+# `st.side_bar` is not a Streamlit attribute and raised AttributeError.
+name = st.sidebar.selectbox('Model', ['Topic Classification', 'Date Prediction'])
+# Load only the model/tokenizer pair that matches the current selection.
+if name == 'Topic Classification':
+    title = 'News Topic Classification'
+    model, tokenizer = get_topic()
+elif name == 'Date Prediction':
+    title = 'News Date prediction'
+    model, tokenizer = get_date()
+st.title(title)
+# Editable input, pre-filled with the sample article.
+text = st.text_area("Input news :", value=default_text)
+# Echo the raw input before any cleaning is applied.
+st.markdown("## Original News Data")
+st.write(text)
+# Run the selected model on the input text and render its prediction.
+if name == 'Topic Classification':
+    st.markdown("## Predict Topic")
+    col1, col2 = st.columns(2)
+    if text:
+        with st.spinner('processing..'):
+            # Strip parenthesised spans and quote/bracket/bullet symbols.
+            text = RegexSubstitution(r'\([^()]+\)|[<>\'"△▲□■]')(text)
+            encoded_dict = tokenizer(
+                text=text,
+                add_special_tokens=True,
+                max_length=512,
+                truncation=True,
+                return_tensors='pt',
+                return_length=True
+            )
+            input_ids = encoded_dict['input_ids']
+            input_ids_len = encoded_dict['length'].unsqueeze(0)
+            # Boolean mask: True for positions below the reported length.
+            attn_mask = torch.arange(input_ids.size(1))
+            attn_mask = attn_mask[None, :] < input_ids_len[:, None]
+            # Inference only: skip autograd bookkeeping for the forward pass.
+            with torch.no_grad():
+                outputs = model(input_ids=input_ids, attention_mask=attn_mask)
+            # Index of the highest logit along dim 1 is the predicted topic.
+            _, preds = torch.max(outputs.logits, 1)
+        # Left column: predicted label name.
+        col1.write(topics_raw[preds.squeeze(0)])
+        # Right column: softmax over the logits, shown as a bar chart per topic.
+        softmax = torch.nn.Softmax(dim=1)
+        prob = softmax(outputs.logits).squeeze(0).detach()
+        chart_data = pd.DataFrame({
+            'Topic': topics_raw,
+            'Probability': prob
+        })
+        chart_data = chart_data.set_index('Topic')
+        col2.bar_chart(chart_data)
+elif name == 'Date Prediction':
+    st.markdown("## Predict Date")
+    if text:
+        with st.spinner('processing..'):
+            # Same cleaning as the topic branch.
+            text = RegexSubstitution(r'\([^()]+\)|[<>\'"△▲□■]')(text)
+            raw_input_ids = tokenizer.encode(text)
+            # Wrap the encoded ids with the tokenizer's BOS and EOS token ids.
+            input_ids = [tokenizer.bos_token_id] + \
+                raw_input_ids + [tokenizer.eos_token_id]
+            outputs = model.generate(torch.tensor([input_ids]),
+                                         early_stopping=True,
+                                         repetition_penalty=2.0,
+                                         do_sample=True, # use a sampling strategy
+                                         max_length=50, # maximum decoding length is 50
+                                         top_k=50, # tokens ranked outside the top 50 by probability are excluded from sampling
+                                         top_p=0.95, # generate only from the candidate set whose cumulative probability is 95%
+                                         num_return_sequences=3 # decode three results
+                                         )
+        # Show every returned sequence as decoded text, special tokens removed.
+        for output in outputs:
+            pred_print = tokenizer.decode(output.squeeze().tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
+            st.write(pred_print)