import gradio as gr
import requests
from bs4 import BeautifulSoup
from newspaper import Article, Config
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
# Load the summarization model and tokenizer once at import time;
# reloading them on every call would make crawling dozens of articles very slow.
tokenizer = PreTrainedTokenizerFast.from_pretrained("ainize/kobart-news")
summary_model = BartForConditionalGeneration.from_pretrained("ainize/kobart-news")


def get_summary(input_text):
    """Summarize Korean news text with the ainize/kobart-news model."""
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    summary_text_ids = summary_model.generate(
        input_ids=input_ids,
        length_penalty=2.0,
        top_p=0.9,
        max_length=128,
        min_length=12,
        num_beams=2,
    )
    # For reference, the model's own "task_specific_params" defaults for
    # summarization are: length_penalty=1.0, max_length=128, min_length=12, num_beams=4
    return tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)
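

# Usage sketch (hypothetical input, for testing the summarizer in a Python shell;
# "article.txt" is an assumed local file, not part of this app):
#   text = open("article.txt", encoding="utf-8").read()
#   print(get_summary(text[:1500]))  # article bodies are truncated the same way below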
# Crawl with a desktop browser user agent so news pages do not reject the request.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0'
config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10
class news_collector:
    def __init__(self):
        self.examples_text = []

    def get_new_parser(self, url):
        # Download and parse a single article; the shared config supplies
        # the browser user agent and the request timeout.
        article = Article(url, config=config, language='ko')
        article.download()
        article.parse()
        return article

    def get_news_links(self):
        # Scrape the Daum breaking-news economy page for article links.
        url = "https://news.daum.net/breakingnews/economic"
        response = requests.get(url, headers={'User-Agent': USER_AGENT}, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        news_titles = soup.select("a.link_txt")
        links = [item.attrs['href'] for item in news_titles]
        # Keep only absolute https URLs; the other anchors are in-site navigation.
        https_links = [item for item in links if item.startswith('https')]
        return https_links

    def update_news_examples(self):
        # Summarize each crawled article (truncated to 1,500 characters to keep
        # the summarizer input short) and store the summary with its URL.
        news_links = self.get_news_links()
        for news_url in news_links:
            article = self.get_new_parser(news_url)
            if article.text:
                self.examples_text.append([get_summary(article.text[:1500]), news_url])
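

# Usage sketch (hypothetical, for testing the crawler outside the UI):
#   collector = news_collector()
#   links = collector.get_news_links()           # absolute https links only
#   article = collector.get_new_parser(links[0])
#   print(article.title)
#   print(get_summary(article.text[:1500]))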
title = "๊ท ํ์กํ ๋ด์ค ์ฝ๊ธฐ (Balanced News Reading)"
with gr.Blocks(theme='pseudolab/huggingface-korea-theme') as demo:
collector = news_collector()
collector.update_news_examples()
    with gr.Tab("About"):
        gr.Markdown(
            """
            # Balanced News Reading

            Check whether an article leans positive or negative while you read it.
            Recent economy articles are crawled so you can try them right away from the examples.

            ## 1. How to use
            Economy articles are fetched from Daum News, summarized, and listed under `Examples`.
            Pick the article you want to analyze from `Examples` and press `Submit`; the sentiment
            result for that article appears under `Classification`. Sentiment is reported as one of
            `neutral`, `positive`, or `negative`, together with the probability of each state.

            ## 2. How it works
            Crawl news articles and summarize them with a summarization model >> add the summaries
            to `Examples` >> run a Korean fine-tuned sentiment model on the selected article.
            """)
    with gr.Tab("Demo"):
        Link_TXT = gr.Textbox(label="News content", placeholder="Enter the news article text.")
        gr.load("models/gabrielyang/finance_news_classifier-KR_v7",
                # gr.load("models/Hyeonseo/ko-finance_news_classifier",
                inputs=Link_TXT)
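        # Alternative sketch (an assumption, not part of this app): the same
        # classifier could run locally with a transformers pipeline instead of
        # being loaded from the Hub via gr.load, e.g.:
        #   from transformers import pipeline
        #   classifier = pipeline("text-classification",
        #                         model="gabrielyang/finance_news_classifier-KR_v7")
        #   classifier(news_text)  # -> [{'label': ..., 'score': ...}]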
        Link_URL = gr.Textbox(label="News URL")
        # Disabled: gr.Examples cannot be reloaded dynamically after launch.
        # update_button = gr.Button(value="Update news data")
        # update_button.click(fn=collector.update_news_examples, inputs=None, outputs=None)
        gr.Examples(
            collector.examples_text,
            [Link_TXT, Link_URL],
        )
if __name__ == "__main__":
demo.launch() |