Commit dba06a0
Parent(s): 5fdb12d
first commit

- app.py +221 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,221 @@
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline

from bs4 import BeautifulSoup
import requests

# app layout #
st.set_page_config(
    page_title="Medium News App"
)

## FUNCTIONS ##

# search Medium urls function #
@st.cache_resource
def search_meduim_urls(monitored_tickers):
    search_url = "https://medium.com/tag/{}".format(monitored_tickers)
    r = requests.get(search_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # links to articles are found in a-tags with attribute "aria-label"="Post Preview Title" #
    atags = soup.find_all('a', attrs={"aria-label": "Post Preview Title"})
    hrefs = ['https://medium.com' + link['href'] for link in atags]
    return hrefs

# function to search and scrape cleaned urls #
@st.cache_resource
def scrape_and_process(URLs):
    """
    - grab all p-tags from each article page.
    - build a list of the text in every p-tag.
    - split the joined text into individual words, max 350.
    - join them back into one corpus per article.
    - each article is capped at 350 words because the model's max input
      length is 512 tokens and I want the app to be faster.
    """
    ARTICLES = []
    for url in URLs:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        paragraphs = soup.find_all('p')
        text = [paragraph.text for paragraph in paragraphs]
        words = ' '.join(text).split(' ')[:350]
        ARTICLE = ' '.join(words)
        ARTICLES.append(ARTICLE)
    return ARTICLES

# function to summarise all articles #
@st.cache_resource
def summarize(articles, _tokenizer, _model):
    """
    encode, generate, decode, append to list
    """
    summaries = []
    for article in articles:
        input_ids = _tokenizer.encode(article, return_tensors='pt', max_length=512, truncation=True)
        output = _model.generate(input_ids, max_length=56, num_beams=5, early_stopping=True)
        summary = _tokenizer.decode(output[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries

# function to load the summarisation transformer #
@st.cache_resource
def load_summary_transformer():
    # load tokenizer and model #
    model_name = "facebook/bart-large-cnn"
    tokenizer_summary = AutoTokenizer.from_pretrained(model_name)
    model_summary = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    return tokenizer_summary, model_summary

# function to load the sentiment pipeline #
@st.cache_resource
def load_sentiment_pipeline():
    sentiment = pipeline('sentiment-analysis')

    return sentiment

# function to create final output #
# (reads the global monitored_tickers defined in the search section below) #
def create_output_array(summaries, scores, urls):
    output = []
    for ticker in monitored_tickers:
        for counter in range(len(summaries[ticker])):
            output_this = [
                ticker,
                summaries[ticker][counter],
                scores[ticker][counter]['label'],
                scores[ticker][counter]['score'],
                urls[ticker][counter]
            ]
            output.append(output_this)
    return output

# display summary output as a Bootstrap card #
def cards(title, score, sentiment, article, link):
    return f"""
    <div class="card bg-light mb-3">
      <div class="card-body">
        <h5 class="card-title">{title}</h5>
        <h6 class="card-subtitle mb-2 text-muted">The article is: {score*100:.2f}% {sentiment}.</h6>
        <p class="card-text">{article}.</p>
        <a href="{link}" class="card-link">Link to article</a>
      </div>
    </div>
    <br></br>
    """

# function to load Bootstrap #
def boot():
    return """
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>
    """

# load Bootstrap #
st.markdown(boot(), unsafe_allow_html=True)

# load summarisation tokenizer and model #
tokenizer_summary, model_summary = load_summary_transformer()

# load sentiment pipeline #
sentiment = load_sentiment_pipeline()


## APP OUTPUT ##
st.markdown("<h1 style='text-align: center; color: grey;'>Medium News App</h1>",
            unsafe_allow_html=True)

# containers #
col1, col2, col3 = st.columns(3)

# session_state user input initialization #
if 'user_final_input' not in st.session_state:
    st.session_state['user_final_input'] = ''

# SEARCH SECTION #
with st.expander("Make inquiry"):
    st.markdown("<h2 style='text-align: center; color: black;'>Summary</h2>",
                unsafe_allow_html=True)
    # user input #
    monitored_tickers = []

    # user input options #
    option = st.selectbox(
        'Some options to select',
        ('chatgpt', 'fastai', 'pytorch', 'tensorflow', 'manual entry')
    )
    # allow for manual search entry #
    if option == "manual entry":
        user_select = st.text_input(
            "Please enter a Data Science topic of interest: ")
        monitored_tickers.append(user_select)
        st.write(user_select)
        st.session_state['user_final_input'] = user_select
    else:
        monitored_tickers.append(option)
        st.write(option)
        st.session_state['user_final_input'] = option

    # how many summaries to inference #
    summary_count = st.slider('How many summaries do you want?', 1, 5, 1)
    st.write("I'm selecting ", summary_count, 'summaries.')
    if summary_count == 3:
        st.markdown("""
        <div class="alert alert-warning" role="alert">
          The summary will take about 1 minute to process.
        </div>
        """, unsafe_allow_html=True)
    elif summary_count == 4 or summary_count == 5:
        st.markdown("""
        <div class="alert alert-danger" role="alert">
          The summary will take about 2 minutes to process.
        </div>
        """, unsafe_allow_html=True)

    with st.form(key="user_input"):
        summary = st.form_submit_button("Summary")
        if summary:
            # test function #
            search_meduim_urls(monitored_tickers[0])
            # make a dictionary {framework: links to articles about the framework} #
            cleaned_urls = {framework: search_meduim_urls(framework) for framework in monitored_tickers}

            articles = {ticker: scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}

            # keep only as many articles as the user asked for #
            articles[st.session_state['user_final_input']] = articles[st.session_state['user_final_input']][:summary_count]
            # articles[option] = articles[option][:summary_count]

            # summary #
            # about 1m 25s to summarize #
            summaries = {ticker: summarize(articles[ticker], tokenizer_summary, model_summary) for ticker in monitored_tickers}

            scores = {ticker: sentiment(summaries[ticker]) for ticker in monitored_tickers}

            final_output = create_output_array(summaries, scores, cleaned_urls)

            # each row is [ticker, summary, label, score, url]; reordered to (title, score, sentiment, article, link) #
            for i in range(len(final_output)):
                st.markdown(
                    cards(
                        final_output[i][0],
                        final_output[i][3],
                        final_output[i][2],
                        final_output[i][1],
                        final_output[i][4]
                    ),
                    unsafe_allow_html=True)
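For a quick sanity check outside Streamlit, the same scrape-and-summarize pipeline can be exercised in a plain script. The following is a minimal sketch, not part of the commit: it assumes network access, that Medium still serves the "Post Preview Title" anchors the scraper relies on, and that a PyTorch backend is installed for transformers; the tag name and single-article limit are illustrative.

# sanity_check.py — a minimal sketch mirroring the app.py pipeline (not part of this commit).
# Assumes network access, Medium's "Post Preview Title" markup, and an installed PyTorch backend.
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tag = "pytorch"  # illustrative tag
soup = BeautifulSoup(requests.get(f"https://medium.com/tag/{tag}").text, "html.parser")
links = ["https://medium.com" + a["href"]
         for a in soup.find_all("a", attrs={"aria-label": "Post Preview Title"})]

if links:
    # scrape the first article and cap it at 350 words, as scrape_and_process does
    page = BeautifulSoup(requests.get(links[0]).text, "html.parser")
    article = " ".join(" ".join(p.text for p in page.find_all("p")).split(" ")[:350])

    # summarize with the same model and generation settings as summarize()
    tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
    input_ids = tokenizer.encode(article, return_tensors="pt", max_length=512, truncation=True)
    output = model.generate(input_ids, max_length=56, num_beams=5, early_stopping=True)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
else:
    print("No articles found for this tag; Medium's markup may have changed.")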
requirements.txt
ADDED
@@ -0,0 +1,4 @@
streamlit
beautifulsoup4
transformers
requests
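To run the app locally, presumably the standard Streamlit workflow applies: install the four packages above with pip install -r requirements.txt, make sure a PyTorch build is available in the environment (transformers needs it to load facebook/bart-large-cnn, and it is not pinned here), then launch with streamlit run app.py.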