victorbahlangene committed
Commit dba06a0 · Parent: 5fdb12d

first commit

Files changed (2)
  1. app.py +221 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,221 @@
+ import streamlit as st
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from transformers import pipeline
+
+ from bs4 import BeautifulSoup
+ import requests
+
+ # app layout #
+ st.set_page_config(
+     page_title="Medium News App"
+ )
+
+ ## FUNCTIONS ##
+
+ # search medium urls function #
+ @st.cache_resource
+ def search_medium_urls(monitored_tickers):
+     search_url = "https://medium.com/tag/{}".format(monitored_tickers)
+     r = requests.get(search_url)
+     soup = BeautifulSoup(r.text, 'html.parser')
+     # links to articles live in a-tags with the attribute aria-label="Post Preview Title" #
+     atags = soup.find_all('a', attrs={"aria-label": "Post Preview Title"})
+     hrefs = ['https://medium.com' + link['href'] for link in atags]
+     return hrefs
+
+ # function to search and scrape the cleaned urls #
+ @st.cache_resource
+ def scrape_and_process(URLs):
+     """
+     - grab all p-tags from each article page.
+     - collect the text of every p-tag into a list.
+     - split the joined text into individual words, max 350.
+     - join the words back into one corpus per article.
+     - each article is capped at 350 words because the max input of
+       the model I am using is 512 tokens and I want the app to be faster.
+     """
+     ARTICLES = []
+     for url in URLs:
+         r = requests.get(url)
+         soup = BeautifulSoup(r.text, 'html.parser')
+         paragraphs = soup.find_all('p')
+         text = [paragraph.text for paragraph in paragraphs]
+         words = ' '.join(text).split(' ')[:350]
+         ARTICLE = ' '.join(words)
+         ARTICLES.append(ARTICLE)
+     return ARTICLES
+
+ # function to summarise all articles #
+ @st.cache_resource
+ def summarize(articles, _tokenizer, _model):
+     """
+     encode, generate, decode, append to list.
+     the leading underscores tell Streamlit's cache not to hash the
+     tokenizer and model arguments.
+     """
+     summaries = []
+     for article in articles:
+         input_ids = _tokenizer.encode(article, return_tensors='pt', max_length=512, truncation=True)
+         output = _model.generate(input_ids, max_length=56, num_beams=5, early_stopping=True)
+         summary = _tokenizer.decode(output[0], skip_special_tokens=True)
+         summaries.append(summary)
+     return summaries
+
+ # function to load the transformer #
+ @st.cache_resource
+ def load_summary_transformer():
+     model_name = "facebook/bart-large-cnn"
+     tokenizer_summary = AutoTokenizer.from_pretrained(model_name)
+     model_summary = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+     return tokenizer_summary, model_summary
+
+ # function to load the sentiment pipeline #
+ @st.cache_resource
+ def load_sentiment_pipeline():
+     sentiment = pipeline('sentiment-analysis')
+
+     return sentiment
+
+ # function to create the final output #
+ def create_output_array(summaries, scores, urls, tickers):
+     output = []
+     for ticker in tickers:
+         for counter in range(len(summaries[ticker])):
+             output_this = [
+                 ticker,
+                 summaries[ticker][counter],
+                 scores[ticker][counter]['label'],
+                 scores[ticker][counter]['score'],
+                 urls[ticker][counter]
+             ]
+             output.append(output_this)
+     return output
+
+ # display summary output #
+ def cards(title, score, sentiment, article, link):
+     return f"""
+     <div class="card bg-light mb-3">
+       <div class="card-body">
+         <h5 class="card-title">{title}</h5>
+         <h6 class="card-subtitle mb-2 text-muted">The article is {score*100:.2f}% {sentiment}.</h6>
+         <p class="card-text">{article}</p>
+         <a href="{link}" class="card-link">Link to article</a>
+       </div>
+     </div>
+     <br>
+     """
+
+ # function to load Bootstrap #
+ def boot():
+     return """
+     <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
+     <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/js/bootstrap.min.js" integrity="sha384-JZR6Spejh4U02d8jOt6vLEHfe/JQGiRRSQQxSfFWpi1MquVdAyjUar5+76PVCmYl" crossorigin="anonymous"></script>
+     """
+
+ # load Bootstrap #
+ st.markdown(boot(), unsafe_allow_html=True)
+
+ # load the summary transformer #
+ tokenizer_summary, model_summary = load_summary_transformer()
+
+ # load the sentiment pipeline #
+ sentiment = load_sentiment_pipeline()
+
+
+ ## APP OUTPUT ##
+ st.markdown("<h1 style='text-align: center; color: grey;'>Medium News App</h1>",
+             unsafe_allow_html=True)
+
+ # containers #
+ col1, col2, col3 = st.columns(3)
+
+ # session_state user input initialization #
+ if 'user_final_input' not in st.session_state:
+     st.session_state['user_final_input'] = ''
+
+ # SEARCH SECTION #
+ with st.expander("Make inquiry"):
+     st.markdown("<h2 style='text-align: center; color: black;'>Summary</h2>",
+                 unsafe_allow_html=True)
+     # user input #
+     monitored_tickers = []
+
+     # user input options #
+     option = st.selectbox(
+         'Some options to select',
+         ('chatgpt', 'fastai', 'pytorch', 'tensorflow', 'manual entry')
+     )
+     # allows for manual search entry #
+     if option == "manual entry":
+         user_select = st.text_input(
+             "Please enter a Data Science topic of interest: ")
+         monitored_tickers.append(user_select)
+         st.write(user_select)
+         st.session_state['user_final_input'] = user_select
+     else:
+         monitored_tickers.append(option)
+         st.write(option)
+         st.session_state['user_final_input'] = option
+
+
+ # how many summaries to inference #
+ summary_count = st.slider('How many summaries do you want?', 1, 5, 1)
+ st.write("You selected ", summary_count, 'summaries.')
+ if summary_count == 3:
+     st.markdown("""
+     <div class="alert alert-warning" role="alert">
+       The summary will take about 1 minute to process.
+     </div>
+     """, unsafe_allow_html=True)
+ elif summary_count in (4, 5):
+     st.markdown("""
+     <div class="alert alert-danger" role="alert">
+       The summary will take about 2 minutes to process.
+     </div>
+     """, unsafe_allow_html=True)
+
+
+ with st.form(key="user_input"):
+     summary = st.form_submit_button("Summary")
+     if summary:
+         # make a dictionary {framework: links_to_articles_about_the_framework} #
+         cleaned_urls = {framework: search_medium_urls(framework) for framework in monitored_tickers}
+
+         articles = {ticker: scrape_and_process(cleaned_urls[ticker]) for ticker in monitored_tickers}
+
+         # keep only as many articles as the user asked for #
+         articles[st.session_state['user_final_input']] = articles[st.session_state['user_final_input']][:summary_count]
+
+         # summary (takes about 1m 25s to summarize) #
+         summaries = {ticker: summarize(articles[ticker], tokenizer_summary, model_summary) for ticker in monitored_tickers}
+
+         scores = {ticker: sentiment(summaries[ticker]) for ticker in monitored_tickers}
+
+         final_output = create_output_array(summaries, scores, cleaned_urls, monitored_tickers)
+
+         # render one Bootstrap card per summary #
+         for i in range(len(final_output)):
+             st.markdown(
+                 cards(
+                     final_output[i][0],
+                     final_output[i][3],
+                     final_output[i][2],
+                     final_output[i][1],
+                     final_output[i][4]
+                 ),
+                 unsafe_allow_html=True)
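A quick sanity check of the scraping step outside Streamlit (a minimal sketch, assuming network access; Medium's markup can change, so the aria-label selector copied from the app may need updating):

    import requests
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(requests.get("https://medium.com/tag/pytorch").text, "html.parser")
    atags = soup.find_all("a", attrs={"aria-label": "Post Preview Title"})
    # print the first three article links, as search_medium_urls would collect them
    print(["https://medium.com" + a["href"] for a in atags][:3])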
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ streamlit
+ beautifulsoup4
+ transformers
+ requests
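Note: summarize() passes PyTorch tensors to the model (return_tensors='pt'), and the transformers package does not install a backend on its own, so torch needs to be installed alongside these requirements before launching the app with: streamlit run app.py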