Muhammad Murtaza Naqi (Assistant Manager - Data Analyst) committed on
Commit
712d86b
1 Parent(s): 19181ff

supporting files

Article_summarizer.py ADDED
@@ -0,0 +1,99 @@
+ import streamlit as st
+ import requests
+ from bs4 import BeautifulSoup
+ from transformers import pipeline
+
+ # The get_full_article_* helpers come from the scraper module; this page uses its own pipeline
+ from Scrapper_Summarizer import get_full_article_dawn, get_full_article_tnews, get_full_article_brecorder
+
+ # Summarization pipeline for this page
+ # (a lighter alternative: "mrm8488/bert-mini2bert-mini-finetuned-cnn_daily_mail-summarization")
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+ # Scrape a full article and summarize it
+ def get_full_article(url):
+     try:
+         response = requests.get(url, verify=False)  # SSL verification intentionally disabled
+         soup = BeautifulSoup(response.text, 'html.parser')
+         content_div = soup.find('div', class_='story__content')
+
+         if content_div:
+             paragraphs = content_div.find_all('p')
+             full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
+
+             # Truncate the input to keep it within the model's context window
+             summary_obj = summarizer(full_text[:1020])
+
+             # Extract the summary string from the pipeline output
+             summary = summary_obj[0]['summary_text'] if summary_obj else ""
+             st.success("Summary generated successfully!")
+             return summary
+         else:
+             st.error("Content not found in the article.")
+             return "Content not found."
+     except Exception as e:
+         st.error(f"Error fetching the article: {e}")
+         return "Error fetching the article."
+
+
+ def article_sum():
+     # App title
+     st.title("📰 Article Summarizer")
+     st.write("Provide the URL of the article you'd like summarized below, and we'll fetch and summarize it for you!")
+
+     # Input URL from user
+     url = st.text_input("Enter the article URL:", "")
+
+     # Sidebar with buttons for different sources
+     st.sidebar.title("Choose a Source")
+
+     # Button for "The News"
+     if st.sidebar.button("The News"):
+         if url:
+             with st.spinner('Fetching and summarizing the article from The News...'):
+                 full_text = get_full_article_tnews(url)
+                 summary_obj = summarizer(full_text[:1020])
+
+                 # Extract the summary string from the pipeline output
+                 summary = summary_obj[0]['summary_text'] if summary_obj else ""
+                 st.write(summary)
+         else:
+             st.sidebar.error("Please enter the URL of an article from The News.")
+
+     # Button for "Dawn"
+     if st.sidebar.button("Dawn"):
+         if url:
+             with st.spinner('Fetching and summarizing the article from Dawn...'):
+                 full_text = get_full_article_dawn(url)
+                 summary_obj = summarizer(full_text[:1020])
+
+                 # Extract the summary string from the pipeline output
+                 summary = summary_obj[0]['summary_text'] if summary_obj else ""
+                 st.write(summary)
+         else:
+             st.sidebar.error("Please enter the URL of an article from Dawn.")
+
+     # Button for "Business Recorder"
+     if st.sidebar.button("Business Recorder"):
+         if url:
+             with st.spinner('Fetching and summarizing the article from Business Recorder...'):
+                 full_text = get_full_article_brecorder(url)
+                 summary_obj = summarizer(full_text[:1020])
+
+                 # Extract the summary string from the pipeline output
+                 summary = summary_obj[0]['summary_text'] if summary_obj else ""
+                 st.write(summary)
+         else:
+             st.sidebar.error("Please enter the URL of an article from Business Recorder.")
+
+     # Sidebar details and credits
+     st.sidebar.title("About")
+     st.sidebar.write(
+         "This utility fetches articles from a given URL and summarizes them using a pre-trained summarization model.")
+     st.sidebar.markdown("### Model Used")
+     st.sidebar.info("Model: `facebook/bart-large-cnn` (BART-based summarizer)")
+     st.sidebar.markdown("---")
+     st.sidebar.write("Created by Strategy")
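Note: article_sum() only defines this page; nothing in the file invokes it. A minimal launcher, assuming a hypothetical app.py alongside these modules (the file name and page labels below are illustrative, not part of this commit), could wire the two pages together:

import streamlit as st

from Article_summarizer import article_sum
from News_scrapper import News_scrapper

# Hypothetical entrypoint: choose a page in the sidebar and render it
page = st.sidebar.radio("Tool", ("Article Summarizer", "News Scrapper"))
if page == "Article Summarizer":
    article_sum()
else:
    News_scrapper()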
News_scrapper.py ADDED
@@ -0,0 +1,56 @@
+ import pandas as pd
+ import streamlit as st
+ from Scrapper_Summarizer import scrape_dawn, scrape_brecorder, scrape_tnews
+
+ # Return one page of articles for incremental display
+ def load_articles_in_batches(articles, batch_size, offset):
+     return articles[offset:offset + batch_size]
+
+ def News_scrapper():
+     # App title and description
+     st.title("📰 Business News Scrapper & Summarizer")
+     st.write("This app scrapes the latest business news from *Dawn*, *Business Recorder*, and *The News*, and summarizes the articles for easy reading.")
+
+     # Add a sidebar for navigation
+     st.sidebar.write("Use this sidebar to navigate between options.")
+     st.sidebar.markdown("### Scraping Options")
+
+     # Add a button for Dawn scraping
+     if st.sidebar.button('Scrape Dawn News'):
+         st.subheader("Latest Business News from Dawn")
+         with st.spinner("Scraping and summarizing news from Dawn..."):
+             dawn_articles = scrape_dawn()
+             if dawn_articles:
+                 df = pd.DataFrame(dawn_articles)
+                 st.dataframe(df)
+             else:
+                 st.write("No articles found.")
+
+     # Add a button for Business Recorder scraping
+     if st.sidebar.button('Scrape Business Recorder'):
+         st.subheader("Latest Business News from Business Recorder")
+         with st.spinner("Scraping and summarizing news from Business Recorder..."):
+             brecorder_articles = scrape_brecorder()
+             if brecorder_articles:
+                 df = pd.DataFrame(brecorder_articles)
+                 st.dataframe(df)
+             else:
+                 st.write("No articles found.")
+
+     # Add a button for The News scraping
+     if st.sidebar.button('Scrape The News'):
+         st.subheader("Latest Business News from The News")
+         with st.spinner("Scraping and summarizing news from The News..."):
+             tnews_articles = scrape_tnews()
+             if tnews_articles:
+                 df = pd.DataFrame(tnews_articles)
+                 st.dataframe(df)
+             else:
+                 st.write("No articles found.")
+
+     # Sidebar details and beautification
+     st.sidebar.markdown("---")
+     st.sidebar.info("This utility scrapes the latest business articles and generates summaries using the BART summarization model. Great for quick reads!")
+     st.sidebar.markdown("---")
+     st.sidebar.write("Created by Strategy")
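Note: load_articles_in_batches() is defined above but not yet called anywhere in this commit. A sketch of the pagination it appears intended for (BATCH_SIZE and the 'offset' session-state key are assumptions, not part of this commit):

# Hypothetical "Load more" pagination built on load_articles_in_batches
BATCH_SIZE = 5

if 'offset' not in st.session_state:
    st.session_state.offset = 0  # start at the first page

if st.button("Load more"):
    st.session_state.offset += BATCH_SIZE  # advance before rendering

articles = scrape_dawn()  # any of the three scrapers' output works here
batch = load_articles_in_batches(articles, BATCH_SIZE, st.session_state.offset)
st.dataframe(pd.DataFrame(batch))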
Scrapper_Summarizer.py ADDED
@@ -0,0 +1,201 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import csv
+ from transformers import pipeline
+
+ # Text summarization model shared by the scrapers below
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
+
+
+ # Scrape the latest business articles from Dawn
+ def scrape_dawn():
+     url = 'https://www.dawn.com/business'
+     response = requests.get(url, verify=False)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     articles = []
+
+     count = 0  # Counter to track the number of articles scraped
+
+     for item in soup.find_all('article', class_='story'):
+         if count >= 5:  # Stop after 5 articles
+             break
+
+         title_tag = item.find('h2', class_='story__title')
+         if title_tag:
+             title = title_tag.get_text(strip=True)
+             link = title_tag.find('a')['href']
+             full_text = get_full_article_dawn(link)
+
+             # Summarize the full article (truncated to the model's context window)
+             summary_obj = summarizer(full_text[:1020])
+             summary = summary_obj[0]['summary_text'] if summary_obj else ""
+             articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
+
+             count += 1  # Increment the counter
+
+     return articles
+
+
+ # Get the full text of an article from Dawn
+ def get_full_article_dawn(url):
+     response = requests.get(url, verify=False)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     content_div = soup.find('div', class_='story__content')
+     if content_div:
+         paragraphs = content_div.find_all('p')
+         full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
+         return full_text
+     return "Content not found."
+
+
+ # Scrape the latest business articles from Business Recorder
+ def scrape_brecorder():
+     url = 'https://www.brecorder.com/business-finance'
+     response = requests.get(url, verify=False)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     articles = []
+     count = 0  # Counter to track the number of articles scraped
+
+     for item in soup.find_all('article', class_='story'):
+         if count >= 5:  # Stop after 5 articles
+             break
+
+         title_tag = item.find('h2', class_='story__title')
+         if title_tag:
+             title = title_tag.get_text(strip=True)
+             link = title_tag.find('a')['href']
+             full_text = get_full_article_brecorder(link)
+
+             # Summarize the full article (truncated to the model's context window)
+             summary_obj = summarizer(full_text[:1020])
+             summary = summary_obj[0]['summary_text'] if summary_obj else ""
+             articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
+
+             count += 1  # Increment the counter
+
+     return articles
+
+
+ # Get the full text of an article from Business Recorder
+ def get_full_article_brecorder(url):
+     response = requests.get(url, verify=False)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     content_div = soup.find('div', class_='story__content')
+     if content_div:
+         paragraphs = content_div.find_all(['p', 'li'])
+         full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
+         return full_text
+     return "Content not found."
+
+
+ # Scrape the latest business articles from The News
+ def scrape_tnews():
+     url = 'https://www.thenews.com.pk/latest/category/business'
+     response = requests.get(url, verify=False)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     articles = []
+
+     count = 0  # Counter to track the number of articles scraped
+
+     for item in soup.find_all('div', class_='most-popular-box'):
+         if count >= 5:  # Stop after 5 articles
+             break
+
+         # Extract the title from the <h2> tag
+         title_tag = item.find('h2')
+         if title_tag:
+             title = title_tag.get_text(strip=True)
+
+             # Extract the link from the <a> tag
+             link = item.find('a')['href']
+
+             # Fetch and summarize the full article text
+             full_text = get_full_article_tnews(link)
+             summary_obj = summarizer(full_text[:1020])
+             summary = summary_obj[0]['summary_text'] if summary_obj else ""
+
+             # Append the article details
+             articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
+
+             count += 1  # Increment the counter
+
+     return articles
+
+
+ # Get the full text of an article from The News
+ def get_full_article_tnews(url):
+     response = requests.get(url, verify=False)
+     soup = BeautifulSoup(response.text, 'html.parser')
+     content_div = soup.find('div', class_='detail-content')
+     if content_div:
+         paragraphs = content_div.find_all(['p', 'li'])
+         full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
+         return full_text
+     return "Content not found."
+
+
+ # Save scraped articles to a CSV file
+ def save_to_csv(filename, articles):
+     if not articles:
+         print(f"No articles found to save in {filename}.")
+         return
+     keys = articles[0].keys()
+     with open(filename, 'w', newline='', encoding='utf-8') as output_file:
+         dict_writer = csv.DictWriter(output_file, fieldnames=keys)
+         dict_writer.writeheader()
+         dict_writer.writerows(articles)
+
+
+ # Optional batch mode: scrape The News and save the results to CSV
+ def main():
+     tnews_articles = scrape_tnews()
+     save_to_csv('tnews_articles_full.csv', tnews_articles)
+     print("The News articles saved to CSV file successfully.")
+
+     # brecorder_articles = scrape_brecorder()
+     # save_to_csv('brecorder_articles_full.csv', brecorder_articles)
+
+
+ if __name__ == '__main__':
+     main()
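Note: every request in this module passes verify=False, so urllib3 emits an InsecureRequestWarning on each fetch. A sketch for silencing that noise near the top of the module (suppression is a stopgap; supplying a valid CA bundle to requests is the proper fix):

import urllib3

# verify=False skips TLS certificate checks; quiet the warning that produces
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)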
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit==1.30.0
+ transformers==4.30.1
+ bs4
+ pandas
+ numpy
+ requests
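One gap worth flagging: the transformers summarization pipelines in this commit need a deep-learning backend (typically PyTorch) at runtime, and requirements.txt does not install one. An unpinned entry in the style of the file's other loose dependencies would close the gap (the tested torch version is not recorded here):

torch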