Muhammad Murtaza Naqi (Assistant Manager - Data Analyst)
committed 712d86b
Parent(s): 19181ff
supporting files
Files changed:
- Article_summarizer.py +99 -0
- News_scrapper.py +56 -0
- Scrapper_Summarizer.py +201 -0
- requirements.txt +6 -0
Article_summarizer.py
ADDED
@@ -0,0 +1,99 @@
import streamlit as st
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from transformers import AutoModel
# from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM

from Scrapper_Summarizer import get_full_article_dawn, get_full_article_tnews, get_full_article_brecorder, summarizer

# summarizer = pipeline("summarization", model="mrm8488/bert-mini2bert-mini-finetuned-cnn_daily_mail-summarization")

# Overrides the summarizer imported above with a larger BART model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Function to scrape a full article and summarize it
def get_full_article(url):
    try:
        response = requests.get(url, verify=False)
        soup = BeautifulSoup(response.text, 'html.parser')
        content_div = soup.find('div', class_='story__content')

        if content_div:
            paragraphs = content_div.find_all('p')
            full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])

            # Limit the input length (in characters) before summarization
            summary_obj = summarizer(full_text[:1020])

            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            st.success("Summary generated successfully!")
            return summary
        else:
            st.error("Content not found in the article.")
            return "Content not found."
    except Exception as e:
        st.error(f"Error fetching the article: {e}")
        return "Error fetching the article."


def article_sum():
    # App title
    st.title("📰 Article Summarizer")
    st.write("Provide the URL of the article you'd like summarized below, and we'll fetch and summarize it for you!")

    # Input URL from user
    url = st.text_input("Enter the article URL:", "")

    # Sidebar with buttons for different sources
    st.sidebar.title("Choose a Source")

    # Button for "The News"
    if st.sidebar.button("The News"):
        if url:
            with st.spinner('Fetching and summarizing the article from The News...'):
                full_text = get_full_article_tnews(url)
                summary_obj = summarizer(full_text[:1020])

                # Convert the summary object to a string
                summary = summary_obj[0]['summary_text'] if summary_obj else ""
                st.write(summary)
        else:
            st.sidebar.error("Please enter the URL of an article from The News.")

    # Button for "The Dawn"
    if st.sidebar.button("The Dawn"):
        if url:
            with st.spinner('Fetching and summarizing the article from The Dawn...'):
                full_text = get_full_article_dawn(url)
                summary_obj = summarizer(full_text[:1020])

                # Convert the summary object to a string
                summary = summary_obj[0]['summary_text'] if summary_obj else ""
                st.write(summary)
        else:
            st.sidebar.error("Please enter the URL of an article from The Dawn.")

    # Button for "Business Recorder"
    if st.sidebar.button("Business Recorder"):
        if url:
            with st.spinner('Fetching and summarizing the article from Business Recorder...'):
                full_text = get_full_article_brecorder(url)
                summary_obj = summarizer(full_text[:1020])

                # Convert the summary object to a string
                summary = summary_obj[0]['summary_text'] if summary_obj else ""
                st.write(summary)
        else:
            st.sidebar.error("Please enter the URL of an article from Business Recorder.")

    # Sidebar details and credits
    st.sidebar.title("About")
    st.sidebar.write(
        "This utility fetches articles from a given URL and summarizes them using a pre-trained summarization model.")
    st.sidebar.markdown("### Model Used")
    st.sidebar.info("Model: `facebook/bart-large-cnn` (BART-based summarizer)")
    st.sidebar.markdown("---")
    st.sidebar.write("Created by Strategy")
News_scrapper.py
ADDED
@@ -0,0 +1,56 @@
import pandas as pd
import streamlit as st
from Scrapper_Summarizer import scrape_dawn, scrape_brecorder, scrape_tnews

# Return one batch of articles starting at the given offset
def load_articles_in_batches(articles, batch_size, offset):
    return articles[offset:offset + batch_size]

def News_scrapper():
    # App title and description
    st.title("📰 Business News Scrapper & Summarizer")
    st.write("This app scrapes the latest business news from *Dawn*, *Business Recorder*, and *The News*, and summarizes the articles for easy reading.")

    # Add a sidebar for navigation
    st.sidebar.write("Use this sidebar to navigate between options.")
    st.sidebar.markdown("### Scraping Options")

    # Add a button for Dawn News scraping
    if st.sidebar.button('Scrape Dawn News'):
        st.subheader("Latest Business News from Dawn")
        with st.spinner("Scraping and summarizing news from Dawn..."):
            dawn_articles = scrape_dawn()
            if dawn_articles:
                df = pd.DataFrame(dawn_articles)
                st.dataframe(df)
            else:
                st.write("No articles found.")

    # Add a button for Business Recorder scraping
    if st.sidebar.button('Scrape Business Recorder'):
        st.subheader("Latest Business News from Business Recorder")
        with st.spinner("Scraping and summarizing news from Business Recorder..."):
            brecorder_articles = scrape_brecorder()
            if brecorder_articles:
                df = pd.DataFrame(brecorder_articles)
                st.dataframe(df)
            else:
                st.write("No articles found.")

    # Add a button for The News scraping
    if st.sidebar.button('Scrape The News'):
        st.subheader("Latest Business News from The News")
        with st.spinner("Scraping and summarizing news from The News..."):
            tnews_articles = scrape_tnews()
            if tnews_articles:
                df = pd.DataFrame(tnews_articles)
                st.dataframe(df)
            else:
                st.write("No articles found.")

    # Sidebar details and beautification
    st.sidebar.markdown("---")
    st.sidebar.info("This utility scrapes the latest business articles and generates summaries using the BART summarization model. Great for quick reads!")
    st.sidebar.markdown("---")
    st.sidebar.write("Created by Strategy")
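
load_articles_in_batches is defined above but never called in this commit. Below is a rough sketch of how it could page through scraped articles using Streamlit session state; the function name show_articles_paged and the batch size are assumptions, and it relies on the st, pd, and load_articles_in_batches names already in this module.

def show_articles_paged(articles, batch_size=5):
    # Hypothetical pager, not in the committed file
    if "offset" not in st.session_state:
        st.session_state.offset = 0  # persists across Streamlit reruns
    if st.button("Load more") and st.session_state.offset + batch_size < len(articles):
        st.session_state.offset += batch_size
    batch = load_articles_in_batches(articles, batch_size, st.session_state.offset)
    st.dataframe(pd.DataFrame(batch))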
Scrapper_Summarizer.py
ADDED
@@ -0,0 +1,201 @@
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
from transformers import pipeline
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

# Text summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


# Function to scrape the latest business articles from Dawn
def scrape_dawn():
    url = 'https://www.dawn.com/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []

    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_dawn(link)
            # Summarize the full article
            summary_obj = summarizer(full_text[:1020])

            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})

            count += 1  # Increment the counter

    return articles

# Function to get the full text of an article from Dawn
def get_full_article_dawn(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all('p')
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."


# Function to scrape articles from Business Recorder
def scrape_brecorder():
    url = 'https://www.brecorder.com/business-finance'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_brecorder(link)
            # Summarize the full article
            summary_obj = summarizer(full_text[:1020])

            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})

            count += 1  # Increment the counter

    return articles

# Function to get the full text of an article from Business Recorder
def get_full_article_brecorder(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."


# Earlier version of The News scraper, kept commented out for reference
# def scrape_tnews():
#     url = 'https://www.thenews.com.pk/latest/category/business'
#     response = requests.get(url, verify=False)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     articles = []
#
#     count = 0  # Counter to track the number of articles scraped
#
#     for item in soup.find_all('div', class_='most-popular-box'):
#         if count >= 2:  # Stop after 2 articles
#             break
#
#         title_tag = item.find('h2', class_='most-popular-list')
#         if title_tag:
#             title = title_tag.get_text(strip=True)
#             link = title_tag.find('a')['href']
#             full_text = get_full_article_tnews(link)
#             # Summarize the full article
#             summary_obj = summarizer(full_text[:1020])
#
#             # Convert the summary object to a string
#             summary = summary_obj[0]['summary_text'] if summary_obj else ""
#             articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
#
#             count += 1  # Increment the counter
#
#     return articles


# Function to scrape articles from The News
def scrape_tnews():
    url = 'https://www.thenews.com.pk/latest/category/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []

    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('div', class_='most-popular-box'):
        if count >= 5:  # Stop after 5 articles
            break

        # Extract the title from the <h2> tag
        title_tag = item.find('h2')
        if title_tag:
            title = title_tag.get_text(strip=True)

            # Extract the link from the <a> tag
            link = item.find('a')['href']

            # Fetch and process the full article text
            full_text = get_full_article_tnews(link)

            # Summarize the full article
            summary_obj = summarizer(full_text[:1020])
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            # Append the article details
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})

            count += 1  # Increment the counter

    return articles


# Function to get the full text of an article from The News
def get_full_article_tnews(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='detail-content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."

# Function to save articles to a CSV file
def save_to_csv(filename, articles):
    if not articles:
        print(f"No articles found to save in {filename}.")
        return
    keys = articles[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(articles)


# # Main function to scrape articles and save them to CSV
# def main():
#     # Scraping articles from The News
#     tnews_articles = scrape_tnews()
#     save_to_csv('tnews_articles_full.csv', tnews_articles)
#     print("The News articles saved to CSV file successfully.")
#
#     # Scraping articles from Business Recorder
#     # brecorder_articles = scrape_brecorder()
#     # save_to_csv('brecorder_articles_full.csv', brecorder_articles)
#     # print("Business Recorder articles saved to CSV file successfully.")
#
#
# if __name__ == '__main__':
#     main()

# Scratch code used while inspecting The News markup:
# url = 'https://www.thenews.com.pk/latest/category/business'
# response = requests.get(url, verify=False)
# soup = BeautifulSoup(response.text, 'html.parser')
# s = soup.find_all('div', class_='most-popular-box')
# print(s)
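
Every request in this module is made with requests.get(url, verify=False), which disables TLS verification, triggers an InsecureRequestWarning on each call, and sets no timeout. Below is a minimal sketch, not part of this commit, of a shared fetch helper that centralises this; the helper name fetch_soup and the 10-second timeout are assumptions, and it reuses the requests and BeautifulSoup imports already at the top of the file.

import urllib3

# Silence the warning that verify=False triggers on every request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def fetch_soup(url, timeout=10):
    # Hypothetical helper, not in the committed file
    response = requests.get(url, verify=False, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of parsing error pages
    return BeautifulSoup(response.text, 'html.parser')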
requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit==1.30.0
transformers==4.30.1
bs4
pandas
numpy
requests
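
Neither Article_summarizer.py nor News_scrapper.py calls its page function, so the Space presumably has a separate entry point that is not part of this commit. A hedged sketch of what such an app.py could look like follows; the file name, radio labels, and layout are assumptions. Note also that the transformers summarization pipeline needs a backend such as PyTorch installed, which may need to be added to requirements.txt if the Space image does not already provide it.

# Hypothetical app.py, not included in this commit
import streamlit as st
from Article_summarizer import article_sum
from News_scrapper import News_scrapper

page = st.sidebar.radio("Mode", ["Article Summarizer", "News Scrapper"])
if page == "Article Summarizer":
    article_sum()
else:
    News_scrapper()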