Muhammad Murtaza Naqi (Assistant Manager - Data Analyst)
committed 712d86b
Parent(s): 19181ff
supporting files
Files changed:
- Article_summarizer.py +99 -0
- News_scrapper.py +56 -0
- Scrapper_Summarizer.py +201 -0
- requirements.txt +6 -0
Article_summarizer.py
ADDED
@@ -0,0 +1,99 @@
import streamlit as st
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from transformers import AutoModel
# from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM

from Scrapper_Summarizer import get_full_article_dawn, get_full_article_tnews, get_full_article_brecorder, summarizer

# summarizer = pipeline("summarization", model="mrm8488/bert-mini2bert-mini-finetuned-cnn_daily_mail-summarization")

# Overrides the summarizer imported above with a larger BART model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Function to scrape a full article and summarize it
def get_full_article(url):
    try:
        response = requests.get(url, verify=False)
        soup = BeautifulSoup(response.text, 'html.parser')
        content_div = soup.find('div', class_='story__content')

        if content_div:
            paragraphs = content_div.find_all('p')
            full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])

            # Limit the input length (in characters) before summarization
            summary_obj = summarizer(full_text[:1020])

            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            st.success("Summary generated successfully!")
            return summary
        else:
            st.error("Content not found in the article.")
            return "Content not found."
    except Exception as e:
        st.error(f"Error fetching the article: {e}")
        return "Error fetching the article."


def article_sum():
    # App title
    st.title("📰 Article Summarizer")
    st.write("Provide the URL of the article you'd like summarized below, and we'll fetch and summarize it for you!")

    # Input URL from user
    url = st.text_input("Enter the article URL:", "")

    # Sidebar with buttons for different sources
    st.sidebar.title("Choose a Source")

    # Button for "The News"
    if st.sidebar.button("The News"):
        if url:
            with st.spinner('Fetching and summarizing the article from The News...'):
                full_text = get_full_article_tnews(url)
                summary_obj = summarizer(full_text[:1020])

                # Convert the summary object to a string
                summary = summary_obj[0]['summary_text'] if summary_obj else ""
                st.write(summary)
        else:
            st.sidebar.error("Please enter the URL of an article from The News.")

    # Button for "The Dawn"
    if st.sidebar.button("The Dawn"):
        if url:
            with st.spinner('Fetching and summarizing the article from The Dawn...'):
                full_text = get_full_article_dawn(url)
                summary_obj = summarizer(full_text[:1020])

                # Convert the summary object to a string
                summary = summary_obj[0]['summary_text'] if summary_obj else ""
                st.write(summary)
        else:
            st.sidebar.error("Please enter the URL of an article from The Dawn.")

    # Button for "Business Recorder"
    if st.sidebar.button("Business Recorder"):
        if url:
            with st.spinner('Fetching and summarizing the article from Business Recorder...'):
                full_text = get_full_article_brecorder(url)
                summary_obj = summarizer(full_text[:1020])

                # Convert the summary object to a string
                summary = summary_obj[0]['summary_text'] if summary_obj else ""
                st.write(summary)
        else:
            st.sidebar.error("Please enter the URL of an article from Business Recorder.")

    # Sidebar details and credits
    st.sidebar.title("About")
    st.sidebar.write(
        "This utility fetches articles from a given URL and summarizes them using a pre-trained summarization model.")
    st.sidebar.markdown("### Model Used")
    st.sidebar.info("Model: `facebook/bart-large-cnn` (BART-based summarizer)")
    st.sidebar.markdown("---")
    st.sidebar.write("Created by Strategy")
News_scrapper.py
ADDED
@@ -0,0 +1,56 @@
import pandas as pd
import streamlit as st
from Scrapper_Summarizer import scrape_dawn, scrape_brecorder, scrape_tnews

# Return one batch of articles starting at the given offset
def load_articles_in_batches(articles, batch_size, offset):
    return articles[offset:offset + batch_size]

def News_scrapper():
    # App title and description
    st.title("📰 Business News Scrapper & Summarizer")
    st.write("This app scrapes the latest business news from *Dawn*, *Business Recorder*, and *The News*, and summarizes the articles for easy reading.")

    # Add a sidebar for navigation
    st.sidebar.write("Use this sidebar to navigate between options.")
    st.sidebar.markdown("### Scraping Options")

    # Add a button for Dawn News scraping
    if st.sidebar.button('Scrape Dawn News'):
        st.subheader("Latest Business News from Dawn")
        with st.spinner("Scraping and summarizing news from Dawn..."):
            dawn_articles = scrape_dawn()
            if dawn_articles:
                df = pd.DataFrame(dawn_articles)
                st.dataframe(df)
            else:
                st.write("No articles found.")

    # Add a button for Business Recorder scraping
    if st.sidebar.button('Scrape Business Recorder'):
        st.subheader("Latest Business News from Business Recorder")
        with st.spinner("Scraping and summarizing news from Business Recorder..."):
            brecorder_articles = scrape_brecorder()
            if brecorder_articles:
                df = pd.DataFrame(brecorder_articles)
                st.dataframe(df)
            else:
                st.write("No articles found.")

    # Add a button for The News scraping
    if st.sidebar.button('Scrape The News'):
        st.subheader("Latest Business News from The News")
        with st.spinner("Scraping and summarizing news from The News..."):
            tnews_articles = scrape_tnews()
            if tnews_articles:
                df = pd.DataFrame(tnews_articles)
                st.dataframe(df)
            else:
                st.write("No articles found.")

    # Sidebar details and beautification
    st.sidebar.markdown("---")
    st.sidebar.info("This utility scrapes the latest business articles and generates summaries using the BART summarization model. Great for quick reads!")
    st.sidebar.markdown("---")
    st.sidebar.write("Created by Strategy")
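
load_articles_in_batches is defined above but never called in this commit. Below is a rough sketch of how it could page through scraped articles using Streamlit session state; the function name show_articles_paged and the batch size are assumptions, and it relies on the st, pd, and load_articles_in_batches names already in this module.

def show_articles_paged(articles, batch_size=5):
    # Hypothetical pager, not in the committed file
    if "offset" not in st.session_state:
        st.session_state.offset = 0  # persists across Streamlit reruns
    if st.button("Load more") and st.session_state.offset + batch_size < len(articles):
        st.session_state.offset += batch_size
    batch = load_articles_in_batches(articles, batch_size, st.session_state.offset)
    st.dataframe(pd.DataFrame(batch))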
Scrapper_Summarizer.py
ADDED
@@ -0,0 +1,201 @@
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
from transformers import pipeline
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration, TFPegasusForConditionalGeneration

# Text summarization model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


# Function to scrape the latest business articles from Dawn
def scrape_dawn():
    url = 'https://www.dawn.com/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []

    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_dawn(link)
            # Summarize the full article
            summary_obj = summarizer(full_text[:1020])

            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})

            count += 1  # Increment the counter

    return articles

# Function to get the full text of an article from Dawn
def get_full_article_dawn(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all('p')
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."


# Function to scrape articles from Business Recorder
def scrape_brecorder():
    url = 'https://www.brecorder.com/business-finance'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []
    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('article', class_='story'):
        if count >= 5:  # Stop after 5 articles
            break

        title_tag = item.find('h2', class_='story__title')
        if title_tag:
            title = title_tag.get_text(strip=True)
            link = title_tag.find('a')['href']
            full_text = get_full_article_brecorder(link)
            # Summarize the full article
            summary_obj = summarizer(full_text[:1020])

            # Convert the summary object to a string
            summary = summary_obj[0]['summary_text'] if summary_obj else ""
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})

            count += 1  # Increment the counter

    return articles

# Function to get the full text of an article from Business Recorder
def get_full_article_brecorder(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='story__content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."


# Earlier version of The News scraper, kept commented out for reference
# def scrape_tnews():
#     url = 'https://www.thenews.com.pk/latest/category/business'
#     response = requests.get(url, verify=False)
#     soup = BeautifulSoup(response.text, 'html.parser')
#     articles = []
#
#     count = 0  # Counter to track the number of articles scraped
#
#     for item in soup.find_all('div', class_='most-popular-box'):
#         if count >= 2:  # Stop after 2 articles
#             break
#
#         title_tag = item.find('h2', class_='most-popular-list')
#         if title_tag:
#             title = title_tag.get_text(strip=True)
#             link = title_tag.find('a')['href']
#             full_text = get_full_article_tnews(link)
#             # Summarize the full article
#             summary_obj = summarizer(full_text[:1020])
#
#             # Convert the summary object to a string
#             summary = summary_obj[0]['summary_text'] if summary_obj else ""
#             articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})
#
#             count += 1  # Increment the counter
#
#     return articles


# Function to scrape articles from The News
def scrape_tnews():
    url = 'https://www.thenews.com.pk/latest/category/business'
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []

    count = 0  # Counter to track the number of articles scraped

    for item in soup.find_all('div', class_='most-popular-box'):
        if count >= 5:  # Stop after 5 articles
            break

        # Extract the title from the <h2> tag
        title_tag = item.find('h2')
        if title_tag:
            title = title_tag.get_text(strip=True)

            # Extract the link from the <a> tag
            link = item.find('a')['href']

            # Fetch and process the full article text
            full_text = get_full_article_tnews(link)

            # Summarize the full article
            summary_obj = summarizer(full_text[:1020])
            summary = summary_obj[0]['summary_text'] if summary_obj else ""

            # Append the article details
            articles.append({'title': title, 'link': link, 'content': full_text, 'summary': summary})

            count += 1  # Increment the counter

    return articles


# Function to get the full text of an article from The News
def get_full_article_tnews(url):
    response = requests.get(url, verify=False)
    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='detail-content')
    if content_div:
        paragraphs = content_div.find_all(['p', 'li'])
        full_text = ' '.join([para.get_text(strip=True) for para in paragraphs])
        return full_text
    return "Content not found."

# Function to save articles to a CSV file
def save_to_csv(filename, articles):
    if not articles:
        print(f"No articles found to save in {filename}.")
        return
    keys = articles[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(articles)


# # Main function to scrape articles and save them to CSV
# def main():
#     # Scraping articles from The News
#     tnews_articles = scrape_tnews()
#     save_to_csv('tnews_articles_full.csv', tnews_articles)
#     print("The News articles saved to CSV file successfully.")
#
#     # Scraping articles from Business Recorder
#     # brecorder_articles = scrape_brecorder()
#     # save_to_csv('brecorder_articles_full.csv', brecorder_articles)
#     # print("Business Recorder articles saved to CSV file successfully.")
#
#
# if __name__ == '__main__':
#     main()

# Scratch code used while inspecting The News markup:
# url = 'https://www.thenews.com.pk/latest/category/business'
# response = requests.get(url, verify=False)
# soup = BeautifulSoup(response.text, 'html.parser')
# s = soup.find_all('div', class_='most-popular-box')
# print(s)
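
Every request in this module is made with requests.get(url, verify=False), which disables TLS verification, triggers an InsecureRequestWarning on each call, and sets no timeout. Below is a minimal sketch, not part of this commit, of a shared fetch helper that centralises this; the helper name fetch_soup and the 10-second timeout are assumptions, and it reuses the requests and BeautifulSoup imports already at the top of the file.

import urllib3

# Silence the warning that verify=False triggers on every request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def fetch_soup(url, timeout=10):
    # Hypothetical helper, not in the committed file
    response = requests.get(url, verify=False, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of parsing error pages
    return BeautifulSoup(response.text, 'html.parser')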
requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit==1.30.0
transformers==4.30.1
bs4
pandas
numpy
requests
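
Neither Article_summarizer.py nor News_scrapper.py calls its page function, so the Space presumably has a separate entry point that is not part of this commit. A hedged sketch of what such an app.py could look like follows; the file name, radio labels, and layout are assumptions. Note also that the transformers summarization pipeline needs a backend such as PyTorch installed, which may need to be added to requirements.txt if the Space image does not already provide it.

# Hypothetical app.py, not included in this commit
import streamlit as st
from Article_summarizer import article_sum
from News_scrapper import News_scrapper

page = st.sidebar.radio("Mode", ["Article Summarizer", "News Scrapper"])
if page == "Article Summarizer":
    article_sum()
else:
    News_scrapper()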