sohail-shaikh-s07 committed
Commit 53ed53b · verified · 1 Parent(s): e521008

Update app.py

Files changed (1)
app.py +14 -11
app.py CHANGED
@@ -6,7 +6,7 @@ import nltk
 import torch
 from urllib.parse import urlparse
 
-# Download required NLTK data
+
 try:
     nltk.download('punkt')
 except Exception as e:
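The commit replaces the "# Download required NLTK data" comment with a blank line but keeps the unconditional nltk.download('punkt') call, which touches the network on every startup. A minimal sketch of a more defensive variant, assuming the standard NLTK cache layout (a suggestion, not part of this commit):

import nltk

# Only fetch the punkt tokenizer when it is not already cached locally;
# nltk.data.find raises LookupError if the resource is missing.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", quiet=True)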
@@ -15,7 +15,7 @@ except Exception as e:
 # Initialize the summarization pipeline
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)  # BART-large-CNN is used here; other summarization models such as T5 or DistilBART can be swapped in for faster processing
 except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
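The new inline comment notes that the model is a drop-in choice. For illustration, a hedged sketch that loads a distilled checkpoint instead; sshleifer/distilbart-cnn-12-6 is a smaller BART summarizer on the Hub, but treat the exact model choice as an assumption:

import torch
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
# A distilled checkpoint trades some summary quality for speed; any model
# published for the "summarization" task can be substituted here.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    device=device,
)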
@@ -39,11 +39,11 @@ def extract_article_text(url):
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove unwanted elements
+        # Remove unwanted elements; stripping boilerplate speeds up processing and gives the summarizer cleaner input
         for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
             tag.decompose()
 
-        # Find the main content
+
         article_text = ""
 
         # Look for common article containers
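For context on the cleanup step: decompose() removes a tag and everything inside it from the parse tree in place. A self-contained illustration (the sample HTML is invented):

from bs4 import BeautifulSoup

html = "<article><script>track()</script><p>Story text.</p></article>"
soup = BeautifulSoup(html, "html.parser")

# soup([...]) is shorthand for find_all; decompose() deletes each match in place
for tag in soup(["script", "style"]):
    tag.decompose()

print(soup.get_text())  # -> "Story text."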
@@ -52,7 +52,7 @@ def extract_article_text(url):
         if main_content:
             paragraphs = main_content.find_all('p')
         else:
-            # Fallback to all paragraphs if no article container found
+
             paragraphs = soup.find_all('p')
 
         # Extract text from paragraphs
@@ -77,13 +77,13 @@ def extract_and_summarize(url):
         if not text:
             return "Could not extract text from the article. Please make sure it's a valid news article."
 
-        # Split text into chunks if it's too long
+        # Split text into chunks if it's too long; the article is divided into chunks of 1024 characters each
         max_chunk_length = 1024
         chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
 
         # Summarize each chunk
         summaries = []
-        for chunk in chunks:
+        for i, chunk in enumerate(chunks):
             if len(chunk.strip()) > 100:  # Only summarize substantial chunks
                 try:
                     summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
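One caveat the new comment glosses over: the slicing is character-based, so chunk boundaries can fall mid-sentence. Since the app already downloads NLTK's punkt tokenizer, a sentence-aware splitter is a natural alternative; a sketch, with the helper name chunk_by_sentence being my own:

from nltk.tokenize import sent_tokenize

def chunk_by_sentence(text, max_chunk_length=1024):
    """Group whole sentences into chunks of at most max_chunk_length characters."""
    chunks, current = [], ""
    for sentence in sent_tokenize(text):
        # Start a new chunk when the next sentence would overflow the limit
        if current and len(current) + len(sentence) + 1 > max_chunk_length:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks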
@@ -95,10 +95,10 @@ def extract_and_summarize(url):
         if not summaries:
             return "Could not generate summary. Please try a different article."
 
-        # Combine all summaries
+        # Combine all summaries into a single paragraph for a clean final result
        final_summary = " ".join(summaries)
 
-        return final_summary
+        return f"Summary:\n\n{final_summary}\n\n(Processing complete - if you want to try another article, just paste a new URL above)"
 
     except Exception as e:
         return f"Error processing article: {str(e)}"
@@ -116,10 +116,13 @@ demo = gr.Interface(
     description="""
     This app creates concise summaries of news articles using AI.
     Simply paste a URL of a news article and get a summary!
+
+    Note: Please be patient while the article is processed - it may take 30-60 seconds depending on the article length.
+    The model is working whenever you see 'Running...' below the output box.
     """,
     examples=[
-        ["https://www.bbc.com/news/world-us-canada-67841980"],
-        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
+        ["https://www.bbc.com/sport/football/articles/cvgxmzy86e4o"],
+        ["https://globalsouthworld.com/article/biden-approves-571-million-in-defense-support-for-taiwan"]
     ],
     theme=gr.themes.Soft()
 )
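The hunk only touches the description and examples; the fn/inputs/outputs wiring sits outside the diff. For orientation, a minimal sketch of how such an Interface is typically assembled - the argument values below are assumptions inferred from the surrounding code, not lines from this commit:

import gradio as gr

demo = gr.Interface(
    fn=extract_and_summarize,                    # assumed handler, defined earlier in app.py
    inputs=gr.Textbox(label="News Article URL"),
    outputs=gr.Textbox(label="Summary"),
    description="This app creates concise summaries of news articles using AI.",
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    demo.launch()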
 