sohail-shaikh-s07 committed on
Commit
ca0d432
·
verified ·
1 Parent(s): 771797e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -11
app.py CHANGED
@@ -1,8 +1,10 @@
1
  import gradio as gr
2
- from newspaper import Article
 
3
  from transformers import pipeline
4
  import nltk
5
  import torch
 
6
 
7
  # Download required NLTK data
8
  try:
@@ -18,20 +20,62 @@ except Exception as e:
18
  print(f"Error loading model: {e}")
19
  summarizer = None
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  def extract_and_summarize(url):
22
  if not url or not url.strip():
23
  return "Please enter a valid URL"
24
 
 
 
 
25
  try:
26
- # Download and parse article
27
- article = Article(url)
28
- article.download()
29
- article.parse()
30
 
31
- # Get the text content
32
- text = article.text
33
  if not text:
34
- return "Could not extract text from the article"
35
 
36
  # Split text into chunks if it's too long
37
  max_chunk_length = 1024
@@ -62,10 +106,17 @@ def extract_and_summarize(url):
62
  # Create Gradio interface
63
  demo = gr.Interface(
64
  fn=extract_and_summarize,
65
- inputs=gr.Textbox(label="Enter News Article URL", placeholder="https://..."),
66
- outputs=gr.Textbox(label="Summary"),
 
 
 
 
67
  title="📰 News Article Summarizer",
68
- description="Enter a news article URL to get a concise summary. The summary will focus on the main points of the article.",
 
 
 
69
  examples=[
70
  ["https://www.bbc.com/news/world-us-canada-67841980"],
71
  ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
 
1
  import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
  from transformers import pipeline
5
  import nltk
6
  import torch
7
+ from urllib.parse import urlparse
8
 
9
  # Download required NLTK data
10
  try:
 
20
  print(f"Error loading model: {e}")
21
  summarizer = None
22
 
23
def is_valid_url(url):
    """Return True when *url* is an absolute HTTP or HTTPS URL.

    A URL is accepted only if it is a string, parses without error, uses
    the ``http`` or ``https`` scheme, and has a non-empty network location
    (host). Anything else -- including ``None``, relative paths, and other
    schemes such as ``ftp://`` or ``file://`` -- yields False.

    Args:
        url: The candidate URL, normally the raw text typed by the user.

    Returns:
        bool: True if the URL looks fetchable over HTTP(S), else False.
    """
    if not isinstance(url, str):
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed network locations,
        # e.g. an unterminated IPv6 bracket such as "http://[::1".
        return False

    # urlparse lower-cases the scheme, so "HTTP://..." is accepted too.
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
29
+
30
def extract_article_text(url):
    """Extract article text using BeautifulSoup instead of newspaper3k.

    Fetches *url* with a browser-like User-Agent, removes page chrome
    (script, style, nav, header, footer and aside elements), then joins the
    text of every paragraph longer than 50 characters. Paragraphs are taken
    from the first ``<article>`` element or, failing that, the first element
    with a common content class; if neither exists, every ``<p>`` on the
    page is considered.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        str: The qualifying paragraph texts joined by single spaces, or an
        empty string when no paragraph qualifies.

    Raises:
        Exception: If the request or parsing fails. The message is prefixed
            with "Error fetching article: " and the original error is
            attached as ``__cause__``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # When the server declares no charset, requests decodes text/*
        # bodies as ISO-8859-1, which garbles UTF-8 pages. In that case hand
        # BeautifulSoup the raw bytes so it can detect the encoding itself
        # (e.g. from a <meta charset> tag).
        content_type = response.headers.get('Content-Type', '')
        if 'charset' in content_type.lower():
            markup = response.text
        else:
            markup = response.content

        soup = BeautifulSoup(markup, 'html.parser')

        # Remove unwanted elements
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()

        # Look for common article containers
        main_content = soup.find('article') or soup.find(
            class_=['article', 'post-content', 'entry-content', 'content']
        )

        if main_content:
            paragraphs = main_content.find_all('p')
        else:
            # Fallback to all paragraphs if no article container found
            paragraphs = soup.find_all('p')

        # Keep only substantial paragraphs; short ones are usually captions,
        # bylines or navigation remnants.
        stripped = (p.get_text().strip() for p in paragraphs)
        return ' '.join(text for text in stripped if len(text) > 50)

    except Exception as e:
        # Preserve the original traceback for debugging via exception chaining.
        raise Exception(f"Error fetching article: {e}") from e
65
+
66
  def extract_and_summarize(url):
67
  if not url or not url.strip():
68
  return "Please enter a valid URL"
69
 
70
+ if not is_valid_url(url):
71
+ return "Please enter a valid URL starting with http:// or https://"
72
+
73
  try:
74
+ # Extract article text
75
+ text = extract_article_text(url)
 
 
76
 
 
 
77
  if not text:
78
+ return "Could not extract text from the article. Please make sure it's a valid news article."
79
 
80
  # Split text into chunks if it's too long
81
  max_chunk_length = 1024
 
106
  # Create Gradio interface
107
  demo = gr.Interface(
108
  fn=extract_and_summarize,
109
+ inputs=gr.Textbox(
110
+ label="Enter News Article URL",
111
+ placeholder="https://...",
112
+ info="Enter a news article URL to get a summary"
113
+ ),
114
+ outputs=gr.Textbox(label="Summary", lines=5),
115
  title="📰 News Article Summarizer",
116
+ description="""
117
+ This app creates concise summaries of news articles using AI.
118
+ Simply paste a URL of a news article and get a summary!
119
+ """,
120
  examples=[
121
  ["https://www.bbc.com/news/world-us-canada-67841980"],
122
  ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]