sohail-shaikh-s07 committed
Commit 5b9028d · verified · 1 Parent(s): dd0f744

Update app.py

Files changed (1)
  app.py +55 -31
app.py CHANGED
@@ -5,17 +5,23 @@ from transformers import pipeline
 import nltk
 import torch
 from urllib.parse import urlparse
+import time
 
 # Download required NLTK data
 try:
-    nltk.download('punkt')
+    nltk.download('punkt', quiet=True)
 except Exception as e:
     print(f"Error downloading NLTK data: {e}")
 
-# Initialize the summarization pipeline
+# Initialize the summarization pipeline with a smaller, faster model
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
+    summarizer = pipeline(
+        "summarization",
+        model="facebook/bart-base-cnn",  # Using smaller base model instead of large
+        device=device,
+        model_kwargs={"cache_dir": "model_cache"}
+    )
 except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
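
Note (sketch, not part of this commit): "facebook/bart-base-cnn" may not resolve to an official checkpoint on the Hugging Face Hub, in which case the except branch silently leaves summarizer as None. A minimal fallback chain, assuming the previously used facebook/bart-large-cnn remains available; load_summarizer is a hypothetical helper:

from transformers import pipeline
import torch

def load_summarizer(candidates=("facebook/bart-base-cnn", "facebook/bart-large-cnn")):
    # Try each candidate model ID in order; return the first pipeline that loads.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    for model_id in candidates:
        try:
            return pipeline("summarization", model=model_id, device=device)
        except Exception as e:
            print(f"Could not load {model_id}: {e}")
    return None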
@@ -28,13 +34,14 @@ def is_valid_url(url):
         return False
 
 def extract_article_text(url):
-    """Extract article text using BeautifulSoup instead of newspaper3k"""
+    """Extract article text using BeautifulSoup with timeout"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
 
    try:
-        response = requests.get(url, headers=headers, timeout=10)
+        # Add a shorter timeout
+        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()
 
        soup = BeautifulSoup(response.text, 'html.parser')
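
Note (sketch, not part of this commit): cutting the timeout from 10s to 5s turns transient slowness into a hard failure more often. A session with bounded retries is one way to compensate; make_session is a hypothetical helper:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(total_retries=2, backoff=0.5):
    # Retry GETs a couple of times on common gateway/server errors.
    session = requests.Session()
    retry = Retry(total=total_retries, backoff_factor=backoff,
                  status_forcelist=(500, 502, 503, 504))
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

# Usage: response = make_session().get(url, headers=headers, timeout=5)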
@@ -43,27 +50,29 @@ def extract_article_text(url):
        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            tag.decompose()
 
-        # Find the main content
-        article_text = ""
-
-        # Look for common article containers
-        main_content = soup.find('article') or soup.find(class_=['article', 'post-content', 'entry-content', 'content'])
+        # Find the main content - optimized search
+        main_content = (
+            soup.find('article') or
+            soup.find(attrs={"class": lambda x: x and any(c in x for c in ['article', 'post-content', 'entry-content', 'content'])})
+        )
 
        if main_content:
-            paragraphs = main_content.find_all('p')
+            # Only get paragraphs from main content
+            paragraphs = main_content.find_all('p', recursive=False)
        else:
-            # Fallback to all paragraphs if no article container found
-            paragraphs = soup.find_all('p')
+            # Limit number of paragraphs if no main content found
+            paragraphs = soup.find_all('p', limit=20)
 
-        # Extract text from paragraphs
-        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 50])
+        # Extract text from paragraphs with minimum length requirement
+        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 40])
 
-        return article_text
+        # Limit total text length
+        return article_text[:5000]
 
    except Exception as e:
        raise Exception(f"Error fetching article: {str(e)}")
 
-def extract_and_summarize(url):
+def extract_and_summarize(url, progress=gr.Progress()):
    if not url or not url.strip():
        return "Please enter a valid URL"

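Note on the hunk above: find_all('p', recursive=False) only matches <p> tags that are direct children of the container, so paragraphs wrapped in nested <div>s are skipped and article_text can come back empty. A small self-contained demo of the behavior:

from bs4 import BeautifulSoup

html = "<article><p>top-level</p><div><p>nested</p></div></article>"
soup = BeautifulSoup(html, "html.parser")
article = soup.find("article")

print([p.get_text() for p in article.find_all("p", recursive=False)])  # ['top-level']
print([p.get_text() for p in article.find_all("p")])                   # ['top-level', 'nested']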
@@ -71,22 +80,35 @@ def extract_and_summarize(url):
        return "Please enter a valid URL starting with http:// or https://"
 
    try:
-        # Extract article text
+        start_time = time.time()
+
+        # Extract article text with progress updates
+        progress(0.2, desc="Fetching article...")
        text = extract_article_text(url)
 
        if not text:
            return "Could not extract text from the article. Please make sure it's a valid news article."
-
-        # Split text into chunks if it's too long
-        max_chunk_length = 1024
+
+        progress(0.4, desc="Processing text...")
+        # Split text into smaller chunks
+        max_chunk_length = 512  # Reduced chunk size
        chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
 
-        # Summarize each chunk
+        # Limit number of chunks
+        chunks = chunks[:3]  # Process at most 3 chunks
+
+        progress(0.6, desc="Generating summary...")
+        # Summarize each chunk with shorter output
        summaries = []
        for chunk in chunks:
-            if len(chunk.strip()) > 100:  # Only summarize substantial chunks
+            if len(chunk.strip()) > 50:  # Reduced minimum length requirement
                try:
-                    summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
+                    summary = summarizer(
+                        chunk,
+                        max_length=100,  # Reduced max length
+                        min_length=20,   # Reduced min length
+                        do_sample=False
+                    )
                    summaries.append(summary[0]['summary_text'])
                except Exception as e:
                    print(f"Error summarizing chunk: {e}")
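
Note (sketch, not part of this commit): slicing text every 512 characters can split words and sentences mid-chunk. Since the app already downloads NLTK's punkt models, a sentence-aware splitter is a natural alternative; sentence_chunks is a hypothetical helper:

import nltk

def sentence_chunks(text, max_chars=512):
    # Greedily pack whole sentences into chunks of at most max_chars
    # (a single sentence longer than max_chars becomes its own chunk).
    chunks, current = [], ""
    for sentence in nltk.sent_tokenize(text):
        if current and len(current) + len(sentence) + 1 > max_chars:
            chunks.append(current)
            current = ""
        current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks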
@@ -94,11 +116,13 @@ def extract_and_summarize(url):
 
        if not summaries:
            return "Could not generate summary. Please try a different article."
-
-        # Combine all summaries
+
+        # Combine summaries
        final_summary = " ".join(summaries)
 
-        return final_summary
+        # Add processing time information
+        processing_time = round(time.time() - start_time, 2)
+        return f"Summary (processed in {processing_time}s):\n\n{final_summary}"
 
    except Exception as e:
        return f"Error processing article: {str(e)}"
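
Minor aside (not part of this commit): time.perf_counter() is the idiomatic clock for elapsed-time measurement, since time.time() can jump if the system clock is adjusted mid-run:

import time

start = time.perf_counter()
# ... fetch and summarize ...
elapsed = round(time.perf_counter() - start, 2)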
@@ -109,13 +133,13 @@ demo = gr.Interface(
    inputs=gr.Textbox(
        label="Enter News Article URL",
        placeholder="https://...",
-        info="Enter a news article URL to get a summary"
+        info="Enter a news article URL to get a quick summary"
    ),
    outputs=gr.Textbox(label="Summary", lines=5),
-    title="📰 News Article Summarizer",
+    title="📰 Fast News Article Summarizer",
    description="""
-    This app creates concise summaries of news articles using AI.
-    Simply paste a URL of a news article and get a summary!
+    This app quickly summarizes news articles using AI.
+    Simply paste a URL and get a concise summary in seconds!
    """,
    examples=[
        ["https://www.bbc.com/news/world-us-canada-67841980"],
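
Note (assumption, since the launch call sits outside this diff): gr.Progress() updates only render when Gradio's queue is enabled, so the new progress bar needs a launch along these lines:

if __name__ == "__main__":
    demo.queue().launch()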
 