sohail-shaikh-s07 committed on
Commit e521008 · verified · 1 Parent(s): b12afd3

Update app.py

Files changed (1): app.py +33 -176
app.py CHANGED
@@ -5,25 +5,17 @@ from transformers import pipeline
 import nltk
 import torch
 from urllib.parse import urlparse
-import time
-import re
-import json
 
 # Download required NLTK data
 try:
-    nltk.download('punkt', quiet=True)
+    nltk.download('punkt')
 except Exception as e:
     print(f"Error downloading NLTK data: {e}")
 
-# Initialize the summarization pipeline with a smaller, faster model
+# Initialize the summarization pipeline
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    summarizer = pipeline(
-        "summarization",
-        model="facebook/bart-base-cnn",
-        device=device,
-        model_kwargs={"cache_dir": "model_cache"}
-    )
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
 except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
@@ -35,163 +27,43 @@ def is_valid_url(url):
     except:
         return False
 
-def clean_text(text):
-    # Remove extra whitespace and special characters
-    text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'[^\w\s.,!?-]', '', text)
-    # Remove common unwanted phrases
-    text = re.sub(r'advertisement|subscribe now|subscription required|please sign in', '', text, flags=re.IGNORECASE)
-    return text.strip()
-
-def get_hindu_article(url):
-    """Special handler for The Hindu website"""
-    try:
-        # First request to get cookies and tokens
-        session = requests.Session()
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Referer': 'https://www.thehindu.com/',
-            'DNT': '1',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'same-origin',
-            'Sec-Fetch-User': '?1',
-            'Cache-Control': 'max-age=0'
-        }
-
-        # Get the article ID from the URL
-        article_id = re.search(r'article(\d+)', url)
-        if article_id:
-            article_id = article_id.group(1)
-            api_url = f"https://www.thehindu.com/api/article/{article_id}/"
-            response = session.get(api_url, headers=headers)
-            if response.status_code == 200:
-                try:
-                    data = response.json()
-                    if 'body' in data:
-                        # Parse the HTML content from the API response
-                        soup = BeautifulSoup(data['body'], 'html.parser')
-                        text = ' '.join(p.get_text().strip() for p in soup.find_all('p'))
-                        if text:
-                            return text
-                except:
-                    pass
-
-        # Fallback to regular page scraping
-        response = session.get(url, headers=headers)
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Try multiple selectors specific to The Hindu
-        selectors = [
-            'div.article-text',
-            'div#content-body',
-            'div.article',
-            'div[itemprop="articleBody"]',
-            'div.paywall'
-        ]
-
-        article_text = ""
-        for selector in selectors:
-            content = soup.select_one(selector)
-            if content:
-                paragraphs = content.find_all(['p', 'div'], class_=lambda x: x and not any(c in str(x).lower() for c in [
-                    'caption', 'footer', 'social', 'meta', 'share', 'related', 'ad', 'copyright'
-                ]))
-                texts = [p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 40]
-                if texts:
-                    article_text = ' '.join(texts)
-                    break
-
-        if article_text:
-            return article_text
-
-        # Last resort: try to find any substantial paragraphs
-        all_paragraphs = soup.find_all('p')
-        texts = [p.get_text().strip() for p in all_paragraphs if len(p.get_text().strip()) > 40]
-        return ' '.join(texts) if texts else None
-
-    except Exception as e:
-        print(f"Error in get_hindu_article: {str(e)}")
-        return None
-
 def extract_article_text(url):
-    """Extract article text with special handling for different news sites"""
+    """Extract article text using BeautifulSoup instead of newspaper3k"""
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     }
 
     try:
-        # Special handling for The Hindu
-        if 'thehindu.com' in url:
-            article_text = get_hindu_article(url)
-            if article_text:
-                return clean_text(article_text)[:8000]
-
-        # Regular handling for other sites
         response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Remove unwanted elements
-        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'meta', 'link']):
+        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
             tag.decompose()
 
-        # Try multiple methods to find the main content
+        # Find the main content
         article_text = ""
 
-        # Method 1: Look for article tag
-        article = soup.find('article')
-
-        # Method 2: Look for specific class names
-        if not article:
-            article = soup.find(class_=lambda x: x and any(c in str(x).lower() for c in [
-                'article', 'story', 'content', 'body', 'text', 'main', 'entry'
-            ]))
-
-        # Method 3: Look for specific div patterns
-        if not article:
-            article = soup.find('div', {'id': re.compile('article|content|story|main', re.I)})
+        # Look for common article containers
+        main_content = soup.find('article') or soup.find(class_=['article', 'post-content', 'entry-content', 'content'])
 
-        if article:
-            paragraphs = article.find_all(['p', 'div'], class_=lambda x: x and not any(c in str(x).lower() for c in [
-                'caption', 'footer', 'social', 'meta', 'share', 'related', 'ad', 'copyright'
-            ]))
+        if main_content:
+            paragraphs = main_content.find_all('p')
         else:
-            paragraphs = soup.find_all('p', recursive=True)
+            # Fallback to all paragraphs if no article container found
+            paragraphs = soup.find_all('p')
 
-        texts = []
-        for p in paragraphs:
-            text = p.get_text().strip()
-            if len(text) > 40 and not any(x in text.lower() for x in ['advertisement', 'subscribe', 'subscription']):
-                texts.append(clean_text(text))
-
-        article_text = ' '.join(texts)
-
-        if not article_text:
-            body = soup.find('body')
-            if body:
-                article_text = clean_text(body.get_text())
-
-        if len(article_text) < 100:
-            raise Exception("Could not find enough article content")
-
-        return article_text[:8000]
+        # Extract text from paragraphs
+        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 50])
+
+        return article_text
 
     except Exception as e:
-        print(f"Error in extract_article_text: {str(e)}")
-        raise Exception(f"Error extracting article: {str(e)}")
+        raise Exception(f"Error fetching article: {str(e)}")
 
-def extract_and_summarize(url, progress=gr.Progress()):
+def extract_and_summarize(url):
     if not url or not url.strip():
         return "Please enter a valid URL"
 
@@ -199,30 +71,22 @@ def extract_and_summarize(url, progress=gr.Progress()):
         return "Please enter a valid URL starting with http:// or https://"
 
     try:
-        start_time = time.time()
-
-        progress(0.2, desc="Fetching article...")
+        # Extract article text
         text = extract_article_text(url)
 
         if not text:
             return "Could not extract text from the article. Please make sure it's a valid news article."
-
-        progress(0.4, desc="Processing text...")
-        max_chunk_length = 512
+
+        # Split text into chunks if it's too long
+        max_chunk_length = 1024
         chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
-        chunks = chunks[:3]
 
-        progress(0.6, desc="Generating summary...")
+        # Summarize each chunk
         summaries = []
         for chunk in chunks:
-            if len(chunk.strip()) > 50:
+            if len(chunk.strip()) > 100:  # Only summarize substantial chunks
                 try:
-                    summary = summarizer(
-                        chunk,
-                        max_length=100,
-                        min_length=20,
-                        do_sample=False
-                    )
+                    summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
                     summaries.append(summary[0]['summary_text'])
                 except Exception as e:
                     print(f"Error summarizing chunk: {e}")
@@ -230,11 +94,11 @@ def extract_and_summarize(url, progress=gr.Progress()):
 
         if not summaries:
             return "Could not generate summary. Please try a different article."
-
+
+        # Combine all summaries
         final_summary = " ".join(summaries)
-        processing_time = round(time.time() - start_time, 2)
 
-        return f"Summary (processed in {processing_time}s):\n\n{final_summary}"
+        return final_summary
 
     except Exception as e:
         return f"Error processing article: {str(e)}"
@@ -245,24 +109,17 @@ demo = gr.Interface(
     inputs=gr.Textbox(
         label="Enter News Article URL",
         placeholder="https://...",
-        info="Enter a news article URL to get a quick summary"
+        info="Enter a news article URL to get a summary"
     ),
     outputs=gr.Textbox(label="Summary", lines=5),
-    title="📰 Fast News Article Summarizer",
+    title="📰 News Article Summarizer",
     description="""
-    This app quickly summarizes news articles using AI.
-    Simply paste a URL and get a concise summary in seconds!
-
-    Supported news sites include:
-    - BBC News
-    - Reuters
-    - The Hindu
-    - And many more!
+    This app creates concise summaries of news articles using AI.
+    Simply paste a URL of a news article and get a summary!
     """,
     examples=[
         ["https://www.bbc.com/news/world-us-canada-67841980"],
-        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"],
-        ["https://www.thehindu.com/news/cities/mumbai/mumbai-boat-accident-body-of-missing-boy-found-off-mumbai-coast-toll-rises-to-15/article69012138.ece"]
+        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
     ],
     theme=gr.themes.Soft()
 )
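
For a quick check outside the Gradio UI, here is a minimal standalone sketch of the flow this commit settles on. It mirrors the new app.py (same model, chunk size, and generation limits); the URL is just the BBC example from the app's examples list, and any news article URL would do:

import requests
import torch
from bs4 import BeautifulSoup
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

# Fetch the page and pull paragraph text, as in the updated extract_article_text()
url = "https://www.bbc.com/news/world-us-canada-67841980"  # illustrative example
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10).text
soup = BeautifulSoup(html, 'html.parser')
main_content = soup.find('article') or soup  # fall back to the whole page
text = ' '.join(p.get_text().strip() for p in main_content.find_all('p')
                if len(p.get_text().strip()) > 50)

# Chunk at 1024 characters, summarize each substantial chunk, then join
chunks = [text[i:i + 1024] for i in range(0, len(text), 1024)]
summaries = [summarizer(c, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
             for c in chunks if len(c.strip()) > 100]
print(" ".join(summaries))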
 