Update app.py
app.py
CHANGED
@@ -5,25 +5,17 @@ from transformers import pipeline
 import nltk
 import torch
 from urllib.parse import urlparse
-import time
-import re
-import json

 # Download required NLTK data
 try:
-    nltk.download('punkt'
+    nltk.download('punkt')
 except Exception as e:
     print(f"Error downloading NLTK data: {e}")

-# Initialize the summarization pipeline
+# Initialize the summarization pipeline
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    summarizer = pipeline(
-        "summarization",
-        model="facebook/bart-base-cnn",
-        device=device,
-        model_kwargs={"cache_dir": "model_cache"}
-    )
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
 except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
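
The initialization block above collapses the multi-line pipeline() call, drops the model_kwargs cache directory, and swaps the checkpoint from facebook/bart-base-cnn to facebook/bart-large-cnn. A minimal standalone sketch of the new call, not part of the commit, assuming transformers and torch are installed and that the installed transformers version accepts a device string:

# Standalone sketch (not from app.py); the sample text is a made-up stand-in.
import torch
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

sample = "The quick brown fox jumps over the lazy dog. " * 20  # stand-in article text
print(summarizer(sample, max_length=60, min_length=15, do_sample=False)[0]["summary_text"])
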
@@ -35,163 +27,43 @@ def is_valid_url(url):
     except:
         return False

-def clean_text(text):
-    # Remove extra whitespace and special characters
-    text = re.sub(r'\s+', ' ', text)
-    text = re.sub(r'[^\w\s.,!?-]', '', text)
-    # Remove common unwanted phrases
-    text = re.sub(r'advertisement|subscribe now|subscription required|please sign in', '', text, flags=re.IGNORECASE)
-    return text.strip()
-
-def get_hindu_article(url):
-    """Special handler for The Hindu website"""
-    try:
-        # First request to get cookies and tokens
-        session = requests.Session()
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Referer': 'https://www.thehindu.com/',
-            'DNT': '1',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-Fetch-Mode': 'navigate',
-            'Sec-Fetch-Site': 'same-origin',
-            'Sec-Fetch-User': '?1',
-            'Cache-Control': 'max-age=0'
-        }
-
-        # Get the article ID from the URL
-        article_id = re.search(r'article(\d+)', url)
-        if article_id:
-            article_id = article_id.group(1)
-            api_url = f"https://www.thehindu.com/api/article/{article_id}/"
-            response = session.get(api_url, headers=headers)
-            if response.status_code == 200:
-                try:
-                    data = response.json()
-                    if 'body' in data:
-                        # Parse the HTML content from the API response
-                        soup = BeautifulSoup(data['body'], 'html.parser')
-                        text = ' '.join(p.get_text().strip() for p in soup.find_all('p'))
-                        if text:
-                            return text
-                except:
-                    pass
-
-        # Fallback to regular page scraping
-        response = session.get(url, headers=headers)
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Try multiple selectors specific to The Hindu
-        selectors = [
-            'div.article-text',
-            'div#content-body',
-            'div.article',
-            'div[itemprop="articleBody"]',
-            'div.paywall'
-        ]
-
-        article_text = ""
-        for selector in selectors:
-            content = soup.select_one(selector)
-            if content:
-                paragraphs = content.find_all(['p', 'div'], class_=lambda x: x and not any(c in str(x).lower() for c in [
-                    'caption', 'footer', 'social', 'meta', 'share', 'related', 'ad', 'copyright'
-                ]))
-                texts = [p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 40]
-                if texts:
-                    article_text = ' '.join(texts)
-                    break
-
-        if article_text:
-            return article_text
-
-        # Last resort: try to find any substantial paragraphs
-        all_paragraphs = soup.find_all('p')
-        texts = [p.get_text().strip() for p in all_paragraphs if len(p.get_text().strip()) > 40]
-        return ' '.join(texts) if texts else None
-
-    except Exception as e:
-        print(f"Error in get_hindu_article: {str(e)}")
-        return None
-
 def extract_article_text(url):
-    """Extract article text
+    """Extract article text using BeautifulSoup instead of newspaper3k"""
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
-        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-        'Accept-Language': 'en-US,en;q=0.5',
-        'DNT': '1',
-        'Connection': 'keep-alive',
-        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     }

     try:
-        # Special handling for The Hindu
-        if 'thehindu.com' in url:
-            article_text = get_hindu_article(url)
-            if article_text:
-                return clean_text(article_text)[:8000]
-
-        # Regular handling for other sites
         response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()

         soup = BeautifulSoup(response.text, 'html.parser')

         # Remove unwanted elements
-        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside'
+        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
             tag.decompose()

-        #
+        # Find the main content
         article_text = ""

-        #
-
-
-        # Method 2: Look for specific class names
-        if not article:
-            article = soup.find(class_=lambda x: x and any(c in str(x).lower() for c in [
-                'article', 'story', 'content', 'body', 'text', 'main', 'entry'
-            ]))
-
-        # Method 3: Look for specific div patterns
-        if not article:
-            article = soup.find('div', {'id': re.compile('article|content|story|main', re.I)})
+        # Look for common article containers
+        main_content = soup.find('article') or soup.find(class_=['article', 'post-content', 'entry-content', 'content'])

-        if
-            paragraphs =
-                'caption', 'footer', 'social', 'meta', 'share', 'related', 'ad', 'copyright'
-            ]))
+        if main_content:
+            paragraphs = main_content.find_all('p')
         else:
-            paragraphs
+            # Fallback to all paragraphs if no article container found
+            paragraphs = soup.find_all('p')

-
-        for p in paragraphs
-            text = p.get_text().strip()
-            if len(text) > 40 and not any(x in text.lower() for x in ['advertisement', 'subscribe', 'subscription']):
-                texts.append(clean_text(text))
+        # Extract text from paragraphs
+        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 50])

-        article_text
-
-        if not article_text:
-            body = soup.find('body')
-            if body:
-                article_text = clean_text(body.get_text())
-
-        if len(article_text) < 100:
-            raise Exception("Could not find enough article content")
-
-        return article_text[:8000]
+        return article_text

     except Exception as e:
-
-        raise Exception(f"Error extracting article: {str(e)}")
+        raise Exception(f"Error fetching article: {str(e)}")

-def extract_and_summarize(url
+def extract_and_summarize(url):
     if not url or not url.strip():
         return "Please enter a valid URL"

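
The rewritten extract_article_text drops the clean_text helper and the site-specific path for The Hindu in favor of a generic strategy: strip script/style/nav/header/footer/aside tags, prefer an article element or a common content class, fall back to all p tags, and keep only paragraphs longer than 50 characters. A small offline sketch of that selection logic, not part of the commit; the HTML snippet is invented for illustration and assumes only beautifulsoup4 is installed:

# Sketch of the new selection logic against a static, made-up HTML string.
from bs4 import BeautifulSoup

html = """
<html><body>
  <nav>Menu</nav>
  <article>
    <p>This is the first substantial paragraph of the made-up article, long enough to pass the filter.</p>
    <p>Short.</p>
  </article>
</body></html>
"""
soup = BeautifulSoup(html, "html.parser")
for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
    tag.decompose()

main_content = soup.find("article") or soup.find(class_=["article", "post-content", "entry-content", "content"])
paragraphs = main_content.find_all("p") if main_content else soup.find_all("p")
text = " ".join(p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 50)
print(text)  # only the long paragraph survives the > 50 character filter
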
@@ -199,30 +71,22 @@ def extract_and_summarize(url, progress=gr.Progress()):
         return "Please enter a valid URL starting with http:// or https://"

     try:
-
-
-        progress(0.2, desc="Fetching article...")
+        # Extract article text
         text = extract_article_text(url)

         if not text:
             return "Could not extract text from the article. Please make sure it's a valid news article."
-
-
-        max_chunk_length =
+
+        # Split text into chunks if it's too long
+        max_chunk_length = 1024
         chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
-        chunks = chunks[:3]

-
+        # Summarize each chunk
         summaries = []
         for chunk in chunks:
-            if len(chunk.strip()) >
+            if len(chunk.strip()) > 100:  # Only summarize substantial chunks
                 try:
-                    summary = summarizer(
-                        chunk,
-                        max_length=100,
-                        min_length=20,
-                        do_sample=False
-                    )
+                    summary = summarizer(chunk, max_length=130, min_length=30, do_sample=False)
                     summaries.append(summary[0]['summary_text'])
                 except Exception as e:
                     print(f"Error summarizing chunk: {e}")
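
The old three-chunk cap (chunks = chunks[:3]) is gone, so a long article now produces as many 1024-character slices as needed, and only slices longer than 100 characters are sent to the summarizer. A quick sketch of the slicing, not part of the commit; the text is a stand-in:

# Sketch of the fixed-width character chunking used before summarization (values mirror the new code).
text = "word " * 600  # stand-in for ~3000 characters of extracted article text

max_chunk_length = 1024
chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
print([len(c) for c in chunks])  # [1024, 1024, 952]; slices can cut words and sentences mid-way
substantial = [c for c in chunks if len(c.strip()) > 100]  # same filter as the summarization loop
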
@@ -230,11 +94,11 @@ def extract_and_summarize(url, progress=gr.Progress()):

         if not summaries:
             return "Could not generate summary. Please try a different article."
-
+
+        # Combine all summaries
         final_summary = " ".join(summaries)
-        processing_time = round(time.time() - start_time, 2)

-        return
+        return final_summary

     except Exception as e:
         return f"Error processing article: {str(e)}"
@@ -245,24 +109,17 @@ demo = gr.Interface(
     inputs=gr.Textbox(
         label="Enter News Article URL",
         placeholder="https://...",
-        info="Enter a news article URL to get a
+        info="Enter a news article URL to get a summary"
     ),
     outputs=gr.Textbox(label="Summary", lines=5),
-    title="📰
+    title="📰 News Article Summarizer",
     description="""
-    This app
-    Simply paste a URL and get a
-
-    Supported news sites include:
-    - BBC News
-    - Reuters
-    - The Hindu
-    - And many more!
+    This app creates concise summaries of news articles using AI.
+    Simply paste a URL of a news article and get a summary!
     """,
     examples=[
         ["https://www.bbc.com/news/world-us-canada-67841980"],
-        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
-        ["https://www.thehindu.com/news/cities/mumbai/mumbai-boat-accident-body-of-missing-boy-found-off-mumbai-coast-toll-rises-to-15/article69012138.ece"]
+        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
     ],
     theme=gr.themes.Soft()
 )
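
The interface keeps the BBC and Reuters examples, drops The Hindu example, and trims the description to two lines. To run the app locally, the file presumably ends with the standard Gradio launch call, which is not visible in this diff; a sketch under that assumption:

# Assumed tail of app.py (not shown in this diff): launch the Gradio interface.
if __name__ == "__main__":
    demo.launch()
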