Update app.py
app.py CHANGED
@@ -6,6 +6,7 @@ import nltk
 import torch
 from urllib.parse import urlparse
 import time
+import re
 
 # Download required NLTK data
 try:
@@ -18,7 +19,7 @@ try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
     summarizer = pipeline(
         "summarization",
-        model="facebook/bart-base-cnn",
+        model="facebook/bart-base-cnn",
         device=device,
         model_kwargs={"cache_dir": "model_cache"}
     )
@@ -33,44 +34,88 @@ def is_valid_url(url):
     except:
         return False
 
+def clean_text(text):
+    # Remove extra whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Remove special characters
+    text = re.sub(r'[^\w\s.,!?-]', '', text)
+    return text.strip()
+
 def extract_article_text(url):
-    """Extract article text
+    """Extract article text with support for various news sites"""
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
     }
 
     try:
         # Add a shorter timeout
-        response = requests.get(url, headers=headers, timeout=
+        response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Remove unwanted elements
-        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
+        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'meta', 'link']):
             tag.decompose()
 
+        # Try multiple methods to find the main content
+        article_text = ""
+
+        # Method 1: Look for article tag
+        article = soup.find('article')
+
+        # Method 2: Look for specific class names common in news sites
+        if not article:
+            article = soup.find(class_=lambda x: x and any(c in str(x).lower() for c in [
+                'article', 'story', 'content', 'body', 'text', 'main', 'entry'
+            ]))
 
+        # Method 3: Look for specific div patterns
+        if not article:
+            article = soup.find('div', {'id': re.compile('article|content|story|main', re.I)})
+
+        # Method 4: The Hindu specific
+        if 'thehindu.com' in url:
+            article = soup.find('div', {'id': 'content-body'}) or soup.find(class_='article')
+
+        if article:
+            # Get text from paragraphs
+            paragraphs = article.find_all(['p', 'div'], class_=lambda x: x and not any(c in str(x).lower() for c in [
+                'caption', 'footer', 'social', 'meta', 'share', 'related', 'ad', 'copyright'
+            ]))
         else:
-            paragraphs = soup.find_all('p',
+            # Fallback: get all paragraphs
+            paragraphs = soup.find_all('p', recursive=True)
+
+        # Extract and clean text
+        texts = []
+        for p in paragraphs:
+            text = p.get_text().strip()
+            if len(text) > 40 and not any(x in text.lower() for x in ['advertisement', 'subscribe', 'subscription']):
+                texts.append(clean_text(text))
 
-        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 40])
+        article_text = ' '.join(texts)
 
+        # If still no text, try getting all text from body
+        if not article_text:
+            body = soup.find('body')
+            if body:
+                article_text = clean_text(body.get_text())
+
+        # Limit total text length but ensure it's not too short
+        if len(article_text) < 100:
+            raise Exception("Could not find enough article content")
+
+        return article_text[:8000]  # Limit to 8000 characters
 
     except Exception as e:
+        print(f"Error in extract_article_text: {str(e)}")
+        raise Exception(f"Error extracting article: {str(e)}")
 
 def extract_and_summarize(url, progress=gr.Progress()):
     if not url or not url.strip():
@@ -91,22 +136,19 @@ def extract_and_summarize(url, progress=gr.Progress()):
 
         progress(0.4, desc="Processing text...")
         # Split text into smaller chunks
-        max_chunk_length = 512
+        max_chunk_length = 512
        chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
-
-        # Limit number of chunks
        chunks = chunks[:3]  # Process at most 3 chunks
 
        progress(0.6, desc="Generating summary...")
-        # Summarize each chunk with shorter output
        summaries = []
        for chunk in chunks:
-            if len(chunk.strip()) > 50:
+            if len(chunk.strip()) > 50:
                try:
                    summary = summarizer(
                        chunk,
-                        max_length=100,
-                        min_length=20,
+                        max_length=100,
+                        min_length=20,
                        do_sample=False
                    )
                    summaries.append(summary[0]['summary_text'])
@@ -117,11 +159,9 @@ def extract_and_summarize(url, progress=gr.Progress()):
         if not summaries:
             return "Could not generate summary. Please try a different article."
 
-        # Combine summaries
         final_summary = " ".join(summaries)
-
-        # Add processing time information
         processing_time = round(time.time() - start_time, 2)
+
         return f"Summary (processed in {processing_time}s):\n\n{final_summary}"
 
     except Exception as e:
@@ -140,10 +180,17 @@ demo = gr.Interface(
     description="""
     This app quickly summarizes news articles using AI.
     Simply paste a URL and get a concise summary in seconds!
+
+    Supported news sites include:
+    - BBC News
+    - Reuters
+    - The Hindu
+    - And many more!
     """,
     examples=[
         ["https://www.bbc.com/news/world-us-canada-67841980"],
-        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
+        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"],
+        ["https://www.thehindu.com/news/cities/mumbai/mumbai-boat-accident-body-of-missing-boy-found-off-mumbai-coast-toll-rises-to-15/article69012138.ece"]
     ],
     theme=gr.themes.Soft()
 )