sohail-shaikh-s07
committed on
Update app.py
app.py CHANGED
@@ -5,17 +5,23 @@ from transformers import pipeline
 import nltk
 import torch
 from urllib.parse import urlparse
+import time

 # Download required NLTK data
 try:
-    nltk.download('punkt')
+    nltk.download('punkt', quiet=True)
 except Exception as e:
     print(f"Error downloading NLTK data: {e}")

-# Initialize the summarization pipeline
+# Initialize the summarization pipeline with a smaller, faster model
 try:
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    summarizer = pipeline(
+    summarizer = pipeline(
+        "summarization",
+        model="facebook/bart-base-cnn", # Using smaller base model instead of large
+        device=device,
+        model_kwargs={"cache_dir": "model_cache"}
+    )
 except Exception as e:
     print(f"Error loading model: {e}")
     summarizer = None
@@ -28,13 +34,14 @@ def is_valid_url(url):
         return False

 def extract_article_text(url):
-    """Extract article text using BeautifulSoup
+    """Extract article text using BeautifulSoup with timeout"""
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
     }

     try:
-
+        # Add a shorter timeout
+        response = requests.get(url, headers=headers, timeout=5)
         response.raise_for_status()

         soup = BeautifulSoup(response.text, 'html.parser')
@@ -43,27 +50,29 @@ def extract_article_text(url):
         for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
             tag.decompose()

-        # Find the main content
-
-
-
-
+        # Find the main content - optimized search
+        main_content = (
+            soup.find('article') or
+            soup.find(attrs={"class": lambda x: x and any(c in x for c in ['article', 'post-content', 'entry-content', 'content'])})
+        )

         if main_content:
-            paragraphs
+            # Only get paragraphs from main content
+            paragraphs = main_content.find_all('p', recursive=False)
         else:
-            #
-            paragraphs = soup.find_all('p')
+            # Limit number of paragraphs if no main content found
+            paragraphs = soup.find_all('p', limit=20)

-        # Extract text from paragraphs
-        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) >
+        # Extract text from paragraphs with minimum length requirement
+        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 40])

-
+        # Limit total text length
+        return article_text[:5000]

     except Exception as e:
         raise Exception(f"Error fetching article: {str(e)}")

-def extract_and_summarize(url):
+def extract_and_summarize(url, progress=gr.Progress()):
     if not url or not url.strip():
         return "Please enter a valid URL"
@@ -71,22 +80,35 @@ def extract_and_summarize(url):
         return "Please enter a valid URL starting with http:// or https://"

     try:
-
+        start_time = time.time()
+
+        # Extract article text with progress updates
+        progress(0.2, desc="Fetching article...")
         text = extract_article_text(url)

         if not text:
             return "Could not extract text from the article. Please make sure it's a valid news article."
-
-
-
+
+        progress(0.4, desc="Processing text...")
+        # Split text into smaller chunks
+        max_chunk_length = 512 # Reduced chunk size
         chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]

-        #
+        # Limit number of chunks
+        chunks = chunks[:3] # Process at most 3 chunks
+
+        progress(0.6, desc="Generating summary...")
+        # Summarize each chunk with shorter output
         summaries = []
         for chunk in chunks:
-            if len(chunk.strip()) >
+            if len(chunk.strip()) > 50: # Reduced minimum length requirement
                 try:
-                    summary = summarizer(
+                    summary = summarizer(
+                        chunk,
+                        max_length=100, # Reduced max length
+                        min_length=20, # Reduced min length
+                        do_sample=False
+                    )
                     summaries.append(summary[0]['summary_text'])
                 except Exception as e:
                     print(f"Error summarizing chunk: {e}")
@@ -94,11 +116,13 @@ def extract_and_summarize(url):
         if not summaries:
             return "Could not generate summary. Please try a different article."
-
-        # Combine
+
+        # Combine summaries
         final_summary = " ".join(summaries)

-
+        # Add processing time information
+        processing_time = round(time.time() - start_time, 2)
+        return f"Summary (processed in {processing_time}s):\n\n{final_summary}"

     except Exception as e:
         return f"Error processing article: {str(e)}"
@@ -109,13 +133,13 @@ demo = gr.Interface(
     inputs=gr.Textbox(
         label="Enter News Article URL",
         placeholder="https://...",
-        info="Enter a news article URL to get a summary"
+        info="Enter a news article URL to get a quick summary"
     ),
     outputs=gr.Textbox(label="Summary", lines=5),
-    title="📰 News Article Summarizer",
+    title="📰 Fast News Article Summarizer",
     description="""
-    This app
-    Simply paste a URL
+    This app quickly summarizes news articles using AI.
+    Simply paste a URL and get a concise summary in seconds!
     """,
     examples=[
         ["https://www.bbc.com/news/world-us-canada-67841980"],
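For reference, a minimal standalone sketch of the chunk-and-summarize step introduced in this commit. The commit pins the model id "facebook/bart-base-cnn"; the sketch below instead uses "sshleifer/distilbart-cnn-12-6", a known lightweight CNN-finetuned checkpoint, as a stand-in in case the pinned id does not resolve on the Hub. Everything else mirrors the chunking and generation parameters from the diff.

# Minimal sketch of the chunked summarization step (not part of the commit).
# Assumption: "sshleifer/distilbart-cnn-12-6" stands in for the pinned
# "facebook/bart-base-cnn" checkpoint.
from transformers import pipeline

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

text = "Replace this with extracted article text. " * 100  # placeholder input

# Split into 512-character chunks and keep at most 3, as in the commit
max_chunk_length = 512
chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)][:3]

summaries = []
for chunk in chunks:
    if len(chunk.strip()) > 50:
        result = summarizer(chunk, max_length=100, min_length=20, do_sample=False)
        summaries.append(result[0]["summary_text"])

print(" ".join(summaries))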
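Likewise, a hedged standalone sketch of the extraction path with the 5-second timeout and main-content lookup. Unlike the commit's find_all('p', recursive=False), it searches the whole matched container, since recursive=False only returns direct-child paragraphs and can come back empty on nested layouts; the example URL is the one shipped with the app.

# Standalone check of the extraction strategy (sketch only, not the app's code).
import requests
from bs4 import BeautifulSoup

url = "https://www.bbc.com/news/world-us-canada-67841980"  # example URL from the app
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=5)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
for tag in soup(["script", "style", "nav", "header", "footer", "aside"]):
    tag.decompose()

main_content = soup.find("article")
# Recursive search here; the commit's recursive=False variant only matches direct children.
paragraphs = main_content.find_all("p") if main_content else soup.find_all("p", limit=20)
article_text = " ".join(p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 40)
print(article_text[:300])  # preview (the app caps the full text at 5000 characters)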