sohail-shaikh-s07 committed
Commit e31262c · verified · 1 Parent(s): 5b9028d

Update app.py

Files changed (1)
  1. app.py +78 -31
app.py CHANGED
@@ -6,6 +6,7 @@ import nltk
 import torch
 from urllib.parse import urlparse
 import time
+import re
 
 # Download required NLTK data
 try:
@@ -18,7 +19,7 @@ try:
 device = "cuda" if torch.cuda.is_available() else "cpu"
 summarizer = pipeline(
     "summarization",
-    model="facebook/bart-base-cnn",  # Using smaller base model instead of large
+    model="facebook/bart-base-cnn",
     device=device,
     model_kwargs={"cache_dir": "model_cache"}
 )
@@ -33,44 +34,88 @@ def is_valid_url(url):
     except:
         return False
 
+def clean_text(text):
+    # Remove extra whitespace
+    text = re.sub(r'\s+', ' ', text)
+    # Remove special characters
+    text = re.sub(r'[^\w\s.,!?-]', '', text)
+    return text.strip()
+
 def extract_article_text(url):
-    """Extract article text using BeautifulSoup with timeout"""
+    """Extract article text with support for various news sites"""
     headers = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'DNT': '1',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
     }
 
     try:
         # Add a shorter timeout
-        response = requests.get(url, headers=headers, timeout=5)
+        response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()
 
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Remove unwanted elements
-        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
+        for tag in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'meta', 'link']):
             tag.decompose()
 
-        # Find the main content - optimized search
-        main_content = (
-            soup.find('article') or
-            soup.find(attrs={"class": lambda x: x and any(c in x for c in ['article', 'post-content', 'entry-content', 'content'])})
-        )
+        # Try multiple methods to find the main content
+        article_text = ""
+
+        # Method 1: Look for article tag
+        article = soup.find('article')
+
+        # Method 2: Look for specific class names common in news sites
+        if not article:
+            article = soup.find(class_=lambda x: x and any(c in str(x).lower() for c in [
+                'article', 'story', 'content', 'body', 'text', 'main', 'entry'
+            ]))
 
-        if main_content:
-            # Only get paragraphs from main content
-            paragraphs = main_content.find_all('p', recursive=False)
+        # Method 3: Look for specific div patterns
+        if not article:
+            article = soup.find('div', {'id': re.compile('article|content|story|main', re.I)})
+
+        # Method 4: The Hindu specific
+        if 'thehindu.com' in url:
+            article = soup.find('div', {'id': 'content-body'}) or soup.find(class_='article')
+
+        if article:
+            # Get text from paragraphs
+            paragraphs = article.find_all(['p', 'div'], class_=lambda x: x and not any(c in str(x).lower() for c in [
+                'caption', 'footer', 'social', 'meta', 'share', 'related', 'ad', 'copyright'
+            ]))
         else:
-            # Limit number of paragraphs if no main content found
-            paragraphs = soup.find_all('p', limit=20)
+            # Fallback: get all paragraphs
+            paragraphs = soup.find_all('p', recursive=True)
+
+        # Extract and clean text
+        texts = []
+        for p in paragraphs:
+            text = p.get_text().strip()
+            if len(text) > 40 and not any(x in text.lower() for x in ['advertisement', 'subscribe', 'subscription']):
+                texts.append(clean_text(text))
 
-        # Extract text from paragraphs with minimum length requirement
-        article_text = ' '.join([p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > 40])
+        article_text = ' '.join(texts)
 
-        # Limit total text length
-        return article_text[:5000]
+        # If still no text, try getting all text from body
+        if not article_text:
+            body = soup.find('body')
+            if body:
+                article_text = clean_text(body.get_text())
+
+        # Limit total text length but ensure it's not too short
+        if len(article_text) < 100:
+            raise Exception("Could not find enough article content")
+
+        return article_text[:8000]  # Limit to 8000 characters
 
     except Exception as e:
-        raise Exception(f"Error fetching article: {str(e)}")
+        print(f"Error in extract_article_text: {str(e)}")
+        raise Exception(f"Error extracting article: {str(e)}")
 
 def extract_and_summarize(url, progress=gr.Progress()):
     if not url or not url.strip():
@@ -91,22 +136,19 @@ def extract_and_summarize(url, progress=gr.Progress()):
 
         progress(0.4, desc="Processing text...")
         # Split text into smaller chunks
-        max_chunk_length = 512  # Reduced chunk size
+        max_chunk_length = 512
         chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]
-        
-        # Limit number of chunks
         chunks = chunks[:3]  # Process at most 3 chunks
 
         progress(0.6, desc="Generating summary...")
-        # Summarize each chunk with shorter output
         summaries = []
        for chunk in chunks:
-            if len(chunk.strip()) > 50:  # Reduced minimum length requirement
+            if len(chunk.strip()) > 50:
                 try:
                     summary = summarizer(
                         chunk,
-                        max_length=100,  # Reduced max length
-                        min_length=20,  # Reduced min length
+                        max_length=100,
+                        min_length=20,
                         do_sample=False
                     )
                     summaries.append(summary[0]['summary_text'])
@@ -117,11 +159,9 @@ def extract_and_summarize(url, progress=gr.Progress()):
         if not summaries:
             return "Could not generate summary. Please try a different article."
 
-        # Combine summaries
         final_summary = " ".join(summaries)
-        
-        # Add processing time information
         processing_time = round(time.time() - start_time, 2)
+
         return f"Summary (processed in {processing_time}s):\n\n{final_summary}"
 
     except Exception as e:
@@ -140,10 +180,17 @@ demo = gr.Interface(
     description="""
     This app quickly summarizes news articles using AI.
     Simply paste a URL and get a concise summary in seconds!
+
+    Supported news sites include:
+    - BBC News
+    - Reuters
+    - The Hindu
+    - And many more!
     """,
     examples=[
         ["https://www.bbc.com/news/world-us-canada-67841980"],
-        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"]
+        ["https://www.reuters.com/technology/exclusive-openai-researchers-warned-board-ai-breakthrough-ahead-sam-altman-ouster-2023-11-22/"],
+        ["https://www.thehindu.com/news/cities/mumbai/mumbai-boat-accident-body-of-missing-boy-found-off-mumbai-coast-toll-rises-to-15/article69012138.ece"]
     ],
     theme=gr.themes.Soft()
 )
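
The new clean_text() helper is easy to sanity-check on its own. Below is a minimal sketch that copies the two regex passes from this commit and applies them to a made-up string; the sample text, and running the helper standalone instead of importing app.py, are illustrative only:

import re

def clean_text(text):
    # Same normalization as the committed helper: collapse runs of whitespace,
    # then drop characters outside word characters and basic punctuation
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    return text.strip()

sample = "Breaking:\n\n  Markets  rallied* today, officials said (again)!"
print(clean_text(sample))
# Breaking Markets rallied today, officials said again!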
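
Once the Space is running, the same endpoint the UI uses can also be called programmatically with gradio_client. A sketch, assuming a hypothetical Space id (the real id is not part of this commit) and the default /predict endpoint that gr.Interface exposes:

from gradio_client import Client

# "sohail-shaikh-s07/news-summarizer" is a placeholder Space id for illustration
client = Client("sohail-shaikh-s07/news-summarizer")

summary = client.predict(
    "https://www.bbc.com/news/world-us-canada-67841980",  # example URL from the demo
    api_name="/predict",
)
print(summary)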