MHamdan committed
Commit 352d285 · 1 Parent(s): 2f05074

Initial commit with full functionality extended

Files changed (2):
  1. requirements.txt +2 -1
  2. smart_web_analyzer.py +221 -29
requirements.txt CHANGED
@@ -3,4 +3,5 @@ gradio>=4.0.0
 beautifulsoup4>=4.12.0
 requests>=2.31.0
 transformers>=4.40.0
-torch>=2.2.0
+torch>=2.2.0
+requests
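
For a quick environment check, here is a minimal sketch (not part of the commit) that verifies the pinned dependencies resolve; it assumes Python 3.8+, where importlib.metadata is in the standard library:

# Hypothetical check that each pinned requirement is installed -- not part of this commit.
import importlib.metadata as md

for pkg in ("gradio", "beautifulsoup4", "requests", "transformers", "torch"):
    try:
        print(f"{pkg}: {md.version(pkg)}")  # installed version, if any
    except md.PackageNotFoundError:
        print(f"{pkg}: not installed")

Note that the new unversioned `requests` line sits alongside the existing `requests>=2.31.0` pin; pip resolves both specifiers together, so the duplicate entry is redundant but harmless.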
smart_web_analyzer.py CHANGED
@@ -3,52 +3,244 @@ import requests
 from bs4 import BeautifulSoup
 from transformers import pipeline
 import torch
+from typing import Dict, List, Optional
+import logging
+from functools import lru_cache
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 class WebAnalyzer:
     def __init__(self):
         self.device = 0 if torch.cuda.is_available() else -1
-        self.models = {
-            'summarize': pipeline("summarization", model="facebook/bart-large-cnn"),
-            'sentiment': pipeline("text-classification",
-                                  model="nlptown/bert-base-multilingual-uncased-sentiment"),
-            'topics': pipeline("zero-shot-classification",
-                               model="facebook/bart-large-mnli")
+        self._models: Dict[str, Optional[pipeline]] = {
+            'summarize': None,
+            'sentiment': None,
+            'topics': None
         }
+
+    def _load_model(self, model_type: str) -> None:
+        """Lazy load models only when needed"""
+        if self._models[model_type] is None:
+            logger.info(f"Loading {model_type} model...")
+            if model_type == 'summarize':
+                self._models[model_type] = pipeline(
+                    "summarization",
+                    model="facebook/bart-large-cnn",
+                    device=self.device
+                )
+            elif model_type == 'sentiment':
+                self._models[model_type] = pipeline(
+                    "text-classification",
+                    model="nlptown/bert-base-multilingual-uncased-sentiment",
+                    device=self.device
+                )
+            elif model_type == 'topics':
+                self._models[model_type] = pipeline(
+                    "zero-shot-classification",
+                    model="facebook/bart-large-mnli",
+                    device=self.device
+                )
 
+    @lru_cache(maxsize=100)
     def fetch_content(self, url: str) -> str:
-        """Fetch webpage content with custom headers"""
-        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
-        response = requests.get(url, headers=headers, timeout=15)
-        response.raise_for_status()
-        return response.text
+        """Fetch webpage content with caching and better error handling"""
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9',
+            'Accept-Language': 'en-US,en;q=0.5'
+        }
+        try:
+            response = requests.get(url, headers=headers, timeout=15)
+            response.raise_for_status()
+            return response.text
+        except requests.RequestException as e:
+            logger.error(f"Error fetching URL {url}: {str(e)}")
+            raise ValueError(f"Failed to fetch content: {str(e)}")
 
     def clean_html(self, html: str) -> str:
-        """Basic HTML cleaning preserving all tags"""
+        """Extract readable text content from HTML"""
         soup = BeautifulSoup(html, 'html.parser')
-        return soup.prettify()
+
+        # Remove script and style elements
+        for script in soup(["script", "style", "meta", "noscript"]):
+            script.decompose()
+
+        # Extract text while preserving some structure
+        text = soup.get_text(separator='\n', strip=True)
+
+        # Clean up whitespace
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        text = '\n'.join(chunk for chunk in chunks if chunk)
+
+        return text
 
-    def analyze(self, url: str, modes: list) -> dict:
-        """Core analysis pipeline"""
+    def analyze(self, url: str, modes: List[str]) -> Dict:
+        """Improved analysis pipeline with better error handling"""
         results = {}
+
         try:
+            # Fetch and clean content
             html = self.fetch_content(url)
-            results['clean_text'] = self.clean_html(html)
+            cleaned_text = self.clean_html(html)
+            results['clean_text'] = cleaned_text
 
-            if 'summarize' in modes:
-                results['summary'] = self.models['summarize'](html, max_length=150)[0]['summary_text']
-
-            if 'sentiment' in modes:
-                sentiment = self.models['sentiment'](html[:512])[0]
-                results['sentiment'] = f"{sentiment['label']} ({sentiment['score']:.2f})"
+            # Validate text length
+            if len(cleaned_text.split()) < 10:
+                raise ValueError("Insufficient text content found on page")
+
+            # Text chunks for different models
+            summary_text = cleaned_text[:2048]  # BART limit
+            classification_text = cleaned_text[:512]  # BERT limit
+
+            for mode in modes:
+                if mode not in self._models:
+                    continue
+
+                self._load_model(mode)
 
-            if 'topics' in modes:
-                topics = self.models['topics'](html[:512],
-                                               candidate_labels=["Technology", "AI", "Business",
-                                                                 "Science", "Politics"])
-                results['topics'] = {topic: score for topic, score
-                                     in zip(topics['labels'], topics['scores'])}
+                if mode == 'summarize':
+                    summary = self._models[mode](summary_text,
+                                                 max_length=150,
+                                                 min_length=30,
+                                                 do_sample=False)[0]['summary_text']
+                    results['summary'] = summary
+
+                elif mode == 'sentiment':
+                    sentiment = self._models[mode](classification_text)[0]
+                    results['sentiment'] = f"{sentiment['label']} ({sentiment['score']:.2f})"
+
+                elif mode == 'topics':
+                    topics = self._models[mode](
+                        classification_text,
+                        candidate_labels=[
+                            "Technology", "Artificial Intelligence",
+                            "Business", "Science", "Politics",
+                            "Health", "Environment", "Education"
+                        ]
+                    )
+                    results['topics'] = {
+                        topic: score
+                        for topic, score in zip(topics['labels'], topics['scores'])
+                        if score > 0.1  # Filter low confidence topics
+                    }
 
         except Exception as e:
+            logger.error(f"Analysis error: {str(e)}")
             results['error'] = str(e)
 
-        return results
+        return results
+
+# app.py
+import gradio as gr
+from smart_web_analyzer import WebAnalyzer
+
+analyzer = WebAnalyzer()
+
+def format_results(results: Dict) -> Dict:
+    """Format analysis results for Gradio tabs"""
+    outputs = {}
+
+    if 'error' in results:
+        return {
+            "📜 Clean Text": f"❌ Error: {results['error']}",
+            "📝 Summary": "",
+            "🎭 Sentiment": "",
+            "📊 Topics": ""
+        }
+
+    # Clean text tab
+    text_preview = results.get('clean_text', 'No text extracted')
+    if len(text_preview) > 1000:
+        text_preview = text_preview[:1000] + "...(truncated)"
+    outputs["📜 Clean Text"] = text_preview
+
+    # Summary tab
+    if 'summary' in results:
+        outputs["📝 Summary"] = f"**AI Summary:**\n{results['summary']}"
+    else:
+        outputs["📝 Summary"] = ""
+
+    # Sentiment tab
+    if 'sentiment' in results:
+        outputs["🎭 Sentiment"] = f"**Sentiment Analysis:**\n{results['sentiment']}"
+    else:
+        outputs["🎭 Sentiment"] = ""
+
+    # Topics tab
+    if 'topics' in results:
+        topics = "\n".join([
+            f"- **{k}**: {v:.1%}"
+            for k, v in sorted(results['topics'].items(),
+                               key=lambda x: x[1], reverse=True)
+        ])
+        outputs["📊 Topics"] = f"**Detected Topics:**\n{topics}"
+    else:
+        outputs["📊 Topics"] = ""
+
+    return outputs
+
+with gr.Blocks(title="Smart Web Analyzer Plus") as demo:
+    gr.Markdown("# 🌐 Smart Web Analyzer Plus")
+    gr.Markdown("Analyze web content with AI - extract summaries, sentiment, and topics.")
+
+    with gr.Row():
+        with gr.Column(scale=4):
+            url_input = gr.Textbox(
+                label="Enter URL",
+                placeholder="https://example.com",
+                show_label=True
+            )
+        with gr.Column(scale=2):
+            modes = gr.CheckboxGroup(
+                ["summarize", "sentiment", "topics"],
+                label="Analysis Types",
+                value=["summarize"]  # Default selection
+            )
+        with gr.Column(scale=1):
+            submit_btn = gr.Button("Analyze", variant="primary")
+
+    with gr.Tabs() as tabs:
+        text_tab = gr.Tab("📜 Clean Text")
+        with text_tab:
+            clean_text = gr.Markdown()
+
+        summary_tab = gr.Tab("📝 Summary")
+        with summary_tab:
+            summary = gr.Markdown()
+
+        sentiment_tab = gr.Tab("🎭 Sentiment")
+        with sentiment_tab:
+            sentiment = gr.Markdown()
+
+        topics_tab = gr.Tab("📊 Topics")
+        with topics_tab:
+            topics = gr.Markdown()
+
+    # Example URLs
+    examples = gr.Examples(
+        examples=[
+            ["https://www.bbc.com/news/technology-67881954", ["summarize", "sentiment"]],
+            ["https://arxiv.org/html/2312.17296v1", ["topics", "summarize"]]
+        ],
+        inputs=[url_input, modes]
+    )
+
+    # Handle submission
+    submit_btn.click(
+        fn=lambda url, m: format_results(analyzer.analyze(url, m)),
+        inputs=[url_input, modes],
+        outputs=[clean_text, summary, sentiment, topics],
+        api_name="analyze"
+    )
+
+    # Error handling for empty URL
+    url_input.change(
+        fn=lambda x: gr.update(interactive=bool(x.strip())),
+        inputs=[url_input],
+        outputs=[submit_btn]
+    )
+
+if __name__ == "__main__":
+    demo.launch()
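
As a smoke test for the refactored analyzer, a minimal sketch (assuming the module is importable as smart_web_analyzer and the Hugging Face models can be downloaded; the URL and mode list below are illustrative, taken from the app's examples):

# Hypothetical smoke test -- not part of this commit.
# Exercises the lazy-loading WebAnalyzer end to end without the Gradio UI.
from smart_web_analyzer import WebAnalyzer

analyzer = WebAnalyzer()
results = analyzer.analyze("https://www.bbc.com/news/technology-67881954",
                           ["summarize", "sentiment"])

if "error" in results:
    print("Analysis failed:", results["error"])  # fetch, parse, or model failure
else:
    print("Summary:", results["summary"])
    print("Sentiment:", results["sentiment"])

Because models are loaded on first use, the first analyze call pays the model download and load cost; later calls reuse the cached pipelines, and the lru_cache on fetch_content avoids re-downloading a repeated URL.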