from smolagents import Tool
from typing import Optional

class SimpleTool(Tool):
    name = "analyze_content"
    description = "Enhanced web content analyzer with multiple analysis modes."
    inputs = {
        "input_text": {"type": "string", "description": "URL or direct text to analyze."},
        "mode": {"type": "string", "nullable": True, "description": "Analysis mode ('analyze', 'summarize', 'sentiment', 'topics')."},
    }
    output_type = "string"

    def forward(self, input_text: str, mode: Optional[str] = "analyze") -> str:
        """Enhanced web content analyzer with multiple analysis modes.

        Args:
            input_text: URL or direct text to analyze.
            mode: Analysis mode ('analyze', 'summarize', 'sentiment', 'topics').

        Returns:
            str: JSON-formatted analysis results
        """
        import requests
        from bs4 import BeautifulSoup
        import re
        from transformers import pipeline
        import json
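        # Heavy dependencies are imported lazily here, inside forward(), so
        # that importing the module itself stays cheap; only actually using
        # the tool pays the cost of loading requests/bs4/transformers.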

        try:
            # Fall back to the default when the caller passes mode=None
            # (the "mode" input is declared nullable in the schema above)
            mode = mode or "analyze"

            # Browser-like User-Agent so simple bot checks don't reject us
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

            # Process input: fetch and strip a web page, or use raw text
            if input_text.startswith(('http://', 'https://')):
                response = requests.get(input_text, headers=headers, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Clean page content
                for tag in soup(['script', 'style', 'meta']):
                    tag.decompose()

                title = soup.title.string.strip() if soup.title and soup.title.string else "No title found"
                content = soup.get_text()
            else:
                title = "Text Analysis"
                content = input_text

            # Normalize whitespace but keep single newlines, so paragraph
            # boundaries survive for the stats and per-section sentiment
            # modes below (collapsing all whitespace would destroy them)
            clean_text = re.sub(r'[ \t]+', ' ', content)
            clean_text = re.sub(r'\s*\n\s*', '\n', clean_text).strip()

            if len(clean_text) < 100:
                return json.dumps({
                    "status": "error",
                    "message": "Content too short for analysis (minimum 100 characters)"
                })

            # Initialize models
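            # (Pipelines are re-created on every call; for repeated use you
            # could cache them as class attributes to avoid reloading weights.)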
            summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
            classifier = pipeline(
                "text-classification",
                model="nlptown/bert-base-multilingual-uncased-sentiment"
            )

            # Basic stats
            word_count = len(clean_text.split())
            stats = {
                "title": title,
                "characters": len(clean_text),
                "words": word_count,
                "paragraphs": len([p for p in clean_text.split("\n") if p.strip()]),
                "reading_time": f"{max(1, word_count // 200)} minutes"  # ~200 wpm
            }

            result = {"status": "success", "stats": stats}
            # Shared label mapping for the classifier's 1-5 star ratings
            sentiment_labels = ["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"]

            # Mode-specific processing
            if mode == "analyze":
                # Get summary
                summary = summarizer(clean_text[:1024], max_length=100, min_length=30)[0]['summary_text']

                # Get overall sentiment; nlptown labels look like "4 stars",
                # so the leading digit is the 1-5 star rating
                sentiment = classifier(clean_text[:512])[0]
                score = int(sentiment['label'][0])
                sentiment_text = sentiment_labels[score - 1]

                result.update({
                    "summary": summary,
                    "sentiment": {
                        "overall": sentiment_text,
                        "score": score,
                        # the model's own probability, not the star rating
                        "confidence": f"{sentiment['score'] * 100:.1f}%"
                    }
                })

            elif mode == "sentiment":
                # Analyze paragraphs
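                # (only the first five paragraphs longer than 50 characters
                # are scored, to keep latency bounded)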
                paragraphs = [p for p in clean_text.split("\n") if len(p.strip()) > 50]
                sentiments = []

                for i, para in enumerate(paragraphs[:5]):
                    sent = classifier(para[:512])[0]
                    score = int(sent['label'][0])
                    sentiments.append({
                        "section": i + 1,
                        "text": para[:100] + ("..." if len(para) > 100 else ""),
                        "sentiment": sentiment_labels[score - 1],
                        "score": score
                    })

                result.update({
                    "sentiment_analysis": {
                        "sections": sentiments,
                        "total_sections": len(sentiments)
                    }
                })

            elif mode == "summarize":
                # Process in chunks
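                # (1,024-character chunks are a conservative proxy for the
                # summarizer's ~1,024-token input limit; at most three chunks,
                # i.e. the first ~3 KB of text, are summarized)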
                chunks = [clean_text[i:i+1024] for i in range(0, min(len(clean_text), 3072), 1024)]
                summaries = []

                for chunk in chunks:
                    if len(chunk) > 100:
                        summary = summarizer(chunk, max_length=100, min_length=30)[0]['summary_text']
                        summaries.append(summary)

                result.update({
                    "summaries": summaries,
                    "chunks_analyzed": len(summaries)
                })

            elif mode == "topics":
                # Keyword-stem topic categorization; stems intentionally match
                # prefixes (e.g. "tech" also counts "technology"), while short
                # stems get word boundaries so "ai" doesn't match "said".
                # Patterns are lowercase to match the lowercased text.
                categories = {
                    "Technology": r"tech|software|hardware|digital|computer|\bai\b|data",
                    "Business": r"business|market|finance|economy|industry",
                    "Science": r"science|research|study|discovery",
                    "Health": r"health|medical|medicine|wellness",
                    "General": r"news|world|people|life"
                }

                topic_scores = {}
                for topic, pattern in categories.items():
                    topic_scores[topic] = len(re.findall(pattern, clean_text.lower()))

                result.update({
                    "topic_analysis": {
                        "detected_topics": topic_scores,
                        "primary_topic": max(topic_scores, key=topic_scores.get)
                        if any(topic_scores.values()) else "none detected"
                    }
                })

            return json.dumps(result, indent=2)

        except requests.exceptions.RequestException as e:
            return json.dumps({
                "status": "error",
                "message": f"Failed to fetch content: {str(e)}",
                "type": "request_error"
            })
        except Exception as e:
            return json.dumps({
                "status": "error",
                "message": f"Analysis failed: {str(e)}",
                "type": "general_error"
            })
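

# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the generated tool): shows
# how SimpleTool might be exercised directly. Running it downloads the two
# Hugging Face models on first use, so the first call is slow.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    tool = SimpleTool()

    # Repeated sample text, long enough to pass the 100-character minimum
    sample_text = (
        "Researchers reported a new software technique for compressing "
        "neural networks. The study suggests the market for efficient "
        "digital hardware will keep growing as online services expand."
    ) * 3

    # Default mode gives stats, a summary, and overall sentiment
    print(tool.forward(sample_text))

    # Other modes: "summarize", "sentiment", "topics"
    print(tool.forward(sample_text, mode="topics"))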