MHamdan committed
Commit 5215be1 · 1 Parent(s): dbf46ef

Initial content analyzer setup
Files changed (4)
  1. README.md +20 -9
  2. app.py +204 -0
  3. deploy_to_hf.py +113 -0
  4. requirements.txt +8 -0
README.md CHANGED
@@ -1,14 +1,25 @@
- ---
- title: ContentAnalyzer
- emoji: 📉
- colorFrom: red
- colorTo: purple
+ ---
+ title: Content Analyzer
+ emoji: 📑
+ colorFrom: blue
+ colorTo: indigo
  sdk: gradio
- sdk_version: 5.16.0
+ sdk_version: 4.0.0
  app_file: app.py
  pinned: false
- license: apache-2.0
- short_description: general content analyzer
+
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Content Analyzer
+
+ An advanced content analysis tool that can process:
+
+ - Text input
+ - Web URLs
+ - Document files (.txt, .pdf, .docx)
+
+ ## Features
+
+ - Text summarization
+ - Sentiment analysis
+ - Topic detection
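The three README features map onto standard transformers pipelines; a minimal sketch of that mapping (the model choices mirror app.py below, and the sample text is illustrative):

```python
# Sketch of the three analyses the README lists, using the same
# pipelines that app.py below constructs. Sample text is illustrative.
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
sentiment = pipeline("sentiment-analysis")
zero_shot = pipeline("zero-shot-classification")

text = ("Content analysis tools combine summarization, sentiment analysis, "
        "and topic detection to turn long documents into quick insights.")
print(summarizer(text, max_length=40, min_length=10)[0]["summary_text"])
print(sentiment(text)[0])
print(zero_shot(text, candidate_labels=["technology", "sports", "health"]))
```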
app.py ADDED
@@ -0,0 +1,204 @@
+ # app.py
+ import gradio as gr
+ import requests
+ from bs4 import BeautifulSoup
+ from transformers import pipeline
+ import PyPDF2
+ import docx
+ import os
+ from typing import List, Optional
+
+ class ContentAnalyzer:
+     def __init__(self):
+         # Initialize models
+         self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+         self.sentiment_analyzer = pipeline("sentiment-analysis")
+         self.zero_shot = pipeline("zero-shot-classification")
+
+     def read_file(self, file_obj) -> str:
+         """Read content from different file types."""
+         if file_obj is None:
+             return ""
+
+         # Gradio 4's gr.File passes a file path string by default;
+         # older versions pass a tempfile wrapper with a .name attribute.
+         file_path = file_obj if isinstance(file_obj, str) else file_obj.name
+         file_ext = os.path.splitext(file_path)[1].lower()
+
+         try:
+             if file_ext == '.txt':
+                 with open(file_path, 'r', encoding='utf-8') as f:
+                     return f.read()
+
+             elif file_ext == '.pdf':
+                 with open(file_path, 'rb') as f:
+                     pdf_reader = PyPDF2.PdfReader(f)
+                     text = ""
+                     for page in pdf_reader.pages:
+                         text += page.extract_text() + "\n"
+                 return text
+
+             elif file_ext == '.docx':
+                 doc = docx.Document(file_path)
+                 return "\n".join([paragraph.text for paragraph in doc.paragraphs])
+
+             else:
+                 return f"Unsupported file type: {file_ext}"
+
+         except Exception as e:
+             return f"Error reading file: {str(e)}"
+
+     def fetch_web_content(self, url: str) -> str:
+         """Fetch content from URL."""
+         try:
+             response = requests.get(url, timeout=10)
+             response.raise_for_status()
+             soup = BeautifulSoup(response.text, 'html.parser')
+
+             # Remove scripts and styles
+             for script in soup(["script", "style"]):
+                 script.decompose()
+
+             text = soup.get_text(separator='\n')
+             lines = (line.strip() for line in text.splitlines())
+             return "\n".join(line for line in lines if line)
+
+         except Exception as e:
+             return f"Error fetching URL: {str(e)}"
+
+     def analyze_content(self,
+                         text: Optional[str] = None,
+                         url: Optional[str] = None,
+                         file: Optional[object] = None,
+                         analysis_types: List[str] = ["summarize"]) -> dict:
+         """Analyze content from text, URL, or file."""
+         try:
+             # Get content from appropriate source
+             if url:
+                 content = self.fetch_web_content(url)
+             elif file:
+                 content = self.read_file(file)
+             else:
+                 content = text or ""
+
+             if not content or content.startswith("Error"):
+                 return {"error": content or "No content provided"}
+
+             results = {
+                 "original_text": content[:1000] + "..." if len(content) > 1000 else content
+             }
+
+             # Perform requested analyses
+             if "summarize" in analysis_types:
+                 summary = self.summarizer(content[:1024], max_length=130, min_length=30)
+                 results["summary"] = summary[0]['summary_text']
+
+             if "sentiment" in analysis_types:
+                 sentiment = self.sentiment_analyzer(content[:512])
+                 results["sentiment"] = {
+                     "label": sentiment[0]['label'],
+                     "score": round(sentiment[0]['score'], 3)
+                 }
+
+             if "topics" in analysis_types:
+                 topics = self.zero_shot(
+                     content[:512],
+                     candidate_labels=["technology", "science", "business",
+                                       "politics", "entertainment", "education",
+                                       "health", "sports"]
+                 )
+                 results["topics"] = [
+                     {"label": label, "score": round(score, 3)}
+                     for label, score in zip(topics['labels'], topics['scores'])
+                     if score > 0.1
+                 ]
+
+             return results
+
+         except Exception as e:
+             return {"error": f"Analysis error: {str(e)}"}
+
+ def create_interface():
+     analyzer = ContentAnalyzer()
+
+     with gr.Blocks(title="Content Analyzer") as demo:
+         gr.Markdown("# 📑 Content Analyzer")
+         gr.Markdown("Analyze text content from various sources using AI.")
+
+         with gr.Tabs():
+             # Text Input Tab
+             with gr.Tab("Text Input"):
+                 text_input = gr.Textbox(
+                     label="Enter Text",
+                     placeholder="Paste your text here...",
+                     lines=5
+                 )
+
+             # URL Input Tab
+             with gr.Tab("Web URL"):
+                 url_input = gr.Textbox(
+                     label="Enter URL",
+                     placeholder="https://example.com"
+                 )
+
+             # File Upload Tab
+             with gr.Tab("File Upload"):
+                 file_input = gr.File(
+                     label="Upload File",
+                     file_types=[".txt", ".pdf", ".docx"]
+                 )
+
+         # Analysis Options
+         analysis_types = gr.CheckboxGroup(
+             choices=["summarize", "sentiment", "topics"],
+             value=["summarize"],
+             label="Analysis Types"
+         )
+
+         analyze_btn = gr.Button("Analyze", variant="primary")
+
+         # Output Sections
+         with gr.Tabs():
+             with gr.Tab("Original Text"):
+                 original_text = gr.Markdown()
+             with gr.Tab("Summary"):
+                 summary_output = gr.Markdown()
+             with gr.Tab("Sentiment"):
+                 sentiment_output = gr.Markdown()
+             with gr.Tab("Topics"):
+                 topics_output = gr.Markdown()
+
+         def process_analysis(text, url, file, types):
+             # Get analysis results
+             results = analyzer.analyze_content(text, url, file, types)
+
+             if "error" in results:
+                 return results["error"], "", "", ""
+
+             # Format outputs
+             original = results.get("original_text", "")
+             summary = results.get("summary", "")
+
+             sentiment = ""
+             if "sentiment" in results:
+                 sent = results["sentiment"]
+                 sentiment = f"**Sentiment:** {sent['label']} (Confidence: {sent['score']})"
+
+             topics = ""
+             if "topics" in results:
+                 topics = "**Detected Topics:**\n" + "\n".join([
+                     f"- {t['label']}: {t['score']}"
+                     for t in results["topics"]
+                 ])
+
+             return original, summary, sentiment, topics
+
+         # Connect the interface
+         analyze_btn.click(
+             fn=process_analysis,
+             inputs=[text_input, url_input, file_input, analysis_types],
+             outputs=[original_text, summary_output, sentiment_output, topics_output]
+         )
+
+     return demo
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo = create_interface()
+     demo.launch()
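Since the analysis logic lives in a plain class, it can be smoke-tested without launching the UI; a minimal sketch, assuming app.py is on the import path and the models download on first use:

```python
# Smoke test for ContentAnalyzer without starting the Gradio server.
# Assumes app.py is importable; the first run downloads the models.
from app import ContentAnalyzer

analyzer = ContentAnalyzer()
results = analyzer.analyze_content(
    text=("Gradio Spaces make it simple to publish small machine learning "
          "demos, and transformers pipelines supply the underlying models."),
    analysis_types=["summarize", "sentiment", "topics"],
)
for key in ("summary", "sentiment", "topics"):
    print(key, "->", results.get(key))
```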
deploy_to_hf.py ADDED
@@ -0,0 +1,113 @@
+ # deploy_to_hf.py
+
+ import os
+ import requests
+
+ # Your Hugging Face token
+ HF_TOKEN = os.environ.get("HF_REPO_API")
+ headers = {
+     "Authorization": f"Bearer {HF_TOKEN}",
+     "Content-Type": "application/json"
+ }
+
+ # The main app content (from your previous app.py)
+ app_content = """
+ import gradio as gr
+ import requests
+ from bs4 import BeautifulSoup
+ from transformers import pipeline
+ import PyPDF2
+ import docx
+ import os
+ from typing import List, Tuple, Optional
+
+ class ContentAnalyzer:
+     def __init__(self):
+         # Initialize models
+         self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+         self.sentiment_analyzer = pipeline("sentiment-analysis")
+         self.zero_shot = pipeline("zero-shot-classification")
+
+     def read_file(self, file_obj) -> str:
+         # ... [rest of your ContentAnalyzer class code]
+         pass
+
+ # ... [rest of your app.py code]
+ """
+
+ def commit_files_to_space():
+     # Prepare files content
+     files = {
+         'app.py': app_content,
+         'requirements.txt': """gradio>=4.0.0
+ requests>=2.31.0
+ beautifulsoup4>=4.12.2
+ transformers>=4.35.0
+ torch>=2.0.1
+ PyPDF2>=3.0.0
+ python-docx>=0.8.11
+ smolagents>=0.2.0""",
+         'README.md': """---
+ title: Content Analyzer
+ emoji: 📑
+ colorFrom: blue
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 4.0.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ # Content Analyzer
+
+ An advanced content analysis tool that can process:
+ - Text input
+ - Web URLs
+ - Document files (.txt, .pdf, .docx)
+
+ ## Features
+ - Text summarization
+ - Sentiment analysis
+ - Topic detection
+ """
+     }
+
+     # Commit each file
+     commit_url = "https://huggingface.co/api/spaces/MHamdan/ContentAnalyzer/commit"
+
+     operations = []
+     for filename, content in files.items():
+         operations.append({
+             "operation": "create",
+             "path": filename,
+             "content": content
+         })
+
+     commit_data = {
+         "operations": operations,
+         "commit_message": "Initial content analyzer setup"
+     }
+
+     response = requests.post(
+         commit_url,
+         headers=headers,
+         json=commit_data
+     )
+
+     if response.status_code == 200:
+         print("Files committed successfully!")
+         print("You can view your space at: https://huggingface.co/spaces/MHamdan/ContentAnalyzer")
+     else:
+         print("Error committing files:", response.text)
+         print("Status code:", response.status_code)
+
+ if __name__ == "__main__":
+     # Verify authentication first
+     auth_response = requests.get("https://huggingface.co/api/whoami-v2", headers=headers)
+     if auth_response.status_code == 200:
+         print("Authentication successful!")
+         commit_files_to_space()
+     else:
+         print("Authentication failed. Please check your token.")
+         print("Status code:", auth_response.status_code)
+         print("Response:", auth_response.text)
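One caveat: the Hub's raw commit endpoint expects an NDJSON payload (and a revision segment in the URL), so the hand-rolled JSON request above may be rejected. A sketch of the same single-commit deploy through the official huggingface_hub client, assuming a write-scoped token in HF_TOKEN, an existing Space, and local copies of the three files:

```python
# Equivalent deploy via huggingface_hub, which wraps the commit API.
# Assumptions: HF_TOKEN is a write-scoped token; the Space already
# exists; app.py, requirements.txt, README.md sit in the working dir.
import os
from huggingface_hub import HfApi, CommitOperationAdd

api = HfApi(token=os.environ.get("HF_TOKEN"))
api.create_commit(
    repo_id="MHamdan/ContentAnalyzer",
    repo_type="space",
    operations=[
        CommitOperationAdd(path_in_repo=p, path_or_fileobj=p)
        for p in ["app.py", "requirements.txt", "README.md"]
    ],
    commit_message="Initial content analyzer setup",
)
print("View at: https://huggingface.co/spaces/MHamdan/ContentAnalyzer")
```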
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio>=4.0.0
+ requests>=2.31.0
+ beautifulsoup4>=4.12.2
+ transformers>=4.35.0
+ torch>=2.0.1
+ PyPDF2>=3.0.0
+ python-docx>=0.8.11
+ smolagents>=0.2.0
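A quick sanity check that an environment satisfies these pins, using only the standard library (the listed names are the distribution names pip installs):

```python
# Verify the pinned distributions are installed and print their versions.
from importlib.metadata import version, PackageNotFoundError

pins = ["gradio", "requests", "beautifulsoup4", "transformers",
        "torch", "PyPDF2", "python-docx", "smolagents"]
for name in pins:
    try:
        print(f"{name}=={version(name)}")
    except PackageNotFoundError:
        print(f"{name}: not installed")
```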