Update app.py
app.py
CHANGED
@@ -1,307 +1,532 @@
[Previous 307-line revision removed: it began with an `# analyzers.py` header and defined the same TextAnalyzer, SentimentAnalyzer, CommentExtractor, and StatsCalculator classes, but with flat positive/negative indicator lists, tuple-valued extract_data returns, and a smaller Gradio interface. The full replacement file follows.]
import re
import emoji
import statistics
from collections import Counter
from typing import Dict, List, Tuple, Optional, Set, Union
import logging
from pathlib import Path
from datetime import datetime
import csv
from io import StringIO  # required by generate_reports below
from dataclasses import dataclass, asdict
from enum import Enum
import numpy as np

# Configure logging
log_dir = Path("logs")
log_dir.mkdir(exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_dir / f'analyzer_{datetime.now():%Y%m%d}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

class Sentiment(str, Enum):
    POSITIVE = 'positive'
    SLIGHTLY_POSITIVE = 'slightly_positive'
    NEUTRAL = 'neutral'
    SLIGHTLY_NEGATIVE = 'slightly_negative'
    NEGATIVE = 'negative'

@dataclass
class CommentData:
    username: str
    text: str
    likes: int
    weeks_ago: float
    sentiment: Sentiment

class TextAnalyzer:
    """Enhanced text analysis utilities"""

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse all runs of whitespace into single spaces"""
        return ' '.join(text.split())

    @staticmethod
    def count_emojis(text: str) -> int:
        """Count distinct emoji characters, using a set for deduplication"""
        return len({c for c in text if c in emoji.EMOJI_DATA})

    @staticmethod
    def extract_mentions(text: str) -> Set[str]:
        """Extract @-mentions, returning a set for uniqueness"""
        return set(re.findall(r'@[\w.]+', text))

    @staticmethod
    def get_words(text: str) -> List[str]:
        """Extract meaningful words (three or more characters)"""
        return re.findall(r'\b\w{3,}\b', text.lower())

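# Behaviour sketch for the helpers above (hypothetical inputs):
#   TextAnalyzer.clean_text("  a \n b ")           -> 'a b'
#   TextAnalyzer.count_emojis("🔥🔥👍")             -> 2 (distinct emoji characters)
#   TextAnalyzer.extract_mentions("hi @a and @a")  -> {'@a'}
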
class SentimentAnalyzer:
    """Enhanced sentiment analysis with gradual classification"""

    # Using sets for O(1) lookup
    INDICATORS = {
        'positive': {
            '🔥', '❤️', '👍', '😊', '💪', '👏', '🎉', '♥️', '😍', '🙏',
            'круто', 'супер', 'класс', 'огонь', 'пушка', 'отлично', 'здорово',
            'прекрасно', 'молодец', 'красота', 'спасибо', 'топ', 'лучший',
            'amazing', 'wonderful', 'great', 'perfect', 'love', 'beautiful'
        },
        'negative': {
            '👎', '😢', '😞', '😠', '😡', '💔', '😕', '😑',
            'плохо', 'ужас', 'отстой', 'фу', 'жесть', 'ужасно',
            'разочарован', 'печаль', 'грустно', 'bad', 'worst',
            'terrible', 'awful', 'sad', 'disappointed'
        }
    }

    @classmethod
    def analyze(cls, text: str) -> Sentiment:
        """
        Analyze text sentiment with enhanced granularity and emphasis handling
        """
        text_lower = text.lower()
        words = set(TextAnalyzer.get_words(text_lower))
        # Emoji indicators never survive get_words (it keeps only \w tokens),
        # so also match single-character indicators against the raw text.
        # Multi-codepoint sequences such as ❤️ may still be missed.
        tokens = words | set(text)

        pos_count = len(tokens & cls.INDICATORS['positive'])
        neg_count = len(tokens & cls.INDICATORS['negative'])

        # Calculate emphasis multiplier based on punctuation
        emphasis = min(text.count('!') * 0.2 + text.count('?') * 0.1, 1.0)

        # Apply emphasis to the dominant sentiment
        if pos_count > neg_count:
            pos_count *= (1 + emphasis)
        elif neg_count > pos_count:
            neg_count *= (1 + emphasis)

        # Determine sentiment with granularity
        total = pos_count + neg_count
        if total == 0:
            return Sentiment.NEUTRAL

        ratio = pos_count / total
        if ratio > 0.8:
            return Sentiment.POSITIVE
        elif ratio > 0.6:
            return Sentiment.SLIGHTLY_POSITIVE
        elif ratio < 0.2:
            return Sentiment.NEGATIVE
        elif ratio < 0.4:
            return Sentiment.SLIGHTLY_NEGATIVE
        return Sentiment.NEUTRAL

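# Classification sketch (hypothetical inputs, given the indicator sets above):
#   SentimentAnalyzer.analyze("Это огонь!")    -> Sentiment.POSITIVE
#   SentimentAnalyzer.analyze("Фото как фото") -> Sentiment.NEUTRAL
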
class CommentExtractor:
    """Enhanced comment data extraction"""

    class ParseError(Exception):
        """Custom exception for parsing errors"""
        pass

    # Optimized patterns with named groups
    PATTERNS = {
        'username': re.compile(r"""
            (?:
                Фото\sпрофиля\s(?P<name1>[^\n]+)|
                ^(?P<name2>[^\s]+)\s+|
                @(?P<name3>[^\s]+)\s+
            )
        """, re.VERBOSE),

        'time': re.compile(r"""
            (?P<value>\d+)\s*
            (?P<unit>(?:ч|нед|h|w|час|hour|week))\.?
        """, re.VERBOSE),

        'likes': re.compile(r"""
            (?:
                (?P<count1>\d+)\s*отметк[аи]\s\"Нравится\"|
                Нравится:\s*(?P<count2>\d+)|
                \"Нравится\":\s*(?P<count3>\d+)|
                likes?:\s*(?P<count4>\d+)
            )
        """, re.VERBOSE),

        'metadata': re.compile(r"""
            Фото\sпрофиля[^\n]+\n|
            \d+\s*(?:ч|нед|h|w|час|hour|week)\.?|
            (?:Нравится|likes?):\s*\d+|
            \d+\s*отметк[аи]\s\"Нравится\"|
            Ответить|
            Показать\sперевод|
            Скрыть\sвсе\sответы|
            Смотреть\sвсе\sответы\s\(\d+\)
        """, re.VERBOSE)
    }

    @classmethod
    def extract_data(cls, comment_text: str) -> Optional[CommentData]:
        """Extract comment data with improved error handling"""
        try:
            # Extract username
            username_match = cls.PATTERNS['username'].search(comment_text)
            if not username_match:
                raise cls.ParseError("Could not extract username")

            username = next(
                name for name in username_match.groups()
                if name is not None
            ).strip()

            # Clean comment text
            comment = cls.PATTERNS['metadata'].sub('', comment_text)
            comment = TextAnalyzer.clean_text(comment)

            # Extract time
            time_match = cls.PATTERNS['time'].search(comment_text)
            if not time_match:
                weeks = 0
            else:
                value = int(time_match.group('value'))
                unit = time_match.group('unit')
                weeks = value if unit in {'нед', 'w', 'week'} else value / (24 * 7)

            # Extract likes
            likes_match = cls.PATTERNS['likes'].search(comment_text)
            likes = next(
                (int(count) for count in likes_match.groups() if count),
                0
            ) if likes_match else 0

            # Analyze sentiment
            sentiment = SentimentAnalyzer.analyze(comment)

            return CommentData(
                username=username,
                text=comment,
                likes=likes,
                weeks_ago=weeks,
                sentiment=sentiment
            )

        except cls.ParseError as e:
            logger.warning(f"Failed to parse comment: {e}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error parsing comment: {e}", exc_info=True)
            return None

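# Usage sketch (hypothetical raw block shaped like the patterns above expect):
#   raw = 'Фото профиля ivan.petrov\nЭто огонь! 🔥\n2 нед. Нравится: 14 Ответить'
#   CommentExtractor.extract_data(raw)
#   # -> CommentData(username='ivan.petrov', text='Это огонь! 🔥', likes=14,
#   #                weeks_ago=2, sentiment=Sentiment.POSITIVE)
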
class StatsCalculator:
    """Enhanced statistics calculation"""

    @staticmethod
    def calculate_period_stats(comments: List[CommentData]) -> Dict:
        """Calculate statistics using quantile-based periods"""
        if not comments:
            return {}

        # Sort by weeks
        sorted_comments = sorted(comments, key=lambda x: x.weeks_ago)

        # Calculate period boundaries using quantiles
        weeks = [c.weeks_ago for c in sorted_comments]
        boundaries = np.quantile(weeks, [0.33, 0.67])

        # Group comments by period
        periods = {
            'early': [],
            'middle': [],
            'late': []
        }

        for comment in sorted_comments:
            if comment.weeks_ago <= boundaries[0]:
                periods['early'].append(comment)
            elif comment.weeks_ago <= boundaries[1]:
                periods['middle'].append(comment)
            else:
                periods['late'].append(comment)

        # Calculate statistics for each period
        # (period_comments avoids shadowing the `comments` parameter)
        return {
            period: {
                'comments': len(period_comments),
                'avg_likes': statistics.mean(c.likes for c in period_comments) if period_comments else 0,
                'sentiment_ratio': sum(
                    1 for c in period_comments
                    if c.sentiment in {Sentiment.POSITIVE, Sentiment.SLIGHTLY_POSITIVE}
                ) / len(period_comments) if period_comments else 0
            }
            for period, period_comments in periods.items()
        }

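# Boundary sketch (hypothetical data): weeks_ago values [1, 2, 3, 4, 5, 6]
# give np.quantile boundaries of roughly (2.65, 4.35), so the comments split
# into early={1, 2}, middle={3, 4}, late={5, 6}.
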
def analyze_post(
    content_type: str,
    link_to_post: str,
    post_likes: int,
    post_date: str,
    description: str,
    comment_count: int,
    all_comments: str
) -> Tuple[str, str, str, str, str]:
    """Enhanced post analysis with improved error handling and reporting"""
    try:
        # Split comments using optimized pattern
        comment_pattern = re.compile(
            r'(?=Фото профиля|\n\s*[a-zA-Z0-9._]+\s+|\b@[a-zA-Z0-9._]+\s+)',
            re.MULTILINE
        )
        comments_blocks = [
            block.strip() for block in comment_pattern.split(all_comments)
            if block and block.strip() and 'Скрыто алгоритмами Instagram' not in block
        ]

        # Extract and validate comment data
        comments_data = []
        for block in comments_blocks:
            if data := CommentExtractor.extract_data(block):
                comments_data.append(data)

        if not comments_data:
            logger.warning("No valid comments found in the input")
            return "No valid comments found", "", "", "", "0"

        # Calculate statistics
        basic_stats = {
            'total_comments': len(comments_data),
            'avg_length': statistics.mean(len(c.text) for c in comments_data),
            'median_length': statistics.median(len(c.text) for c in comments_data),
            'avg_words': statistics.mean(len(TextAnalyzer.get_words(c.text)) for c in comments_data),
            'total_likes': sum(c.likes for c in comments_data),
            'avg_likes': statistics.mean(c.likes for c in comments_data)
        }

        # Generate reports
        reports = generate_reports(
            content_type=content_type,
            link_to_post=link_to_post,
            post_likes=post_likes,
            comments_data=comments_data,
            basic_stats=basic_stats
        )

        return (
            reports['analytics'],
            "\n".join(c.username for c in comments_data),
            "\n".join(c.text for c in comments_data),
            "\n".join(str(c.likes) for c in comments_data),
            str(basic_stats['total_likes'])
        )

    except Exception as e:
        logger.error(f"Error analyzing post: {e}", exc_info=True)
        return f"Error analyzing post: {str(e)}", "", "", "", "0"

def generate_reports(
    content_type: str,
    link_to_post: str,
    post_likes: int,
    comments_data: List[CommentData],
    basic_stats: Dict
) -> Dict[str, str]:
    """Generate comprehensive reports in multiple formats"""

    # Calculate additional statistics
    sentiment_dist = Counter(c.sentiment for c in comments_data)
    period_stats = StatsCalculator.calculate_period_stats(comments_data)
    top_users = Counter(c.username for c in comments_data).most_common(5)
    top_mentioned = Counter(
        mention for c in comments_data
        for mention in TextAnalyzer.extract_mentions(c.text)
    ).most_common(5)

    # Generate CSV report
    csv_output = StringIO()
    writer = csv.writer(csv_output)

    # Write metadata
    writer.writerow(['Content Analysis Report'])
    writer.writerow(['Generated', datetime.now().isoformat()])
    writer.writerow(['Content Type', content_type])
    writer.writerow(['Post URL', link_to_post])
    writer.writerow(['Post Likes', post_likes])
    writer.writerow([])

    # Write statistics sections
    for section, data in {
        'Basic Statistics': basic_stats,
        'Sentiment Distribution': sentiment_dist,
        'Period Analysis': period_stats,
        'Top Users': dict(top_users),
        'Top Mentioned': dict(top_mentioned)
    }.items():
        writer.writerow([section])
        for key, value in data.items():
            writer.writerow([key, value])
        writer.writerow([])

    # Generate text report (explicit `+` around the joins: adjacent string
    # literals would otherwise fuse the header into the join separator)
    text_report = (
        f"ANALYSIS REPORT\n"
        f"Generated: {datetime.now():%Y-%m-%d %H:%M:%S}\n\n"
        f"BASIC STATISTICS:\n"
        f"- Total Comments: {basic_stats['total_comments']}\n"
        f"- Average Likes: {basic_stats['avg_likes']:.1f}\n"
        f"- Average Length: {basic_stats['avg_length']:.1f} characters\n"
        f"- Median Length: {basic_stats['median_length']}\n"
        f"- Average Words: {basic_stats['avg_words']:.1f}\n\n"
        f"SENTIMENT ANALYSIS:\n"
        f"- Positive: {sentiment_dist[Sentiment.POSITIVE]}\n"
        f"- Slightly Positive: {sentiment_dist[Sentiment.SLIGHTLY_POSITIVE]}\n"
        f"- Neutral: {sentiment_dist[Sentiment.NEUTRAL]}\n"
        f"- Slightly Negative: {sentiment_dist[Sentiment.SLIGHTLY_NEGATIVE]}\n"
        f"- Negative: {sentiment_dist[Sentiment.NEGATIVE]}\n\n"
        f"TOP CONTRIBUTORS:\n" +
        "\n".join(f"- {user}: {count} comments" for user, count in top_users) +
        f"\n\nMOST MENTIONED:\n" +
        "\n".join(f"- {user}: {count} mentions" for user, count in top_mentioned) +
        f"\n\nENGAGEMENT PERIODS:\n"
        f"Early Period:\n"
        f"- Comments: {period_stats['early']['comments']}\n"
        f"- Avg Likes: {period_stats['early']['avg_likes']:.1f}\n"
        f"- Positive Sentiment: {period_stats['early']['sentiment_ratio']*100:.1f}%\n\n"
        f"Middle Period:\n"
        f"- Comments: {period_stats['middle']['comments']}\n"
        f"- Avg Likes: {period_stats['middle']['avg_likes']:.1f}\n"
        f"- Positive Sentiment: {period_stats['middle']['sentiment_ratio']*100:.1f}%\n\n"
        f"Late Period:\n"
        f"- Comments: {period_stats['late']['comments']}\n"
        f"- Avg Likes: {period_stats['late']['avg_likes']:.1f}\n"
        f"- Positive Sentiment: {period_stats['late']['sentiment_ratio']*100:.1f}%\n"
    )

    return {
        'csv': csv_output.getvalue(),
        'analytics': text_report
    }

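# Note: generate_reports returns both formats; analyze_post surfaces only
# 'analytics' in the UI, while 'csv' stays available for file export.
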
# Gradio interface with improved input validation and error handling
import gradio as gr

def validate_input(content_type: str, link: str, likes: int, date: str,
                   description: str, comment_count: int, comments: str) -> Tuple[bool, str]:
    """Validate input parameters before processing"""
    if not link:
        return False, "Post link is required"
    if likes < 0:
        return False, "Likes count cannot be negative"
    if comment_count < 0:
        return False, "Comment count cannot be negative"
    if not comments.strip():
        return False, "Comments text is required"
    return True, ""

def wrapped_analyze_post(*args):
    """Wrapper for analyze_post with input validation"""
    is_valid, error_message = validate_input(*args)
    if not is_valid:
        return error_message, "", "", "", "0"

    try:
        return analyze_post(*args)
    except Exception as e:
        logger.error(f"Error in analyze_post wrapper: {e}", exc_info=True)
        return f"An error occurred: {str(e)}", "", "", "", "0"

# Create enhanced Gradio interface
iface = gr.Interface(
    fn=wrapped_analyze_post,
    inputs=[
        gr.Radio(
            choices=["Photo", "Video", "Reel", "Story"],
            label="Content Type",
            value="Photo"
        ),
        gr.Textbox(
            label="Link to Post",
            placeholder="https://instagram.com/p/..."
        ),
        gr.Number(
            label="Post Likes",
            value=0,
            minimum=0
        ),
        gr.Textbox(
            label="Post Date",
            placeholder="YYYY-MM-DD"
        ),
        gr.Textbox(
            label="Post Description",
            lines=3,
            placeholder="Enter post description..."
        ),
        gr.Number(
            label="Total Comment Count",
            value=0,
            minimum=0
        ),
        gr.Textbox(
            label="Comments",
            lines=10,
            placeholder="Paste comments here..."
        )
    ],
    outputs=[
        gr.Textbox(
            label="Analytics Summary",
            lines=20
        ),
        gr.Textbox(
            label="Extracted Usernames"
        ),
        gr.Textbox(
            label="Cleaned Comments"
        ),
        gr.Textbox(
            label="Comment Likes Timeline"
        ),
        gr.Textbox(
            label="Total Comment Likes"
        )
    ],
    title="Enhanced Instagram Comment Analyzer",
    description="""
    Analyze Instagram comments with advanced metrics including:
    - Sentiment analysis with granular classification
    - Temporal engagement patterns
    - User interaction statistics
    - Content quality metrics
    """,
    article="""
    ### Usage Instructions
    1. Select the content type (Photo, Video, Reel, or Story)
    2. Paste the post URL
    3. Enter the post metadata (likes, date, description)
    4. Paste the comments text
    5. Click submit to generate analysis

    ### Analysis Features
    - Multi-level sentiment analysis
    - Engagement period breakdown
    - Top contributors and mentions
    - Detailed statistical metrics

    ### Notes
    - All text fields support Unicode characters including emojis
    - Time references are converted to a standardized format
    - Analysis includes both quantitative and qualitative metrics
    """
)

if __name__ == "__main__":
    logger.info("Starting Instagram Comment Analyzer")

    try:
        # Launch the interface with enhanced settings
        iface.launch(
            server_name="0.0.0.0",  # Allow external access
            server_port=7860,       # Default Gradio port
            share=False,            # Disable public URL generation
            debug=False,            # Disable debug mode in production
            enable_queue=True,      # Enable request queuing (legacy Gradio option)
            max_threads=4           # Limit concurrent processing
        )
    except Exception as e:
        logger.error(f"Failed to start application: {e}", exc_info=True)
        raise