import csv
import logging
import re
import statistics
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from io import StringIO
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple

import emoji
import gradio as gr
import numpy as np


log_dir = Path("logs")
log_dir.mkdir(exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_dir / f'analyzer_{datetime.now():%Y%m%d}.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class Sentiment(str, Enum):
    POSITIVE = 'positive'
    SLIGHTLY_POSITIVE = 'slightly_positive'
    NEUTRAL = 'neutral'
    SLIGHTLY_NEGATIVE = 'slightly_negative'
    NEGATIVE = 'negative'


@dataclass
class CommentData:
    username: str
    text: str
    likes: int
    weeks_ago: float
    sentiment: Sentiment


class TextAnalyzer:
    """Text analysis utilities."""

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse all runs of whitespace into single spaces."""
        return ' '.join(text.split())

    @staticmethod
    def count_emojis(text: str) -> int:
        """Count emoji characters in the text (each occurrence counts)."""
        return sum(1 for c in text if c in emoji.EMOJI_DATA)

    @staticmethod
    def extract_mentions(text: str) -> Set[str]:
        """Extract @mentions, returning a set for uniqueness."""
        return set(re.findall(r'@[\w.]+', text))

    @staticmethod
    def get_words(text: str) -> List[str]:
        """Extract lowercase words of three or more characters."""
        return re.findall(r'\b\w{3,}\b', text.lower())
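
# Illustrative expectations for the helpers above (assumed examples, not from
# the original module):
#   TextAnalyzer.clean_text("a \n  b")                 -> "a b"
#   TextAnalyzer.extract_mentions("hi @a_b and @c.d")  -> {"@a_b", "@c.d"}
#   TextAnalyzer.get_words("Go, go, going!")           -> ["going"]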


class SentimentAnalyzer:
    """Sentiment analysis with graded five-level classification."""

    # Word indicators are matched against TextAnalyzer.get_words output
    # (three or more characters), so shorter entries such as 'фу' never
    # match at word level; emoji indicators are matched against the raw
    # text in analyze() below.
    INDICATORS = {
        'positive': {
            '🔥', '❤️', '👍', '😊', '💪', '👏', '🎉', '♥️', '😍', '🙏',
            'круто', 'супер', 'класс', 'огонь', 'пушка', 'отлично', 'здорово',
            'прекрасно', 'молодец', 'красота', 'спасибо', 'топ', 'лучший',
            'amazing', 'wonderful', 'great', 'perfect', 'love', 'beautiful'
        },
        'negative': {
            '👎', '😢', '😞', '😠', '😡', '💔', '😕', '😑',
            'плохо', 'ужас', 'отстой', 'фу', 'жесть', 'ужасно',
            'разочарован', 'печаль', 'грустно', 'bad', 'worst',
            'terrible', 'awful', 'sad', 'disappointed'
        }
    }

    @classmethod
    def analyze(cls, text: str) -> Sentiment:
        """Classify text sentiment with graded labels and emphasis handling."""
        text_lower = text.lower()
        words = set(TextAnalyzer.get_words(text_lower))

        pos_count = len(words & cls.INDICATORS['positive'])
        neg_count = len(words & cls.INDICATORS['negative'])

        # Word matching misses emoji (the \w regex drops them), so check the
        # non-alphanumeric indicators directly against the raw text.
        pos_count += sum(1 for ind in cls.INDICATORS['positive']
                         if not ind.isalnum() and ind in text)
        neg_count += sum(1 for ind in cls.INDICATORS['negative']
                         if not ind.isalnum() and ind in text)

        # Exclamation and question marks amplify the dominant sentiment,
        # at most doubling its count.
        emphasis = min(text.count('!') * 0.2 + text.count('?') * 0.1, 1.0)
        if pos_count > neg_count:
            pos_count *= (1 + emphasis)
        elif neg_count > pos_count:
            neg_count *= (1 + emphasis)

        total = pos_count + neg_count
        if total == 0:
            return Sentiment.NEUTRAL

        ratio = pos_count / total
        if ratio > 0.8:
            return Sentiment.POSITIVE
        elif ratio > 0.6:
            return Sentiment.SLIGHTLY_POSITIVE
        elif ratio < 0.2:
            return Sentiment.NEGATIVE
        elif ratio < 0.4:
            return Sentiment.SLIGHTLY_NEGATIVE
        return Sentiment.NEUTRAL
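
# How the thresholds above play out on assumed inputs (not from the original):
#   SentimentAnalyzer.analyze("супер, класс!")  -> pos=2, neg=0, ratio 1.0 -> POSITIVE
#   SentimentAnalyzer.analyze("bad but great")  -> pos=1, neg=1, ratio 0.5 -> NEUTRAL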


class CommentExtractor:
    """Comment data extraction from raw pasted text."""

    class ParseError(Exception):
        """Custom exception for parsing errors."""
        pass

    PATTERNS = {
        # Username comes from a profile-photo line ("Фото профиля" =
        # "Profile photo"), the start of the block, or an @mention.
        'username': re.compile(r"""
            (?:
                Фото\sпрофиля\s(?P<name1>[^\n]+)|
                ^(?P<name2>[^\s]+)\s+|
                @(?P<name3>[^\s]+)\s+
            )
        """, re.VERBOSE),

        # Relative timestamps in Russian or English, hours or weeks.
        'time': re.compile(r"""
            (?P<value>\d+)\s*
            (?P<unit>(?:ч|нед|h|w|час|hour|week))\.?
        """, re.VERBOSE),

        # Like counts in the several formats Instagram renders.
        'likes': re.compile(r"""
            (?:
                (?P<count1>\d+)\s*отметк[аи]\s"Нравится"|
                Нравится:\s*(?P<count2>\d+)|
                "Нравится":\s*(?P<count3>\d+)|
                likes?:\s*(?P<count4>\d+)
            )
        """, re.VERBOSE),

        # UI chrome and metadata to strip before sentiment analysis.
        'metadata': re.compile(r"""
            Фото\sпрофиля[^\n]+\n|
            \d+\s*(?:ч|нед|h|w|час|hour|week)\.?|
            (?:Нравится|likes?):\s*\d+|
            \d+\s*отметк[аи]\s"Нравится"|
            Ответить|
            Показать\sперевод|
            Скрыть\sвсе\sответы|
            Смотреть\sвсе\sответы\s\(\d+\)
        """, re.VERBOSE)
    }
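
    # For instance (assumed inputs): the 'time' pattern on "5 ч." yields
    # value=5, unit="ч" (hours), which extract_data below converts to
    # 5 / (24 * 7) ≈ 0.03 weeks; "3 нед." parses as 3 weeks unchanged.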

    @classmethod
    def extract_data(cls, comment_text: str) -> Optional[CommentData]:
        """Extract structured comment data, returning None on failure."""
        try:
            username_match = cls.PATTERNS['username'].search(comment_text)
            if not username_match:
                raise cls.ParseError("Could not extract username")

            # The alternation guarantees exactly one named group matched.
            username = next(
                name for name in username_match.groups()
                if name is not None
            ).strip()

            # Strip UI metadata, then normalize whitespace.
            comment = cls.PATTERNS['metadata'].sub('', comment_text)
            comment = TextAnalyzer.clean_text(comment)

            # Normalize the relative timestamp to weeks.
            time_match = cls.PATTERNS['time'].search(comment_text)
            if not time_match:
                weeks = 0.0
            else:
                value = int(time_match.group('value'))
                unit = time_match.group('unit')
                weeks = value if unit in {'нед', 'w', 'week'} else value / (24 * 7)

            # The likes pattern has four alternative capture groups; take
            # the first one that matched.
            likes_match = cls.PATTERNS['likes'].search(comment_text)
            likes = next(
                (int(count) for count in likes_match.groups() if count),
                0
            ) if likes_match else 0

            sentiment = SentimentAnalyzer.analyze(comment)

            return CommentData(
                username=username,
                text=comment,
                likes=likes,
                weeks_ago=weeks,
                sentiment=sentiment
            )

        except cls.ParseError as e:
            logger.warning(f"Failed to parse comment: {e}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error parsing comment: {e}", exc_info=True)
            return None
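
# Minimal usage sketch; the raw block is an assumed example of pasted text:
#   raw = 'Фото профиля anna_k\nОчень круто! 🔥\n2 нед. Нравится: 14\nОтветить'
#   CommentExtractor.extract_data(raw)
#   # -> CommentData(username='anna_k', text='Очень круто! 🔥', likes=14,
#   #                weeks_ago=2, sentiment=Sentiment.POSITIVE)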


class StatsCalculator:
    """Per-period statistics over the comment timeline."""

    @staticmethod
    def calculate_period_stats(comments: List[CommentData]) -> Dict:
        """Split comments into three quantile-based periods and summarize each."""
        if not comments:
            return {}

        sorted_comments = sorted(comments, key=lambda x: x.weeks_ago)

        # Tercile boundaries on the weeks_ago distribution.
        weeks = [c.weeks_ago for c in sorted_comments]
        boundaries = np.quantile(weeks, [0.33, 0.67])

        periods: Dict[str, List[CommentData]] = {
            'early': [],
            'middle': [],
            'late': []
        }
        for comment in sorted_comments:
            # weeks_ago counts back from now, so the oldest comments (largest
            # weeks_ago) belong to the early engagement period.
            if comment.weeks_ago >= boundaries[1]:
                periods['early'].append(comment)
            elif comment.weeks_ago >= boundaries[0]:
                periods['middle'].append(comment)
            else:
                periods['late'].append(comment)

        return {
            period: {
                'comments': len(period_comments),
                'avg_likes': statistics.mean(c.likes for c in period_comments)
                if period_comments else 0,
                'sentiment_ratio': sum(
                    1 for c in period_comments
                    if c.sentiment in {Sentiment.POSITIVE, Sentiment.SLIGHTLY_POSITIVE}
                ) / len(period_comments) if period_comments else 0
            }
            for period, period_comments in periods.items()
        }
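
# Illustrative split (assumed data): for weeks_ago values [1, 2, 3, 4, 5, 6],
# np.quantile gives boundaries ≈ (2.65, 4.35) under the default linear
# interpolation, so early (oldest) = {5, 6}, middle = {3, 4}, late (newest) = {1, 2}.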


def analyze_post(
    content_type: str,
    link_to_post: str,
    post_likes: int,
    post_date: str,
    description: str,
    comment_count: int,
    all_comments: str
) -> Tuple[str, str, str, str, str]:
    """Analyze a post's comments and return report strings for the UI."""
    try:
        # Split the pasted blob into per-comment blocks at profile-photo
        # lines, line-initial usernames, or @mentions; the lookahead keeps
        # each delimiter with its block.
        comment_pattern = re.compile(
            r'(?=Фото профиля|\n\s*[a-zA-Z0-9._]+\s+|\b@[a-zA-Z0-9._]+\s+)',
            re.MULTILINE
        )
        comments_blocks = [
            block.strip() for block in comment_pattern.split(all_comments)
            if block and block.strip() and 'Скрыто алгоритмами Instagram' not in block
        ]

        comments_data = []
        for block in comments_blocks:
            if data := CommentExtractor.extract_data(block):
                comments_data.append(data)

        if not comments_data:
            logger.warning("No valid comments found in the input")
            return "No valid comments found", "", "", "", "0"

        basic_stats = {
            'total_comments': len(comments_data),
            'avg_length': statistics.mean(len(c.text) for c in comments_data),
            'median_length': statistics.median(len(c.text) for c in comments_data),
            'avg_words': statistics.mean(len(TextAnalyzer.get_words(c.text)) for c in comments_data),
            'total_likes': sum(c.likes for c in comments_data),
            'avg_likes': statistics.mean(c.likes for c in comments_data)
        }

        reports = generate_reports(
            content_type=content_type,
            link_to_post=link_to_post,
            post_likes=post_likes,
            comments_data=comments_data,
            basic_stats=basic_stats
        )

        return (
            reports['analytics'],
            "\n".join(c.username for c in comments_data),
            "\n".join(c.text for c in comments_data),
            "\n".join(str(c.likes) for c in comments_data),
            str(basic_stats['total_likes'])
        )

    except Exception as e:
        logger.error(f"Error analyzing post: {e}", exc_info=True)
        return f"Error analyzing post: {str(e)}", "", "", "", "0"


def generate_reports(
    content_type: str,
    link_to_post: str,
    post_likes: int,
    comments_data: List[CommentData],
    basic_stats: Dict
) -> Dict[str, str]:
    """Generate CSV and plain-text reports from the parsed comments."""
    sentiment_dist = Counter(c.sentiment for c in comments_data)
    period_stats = StatsCalculator.calculate_period_stats(comments_data)
    top_users = Counter(c.username for c in comments_data).most_common(5)
    top_mentioned = Counter(
        mention for c in comments_data
        for mention in TextAnalyzer.extract_mentions(c.text)
    ).most_common(5)

    # CSV report: header block, then one section per metric group.
    csv_output = StringIO()
    writer = csv.writer(csv_output)

    writer.writerow(['Content Analysis Report'])
    writer.writerow(['Generated', datetime.now().isoformat()])
    writer.writerow(['Content Type', content_type])
    writer.writerow(['Post URL', link_to_post])
    writer.writerow(['Post Likes', post_likes])
    writer.writerow([])

    for section, data in {
        'Basic Statistics': basic_stats,
        'Sentiment Distribution': sentiment_dist,
        'Period Analysis': period_stats,
        'Top Users': dict(top_users),
        'Top Mentioned': dict(top_mentioned)
    }.items():
        writer.writerow([section])
        for key, value in data.items():
            writer.writerow([key, value])
        writer.writerow([])

    # Plain-text report shown in the UI. Note the explicit '+' around the
    # joins: implicit literal concatenation here would make the header the
    # join separator.
    text_report = (
        f"ANALYSIS REPORT\n"
        f"Generated: {datetime.now():%Y-%m-%d %H:%M:%S}\n\n"
        f"BASIC STATISTICS:\n"
        f"- Total Comments: {basic_stats['total_comments']}\n"
        f"- Average Likes: {basic_stats['avg_likes']:.1f}\n"
        f"- Average Length: {basic_stats['avg_length']:.1f} characters\n"
        f"- Median Length: {basic_stats['median_length']}\n"
        f"- Average Words: {basic_stats['avg_words']:.1f}\n\n"
        f"SENTIMENT ANALYSIS:\n"
        f"- Positive: {sentiment_dist[Sentiment.POSITIVE]}\n"
        f"- Slightly Positive: {sentiment_dist[Sentiment.SLIGHTLY_POSITIVE]}\n"
        f"- Neutral: {sentiment_dist[Sentiment.NEUTRAL]}\n"
        f"- Slightly Negative: {sentiment_dist[Sentiment.SLIGHTLY_NEGATIVE]}\n"
        f"- Negative: {sentiment_dist[Sentiment.NEGATIVE]}\n\n"
        f"TOP CONTRIBUTORS:\n" +
        "\n".join(f"- {user}: {count} comments" for user, count in top_users) +
        "\n\nMOST MENTIONED:\n" +
        "\n".join(f"- {user}: {count} mentions" for user, count in top_mentioned) +
        f"\n\nENGAGEMENT PERIODS:\n"
        f"Early Period:\n"
        f"- Comments: {period_stats['early']['comments']}\n"
        f"- Avg Likes: {period_stats['early']['avg_likes']:.1f}\n"
        f"- Positive Sentiment: {period_stats['early']['sentiment_ratio'] * 100:.1f}%\n\n"
        f"Middle Period:\n"
        f"- Comments: {period_stats['middle']['comments']}\n"
        f"- Avg Likes: {period_stats['middle']['avg_likes']:.1f}\n"
        f"- Positive Sentiment: {period_stats['middle']['sentiment_ratio'] * 100:.1f}%\n\n"
        f"Late Period:\n"
        f"- Comments: {period_stats['late']['comments']}\n"
        f"- Avg Likes: {period_stats['late']['avg_likes']:.1f}\n"
        f"- Positive Sentiment: {period_stats['late']['sentiment_ratio'] * 100:.1f}%\n"
    )

    return {
        'csv': csv_output.getvalue(),
        'analytics': text_report
    }
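
# Only the 'analytics' text is surfaced in the Gradio UI below; the 'csv'
# payload is generated alongside it and could be wired to a file download.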


def validate_input(content_type: str, link: str, likes: int, date: str,
                   description: str, comment_count: int, comments: str) -> Tuple[bool, str]:
    """Validate input parameters before processing."""
    if not link:
        return False, "Post link is required"
    if likes < 0:
        return False, "Likes count cannot be negative"
    if comment_count < 0:
        return False, "Comment count cannot be negative"
    if not comments.strip():
        return False, "Comments text is required"
    return True, ""
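
# e.g. validate_input("Photo", "", 0, "", "", 0, "hi")
#   -> (False, "Post link is required")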


def wrapped_analyze_post(*args):
    """Wrapper for analyze_post with input validation."""
    is_valid, error_message = validate_input(*args)
    if not is_valid:
        return error_message, "", "", "", "0"

    try:
        return analyze_post(*args)
    except Exception as e:
        logger.error(f"Error in analyze_post wrapper: {e}", exc_info=True)
        return f"An error occurred: {str(e)}", "", "", "", "0"


iface = gr.Interface(
    fn=wrapped_analyze_post,
    inputs=[
        gr.Radio(
            choices=["Photo", "Video", "Reel", "Story"],
            label="Content Type",
            value="Photo"
        ),
        gr.Textbox(
            label="Link to Post",
            placeholder="https://instagram.com/p/..."
        ),
        gr.Number(
            label="Post Likes",
            value=0,
            minimum=0
        ),
        gr.Textbox(
            label="Post Date",
            placeholder="YYYY-MM-DD"
        ),
        gr.Textbox(
            label="Post Description",
            lines=3,
            placeholder="Enter post description..."
        ),
        gr.Number(
            label="Total Comment Count",
            value=0,
            minimum=0
        ),
        gr.Textbox(
            label="Comments",
            lines=10,
            placeholder="Paste comments here..."
        )
    ],
    outputs=[
        gr.Textbox(
            label="Analytics Summary",
            lines=20
        ),
        gr.Textbox(label="Extracted Usernames"),
        gr.Textbox(label="Cleaned Comments"),
        gr.Textbox(label="Comment Likes Timeline"),
        gr.Textbox(label="Total Comment Likes")
    ],
    title="Instagram Comment Analyzer",
    description="""
    Analyze Instagram comments with metrics including:
    - Sentiment analysis with graded classification
    - Temporal engagement patterns
    - User interaction statistics
    - Content quality metrics
    """,
    article="""
    ### Usage Instructions
    1. Select the content type (Photo, Video, Reel, or Story)
    2. Paste the post URL
    3. Enter the post metadata (likes, date, description)
    4. Paste the comments text
    5. Click submit to generate analysis

    ### Analysis Features
    - Multi-level sentiment analysis
    - Engagement period breakdown
    - Top contributors and mentions
    - Detailed statistical metrics

    ### Notes
    - All text fields support Unicode characters including emojis
    - Relative time references (hours, weeks) are converted to weeks
    - Analysis includes both quantitative and qualitative metrics
    """
)


if __name__ == "__main__":
    logger.info("Starting Instagram Comment Analyzer")
    try:
        # enable_queue was removed from launch() in newer Gradio releases;
        # request queueing is configured via queue() instead (assumes
        # Gradio 4.x; on 3.x, enable_queue=True in launch() also works).
        iface.queue()
        iface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=False,
            max_threads=4
        )
    except Exception as e:
        logger.error(f"Failed to start application: {e}", exc_info=True)
        raise