Update app.py
app.py
@@ -1,532 +1,113 @@
 import re
-import emoji
-import statistics
-from collections import Counter
-from typing import Dict, List, Tuple, Optional, Set, Union
 import logging
-from
-from datetime import datetime
-import csv
-from dataclasses import dataclass, asdict
-from enum import Enum
-import numpy as np
 
-
-log_dir = Path("logs")
-log_dir.mkdir(exist_ok=True)
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler(log_dir / f'analyzer_{datetime.now():%Y%m%d}.log'),
-        logging.StreamHandler()
-    ]
-)
 logger = logging.getLogger(__name__)
 
-[... 7 lines not recovered: the code below references a Sentiment enum with POSITIVE, SLIGHTLY_POSITIVE, NEUTRAL, SLIGHTLY_NEGATIVE and NEGATIVE members ...]
-@dataclass
-class CommentData:
-    username: str
-    text: str
-    likes: int
-    weeks_ago: float
-    sentiment: Sentiment
-
-class TextAnalyzer:
-    """Enhanced text analysis utilities"""
-
-    @staticmethod
-    def clean_text(text: str) -> str:
-        """Clean text using more efficient string splitting"""
-        return ' '.join(text.split())
-
-    @staticmethod
-    def count_emojis(text: str) -> int:
-        """Count emojis using set operations for better performance"""
-        return len({c for c in text if c in emoji.EMOJI_DATA})
-
-    @staticmethod
-    def extract_mentions(text: str) -> Set[str]:
-        """Extract mentions returning a set for uniqueness"""
-        return set(re.findall(r'@[\w.]+', text))
-
-    @staticmethod
-    def get_words(text: str) -> List[str]:
-        """Extract meaningful words using improved regex"""
-        return [w for w in re.findall(r'\b\w{3,}\b', text.lower())]
-
-class SentimentAnalyzer:
-    """Enhanced sentiment analysis with gradual classification"""
-
-    # Using sets for O(1) lookup
-    INDICATORS = {
-        'positive': {
-            '🔥', '❤️', '👍', '😊', '💪', '👏', '🎉', '♥️', '😍', '🙏',
-            'круто', 'супер', 'класс', 'огонь', 'пушка', 'отлично', 'здорово',
-            'прекрасно', 'молодец', 'красота', 'спасибо', 'топ', 'лучший',
-            'amazing', 'wonderful', 'great', 'perfect', 'love', 'beautiful'
-        },
-        'negative': {
-            '👎', '😢', '😞', '😠', '😡', '💔', '😕', '😑',
-            'плохо', 'ужас', 'отстой', 'фу', 'жесть', 'ужасно',
-            'разочарован', 'печаль', 'грустно', 'bad', 'worst',
-            'terrible', 'awful', 'sad', 'disappointed'
-        }
-    }
-
-    @classmethod
-    def analyze(cls, text: str) -> Sentiment:
-        """
-        Analyze text sentiment with enhanced granularity and emphasis handling
-        """
-        text_lower = text.lower()
-        words = set(cls.TextAnalyzer.get_words(text_lower))
-
-        pos_count = len(words & cls.INDICATORS['positive'])
-        neg_count = len(words & cls.INDICATORS['negative'])
-
-        # Calculate emphasis multiplier based on punctuation
-        emphasis = min(text.count('!') * 0.2 + text.count('?') * 0.1, 1.0)
-
-        # Apply emphasis to the dominant sentiment
-        if pos_count > neg_count:
-            pos_count *= (1 + emphasis)
-        elif neg_count > pos_count:
-            neg_count *= (1 + emphasis)
-
-        # Determine sentiment with granularity
-        total = pos_count + neg_count
-        if total == 0:
-            return Sentiment.NEUTRAL
-
-        ratio = pos_count / total
-        if ratio > 0.8:
-            return Sentiment.POSITIVE
-        elif ratio > 0.6:
-            return Sentiment.SLIGHTLY_POSITIVE
-        elif ratio < 0.2:
-            return Sentiment.NEGATIVE
-        elif ratio < 0.4:
-            return Sentiment.SLIGHTLY_NEGATIVE
-        return Sentiment.NEUTRAL
 
-[... 27 lines not recovered: from later references, a CommentExtractor class with a ParseError exception and a PATTERNS dict holding 'username', 'time' (named groups value/unit) and 'likes' regexes, of which only the tail survives ...]
-            likes?:\s*(?P<count4>\d+)
-        )
-    """, re.VERBOSE),
-
-    'metadata': re.compile(r"""
-        Фото\sпрофиля[^\n]+\n|
-        \d+\s*(?:ч|нед|h|w|час|hour|week)\.?|
-        (?:Нравится|likes?):\s*\d+|
-        \d+\s*отметк[аи]\s\"Нравится\"|
-        Ответить|
-        Показать\sперевод|
-        Скрыть\sвсе\sответы|
-        Смотреть\sвсе\sответы\s\(\d+\)
-    """, re.VERBOSE)
-    }
-
-    @classmethod
-    def extract_data(cls, comment_text: str) -> Optional[CommentData]:
-        """Extract comment data with improved error handling"""
-        try:
-            # Extract username
-            username_match = cls.PATTERNS['username'].search(comment_text)
-            if not username_match:
-                raise cls.ParseError("Could not extract username")
-
-            username = next(
-                name for name in username_match.groups()
-                if name is not None
-            ).strip()
-
-            # Clean comment text
-            comment = cls.PATTERNS['metadata'].sub('', comment_text)
-            comment = TextAnalyzer.clean_text(comment)
-
-            # Extract time
-            time_match = cls.PATTERNS['time'].search(comment_text)
-            if not time_match:
-                weeks = 0
-            else:
-                value = int(time_match.group('value'))
-                unit = time_match.group('unit')
-                weeks = value if unit in {'нед', 'w', 'week'} else value / (24 * 7)
-
-            # Extract likes
-            likes_match = cls.PATTERNS['likes'].search(comment_text)
-            likes = next(
-                (int(count) for count in likes_match.groups() if count),
-                0
-            ) if likes_match else 0
-
-            # Analyze sentiment
-            sentiment = SentimentAnalyzer.analyze(comment)
-
-            return CommentData(
-                username=username,
-                text=comment,
-                likes=likes,
-                weeks_ago=weeks,
-                sentiment=sentiment
-            )
-
-        except cls.ParseError as e:
-            logger.warning(f"Failed to parse comment: {e}")
-            return None
-        except Exception as e:
-            logger.error(f"Unexpected error parsing comment: {e}", exc_info=True)
-            return None
 
-[... 3 lines not recovered: the header of the StatsCalculator class referenced below ...]
-    @staticmethod
-    def calculate_period_stats(comments: List[CommentData]) -> Dict:
-        """Calculate statistics using quantile-based periods"""
-        if not comments:
-            return {}
-
-        # Sort by weeks
-        sorted_comments = sorted(comments, key=lambda x: x.weeks_ago)
-
-        # Calculate period boundaries using quantiles
-        weeks = [c.weeks_ago for c in sorted_comments]
-        boundaries = np.quantile(weeks, [0.33, 0.67])
-
-        # Group comments by period
-        periods = {
-            'early': [],
-            'middle': [],
-            'late': []
-        }
-
-        for comment in sorted_comments:
-            if comment.weeks_ago <= boundaries[0]:
-                periods['early'].append(comment)
-            elif comment.weeks_ago <= boundaries[1]:
-                periods['middle'].append(comment)
-            else:
-                periods['late'].append(comment)
-
-        # Calculate statistics for each period
-        return {
-            period: {
-                'comments': len(comments),
-                'avg_likes': statistics.mean(c.likes for c in comments) if comments else 0,
-                'sentiment_ratio': sum(
-                    1 for c in comments
-                    if c.sentiment in {Sentiment.POSITIVE, Sentiment.SLIGHTLY_POSITIVE}
-                ) / len(comments) if comments else 0
-            }
-            for period, comments in periods.items()
-        }
 
-def analyze_post(
-[... 3 lines not recovered: the body below references content_type, link_to_post and post_likes parameters ...]
-    post_date: str,
-    description: str,
-    comment_count: int,
-    all_comments: str
-) -> Tuple[str, str, str, str, str]:
-    """Enhanced post analysis with improved error handling and reporting"""
     try:
-        #
-[... 2 lines not recovered: a comment_pattern = re.compile(...) definition, of which only the tail survives ...]
-            re.MULTILINE
-        )
-        comments_blocks = [
-            block.strip() for block in comment_pattern.split(all_comments)
-            if block and block.strip() and 'Скрыто алгоритмами Instagram' not in block
-        ]
 
-        # Extract and validate comment data
         comments_data = []
-        for block in comments_blocks:
-            if data := CommentExtractor.extract_data(block):
-                comments_data.append(data)
-
-        if not comments_data:
-            logger.warning("No valid comments found in the input")
-            return "No valid comments found", "", "", "", "0"
-
-        # Calculate statistics
-        basic_stats = {
-            'total_comments': len(comments_data),
-            'avg_length': statistics.mean(len(c.text) for c in comments_data),
-            'median_length': statistics.median(len(c.text) for c in comments_data),
-            'avg_words': statistics.mean(len(TextAnalyzer.get_words(c.text)) for c in comments_data),
-            'total_likes': sum(c.likes for c in comments_data),
-            'avg_likes': statistics.mean(c.likes for c in comments_data)
-        }
-
-        # Generate reports
-        reports = generate_reports(
-            content_type=content_type,
-            link_to_post=link_to_post,
-            post_likes=post_likes,
-            comments_data=comments_data,
-            basic_stats=basic_stats
-        )
-
-        return (
-            reports['analytics'],
-            "\n".join(c.username for c in comments_data),
-            "\n".join(c.text for c in comments_data),
-            "\n".join(str(c.likes) for c in comments_data),
-            str(basic_stats['total_likes'])
-        )
 
-[... 3 lines not recovered: the except handler that closes this version of analyze_post ...]
 
-[... 5 lines not recovered: the start of the generate_reports(...) signature; the body references content_type, link_to_post, post_likes and comments_data, and the signature ends with ...]
-    basic_stats: Dict
-) -> Dict[str, str]:
-    """Generate comprehensive reports in multiple formats"""
-
-    # Calculate additional statistics
-    sentiment_dist = Counter(c.sentiment for c in comments_data)
-    period_stats = StatsCalculator.calculate_period_stats(comments_data)
-    top_users = Counter(c.username for c in comments_data).most_common(5)
-    top_mentioned = Counter(
-        mention for c in comments_data
-        for mention in TextAnalyzer.extract_mentions(c.text)
-    ).most_common(5)
-
-    # Generate CSV report
-    csv_output = StringIO()
-    writer = csv.writer(csv_output)
-
-    # Write metadata
-    writer.writerow(['Content Analysis Report'])
-    writer.writerow(['Generated', datetime.now().isoformat()])
-    writer.writerow(['Content Type', content_type])
-    writer.writerow(['Post URL', link_to_post])
-    writer.writerow(['Post Likes', post_likes])
-    writer.writerow([])
-
-    # Write statistics sections
-    for section, data in {
-        'Basic Statistics': basic_stats,
-        'Sentiment Distribution': sentiment_dist,
-        'Period Analysis': period_stats,
-        'Top Users': dict(top_users),
-        'Top Mentioned': dict(top_mentioned)
-    }.items():
-        writer.writerow([section])
-        for key, value in data.items():
-            writer.writerow([key, value])
-        writer.writerow([])
-
-    # Generate text report
-    text_report = (
-        f"ANALYSIS REPORT\n"
-        f"Generated: {datetime.now():%Y-%m-%d %H:%M:%S}\n\n"
-        f"BASIC STATISTICS:\n"
-        f"- Total Comments: {basic_stats['total_comments']}\n"
-        f"- Average Likes: {basic_stats['avg_likes']:.1f}\n"
-        f"- Average Length: {basic_stats['avg_length']:.1f} characters\n"
-        f"- Median Length: {basic_stats['median_length']}\n"
-        f"- Average Words: {basic_stats['avg_words']:.1f}\n\n"
-        f"SENTIMENT ANALYSIS:\n"
-        f"- Positive: {sentiment_dist[Sentiment.POSITIVE]}\n"
-        f"- Slightly Positive: {sentiment_dist[Sentiment.SLIGHTLY_POSITIVE]}\n"
-        f"- Neutral: {sentiment_dist[Sentiment.NEUTRAL]}\n"
-        f"- Slightly Negative: {sentiment_dist[Sentiment.SLIGHTLY_NEGATIVE]}\n"
-        f"- Negative: {sentiment_dist[Sentiment.NEGATIVE]}\n\n"
-        f"TOP CONTRIBUTORS:\n" +
-        "\n".join(f"- {user}: {count} comments" for user, count in top_users) +
-        f"\n\nMOST MENTIONED:\n""\n".join(f"- {user}: {count} mentions" for user, count in top_mentioned) +
-        f"\n\nENGAGEMENT PERIODS:\n"
-        f"Early Period:\n"
-        f"- Comments: {period_stats['early']['comments']}\n"
-        f"- Avg Likes: {period_stats['early']['avg_likes']:.1f}\n"
-        f"- Positive Sentiment: {period_stats['early']['sentiment_ratio']*100:.1f}%\n\n"
-        f"Middle Period:\n"
-        f"- Comments: {period_stats['middle']['comments']}\n"
-        f"- Avg Likes: {period_stats['middle']['avg_likes']:.1f}\n"
-        f"- Positive Sentiment: {period_stats['middle']['sentiment_ratio']*100:.1f}%\n\n"
-        f"Late Period:\n"
-        f"- Comments: {period_stats['late']['comments']}\n"
-        f"- Avg Likes: {period_stats['late']['avg_likes']:.1f}\n"
-        f"- Positive Sentiment: {period_stats['late']['sentiment_ratio']*100:.1f}%\n"
-    )
-
-    return {
-        'csv': csv_output.getvalue(),
-        'analytics': text_report
-    }
 
-
-
 
-[... 1 line not recovered: the start of the validate_input(...) signature, which continues ...]
-        description: str, comment_count: int, comments: str) -> Tuple[bool, str]:
-    """Validate input parameters before processing"""
-    if not link:
-        return False, "Post link is required"
-    if likes < 0:
-        return False, "Likes count cannot be negative"
-    if comment_count < 0:
-        return False, "Comment count cannot be negative"
-    if not comments.strip():
-        return False, "Comments text is required"
-    return True, ""
 
-def wrapped_analyze_post(*args):
-    """Wrapper for analyze_post with input validation"""
-    is_valid, error_message = validate_input(*args)
-    if not is_valid:
-        return error_message, "", "", "", "0"
-
-    try:
-        return analyze_post(*args)
     except Exception as e:
-        logger.error(f"
-        return
 
-#
 iface = gr.Interface(
-    fn=
     inputs=[
-        gr.Radio(
-[... 3 lines not recovered: the Radio choices, label and default value ...]
-        ),
-        gr.
-[... 1 line not recovered: likely the label of this Textbox ...]
-            placeholder="https://instagram.com/p/..."
-        ),
-        gr.Number(
-            label="Post Likes",
-            value=0,
-            minimum=0
-        ),
-        gr.Textbox(
-            label="Post Date",
-            placeholder="YYYY-MM-DD"
-        ),
-        gr.Textbox(
-            label="Post Description",
-            lines=3,
-            placeholder="Enter post description..."
-        ),
-        gr.Number(
-            label="Total Comment Count",
-            value=0,
-            minimum=0
-        ),
-        gr.Textbox(
-            label="Comments",
-            lines=10,
-            placeholder="Paste comments here..."
-        )
     ],
     outputs=[
-        gr.Textbox(
-[... 2 lines not recovered: the arguments of the analytics output Textbox ...]
-        ),
-        gr.Textbox(
-            label="Extracted Usernames"
-        ),
-        gr.Textbox(
-            label="Cleaned Comments"
-        ),
-        gr.Textbox(
-            label="Comment Likes Timeline"
-        ),
-        gr.Textbox(
-            label="Total Comment Likes"
-        )
     ],
-    title="
-    description="""
-    Analyze Instagram comments with advanced metrics including:
-    - Sentiment analysis with granular classification
-    - Temporal engagement patterns
-    - User interaction statistics
-    - Content quality metrics
-    """,
-    article="""
-    ### Usage Instructions
-    1. Select the content type (Photo, Video, Reel, or Story)
-    2. Paste the post URL
-    3. Enter the post metadata (likes, date, description)
-    4. Paste the comments text
-    5. Click submit to generate analysis
-
-    ### Analysis Features
-    - Multi-level sentiment analysis
-    - Engagement period breakdown
-    - Top contributors and mentions
-    - Detailed statistical metrics
-
-    ### Notes
-    - All text fields support Unicode characters including emojis
-    - Time references are converted to a standardized format
-    - Analysis includes both quantitative and qualitative metrics
-    """
 )
 
 if __name__ == "__main__":
-
-    logger.info("Starting Instagram Comment Analyzer")
-
-    try:
-        # Launch the interface with enhanced settings
-        iface.launch(
-            server_name="0.0.0.0",  # Allow external access
-            server_port=7860,       # Default Gradio port
-            share=False,            # Disable public URL generation
-            debug=False,            # Disable debug mode in production
-            enable_queue=True,      # Enable request queuing
-            max_threads=4           # Limit concurrent processing
-        )
-    except Exception as e:
-        logger.error(f"Failed to start application: {e}", exc_info=True)
-        raise
+import gradio as gr
 import re
 import logging
+from typing import Tuple, Optional
 
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+def extract_comment_data(comment_text: str) -> Tuple[Optional[str], Optional[str], int, int]:
+    """Extracts the data from a comment block"""
+    # The string literals below match the Russian Instagram UI:
+    # "Фото профиля" = profile photo, "Нравится" = like, "Ответить" = reply,
+    # "ч."/"нед." = hours/weeks ago.
+    try:
+        # Skip the post info block
+        if 'отметок "Нравится"' in comment_text:
+            return None, None, 0, 0
+
+        # Extract the username
+        username_match = re.search(r"Фото профиля ([^\n]+)", comment_text)
+        if not username_match:
+            return None, None, 0, 0
+
+        username = username_match.group(1).strip()
+
+        # Extract the comment text
+        lines = comment_text.split('\n')
+        comment = ""
+        for i, line in enumerate(lines):
+            if username in line and i + 1 < len(lines):
+                comment = lines[i + 1].strip()
+                # Clean the comment
+                comment = re.sub(r'\d+\s*(?:ч\.|нед\.)\s*$', '', comment)
+                comment = re.sub(r'"Нравится":\s*\d+\s*Ответить\s*$', '', comment)
+                break
+
+        # Extract the likes
+        likes_match = re.search(r'"Нравится":\s*(\d+)', comment_text)
+        likes = int(likes_match.group(1)) if likes_match else 0
+
+        # Extract the time
+        time_match = re.search(r'(\d+)\s*(?:ч\.|нед\.)', comment_text)
+        time = int(time_match.group(1)) if time_match else 0
+
+        return username, comment.strip(), likes, time
 
+    except Exception as e:
+        logger.error(f"Error extracting data: {e}")
+        return None, None, 0, 0
 
+def analyze_post(content_type: str, link: str, post_likes: int,
+                 post_date: str, description: str, comment_count: int,
+                 all_comments: str) -> Tuple[str, str, str, str, str]:
+    """Analyzes the post and its comments"""
     try:
+        # Split into comment blocks
+        blocks = re.split(r'(?=Фото профиля)', all_comments)
+        blocks = [b.strip() for b in blocks if b.strip()]
 
         comments_data = []
 
+        # Process each block
+        for block in blocks:
+            username, comment, likes, time = extract_comment_data(block)
+            if username and comment:
+                comments_data.append({
+                    'username': username,
+                    'comment': comment,
+                    'likes': likes,
+                    'time': time
+                })
 
+        # Build the output data
+        usernames = "\n".join(item['username'] for item in comments_data)
+        comments = "\n".join(item['comment'] for item in comments_data)
+        likes = "\n".join(str(item['likes']) for item in comments_data)
+        total_likes = sum(item['likes'] for item in comments_data)
 
+        analytics = f"""
+📊 Анализ комментариев:
+Всего комментариев: {len(comments_data)}
+Уникальных пользователей: {len(set(item['username'] for item in comments_data))}
+Общее количество лайков: {total_likes}
+"""
 
+        return analytics, usernames, comments, likes, str(total_likes)
 
     except Exception as e:
+        logger.error(f"Analysis error: {e}")
+        return str(e), "", "", "", "0"
 
+# Gradio interface
 iface = gr.Interface(
+    fn=analyze_post,
     inputs=[
+        gr.Radio(choices=["Photo", "Video"], label="Content Type", value="Photo"),
+        gr.Textbox(label="Link to Post"),
+        gr.Number(label="Likes", value=0),
+        gr.Textbox(label="Post Date"),
+        gr.Textbox(label="Description", lines=3),
+        gr.Number(label="Comment Count", value=0),
+        gr.Textbox(label="Comments", lines=10)
     ],
     outputs=[
+        gr.Textbox(label="Analytics Summary", lines=10),
+        gr.Textbox(label="Usernames"),
+        gr.Textbox(label="Comments"),
+        gr.Textbox(label="Likes Chronology"),
+        gr.Textbox(label="Total Likes on Comments")
     ],
+    title="Instagram Comment Analyzer",
+    description="Анализатор комментариев Instagram"
 )
 
 if __name__ == "__main__":
+    iface.launch()
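
For a quick sanity check of the new parser, here is a minimal usage sketch. The sample block is hypothetical; it only assumes the layout the regexes above target (a "Фото профиля <user>" header line, the comment text on the next line, and trailing age/like/reply metadata), and the real export layout may differ:

from app import extract_comment_data

# Hypothetical sample block in the layout the regexes expect
sample = (
    'Фото профиля anna_k\n'
    'Очень красиво! 🔥\n'
    '2 нед. "Нравится": 14 Ответить'
)

username, comment, likes, age = extract_comment_data(sample)
print(username)  # anna_k
print(comment)   # Очень красиво! 🔥
print(likes)     # 14
print(age)       # 2 (the number before "нед.")

Note that the function takes the comment text from the line immediately after the first line mentioning the user, so this sketch places the comment right below the "Фото профиля" header.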