File size: 4,362 Bytes
e87abff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# src/utils/conversation_summarizer.py
from typing import List, Dict
from transformers import pipeline
import numpy as np
from datetime import datetime

class ConversationSummarizer:
    """Summarize a chat conversation and derive simple statistics from it.

    A conversation is a list of message dicts. The keys read by this class
    are ``'role'``, ``'content'``, ``'timestamp'`` and ``'sources'``; all of
    them are optional and missing values are tolerated.
    """

    def __init__(
        self,
        model_name: str = "facebook/bart-large-cnn",
        max_length: int = 130,
        min_length: int = 30
    ):
        """Load the summarization pipeline.

        Args:
            model_name: Model identifier handed to ``pipeline``.
            max_length: ``max_length`` used for the conversation summary.
            min_length: ``min_length`` used for the conversation summary.
        """
        self.summarizer = pipeline(
            "summarization",
            model=model_name,
            device=-1  # CPU
        )
        self.max_length = max_length
        self.min_length = min_length

    async def summarize_conversation(
        self,
        messages: List[Dict],
        include_metadata: bool = True
    ) -> Dict:
        """Summarize a conversation and provide key insights.

        NOTE: the model call below is synchronous, so awaiting this
        coroutine blocks the event loop for the duration of inference.

        Args:
            messages: Conversation messages, oldest first.
            include_metadata: When false, ``'metadata'`` is an empty dict.

        Returns:
            A dict with the keys ``'summary'`` (str), ``'key_insights'``
            (see ``_extract_insights``) and ``'metadata'`` (see
            ``_generate_metadata``). An empty conversation yields an empty
            summary without invoking the model.
        """
        if not messages:
            # Nothing to summarize: skip the model instead of feeding it "".
            return {
                'summary': '',
                'key_insights': self._extract_insights(messages),
                'metadata': {}
            }

        summary = self._run_summarizer(
            self._format_conversation(messages),
            max_length=self.max_length,
            min_length=self.min_length
        )
        insights = self._extract_insights(messages)
        metadata = self._generate_metadata(messages) if include_metadata else {}

        return {
            'summary': summary,
            'key_insights': insights,
            'metadata': metadata
        }

    def _run_summarizer(self, text: str, max_length: int, min_length: int) -> str:
        """Run the pipeline on ``text`` and return the summary string.

        ``truncation=True`` asks the pipeline to cut over-long input down to
        the model's limit rather than fail on long conversations.
        """
        outputs = self.summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True
        )
        return outputs[0]['summary_text']

    @staticmethod
    def _message_text(message: Dict) -> str:
        """Return the message content as a string ('' when missing or None)."""
        content = message.get('content')
        if content is None:
            return ''
        return content if isinstance(content, str) else str(content)

    def _format_conversation(self, messages: List[Dict]) -> str:
        """Render the conversation as one ``role: content`` line per message.

        A missing role is rendered as ``unknown``; missing or ``None``
        content is rendered as empty text.
        """
        return "\n".join(
            f"{msg.get('role', 'unknown')}: {self._message_text(msg)}"
            for msg in messages
        )

    def _extract_insights(self, messages: List[Dict]) -> Dict:
        """Compute message counts, average length and main topics.

        Returns:
            A dict with ``'message_distribution'`` (counts for the ``user``
            and ``assistant`` roles), ``'average_message_length'`` (mean
            content length in characters, truncated to an int; 0 for an
            empty conversation), ``'main_topics'`` and ``'total_messages'``.
        """
        roles = [msg.get('role') for msg in messages]
        message_counts = {
            'user': roles.count('user'),
            'assistant': roles.count('assistant')
        }

        # Integer division equals int(mean) for non-negative lengths and
        # avoids the NaN that a mean over an empty list would produce.
        lengths = [len(self._message_text(msg)) for msg in messages]
        avg_length = sum(lengths) // len(lengths) if lengths else 0

        return {
            'message_distribution': message_counts,
            'average_message_length': avg_length,
            'main_topics': self._extract_topics(messages),
            'total_messages': len(messages)
        }

    def _extract_topics(self, messages: List[Dict]) -> List[str]:
        """Extract main topics by summarizing the concatenated contents.

        The short summary is split on ``'. '``; each fragment is stripped and
        empty fragments are dropped. Returns ``[]`` when there is no text, in
        which case the model is not invoked.
        """
        full_text = " ".join(self._message_text(msg) for msg in messages)
        if not full_text.strip():
            return []

        short_summary = self._run_summarizer(
            full_text,
            max_length=50,
            min_length=10
        )
        fragments = (part.strip() for part in short_summary.split('. '))
        return [part for part in fragments if part]

    def _generate_metadata(self, messages: List[Dict]) -> Dict:
        """Build conversation metadata from the first and last message.

        Returns:
            ``{}`` for an empty conversation, otherwise a dict with
            ``'start_time'``, ``'end_time'`` (raw ``timestamp`` values or
            ``None``), ``'duration_minutes'`` and ``'sources_used'``.
        """
        if not messages:
            return {}

        return {
            'start_time': messages[0].get('timestamp', None),
            'end_time': messages[-1].get('timestamp', None),
            'duration_minutes': self._calculate_duration(messages),
            'sources_used': self._extract_sources(messages)
        }

    def _calculate_duration(self, messages: List[Dict]) -> float:
        """Return minutes between the first and last message timestamps.

        Timestamps are parsed with ``datetime.fromisoformat``. This is a
        best-effort value: when the conversation is empty, or a timestamp is
        missing, not a string, unparsable, or the two cannot be subtracted,
        the result is ``0.0``.
        """
        try:
            start_time = datetime.fromisoformat(messages[0].get('timestamp', ''))
            end_time = datetime.fromisoformat(messages[-1].get('timestamp', ''))
            return (end_time - start_time).total_seconds() / 60
        except (IndexError, AttributeError, TypeError, ValueError):
            # IndexError: no messages; AttributeError: message is not a dict;
            # TypeError: non-string timestamp or naive/aware mismatch;
            # ValueError: missing or malformed ISO string.
            return 0.0

    def _extract_sources(self, messages: List[Dict]) -> List[str]:
        """Return the unique source filenames referenced by the messages.

        Each message may carry a ``'sources'`` list of dicts with a
        ``'filename'`` key. Sources without a filename are skipped, and the
        result is sorted so the output is deterministic.
        """
        filenames = set()
        for message in messages:
            for source in message.get('sources') or ():
                filename = source.get('filename')
                if filename:
                    filenames.add(filename)
        return sorted(filenames)