File size: 4,992 Bytes
e87abff
 
 
 
 
82b8aa2
 
e87abff
 
 
 
82b8aa2
 
 
e87abff
82b8aa2
 
 
 
 
 
 
 
 
 
 
 
 
 
e87abff
 
82b8aa2
 
 
e87abff
 
 
 
 
 
 
 
 
 
 
 
82b8aa2
e87abff
 
 
 
 
 
 
82b8aa2
e87abff
 
82b8aa2
e87abff
82b8aa2
 
 
e87abff
 
 
 
 
 
 
 
 
 
 
 
 
82b8aa2
e87abff
 
 
 
 
 
 
 
 
82b8aa2
e87abff
 
82b8aa2
e87abff
 
82b8aa2
e87abff
 
 
 
 
 
 
 
 
 
 
82b8aa2
e87abff
 
 
 
 
 
 
82b8aa2
e87abff
 
 
 
 
 
82b8aa2
e87abff
 
 
 
 
 
 
 
 
 
82b8aa2
 
 
 
e87abff
 
 
 
 
 
 
 
 
 
 
82b8aa2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# src/utils/conversation_summarizer.py
from typing import List, Dict
from transformers import pipeline
import numpy as np
from datetime import datetime
from config.config import settings


class ConversationSummarizer:
    """Summarize chat conversations and derive simple statistics from them.

    A conversation is a list of message dicts. The keys read by this class
    are ``role``, ``content``, ``timestamp`` and ``sources`` (a list of
    dicts carrying a ``filename``); all of them are optional.
    """

    def __init__(
        self,
        model_name: str = None,
        max_length: int = None,
        min_length: int = None
    ):
        """
        Initialize the summarizer

        Args:
            model_name (str, optional): Override default model from config
            max_length (int, optional): Override default max_length from config
            min_length (int, optional): Override default min_length from config
        """
        config = settings.SUMMARIZER_CONFIG

        # Use provided values or fall back to config values
        self.model_name = model_name or config['model_name']
        # A max_length of 0 is not a usable limit, so falsy values keep
        # falling back to the configured default.
        self.max_length = max_length or config['max_length']
        # min_length=0 is a legitimate "no minimum" request; only None
        # means "not provided".
        self.min_length = (
            config['min_length'] if min_length is None else min_length
        )

        # Initialize the summarizer with config settings
        self.summarizer = pipeline(
            "summarization",
            model=self.model_name,
            device=config['device'],
            model_kwargs=config['model_kwargs']
        )

    @staticmethod
    def _message_text(message: Dict) -> str:
        """Return the message content, treating a missing/None value as ''."""
        return message.get('content') or ''

    async def summarize_conversation(
        self,
        messages: List[Dict],
        include_metadata: bool = True
    ) -> Dict:
        """
        Summarize a conversation and provide key insights

        Args:
            messages (List[Dict]): Conversation messages, oldest first
            include_metadata (bool): Whether to compute the metadata section

        Returns:
            Dict: ``summary`` (str), ``key_insights`` (dict) and ``metadata``
            (dict, empty when ``include_metadata`` is False or there are no
            messages).
        """
        # Nothing to summarize: return a well-formed empty result instead of
        # feeding empty text to the model.
        if not messages:
            return {
                'summary': '',
                'key_insights': self._extract_insights([]),
                'metadata': {}
            }

        # Format conversation for summarization
        formatted_convo = self._format_conversation(messages)

        # Generate summary
        # NOTE(review): this call is not awaited, so the coroutine does not
        # yield while the model runs -- confirm that is acceptable for callers.
        summary = self.summarizer(
            formatted_convo,
            max_length=self.max_length,
            min_length=self.min_length,
            do_sample=False
        )[0]['summary_text']

        # Extract key insights
        insights = self._extract_insights(messages)

        # Generate metadata if requested
        metadata = self._generate_metadata(
            messages) if include_metadata else {}

        return {
            'summary': summary,
            'key_insights': insights,
            'metadata': metadata
        }

    def _format_conversation(self, messages: List[Dict]) -> str:
        """Format conversation for summarization

        Each message becomes one ``"<role>: <content>"`` line; a missing role
        is rendered as ``unknown`` and missing/None content as an empty string.
        """
        return "\n".join(
            f"{msg.get('role', 'unknown')}: {self._message_text(msg)}"
            for msg in messages
        )

    def _extract_insights(self, messages: List[Dict]) -> Dict:
        """Extract key insights from conversation

        Returns:
            Dict: ``message_distribution`` (user/assistant counts),
            ``average_message_length`` (characters, truncated to int),
            ``main_topics`` (list of str) and ``total_messages``.
        """
        # Count message types
        message_counts = {
            'user': sum(1 for m in messages if m.get('role') == 'user'),
            'assistant': sum(
                1 for m in messages if m.get('role') == 'assistant')
        }

        # Calculate average message length; an empty conversation averages
        # to 0 rather than NaN.
        total_chars = sum(len(self._message_text(m)) for m in messages)
        avg_length = total_chars // len(messages) if messages else 0

        # Extract main topics (simplified)
        topics = self._extract_topics(messages)

        return {
            'message_distribution': message_counts,
            'average_message_length': avg_length,
            'main_topics': topics,
            'total_messages': len(messages)
        }

    def _extract_topics(self, messages: List[Dict]) -> List[str]:
        """Extract main topics from conversation

        The combined message text is summarized to a short passage which is
        then split on ``'. '``; blank fragments are dropped. Returns an empty
        list when there is no text to summarize.
        """
        # Combine all messages
        full_text = " ".join(self._message_text(m) for m in messages)
        if not full_text.strip():
            return []

        # Use the summarizer to extract main points
        short_summary = self.summarizer(
            full_text,
            max_length=50,
            min_length=10,
            do_sample=False
        )[0]['summary_text']

        fragments = (part.strip() for part in short_summary.split('. '))
        return [part for part in fragments if part]

    def _generate_metadata(self, messages: List[Dict]) -> Dict:
        """Generate conversation metadata

        Returns:
            Dict: ``start_time`` / ``end_time`` (timestamps of the first and
            last message, or None), ``duration_minutes`` and ``sources_used``;
            an empty dict when there are no messages.
        """
        if not messages:
            return {}

        return {
            'start_time': messages[0].get('timestamp', None),
            'end_time': messages[-1].get('timestamp', None),
            'duration_minutes': self._calculate_duration(messages),
            'sources_used': self._extract_sources(messages)
        }

    def _calculate_duration(self, messages: List[Dict]) -> float:
        """Calculate conversation duration in minutes

        Timestamps are parsed with ``datetime.fromisoformat``. Best effort:
        returns 0.0 when the list is empty or a timestamp is missing, not a
        string, unparsable, or cannot be subtracted from the other one.
        """
        try:
            start_time = datetime.fromisoformat(
                messages[0].get('timestamp', ''))
            end_time = datetime.fromisoformat(
                messages[-1].get('timestamp', ''))
            return (end_time - start_time).total_seconds() / 60
        except (AttributeError, IndexError, TypeError, ValueError):
            # Only swallow data problems; interrupts and exits propagate.
            return 0.0

    def _extract_sources(self, messages: List[Dict]) -> List[str]:
        """Extract unique sources used in conversation

        Returns the distinct non-empty ``filename`` values found in each
        message's ``sources`` list, in first-seen order.
        """
        # A dict keeps insertion order, giving a deterministic result.
        seen = {}
        for message in messages:
            for source in message.get('sources') or []:
                filename = source.get('filename')
                if filename:
                    seen[filename] = None
        return list(seen)