Spaces:
Running
Running
File size: 4,992 Bytes
e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 e87abff 82b8aa2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
# src/utils/conversation_summarizer.py
from typing import List, Dict
from transformers import pipeline
import numpy as np
from datetime import datetime
from config.config import settings
class ConversationSummarizer:
    """Summarize chat conversations with a Hugging Face summarization pipeline.

    Besides the model-generated summary, the class derives simple statistics
    (message counts, average length, topics) and metadata (timestamps,
    duration, sources) from the message dictionaries themselves.
    """

    def __init__(
        self,
        model_name: str = None,
        max_length: int = None,
        min_length: int = None
    ):
        """
        Initialize the summarizer

        Args:
            model_name (str, optional): Override default model from config
            max_length (int, optional): Override default max_length from config
            min_length (int, optional): Override default min_length from config
        """
        config = settings.SUMMARIZER_CONFIG

        # An empty model name is never valid, so any falsy value falls back
        # to the configured model.
        self.model_name = model_name or config['model_name']

        # Compare against None explicitly so that an explicit 0 (for example
        # min_length=0) is honoured instead of being silently replaced.
        self.max_length = (
            max_length if max_length is not None else config['max_length']
        )
        self.min_length = (
            min_length if min_length is not None else config['min_length']
        )

        # Initialize the summarizer with config settings
        self.summarizer = pipeline(
            "summarization",
            model=self.model_name,
            device=config['device'],
            model_kwargs=config['model_kwargs']
        )

    async def summarize_conversation(
        self,
        messages: List[Dict],
        include_metadata: bool = True
    ) -> Dict:
        """
        Summarize a conversation and provide key insights

        NOTE(review): the pipeline call is synchronous, so this coroutine
        does not yield control while the model is running.

        Args:
            messages: Message dicts; the keys read are 'role', 'content',
                'timestamp' and 'sources', all optional.
            include_metadata: When False, 'metadata' is an empty dict.

        Returns:
            Dict with the keys 'summary', 'key_insights' and 'metadata'.
        """
        if not messages:
            # Nothing to summarize: skip the model call entirely.
            return {
                'summary': '',
                'key_insights': self._extract_insights([]),
                'metadata': {}
            }

        # Format conversation for summarization
        formatted_convo = self._format_conversation(messages)

        # Generate summary
        summary = self.summarizer(
            formatted_convo,
            max_length=self.max_length,
            min_length=self.min_length,
            do_sample=False
        )[0]['summary_text']

        # Extract key insights
        insights = self._extract_insights(messages)

        # Generate metadata if requested
        metadata = self._generate_metadata(messages) if include_metadata else {}

        return {
            'summary': summary,
            'key_insights': insights,
            'metadata': metadata
        }

    def _format_conversation(self, messages: List[Dict]) -> str:
        """Format conversation as one "role: content" line per message."""
        return "\n".join(
            f"{msg.get('role', 'unknown')}: {msg.get('content', '')}"
            for msg in messages
        )

    def _extract_insights(self, messages: List[Dict]) -> Dict:
        """Extract key insights from conversation.

        Returns:
            Dict with 'message_distribution' (user/assistant counts),
            'average_message_length' (characters, truncated to int),
            'main_topics' and 'total_messages'.
        """
        # Count message types
        message_counts = {
            'user': sum(1 for m in messages if m.get('role') == 'user'),
            'assistant': sum(
                1 for m in messages if m.get('role') == 'assistant'
            )
        }

        # Calculate average message length. np.mean([]) is NaN and int(NaN)
        # raises, so an empty conversation is reported as length 0.
        lengths = [len(m.get('content') or '') for m in messages]
        avg_length = int(np.mean(lengths)) if lengths else 0

        # Extract main topics (simplified)
        topics = self._extract_topics(messages)

        return {
            'message_distribution': message_counts,
            'average_message_length': avg_length,
            'main_topics': topics,
            'total_messages': len(messages)
        }

    def _extract_topics(self, messages: List[Dict]) -> List[str]:
        """Extract main topics from conversation.

        The contents of all messages are joined, summarized with short
        length limits, and the summary is split into sentence fragments.
        """
        # Combine all messages
        full_text = " ".join((m.get('content') or '') for m in messages)
        if not full_text.strip():
            # No text at all: avoid running the model on empty input.
            return []

        # Use the summarizer to extract main points
        condensed = self.summarizer(
            full_text,
            max_length=50,
            min_length=10,
            do_sample=False
        )[0]['summary_text']

        # Drop empty fragments and surrounding whitespace left by the split.
        return [
            fragment.strip()
            for fragment in condensed.split('. ')
            if fragment.strip()
        ]

    def _generate_metadata(self, messages: List[Dict]) -> Dict:
        """Generate conversation metadata.

        Returns an empty dict for an empty conversation; otherwise the first
        and last timestamps, the duration in minutes and the sources used.
        """
        if not messages:
            return {}
        return {
            'start_time': messages[0].get('timestamp', None),
            'end_time': messages[-1].get('timestamp', None),
            'duration_minutes': self._calculate_duration(messages),
            'sources_used': self._extract_sources(messages)
        }

    def _calculate_duration(self, messages: List[Dict]) -> float:
        """Calculate conversation duration in minutes.

        Best effort: returns 0.0 when the conversation is empty or when the
        first/last timestamps are missing, are not ISO-format strings, or
        cannot be subtracted from each other.
        """
        if not messages:
            return 0.0
        try:
            start_time = datetime.fromisoformat(messages[0].get('timestamp'))
            end_time = datetime.fromisoformat(messages[-1].get('timestamp'))
            return (end_time - start_time).total_seconds() / 60
        except (TypeError, ValueError):
            # TypeError: timestamp missing / not a string, or naive and aware
            # datetimes mixed. ValueError: string is not valid ISO format.
            return 0.0

    def _extract_sources(self, messages: List[Dict]) -> List[str]:
        """Extract unique sources used in conversation, in first-seen order."""
        # A dict keeps insertion order, giving a deterministic result where
        # a set would not.
        unique_sources = {}
        for message in messages:
            for source in message.get('sources') or ():
                unique_sources.setdefault(source.get('filename', ''), None)
        return list(unique_sources)
|