flare / ssml_converter.py
ciyidogan's picture
Update ssml_converter.py
f119504 verified
raw
history blame
5.68 kB
# ssml_converter.py
"""
SSML (Speech Synthesis Markup Language) Converter
"""
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Any, Dict, Optional
from xml.sax.saxutils import escape
class SSMLConverter:
    """Convert plain text to SSML (Speech Synthesis Markup Language).

    The input is scanned for currency amounts, times, dates, alphanumeric
    codes, numbers and pause markers (``...`` / ``--``).  Each of those is
    wrapped in the matching ``<say-as>`` / ``<break>`` element; everything
    else is emitted as ordinary text, in its original position.
    """

    # Content-type patterns.  They are joined into one alternation, so at any
    # given position the first pattern (in this order) that matches wins --
    # e.g. "5 USD" is a currency, not a bare number.
    _PATTERNS = {
        'currency': r'[₺$€£]\s*\d+(?:[.,]\d+)?|\d+(?:[.,]\d+)?\s*(?:TL|USD|EUR|GBP)',
        'time': r'\b\d{1,2}:\d{2}(?::\d{2})?\b',
        'date': r'\b\d{4}-\d{2}-\d{2}\b|\b\d{1,2}[./]\d{1,2}[./]\d{2,4}\b',
        'code': r'\b[A-Z]{2,5}\d{2,5}\b',
        'number': r'\b\d+(?:[.,]\d+)?\b',
        'pause': r'\.{3}|--'
    }

    # Compiled once at class creation instead of on every call.
    _SEGMENT_RE = re.compile(
        '|'.join(f'(?P<{name}>{pattern})' for name, pattern in _PATTERNS.items())
    )

    # Dates written year-first (the first alternative of the 'date' pattern).
    _ISO_DATE_RE = re.compile(r'\d{4}-\d{2}-\d{2}')

    # Languages whose numeric dates are written month-first; every other
    # language is assumed to write day-first.
    _MDY_LANGUAGES = frozenset({"en-US"})

    # <say-as> attributes for each segment type with fixed attributes.
    # Dates are handled separately because their format depends on the text.
    _SAY_AS_ATTRS = {
        'number': {'interpret-as': 'cardinal'},
        'currency': {'interpret-as': 'currency'},
        'time': {'interpret-as': 'time', 'format': 'hms24'},
        'code': {'interpret-as': 'characters'},
    }

    def __init__(self, language: str = "tr-TR"):
        """Create a converter emitting ``xml:lang=language`` on ``<speak>``."""
        self.language = language

    def convert_to_ssml(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
        """Convert plain text to an SSML document string.

        Args:
            text: Plain text to convert.
            options: Accepted for interface compatibility; currently unused.

        Returns:
            The serialized ``<speak>`` element.  Special XML characters in
            ``text`` are escaped exactly once, by the serializer.
        """
        speak = self._build_speak(text)
        return ET.tostring(speak, encoding='unicode', method='xml')

    def _build_speak(self, text: str) -> ET.Element:
        """Build the ``<speak>`` element tree for ``text``."""
        speak = ET.Element("speak")
        speak.set("version", "1.0")
        speak.set("xml:lang", self.language)

        for segment in self._segment_text(text):
            kind = segment["type"]
            if kind == "plain":
                # Raw text only: ElementTree escapes on serialization, so
                # escaping here as well would produce "&amp;amp;".
                # Whitespace-only runs are kept so neighbouring tokens are
                # not glued together.
                self._append_text(speak, segment["text"])
            elif kind == "pause":
                ET.SubElement(speak, "break", {"time": segment["duration"]})
            elif kind == "date":
                say_as = ET.SubElement(speak, "say-as", {
                    "interpret-as": "date",
                    "format": self._date_format(segment["text"]),
                })
                say_as.text = segment["text"]
            elif kind in self._SAY_AS_ATTRS:
                say_as = ET.SubElement(speak, "say-as", dict(self._SAY_AS_ATTRS[kind]))
                say_as.text = segment["text"]
        return speak

    @staticmethod
    def _append_text(parent: ET.Element, text: str) -> None:
        """Append ``text`` at the current end of ``parent``'s content.

        In ElementTree, text that follows a child element is stored in that
        child's ``tail``; only text preceding the first child goes in
        ``parent.text``.
        """
        if not text:
            return
        if len(parent):
            last = parent[-1]
            last.tail = (last.tail or "") + text
        else:
            parent.text = (parent.text or "") + text

    def _date_format(self, date_text: str) -> str:
        """Return the ``say-as`` date format matching how ``date_text`` is written."""
        if self._ISO_DATE_RE.fullmatch(date_text):
            return "ymd"
        # Separator-style dates (25.12.2023, 12/25/2023) put the year last;
        # day/month order follows the configured language.
        return "mdy" if self.language in self._MDY_LANGUAGES else "dmy"

    def _segment_text(self, text: str) -> list:
        """Split ``text`` into typed segments for SSML processing.

        Returns:
            A list of dicts in document order.  Each has a ``'type'`` key
            (``'plain'``, ``'currency'``, ``'time'``, ``'date'``, ``'code'``,
            ``'number'`` or ``'pause'``).  Pause segments carry a
            ``'duration'``; every other segment carries its ``'text'``.
        """
        segments = []
        if not text:
            return segments

        last_end = 0
        for match in self._SEGMENT_RE.finditer(text):
            # Plain text between the previous match and this one.
            if match.start() > last_end:
                segments.append({
                    'type': 'plain',
                    'text': text[last_end:match.start()]
                })

            # The patterns contain no nested capturing groups, so lastgroup
            # is the name of the alternative that matched.
            type_name = match.lastgroup
            matched = match.group(0)
            if type_name == 'pause':
                segments.append({
                    'type': 'pause',
                    'duration': '500ms' if matched == '...' else '1s'
                })
            else:
                segments.append({
                    'type': type_name,
                    'text': matched
                })
            last_end = match.end()

        # Plain text after the final match.
        if last_end < len(text):
            segments.append({
                'type': 'plain',
                'text': text[last_end:]
            })
        return segments

    def add_emphasis(self, text: str, words: list, level: str = "moderate") -> str:
        """Convert ``text`` to SSML and wrap occurrences of ``words`` in ``<emphasis>``.

        Matching is literal and case-sensitive (substring match), and every
        occurrence in the plain-text parts is emphasized.  The content of
        ``<say-as>`` elements is left untouched, so no markup is nested inside
        them.

        Args:
            text: Plain text to convert.
            words: Strings to emphasize; empty strings are ignored.
            level: Value for the ``level`` attribute of ``<emphasis>``.

        Returns:
            The serialized ``<speak>`` element.
        """
        speak = self._build_speak(text)

        # Longest first, so a longer phrase wins over a shorter word it contains.
        targets = sorted(dict.fromkeys(w for w in (words or ()) if w), key=len, reverse=True)
        if targets:
            word_re = re.compile('|'.join(re.escape(w) for w in targets))

            # Detach the existing content, then re-append it in order while
            # splitting each plain-text run around the matched words.
            children = list(speak)
            leading = speak.text or ""
            for child in children:
                speak.remove(child)
            speak.text = None

            self._append_emphasized(speak, leading, word_re, level)
            for child in children:
                trailing = child.tail or ""
                child.tail = None
                speak.append(child)
                self._append_emphasized(speak, trailing, word_re, level)

        return ET.tostring(speak, encoding='unicode', method='xml')

    def _append_emphasized(self, parent: ET.Element, text: str, word_re, level: str) -> None:
        """Append ``text`` to ``parent``, wrapping each ``word_re`` match in ``<emphasis>``."""
        position = 0
        for match in word_re.finditer(text):
            self._append_text(parent, text[position:match.start()])
            emphasis = ET.SubElement(parent, "emphasis", {"level": level})
            emphasis.text = match.group(0)
            position = match.end()
        self._append_text(parent, text[position:])