Spaces:
Building
Building
# ssml_converter.py
"""
SSML (Speech Synthesis Markup Language) Converter
"""
import re
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Any, Dict, Optional
from xml.sax.saxutils import escape
class SSMLConverter:
    """Convert plain text to SSML (Speech Synthesis Markup Language).

    The text is split into typed segments (currency, time, date, code,
    number, pause, plain). Special segments become ``<say-as>`` or
    ``<break>`` elements; plain text is kept, in its original position,
    between them.
    """

    # Order matters: when several alternatives could match at the same
    # position the earliest one wins (e.g. "10:30" is a time, not a number).
    _PATTERNS = (
        ("currency", r'[₺$€£]\s*\d+(?:[.,]\d+)?|\d+(?:[.,]\d+)?\s*(?:TL|USD|EUR|GBP)'),
        ("time", r'\b\d{1,2}:\d{2}(?::\d{2})?\b'),
        ("date", r'\b\d{4}-\d{2}-\d{2}\b|\b\d{1,2}[./]\d{1,2}[./]\d{2,4}\b'),
        ("code", r'\b[A-Z]{2,5}\d{2,5}\b'),
        ("number", r'\b\d+(?:[.,]\d+)?\b'),
        ("pause", r'\.{3}|--'),
    )

    # Compiled once; each alternative is a named group so that
    # ``match.lastgroup`` identifies the segment type directly.
    _SEGMENT_RE = re.compile(
        "|".join(f"(?P<{name}>{pattern})" for name, pattern in _PATTERNS)
    )

    _ISO_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")

    # Fixed <say-as> attributes per segment type ("date" is computed per match).
    _SAY_AS_ATTRS = {
        "number": {"interpret-as": "cardinal"},
        "currency": {"interpret-as": "currency"},
        "time": {"interpret-as": "time", "format": "hms24"},
        "code": {"interpret-as": "characters"},
    }

    def __init__(self, language: str = "tr-TR"):
        """Store the language tag written to the ``xml:lang`` attribute."""
        self.language = language

    def convert_to_ssml(self, text: str, options: Optional[Dict[str, Any]] = None) -> str:
        """Convert plain text to an SSML document string.

        Args:
            text: Plain text to convert.
            options: Accepted for interface compatibility; currently unused.

        Returns:
            The serialized ``<speak>`` element. Plain text is XML-escaped
            exactly once (by the serializer) and stays in its original
            position relative to the generated elements.
        """
        return ET.tostring(self._build_speak(text), encoding='unicode', method='xml')

    def _build_speak(self, text: str, emphasis_re=None, level: str = "moderate") -> ET.Element:
        """Build the ``<speak>`` tree for *text*.

        When *emphasis_re* is given, every match of it inside plain text is
        wrapped in ``<emphasis level=level>``.
        """
        speak = ET.Element("speak")
        speak.set("version", "1.0")
        speak.set("xml:lang", self.language)

        for segment in self._segment_text(text):
            kind = segment["type"]
            if kind == "plain":
                self._append_plain(speak, segment["text"], emphasis_re, level)
            elif kind == "pause":
                ET.SubElement(speak, "break").set("time", segment["duration"])
            else:
                if kind == "date":
                    attrs = {
                        "interpret-as": "date",
                        "format": self._date_format(segment["text"]),
                    }
                else:
                    attrs = self._SAY_AS_ATTRS.get(kind)
                if attrs is None:
                    # Unknown segment types are ignored.
                    continue
                say_as = ET.SubElement(speak, "say-as", dict(attrs))
                say_as.text = segment["text"]
        return speak

    def _date_format(self, date_text: str) -> str:
        """Return the say-as ``format`` matching the layout of *date_text*."""
        if self._ISO_DATE_RE.fullmatch(date_text):
            return "ymd"
        # Non-ISO numeric dates are day-first in most locales (including the
        # default tr-TR); en-US is the common month-first exception.
        if self.language.lower() == "en-us":
            return "mdy"
        return "dmy"

    @staticmethod
    def _append_text(parent: ET.Element, text: str) -> None:
        """Append *text* after whatever *parent* already contains.

        ElementTree stores text that follows a child element in that child's
        ``tail``; only text preceding the first child lives in ``parent.text``.
        """
        if not text:
            return
        if len(parent):
            last = parent[-1]
            last.tail = (last.tail or "") + text
        else:
            parent.text = (parent.text or "") + text

    def _append_plain(self, parent: ET.Element, text: str, emphasis_re, level: str) -> None:
        """Append plain *text*, wrapping *emphasis_re* matches in ``<emphasis>``."""
        if emphasis_re is None:
            self._append_text(parent, text)
            return
        cursor = 0
        for match in emphasis_re.finditer(text):
            self._append_text(parent, text[cursor:match.start()])
            emphasis = ET.SubElement(parent, "emphasis")
            emphasis.set("level", level)
            emphasis.text = match.group(0)
            cursor = match.end()
        self._append_text(parent, text[cursor:])

    def _segment_text(self, text: str) -> list:
        """Segment text into typed pieces for SSML processing.

        Returns:
            A list of dicts in source order. Each has a ``'type'`` key; pause
            segments carry ``'duration'`` (``'500ms'`` for ``...``, ``'1s'``
            for ``--``), all other segments carry the matched ``'text'``.
        """
        segments = []
        last_end = 0
        for match in self._SEGMENT_RE.finditer(text):
            # Plain text between the previous match and this one.
            if match.start() > last_end:
                segments.append({
                    'type': 'plain',
                    'text': text[last_end:match.start()]
                })
            type_name = match.lastgroup
            matched = match.group(0)
            if type_name == 'pause':
                segments.append({
                    'type': 'pause',
                    'duration': '500ms' if matched == '...' else '1s'
                })
            else:
                segments.append({'type': type_name, 'text': matched})
            last_end = match.end()

        # Remaining text after the last match.
        if last_end < len(text):
            segments.append({
                'type': 'plain',
                'text': text[last_end:]
            })
        return segments

    def add_emphasis(self, text: str, words: list, level: str = "moderate") -> str:
        """Convert *text* to SSML and emphasize the given words.

        Args:
            text: Plain text to convert.
            words: Strings to emphasize. Matching is a case-sensitive
                substring match; empty strings are ignored.
            level: Value of the ``level`` attribute on ``<emphasis>``.

        Returns:
            The serialized SSML. Every occurrence inside plain text is
            wrapped in place; text inside ``<say-as>`` is left untouched.
        """
        # De-duplicate, drop empties (they would match everywhere) and try
        # longer words first so "newspaper" is preferred over "news".
        candidates = sorted(
            dict.fromkeys(word for word in words if word), key=len, reverse=True
        )
        emphasis_re = (
            re.compile("|".join(re.escape(word) for word in candidates))
            if candidates
            else None
        )
        root = self._build_speak(text, emphasis_re, level)
        return ET.tostring(root, encoding='unicode', method='xml')