Fedir Zadniprovskyi committed
Commit ec4d8ae · 1 parent: 6b7295b
refactor: update response model names and module name
Files changed:
- src/faster_whisper_server/{server_models.py → api_models.py} +102 -16
- src/faster_whisper_server/asr.py +11 -5
- src/faster_whisper_server/routers/list_models.py +9 -9
- src/faster_whisper_server/routers/stt.py +25 -20
- src/faster_whisper_server/{core.py → text_utils.py} +13 -188
- src/faster_whisper_server/text_utils_test.py +111 -0
- src/faster_whisper_server/transcriber.py +3 -2
- tests/api_timestamp_granularities_test.py +1 -1
- tests/openai_timestamp_granularities_test.py +1 -1
- tests/sse_test.py +5 -5
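In short, the commit renames `server_models.py` to `api_models.py`, renames `core.py` to `text_utils.py`, and renames the response models to match the OpenAI OpenAPI spec. A hedged before/after sketch of the import changes downstream code would make (names taken from the diffs below):

# Before this commit (old module and model names):
# from faster_whisper_server.server_models import TranscriptionJsonResponse, TranscriptionVerboseJsonResponse
# from faster_whisper_server.core import Segment, Word, segments_to_text

# After this commit:
from faster_whisper_server.api_models import (
    CreateTranscriptionResponseJson,
    CreateTranscriptionResponseVerboseJson,
    TranscriptionSegment,
    TranscriptionWord,
)
from faster_whisper_server.text_utils import segments_to_text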
src/faster_whisper_server/{server_models.py → api_models.py}
RENAMED
@@ -4,36 +4,117 @@ from typing import TYPE_CHECKING, Literal
 
 from pydantic import BaseModel, ConfigDict, Field
 
-from faster_whisper_server.core import …
+from faster_whisper_server.text_utils import Transcription, canonicalize_word, segments_to_text
 
 if TYPE_CHECKING:
-    from …
+    from collections.abc import Iterable
+
+    import faster_whisper.transcribe
+
+
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L10909
+class TranscriptionWord(BaseModel):
+    start: float
+    end: float
+    word: str
+    probability: float
+
+    @classmethod
+    def from_segments(cls, segments: Iterable[TranscriptionSegment]) -> list[TranscriptionWord]:
+        words: list[TranscriptionWord] = []
+        for segment in segments:
+            # NOTE: a temporary "fix" for https://github.com/fedirz/faster-whisper-server/issues/58.
+            # TODO: properly address the issue
+            assert (
+                segment.words is not None
+            ), "Segment must have words. If you are using an API ensure `timestamp_granularities[]=word` is set"
+            words.extend(segment.words)
+        return words
+
+    def offset(self, seconds: float) -> None:
+        self.start += seconds
+        self.end += seconds
+
+    @classmethod
+    def common_prefix(cls, a: list[TranscriptionWord], b: list[TranscriptionWord]) -> list[TranscriptionWord]:
+        i = 0
+        while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
+            i += 1
+        return a[:i]
+
+
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L10938
+class TranscriptionSegment(BaseModel):
+    id: int
+    seek: int
+    start: float
+    end: float
+    text: str
+    tokens: list[int]
+    temperature: float
+    avg_logprob: float
+    compression_ratio: float
+    no_speech_prob: float
+    words: list[TranscriptionWord] | None
+
+    @classmethod
+    def from_faster_whisper_segments(
+        cls, segments: Iterable[faster_whisper.transcribe.Segment]
+    ) -> Iterable[TranscriptionSegment]:
+        for segment in segments:
+            yield cls(
+                id=segment.id,
+                seek=segment.seek,
+                start=segment.start,
+                end=segment.end,
+                text=segment.text,
+                tokens=segment.tokens,
+                temperature=segment.temperature,
+                avg_logprob=segment.avg_logprob,
+                compression_ratio=segment.compression_ratio,
+                no_speech_prob=segment.no_speech_prob,
+                words=[
+                    TranscriptionWord(
+                        start=word.start,
+                        end=word.end,
+                        word=word.word,
+                        probability=word.probability,
+                    )
+                    for word in segment.words
+                ]
+                if segment.words is not None
+                else None,
+            )
 
 
 # https://platform.openai.com/docs/api-reference/audio/json-object
-class TranscriptionJsonResponse(BaseModel):
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L10924
+class CreateTranscriptionResponseJson(BaseModel):
     text: str
 
     @classmethod
-    def from_segments(cls, segments: list[Segment]) -> TranscriptionJsonResponse:
+    def from_segments(cls, segments: list[TranscriptionSegment]) -> CreateTranscriptionResponseJson:
         return cls(text=segments_to_text(segments))
 
     @classmethod
-    def from_transcription(cls, transcription: Transcription) -> TranscriptionJsonResponse:
+    def from_transcription(cls, transcription: Transcription) -> CreateTranscriptionResponseJson:
         return cls(text=transcription.text)
 
 
 # https://platform.openai.com/docs/api-reference/audio/verbose-json-object
-class TranscriptionVerboseJsonResponse(BaseModel):
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L11007
+class CreateTranscriptionResponseVerboseJson(BaseModel):
     task: str = "transcribe"
     language: str
     duration: float
     text: str
-    words: list[Word]
-    segments: list[Segment]
+    words: list[TranscriptionWord] | None
+    segments: list[TranscriptionSegment]
 
     @classmethod
-    def from_segment(cls, segment: Segment, transcription_info: TranscriptionInfo) -> TranscriptionVerboseJsonResponse:
+    def from_segment(
+        cls, segment: TranscriptionSegment, transcription_info: faster_whisper.transcribe.TranscriptionInfo
+    ) -> CreateTranscriptionResponseVerboseJson:
         return cls(
             language=transcription_info.language,
             duration=segment.end - segment.start,
@@ -44,18 +125,20 @@ class TranscriptionVerboseJsonResponse(BaseModel):
 
     @classmethod
     def from_segments(
-        cls, segments: list[Segment], transcription_info: TranscriptionInfo
-    ) -> TranscriptionVerboseJsonResponse:
+        cls, segments: list[TranscriptionSegment], transcription_info: faster_whisper.transcribe.TranscriptionInfo
+    ) -> CreateTranscriptionResponseVerboseJson:
         return cls(
             language=transcription_info.language,
            duration=transcription_info.duration,
             text=segments_to_text(segments),
             segments=segments,
-            words=Word.from_segments(segments),
+            words=TranscriptionWord.from_segments(segments)
+            if transcription_info.transcription_options.word_timestamps
+            else None,
         )
 
     @classmethod
-    def from_transcription(cls, transcription: Transcription) -> TranscriptionVerboseJsonResponse:
+    def from_transcription(cls, transcription: Transcription) -> CreateTranscriptionResponseVerboseJson:
         return cls(
             language="english",  # FIX: hardcoded
             duration=transcription.duration,
@@ -65,12 +148,14 @@ class TranscriptionVerboseJsonResponse(BaseModel):
         )
 
 
-class ModelListResponse(BaseModel):
-    data: list[ModelObject]
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L8730
+class ListModelsResponse(BaseModel):
+    data: list[Model]
     object: Literal["list"] = "list"
 
 
-class ModelObject(BaseModel):
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L11146
+class Model(BaseModel):
     id: str
     """The model identifier, which can be referenced in the API endpoints."""
     created: int
@@ -109,6 +194,7 @@ class ModelObject(BaseModel):
     )
 
 
+# https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L10909
 TimestampGranularities = list[Literal["segment", "word"]]
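To see how the renamed models compose, here is a minimal hedged sketch, assuming `model` is a loaded `faster_whisper.WhisperModel` and a local `audio.wav` exists (neither comes from the diff):

# Sketch: turn faster-whisper output into the OpenAI-style verbose response.
from faster_whisper import WhisperModel

from faster_whisper_server.api_models import (
    CreateTranscriptionResponseVerboseJson,
    TranscriptionSegment,
)

model = WhisperModel("Systran/faster-distil-whisper-large-v3")
segments, info = model.transcribe("audio.wav", word_timestamps=True)
api_segments = list(TranscriptionSegment.from_faster_whisper_segments(segments))
response = CreateTranscriptionResponseVerboseJson.from_segments(api_segments, info)
print(response.model_dump_json())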
src/faster_whisper_server/asr.py
CHANGED
@@ -1,11 +1,17 @@
+from __future__ import annotations
+
 import asyncio
 import logging
 import time
+from typing import TYPE_CHECKING
 
-from …
+from faster_whisper_server.api_models import TranscriptionSegment, TranscriptionWord
+from faster_whisper_server.text_utils import Transcription
 
-from faster_whisper_server.audio import Audio
-from faster_whisper_server.core import Segment, Transcription, Word
+if TYPE_CHECKING:
+    from faster_whisper import transcribe
+
+    from faster_whisper_server.audio import Audio
 
 logger = logging.getLogger(__name__)
@@ -31,8 +37,8 @@ class FasterWhisperASR:
             word_timestamps=True,
             **self.transcribe_opts,
         )
-        segments = Segment.from_faster_whisper_segments(segments)
-        words = Word.from_segments(segments)
+        segments = TranscriptionSegment.from_faster_whisper_segments(segments)
+        words = TranscriptionWord.from_segments(segments)
         for word in words:
             word.offset(audio.start)
         transcription = Transcription(words)
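`TranscriptionWord.offset` (used just above) shifts a word's timestamps from chunk-relative to stream-relative time. A tiny sketch of its effect, with illustrative values:

# Sketch: offsetting a word by the audio chunk's start time.
from faster_whisper_server.api_models import TranscriptionWord

word = TranscriptionWord(start=0.5, end=0.9, word="hello", probability=0.99)
word.offset(2.0)  # the chunk began 2 s into the stream
assert (word.start, word.end) == (2.5, 2.9)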
src/faster_whisper_server/routers/list_models.py
CHANGED
@@ -9,9 +9,9 @@ from fastapi import (
 )
 import huggingface_hub
 
-from faster_whisper_server.server_models import (
-    ModelListResponse,
-    ModelObject,
+from faster_whisper_server.api_models import (
+    ListModelsResponse,
+    Model,
 )
 
 if TYPE_CHECKING:
@@ -21,11 +21,11 @@ router = APIRouter()
 
 
 @router.get("/v1/models")
-def get_models() -> ModelListResponse:
+def get_models() -> ListModelsResponse:
     models = huggingface_hub.list_models(library="ctranslate2", tags="automatic-speech-recognition", cardData=True)
     models = list(models)
     models.sort(key=lambda model: model.downloads, reverse=True)  # type: ignore  # noqa: PGH003
-    transformed_models: list[ModelObject] = []
+    transformed_models: list[Model] = []
     for model in models:
         assert model.created_at is not None
         assert model.card_data is not None
@@ -36,7 +36,7 @@ def get_models() -> ModelListResponse:
             language = [model.card_data.language]
         else:
             language = model.card_data.language
-        transformed_model = ModelObject(
+        transformed_model = Model(
             id=model.id,
             created=int(model.created_at.timestamp()),
             object_="model",
@@ -44,14 +44,14 @@ def get_models() -> ModelListResponse:
             language=language,
         )
         transformed_models.append(transformed_model)
-    return ModelListResponse(data=transformed_models)
+    return ListModelsResponse(data=transformed_models)
 
 
 @router.get("/v1/models/{model_name:path}")
 # NOTE: `examples` doesn't work https://github.com/tiangolo/fastapi/discussions/10537
 def get_model(
     model_name: Annotated[str, Path(example="Systran/faster-distil-whisper-large-v3")],
-) -> ModelObject:
+) -> Model:
     models = huggingface_hub.list_models(
         model_name=model_name, library="ctranslate2", tags="automatic-speech-recognition", cardData=True
     )
@@ -78,7 +78,7 @@ def get_model(
         language = [exact_match.card_data.language]
     else:
         language = exact_match.card_data.language
-    return ModelObject(
+    return Model(
         id=exact_match.id,
         created=int(exact_match.created_at.timestamp()),
         object_="model",
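A hedged sketch of exercising the renamed models route; the `TestClient` wiring and the app-factory import are assumptions, while the route and model names come from the diff above:

# Sketch: fetch /v1/models and validate the body with the new response model.
from fastapi.testclient import TestClient

from faster_whisper_server.api_models import ListModelsResponse
from faster_whisper_server.main import create_app  # app factory name assumed

client = TestClient(create_app())
models = ListModelsResponse(**client.get("/v1/models").json())
assert models.object == "list"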
src/faster_whisper_server/routers/stt.py
CHANGED
@@ -20,6 +20,14 @@ from fastapi.websockets import WebSocketState
 from faster_whisper.vad import VadOptions, get_speech_timestamps
 from pydantic import AfterValidator
 
+from faster_whisper_server.api_models import (
+    DEFAULT_TIMESTAMP_GRANULARITIES,
+    TIMESTAMP_GRANULARITIES_COMBINATIONS,
+    CreateTranscriptionResponseJson,
+    CreateTranscriptionResponseVerboseJson,
+    TimestampGranularities,
+    TranscriptionSegment,
+)
 from faster_whisper_server.asr import FasterWhisperASR
 from faster_whisper_server.audio import AudioStream, audio_samples_from_file
 from faster_whisper_server.config import (
@@ -28,15 +36,8 @@ from faster_whisper_server.config import (
     ResponseFormat,
     Task,
 )
-from faster_whisper_server.core import Segment, segments_to_srt, segments_to_text, segments_to_vtt
 from faster_whisper_server.dependencies import ConfigDependency, ModelManagerDependency, get_config
-from faster_whisper_server.server_models import (
-    DEFAULT_TIMESTAMP_GRANULARITIES,
-    TIMESTAMP_GRANULARITIES_COMBINATIONS,
-    TimestampGranularities,
-    TranscriptionJsonResponse,
-    TranscriptionVerboseJsonResponse,
-)
+from faster_whisper_server.text_utils import segments_to_srt, segments_to_text, segments_to_vtt
 from faster_whisper_server.transcriber import audio_transcriber
 
 if TYPE_CHECKING:
@@ -51,7 +52,7 @@ router = APIRouter()
 
 
 def segments_to_response(
-    segments: Iterable[Segment],
+    segments: Iterable[TranscriptionSegment],
     transcription_info: TranscriptionInfo,
     response_format: ResponseFormat,
 ) -> Response:
@@ -60,12 +61,12 @@ def segments_to_response(
         return Response(segments_to_text(segments), media_type="text/plain")
     elif response_format == ResponseFormat.JSON:
         return Response(
-            TranscriptionJsonResponse.from_segments(segments).model_dump_json(),
+            CreateTranscriptionResponseJson.from_segments(segments).model_dump_json(),
             media_type="application/json",
         )
     elif response_format == ResponseFormat.VERBOSE_JSON:
         return Response(
-            TranscriptionVerboseJsonResponse.from_segments(segments, transcription_info).model_dump_json(),
+            CreateTranscriptionResponseVerboseJson.from_segments(segments, transcription_info).model_dump_json(),
             media_type="application/json",
         )
     elif response_format == ResponseFormat.VTT:
@@ -83,7 +84,7 @@ def format_as_sse(data: str) -> str:
 
 
 def segments_to_streaming_response(
-    segments: Iterable[Segment],
+    segments: Iterable[TranscriptionSegment],
     transcription_info: TranscriptionInfo,
     response_format: ResponseFormat,
 ) -> StreamingResponse:
@@ -92,9 +93,11 @@ def segments_to_streaming_response(
         if response_format == ResponseFormat.TEXT:
             data = segment.text
         elif response_format == ResponseFormat.JSON:
-            data = TranscriptionJsonResponse.from_segments([segment]).model_dump_json()
+            data = CreateTranscriptionResponseJson.from_segments([segment]).model_dump_json()
         elif response_format == ResponseFormat.VERBOSE_JSON:
-            data = TranscriptionVerboseJsonResponse.from_segment(segment, transcription_info).model_dump_json()
+            data = CreateTranscriptionResponseVerboseJson.from_segment(
+                segment, transcription_info
+            ).model_dump_json()
         elif response_format == ResponseFormat.VTT:
             data = segments_to_vtt(segment, i)
         elif response_format == ResponseFormat.SRT:
@@ -121,7 +124,7 @@ ModelName = Annotated[str, AfterValidator(handle_default_openai_model)]
 
 @router.post(
     "/v1/audio/translations",
-    response_model=str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse,
+    response_model=str | CreateTranscriptionResponseJson | CreateTranscriptionResponseVerboseJson,
 )
 def translate_file(
     config: ConfigDependency,
@@ -145,7 +148,7 @@ def translate_file(
         temperature=temperature,
         vad_filter=True,
     )
-    segments = Segment.from_faster_whisper_segments(segments)
+    segments = TranscriptionSegment.from_faster_whisper_segments(segments)
 
     if stream:
         return segments_to_streaming_response(segments, transcription_info, response_format)
@@ -169,7 +172,7 @@ async def get_timestamp_granularities(request: Request) -> TimestampGranularities:
 # https://github.com/openai/openai-openapi/blob/master/openapi.yaml#L8915
 @router.post(
     "/v1/audio/transcriptions",
-    response_model=str | TranscriptionJsonResponse | TranscriptionVerboseJsonResponse,
+    response_model=str | CreateTranscriptionResponseJson | CreateTranscriptionResponseVerboseJson,
 )
 def transcribe_file(
     config: ConfigDependency,
@@ -211,7 +214,7 @@ def transcribe_file(
         vad_filter=True,
         hotwords=hotwords,
     )
-    segments = Segment.from_faster_whisper_segments(segments)
+    segments = TranscriptionSegment.from_faster_whisper_segments(segments)
 
     if stream:
         return segments_to_streaming_response(segments, transcription_info, response_format)
@@ -286,9 +289,11 @@ async def transcribe_stream(
         if response_format == ResponseFormat.TEXT:
             await ws.send_text(transcription.text)
         elif response_format == ResponseFormat.JSON:
-            await ws.send_json(TranscriptionJsonResponse.from_transcription(transcription).model_dump())
+            await ws.send_json(CreateTranscriptionResponseJson.from_transcription(transcription).model_dump())
         elif response_format == ResponseFormat.VERBOSE_JSON:
-            await ws.send_json(TranscriptionVerboseJsonResponse.from_transcription(transcription).model_dump())
+            await ws.send_json(
+                CreateTranscriptionResponseVerboseJson.from_transcription(transcription).model_dump()
+            )
 
     if ws.client_state != WebSocketState.DISCONNECTED:
         logger.info("Closing the connection.")
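For orientation, a hedged client-side sketch of the non-streaming JSON path through `/v1/audio/transcriptions`; the server URL and file name are assumptions, not from the diff:

# Sketch: POST an audio file and parse the body with the renamed model.
import httpx

from faster_whisper_server.api_models import CreateTranscriptionResponseJson

with open("audio.wav", "rb") as f:  # audio.wav is an assumption
    res = httpx.post(
        "http://localhost:8000/v1/audio/transcriptions",  # host/port assumed
        files={"file": f},
        data={"response_format": "json"},
    )
response = CreateTranscriptionResponseJson(**res.json())
print(response.text)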
src/faster_whisper_server/{core.py → text_utils.py}
RENAMED
@@ -3,90 +3,17 @@ from __future__ import annotations
 import re
 from typing import TYPE_CHECKING
 
-from pydantic import BaseModel
-
 from faster_whisper_server.dependencies import get_config
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    import faster_whisper.transcribe
-
-
-class Word(BaseModel):
-    start: float
-    end: float
-    word: str
-    probability: float
-
-    @classmethod
-    def from_segments(cls, segments: Iterable[Segment]) -> list[Word]:
-        words: list[Word] = []
-        for segment in segments:
-            # NOTE: a temporary "fix" for https://github.com/fedirz/faster-whisper-server/issues/58.
-            # TODO: properly address the issue
-            assert (
-                segment.words is not None
-            ), "Segment must have words. If you are using an API ensure `timestamp_granularities[]=word` is set"
-            words.extend(segment.words)
-        return words
-
-    def offset(self, seconds: float) -> None:
-        self.start += seconds
-        self.end += seconds
-
-    @classmethod
-    def common_prefix(cls, a: list[Word], b: list[Word]) -> list[Word]:
-        i = 0
-        while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
-            i += 1
-        return a[:i]
-
-
-class Segment(BaseModel):
-    id: int
-    seek: int
-    start: float
-    end: float
-    text: str
-    tokens: list[int]
-    temperature: float
-    avg_logprob: float
-    compression_ratio: float
-    no_speech_prob: float
-    words: list[Word] | None
-
-    @classmethod
-    def from_faster_whisper_segments(cls, segments: Iterable[faster_whisper.transcribe.Segment]) -> Iterable[Segment]:
-        for segment in segments:
-            yield cls(
-                id=segment.id,
-                seek=segment.seek,
-                start=segment.start,
-                end=segment.end,
-                text=segment.text,
-                tokens=segment.tokens,
-                temperature=segment.temperature,
-                avg_logprob=segment.avg_logprob,
-                compression_ratio=segment.compression_ratio,
-                no_speech_prob=segment.no_speech_prob,
-                words=[
-                    Word(
-                        start=word.start,
-                        end=word.end,
-                        word=word.word,
-                        probability=word.probability,
-                    )
-                    for word in segment.words
-                ]
-                if segment.words is not None
-                else None,
-            )
+    from faster_whisper_server.api_models import TranscriptionSegment, TranscriptionWord
 
 
 class Transcription:
-    def __init__(self, words: list[Word] = []) -> None:
-        self.words: list[Word] = []
+    def __init__(self, words: list[TranscriptionWord] = []) -> None:
+        self.words: list[TranscriptionWord] = []
         self.extend(words)
 
     @property
@@ -108,11 +35,11 @@ class Transcription:
     def after(self, seconds: float) -> Transcription:
         return Transcription(words=[word for word in self.words if word.start > seconds])
 
-    def extend(self, words: list[Word]) -> None:
+    def extend(self, words: list[TranscriptionWord]) -> None:
         self._ensure_no_word_overlap(words)
         self.words.extend(words)
 
-    def _ensure_no_word_overlap(self, words: list[Word]) -> None:
+    def _ensure_no_word_overlap(self, words: list[TranscriptionWord]) -> None:
         config = get_config()  # HACK
         if len(self.words) > 0 and len(words) > 0:
             if words[0].start + config.word_timestamp_error_margin <= self.words[-1].end:
@@ -130,19 +57,8 @@ def is_eos(text: str) -> bool:
     return any(text.endswith(punctuation_symbol) for punctuation_symbol in ".?!")
 
 
-def test_is_eos() -> None:
-    assert not is_eos("Hello")
-    assert not is_eos("Hello...")
-    assert is_eos("Hello.")
-    assert is_eos("Hello!")
-    assert is_eos("Hello?")
-    assert not is_eos("Hello. Yo")
-    assert not is_eos("Hello. Yo...")
-    assert is_eos("Hello. Yo.")
-
-
-def to_full_sentences(words: list[Word]) -> list[list[Word]]:
-    sentences: list[list[Word]] = [[]]
+def to_full_sentences(words: list[TranscriptionWord]) -> list[list[TranscriptionWord]]:
+    sentences: list[list[TranscriptionWord]] = [[]]
     for word in words:
         sentences[-1].append(word)
         if is_eos(word.word):
@@ -152,28 +68,15 @@ def to_full_sentences(words: list[Word]) -> list[list[Word]]:
     return sentences
 
 
-def tests_to_full_sentences() -> None:
-    def word(text: str) -> Word:
-        return Word(word=text, start=0.0, end=0.0, probability=0.0)
-
-    assert to_full_sentences([]) == []
-    assert to_full_sentences([word(text="Hello")]) == []
-    assert to_full_sentences([word(text="Hello..."), word(" world")]) == []
-    assert to_full_sentences([word(text="Hello..."), word(" world.")]) == [[word("Hello..."), word(" world.")]]
-    assert to_full_sentences([word(text="Hello..."), word(" world."), word(" How")]) == [
-        [word("Hello..."), word(" world.")],
-    ]
-
-
-def word_to_text(words: list[Word]) -> str:
+def word_to_text(words: list[TranscriptionWord]) -> str:
     return "".join(word.word for word in words)
 
 
-def words_to_text_w_ts(words: list[Word]) -> str:
+def words_to_text_w_ts(words: list[TranscriptionWord]) -> str:
     return "".join(f"{word.word}({word.start:.2f}-{word.end:.2f})" for word in words)
 
 
-def segments_to_text(segments: Iterable[Segment]) -> str:
+def segments_to_text(segments: Iterable[TranscriptionSegment]) -> str:
     return "".join(segment.text for segment in segments).strip()
 
 
@@ -185,19 +88,6 @@ def srt_format_timestamp(ts: float) -> str:
     return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
 
 
-def test_srt_format_timestamp() -> None:
-    assert srt_format_timestamp(0.0) == "00:00:00,000"
-    assert srt_format_timestamp(1.0) == "00:00:01,000"
-    assert srt_format_timestamp(1.234) == "00:00:01,234"
-    assert srt_format_timestamp(60.0) == "00:01:00,000"
-    assert srt_format_timestamp(61.0) == "00:01:01,000"
-    assert srt_format_timestamp(61.234) == "00:01:01,234"
-    assert srt_format_timestamp(3600.0) == "01:00:00,000"
-    assert srt_format_timestamp(3601.0) == "01:00:01,000"
-    assert srt_format_timestamp(3601.234) == "01:00:01,234"
-    assert srt_format_timestamp(23423.4234) == "06:30:23,423"
-
-
 def vtt_format_timestamp(ts: float) -> str:
     hours = ts // 3600
     minutes = (ts % 3600) // 60
@@ -206,20 +96,7 @@ def vtt_format_timestamp(ts: float) -> str:
     return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}.{int(milliseconds):03d}"
 
 
-def test_vtt_format_timestamp() -> None:
-    assert vtt_format_timestamp(0.0) == "00:00:00.000"
-    assert vtt_format_timestamp(1.0) == "00:00:01.000"
-    assert vtt_format_timestamp(1.234) == "00:00:01.234"
-    assert vtt_format_timestamp(60.0) == "00:01:00.000"
-    assert vtt_format_timestamp(61.0) == "00:01:01.000"
-    assert vtt_format_timestamp(61.234) == "00:01:01.234"
-    assert vtt_format_timestamp(3600.0) == "01:00:00.000"
-    assert vtt_format_timestamp(3601.0) == "01:00:01.000"
-    assert vtt_format_timestamp(3601.234) == "01:00:01.234"
-    assert vtt_format_timestamp(23423.4234) == "06:30:23.423"
-
-
-def segments_to_vtt(segment: Segment, i: int) -> str:
+def segments_to_vtt(segment: TranscriptionSegment, i: int) -> str:
     start = segment.start if i > 0 else 0.0
     result = f"{vtt_format_timestamp(start)} --> {vtt_format_timestamp(segment.end)}\n{segment.text}\n\n"
 
@@ -229,7 +106,7 @@ def segments_to_vtt(segment: Segment, i: int) -> str:
     return result
 
 
-def segments_to_srt(segment: Segment, i: int) -> str:
+def segments_to_srt(segment: TranscriptionSegment, i: int) -> str:
     return f"{i + 1}\n{srt_format_timestamp(segment.start)} --> {srt_format_timestamp(segment.end)}\n{segment.text}\n\n"
 
 
@@ -240,60 +117,8 @@ def canonicalize_word(text: str) -> str:
     return text.lower().strip().strip(".,?!")
 
 
-def test_canonicalize_word() -> None:
-    assert canonicalize_word("ABC") == "abc"
-    assert canonicalize_word("...ABC?") == "abc"
-    assert canonicalize_word("... AbC ...") == "abc"
-
-
-def common_prefix(a: list[Word], b: list[Word]) -> list[Word]:
+def common_prefix(a: list[TranscriptionWord], b: list[TranscriptionWord]) -> list[TranscriptionWord]:
     i = 0
     while i < len(a) and i < len(b) and canonicalize_word(a[i].word) == canonicalize_word(b[i].word):
         i += 1
     return a[:i]
-
-
-def test_common_prefix() -> None:
-    def word(text: str) -> Word:
-        return Word(word=text, start=0.0, end=0.0, probability=0.0)
-
-    a = [word("a"), word("b"), word("c")]
-    b = [word("a"), word("b"), word("c")]
-    assert common_prefix(a, b) == [word("a"), word("b"), word("c")]
-
-    a = [word("a"), word("b"), word("c")]
-    b = [word("a"), word("b"), word("d")]
-    assert common_prefix(a, b) == [word("a"), word("b")]
-
-    a = [word("a"), word("b"), word("c")]
-    b = [word("a")]
-    assert common_prefix(a, b) == [word("a")]
-
-    a = [word("a")]
-    b = [word("a"), word("b"), word("c")]
-    assert common_prefix(a, b) == [word("a")]
-
-    a = [word("a")]
-    b = []
-    assert common_prefix(a, b) == []
-
-    a = []
-    b = [word("a")]
-    assert common_prefix(a, b) == []
-
-    a = [word("a"), word("b"), word("c")]
-    b = [word("b"), word("c")]
-    assert common_prefix(a, b) == []
-
-
-def test_common_prefix_and_canonicalization() -> None:
-    def word(text: str) -> Word:
-        return Word(word=text, start=0.0, end=0.0, probability=0.0)
-
-    a = [word("A...")]
-    b = [word("a?"), word("b"), word("c")]
-    assert common_prefix(a, b) == [word("A...")]
-
-    a = [word("A..."), word("B?"), word("C,")]
-    b = [word("a??"), word(" b"), word(" ,c")]
-    assert common_prefix(a, b) == [word("A..."), word("B?"), word("C,")]
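As a quick orientation to the relocated helpers, a hedged sketch of what `segments_to_srt` produces for one segment (field values illustrative):

from faster_whisper_server.api_models import TranscriptionSegment
from faster_whisper_server.text_utils import segments_to_srt

seg = TranscriptionSegment(
    id=1, seek=0, start=0.0, end=1.5, text="Hello world.",
    tokens=[], temperature=0.0, avg_logprob=0.0,
    compression_ratio=0.0, no_speech_prob=0.0, words=None,
)
print(segments_to_srt(seg, 0), end="")
# 1
# 00:00:00,000 --> 00:00:01,500
# Hello world.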
src/faster_whisper_server/text_utils_test.py
ADDED
@@ -0,0 +1,111 @@
+from faster_whisper_server.api_models import TranscriptionWord
+from faster_whisper_server.text_utils import (
+    canonicalize_word,
+    common_prefix,
+    is_eos,
+    srt_format_timestamp,
+    to_full_sentences,
+    vtt_format_timestamp,
+)
+
+
+def test_is_eos() -> None:
+    assert not is_eos("Hello")
+    assert not is_eos("Hello...")
+    assert is_eos("Hello.")
+    assert is_eos("Hello!")
+    assert is_eos("Hello?")
+    assert not is_eos("Hello. Yo")
+    assert not is_eos("Hello. Yo...")
+    assert is_eos("Hello. Yo.")
+
+
+def tests_to_full_sentences() -> None:
+    def word(text: str) -> TranscriptionWord:
+        return TranscriptionWord(word=text, start=0.0, end=0.0, probability=0.0)
+
+    assert to_full_sentences([]) == []
+    assert to_full_sentences([word(text="Hello")]) == []
+    assert to_full_sentences([word(text="Hello..."), word(" world")]) == []
+    assert to_full_sentences([word(text="Hello..."), word(" world.")]) == [[word("Hello..."), word(" world.")]]
+    assert to_full_sentences([word(text="Hello..."), word(" world."), word(" How")]) == [
+        [word("Hello..."), word(" world.")],
+    ]
+
+
+def test_srt_format_timestamp() -> None:
+    assert srt_format_timestamp(0.0) == "00:00:00,000"
+    assert srt_format_timestamp(1.0) == "00:00:01,000"
+    assert srt_format_timestamp(1.234) == "00:00:01,234"
+    assert srt_format_timestamp(60.0) == "00:01:00,000"
+    assert srt_format_timestamp(61.0) == "00:01:01,000"
+    assert srt_format_timestamp(61.234) == "00:01:01,234"
+    assert srt_format_timestamp(3600.0) == "01:00:00,000"
+    assert srt_format_timestamp(3601.0) == "01:00:01,000"
+    assert srt_format_timestamp(3601.234) == "01:00:01,234"
+    assert srt_format_timestamp(23423.4234) == "06:30:23,423"
+
+
+def test_vtt_format_timestamp() -> None:
+    assert vtt_format_timestamp(0.0) == "00:00:00.000"
+    assert vtt_format_timestamp(1.0) == "00:00:01.000"
+    assert vtt_format_timestamp(1.234) == "00:00:01.234"
+    assert vtt_format_timestamp(60.0) == "00:01:00.000"
+    assert vtt_format_timestamp(61.0) == "00:01:01.000"
+    assert vtt_format_timestamp(61.234) == "00:01:01.234"
+    assert vtt_format_timestamp(3600.0) == "01:00:00.000"
+    assert vtt_format_timestamp(3601.0) == "01:00:01.000"
+    assert vtt_format_timestamp(3601.234) == "01:00:01.234"
+    assert vtt_format_timestamp(23423.4234) == "06:30:23.423"
+
+
+def test_canonicalize_word() -> None:
+    assert canonicalize_word("ABC") == "abc"
+    assert canonicalize_word("...ABC?") == "abc"
+    assert canonicalize_word("... AbC ...") == "abc"
+
+
+def test_common_prefix() -> None:
+    def word(text: str) -> TranscriptionWord:
+        return TranscriptionWord(word=text, start=0.0, end=0.0, probability=0.0)
+
+    a = [word("a"), word("b"), word("c")]
+    b = [word("a"), word("b"), word("c")]
+    assert common_prefix(a, b) == [word("a"), word("b"), word("c")]
+
+    a = [word("a"), word("b"), word("c")]
+    b = [word("a"), word("b"), word("d")]
+    assert common_prefix(a, b) == [word("a"), word("b")]
+
+    a = [word("a"), word("b"), word("c")]
+    b = [word("a")]
+    assert common_prefix(a, b) == [word("a")]
+
+    a = [word("a")]
+    b = [word("a"), word("b"), word("c")]
+    assert common_prefix(a, b) == [word("a")]
+
+    a = [word("a")]
+    b = []
+    assert common_prefix(a, b) == []
+
+    a = []
+    b = [word("a")]
+    assert common_prefix(a, b) == []
+
+    a = [word("a"), word("b"), word("c")]
+    b = [word("b"), word("c")]
+    assert common_prefix(a, b) == []
+
+
+def test_common_prefix_and_canonicalization() -> None:
+    def word(text: str) -> TranscriptionWord:
+        return TranscriptionWord(word=text, start=0.0, end=0.0, probability=0.0)
+
+    a = [word("A...")]
+    b = [word("a?"), word("b"), word("c")]
+    assert common_prefix(a, b) == [word("A...")]
+
+    a = [word("A..."), word("B?"), word("C,")]
+    b = [word("a??"), word(" b"), word(" ,c")]
+    assert common_prefix(a, b) == [word("A..."), word("B?"), word("C,")]
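Moving these assertions out of the library module and into `text_utils_test.py` means test code no longer ships inside `text_utils.py` itself and gets collected by pytest alongside the rest of the suite.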
src/faster_whisper_server/transcriber.py
CHANGED
@@ -4,11 +4,12 @@ import logging
 from typing import TYPE_CHECKING
 
 from faster_whisper_server.audio import Audio, AudioStream
-from faster_whisper_server.core import Transcription, Word, common_prefix, to_full_sentences, word_to_text
+from faster_whisper_server.text_utils import Transcription, common_prefix, to_full_sentences, word_to_text
 
 if TYPE_CHECKING:
     from collections.abc import AsyncGenerator
 
+    from faster_whisper_server.api_models import TranscriptionWord
     from faster_whisper_server.asr import FasterWhisperASR
 
 logger = logging.getLogger(__name__)
@@ -18,7 +19,7 @@ class LocalAgreement:
     def __init__(self) -> None:
         self.unconfirmed = Transcription()
 
-    def merge(self, confirmed: Transcription, incoming: Transcription) -> list[Word]:
+    def merge(self, confirmed: Transcription, incoming: Transcription) -> list[TranscriptionWord]:
         # https://github.com/ufal/whisper_streaming/blob/main/whisper_online.py#L264
         incoming = incoming.after(confirmed.end - 0.1)
         prefix = common_prefix(incoming.words, self.unconfirmed.words)
|
tests/api_timestamp_granularities_test.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
"""See `tests/openai_timestamp_granularities_test.py` to understand how OpenAI handles `response_type` and `timestamp_granularities`.""" # noqa: E501
|
2 |
|
3 |
-
from faster_whisper_server.
|
4 |
from openai import AsyncOpenAI
|
5 |
import pytest
|
6 |
|
|
|
1 |
"""See `tests/openai_timestamp_granularities_test.py` to understand how OpenAI handles `response_type` and `timestamp_granularities`.""" # noqa: E501
|
2 |
|
3 |
+
from faster_whisper_server.api_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
|
4 |
from openai import AsyncOpenAI
|
5 |
import pytest
|
6 |
|
tests/openai_timestamp_granularities_test.py
CHANGED
@@ -1,6 +1,6 @@
 """OpenAI's handling of `response_format` and `timestamp_granularities` is a bit confusing and inconsistent. This test module exists to capture the OpenAI API's behavior with respect to these parameters."""  # noqa: E501
 
-from faster_whisper_server.server_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
+from faster_whisper_server.api_models import TIMESTAMP_GRANULARITIES_COMBINATIONS, TimestampGranularities
 from openai import AsyncOpenAI, BadRequestError
 import pytest
 
tests/sse_test.py
CHANGED
@@ -2,9 +2,9 @@ import json
 import os
 
 from fastapi.testclient import TestClient
-from faster_whisper_server.server_models import (
-    TranscriptionJsonResponse,
-    TranscriptionVerboseJsonResponse,
+from faster_whisper_server.api_models import (
+    CreateTranscriptionResponseJson,
+    CreateTranscriptionResponseVerboseJson,
 )
 from httpx_sse import connect_sse
 import pytest
@@ -48,7 +48,7 @@ def test_streaming_transcription_json(client: TestClient, file_path: str, endpoint: str) -> None:
     }
     with connect_sse(client, "POST", endpoint, **kwargs) as event_source:
         for event in event_source.iter_sse():
-            TranscriptionJsonResponse(**json.loads(event.data))
+            CreateTranscriptionResponseJson(**json.loads(event.data))
 
 
 @pytest.mark.parametrize(("file_path", "endpoint"), parameters)
@@ -62,7 +62,7 @@ def test_streaming_transcription_verbose_json(client: TestClient, file_path: str, endpoint: str) -> None:
     }
     with connect_sse(client, "POST", endpoint, **kwargs) as event_source:
         for event in event_source.iter_sse():
-            TranscriptionVerboseJsonResponse(**json.loads(event.data))
+            CreateTranscriptionResponseVerboseJson(**json.loads(event.data))
 
 
 def test_transcription_vtt(client: TestClient) -> None: