# -*- coding: utf-8 -*-
"""FastAPI routes exposing PyThaiNLP word- and subword-tokenization."""
from enum import Enum
from typing import List, Optional

from fastapi import APIRouter
from pydantic import BaseModel
from pythainlp.tokenize import subword_tokenize as py_subword_tokenize
from pythainlp.tokenize import word_tokenize as py_word_tokenize

router = APIRouter()


class SentTokenizeEngine(str, Enum):
    """Sentence-tokenization engine names.

    NOTE(review): declared but not referenced by any route in this module —
    presumably reserved for a future /sent_tokenize endpoint; confirm before
    removing.
    """
    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"


class WordTokenizeEngine(str, Enum):
    """Word-tokenization engines accepted by the /word_tokenize route."""
    newmm = "newmm"
    longest = "longest"
    deepcut = "deepcut"
    icu = "icu"
    ulmfit = "ulmfit"


class SubwordTokenizeEngine(str, Enum):
    """Subword-tokenization engines accepted by the /subword_tokenize route."""
    tcc = "tcc"
    etcc = "etcc"


class WordTokenizeResponse(BaseModel):
    """Response body for /word_tokenize."""
    # Tokens produced by the selected engine, in input order.
    words: List[str] = []


class SubwordTokenizeResponse(BaseModel):
    """Response body for /subword_tokenize."""
    # Subword units produced by the selected engine, in input order.
    subwords: List[str] = []


@router.get('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    """Tokenize *text* into words using the requested PyThaiNLP engine.

    Query params:
        text: the input string to tokenize.
        engine: one of the WordTokenizeEngine values (default: "newmm").

    Returns a dict matching WordTokenizeResponse.
    """
    # Fix: the default was the bare string "newmm", disagreeing with the
    # declared enum annotation; the enum member carries the same wire value,
    # so validation and the OpenAPI schema are unchanged.
    # Pass .value so pythainlp receives a plain string, not an Enum instance.
    return {"words": py_word_tokenize(text=text, engine=engine.value)}


@router.get('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    """Tokenize *text* into subword units using the requested engine.

    Query params:
        text: the input string to tokenize.
        engine: one of the SubwordTokenizeEngine values (default: "tcc").

    Returns a dict matching SubwordTokenizeResponse.
    """
    # Same fixes as word_tokenize: enum-member default, plain-string engine.
    return {"subwords": py_subword_tokenize(text=text, engine=engine.value)}