# -*- coding: utf-8 -*-
"""FastAPI routes exposing PyThaiNLP tokenization (word / subword / sentence)."""
from enum import Enum
from typing import List, Optional

from fastapi import APIRouter
from pydantic import BaseModel
from pythainlp.tokenize import (
    sent_tokenize as py_sent_tokenize,
    subword_tokenize as py_subword_tokenize,
    word_tokenize as py_word_tokenize,
)

router = APIRouter()


class SentTokenizeEngine(str, Enum):
    """Engine names accepted by the /sent_tokenize endpoint."""

    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"


class WordTokenizeEngine(str, Enum):
    """Engine names accepted by the /word_tokenize endpoint."""

    newmm = "newmm"
    longest = "longest"
    tltk = "tltk"


class SubwordTokenizeEngine(str, Enum):
    """Engine names accepted by the /subword_tokenize endpoint."""

    tcc = "tcc"
    etcc = "etcc"
    ssg = "ssg"
    tltk = "tltk"


class WordTokenizeResponse(BaseModel):
    """Response body for /word_tokenize."""

    words: List[str] = []


class SubwordTokenizeResponse(BaseModel):
    """Response body for /subword_tokenize."""

    subwords: List[str] = []


# BUG FIX: this model was previously (mis)named ``SentTokenizeEngine``, which
# redefined and shadowed the engine enum above — so the ``engine`` query
# parameter of /sent_tokenize was annotated with a response model instead of
# the enum, breaking request validation. Renamed to match the sibling
# ``*Response`` models.
class SentTokenizeResponse(BaseModel):
    """Response body for /sent_tokenize."""

    sents: List[str] = []


@router.post('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    """Tokenize *text* into words with the selected PyThaiNLP engine."""
    return {"words": py_word_tokenize(text=text, engine=engine)}


@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    """Tokenize *text* into subword units with the selected PyThaiNLP engine."""
    return {"subwords": py_subword_tokenize(text=text, engine=engine)}


@router.post('/sent_tokenize', response_model=SentTokenizeResponse)
def sent_tokenize(text: str, engine: SentTokenizeEngine = SentTokenizeEngine.crfcut):
    """Split *text* into sentences with the selected PyThaiNLP engine."""
    return {"sents": py_sent_tokenize(text=text, engine=engine)}