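"""Thai tokenization API routes.

A thin FastAPI router wrapping PyThaiNLP's word and subword tokenizers.
"""
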
from enum import Enum
from typing import List

from fastapi import APIRouter
from pydantic import BaseModel
from pythainlp.tokenize import (
    subword_tokenize as py_subword_tokenize,
    word_tokenize as py_word_tokenize,
)

router = APIRouter()
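

# NOTE: SentTokenizeEngine is declared here but not wired to any route in
# this module; presumably it anticipates a sentence-tokenization endpoint.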
class SentTokenizeEngine(str, Enum):
    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"


class WordTokenizeEngine(str, Enum):
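    # newmm: dictionary-based maximum matching (PyThaiNLP's default)
    # longest: dictionary-based longest matching
    # deepcut: neural tokenizer (requires the optional deepcut package)
    # icu: ICU boundary analysis (requires PyICU)
    # ulmfit: newmm-based tokenizer with ULMFiT-style preprocessing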
    newmm = "newmm"
    longest = "longest"
    deepcut = "deepcut"
    icu = "icu"
    ulmfit = "ulmfit"


class SubwordTokenizeEngine(str, Enum):
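    # tcc: Thai Character Cluster segmentation
    # etcc: enhanced Thai Character Cluster segmentation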
    tcc = "tcc"
    etcc = "etcc"


class WordTokenizeResponse(BaseModel):
    words: List[str] = []


class SubwordTokenizeResponse(BaseModel):
    subwords: List[str] = []


@router.get("/word_tokenize", response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    return {"words": py_word_tokenize(text=text, engine=engine.value)}


@router.get("/subword_tokenize", response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    return {"subwords": py_subword_tokenize(text=text, engine=engine.value)}
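

# Minimal sketch of how an application might mount this router; the module
# path and app layout below are assumptions, not part of this file:
#
#   from fastapi import FastAPI
#
#   from app.routers.tokenize import router as tokenize_router  # hypothetical path
#
#   app = FastAPI()
#   app.include_router(tokenize_router)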