|
|
|
from fastapi import APIRouter |
|
from pythainlp.tokenize import ( |
|
word_tokenize as py_word_tokenize, |
|
subword_tokenize as py_subword_tokenize, |
|
sent_tokenize as py_sent_tokenize |
|
) |
|
from enum import Enum |
|
from typing import List, Optional |
|
from pydantic import BaseModel |
|
|
|
# Shared router collecting the PyThaiNLP tokenization endpoints; mounted by the app elsewhere.
router = APIRouter()
|
|
|
|
|
class SentTokenizeEngine(str, Enum):
    """Engine names accepted by the /sent_tokenize endpoint.

    str-valued so the member serializes as its plain engine-name string.
    NOTE(review): this name is re-bound to a BaseModel later in this file,
    which shadows the enum at import time — see the response-model class of
    the same name below.
    """

    whitespace = "whitespace"

    whitespace_newline = "whitespace+newline"

    crfcut = "crfcut"
|
|
|
|
|
class WordTokenizeEngine(str, Enum):
    """Engine names accepted by the /word_tokenize endpoint.

    str-valued so FastAPI validates the query value and the member can be
    passed straight through to pythainlp as the engine-name string.
    """

    newmm = "newmm"

    longest = "longest"

    tltk = "tltk"
|
|
|
|
|
class SubwordTokenizeEngine(str, Enum):
    """Engine names accepted by the /subword_tokenize endpoint.

    str-valued so FastAPI validates the query value and the member can be
    passed straight through to pythainlp as the engine-name string.
    """

    tcc = "tcc"

    etcc = "etcc"

    ssg = "ssg"

    tltk = "tltk"
|
|
|
class WordTokenizeResponse(BaseModel):
    """Response schema for /word_tokenize: the list of tokenized words."""

    # Mutable [] default is safe here: pydantic deep-copies field defaults
    # per instance (unlike plain function defaults).
    words: List[str] = []
|
|
|
class SubwordTokenizeResponse(BaseModel):
    """Response schema for /subword_tokenize: the list of subword units."""

    # Mutable [] default is safe here: pydantic deep-copies field defaults
    # per instance (unlike plain function defaults).
    subwords: List[str] = []
|
|
|
# NOTE(review): misnamed — this response model re-binds (shadows) the
# SentTokenizeEngine enum defined earlier in this file, so after import the
# enum is unreachable and any annotation using the name resolves to this
# BaseModel instead. It should be renamed SentTokenizeResponse, matching
# WordTokenizeResponse/SubwordTokenizeResponse, and the /sent_tokenize
# endpoint updated accordingly — confirm no external code imports this
# name before renaming.
class SentTokenizeEngine(BaseModel):
    """Response schema for /sent_tokenize: the list of sentences."""

    sents: List[str] = []
|
|
|
@router.post('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = WordTokenizeEngine.newmm):
    """
    Word tokenize

    Split *text* into words with the requested PyThaiNLP engine.

    :param text: input text to tokenize (query parameter).
    :param engine: word-tokenization backend ("newmm", "longest" or "tltk").
        Defaulted with the enum member rather than the bare string "newmm"
        so the declared default matches the parameter's own type.
    :return: dict serialized by FastAPI as WordTokenizeResponse,
        i.e. {"words": [...]}.
    """
    # WordTokenizeEngine is a str-subclassing Enum, so the member itself is a
    # valid engine-name string for pythainlp's word_tokenize.
    return {"words": py_word_tokenize(text=text, engine=engine)}
|
|
|
|
|
@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = SubwordTokenizeEngine.tcc):
    """
    Subword tokenize

    Split *text* into subword units with the requested PyThaiNLP engine.

    :param text: input text to tokenize (query parameter).
    :param engine: subword backend ("tcc", "etcc", "ssg" or "tltk").
        Defaulted with the enum member rather than the bare string "tcc"
        so the declared default matches the parameter's own type.
    :return: dict serialized by FastAPI as SubwordTokenizeResponse,
        i.e. {"subwords": [...]}.
    """
    # SubwordTokenizeEngine is a str-subclassing Enum, so the member itself is
    # a valid engine-name string for pythainlp's subword_tokenize.
    return {"subwords": py_subword_tokenize(text=text, engine=engine)}
|
|
|
|
|
@router.post('/sent_tokenize', response_model=SentTokenizeEngine)
def sent_tokenize(text: str, engine: str = "crfcut"):
    """
    Sentence tokenize

    Split *text* into sentences with the requested PyThaiNLP engine.

    :param text: input text to tokenize (query parameter).
    :param engine: sentence-tokenization backend; one of "whitespace",
        "whitespace+newline" or "crfcut".
    :return: dict serialized by FastAPI as the sents response model,
        i.e. {"sents": [...]}.

    NOTE(review): in this module the name SentTokenizeEngine is bound to the
    *response* BaseModel (it shadows the enum of the same name), so the
    original annotation `engine: SentTokenizeEngine` made FastAPI expect a
    JSON request body of shape {"sents": [...]} and then passed that model
    object to PyThaiNLP — the endpoint could not work. `engine` is therefore
    typed as a plain str here; once the response model is renamed
    SentTokenizeResponse, re-annotate `engine` with the enum to restore
    value validation.
    """
    return {"sents": py_sent_tokenize(text=text, engine=engine)}