Commit
·
8fe7306
1
Parent(s):
cf27cda
Add soundex
Browse files
- app.py +19 -3
- requirements.txt +3 -1
- routers/soundex.py +23 -0
- routers/tokenize.py +17 -1
app.py
CHANGED
@@ -1,16 +1,31 @@
|
|
1 |
from fastapi import Depends, FastAPI, Header, HTTPException
|
2 |
from fastapi.middleware.cors import CORSMiddleware
|
3 |
from fastapi.responses import RedirectResponse
|
4 |
-
from routers import tokenize
|
5 |
import pythainlp
|
6 |
|
7 |
|
8 |
-
DESC_TEXT = "PyThaiNLP API
|
|
|
|
|
|
|
|
|
9 |
|
10 |
app = FastAPI(
|
11 |
title='PyThaiNLP API',
|
12 |
description=DESC_TEXT,
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
)
|
15 |
|
16 |
app.add_middleware(
|
@@ -35,3 +50,4 @@ def version():
|
|
35 |
return {"version": pythainlp.__version__}
|
36 |
|
37 |
app.include_router(tokenize.router, prefix="/tokenize", tags=["Tokenize"])
|
|
|
|
1 |
from fastapi import Depends, FastAPI, Header, HTTPException
|
2 |
from fastapi.middleware.cors import CORSMiddleware
|
3 |
from fastapi.responses import RedirectResponse
|
4 |
+
from routers import tokenize, soundex
|
5 |
import pythainlp
|
6 |
|
7 |
|
8 |
+
DESC_TEXT = """# PyThaiNLP API
|
9 |
+
|
10 |
+
PyThaiNLP API
|
11 |
+
"""
|
12 |
+
|
13 |
|
14 |
app = FastAPI(
|
15 |
title='PyThaiNLP API',
|
16 |
description=DESC_TEXT,
|
17 |
+
# summary="Deadpool's favorite app. Nuff said.",
|
18 |
+
version="0.0.1",
|
19 |
+
# terms_of_service="http://example.com/terms/",
|
20 |
+
# contact={
|
21 |
+
# "name": "Deadpoolio the Amazing",
|
22 |
+
# "url": "http://x-force.example.com/contact/",
|
23 |
+
# "email": "[email protected]",
|
24 |
+
# },
|
25 |
+
license_info={
|
26 |
+
"name": "Apache 2.0",
|
27 |
+
"identifier": "MIT",
|
28 |
+
},
|
29 |
)
|
30 |
|
31 |
app.add_middleware(
|
|
|
50 |
return {"version": pythainlp.__version__}
|
51 |
|
52 |
app.include_router(tokenize.router, prefix="/tokenize", tags=["Tokenize"])
|
53 |
+
app.include_router(soundex.router, prefix="/soundex", tags=["Soundex"])
|
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
fastapi
|
2 |
uvicorn[standard]
|
3 |
pythainlp==5.0.5
|
4 |
-
python-crfsuite
|
|
|
|
|
|
1 |
fastapi
|
2 |
uvicorn[standard]
|
3 |
pythainlp==5.0.5
|
4 |
+
python-crfsuite
|
5 |
+
ssg
|
6 |
+
tltk
|
routers/soundex.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
from fastapi import APIRouter
|
3 |
+
from pythainlp.soundex import (
|
4 |
+
soundex as py_soundex
|
5 |
+
)
|
6 |
+
from enum import Enum
|
7 |
+
|
8 |
+
router = APIRouter()
|
9 |
+
|
10 |
+
|
11 |
+
class SoundexEngine(str, Enum):
|
12 |
+
udom83 = "udom83"
|
13 |
+
lk82 = "lk82"
|
14 |
+
metasound = "metasound"
|
15 |
+
prayut_and_somchaip = "prayut_and_somchaip"
|
16 |
+
|
17 |
+
|
18 |
+
@router.post('/soundex')
|
19 |
+
def soundex(text: str, engine: SoundexEngine = "udom83"):
|
20 |
+
"""
|
21 |
+
Return the soundex code of the text using the selected engine
|
22 |
+
"""
|
23 |
+
return {"soundex": py_soundex(text=text, engine=engine)}
|
routers/tokenize.py
CHANGED
@@ -1,6 +1,10 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
from fastapi import APIRouter
|
3 |
-
from pythainlp.tokenize import
|
|
|
|
|
|
|
|
|
4 |
from enum import Enum
|
5 |
from typing import List, Optional
|
6 |
from pydantic import BaseModel
|
@@ -11,16 +15,20 @@ router = APIRouter()
|
|
11 |
class SentTokenizeEngine(str, Enum):
|
12 |
whitespace = "whitespace"
|
13 |
whitespace_newline = "whitespace+newline"
|
|
|
14 |
|
15 |
|
16 |
class WordTokenizeEngine(str, Enum):
|
17 |
newmm = "newmm"
|
18 |
longest = "longest"
|
|
|
19 |
|
20 |
|
21 |
class SubwordTokenizeEngine(str, Enum):
|
22 |
tcc = "tcc"
|
23 |
etcc = "etcc"
|
|
|
|
|
24 |
|
25 |
class WordTokenizeResponse(BaseModel):
|
26 |
words: List[str] = []
|
@@ -28,6 +36,9 @@ class WordTokenizeResponse(BaseModel):
|
|
28 |
class SubwordTokenizeResponse(BaseModel):
|
29 |
subwords: List[str] = []
|
30 |
|
|
|
|
|
|
|
31 |
@router.post('/word_tokenize', response_model=WordTokenizeResponse)
|
32 |
def word_tokenize(text: str, engine: WordTokenizeEngine = "newmm"):
|
33 |
"""
|
@@ -39,3 +50,8 @@ def word_tokenize(text: str, engine: WordTokenizeEngine = "newmm"):
|
|
39 |
@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
|
40 |
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = "tcc"):
|
41 |
return {"subwords": py_subword_tokenize(text=text, engine=engine)}
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
from fastapi import APIRouter
|
3 |
+
from pythainlp.tokenize import (
|
4 |
+
word_tokenize as py_word_tokenize,
|
5 |
+
subword_tokenize as py_subword_tokenize,
|
6 |
+
sent_tokenize as py_sent_tokenize
|
7 |
+
)
|
8 |
from enum import Enum
|
9 |
from typing import List, Optional
|
10 |
from pydantic import BaseModel
|
|
|
15 |
class SentTokenizeEngine(str, Enum):
|
16 |
whitespace = "whitespace"
|
17 |
whitespace_newline = "whitespace+newline"
|
18 |
+
crfcut = "crfcut"
|
19 |
|
20 |
|
21 |
class WordTokenizeEngine(str, Enum):
|
22 |
newmm = "newmm"
|
23 |
longest = "longest"
|
24 |
+
tltk = "tltk"
|
25 |
|
26 |
|
27 |
class SubwordTokenizeEngine(str, Enum):
|
28 |
tcc = "tcc"
|
29 |
etcc = "etcc"
|
30 |
+
ssg = "ssg"
|
31 |
+
tltk = "tltk"
|
32 |
|
33 |
class WordTokenizeResponse(BaseModel):
|
34 |
words: List[str] = []
|
|
|
36 |
class SubwordTokenizeResponse(BaseModel):
|
37 |
subwords: List[str] = []
|
38 |
|
39 |
+
class SentTokenizeEngine(BaseModel):
|
40 |
+
sents: List[str] = []
|
41 |
+
|
42 |
@router.post('/word_tokenize', response_model=WordTokenizeResponse)
|
43 |
def word_tokenize(text: str, engine: WordTokenizeEngine = "newmm"):
|
44 |
"""
|
|
|
50 |
@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
|
51 |
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = "tcc"):
|
52 |
return {"subwords": py_subword_tokenize(text=text, engine=engine)}
|
53 |
+
|
54 |
+
|
55 |
+
@router.post('/sent_tokenize', response_model=SentTokenizeEngine)
|
56 |
+
def sent_tokenize(text: str, engine: SentTokenizeEngine = "crfcut"):
|
57 |
+
return {"sents": py_sent_tokenize(text=text, engine=engine)}
|