wannaphong committed on
Commit
8fe7306
·
1 Parent(s): cf27cda

Add soundex

Browse files
Files changed (4) hide show
  1. app.py +19 -3
  2. requirements.txt +3 -1
  3. routers/soundex.py +23 -0
  4. routers/tokenize.py +17 -1
app.py CHANGED
@@ -1,16 +1,31 @@
1
  from fastapi import Depends, FastAPI, Header, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from fastapi.responses import RedirectResponse
4
- from routers import tokenize
5
  import pythainlp
6
 
7
 
8
- DESC_TEXT = "PyThaiNLP API"
 
 
 
 
9
 
10
  app = FastAPI(
11
  title='PyThaiNLP API',
12
  description=DESC_TEXT,
13
- version='0.1',
 
 
 
 
 
 
 
 
 
 
 
14
  )
15
 
16
  app.add_middleware(
@@ -35,3 +50,4 @@ def version():
35
  return {"version": pythainlp.__version__}
36
 
37
  app.include_router(tokenize.router, prefix="/tokenize", tags=["Tokenize"])
 
 
1
  from fastapi import Depends, FastAPI, Header, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from fastapi.responses import RedirectResponse
4
+ from routers import tokenize, soundex
5
  import pythainlp
6
 
7
 
8
+ DESC_TEXT = """# PyThaiNLP API
9
+
10
+ PyThaiNLP API
11
+ """
12
+
13
 
# Application object. Title, description, version and licence are passed to
# FastAPI as API metadata.
app = FastAPI(
    title='PyThaiNLP API',
    description=DESC_TEXT,
    version="0.0.1",
    license_info={
        "name": "Apache 2.0",
        # The SPDX identifier has to describe the same licence as "name";
        # the earlier value "MIT" contradicted it.
        "identifier": "Apache-2.0",
    },
)
30
 
31
  app.add_middleware(
 
50
  return {"version": pythainlp.__version__}
51
 
52
  app.include_router(tokenize.router, prefix="/tokenize", tags=["Tokenize"])
53
+ app.include_router(soundex.router, prefix="/soundex", tags=["Soundex"])
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
  fastapi
2
  uvicorn[standard]
3
  pythainlp==5.0.5
4
- python-crfsuite
 
 
 
1
  fastapi
2
  uvicorn[standard]
3
  pythainlp==5.0.5
4
+ python-crfsuite
5
+ ssg
6
+ tltk
routers/soundex.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from fastapi import APIRouter
3
+ from pythainlp.soundex import (
4
+ soundex as py_soundex
5
+ )
6
+ from enum import Enum
7
+
8
+ router = APIRouter()
9
+
10
+
class SoundexEngine(str, Enum):
    """Names of the soundex engines that can be selected through the API.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    udom83 = "udom83"
    lk82 = "lk82"
    metasound = "metasound"
    prayut_and_somchaip = "prayut_and_somchaip"
16
+
17
+
@router.post('/soundex')
def soundex(text: str, engine: SoundexEngine = SoundexEngine.udom83):
    """
    Compute the soundex code of ``text``.

    Args:
        text: Input text handed to ``pythainlp.soundex.soundex``.
        engine: Soundex engine to use. Accepts a ``SoundexEngine`` member
            or its plain string value; defaults to ``udom83``.

    Returns:
        A dict of the form ``{"soundex": <result of pythainlp soundex>}``.

    Raises:
        ValueError: If ``engine`` is not a valid ``SoundexEngine`` value.
    """
    # Normalise to the plain engine name so the library always receives a
    # built-in str, whether the caller passed an enum member or a string.
    engine_name = SoundexEngine(engine).value
    return {"soundex": py_soundex(text=text, engine=engine_name)}
routers/tokenize.py CHANGED
@@ -1,6 +1,10 @@
1
  # -*- coding: utf-8 -*-
2
  from fastapi import APIRouter
3
- from pythainlp.tokenize import word_tokenize as py_word_tokenize, subword_tokenize as py_subword_tokenize
 
 
 
 
4
  from enum import Enum
5
  from typing import List, Optional
6
  from pydantic import BaseModel
@@ -11,16 +15,20 @@ router = APIRouter()
11
  class SentTokenizeEngine(str, Enum):
12
  whitespace = "whitespace"
13
  whitespace_newline = "whitespace+newline"
 
14
 
15
 
16
  class WordTokenizeEngine(str, Enum):
17
  newmm = "newmm"
18
  longest = "longest"
 
19
 
20
 
21
  class SubwordTokenizeEngine(str, Enum):
22
  tcc = "tcc"
23
  etcc = "etcc"
 
 
24
 
25
  class WordTokenizeResponse(BaseModel):
26
  words: List[str] = []
@@ -28,6 +36,9 @@ class WordTokenizeResponse(BaseModel):
28
  class SubwordTokenizeResponse(BaseModel):
29
  subwords: List[str] = []
30
 
 
 
 
31
  @router.post('/word_tokenize', response_model=WordTokenizeResponse)
32
  def word_tokenize(text: str, engine: WordTokenizeEngine = "newmm"):
33
  """
@@ -39,3 +50,8 @@ def word_tokenize(text: str, engine: WordTokenizeEngine = "newmm"):
39
  @router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
40
  def subword_tokenize(text: str, engine: SubwordTokenizeEngine = "tcc"):
41
  return {"subwords": py_subword_tokenize(text=text, engine=engine)}
 
 
 
 
 
 
1
  # -*- coding: utf-8 -*-
2
  from fastapi import APIRouter
3
+ from pythainlp.tokenize import (
4
+ word_tokenize as py_word_tokenize,
5
+ subword_tokenize as py_subword_tokenize,
6
+ sent_tokenize as py_sent_tokenize
7
+ )
8
  from enum import Enum
9
  from typing import List, Optional
10
  from pydantic import BaseModel
 
class SentTokenizeEngine(str, Enum):
    """Names of the sentence tokenizer engines.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"
19
 
20
 
class WordTokenizeEngine(str, Enum):
    """Names of the word tokenizer engines.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    newmm = "newmm"
    longest = "longest"
    tltk = "tltk"
25
 
26
 
class SubwordTokenizeEngine(str, Enum):
    """Names of the subword tokenizer engines.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    tcc = "tcc"
    etcc = "etcc"
    ssg = "ssg"
    tltk = "tltk"
32
 
class WordTokenizeResponse(BaseModel):
    """Response model carrying a list of word tokens."""

    # Token strings; an empty list when nothing is supplied.
    words: List[str] = []
 
class SubwordTokenizeResponse(BaseModel):
    """Response model carrying a list of subword tokens."""

    # Subword strings; an empty list when nothing is supplied.
    subwords: List[str] = []
38
 
# NOTE(review): this class reuses the name of the ``SentTokenizeEngine`` enum
# declared earlier in this module and rebinds it, so code below this point
# that says ``SentTokenizeEngine`` gets this model, not the enum. The name is
# left unchanged here because other code in the module refers to it.
class SentTokenizeEngine(BaseModel):
    """Pydantic model carrying a list of sentences."""

    # Sentence strings; an empty list when nothing is supplied.
    sents: List[str] = []
41
+
42
  @router.post('/word_tokenize', response_model=WordTokenizeResponse)
43
  def word_tokenize(text: str, engine: WordTokenizeEngine = "newmm"):
44
  """
 
50
  @router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
51
  def subword_tokenize(text: str, engine: SubwordTokenizeEngine = "tcc"):
52
  return {"subwords": py_subword_tokenize(text=text, engine=engine)}
53
+
54
+
class SentTokenizeEngineChoice(str, Enum):
    """Sentence tokenizer engines selectable on ``/sent_tokenize``.

    Declared under its own name because, at this point in the module,
    ``SentTokenizeEngine`` is bound to a pydantic model rather than to the
    engine enum, so it cannot be used as the type of ``engine``.
    """

    whitespace = "whitespace"
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"


class SentTokenizeResponse(BaseModel):
    """Response body of ``/sent_tokenize``."""

    # Sentence strings; an empty list when nothing is supplied.
    sents: List[str] = []


@router.post('/sent_tokenize', response_model=SentTokenizeResponse)
def sent_tokenize(
    text: str,
    engine: SentTokenizeEngineChoice = SentTokenizeEngineChoice.crfcut,
):
    """
    Split ``text`` into sentences.

    Args:
        text: Input text handed to ``pythainlp.tokenize.sent_tokenize``.
        engine: Sentence tokenizer engine. Accepts a
            ``SentTokenizeEngineChoice`` member or its plain string value;
            defaults to ``crfcut``.

    Returns:
        A dict of the form ``{"sents": <result of pythainlp sent_tokenize>}``.

    Raises:
        ValueError: If ``engine`` is not a valid engine value.
    """
    # Normalise to the plain engine name so the library always receives a
    # built-in str, whether the caller passed an enum member or a string.
    engine_name = SentTokenizeEngineChoice(engine).value
    return {"sents": py_sent_tokenize(text=text, engine=engine_name)}