wannaphong committed on
Commit
dbaf7c6
·
1 Parent(s): 4274b71
Files changed (4) hide show
  1. Dockerfile +16 -0
  2. app.py +28 -0
  3. requirements.txt +4 -0
  4. routers/tokenize.py +41 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.9
5
+
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+ ENV PATH="/home/user/.local/bin:$PATH"
9
+
10
+ WORKDIR /app
11
+
12
+ COPY --chown=user ./requirements.txt requirements.txt
13
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
+
15
+ COPY --chown=user . /app
16
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import Depends, FastAPI, Header, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from routers import tokenize
4
+ import pythainlp
5
+
6
+
7
+ DESC_TEXT = "PyThaiNLP API"
8
+
9
+ app = FastAPI(
10
+ title='PyThaiNLP API',
11
+ description=DESC_TEXT,
12
+ version='0.1',
13
+ )
14
+
15
+ app.add_middleware(
16
+ CORSMiddleware,
17
+ allow_origins=["*"],
18
+ allow_credentials=True,
19
+ allow_methods=["*"],
20
+ allow_headers=["*"],
21
+ )
22
+
23
+
24
+ @app.get("/")
25
+ def index():
26
+ return {"Pythainlp Version": pythainlp.__version__}
27
+
28
+ app.include_router(tokenize.router, prefix="/tokenize", tags=["Tokenize"])
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ pythainlp==5.0.5
4
+ python-crfsuite
routers/tokenize.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ from fastapi import APIRouter
3
+ from pythainlp.tokenize import word_tokenize as py_word_tokenize, subword_tokenize as py_subword_tokenize
4
+ from enum import Enum
5
+ from typing import List, Optional
6
+ from pydantic import BaseModel
7
+
8
+ router = APIRouter()
9
+
10
+
11
+ class SentTokenizeEngine(str, Enum):
12
+ whitespace = "whitespace"
13
+ whitespace_newline = "whitespace+newline"
14
+
15
+
16
+ class WordTokenizeEngine(str, Enum):
17
+ newmm = "newmm"
18
+ longest = "longest"
19
+ deepcut = "deepcut"
20
+ icu = "icu"
21
+ ulmfit = "ulmfit"
22
+
23
+
24
+ class SubwordTokenizeEngine(str, Enum):
25
+ tcc = "tcc"
26
+ etcc = "etcc"
27
+
28
+ class WordTokenizeResponse(BaseModel):
29
+ words: List[str] = []
30
+
31
+ class SubwordTokenizeResponse(BaseModel):
32
+ subwords: List[str] = []
33
+
34
+ @router.get('/word_tokenize', response_model=WordTokenizeResponse)
35
+ def word_tokenize(text: str, engine: WordTokenizeEngine = "newmm"):
36
+ return {"words": py_word_tokenize(text=text, engine=engine)}
37
+
38
+
39
+ @router.get('/subword_tokenize', response_model=SubwordTokenizeResponse)
40
+ def subword_tokenize(text: str, engine: SubwordTokenizeEngine = "tcc"):
41
+ return {"subwords": py_subword_tokenize(text=text, engine=engine)}