pmkhanh7890 committed
Commit 19cf5e6 · 1 Parent(s): 936d627

Freemind demo

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitignore +251 -0
  2. models/freemind/embeddings/freemind.csv +0 -0
  3. requirements.txt +33 -0
  4. src/AI/__init__.py +0 -0
  5. src/AI/ai_configs.py +84 -0
  6. src/AI/chatbot_demo.py +112 -0
  7. src/AI/embedding.py +323 -0
  8. src/AI/evaluation.py +101 -0
  9. src/AI/klever_search.py +73 -0
  10. src/AI/parsing.py +7 -0
  11. src/AI/search.py +219 -0
  12. src/AI/training.py +95 -0
  13. src/__init__.py +0 -0
  14. src/backend/TTChatBot/.sample-env +24 -0
  15. src/backend/TTChatBot/chatbot/__init__.py +0 -0
  16. src/backend/TTChatBot/chatbot/admin.py +3 -0
  17. src/backend/TTChatBot/chatbot/apps.py +242 -0
  18. src/backend/TTChatBot/chatbot/exceptions.py +10 -0
  19. src/backend/TTChatBot/chatbot/migrations/__init__.py +0 -0
  20. src/backend/TTChatBot/chatbot/serializers.py +25 -0
  21. src/backend/TTChatBot/chatbot/tasks.py +79 -0
  22. src/backend/TTChatBot/chatbot/urls.py +31 -0
  23. src/backend/TTChatBot/chatbot/utils.py +108 -0
  24. src/backend/TTChatBot/chatbot/views.py +199 -0
  25. src/backend/TTChatBot/config/__init__.py +3 -0
  26. src/backend/TTChatBot/config/asgi.py +16 -0
  27. src/backend/TTChatBot/config/celery.py +24 -0
  28. src/backend/TTChatBot/config/settings/__init__.py +19 -0
  29. src/backend/TTChatBot/config/settings/common.py +132 -0
  30. src/backend/TTChatBot/config/settings/local.py +98 -0
  31. src/backend/TTChatBot/config/settings/prod.py +98 -0
  32. src/backend/TTChatBot/config/settings/staging.py +0 -0
  33. src/backend/TTChatBot/config/urls.py +62 -0
  34. src/backend/TTChatBot/config/wsgi.py +16 -0
  35. src/backend/TTChatBot/manage.py +22 -0
  36. src/backend/TTChatBot/storage/.gitkeep +0 -0
  37. src/frontend/.gitkeep +0 -0
  38. src/frontend/.prettierignore +1 -0
  39. src/frontend/.prettierrc +4 -0
  40. src/frontend/.sample-env +1 -0
  41. src/frontend/Dockerfile +22 -0
  42. src/frontend/environments/dev/build.args +1 -0
  43. src/frontend/environments/prod/build.args +1 -0
  44. src/frontend/next-env.d.ts +5 -0
  45. src/frontend/next.config.js +11 -0
  46. src/frontend/package-lock.json +0 -0
  47. src/frontend/package.json +30 -0
  48. src/frontend/postcss.config.js +6 -0
  49. src/frontend/public/favicon.webp +0 -0
  50. src/frontend/public/locales/en.ts +15 -0
.gitignore ADDED
@@ -0,0 +1,251 @@
+# For experiment of counting tokens
+src/AI/tokens.py
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# ignore training data
+src/data/tt-content-postprocess
+src/data/tt-content-debug
+src/data/tt-content
+src/scraper/tt-content-postprocess
+src/scraper/tt-content-debug
+src/scraper/tt-content
+src/scraper/tt-klever-content
+
+# staticfile
+src/backend/TTChatBot/staticfiles/
+src/backend/TTChatBot/static/
+
+# MacOS
+.DS_Store
+
+# Frontend
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+# dependencies
+src/frontend/node_modules/
+/.pnp
+.pnp.js
+
+# next.js
+src/frontend/.next/
+/out/
+
+# misc
+*.pem
+
+# debug
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+.pnpm-debug.log*
+
+# vercel
+.vercel
+
+# Pycharm
+.idea
+
+# Env vars - to be updated
+/infra/ci/secret.yaml
+
+# Local .terraform directories
+**/.terraform/*
+
+# .tfstate files
+*.tfstate
+*.tfstate.*
+
+# Exclude all .tfvars files, which are likely to contain sensitive data, such as
+# password, private keys, and other secrets. These should not be part of version
+# control as they are data points which are potentially sensitive and subject
+# to change depending on the environment.
+*.tfvars
+*.tfvars.json
+
+# Ignore override files as they are usually used to override resources locally and so
+# are not checked in
+override.tf
+override.tf.json
+*_override.tf
+*_override.tf.json
+
+# Include override files you do wish to add to version control using negated pattern
+# !example_override.tf
+
+# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
+# example: *tfplan*
+
+# Ignore CLI configuration files
+.terraformrc
+terraform.rc
+.terraform.lock.hcl
+# ignore .vscode
+.vscode
+
+# Ignore sensitive data - k8s env vars
+infra/environments/chatbot-dev/dev_secret.yaml
+infra/environments/chatbot-prod/prod_secret.yaml
+infra/environments/tt-chatbot-prod/prod_secret.yaml
+
+# yarn file
+yarn.lock
models/freemind/embeddings/freemind.csv ADDED
The diff for this file is too large to render.
requirements.txt ADDED
@@ -0,0 +1,33 @@
+# AI packages
+gradio==3.40.1 # for demo with interface
+llama-index==0.7.4 # enables the LLM to access the outside information that serves as our knowledge base
+openai==0.27.8 # for generating embeddings
+pandas==2.0.3 # for DataFrames to store article sections and embeddings
+scipy==1.11.1 # for calculating vector similarities for search
+tiktoken==0.4.0 # for counting tokens
+typing_extensions==4.5.0 # pinned to avoid langchain's "TypeError: issubclass() arg 1 must be a class"
+
+# BE/FE packages
+requests==2.31.0
+tqdm==4.65.0
+django==4.2.4
+python-dotenv==1.0.0
+beautifulsoup4==4.12.2
+# django rest API
+djangorestframework==3.14.0
+drf-yasg==1.21.7
+celery==5.3.1
+django-celery-results==2.5.1
+psycopg2-binary==2.9.6
+# message broker for celery
+redis==4.6.0
+# gevent worker pool for celery
+gevent==23.7.0
+# production
+gunicorn==21.2.0
+# cors
+django-cors-headers==4.2.0
+# convert html to text
+html2text==2020.1.16
+# serving staticfiles without using nginx
+whitenoise==6.5.0
src/AI/__init__.py ADDED
File without changes
src/AI/ai_configs.py ADDED
@@ -0,0 +1,84 @@
+"""
+Author: Khanh Phan
+Date: 2023-04-20
+"""
+import os
+import sys
+
+# MODEL PARAMETERS: https://platform.openai.com/docs/models/gpt-3-5
+MODEL_NAME = "gpt-3.5-turbo"  # Must select from MODEL_NAMES
+MODEL_NAMES = ["gpt-4", "text-davinci-003", "gpt-3.5-turbo"]
+EMBEDDING_MODEL = (
+    "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
+)
+
+# CHATBOT SERVICE
+SERVICE = "freemind"  # Must select from SERVICES
+SERVICES = ["TokyoTechies", "Klever", "Test", "freemind"]
+
+# DATA FORMATTING
+DELIMITER_TOKYOTECHIES = "Sub Section:"
+FILE_TYPE = ".txt"
+FILE_ENCODING = "utf-8"
+INTRODUCTION_MESSAGE = (
+    f"You are a chatbot of {SERVICE}. "
+    f"Use the below articles on the {SERVICE} to answer the subsequent question. "  # noqa: E501
+    "If an answer cannot be found in the articles, write sorry that I cannot answer your request, please contact our support team for further assistance."  # noqa: E501
+    r'If an answer is found, add embedding title in this format "[Title](URL)" to the end of an answer and ignore the same title.'  # noqa: E501
+)
+SYSTEM_CONTENT = f"You answer questions about {SERVICE}"
+
+# CALCULATE EMBEDDING PARAMETERS
+MAX_TOKENS = 1600  # maximum tokens for a section
+BATCH_SIZE = 1000  # up to 2048 embedding inputs per request
+TOKEN_BUDGET = 4096 - 500
+
+# TRAINING PARAMETERS
+CONTEXT_WINDOW = 4096  # Context window for the LLM.
+NUM_OUTPUTS = 512  # Number of outputs for the LLM.
+CHUNK_OVERLAP_RATIO = 0.1  # Chunk overlap as a ratio of chunk size
+TEMPERATURE = 0.0  # A parameter that controls the "creativity" or
+# randomness of the generated text. A higher temperature (e.g., 0.7)
+# results in more diverse and creative output, while a lower temperature
+# (e.g., 0.2) makes the output more deterministic and focused.
+
+sys.path.append(os.path.abspath(os.path.join("..", "data")))
+
+# PATHS
+if SERVICE in SERVICES:
+    if MODEL_NAME in MODEL_NAMES:
+        # Path to training files:
+        FOLDERPATH_DOCUMENTS = os.path.join(
+            "data",
+            SERVICE,
+            "training_files",
+        )
+        # Path to model
+        FOLDERPATH_INDEXES = os.path.join(
+            "models",
+            SERVICE,
+            MODEL_NAME,
+        )
+        FILEPATH_EMBEDDINGS = os.path.join(
+            "models",
+            SERVICE,
+            "embeddings",
+            f"{SERVICE}.csv",
+        )
+        # For evaluation
+        FOLDERPATH_QUESTION = os.path.join(
+            "data",
+            SERVICE,
+            "evaluation",
+            "questions",
+        )
+        FOLDERPATH_QA = os.path.join(
+            "data",
+            SERVICE,
+            "evaluation",
+            "QA_" + MODEL_NAME,
+        )
+    else:
+        raise ValueError("MODEL_NAME must be in MODEL_NAMES")
+else:
+    raise ValueError("SERVICE must be in SERVICES")
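For the committed configuration (SERVICE = "freemind", MODEL_NAME = "gpt-3.5-turbo"), the block above derives the following paths. A minimal sketch, assuming the module is imported with the repository root as the working directory on a POSIX system:

import ai_configs

print(ai_configs.FOLDERPATH_DOCUMENTS)  # data/freemind/training_files
print(ai_configs.FOLDERPATH_INDEXES)    # models/freemind/gpt-3.5-turbo
print(ai_configs.FILEPATH_EMBEDDINGS)   # models/freemind/embeddings/freemind.csv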
src/AI/chatbot_demo.py ADDED
@@ -0,0 +1,112 @@
+"""
+Author: Khanh Phan
+Date: 2023-04-20
+"""
+import configparser
+import os
+
+import gradio as gr
+import openai
+from ai_configs import (
+    FOLDERPATH_INDEXES,
+    MODEL_NAME,
+)
+from llama_index import (
+    StorageContext,
+    load_index_from_storage,
+)
+
+env = configparser.ConfigParser()
+env.read(".env")
+os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
+
+def format_response(responses: dict):
+    """
+    (Optional) Format one or multiple responses from version(s) of the chatbot.
+
+    Parameters:
+        responses (dict): chatbot responses keyed by model name
+
+    Returns:
+        output (str): formatted response
+    """
+    output = ""
+    for response in responses:
+        output += response + (responses[response]) + "\n\n"
+    return output
+
+
+def chat(message, history):
+    """
+    Load the indexes into the chatbot and get a response.
+
+    Parameters:
+        message (str): question to the chatbot
+        history (list): history of the whole conversation
+
+    Returns:
+        history (list): history of the whole conversation (for displaying)
+        history (list): state of the chatbot
+    """
+    history = history or []
+    # rebuild storage context
+    FOLDERPATH_INDEXES_EN = FOLDERPATH_INDEXES + "_en"
+    storage_context = StorageContext.from_defaults(
+        persist_dir=FOLDERPATH_INDEXES_EN,
+    )
+
+    # load index into memory
+    index = load_index_from_storage(storage_context)
+
+    # open QA engine
+    query_engine = index.as_query_engine()
+
+    # Get the response from OpenAI
+    response_en = query_engine.query(message)
+    print("Q: ", message)
+    print("A: ", response_en.response, "\n")
+
+    # ---------- JAPANESE
+    # rebuild storage context
+    FOLDERPATH_INDEXES_JA = FOLDERPATH_INDEXES + "_ja"
+    storage_context_ja = StorageContext.from_defaults(
+        persist_dir=FOLDERPATH_INDEXES_JA,
+    )
+
+    # load index into memory
+    index_ja = load_index_from_storage(storage_context_ja)
+
+    # open QA engine
+    query_engine_ja = index_ja.as_query_engine()
+
+    # Get the response from OpenAI
+    response_ja = query_engine_ja.query(message)
+    print("Q: ", message)
+    print("A: ", response_ja.response, "\n")
+    ######
+
+    # Format the response
+    responses = {
+        f"---{MODEL_NAME} (English)---": response_en.response,
+        f"---{MODEL_NAME} (Japanese)---": response_ja.response,
+    }
+
+    response = format_response(responses)
+
+    # Append the response to history (to show in the UI)
+    history.append((message, response))
+
+    return history, history
+
+
+# Launch the chat with gradio, which provides a shareable chatbot UI
+chatgpt = gr.Interface(
+    chat,
+    ["text", "state"],
+    ["chatbot", "state"],
+    allow_flagging="never",
+)
+
+chatgpt.launch(share=True)  # share=True to share the chat publicly
src/AI/embedding.py ADDED
@@ -0,0 +1,323 @@
+"""
+Author: Khanh Phan
+Date: 2023-07-20
+"""
+
+import configparser
+import os
+
+import openai
+import pandas as pd
+import tiktoken
+from ai_configs import (
+    BATCH_SIZE,
+    DELIMITER_TOKYOTECHIES,
+    EMBEDDING_MODEL,
+    FILE_ENCODING,
+    FILE_TYPE,
+    FILEPATH_EMBEDDINGS,
+    FOLDERPATH_DOCUMENTS,
+    MAX_TOKENS,
+    MODEL_NAME,
+    SERVICE,
+)
+
+
+def list_files(directory: str) -> list:
+    files = []
+    for file in os.listdir(directory):
+        # keep only text files
+        if file.endswith(FILE_TYPE):
+            files.append(file)
+    return files
+
+
+def read_file(file_path: str) -> str:
+    # Open the file
+    file = open(file_path, encoding=FILE_ENCODING)
+
+    # read all lines at once
+    file_content = file.read()
+
+    # close the file
+    file.close()
+    return file_content
+
+
+def num_tokens(text: str, model: str = MODEL_NAME) -> int:
+    """Return the number of tokens in a string."""
+    encoding = tiktoken.encoding_for_model(model)
+    return len(encoding.encode(text))
+
+
+def truncated_string(
+    string: str,
+    model: str,
+    max_tokens: int,
+    print_warning: bool = True,
+) -> str:
+    """Truncate a string to a maximum number of tokens."""
+    encoding = tiktoken.encoding_for_model(model)
+    encoded_string = encoding.encode(string)
+    truncated_string = encoding.decode(encoded_string[:max_tokens])
+    if print_warning and len(encoded_string) > max_tokens:
+        print(
+            f"Warning: Truncated string from {len(encoded_string)} tokens to {max_tokens} tokens.",  # noqa: E501
+        )
+    return truncated_string
+
+
+def determine_delimiter(
+    strings: str,
+    service: str = SERVICE,
+) -> str:
+    """
+    Determine the delimiter of the file.
+    """
+    if service == "TokyoTechies":
+        return DELIMITER_TOKYOTECHIES
+    elif service == "Klever":
+        if "# " in strings:
+            return "# "
+        elif "## " in strings:
+            return "## "
+        elif "### " in strings:
+            return "### "
+        else:
+            # No heading delimiter found; the caller falls back to
+            # treating the whole file as a single chunk.
+            return False
+    elif service == "freemind":
+        return "-----"
+    else:
+        raise ValueError(f"Unknown service: {service}")
+
+
+def format_content_Tokyo_Techies(
+    strings: list,
+    content: str,
+    max_tokens: int = 1000,
+    model: str = MODEL_NAME,
+):
+    """
+    Format content for Tokyo Techies.
+    """
+    chunks = content.split(determine_delimiter(content))
+    # TODO: add to config
+    if "URL:" in chunks[0] and "Language:" in chunks[0]:
+        url = (
+            "<url>"
+            + (content.split("URL:"))[1].split("Language")[0].strip()
+            + "</url>"
+        )  # get url
+    else:
+        url = "<url>No URL</url>"
+
+    for chunk in chunks[1:]:
+        chunk = (
+            chunk.strip()
+        )  # remove leading and trailing whitespace and newlines
+        if not chunk:
+            continue
+
+        # get section title (first row) and content (from 2nd row)
+        section_title = chunk.split("\n")[0]
+        titles = [url, section_title]
+        section_content = chunk.split("\n")[1:]
+        section_content = "\n".join(section_content)
+
+        if num_tokens(section_content) > max_tokens:
+            print(
+                f"{titles} ({num_tokens(section_content)}) has more than {max_tokens} tokens",  # noqa: E501
+            )
+            section_content = truncated_string(
+                section_content,
+                model=model,
+                max_tokens=max_tokens,
+            )
+
+        string = "\n\n".join(titles + [section_content])
+        strings.extend([string])
+        print(string)
+    return strings
+
+
+def format_content_klever(
+    strings: list,
+    content: str,
+    max_tokens: int = 1000,
+    model: str = MODEL_NAME,
+):
+    """
+    Format content for Klever.
+    """
+
+    # Add an image tag to image links
+    content = content.replace("![](", "![image](")
+
+    delimiter = determine_delimiter(content)
+    if delimiter:
+        chunks = content.split(delimiter)
+    else:
+        chunks = [content]
+
+    # TODO: add to config
+    url = ""
+    if "Title:" in chunks[0] and "URL:" in chunks[0]:
+        title = "Title: " + (
+            (content.split("Title:"))[1].split("URL")[0].strip()
+        )
+        if "Language:" in chunks[0]:
+            url = (
+                "<url>"
+                + (content.split("URL:"))[1].split("Language:")[0].strip()
+                + "</url>"
+            )
+    else:
+        title = ""
+
+    # Extract content between the title and the first sub-section
+    section_content = (chunks[0].split("-----"))[1].strip()
+    if section_content != "":
+        titles = [title, url]
+        string = "\n\n".join(titles + [section_content])
+        # print(f"----------\n{string}\n")
+        strings.extend([string])
+
+    # Extract content in every sub-section
+    for chunk in chunks[1:]:
+        chunk = (
+            chunk.strip()
+        )  # remove leading and trailing whitespace and newlines
+        if not chunk:
+            continue
+
+        # get section title (first row) and content (from 2nd row)
+        section_title = chunk.split("\n")[0]
+        titles = [title + " > " + section_title, url]
+        section_content = chunk.split("\n")[1:]
+        section_content = "\n".join(section_content)
+
+        if num_tokens(section_content) > max_tokens:
+            print(
+                f"{titles} ({num_tokens(section_content)}) has more than {max_tokens} tokens",  # noqa: E501
+            )
+            section_content = truncated_string(
+                section_content,
+                model=model,
+                max_tokens=max_tokens,
+            )
+
+        string = "\n\n".join(titles + [section_content])
+        # print(f"----------\n{string}\n")
+        strings.extend([string])
+    return strings
+
+
+def format_content_freemind(
+    strings: list,
+    content: str,
+    max_tokens: int = 1000,
+    model: str = MODEL_NAME,
+):
+    """
+    Format content for freemind.
+    """
+    chunks = content.split(determine_delimiter(content))
+    for chunk in chunks:
+        chunk = (
+            chunk.strip()
+        )  # remove leading and trailing whitespace and newlines
+        if not chunk:
+            continue
+
+        if num_tokens(chunk) > max_tokens:
+            print(
+                f"{chunk} ({num_tokens(chunk)}) has more than {max_tokens} tokens",  # noqa: E501
+            )
+            chunk = truncated_string(
+                chunk,
+                model=model,
+                max_tokens=max_tokens,
+            )
+
+        string = chunk
+        # print(f"----------\n{string}\n")
+        strings.extend([string])
+    return strings
+
+
+def format_content(
+    directory: str,
+    max_tokens: int = 1000,
+    model: str = MODEL_NAME,
+) -> list[str]:
+    strings = []
+
+    # read files
+    files = list_files(directory)
+    for file in files:
+        print(f"File: {file}")
+        file_content = read_file(
+            os.path.join(
+                FOLDERPATH_DOCUMENTS,
+                file,
+            ),
+        )
+
+        if SERVICE == "TokyoTechies":
+            strings = format_content_Tokyo_Techies(
+                strings,
+                file_content,
+                max_tokens,
+                model,
+            )
+        elif SERVICE == "Klever":
+            strings = format_content_klever(
+                strings,
+                file_content,
+                max_tokens,
+                model,
+            )
+        elif SERVICE == "freemind":
+            strings = format_content_freemind(
+                strings,
+                file_content,
+                max_tokens,
+                model,
+            )
+
+    return strings
+
+
+def embed_data():
+    # read config
+    env = configparser.ConfigParser()
+    env.read(".env")
+    os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+    openai.api_key = os.environ["OPENAI_API_KEY"]
+
+    formatted_strings = format_content(FOLDERPATH_DOCUMENTS, MAX_TOKENS)
+
+    embeddings = []
+    for batch_start in range(0, len(formatted_strings), BATCH_SIZE):
+        batch_end = batch_start + BATCH_SIZE
+        batch = formatted_strings[batch_start:batch_end]
+        print(f"Batch {batch_start} to {batch_end-1}")
+        response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
+        for i, be in enumerate(response["data"]):
+            assert (
+                i == be["index"]
+            )  # double-check embeddings are in the same order as the input
+        batch_embeddings = [e["embedding"] for e in response["data"]]
+        embeddings.extend(batch_embeddings)
+
+    df = pd.DataFrame({"text": formatted_strings, "embedding": embeddings})
+
+    # save document chunks and embeddings
+    SAVE_PATH = FILEPATH_EMBEDDINGS
+    df.to_csv(SAVE_PATH, index=False)
+
+
+embed_data()
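embed_data() writes each embedding into the CSV as a stringified Python list, so any consumer has to parse it back into a list of floats. A minimal sketch of the round-trip, using the path FILEPATH_EMBEDDINGS resolves to for the freemind service:

import ast

import pandas as pd

df = pd.read_csv("models/freemind/embeddings/freemind.csv")
vector = ast.literal_eval(df.loc[0, "embedding"])  # str -> list[float]
print(len(vector))  # text-embedding-ada-002 vectors have 1536 dimensions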
src/AI/evaluation.py ADDED
@@ -0,0 +1,101 @@
+"""
+Author: Khanh Phan
+Date: 2023-06-20
+"""
+import configparser
+import os
+
+import openai
+from ai_configs import (
+    FOLDERPATH_INDEXES,
+    FOLDERPATH_QA,
+    FOLDERPATH_QUESTION,
+)
+from llama_index import (
+    StorageContext,
+    load_index_from_storage,
+)
+
+env = configparser.ConfigParser()
+env.read(".env")
+os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
+
+def get_question_files(path: str = FOLDERPATH_QUESTION) -> None:
+    """
+    Check whether the given path is valid (a folder or a text file).
+    If it is valid, generate answers for the questions it contains.
+
+    Parameters:
+        path (str): Path to a question file/folder
+
+    Returns:
+        None
+    """
+    if os.path.isdir(path) is True:
+        for file in os.listdir(path):
+            if file.endswith(".txt"):
+                print("Generating answers from: ", os.path.join(path, file))
+                generate_answers(os.path.join(path, file))
+    elif os.path.isfile(path) is True and path.endswith(".txt"):
+        print("Generating answers from:", path)
+        generate_answers(path)
+    else:
+        raise Exception("Input is neither a folder nor a text file")
+
+
+def generate_answers(
+    file_directory: str,
+    output_path=FOLDERPATH_QA,
+) -> None:
+    """
+    Get the list of questions from file(s),
+    then generate the answers and write them to file(s).
+    These answers are used for evaluation.
+
+    Parameters:
+        file_directory (str): Path to a question file
+        output_path (str): folder to write the answers to
+
+    Returns:
+        None
+    """
+
+    # Load the questions
+    question_file = open(file_directory)
+    lines = question_file.readlines()
+
+    # Create a file to write the answers
+    file_name = os.path.basename(file_directory)
+    qa_file = open(os.path.join(output_path, file_name), "w")
+
+    count = 0
+    for line in lines:  # for each question
+        count += 1
+        # generate the answer
+        response = query_engine.query(line)
+
+        # format the output
+        question = "Q" + str(count) + ": " + str(line)
+        answer = "A" + str(count) + ": " + str(response.response)
+
+        response = question + answer + "\n"
+        print(response)
+
+        # write Q&A to file
+        qa_file.writelines(response)
+
+    question_file.close()
+    qa_file.close()
+
+
+# rebuild storage context
+storage_context = StorageContext.from_defaults(persist_dir=FOLDERPATH_INDEXES)
+
+# load index
+index = load_index_from_storage(storage_context)
+query_engine = index.as_query_engine()
+
+get_question_files()
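The flow above expects one question per line in each .txt question file. A minimal sketch of preparing such a file; the file name and questions are illustrative:

questions = [
    "What is freemind?\n",
    "How do I contact the support team?\n",
]
# FOLDERPATH_QUESTION resolves to data/freemind/evaluation/questions
with open("data/freemind/evaluation/questions/demo.txt", "w") as f:
    f.writelines(questions)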
src/AI/klever_search.py ADDED
@@ -0,0 +1,73 @@
+import ast  # for converting embeddings saved as strings back to arrays
+import configparser
+import os
+import time
+
+import gradio as gr
+import openai  # for calling the OpenAI API
+import pandas as pd  # for storing text and embeddings data
+import tiktoken  # for counting tokens
+from ai_configs import (
+    EMBEDDING_MODEL,
+    FILEPATH_EMBEDDINGS,
+    INTRODUCTION_MESSAGE,
+    MODEL_NAME,
+    SYSTEM_CONTENT,
+    TOKEN_BUDGET,
+)
+from scipy import spatial  # for calculating vector similarities for search
+
+env = configparser.ConfigParser()
+env.read(".env")
+os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
+model_name = MODEL_NAME
+# Read the embedding file
+embedding_data = pd.read_csv(FILEPATH_EMBEDDINGS)
+# Convert embeddings from CSV str type back to list type
+embedding_data["embedding"] = embedding_data["embedding"].apply(
+    ast.literal_eval,
+)
+print("Finished loading embedding data!")
+
+
+# search function
+def strings_ranked_by_relatedness(
+    query: str,
+    df: pd.DataFrame,
+    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
+    top_n: int = 3,
+) -> tuple[list[str], list[float]]:
+    """Returns a list of strings and relatednesses,
+    sorted from most related to least.
+    """
+    query_embedding_response = openai.Embedding.create(
+        model=EMBEDDING_MODEL,
+        input=query,
+    )
+    query_embedding = query_embedding_response["data"][0]["embedding"]
+    strings_and_relatednesses = [
+        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
+        for i, row in df.iterrows()
+    ]
+    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
+    strings, relatednesses = zip(*strings_and_relatednesses)
+
+    return strings[:top_n], relatednesses[:top_n]
+
+
+query = "what is Klever?"
+strings, relatedness = strings_ranked_by_relatedness(query, embedding_data)
+for string in strings:
+    if "</url>" in string:
+        string = string.split("</url>")[0].replace("<url>", "URL: ")
+    print(string)
+    print("----------------")
+print(relatedness)
src/AI/parsing.py ADDED
@@ -0,0 +1,7 @@
+import aspose.words as aw
+
+# Load the PDF document from disk.
+doc = aw.Document("TestDocument.pdf")
+
+# Save the document in Markdown format.
+doc.save("output.md")
src/AI/search.py ADDED
@@ -0,0 +1,219 @@
+import ast  # for converting embeddings saved as strings back to arrays
+import configparser
+import os
+import time
+
+import gradio as gr
+import openai  # for calling the OpenAI API
+import pandas as pd  # for storing text and embeddings data
+import tiktoken  # for counting tokens
+from ai_configs import (
+    EMBEDDING_MODEL,
+    FILEPATH_EMBEDDINGS,
+    INTRODUCTION_MESSAGE,
+    MODEL_NAME,
+    SYSTEM_CONTENT,
+    TOKEN_BUDGET,
+)
+from scipy import spatial  # for calculating vector similarities for search
+
+env = configparser.ConfigParser()
+env.read(".env")
+os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
+model_name = MODEL_NAME
+# Read the embedding file
+embedding_data = pd.read_csv(FILEPATH_EMBEDDINGS)
+# Convert embeddings from CSV str type back to list type
+embedding_data["embedding"] = embedding_data["embedding"].apply(
+    ast.literal_eval,
+)
+print("Finished loading embedding data!")
+
+
+# search function
+def strings_ranked_by_relatedness(
+    query: str,
+    df: pd.DataFrame,
+    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
+    top_n: int = 3,
+) -> tuple[list[str], list[float]]:
+    """Returns a list of strings and relatednesses,
+    sorted from most related to least.
+    """
+    query_embedding_response = openai.Embedding.create(
+        model=EMBEDDING_MODEL,
+        input=query,
+    )
+    query_embedding = query_embedding_response["data"][0]["embedding"]
+    strings_and_relatednesses = [
+        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
+        for i, row in df.iterrows()
+    ]
+    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
+    strings, relatednesses = zip(*strings_and_relatednesses)
+
+    return strings[:top_n], relatednesses[:top_n]
+
+
+def num_tokens(text: str, model: str = MODEL_NAME) -> int:
+    """Return the number of tokens in a string."""
+    encoding = tiktoken.encoding_for_model(model)
+    return len(encoding.encode(text))
+
+
+def query_message(
+    query: str,
+    df: pd.DataFrame,
+    model: str,
+    token_budget: int,
+) -> str:
+    """Return a message for GPT,
+    with relevant source texts pulled from a dataframe.
+    """
+    strings, _ = strings_ranked_by_relatedness(query, df)
+
+    """ example:
+    # strings, relatednesses = strings_ranked_by_relatedness(
+    #     "what solutions does TT provide?",
+    #     df,
+    #     top_n=5,
+    # )
+    # for string, relatedness in zip(strings, relatednesses):
+    #     print(f"{relatedness=:.3f}\n{string}\n")
+    """
+
+    question = f"\n\nQuestion: {query}"
+    message = INTRODUCTION_MESSAGE
+    for string in strings:
+        next_article = f"\nFreemind article section:\n--\n{string}\n--"
+        if (
+            num_tokens(message + next_article + question, model=model)
+            > token_budget
+        ):
+            break
+        else:
+            message += next_article
+    return message + question
+
+
+def get_response(
+    query: str,
+    df: pd.DataFrame,
+    model: str = MODEL_NAME,
+    token_budget: int = TOKEN_BUDGET,
+    print_message: bool = False,
+) -> tuple[str, str]:
+    """Answers a query using GPT and a dataframe of
+    relevant texts and embeddings.
+    """
+    message = query_message(query, df, model=model, token_budget=token_budget)
+
+    if print_message:
+        print(message)
+    messages = [
+        {"role": "system", "content": SYSTEM_CONTENT},
+        {"role": "user", "content": message},
+    ]
+
+    response = openai.ChatCompletion.create(
+        model=model,
+        messages=messages,
+        temperature=0,
+    )
+    response_message = response["choices"][0]["message"]["content"]
+    print(f'Total used tokens: {response["usage"]["total_tokens"]}')
+    return response_message, message
+
+
+# Code for getting the chatbot's response ends here. The code below is for the UI only.
+def format_response(responses: dict):
+    """
+    (Optional) Format one or multiple responses from version(s) of the chatbot.
+
+    Parameters:
+        responses (dict): chatbot responses keyed by model name
+
+    Returns:
+        output (str): formatted response
+    """
+    output = ""
+    for response in responses:
+        output += response + (responses[response]) + "\n\n"
+    return output
+
+
+with gr.Blocks() as chatgpt:
+    chatbot = gr.Chatbot(label="Freemind Bot", height=500)
+    message = gr.Textbox(
+        label="Enter your chat here",
+        placeholder="Press enter to send a message",
+        show_copy_button=True,
+    )
+    radio = gr.Radio(
+        [
+            "Full model (most capable but slow & expensive)",
+            "Lite model (Capable but fast & cheap)",
+        ],
+        label="Choose a chatbot model",
+        value="Lite model (Capable but fast & cheap)",
+    )
+    clear = gr.Button("Clear all chat")
+
+    def choice_model(choice):
+        if choice == "Full model (most capable but slow & expensive)":
+            return "gpt-4"
+        else:
+            return "gpt-3.5-turbo"
+
+    def get_user_message(user_message, history):
+        return "", history + [[user_message, None]]
+
+    def show_response(history, model):
+        message = history[-1][0]
+        model = choice_model(model)
+        print(f"model: {model}")
+        # Get the response from OpenAI
+        response, _ = get_response(
+            query=message,
+            df=embedding_data,
+            model=model,
+        )
+
+        # Correct URL
+        # I will remove this workaround after BE/FE fix this bug
+        response = response.replace("help/document/", "wiki/1-")
+        response = response.replace(">>", ">")
+        print("Q: ", message, "\nA: ", response, "\n")
+
+        # Format the response
+        # responses = {
+        #     f"[{MODEL_NAME}] → ": response,
+        # }
+        # response = format_response(responses)
+
+        history[-1][1] = ""
+        for character in response:
+            history[-1][1] += character
+            time.sleep(0.01)
+            yield history
+
+    message.submit(
+        get_user_message,
+        [message, chatbot],
+        [message, chatbot],
+        queue=False,
+    ).then(
+        show_response,
+        [chatbot, radio],
+        chatbot,
+    )
+    clear.click(lambda: None, None, chatbot, queue=False)
+
+
+chatgpt.queue()
+chatgpt.launch(share=True)  # share=True to share the chat publicly
src/AI/training.py ADDED
@@ -0,0 +1,95 @@
+"""
+Author: Khanh Phan
+Date: 2023-04-20
+"""
+
+import configparser
+import os
+
+import openai
+from ai_configs import (  # CHUNK_SIZE_LIMIT,
+    CHUNK_OVERLAP_RATIO,
+    CONTEXT_WINDOW,
+    FOLDERPATH_DOCUMENTS,
+    FOLDERPATH_INDEXES,
+    MODEL_NAME,
+    NUM_OUTPUTS,
+    TEMPERATURE,
+)
+from langchain import OpenAI
+from llama_index import (
+    GPTVectorStoreIndex,
+    LLMPredictor,
+    PromptHelper,
+    ServiceContext,
+    SimpleDirectoryReader,
+)
+
+env = configparser.ConfigParser()
+env.read(".env")
+os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
+
+def construct_index(
+    folderpath_documents: str,
+    folderpath_index: str,
+) -> GPTVectorStoreIndex:
+    """
+    Construct the index for all the documents.
+
+    Parameters:
+        folderpath_documents (str): Path to a training folder
+        folderpath_index (str): Path to a folder to save the model
+
+    Returns:
+        document_index (GPTVectorStoreIndex): the model
+    """
+
+    # Create a prompt helper with initial parameters for the chatbot
+    prompt_helper = PromptHelper(
+        context_window=CONTEXT_WINDOW,
+        num_output=NUM_OUTPUTS,
+        chunk_overlap_ratio=CHUNK_OVERLAP_RATIO,
+        # chunk_size_limit=CHUNK_SIZE_LIMIT,
+    )
+
+    # Configure the LLM provider and model.
+    llm_predictor = LLMPredictor(
+        llm=OpenAI(
+            temperature=TEMPERATURE,
+            model_name=MODEL_NAME,
+            max_tokens=NUM_OUTPUTS,
+        ),
+    )
+
+    # Create the service context
+    service_context = ServiceContext.from_defaults(
+        llm_predictor=llm_predictor,
+        prompt_helper=prompt_helper,
+    )
+
+    # Load the documents
+    documents = SimpleDirectoryReader(folderpath_documents).load_data()
+
+    # Generate the index from the documents
+    document_index = GPTVectorStoreIndex.from_documents(
+        documents,
+        service_context=service_context,
+    )
+
+    # Save the index to disk
+    document_index.storage_context.persist(persist_dir=folderpath_index)
+
+    return document_index
+
+
+document_index = construct_index(FOLDERPATH_DOCUMENTS, FOLDERPATH_INDEXES)
+
+"""
+# These lines are for testing purposes only.
+query = input("What do you want to ask? ")
+query_engine = document_index.as_query_engine()
+response = query_engine.query("what are the articles about?")
+print(response)
+"""
src/__init__.py ADDED
File without changes
src/backend/TTChatBot/.sample-env ADDED
@@ -0,0 +1,24 @@
+ENV_NAME='local'
+
+DJANGO_SETTINGS_MODULE=config.settings.local
+DJANGO_SECRET_KEY='django-insecure-xfs(py=^axctf8(#5yd-svkffy3ft0u0z6^*&vx@g#)fttc#sl'
+DJANGO_DEBUG=True
+
+# Database
+# DB_NAME='postgres'
+# DB_USER='postgres'
+# DB_PASSWORD='postgres'
+# DB_HOST='127.0.0.1'
+# DB_PORT=5678
+
+# Celery
+# CELERY_BROKER_URL = 'redis://localhost:6379/0'
+# CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
+
+# Other API
+OPEN_AI_KEY='KEY'
+
+# Redis config
+REDIS_HOST = redis
+REDIS_PORT = 6380
+BROKER_URL = redis://${REDIS_HOST}:${REDIS_PORT}/0
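A minimal sketch of reading these variables once the sample is copied to .env; python-dotenv is pinned in requirements.txt and the variable names come from the sample above (how the committed settings modules consume them is not rendered in this view):

import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
print(os.getenv("ENV_NAME"))                # local
print(os.getenv("DJANGO_SETTINGS_MODULE"))  # config.settings.local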
src/backend/TTChatBot/chatbot/__init__.py ADDED
File without changes
src/backend/TTChatBot/chatbot/admin.py ADDED
@@ -0,0 +1,3 @@
+# from django.contrib import admin
+
+# Register your models here.
src/backend/TTChatBot/chatbot/apps.py ADDED
@@ -0,0 +1,242 @@
+import ast
+import logging
+import os
+
+import openai
+import pandas as pd
+from django.apps import AppConfig
+from django.conf import settings
+from llama_index import (
+    StorageContext,
+    load_index_from_storage,
+)
+from scipy import spatial
+
+from .utils import num_tokens_from_messages
+
+# set OpenAI API key
+openai.api_key = os.environ["OPENAI_API_KEY"]
+logger = logging.getLogger(__name__)
+
+
+def load_chatgpt_index(apps_names: str, index_file: str):
+    """Function to load a chatGPT index.
+
+    Args:
+        apps_names (str): TokyoTechies or Klever
+        index_file (str): Storage index path
+    """
+    # build storage context
+    logger.info("Building %s storage context", apps_names)
+    storage_context = StorageContext.from_defaults(
+        persist_dir=index_file,
+    )
+
+    # load index
+    index = load_index_from_storage(storage_context)
+
+    query_engine = index.as_query_engine()
+    logger.info("Loading index from %s storage completed", apps_names)
+    return query_engine
+
+
+class ChatGPTEmbeddingSearchBased:
+    """ChatGPT embedding-search-based method for Kleverbot"""
+
+    def __init__(
+        self,
+        service,
+        embedding_model,
+        chat_model,
+        filepath_embedding,
+    ):
+        self.embedding_model = embedding_model
+        self.chat_model = chat_model
+        self.filepath_embedding = filepath_embedding
+        self.service = service
+        # model-related config
+        self.token_budget = settings.TOKEN_BUDGET
+        self.introduction_message = settings.INTRODUCTION_MESSAGE.format(
+            service=self.service,
+        )
+        self.system_content = settings.SYSTEM_CONTENT.format(
+            service=self.service,
+        )
+        self.next_article = settings.NEXT_ARTICLE
+        self.embedding_data = self.load_embedding_data()
+
+    def load_embedding_data(self):
+        """Load embedding data from csv"""
+        df_data = pd.read_csv(self.filepath_embedding)
+        df_data["embedding"] = df_data["embedding"].apply(
+            ast.literal_eval,
+        )
+        logger.info(
+            "Loading embeddings from %s storage completed",
+            self.filepath_embedding,
+        )
+        return df_data
+
+    # search function
+    def strings_ranked_by_relatedness(
+        self,
+        query: str,
+        df_data: pd.DataFrame,
+        relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
+        top_n: int = 3,
+    ) -> tuple[list[str], list[float]]:
+        """Returns a list of strings and relatednesses,
+        sorted from most related to least.
+        """
+        query_embedding_response = openai.Embedding.create(
+            model=self.embedding_model,
+            input=query,
+        )
+        query_embedding = query_embedding_response["data"][0]["embedding"]
+        strings_and_relatednesses = [
+            (row["text"], relatedness_fn(query_embedding, row["embedding"]))
+            for i, row in df_data.iterrows()
+        ]
+        strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
+        strings, relatednesses = zip(*strings_and_relatednesses)
+
+        return strings[:top_n], relatednesses[:top_n]
+
+    def query_message(
+        self,
+        query: str,
+        dataframe: pd.DataFrame,
+        model: str,
+        token_budget: int,
+    ) -> str:
+        """Return a message for GPT,
+        with relevant source texts pulled from a dataframe.
+        """
+        strings, _ = self.strings_ranked_by_relatedness(query, dataframe)
+        question = f"\n\nQuestion: {query}"
+        message = self.introduction_message
+        for string in strings:
+            next_article = self.next_article.format(
+                service=self.service,
+                string=string,
+            )
+            if (
+                num_tokens_from_messages(
+                    message + next_article + question,
+                    model=model,
+                )
+                > token_budget
+            ):
+                break
+            else:
+                message += next_article
+        return message + question
+
+    def get_response(
+        self,
+        query: str,
+        data: pd.DataFrame,
+        model: str = "gpt-3.5-turbo",
+        token_budget: int = 4096 - 500,
+        log_message: bool = False,
+    ):
+        """Answers a query using GPT and a dataframe of
+        relevant texts and embeddings.
+        """
+        message = self.query_message(
+            query=query,
+            dataframe=data,
+            model=model,
+            token_budget=token_budget,
+        )
+
+        if log_message:
+            logging.info(message)
+
+        messages = [
+            {"role": "system", "content": self.system_content},
+            {"role": "user", "content": message},
+        ]
+
+        response = openai.ChatCompletion.create(
+            model=model,
+            messages=messages,
+            temperature=0,
+        )
+        response_message = response["choices"][0]["message"]["content"]
+
+        if log_message:
+            logging.info(
+                "Total used tokens: %s",
+                response["usage"]["total_tokens"],
+            )
+
+        return response_message, message
+
+    def chat(self, message):
+        """Chat with Kleverbot; returns the response from OpenAI"""
+        res, _ = self.get_response(
+            query=message,
+            data=self.embedding_data,
+            model=self.chat_model,
+            token_budget=self.token_budget,
+        )
+
+        # TODO: fix this one after Klever FE fix -> scraper
+        return (
+            res.replace("help/document/", "wiki/1-")
+            .replace(">>", ">")
+            .replace("https://tokyotechies.kleversuite.net", "{ORG_URL}")
+        )
+
+
+class TTChatbotConfig(AppConfig):
+    """TokyoTechies Chatbot Init"""
+
+    name = "chatbot"
+    label = "tt_chatbot"
+
+    # old method
+    # INDEXES_FILE = os.path.join(
+    #     settings.TT_MODELS_PATH,
+    #     settings.TT_MODEL_NAME,
+    # )
+
+    # QUERY_ENGINE = load_chatgpt_index(
+    #     apps_names="TokyoTechies",
+    #     index_file=INDEXES_FILE,
+    # )
+
+    # new method
+    QUERY_ENGINE = ChatGPTEmbeddingSearchBased(
+        service="TokyoTechies",
+        embedding_model=settings.TT_EMBEDDING_MODEL,
+        chat_model=settings.TT_EMBEDDING_CHAT_MODEL,
+        filepath_embedding=settings.TT_FILEPATH_EMBEDDING,
+    )
+
+
+class KleverChatbotConfig(AppConfig):
+    """Klever Chatbot Init"""
+
+    name = "chatbot"
+    label = "klever_chatbot"
+
+    # old method
+    # INDEXES_FILE = os.path.join(
+    #     settings.KLEVER_MODELS_PATH,
+    #     settings.KLEVER_MODEL_NAME,
+    # )
+
+    # QUERY_ENGINE = load_chatgpt_index(
+    #     apps_names="Klever",
+    #     index_file=INDEXES_FILE,
+    # )
+
+    # new method
+    QUERY_ENGINE = ChatGPTEmbeddingSearchBased(
+        service="Klever",
+        embedding_model=settings.KLEVER_EMBEDDING_MODEL,
+        chat_model=settings.KLEVER_EMBEDDING_CHAT_MODEL,
+        filepath_embedding=settings.KLEVER_FILEPATH_EMBEDDING,
+    )
src/backend/TTChatBot/chatbot/exceptions.py ADDED
@@ -0,0 +1,10 @@
+class TTChatBotConnectException(Exception):
+    pass
+
+
+class TTChatBotEngineException(Exception):
+    pass
+
+
+class ChatbotVersionException(Exception):
+    pass
src/backend/TTChatBot/chatbot/migrations/__init__.py ADDED
File without changes
src/backend/TTChatBot/chatbot/serializers.py ADDED
@@ -0,0 +1,25 @@
+from rest_framework import serializers
+
+
+class ConversationSerializer(serializers.Serializer):
+    """Conversation content when the user interacts with the Chatbot"""
+
+    user_chat = serializers.CharField(required=True, max_length=1000)
+
+
+class MessageSerializer(ConversationSerializer):
+    """Response content when the Chatbot outputs to the user"""
+
+    chatbot_answer = serializers.CharField(required=True, max_length=1000)
+
+
+class TaskSerializer(ConversationSerializer):
+    """Response content when the Chatbot outputs to the user"""
+
+    task_id = serializers.CharField(required=True)
+
+
+class VersionSerializer(serializers.Serializer):
+    """Trained version of the chatbot"""
+
+    version = serializers.CharField(required=True)
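A minimal sketch of what MessageSerializer produces for a finished exchange; the field names come from the classes above, and the example values are illustrative:

from chatbot.serializers import MessageSerializer

res = MessageSerializer(
    {"user_chat": "What is Klever?", "chatbot_answer": "Klever is ..."},
)
print(res.data)  # {'user_chat': 'What is Klever?', 'chatbot_answer': 'Klever is ...'}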
src/backend/TTChatBot/chatbot/tasks.py ADDED
@@ -0,0 +1,79 @@
+import logging
+
+from celery import shared_task
+from celery.utils.log import get_task_logger
+from django.conf import settings
+
+from .apps import (
+    KleverChatbotConfig,
+    TTChatbotConfig,
+)
+from .exceptions import (
+    TTChatBotConnectException,
+    TTChatBotEngineException,
+)
+from .utils import num_tokens_from_messages
+
+# TODO: fix the logger; it does not write to the log file
+logger = get_task_logger(__name__)
+
+
+def tt_sync_chat_website(message):
+    return _chat_tt(message, engine=TTChatbotConfig)
+
+
+@shared_task(max_retries=0)
+def tt_async_chat_website(message):
+    max_token = settings.MAX_TOKEN
+
+    if (
+        num_tokens_from_messages(
+            messages=message,
+            model=settings.TT_MODEL_NAME,
+        )
+        >= max_token
+    ):
+        logging.warning(
+            "Maximum token %s reached for user messages: %s",
+            max_token,
+            message,
+        )
+        return settings.MAX_TOKEN_RESPONSE
+    else:
+        return _chat_tt(message, TTChatbotConfig)
+
+
+@shared_task(max_retries=0)
+def tt_async_chat_klever(message):
+    max_token = settings.MAX_TOKEN
+
+    if (
+        num_tokens_from_messages(
+            messages=message,
+            model=settings.KLEVER_MODEL_NAME,
+        )
+        >= max_token
+    ):
+        logging.warning(
+            "Maximum token %s reached for user messages: %s",
+            max_token,
+            message,
+        )
+        return settings.MAX_TOKEN_RESPONSE
+    else:
+        return _chat_tt(message, KleverChatbotConfig)
+
+
+def _chat_tt(message, engine=None):
+    try:
+        # TODO: check for the number of query retries when disconnected
+        if engine:
+            response = engine.QUERY_ENGINE.chat(message)
+            return response
+        else:
+            raise TTChatBotEngineException("Connect engine failed")
+    except Exception as exc:
+        logging.error("OpenAI error: ", exc_info=exc)
+        raise TTChatBotConnectException(
+            "Connect OpenAPI engine failed",
+        ) from exc
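A minimal sketch of how the shared tasks above are dispatched asynchronously; it assumes a running Celery worker and broker, and the question text is illustrative:

from chatbot.tasks import tt_async_chat_klever

result = tt_async_chat_klever.delay("What is Klever?")  # queue the task
print(result.id)       # task id a client can poll, cf. ChatTaskStatus in views
print(result.ready())  # False until the worker finishes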
src/backend/TTChatBot/chatbot/urls.py ADDED
@@ -0,0 +1,31 @@
+from django.urls import path
+
+# from rest_framework.urlpatterns import format_suffix_patterns
+from . import views
+
+app_name = "chatbot"
+
+urlpatterns = [
+    # List and create conversations
+    path("chat/sync", views.ConversationSyncView.as_view()),
+    path(
+        "chat/tokyotechies/async",
+        views.TokyoTechiesConversationAsyncView.as_view(),
+    ),
+    path("chat/klever/async", views.KleverConversationAsyncView.as_view()),
+    path(
+        "chat/<str:task_id>/",
+        views.ChatTaskStatus.as_view(),
+        name="gpt_task_status",
+    ),
+    path(
+        "chat/tokyotechies/version",
+        views.TTBotVerion.as_view(),
+        name="ttbot_version",
+    ),
+    path(
+        "chat/klever/version",
+        views.KleverBotVerion.as_view(),
+        name="kleverbot_version",
+    ),
+]
src/backend/TTChatBot/chatbot/utils.py ADDED
@@ -0,0 +1,108 @@
+ import logging
+ import os
+ import re
+ from datetime import datetime
+
+ import tiktoken
+
+ from .exceptions import ChatbotVersionException
+
+ logger = logging.getLogger(__name__)
+
+
+ def num_tokens_from_messages(
+     messages: str,
+     model: str = "gpt-3.5-turbo-0613",
+ ) -> int:
+     """
+     Return the number of tokens used by a message string.
+     """
+     try:
+         if model in ["text-davinci-003", "text-davinci-002"]:
+             encoding = tiktoken.get_encoding("p50k_base")
+         else:  # gpt-4, "gpt-3.5-turbo"
+             encoding = tiktoken.encoding_for_model(model)
+     except KeyError:
+         logger.warning("Warning: model not found. Using cl100k_base encoding.")
+         encoding = tiktoken.get_encoding("cl100k_base")
+
+     num_tokens = len(encoding.encode(messages))
+     return num_tokens
+
+
+ def get_datetime_from_file(file_path: str) -> str:
+     """Get datetime from file
+
+     Args:
+         file_path (str): file path
+
+     Raises:
+         ChatbotVersionException: exception raised when the file is not found
+
+     Returns:
+         str: version of the chatbot in YYYYMMDD format
+     """
+     try:
+         # Get the modification timestamp of the file
+         file_ts = os.path.getmtime(file_path)
+         return datetime.fromtimestamp(file_ts).strftime("%Y%m%d")
+
+     except FileNotFoundError as exc:
+         logger.error("File %s not found error", file_path, exc_info=exc)
+         raise ChatbotVersionException("File not found", exc) from exc
+
+
+ def extract_datetime_from_file(
+     version_file_path: str,
+     weight_file_path: str,
+ ) -> str:
+     r"""Extract date from
+     "- Training data includes information up until (\w{3} \d{2})"
+
+     Args:
+         version_file_path (str): file path of the version file
+         weight_file_path (str): file path of the embedding file,
+             used as a fallback when the version file cannot be read
+
+     Raises:
+         ChatbotVersionException: exception raised when the file is not found
+
+     Returns:
+         str: version of the chatbot in YYYYMMDD format
+     """
+     current_year = datetime.now().year
+     target_line_format = (
+         r"- Training data includes information up until (\w{3} \d{2})"
+     )
+     extracted_date = None  # stays None when no line matches the pattern
+
+     try:
+         # Open the input file for reading
+         with open(version_file_path, encoding="utf-8") as infile:
+             # Iterate through each line in the input file
+             for line in infile:
+                 # Use regex to search for the date format in the line
+                 match = re.search(target_line_format, line.strip())
+                 if match:
+                     # If a match is found, extract the date
+                     extracted_date = match.group(1)
+                     break  # Exit the loop after finding the first date
+
+         # Check if a date was extracted and return it
+         if extracted_date:
+             return datetime.strptime(
+                 f"{extracted_date} {current_year}",
+                 "%b %d %Y",
+             ).strftime("%Y%m%d")
+         else:
+             logger.warning(
+                 "Date not found in the file, fallback to deployment date",
+             )
+             return get_datetime_from_file(weight_file_path)
+
+     except FileNotFoundError as exc:
+         logger.error(
+             "File %s not found error",
+             version_file_path,
+             exc_info=exc,
+         )
+         raise ChatbotVersionException("File not found", exc) from exc
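
A quick usage sketch of the token counter above, as the views below use it to enforce the MAX_TOKEN guard (assuming a Django shell in the backend project):

from chatbot.utils import num_tokens_from_messages

# Same call the sync view makes before accepting a question.
n = num_tokens_from_messages(
    "How do I reset my Klever password?",
    model="gpt-3.5-turbo",
)
print(n)  # must stay below the MAX_TOKEN = 200 setting to be answered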
src/backend/TTChatBot/chatbot/views.py ADDED
@@ -0,0 +1,199 @@
+ import json
+ import logging
+
+ from celery.result import AsyncResult
+ from django.conf import settings
+ from rest_framework import (
+     generics,
+     status,
+ )
+ from rest_framework.response import Response
+
+ from .exceptions import (
+     ChatbotVersionException,
+     TTChatBotConnectException,
+     TTChatBotEngineException,
+ )
+ from .serializers import (
+     ConversationSerializer,
+     MessageSerializer,
+     TaskSerializer,
+     VersionSerializer,
+ )
+ from .tasks import (
+     tt_async_chat_klever,
+     tt_async_chat_website,
+     tt_sync_chat_website,
+ )
+ from .utils import (
+     extract_datetime_from_file,
+     num_tokens_from_messages,
+ )
+
+ # add logger
+ logger = logging.getLogger(__name__)
+
+
+ class ConversationSyncView(generics.GenericAPIView):
+     serializer_class = ConversationSerializer
+
+     def post(self, request, *args, **kwargs):
+         data = json.loads(request.body.decode("utf-8"))
+         question = data.get("user_chat", None)
+         max_token = settings.MAX_TOKEN
+
+         try:
+             if (
+                 num_tokens_from_messages(
+                     messages=question,
+                     model=settings.TT_MODEL_NAME,
+                 )
+                 >= max_token
+             ):
+                 logger.warning(
+                     "Maximum token %s reached for user messages: %s",
+                     max_token,
+                     question,
+                 )
+                 res = MessageSerializer(
+                     {
+                         "user_chat": question,
+                         "chatbot_answer": settings.MAX_TOKEN_RESPONSE,
+                     },
+                 )
+             else:
+                 answer = tt_sync_chat_website(question)
+                 res = MessageSerializer(
+                     {
+                         "user_chat": question,
+                         "chatbot_answer": answer,
+                     },
+                 )
+
+         except (TTChatBotConnectException, TTChatBotEngineException) as exc:
+             logger.error("Failed to send request to ChatGPT: %s", exc)
+             res = MessageSerializer(
+                 {
+                     "user_chat": question,
+                     "chatbot_answer": settings.DEFAULT_RESPONSE,
+                 },
+             )
+             return Response(res.data, status=status.HTTP_400_BAD_REQUEST)
+
+         return Response(res.data, status=status.HTTP_200_OK)
+
+
+ class TokyoTechiesConversationAsyncView(generics.GenericAPIView):
+     serializer_class = ConversationSerializer
+
+     def post(self, request, *args, **kwargs):
+         data = json.loads(request.body.decode("utf-8"))
+         question = data.get("user_chat", None)
+
+         try:
+             answer = tt_async_chat_website.delay(question)
+             res = TaskSerializer(
+                 {
+                     "user_chat": question,
+                     "task_id": answer.id,
+                 },
+             )
+
+         except (TTChatBotConnectException, TTChatBotEngineException) as exc:
+             logger.error("Failed to send request to ChatGPT: %s", exc)
+             res = TaskSerializer(
+                 {
+                     "user_chat": question,
+                     "task_id": None,
+                 },
+             )
+             return Response(res.data, status=status.HTTP_400_BAD_REQUEST)
+
+         return Response(res.data, status=status.HTTP_200_OK)
+
+
+ class KleverConversationAsyncView(generics.GenericAPIView):
+     serializer_class = ConversationSerializer
+
+     def post(self, request, *args, **kwargs):
+         data = json.loads(request.body.decode("utf-8"))
+         question = data.get("user_chat", None)
+
+         try:
+             answer = tt_async_chat_klever.delay(question)
+             res = TaskSerializer(
+                 {
+                     "user_chat": question,
+                     "task_id": answer.id,
+                 },
+             )
+
+         except (TTChatBotConnectException, TTChatBotEngineException) as exc:
+             logger.error("Failed to send request to ChatGPT: %s", exc)
+             res = TaskSerializer(
+                 {
+                     "user_chat": question,
+                     "task_id": None,
+                 },
+             )
+             return Response(res.data, status=status.HTTP_400_BAD_REQUEST)
+
+         return Response(res.data, status=status.HTTP_200_OK)
+
+
+ class ChatTaskStatus(generics.GenericAPIView):
+     """
+     Check the status of a ChatGPT task
+     """
+
+     serializer_class = TaskSerializer
+
+     def get(self, request, task_id, *args, **kwargs):
+         task = AsyncResult(task_id)
+
+         if task.ready():
+             response = task.result
+             logger.info("Task response: %s", response)
+             return Response({"status": "READY", "response": response})
+         else:
+             return Response({"status": "PENDING"})
+
+
+ class TTBotVersion(generics.GenericAPIView):
+     """Get version of TTBot"""
+
+     serializer_class = VersionSerializer
+
+     def get(self, *args, **kwargs):
+         try:
+             ttbot_version = extract_datetime_from_file(
+                 version_file_path=settings.TT_TRAINING_VERSION,
+                 weight_file_path=settings.TT_FILEPATH_EMBEDDING,
+             )
+             res = VersionSerializer({"version": ttbot_version})
+             return Response(res.data, status=status.HTTP_200_OK)
+
+         except ChatbotVersionException as exc:
+             logger.error("Failed to check version: %s", exc)
+             res = VersionSerializer({"version": None})
+             return Response(res.data, status=status.HTTP_400_BAD_REQUEST)
+
+
+ class KleverBotVersion(generics.GenericAPIView):
+     """Get version of KleverBot"""
+
+     serializer_class = VersionSerializer
+
+     def get(self, *args, **kwargs):
+         try:
+             kleverbot_version = extract_datetime_from_file(
+                 version_file_path=settings.KLEVER_TRAINING_VERSION,
+                 weight_file_path=settings.KLEVER_FILEPATH_EMBEDDING,
+             )
+             res = VersionSerializer({"version": kleverbot_version})
+             return Response(res.data, status=status.HTTP_200_OK)
+
+         except ChatbotVersionException as exc:
+             logger.error("Failed to check version: %s", exc)
+             res = VersionSerializer({"version": None})
+             return Response(res.data, status=status.HTTP_400_BAD_REQUEST)
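
A matching sketch for the synchronous route, which answers inline with the MessageSerializer fields used above (again assuming a local run behind the api/v1/ prefix):

import requests  # assumed to be installed in the client environment

BASE = "http://localhost:8000/api/v1/"  # assumed local deployment

# The sync view blocks until the answer (or a fallback message) is ready.
reply = requests.post(
    BASE + "chat/sync",
    json={"user_chat": "Who are Tokyo Techies?"},
).json()
print(reply["chatbot_answer"])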
src/backend/TTChatBot/config/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .celery import app as celery_app
+
+ __all__ = ["celery_app"]
src/backend/TTChatBot/config/asgi.py ADDED
@@ -0,0 +1,16 @@
+ """
+ ASGI config for TTChatBot project.
+
+ It exposes the ASGI callable as a module-level variable named ``application``.
+
+ For more information on this file, see
+ https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
+ """
+
+ import os
+
+ from django.core.asgi import get_asgi_application
+
+ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
+
+ application = get_asgi_application()
src/backend/TTChatBot/config/celery.py ADDED
@@ -0,0 +1,24 @@
+ import os
+
+ from celery import Celery
+ from django.conf import settings
+
+ # Set the default Django settings module for the 'celery' program.
+ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
+
+ app = Celery("TTChatBot")
+
+ app.conf.update(
+     broker_connection_retry_on_startup=True,
+     broker_connection_max_retries=10,
+     result_expires=60,
+     task_acks_late=True,
+ )
+
+ app.config_from_object("django.conf:settings", namespace="CELERY")
+ app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
+
+ # TODO: convention celery:
+ # https://qiita.com/hankehly/items/c3e0496eb04327a53ac4
+ # TODO: crontab for celery:
+ # https://www.codingforentrepreneurs.com/blog/celery-redis-django/
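
For completeness, a sketch of exercising a task directly against this Celery app from a Django shell, assuming the Redis broker from BROKER_URL is reachable and a worker has been started (for example with `celery -A config worker`):

from celery.result import AsyncResult

from chatbot.tasks import tt_async_chat_website  # task the async view enqueues

# Hand a question to the broker; a worker picks it up.
result = tt_async_chat_website.delay("What services do you offer?")

# The same readiness check ChatTaskStatus performs per task id.
print(AsyncResult(result.id).ready())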
src/backend/TTChatBot/config/settings/__init__.py ADDED
@@ -0,0 +1,19 @@
+ import os
+
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # Open AI key
+ OPENAI_API_KEY = os.getenv("OPEN_AI_KEY")
+ # settings keys for model; guard against a TypeError when the variable is unset
+ if OPENAI_API_KEY:
+     os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
+
+ env_name = os.getenv("ENV_NAME", "prod")
+
+ if env_name == "local":
+     from .local import *  # noqa
+ elif env_name == "staging":
+     from .staging import *  # noqa
+ else:
+     from .prod import *  # noqa
src/backend/TTChatBot/config/settings/common.py ADDED
@@ -0,0 +1,132 @@
+ import os
+ from pathlib import Path
+
+ BASE_DIR = Path(__file__).resolve().parent.parent.parent
+
+ # Static files (CSS, JavaScript, Images)
+ STORAGE_URL = BASE_DIR / "storage"
+
+ # Swagger HTTPS
+ USE_X_FORWARDED_HOST = True
+ SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
+
+ # Application definition
+
+ DJANGO_APPS = [
+     "django.contrib.admin",
+     "django.contrib.auth",
+     "django.contrib.contenttypes",
+     "django.contrib.sessions",
+     "django.contrib.messages",
+     "django.contrib.staticfiles",
+ ]
+
+ THIRD_PARTY_APPS = [
+     "gunicorn",
+     "rest_framework",
+     "drf_yasg",  # another way to swagger
+     "django_celery_results",  # Store Celery Result and cache
+ ]
+
+ LOCAL_APPS = [
+     "chatbot.apps",
+     # 'users.apps.UsersConfig',
+     # 'site_settings.apps.SiteSettingsConfig',
+     # 'training_model.apps.TrainingModelConfig',
+ ]
+
+ INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS
+
+ MIDDLEWARE = [
+     "django.middleware.security.SecurityMiddleware",
+     "django.contrib.sessions.middleware.SessionMiddleware",
+     "django.middleware.common.CommonMiddleware",
+     "django.middleware.csrf.CsrfViewMiddleware",
+     "django.contrib.auth.middleware.AuthenticationMiddleware",
+     "django.contrib.messages.middleware.MessageMiddleware",
+     "django.middleware.clickjacking.XFrameOptionsMiddleware",
+     "whitenoise.middleware.WhiteNoiseMiddleware",
+ ]
+
+
+ SECRET_KEY = os.getenv("DJANGO_SECRET_KEY")
+ ROOT_URLCONF = "config.urls"
+ WSGI_APPLICATION = "config.wsgi.application"
+ ASGI_APPLICATION = "config.asgi.application"
+
+ TEMPLATES = [
+     {
+         "BACKEND": "django.template.backends.django.DjangoTemplates",
+         "DIRS": [BASE_DIR / "templates"],
+         "APP_DIRS": True,
+         "OPTIONS": {
+             "context_processors": [
+                 "django.template.context_processors.debug",
+                 "django.template.context_processors.request",
+                 "django.contrib.auth.context_processors.auth",
+                 "django.contrib.messages.context_processors.messages",
+             ],
+         },
+     },
+ ]
+
+ # Logging
+ LOGGING = {
+     "version": 1,
+     "disable_existing_loggers": False,
+     "formatters": {
+         "default": {
+             "format": "%(asctime)s %(levelname)s: %(message)s",
+         },
+     },
+     "filters": {
+         "require_debug_false": {
+             "()": "django.utils.log.RequireDebugFalse",
+         },
+         "require_debug_true": {
+             "()": "django.utils.log.RequireDebugTrue",
+         },
+     },
+     "handlers": {
+         "console": {
+             "class": "logging.StreamHandler",
+             "formatter": "default",
+             "level": "INFO",
+         },
+         "common": {
+             "class": "logging.FileHandler",
+             "filename": STORAGE_URL / "common.log",
+             "formatter": "default",
+             "level": "INFO",
+         },
+     },
+     "loggers": {
+         "": {
+             "handlers": ["console", "common"],
+             "level": 1,
+         },
+     },
+ }
+
+ # Internationalization
+ # https://docs.djangoproject.com/en/4.2/topics/i18n/
+
+ LANGUAGE_CODE = "en-us"
+
+ TIME_ZONE = "UTC"
+
+ USE_I18N = True
+
+ USE_TZ = True
+
+ # Static files (CSS, JavaScript, Images)
+ # https://docs.djangoproject.com/en/4.2/howto/static-files/
+
+ STATIC_URL = "/static/"
+ STATIC_ROOT = os.path.join(BASE_DIR, "static")
+ STATICFILES_STORAGE = "whitenoise.storage.CompressedManifestStaticFilesStorage"
+
+ # Default primary key field type
+ # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
+
+ DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
src/backend/TTChatBot/config/settings/local.py ADDED
@@ -0,0 +1,98 @@
+ import os
+
+ from .common import *  # noqa
+
+ ALLOWED_HOSTS = ["*"]
+
+ # SECURITY WARNING: don't run with debug turned on in production!
+ DEBUG = os.getenv("DJANGO_DEBUG")
+
+ # Chatbot default answer
+ DEFAULT_RESPONSE = "Sorry, I'm having trouble understanding you."
+ DEFAULT_RESPONSE_JP = "申し訳ございません、ご質問を理解いたしかねます"
+
+ MAX_TOKEN_RESPONSE = (
+     "Sorry, I'm having trouble processing all that information. "
+     "Could you summarize a bit more concisely?"
+ )
+ MAX_TOKEN_RESPONSE_JP = "申し訳ございません、いただいた全ての情報を処理することができません。もう少し簡潔にしてください。"
+
+ # --Postgres--
+ # DATABASES = {
+ #     'default': {
+ #         'ENGINE': 'django.db.backends.postgresql',
+ #         'NAME': os.getenv('DB_NAME'),
+ #         'USER': os.getenv('DB_USER'),
+ #         'PASSWORD': os.getenv('DB_PASSWORD'),
+ #         'HOST': os.getenv('DB_HOST', 'localhost'),
+ #         'PORT': os.getenv('DB_PORT'),
+ #     }
+ # }
+
+ # --Celery--
+ # List of modules to import when celery starts.
+ # --Worker settings--
+ # If you're doing mostly I/O you can have more processes,
+ # but if mostly spending CPU, try to keep it close to the
+ # number of CPUs on your machine. If not set, the number of CPUs/cores
+ # available will be used.
+ CELERY_WORKER_CONCURRENCY = 1
+ # CELERYD_LOG_FILE = "celeryd.log"
+ # CELERYD_LOG_LEVEL = "INFO"
+ REDIS_HOST = os.getenv("REDIS_HOST")
+ REDIS_PORT = os.getenv("REDIS_PORT")
+ BROKER_URL = os.getenv("BROKER_URL")
+
+ CELERY_BROKER_URL = BROKER_URL
+ CELERY_RESULT_BACKEND = BROKER_URL
+ CELERY_ACCEPT_CONTENT = ["application/json"]
+ CELERY_RESULT_SERIALIZER = "json"
+ CELERY_TASK_SERIALIZER = "json"
+
+ # Config for old query methods
+ # TT Websites Models path
+ TT_MODELS_PATH = "../../../models/TokyoTechies/"
+ # text-davinci-003 or gpt-4
+ TT_MODEL_NAME = "text-davinci-003"
+ # Klever Models path
+ KLEVER_MODELS_PATH = "../../../models/Klever/"
+ # text-davinci-003 or gpt-4
+ KLEVER_MODEL_NAME = "text-davinci-003"
+
+
+ # Config for new embedding methods
+ KLEVER_EMBEDDING_MODEL = (
+     "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
+ )
+ KLEVER_EMBEDDING_CHAT_MODEL = "gpt-3.5-turbo"  # "gpt-4"
+ KLEVER_FILEPATH_EMBEDDING = "../../../models/Klever/embeddings/Klever.csv"
+ KLEVER_TRAINING_VERSION = "../../../models/Klever/_version.txt"
+
+ # Config for new embedding methods
+ TT_EMBEDDING_MODEL = (
+     "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
+ )
+ TT_EMBEDDING_CHAT_MODEL = "gpt-3.5-turbo"  # "gpt-4"
+ TT_FILEPATH_EMBEDDING = (
+     "../../../models/TokyoTechies/embeddings/TokyoTechies.csv"
+ )
+ TT_TRAINING_VERSION = "../../../models/TokyoTechies/_version.txt"
+
+ INTRODUCTION_MESSAGE = (
+     "You are a chatbot of {service}. "
+     "Use the below articles on the {service} to answer the subsequent question. "  # noqa: E501
+     "If the answer cannot be found in the articles, write sorry that I cannot answer your request, please contact our support team for further assistance. "  # noqa: E501
+     r'If an answer is found, add embedding title in this format "[Title](URL)" to the end of an answer and ignore the same title.'  # noqa: E501
+ )
+ SYSTEM_CONTENT = "You answer questions about {service}"
+ NEXT_ARTICLE = "\n{service} article section:\n--\n{string}\n--"
+ TOKEN_BUDGET = 4096 - 500
+
+ # max token
+ MAX_TOKEN = 200
+
+ # CELERYD_TASK_SOFT_TIME_LIMIT = 3
+ # Kill anything longer than 10 seconds:
+ # CELERYD_TASK_TIME_LIMIT = 10
+ # After 2 hours remove the task result:
+ # CELERY_TASK_RESULT_EXPIRES = 60 * 60 * 2
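
The prompt templates above (INTRODUCTION_MESSAGE, SYSTEM_CONTENT, NEXT_ARTICLE, TOKEN_BUDGET) suggest how retrieved article sections get stitched into a chat request. Below is a hypothetical sketch of that assembly, for illustration only: the actual logic lives in the AI modules under src/AI/, and build_messages is a made-up helper name.

from django.conf import settings

from chatbot.utils import num_tokens_from_messages


def build_messages(question, articles, service="Klever"):
    """Hypothetical helper: pack article sections under TOKEN_BUDGET."""
    body = settings.INTRODUCTION_MESSAGE.format(service=service)
    for article in articles:
        section = settings.NEXT_ARTICLE.format(service=service, string=article)
        candidate = body + section + "\n\nQuestion: " + question
        if (
            num_tokens_from_messages(
                candidate,
                model=settings.KLEVER_EMBEDDING_CHAT_MODEL,
            )
            > settings.TOKEN_BUDGET
        ):
            break  # stop adding context once the budget would be exceeded
        body += section
    return [
        {
            "role": "system",
            "content": settings.SYSTEM_CONTENT.format(service=service),
        },
        {"role": "user", "content": body + "\n\nQuestion: " + question},
    ]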
src/backend/TTChatBot/config/settings/prod.py ADDED
@@ -0,0 +1,98 @@
+ import os
+
+ from .common import *  # noqa
+
+ ALLOWED_HOSTS = ["*"]
+
+ # SECURITY WARNING: don't run with debug turned on in production!
+ DEBUG = False
+
+ # Chatbot default answer
+ DEFAULT_RESPONSE = "Sorry, I'm having trouble understanding you."
+ DEFAULT_RESPONSE_JP = "申し訳ございません、ご質問を理解いたしかねます"
+
+ MAX_TOKEN_RESPONSE = (
+     "Sorry, I'm having trouble processing all that information. "
+     "Could you summarize a bit more concisely?"
+ )
+ MAX_TOKEN_RESPONSE_JP = "申し訳ございません、いただいた全ての情報を処理することができません。もう少し簡潔にしてください。"
+
+ # --Postgres--
+ # DATABASES = {
+ #     'default': {
+ #         'ENGINE': 'django.db.backends.postgresql',
+ #         'NAME': os.getenv('DB_NAME'),
+ #         'USER': os.getenv('DB_USER'),
+ #         'PASSWORD': os.getenv('DB_PASSWORD'),
+ #         'HOST': os.getenv('DB_HOST', 'localhost'),
+ #         'PORT': os.getenv('DB_PORT'),
+ #     }
+ # }
+
+ # --Celery--
+ # List of modules to import when celery starts.
+ # --Worker settings--
+ # If you're doing mostly I/O you can have more processes,
+ # but if mostly spending CPU, try to keep it close to the
+ # number of CPUs on your machine. If not set, the number of CPUs/cores
+ # available will be used.
+ CELERY_WORKER_CONCURRENCY = 20
+ # CELERYD_LOG_FILE = "celeryd.log"
+ # CELERYD_LOG_LEVEL = "INFO"
+ REDIS_HOST = os.getenv("REDIS_HOST")
+ REDIS_PORT = os.getenv("REDIS_PORT")
+ BROKER_URL = os.getenv("BROKER_URL")
+
+ CELERY_BROKER_URL = BROKER_URL
+ CELERY_RESULT_BACKEND = BROKER_URL
+ CELERY_ACCEPT_CONTENT = ["application/json"]
+ CELERY_RESULT_SERIALIZER = "json"
+ CELERY_TASK_SERIALIZER = "json"
+
+ # Config for old query methods
+ # TT Websites Models path
+ TT_MODELS_PATH = "../../../models/TokyoTechies/"
+ # text-davinci-003 or gpt-4
+ TT_MODEL_NAME = "text-davinci-003"
+ # Klever Models path
+ KLEVER_MODELS_PATH = "../../../models/Klever/"
+ # text-davinci-003 or gpt-4
+ KLEVER_MODEL_NAME = "text-davinci-003"
+
+
+ # Config for new embedding methods
+ KLEVER_EMBEDDING_MODEL = (
+     "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
+ )
+ KLEVER_EMBEDDING_CHAT_MODEL = "gpt-3.5-turbo"  # "gpt-4"
+ KLEVER_FILEPATH_EMBEDDING = "../../../models/Klever/embeddings/Klever.csv"
+ KLEVER_TRAINING_VERSION = "../../../models/Klever/_version.txt"
+
+ # Config for new embedding methods
+ TT_EMBEDDING_MODEL = (
+     "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
+ )
+ TT_EMBEDDING_CHAT_MODEL = "gpt-3.5-turbo"  # "gpt-4"
+ TT_FILEPATH_EMBEDDING = (
+     "../../../models/TokyoTechies/embeddings/TokyoTechies.csv"
+ )
+ TT_TRAINING_VERSION = "../../../models/TokyoTechies/_version.txt"
+
+ INTRODUCTION_MESSAGE = (
+     "You are a chatbot of {service}. "
+     "Use the below articles on the {service} to answer the subsequent question. "  # noqa: E501
+     "If the answer cannot be found in the articles, write sorry that I cannot answer your request, please contact our support team for further assistance. "  # noqa: E501
+     r'If an answer is found, add embedding title in this format "[Title](URL)" to the end of an answer and ignore the same title.'  # noqa: E501
+ )
+ SYSTEM_CONTENT = "You answer questions about {service}"
+ NEXT_ARTICLE = "\n{service} article section:\n--\n{string}\n--"
+ TOKEN_BUDGET = 4096 - 500
+
+ # max token
+ MAX_TOKEN = 200
+
+ # CELERYD_TASK_SOFT_TIME_LIMIT = 3
+ # Kill anything longer than 10 seconds:
+ # CELERYD_TASK_TIME_LIMIT = 10
+ # After 2 hours remove the task result:
+ # CELERY_TASK_RESULT_EXPIRES = 60 * 60 * 2
src/backend/TTChatBot/config/settings/staging.py ADDED
File without changes
src/backend/TTChatBot/config/urls.py ADDED
@@ -0,0 +1,62 @@
+ """
+ URL configuration for TTChatBot project.
+
+ The `urlpatterns` list routes URLs to views. For more information please see:
+ https://docs.djangoproject.com/en/4.2/topics/http/urls/
+ Examples:
+ Function views
+     1. Add an import:  from my_app import views
+     2. Add a URL to urlpatterns:  path('', views.home, name='home')
+ Class-based views
+     1. Add an import:  from other_app.views import Home
+     2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+ Including another URLconf
+     1. Import the include() function: from django.urls import include, path
+     2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+ """
+ from django.conf import settings
+ from django.conf.urls.static import static
+ from django.urls import (
+     include,
+     path,
+ )
+ from drf_yasg import openapi
+ from drf_yasg.generators import OpenAPISchemaGenerator
+ from drf_yasg.views import get_schema_view
+ from rest_framework import permissions
+
+
+ class BothHttpAndHttpsSchemaGenerator(OpenAPISchemaGenerator):
+     def get_schema(self, request=None, public=False):
+         schema = super().get_schema(request, public)
+         schema.schemes = ["http", "https"]
+         return schema
+
+
+ schema_view = get_schema_view(
+     openapi.Info(
+         title="Tokyo Techies Chatbot",
+         default_version="v1",
+         description="API documentation for Tokyo Techies Chatbot API",
+     ),
+     public=True,
+     generator_class=BothHttpAndHttpsSchemaGenerator,
+     permission_classes=[permissions.AllowAny],
+ )
+
+ urlpatterns = [
+     path("api/v1/", include("chatbot.urls")),
+     # Swagger URLs
+     path(
+         "",
+         schema_view.with_ui("swagger", cache_timeout=0),
+         name="schema-swagger-ui",
+     ),
+ ]
+
+ # Include static files serving only during development
+ if settings.DEBUG:
+     urlpatterns += static(
+         settings.STATIC_URL,
+         document_root=settings.STATIC_ROOT,
+     )
src/backend/TTChatBot/config/wsgi.py ADDED
@@ -0,0 +1,16 @@
+ """
+ WSGI config for TTChatBot project.
+
+ It exposes the WSGI callable as a module-level variable named ``application``.
+
+ For more information on this file, see
+ https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
+ """
+
+ import os
+
+ from django.core.wsgi import get_wsgi_application
+
+ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
+
+ application = get_wsgi_application()
src/backend/TTChatBot/manage.py ADDED
@@ -0,0 +1,22 @@
+ #!/usr/bin/env python
+ """Django's command-line utility for administrative tasks."""
+ import os
+ import sys
+
+
+ def main():
+     """Run administrative tasks."""
+     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
+     try:
+         from django.core.management import execute_from_command_line
+     except ImportError as exc:
+         raise ImportError(
+             "Couldn't import Django. Are you sure it's installed and "
+             "available on your PYTHONPATH environment variable? Did you "
+             "forget to activate a virtual environment?",
+         ) from exc
+     execute_from_command_line(sys.argv)
+
+
+ if __name__ == "__main__":
+     main()
src/backend/TTChatBot/storage/.gitkeep ADDED
File without changes
src/frontend/.gitkeep ADDED
File without changes
src/frontend/.prettierignore ADDED
@@ -0,0 +1 @@
+ public/**/*
src/frontend/.prettierrc ADDED
@@ -0,0 +1,4 @@
+ {
+   "semi": false,
+   "singleQuote": true
+ }
src/frontend/.sample-env ADDED
@@ -0,0 +1 @@
+ BACKEND_API_URL=http://localhost:8000/
src/frontend/Dockerfile ADDED
@@ -0,0 +1,22 @@
+ ARG NODE_VERSION=18.16.0
+ ARG ALPINE_VERSION=3.17.2
+
+ FROM node:${NODE_VERSION}-alpine AS node
+ FROM alpine:${ALPINE_VERSION}
+
+ COPY --from=node /usr/lib /usr/lib
+ COPY --from=node /usr/local/lib /usr/local/lib
+ COPY --from=node /usr/local/include /usr/local/include
+ COPY --from=node /usr/local/bin /usr/local/bin
+
+ # create destination directory
+ RUN mkdir -p /src/frontend
+ WORKDIR /src/frontend
+
+ # copy the app
+ COPY . /src/frontend
+ RUN npm install
+
+ EXPOSE 3000
+
+ CMD [ "npm", "start" ]
src/frontend/environments/dev/build.args ADDED
@@ -0,0 +1 @@
+ BACKEND_API_URL=https://www.chatbot-api.dev.aws.tokyotechies.co.jp/
src/frontend/environments/prod/build.args ADDED
@@ -0,0 +1 @@
+ BACKEND_API_URL=https://www.chatbot-api.tokyotechies.com/
src/frontend/next-env.d.ts ADDED
@@ -0,0 +1,5 @@
+ /// <reference types="next" />
+ /// <reference types="next/image-types/global" />
+
+ // NOTE: This file should not be edited
+ // see https://nextjs.org/docs/basic-features/typescript for more information.
src/frontend/next.config.js ADDED
@@ -0,0 +1,11 @@
+ /** @type {import('next').NextConfig} */
+ const nextConfig = {
+   reactStrictMode: true,
+   i18n: {
+     locales: ['en', 'ja'],
+     defaultLocale: 'en',
+     localeDetection: false,
+   },
+ }
+
+ module.exports = nextConfig
src/frontend/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
src/frontend/package.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "name": "tokyo-techies-chatbot",
+   "version": "0.1.0",
+   "private": true,
+   "scripts": {
+     "dev": "next dev",
+     "build": "next build",
+     "start": "next start",
+     "lint": "next lint"
+   },
+   "dependencies": {
+     "@types/node": "20.5.3",
+     "@types/react": "18.2.21",
+     "@types/react-dom": "18.2.7",
+     "autoprefixer": "10.4.15",
+     "axios": "^0.27.2",
+     "eslint": "8.47.0",
+     "eslint-config-next": "13.4.19",
+     "eslint-config-prettier": "^8.5.0",
+     "eslint-plugin-prettier": "^4.0.0",
+     "next": "13.4.19",
+     "postcss": "8.4.28",
+     "prettier": "^2.7.0",
+     "react": "18.2.0",
+     "react-dom": "18.2.0",
+     "react-markdown": "^8.0.4",
+     "tailwindcss": "3.3.3",
+     "typescript": "5.1.6"
+   }
+ }
src/frontend/postcss.config.js ADDED
@@ -0,0 +1,6 @@
+ module.exports = {
+   plugins: {
+     tailwindcss: {},
+     autoprefixer: {},
+   },
+ }
src/frontend/public/favicon.webp ADDED
src/frontend/public/locales/en.ts ADDED
@@ -0,0 +1,15 @@
+ export default {
+   name: 'Techie ',
+   askMe: 'Ask me anything',
+   connected: 'You are connected with a virtual assistant',
+   greeting: 'Hi there! 😊 \n' +
+     'I\'m Techie - a virtual assistant here to help you with anything related to Tokyo Techies.\n' +
+     'If you have any questions, need information, or just want to chat, feel free to ask me!\n' +
+     'How can I help you?',
+   placeholder: 'Type your question...',
+   maintenance: 'Sorry, we are under maintenance!',
+   year: '/',
+   month: '/',
+   day: '',
+   edition: ' version'
+ }