pmkhanh7890 committed
Commit 19cf5e6 · 1 Parent(s): 936d627

Freemind demo

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitignore +251 -0
  2. models/freemind/embeddings/freemind.csv +0 -0
  3. requirements.txt +33 -0
  4. src/AI/__init__.py +0 -0
  5. src/AI/ai_configs.py +84 -0
  6. src/AI/chatbot_demo.py +112 -0
  7. src/AI/embedding.py +323 -0
  8. src/AI/evaluation.py +101 -0
  9. src/AI/klever_search.py +73 -0
  10. src/AI/parsing.py +7 -0
  11. src/AI/search.py +219 -0
  12. src/AI/training.py +95 -0
  13. src/__init__.py +0 -0
  14. src/backend/TTChatBot/.sample-env +24 -0
  15. src/backend/TTChatBot/chatbot/__init__.py +0 -0
  16. src/backend/TTChatBot/chatbot/admin.py +3 -0
  17. src/backend/TTChatBot/chatbot/apps.py +242 -0
  18. src/backend/TTChatBot/chatbot/exceptions.py +10 -0
  19. src/backend/TTChatBot/chatbot/migrations/__init__.py +0 -0
  20. src/backend/TTChatBot/chatbot/serializers.py +25 -0
  21. src/backend/TTChatBot/chatbot/tasks.py +79 -0
  22. src/backend/TTChatBot/chatbot/urls.py +31 -0
  23. src/backend/TTChatBot/chatbot/utils.py +108 -0
  24. src/backend/TTChatBot/chatbot/views.py +199 -0
  25. src/backend/TTChatBot/config/__init__.py +3 -0
  26. src/backend/TTChatBot/config/asgi.py +16 -0
  27. src/backend/TTChatBot/config/celery.py +24 -0
  28. src/backend/TTChatBot/config/settings/__init__.py +19 -0
  29. src/backend/TTChatBot/config/settings/common.py +132 -0
  30. src/backend/TTChatBot/config/settings/local.py +98 -0
  31. src/backend/TTChatBot/config/settings/prod.py +98 -0
  32. src/backend/TTChatBot/config/settings/staging.py +0 -0
  33. src/backend/TTChatBot/config/urls.py +62 -0
  34. src/backend/TTChatBot/config/wsgi.py +16 -0
  35. src/backend/TTChatBot/manage.py +22 -0
  36. src/backend/TTChatBot/storage/.gitkeep +0 -0
  37. src/frontend/.gitkeep +0 -0
  38. src/frontend/.prettierignore +1 -0
  39. src/frontend/.prettierrc +4 -0
  40. src/frontend/.sample-env +1 -0
  41. src/frontend/Dockerfile +22 -0
  42. src/frontend/environments/dev/build.args +1 -0
  43. src/frontend/environments/prod/build.args +1 -0
  44. src/frontend/next-env.d.ts +5 -0
  45. src/frontend/next.config.js +11 -0
  46. src/frontend/package-lock.json +0 -0
  47. src/frontend/package.json +30 -0
  48. src/frontend/postcss.config.js +6 -0
  49. src/frontend/public/favicon.webp +0 -0
  50. src/frontend/public/locales/en.ts +15 -0
.gitignore ADDED
@@ -0,0 +1,251 @@
+# For experiment of counting tokens
+src/AI/tokens.py
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# ignore training data
+src/data/tt-content-postprocess
+src/data/tt-content-debug
+src/data/tt-content
+src/scraper/tt-content-postprocess
+src/scraper/tt-content-debug
+src/scraper/tt-content
+src/scraper/tt-klever-content
+
+# staticfile
+src/backend/TTChatBot/staticfiles/
+src/backend/TTChatBot/static/
+
+# MacOS
+.DS_Store
+
+# Frontend
+# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+
+# dependencies
+src/frontend/node_modules/
+/.pnp
+.pnp.js
+
+# next.js
+src/frontend/.next/
+/out/
+
+# misc
+*.pem
+
+# debug
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+.pnpm-debug.log*
+
+# vercel
+.vercel
+
+# Pycharm
+.idea
+
+# Env vars - to be updated
+/infra/ci/secret.yaml
+
+# Local .terraform directories
+**/.terraform/*
+
+# .tfstate files
+*.tfstate
+*.tfstate.*
+
+# Exclude all .tfvars files, which are likely to contain sensitive data, such as
+# password, private keys, and other secrets. These should not be part of version
+# control as they are data points which are potentially sensitive and subject
+# to change depending on the environment.
+*.tfvars
+*.tfvars.json
+
+# Ignore override files as they are usually used to override resources locally and so
+# are not checked in
+override.tf
+override.tf.json
+*_override.tf
+*_override.tf.json
+
+# Include override files you do wish to add to version control using negated pattern
+# !example_override.tf
+
+# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
+# example: *tfplan*
+
+# Ignore CLI configuration files
+.terraformrc
+terraform.rc
+.terraform.lock.hcl
+# ignore .vscode
+.vscode
+
+# Ignore sensitive data - k8s env vars
+infra/environments/chatbot-dev/dev_secret.yaml
+infra/environments/chatbot-prod/prod_secret.yaml
+infra/environments/tt-chatbot-prod/prod_secret.yaml
+
+# yarn file
+yarn.lock
models/freemind/embeddings/freemind.csv ADDED
The diff for this file is too large to render.
requirements.txt ADDED
@@ -0,0 +1,33 @@
+# AI packages
+gradio==3.40.1 # for demo with interface
+llama-index==0.7.4 # enables the LLM to access the outside information that serves as our knowledge base
+openai==0.27.8 # for generating embeddings
+pandas==2.0.3 # for DataFrames to store article sections and embeddings
+scipy==1.11.1 # for calculating vector similarities for search
+tiktoken==0.4.0 # for counting tokens
+typing_extensions==4.5.0 # pinned to avoid langchain's "TypeError: issubclass() arg 1 must be a class"
+
+# BE/FE packages
+requests==2.31.0
+tqdm==4.65.0
+django==4.2.4
+python-dotenv==1.0.0
+beautifulsoup4==4.12.2
+# django rest API
+djangorestframework==3.14.0
+drf-yasg==1.21.7
+celery==5.3.1
+django-celery-results==2.5.1
+psycopg2-binary==2.9.6
+# message broker for celery
+redis==4.6.0
+# gevent worker pool for celery
+gevent==23.7.0
+# production
+gunicorn==21.2.0
+# cors
+django-cors-headers==4.2.0
+# convert html to text
+html2text==2020.1.16
+# serving staticfiles without using nginx
+whitenoise==6.5.0
src/AI/__init__.py ADDED
File without changes
src/AI/ai_configs.py ADDED
@@ -0,0 +1,84 @@
+"""
+Author: Khanh Phan
+Date: 2023-04-20
+"""
+import os
+import sys
+
+# MODEL PARAMETERS: https://platform.openai.com/docs/models/gpt-3-5
+MODEL_NAME = "gpt-3.5-turbo"  # Must select from MODEL_NAMES
+MODEL_NAMES = ["gpt-4", "text-davinci-003", "gpt-3.5-turbo"]
+EMBEDDING_MODEL = (
+    "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
+)
+
+# CHATBOT SERVICE
+SERVICE = "freemind"  # Must select from SERVICES
+SERVICES = ["TokyoTechies", "Klever", "Test", "freemind"]
+
+# DATA FORMATTING
+DELIMITER_TOKYOTECHIES = "Sub Section:"
+FILE_TYPE = ".txt"
+FILE_ENCODING = "utf-8"
+INTRODUCTION_MESSAGE = (
+    f"You are a chatbot of {SERVICE}. "
+    f"Use the below articles on the {SERVICE} to answer the subsequent question. "  # noqa: E501
+    "If an answer cannot be found in the articles, write sorry that I cannot answer your request, please contact our support team for further assistance."  # noqa: E501
+    r'If an answer is found, add embedding title in this format "[Title](URL)" to the end of an answer and ignore the same title.'  # noqa: E501
+)
+SYSTEM_CONTENT = f"You answer questions about {SERVICE}"
+
+# CALCULATE EMBEDDING PARAMETERS
+MAX_TOKENS = 1600  # maximum tokens for a section
+BATCH_SIZE = 1000  # up to 2048 embedding inputs per request
+TOKEN_BUDGET = 4096 - 500
+
+# TRAINING PARAMETERS
+CONTEXT_WINDOW = 4096  # Context window for the LLM.
+NUM_OUTPUTS = 512  # Number of outputs for the LLM.
+CHUNK_OVERLAP_RATIO = 0.1  # Chunk overlap as a ratio of chunk size
+TEMPERATURE = 0.0  # A parameter that controls the "creativity" or
+# randomness of the generated text. A higher temperature (e.g., 0.7)
+# results in more diverse and creative output, while a lower temperature
+# (e.g., 0.2) makes the output more deterministic and focused.
+
+sys.path.append(os.path.abspath(os.path.join("..", "data")))
+
+# PATHS
+if SERVICE in SERVICES:
+    if MODEL_NAME in MODEL_NAMES:
+        # Path to training files:
+        FOLDERPATH_DOCUMENTS = os.path.join(
+            "data",
+            SERVICE,
+            "training_files",
+        )
+        # Path to model
+        FOLDERPATH_INDEXES = os.path.join(
+            "models",
+            SERVICE,
+            MODEL_NAME,
+        )
+        FILEPATH_EMBEDDINGS = os.path.join(
+            "models",
+            SERVICE,
+            "embeddings",
+            f"{SERVICE}.csv",
+        )
+        # For evaluation
+        FOLDERPATH_QUESTION = os.path.join(
+            "data",
+            SERVICE,
+            "evaluation",
+            "questions",
+        )
+        FOLDERPATH_QA = os.path.join(
+            "data",
+            SERVICE,
+            "evaluation",
+            "QA_" + MODEL_NAME,
+        )
+    else:
+        raise ValueError("MODEL_NAME must be in MODEL_NAMES")
+else:
+    raise ValueError("SERVICE must be in SERVICES")
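For the committed configuration (SERVICE = "freemind", MODEL_NAME = "gpt-3.5-turbo"), the block above derives the following paths. A minimal sketch, assuming the module is imported with the repository root as the working directory on a POSIX system:

import ai_configs

print(ai_configs.FOLDERPATH_DOCUMENTS)  # data/freemind/training_files
print(ai_configs.FOLDERPATH_INDEXES)    # models/freemind/gpt-3.5-turbo
print(ai_configs.FILEPATH_EMBEDDINGS)   # models/freemind/embeddings/freemind.csv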
src/AI/chatbot_demo.py ADDED
@@ -0,0 +1,112 @@
+"""
+Author: Khanh Phan
+Date: 2023-04-20
+"""
+import configparser
+import os
+
+import gradio as gr
+import openai
+from ai_configs import (
+    FOLDERPATH_INDEXES,
+    MODEL_NAME,
+)
+from llama_index import (
+    StorageContext,
+    load_index_from_storage,
+)
+
+env = configparser.ConfigParser()
+env.read(".env")
+os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
+
+def format_response(responses: dict):
+    """
+    (Optional) Format one or multiple responses from version(s) of the chatbot.
+
+    Parameters:
+        responses (dict): chatbot responses keyed by model name
+
+    Returns:
+        output (str): formatted response
+    """
+    output = ""
+    for response in responses:
+        output += response + (responses[response]) + "\n\n"
+    return output
+
+
+def chat(message, history):
+    """
+    Load the indexes into the chatbot and get a response.
+
+    Parameters:
+        message (str): question to the chatbot
+        history (list): history of the whole conversation
+
+    Returns:
+        history (list): history of the whole conversation (for displaying)
+        history (list): state of the chatbot
+    """
+    history = history or []
+    # rebuild storage context
+    FOLDERPATH_INDEXES_EN = FOLDERPATH_INDEXES + "_en"
+    storage_context = StorageContext.from_defaults(
+        persist_dir=FOLDERPATH_INDEXES_EN,
+    )
+
+    # load index into memory
+    index = load_index_from_storage(storage_context)
+
+    # open QA engine
+    query_engine = index.as_query_engine()
+
+    # Get the response from OpenAI
+    response_en = query_engine.query(message)
+    print("Q: ", message)
+    print("A: ", response_en.response, "\n")
+
+    # ---------- JAPANESE
+    # rebuild storage context
+    FOLDERPATH_INDEXES_JA = FOLDERPATH_INDEXES + "_ja"
+    storage_context_ja = StorageContext.from_defaults(
+        persist_dir=FOLDERPATH_INDEXES_JA,
+    )
+
+    # load index into memory
+    index_ja = load_index_from_storage(storage_context_ja)
+
+    # open QA engine
+    query_engine_ja = index_ja.as_query_engine()
+
+    # Get the response from OpenAI
+    response_ja = query_engine_ja.query(message)
+    print("Q: ", message)
+    print("A: ", response_ja.response, "\n")
+    ######
+
+    # Format the response
+    responses = {
+        f"---{MODEL_NAME} (English)---": response_en.response,
+        f"---{MODEL_NAME} (Japanese)---": response_ja.response,
+    }
+
+    response = format_response(responses)
+
+    # Append the response to history (to show in the UI)
+    history.append((message, response))
+
+    return history, history
+
+
+# Launch the chat with gradio, which provides a shareable chatbot UI
+chatgpt = gr.Interface(
+    chat,
+    ["text", "state"],
+    ["chatbot", "state"],
+    allow_flagging="never",
+)
+
+chatgpt.launch(share=True)  # share=True to share the chat publicly
src/AI/embedding.py ADDED
@@ -0,0 +1,323 @@
+"""
+Author: Khanh Phan
+Date: 2023-07-20
+"""
+
+import configparser
+import os
+
+import openai
+import pandas as pd
+import tiktoken
+from ai_configs import (
+    BATCH_SIZE,
+    DELIMITER_TOKYOTECHIES,
+    EMBEDDING_MODEL,
+    FILE_ENCODING,
+    FILE_TYPE,
+    FILEPATH_EMBEDDINGS,
+    FOLDERPATH_DOCUMENTS,
+    MAX_TOKENS,
+    MODEL_NAME,
+    SERVICE,
+)
+
+
+def list_files(directory: str) -> list:
+    files = []
+    for file in os.listdir(directory):
+        # keep only text files
+        if file.endswith(FILE_TYPE):
+            files.append(file)
+    return files
+
+
+def read_file(file_path: str) -> str:
+    # Open the file
+    file = open(file_path, encoding=FILE_ENCODING)
+
+    # read all lines at once
+    file_content = file.read()
+
+    # close the file
+    file.close()
+    return file_content
+
+
+def num_tokens(text: str, model: str = MODEL_NAME) -> int:
+    """Return the number of tokens in a string."""
+    encoding = tiktoken.encoding_for_model(model)
+    return len(encoding.encode(text))
+
+
+def truncated_string(
+    string: str,
+    model: str,
+    max_tokens: int,
+    print_warning: bool = True,
+) -> str:
+    """Truncate a string to a maximum number of tokens."""
+    encoding = tiktoken.encoding_for_model(model)
+    encoded_string = encoding.encode(string)
+    truncated_string = encoding.decode(encoded_string[:max_tokens])
+    if print_warning and len(encoded_string) > max_tokens:
+        print(
+            f"Warning: Truncated string from {len(encoded_string)} tokens to {max_tokens} tokens.",  # noqa: E501
+        )
+    return truncated_string
+
+
+def determine_delimiter(
+    strings: str,
+    service: str = SERVICE,
+) -> str:
+    """
+    Determine the delimiter of the file.
+    """
+    if service == "TokyoTechies":
+        return DELIMITER_TOKYOTECHIES
+    elif service == "Klever":
+        if "# " in strings:
+            return "# "
+        elif "## " in strings:
+            return "## "
+        elif "### " in strings:
+            return "### "
+        else:
+            # No heading delimiter found; the caller falls back to
+            # treating the whole file as a single chunk.
+            return False
+    elif service == "freemind":
+        return "-----"
+    else:
+        raise ValueError(f"Unknown service: {service}")
+
+
+def format_content_Tokyo_Techies(
+    strings: list,
+    content: str,
+    max_tokens: int = 1000,
+    model: str = MODEL_NAME,
+):
+    """
+    Format content for Tokyo Techies.
+    """
+    chunks = content.split(determine_delimiter(content))
+    # TODO: add to config
+    if "URL:" in chunks[0] and "Language:" in chunks[0]:
+        url = (
+            "<url>"
+            + (content.split("URL:"))[1].split("Language")[0].strip()
+            + "</url>"
+        )  # get url
+    else:
+        url = "<url>No URL</url>"
+
+    for chunk in chunks[1:]:
+        chunk = (
+            chunk.strip()
+        )  # remove leading and trailing whitespace and newlines
+        if not chunk:
+            continue
+
+        # get section title (first row) and content (from 2nd row)
+        section_title = chunk.split("\n")[0]
+        titles = [url, section_title]
+        section_content = chunk.split("\n")[1:]
+        section_content = "\n".join(section_content)
+
+        if num_tokens(section_content) > max_tokens:
+            print(
+                f"{titles} ({num_tokens(section_content)}) has more than {max_tokens} tokens",  # noqa: E501
+            )
+            section_content = truncated_string(
+                section_content,
+                model=model,
+                max_tokens=max_tokens,
+            )
+
+        string = "\n\n".join(titles + [section_content])
+        strings.extend([string])
+        print(string)
+    return strings
+
+
+def format_content_klever(
+    strings: list,
+    content: str,
+    max_tokens: int = 1000,
+    model: str = MODEL_NAME,
+):
+    """
+    Format content for Klever.
+    """
+
+    # Add an image tag to image links
+    content = content.replace("![](", "![image](")
+
+    delimiter = determine_delimiter(content)
+    if delimiter:
+        chunks = content.split(delimiter)
+    else:
+        chunks = [content]
+
+    # TODO: add to config
+    url = ""
+    if "Title:" in chunks[0] and "URL:" in chunks[0]:
+        title = "Title: " + (
+            (content.split("Title:"))[1].split("URL")[0].strip()
+        )
+        if "Language:" in chunks[0]:
+            url = (
+                "<url>"
+                + (content.split("URL:"))[1].split("Language:")[0].strip()
+                + "</url>"
+            )
+    else:
+        title = ""
+
+    # Extract content between the title and the first sub-section
+    section_content = (chunks[0].split("-----"))[1].strip()
+    if section_content != "":
+        titles = [title, url]
+        string = "\n\n".join(titles + [section_content])
+        # print(f"----------\n{string}\n")
+        strings.extend([string])
+
+    # Extract content in every sub-section
+    for chunk in chunks[1:]:
+        chunk = (
+            chunk.strip()
+        )  # remove leading and trailing whitespace and newlines
+        if not chunk:
+            continue
+
+        # get section title (first row) and content (from 2nd row)
+        section_title = chunk.split("\n")[0]
+        titles = [title + " > " + section_title, url]
+        section_content = chunk.split("\n")[1:]
+        section_content = "\n".join(section_content)
+
+        if num_tokens(section_content) > max_tokens:
+            print(
+                f"{titles} ({num_tokens(section_content)}) has more than {max_tokens} tokens",  # noqa: E501
+            )
+            section_content = truncated_string(
+                section_content,
+                model=model,
+                max_tokens=max_tokens,
+            )
+
+        string = "\n\n".join(titles + [section_content])
+        # print(f"----------\n{string}\n")
+        strings.extend([string])
+    return strings
+
+
+def format_content_freemind(
+    strings: list,
+    content: str,
+    max_tokens: int = 1000,
+    model: str = MODEL_NAME,
+):
+    """
+    Format content for freemind.
+    """
+    chunks = content.split(determine_delimiter(content))
+    for chunk in chunks:
+        chunk = (
+            chunk.strip()
+        )  # remove leading and trailing whitespace and newlines
+        if not chunk:
+            continue
+
+        if num_tokens(chunk) > max_tokens:
+            print(
+                f"{chunk} ({num_tokens(chunk)}) has more than {max_tokens} tokens",  # noqa: E501
+            )
+            chunk = truncated_string(
+                chunk,
+                model=model,
+                max_tokens=max_tokens,
+            )
+
+        string = chunk
+        # print(f"----------\n{string}\n")
+        strings.extend([string])
+    return strings
+
+
+def format_content(
+    directory: str,
+    max_tokens: int = 1000,
+    model: str = MODEL_NAME,
+) -> list[str]:
+    strings = []
+
+    # read files
+    files = list_files(directory)
+    for file in files:
+        print(f"File: {file}")
+        file_content = read_file(
+            os.path.join(
+                FOLDERPATH_DOCUMENTS,
+                file,
+            ),
+        )
+
+        if SERVICE == "TokyoTechies":
+            strings = format_content_Tokyo_Techies(
+                strings,
+                file_content,
+                max_tokens,
+                model,
+            )
+        elif SERVICE == "Klever":
+            strings = format_content_klever(
+                strings,
+                file_content,
+                max_tokens,
+                model,
+            )
+        elif SERVICE == "freemind":
+            strings = format_content_freemind(
+                strings,
+                file_content,
+                max_tokens,
+                model,
+            )
+
+    return strings
+
+
+def embed_data():
+    # read config
+    env = configparser.ConfigParser()
+    env.read(".env")
+    os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+    openai.api_key = os.environ["OPENAI_API_KEY"]
+
+    formatted_strings = format_content(FOLDERPATH_DOCUMENTS, MAX_TOKENS)
+
+    embeddings = []
+    for batch_start in range(0, len(formatted_strings), BATCH_SIZE):
+        batch_end = batch_start + BATCH_SIZE
+        batch = formatted_strings[batch_start:batch_end]
+        print(f"Batch {batch_start} to {batch_end-1}")
+        response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
+        for i, be in enumerate(response["data"]):
+            assert (
+                i == be["index"]
+            )  # double-check embeddings are in the same order as the input
+        batch_embeddings = [e["embedding"] for e in response["data"]]
+        embeddings.extend(batch_embeddings)
+
+    df = pd.DataFrame({"text": formatted_strings, "embedding": embeddings})
+
+    # save document chunks and embeddings
+    SAVE_PATH = FILEPATH_EMBEDDINGS
+    df.to_csv(SAVE_PATH, index=False)
+
+
+embed_data()
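embed_data() writes each embedding into the CSV as a stringified Python list, so any consumer has to parse it back into a list of floats. A minimal sketch of the round-trip, using the path FILEPATH_EMBEDDINGS resolves to for the freemind service:

import ast

import pandas as pd

df = pd.read_csv("models/freemind/embeddings/freemind.csv")
vector = ast.literal_eval(df.loc[0, "embedding"])  # str -> list[float]
print(len(vector))  # text-embedding-ada-002 vectors have 1536 dimensions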
src/AI/evaluation.py ADDED
@@ -0,0 +1,101 @@
+"""
+Author: Khanh Phan
+Date: 2023-06-20
+"""
+import configparser
+import os
+
+import openai
+from ai_configs import (
+    FOLDERPATH_INDEXES,
+    FOLDERPATH_QA,
+    FOLDERPATH_QUESTION,
+)
+from llama_index import (
+    StorageContext,
+    load_index_from_storage,
+)
+
+env = configparser.ConfigParser()
+env.read(".env")
+os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
+
+def get_question_files(path: str = FOLDERPATH_QUESTION) -> None:
+    """
+    Check whether the given path is valid (a folder or a text file).
+    If it is valid, generate answers for the questions it contains.
+
+    Parameters:
+        path (str): Path to a question file/folder
+
+    Returns:
+        None
+    """
+    if os.path.isdir(path) is True:
+        for file in os.listdir(path):
+            if file.endswith(".txt"):
+                print("Generating answers from: ", os.path.join(path, file))
+                generate_answers(os.path.join(path, file))
+    elif os.path.isfile(path) is True and path.endswith(".txt"):
+        print("Generating answers from:", path)
+        generate_answers(path)
+    else:
+        raise Exception("Input is neither a folder nor a text file")
+
+
+def generate_answers(
+    file_directory: str,
+    output_path=FOLDERPATH_QA,
+) -> None:
+    """
+    Get the list of questions from file(s),
+    then generate the answers and write them to file(s).
+    These answers are used for evaluation.
+
+    Parameters:
+        file_directory (str): Path to a question file
+        output_path (str): folder to write the answers to
+
+    Returns:
+        None
+    """
+
+    # Load the questions
+    question_file = open(file_directory)
+    lines = question_file.readlines()
+
+    # Create a file to write the answers
+    file_name = os.path.basename(file_directory)
+    qa_file = open(os.path.join(output_path, file_name), "w")
+
+    count = 0
+    for line in lines:  # for each question
+        count += 1
+        # generate the answer
+        response = query_engine.query(line)
+
+        # format the output
+        question = "Q" + str(count) + ": " + str(line)
+        answer = "A" + str(count) + ": " + str(response.response)
+
+        response = question + answer + "\n"
+        print(response)
+
+        # write Q&A to file
+        qa_file.writelines(response)
+
+    question_file.close()
+    qa_file.close()
+
+
+# rebuild storage context
+storage_context = StorageContext.from_defaults(persist_dir=FOLDERPATH_INDEXES)
+
+# load index
+index = load_index_from_storage(storage_context)
+query_engine = index.as_query_engine()
+
+get_question_files()
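The flow above expects one question per line in each .txt question file. A minimal sketch of preparing such a file; the file name and questions are illustrative:

questions = [
    "What is freemind?\n",
    "How do I contact the support team?\n",
]
# FOLDERPATH_QUESTION resolves to data/freemind/evaluation/questions
with open("data/freemind/evaluation/questions/demo.txt", "w") as f:
    f.writelines(questions)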
src/AI/klever_search.py ADDED
@@ -0,0 +1,73 @@
+import ast  # for converting embeddings saved as strings back to arrays
+import configparser
+import os
+import time
+
+import gradio as gr
+import openai  # for calling the OpenAI API
+import pandas as pd  # for storing text and embeddings data
+import tiktoken  # for counting tokens
+from ai_configs import (
+    EMBEDDING_MODEL,
+    FILEPATH_EMBEDDINGS,
+    INTRODUCTION_MESSAGE,
+    MODEL_NAME,
+    SYSTEM_CONTENT,
+    TOKEN_BUDGET,
+)
+from scipy import spatial  # for calculating vector similarities for search
+
+env = configparser.ConfigParser()
+env.read(".env")
+os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
+model_name = MODEL_NAME
+# Read the embedding file
+embedding_data = pd.read_csv(FILEPATH_EMBEDDINGS)
+# Convert embeddings from CSV str type back to list type
+embedding_data["embedding"] = embedding_data["embedding"].apply(
+    ast.literal_eval,
+)
+print("Finished loading embedding data!")
+
+
+# search function
+def strings_ranked_by_relatedness(
+    query: str,
+    df: pd.DataFrame,
+    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
+    top_n: int = 3,
+) -> tuple[list[str], list[float]]:
+    """Returns a list of strings and relatednesses,
+    sorted from most related to least.
+    """
+    query_embedding_response = openai.Embedding.create(
+        model=EMBEDDING_MODEL,
+        input=query,
+    )
+    query_embedding = query_embedding_response["data"][0]["embedding"]
+    strings_and_relatednesses = [
+        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
+        for i, row in df.iterrows()
+    ]
+    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
+    strings, relatednesses = zip(*strings_and_relatednesses)
+
+    return strings[:top_n], relatednesses[:top_n]
+
+
+query = "what is Klever?"
+strings, relatedness = strings_ranked_by_relatedness(query, embedding_data)
+for string in strings:
+    if "</url>" in string:
+        string = string.split("</url>")[0].replace("<url>", "URL: ")
+    print(string)
+    print("----------------")
+print(relatedness)
src/AI/parsing.py ADDED
@@ -0,0 +1,7 @@
+import aspose.words as aw
+
+# Load the PDF document from disk.
+doc = aw.Document("TestDocument.pdf")
+
+# Save the document in Markdown format.
+doc.save("output.md")
src/AI/search.py ADDED
@@ -0,0 +1,219 @@
+import ast  # for converting embeddings saved as strings back to arrays
+import configparser
+import os
+import time
+
+import gradio as gr
+import openai  # for calling the OpenAI API
+import pandas as pd  # for storing text and embeddings data
+import tiktoken  # for counting tokens
+from ai_configs import (
+    EMBEDDING_MODEL,
+    FILEPATH_EMBEDDINGS,
+    INTRODUCTION_MESSAGE,
+    MODEL_NAME,
+    SYSTEM_CONTENT,
+    TOKEN_BUDGET,
+)
+from scipy import spatial  # for calculating vector similarities for search
+
+env = configparser.ConfigParser()
+env.read(".env")
+os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
+model_name = MODEL_NAME
+# Read the embedding file
+embedding_data = pd.read_csv(FILEPATH_EMBEDDINGS)
+# Convert embeddings from CSV str type back to list type
+embedding_data["embedding"] = embedding_data["embedding"].apply(
+    ast.literal_eval,
+)
+print("Finished loading embedding data!")
+
+
+# search function
+def strings_ranked_by_relatedness(
+    query: str,
+    df: pd.DataFrame,
+    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
+    top_n: int = 3,
+) -> tuple[list[str], list[float]]:
+    """Returns a list of strings and relatednesses,
+    sorted from most related to least.
+    """
+    query_embedding_response = openai.Embedding.create(
+        model=EMBEDDING_MODEL,
+        input=query,
+    )
+    query_embedding = query_embedding_response["data"][0]["embedding"]
+    strings_and_relatednesses = [
+        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
+        for i, row in df.iterrows()
+    ]
+    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
+    strings, relatednesses = zip(*strings_and_relatednesses)
+
+    return strings[:top_n], relatednesses[:top_n]
+
+
+def num_tokens(text: str, model: str = MODEL_NAME) -> int:
+    """Return the number of tokens in a string."""
+    encoding = tiktoken.encoding_for_model(model)
+    return len(encoding.encode(text))
+
+
+def query_message(
+    query: str,
+    df: pd.DataFrame,
+    model: str,
+    token_budget: int,
+) -> str:
+    """Return a message for GPT,
+    with relevant source texts pulled from a dataframe.
+    """
+    strings, _ = strings_ranked_by_relatedness(query, df)
+
+    """ example:
+    # strings, relatednesses = strings_ranked_by_relatedness(
+    #     "what solutions does TT provide?",
+    #     df,
+    #     top_n=5,
+    # )
+    # for string, relatedness in zip(strings, relatednesses):
+    #     print(f"{relatedness=:.3f}\n{string}\n")
+    """
+
+    question = f"\n\nQuestion: {query}"
+    message = INTRODUCTION_MESSAGE
+    for string in strings:
+        next_article = f"\nFreemind article section:\n--\n{string}\n--"
+        if (
+            num_tokens(message + next_article + question, model=model)
+            > token_budget
+        ):
+            break
+        else:
+            message += next_article
+    return message + question
+
+
+def get_response(
+    query: str,
+    df: pd.DataFrame,
+    model: str = MODEL_NAME,
+    token_budget: int = TOKEN_BUDGET,
+    print_message: bool = False,
+) -> tuple[str, str]:
+    """Answers a query using GPT and a dataframe of
+    relevant texts and embeddings.
+    """
+    message = query_message(query, df, model=model, token_budget=token_budget)
+
+    if print_message:
+        print(message)
+    messages = [
+        {"role": "system", "content": SYSTEM_CONTENT},
+        {"role": "user", "content": message},
+    ]
+
+    response = openai.ChatCompletion.create(
+        model=model,
+        messages=messages,
+        temperature=0,
+    )
+    response_message = response["choices"][0]["message"]["content"]
+    print(f'Total used tokens: {response["usage"]["total_tokens"]}')
+    return response_message, message
+
+
+# Code for getting the chatbot's response ends here. The code below is for the UI only.
+def format_response(responses: dict):
+    """
+    (Optional) Format one or multiple responses from version(s) of the chatbot.
+
+    Parameters:
+        responses (dict): chatbot responses keyed by model name
+
+    Returns:
+        output (str): formatted response
+    """
+    output = ""
+    for response in responses:
+        output += response + (responses[response]) + "\n\n"
+    return output
+
+
+with gr.Blocks() as chatgpt:
+    chatbot = gr.Chatbot(label="Freemind Bot", height=500)
+    message = gr.Textbox(
+        label="Enter your chat here",
+        placeholder="Press enter to send a message",
+        show_copy_button=True,
+    )
+    radio = gr.Radio(
+        [
+            "Full model (most capable but slow & expensive)",
+            "Lite model (Capable but fast & cheap)",
+        ],
+        label="Choose a chatbot model",
+        value="Lite model (Capable but fast & cheap)",
+    )
+    clear = gr.Button("Clear all chat")
+
+    def choice_model(choice):
+        if choice == "Full model (most capable but slow & expensive)":
+            return "gpt-4"
+        else:
+            return "gpt-3.5-turbo"
+
+    def get_user_message(user_message, history):
+        return "", history + [[user_message, None]]
+
+    def show_response(history, model):
+        message = history[-1][0]
+        model = choice_model(model)
+        print(f"model: {model}")
+        # Get the response from OpenAI
+        response, _ = get_response(
+            query=message,
+            df=embedding_data,
+            model=model,
+        )
+
+        # Correct URL
+        # I will remove this workaround after BE/FE fix this bug
+        response = response.replace("help/document/", "wiki/1-")
+        response = response.replace(">>", ">")
+        print("Q: ", message, "\nA: ", response, "\n")
+
+        # Format the response
+        # responses = {
+        #     f"[{MODEL_NAME}] → ": response,
+        # }
+        # response = format_response(responses)
+
+        history[-1][1] = ""
+        for character in response:
+            history[-1][1] += character
+            time.sleep(0.01)
+            yield history
+
+    message.submit(
+        get_user_message,
+        [message, chatbot],
+        [message, chatbot],
+        queue=False,
+    ).then(
+        show_response,
+        [chatbot, radio],
+        chatbot,
+    )
+    clear.click(lambda: None, None, chatbot, queue=False)
+
+
+chatgpt.queue()
+chatgpt.launch(share=True)  # share=True to share the chat publicly
src/AI/training.py ADDED
@@ -0,0 +1,95 @@
+"""
+Author: Khanh Phan
+Date: 2023-04-20
+"""
+
+import configparser
+import os
+
+import openai
+from ai_configs import (  # CHUNK_SIZE_LIMIT,
+    CHUNK_OVERLAP_RATIO,
+    CONTEXT_WINDOW,
+    FOLDERPATH_DOCUMENTS,
+    FOLDERPATH_INDEXES,
+    MODEL_NAME,
+    NUM_OUTPUTS,
+    TEMPERATURE,
+)
+from langchain import OpenAI
+from llama_index import (
+    GPTVectorStoreIndex,
+    LLMPredictor,
+    PromptHelper,
+    ServiceContext,
+    SimpleDirectoryReader,
+)
+
+env = configparser.ConfigParser()
+env.read(".env")
+os.environ["OPENAI_API_KEY"] = env["OpenAI"]["OPENAI_KEY_TT"]
+openai.api_key = os.environ["OPENAI_API_KEY"]
+
+
+def construct_index(
+    folderpath_documents: str,
+    folderpath_index: str,
+) -> GPTVectorStoreIndex:
+    """
+    Construct the index for all the documents.
+
+    Parameters:
+        folderpath_documents (str): Path to a training folder
+        folderpath_index (str): Path to a folder to save the model
+
+    Returns:
+        document_index (GPTVectorStoreIndex): the model
+    """
+
+    # Create a prompt helper with initial parameters for the chatbot
+    prompt_helper = PromptHelper(
+        context_window=CONTEXT_WINDOW,
+        num_output=NUM_OUTPUTS,
+        chunk_overlap_ratio=CHUNK_OVERLAP_RATIO,
+        # chunk_size_limit=CHUNK_SIZE_LIMIT,
+    )
+
+    # Configure the LLM provider and model.
+    llm_predictor = LLMPredictor(
+        llm=OpenAI(
+            temperature=TEMPERATURE,
+            model_name=MODEL_NAME,
+            max_tokens=NUM_OUTPUTS,
+        ),
+    )
+
+    # Create the service context
+    service_context = ServiceContext.from_defaults(
+        llm_predictor=llm_predictor,
+        prompt_helper=prompt_helper,
+    )
+
+    # Load the documents
+    documents = SimpleDirectoryReader(folderpath_documents).load_data()
+
+    # Generate the index from the documents
+    document_index = GPTVectorStoreIndex.from_documents(
+        documents,
+        service_context=service_context,
+    )
+
+    # Save the index to disk
+    document_index.storage_context.persist(persist_dir=folderpath_index)
+
+    return document_index
+
+
+document_index = construct_index(FOLDERPATH_DOCUMENTS, FOLDERPATH_INDEXES)
+
+"""
+# These lines are for testing purposes only.
+query = input("What do you want to ask? ")
+query_engine = document_index.as_query_engine()
+response = query_engine.query("what are the articles about?")
+print(response)
+"""
src/__init__.py ADDED
File without changes
src/backend/TTChatBot/.sample-env ADDED
@@ -0,0 +1,24 @@
+ENV_NAME='local'
+
+DJANGO_SETTINGS_MODULE=config.settings.local
+DJANGO_SECRET_KEY='django-insecure-xfs(py=^axctf8(#5yd-svkffy3ft0u0z6^*&vx@g#)fttc#sl'
+DJANGO_DEBUG=True
+
+# Database
+# DB_NAME='postgres'
+# DB_USER='postgres'
+# DB_PASSWORD='postgres'
+# DB_HOST='127.0.0.1'
+# DB_PORT=5678
+
+# Celery
+# CELERY_BROKER_URL = 'redis://localhost:6379/0'
+# CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
+
+# Other API
+OPEN_AI_KEY='KEY'
+
+# Redis config
+REDIS_HOST = redis
+REDIS_PORT = 6380
+BROKER_URL = redis://${REDIS_HOST}:${REDIS_PORT}/0
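A minimal sketch of reading these variables once the sample is copied to .env; python-dotenv is pinned in requirements.txt and the variable names come from the sample above (how the committed settings modules consume them is not rendered in this view):

import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
print(os.getenv("ENV_NAME"))                # local
print(os.getenv("DJANGO_SETTINGS_MODULE"))  # config.settings.local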
src/backend/TTChatBot/chatbot/__init__.py ADDED
File without changes
src/backend/TTChatBot/chatbot/admin.py ADDED
@@ -0,0 +1,3 @@
+# from django.contrib import admin
+
+# Register your models here.
src/backend/TTChatBot/chatbot/apps.py ADDED
@@ -0,0 +1,242 @@
+import ast
+import logging
+import os
+
+import openai
+import pandas as pd
+from django.apps import AppConfig
+from django.conf import settings
+from llama_index import (
+    StorageContext,
+    load_index_from_storage,
+)
+from scipy import spatial
+
+from .utils import num_tokens_from_messages
+
+# set OpenAI API key
+openai.api_key = os.environ["OPENAI_API_KEY"]
+logger = logging.getLogger(__name__)
+
+
+def load_chatgpt_index(apps_names: str, index_file: str):
+    """Function to load a chatGPT index.
+
+    Args:
+        apps_names (str): TokyoTechies or Klever
+        index_file (str): Storage index path
+    """
+    # build storage context
+    logger.info("Building %s storage context", apps_names)
+    storage_context = StorageContext.from_defaults(
+        persist_dir=index_file,
+    )
+
+    # load index
+    index = load_index_from_storage(storage_context)
+
+    query_engine = index.as_query_engine()
+    logger.info("Loading index from %s storage completed", apps_names)
+    return query_engine
+
+
+class ChatGPTEmbeddingSearchBased:
+    """ChatGPT embedding-search-based method for Kleverbot"""
+
+    def __init__(
+        self,
+        service,
+        embedding_model,
+        chat_model,
+        filepath_embedding,
+    ):
+        self.embedding_model = embedding_model
+        self.chat_model = chat_model
+        self.filepath_embedding = filepath_embedding
+        self.service = service
+        # model-related config
+        self.token_budget = settings.TOKEN_BUDGET
+        self.introduction_message = settings.INTRODUCTION_MESSAGE.format(
+            service=self.service,
+        )
+        self.system_content = settings.SYSTEM_CONTENT.format(
+            service=self.service,
+        )
+        self.next_article = settings.NEXT_ARTICLE
+        self.embedding_data = self.load_embedding_data()
+
+    def load_embedding_data(self):
+        """Load embedding data from csv"""
+        df_data = pd.read_csv(self.filepath_embedding)
+        df_data["embedding"] = df_data["embedding"].apply(
+            ast.literal_eval,
+        )
+        logger.info(
+            "Loading embeddings from %s storage completed",
+            self.filepath_embedding,
+        )
+        return df_data
+
+    # search function
+    def strings_ranked_by_relatedness(
+        self,
+        query: str,
+        df_data: pd.DataFrame,
+        relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
+        top_n: int = 3,
+    ) -> tuple[list[str], list[float]]:
+        """Returns a list of strings and relatednesses,
+        sorted from most related to least.
+        """
+        query_embedding_response = openai.Embedding.create(
+            model=self.embedding_model,
+            input=query,
+        )
+        query_embedding = query_embedding_response["data"][0]["embedding"]
+        strings_and_relatednesses = [
+            (row["text"], relatedness_fn(query_embedding, row["embedding"]))
+            for i, row in df_data.iterrows()
+        ]
+        strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
+        strings, relatednesses = zip(*strings_and_relatednesses)
+
+        return strings[:top_n], relatednesses[:top_n]
+
+    def query_message(
+        self,
+        query: str,
+        dataframe: pd.DataFrame,
+        model: str,
+        token_budget: int,
+    ) -> str:
+        """Return a message for GPT,
+        with relevant source texts pulled from a dataframe.
+        """
+        strings, _ = self.strings_ranked_by_relatedness(query, dataframe)
+        question = f"\n\nQuestion: {query}"
+        message = self.introduction_message
+        for string in strings:
+            next_article = self.next_article.format(
+                service=self.service,
+                string=string,
+            )
+            if (
+                num_tokens_from_messages(
+                    message + next_article + question,
+                    model=model,
+                )
+                > token_budget
+            ):
+                break
+            else:
+                message += next_article
+        return message + question
+
+    def get_response(
+        self,
+        query: str,
+        data: pd.DataFrame,
+        model: str = "gpt-3.5-turbo",
+        token_budget: int = 4096 - 500,
+        log_message: bool = False,
+    ):
+        """Answers a query using GPT and a dataframe of
+        relevant texts and embeddings.
+        """
+        message = self.query_message(
+            query=query,
+            dataframe=data,
+            model=model,
+            token_budget=token_budget,
+        )
+
+        if log_message:
+            logging.info(message)
+
+        messages = [
+            {"role": "system", "content": self.system_content},
+            {"role": "user", "content": message},
+        ]
+
+        response = openai.ChatCompletion.create(
+            model=model,
+            messages=messages,
+            temperature=0,
+        )
+        response_message = response["choices"][0]["message"]["content"]
+
+        if log_message:
+            logging.info(
+                "Total used tokens: %s",
+                response["usage"]["total_tokens"],
+            )
+
+        return response_message, message
+
+    def chat(self, message):
+        """Chat with Kleverbot; returns the response from OpenAI"""
+        res, _ = self.get_response(
+            query=message,
+            data=self.embedding_data,
+            model=self.chat_model,
+            token_budget=self.token_budget,
+        )
+
+        # TODO: fix this one after Klever FE fix -> scraper
+        return (
+            res.replace("help/document/", "wiki/1-")
+            .replace(">>", ">")
+            .replace("https://tokyotechies.kleversuite.net", "{ORG_URL}")
+        )
+
+
+class TTChatbotConfig(AppConfig):
+    """TokyoTechies Chatbot Init"""
+
+    name = "chatbot"
+    label = "tt_chatbot"
+
+    # old method
+    # INDEXES_FILE = os.path.join(
+    #     settings.TT_MODELS_PATH,
+    #     settings.TT_MODEL_NAME,
+    # )
+
+    # QUERY_ENGINE = load_chatgpt_index(
+    #     apps_names="TokyoTechies",
+    #     index_file=INDEXES_FILE,
+    # )
+
+    # new method
+    QUERY_ENGINE = ChatGPTEmbeddingSearchBased(
+        service="TokyoTechies",
+        embedding_model=settings.TT_EMBEDDING_MODEL,
+        chat_model=settings.TT_EMBEDDING_CHAT_MODEL,
+        filepath_embedding=settings.TT_FILEPATH_EMBEDDING,
+    )
+
+
+class KleverChatbotConfig(AppConfig):
+    """Klever Chatbot Init"""
+
+    name = "chatbot"
+    label = "klever_chatbot"
+
+    # old method
+    # INDEXES_FILE = os.path.join(
+    #     settings.KLEVER_MODELS_PATH,
+    #     settings.KLEVER_MODEL_NAME,
+    # )
+
+    # QUERY_ENGINE = load_chatgpt_index(
+    #     apps_names="Klever",
+    #     index_file=INDEXES_FILE,
+    # )
+
+    # new method
+    QUERY_ENGINE = ChatGPTEmbeddingSearchBased(
+        service="Klever",
+        embedding_model=settings.KLEVER_EMBEDDING_MODEL,
+        chat_model=settings.KLEVER_EMBEDDING_CHAT_MODEL,
+        filepath_embedding=settings.KLEVER_FILEPATH_EMBEDDING,
+    )
src/backend/TTChatBot/chatbot/exceptions.py ADDED
@@ -0,0 +1,10 @@
+class TTChatBotConnectException(Exception):
+    pass
+
+
+class TTChatBotEngineException(Exception):
+    pass
+
+
+class ChatbotVersionException(Exception):
+    pass
src/backend/TTChatBot/chatbot/migrations/__init__.py ADDED
File without changes
src/backend/TTChatBot/chatbot/serializers.py ADDED
@@ -0,0 +1,25 @@
+from rest_framework import serializers
+
+
+class ConversationSerializer(serializers.Serializer):
+    """Conversation content when the user interacts with the Chatbot"""
+
+    user_chat = serializers.CharField(required=True, max_length=1000)
+
+
+class MessageSerializer(ConversationSerializer):
+    """Response content when the Chatbot outputs to the user"""
+
+    chatbot_answer = serializers.CharField(required=True, max_length=1000)
+
+
+class TaskSerializer(ConversationSerializer):
+    """Response content when the Chatbot outputs to the user"""
+
+    task_id = serializers.CharField(required=True)
+
+
+class VersionSerializer(serializers.Serializer):
+    """Trained version of the chatbot"""
+
+    version = serializers.CharField(required=True)
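A minimal sketch of what MessageSerializer produces for a finished exchange; the field names come from the classes above, and the example values are illustrative:

from chatbot.serializers import MessageSerializer

res = MessageSerializer(
    {"user_chat": "What is Klever?", "chatbot_answer": "Klever is ..."},
)
print(res.data)  # {'user_chat': 'What is Klever?', 'chatbot_answer': 'Klever is ...'}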
src/backend/TTChatBot/chatbot/tasks.py ADDED
@@ -0,0 +1,79 @@
+import logging
+
+from celery import shared_task
+from celery.utils.log import get_task_logger
+from django.conf import settings
+
+from .apps import (
+    KleverChatbotConfig,
+    TTChatbotConfig,
+)
+from .exceptions import (
+    TTChatBotConnectException,
+    TTChatBotEngineException,
+)
+from .utils import num_tokens_from_messages
+
+# TODO: fix the logger; it does not write to the log file
+logger = get_task_logger(__name__)
+
+
+def tt_sync_chat_website(message):
+    return _chat_tt(message, engine=TTChatbotConfig)
+
+
+@shared_task(max_retries=0)
+def tt_async_chat_website(message):
+    max_token = settings.MAX_TOKEN
+
+    if (
+        num_tokens_from_messages(
+            messages=message,
+            model=settings.TT_MODEL_NAME,
+        )
+        >= max_token
+    ):
+        logging.warning(
+            "Maximum token %s reached for user messages: %s",
+            max_token,
+            message,
+        )
+        return settings.MAX_TOKEN_RESPONSE
+    else:
+        return _chat_tt(message, TTChatbotConfig)
+
+
+@shared_task(max_retries=0)
+def tt_async_chat_klever(message):
+    max_token = settings.MAX_TOKEN
+
+    if (
+        num_tokens_from_messages(
+            messages=message,
+            model=settings.KLEVER_MODEL_NAME,
+        )
+        >= max_token
+    ):
+        logging.warning(
+            "Maximum token %s reached for user messages: %s",
+            max_token,
+            message,
+        )
+        return settings.MAX_TOKEN_RESPONSE
+    else:
+        return _chat_tt(message, KleverChatbotConfig)
+
+
+def _chat_tt(message, engine=None):
+    try:
+        # TODO: check for the number of query retries when disconnected
+        if engine:
+            response = engine.QUERY_ENGINE.chat(message)
+            return response
+        else:
+            raise TTChatBotEngineException("Connect engine failed")
+    except Exception as exc:
+        logging.error("OpenAI error: ", exc_info=exc)
+        raise TTChatBotConnectException(
+            "Connect OpenAPI engine failed",
+        ) from exc
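A minimal sketch of how the shared tasks above are dispatched asynchronously; it assumes a running Celery worker and broker, and the question text is illustrative:

from chatbot.tasks import tt_async_chat_klever

result = tt_async_chat_klever.delay("What is Klever?")  # queue the task
print(result.id)       # task id a client can poll, cf. ChatTaskStatus in views
print(result.ready())  # False until the worker finishes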
src/backend/TTChatBot/chatbot/urls.py ADDED
@@ -0,0 +1,31 @@
+from django.urls import path
+
+# from rest_framework.urlpatterns import format_suffix_patterns
+from . import views
+
+app_name = "chatbot"
+
+urlpatterns = [
+    # List and create conversations
+    path("chat/sync", views.ConversationSyncView.as_view()),
+    path(
+        "chat/tokyotechies/async",
+        views.TokyoTechiesConversationAsyncView.as_view(),
+    ),
+    path("chat/klever/async", views.KleverConversationAsyncView.as_view()),
+    path(
+        "chat/<str:task_id>/",
+        views.ChatTaskStatus.as_view(),
+        name="gpt_task_status",
+    ),
+    path(
+        "chat/tokyotechies/version",
+        views.TTBotVerion.as_view(),
+        name="ttbot_version",
+    ),
+    path(
+        "chat/klever/version",
+        views.KleverBotVerion.as_view(),
+        name="kleverbot_version",
+    ),
+]
src/backend/TTChatBot/chatbot/utils.py ADDED
@@ -0,0 +1,108 @@
+ import logging
+ import os
+ import re
+ from datetime import datetime
+
+ import tiktoken
+
+ from .exceptions import ChatbotVersionException
+
+ logger = logging.getLogger(__name__)
+
+
+ def num_tokens_from_messages(
+     messages: str,
+     model: str = "gpt-3.5-turbo-0613",
+ ) -> int:
+     """
+     Return the number of tokens used by a message string.
+     """
+     try:
+         if model in ["text-davinci-003", "text-davinci-002"]:
+             encoding = tiktoken.get_encoding("p50k_base")
+         else:  # gpt-4, "gpt-3.5-turbo"
+             encoding = tiktoken.encoding_for_model(model)
+     except KeyError:
+         logger.warning("Warning: model not found. Using cl100k_base encoding.")
+         encoding = tiktoken.get_encoding("cl100k_base")
+
+     num_tokens = len(encoding.encode(messages))
+     return num_tokens
+
+
+ def get_datetime_from_file(file_path: str) -> str:
+     """Get datetime from file
+
+     Args:
+         file_path (str): file path
+
+     Raises:
+         ChatbotVersionException: exception raised when the file is not found
+
+     Returns:
+         str: version of the chatbot in YYYYMMDD format
+     """
+     try:
+         # Get the modification timestamp of the file
+         file_ts = os.path.getmtime(file_path)
+         return datetime.fromtimestamp(file_ts).strftime("%Y%m%d")
+
+     except FileNotFoundError as exc:
+         logger.error("File %s not found error", file_path, exc_info=exc)
+         raise ChatbotVersionException("File not found", exc) from exc
+
+
+ def extract_datetime_from_file(
+     version_file_path: str,
+     weight_file_path: str,
+ ) -> str:
+     r"""Extract date from
+     "- Training data includes information up until (\w{3} \d{2})"
+
+     Args:
+         version_file_path (str): file path of the version file
+         weight_file_path (str): file path of the embedding file,
+             used as a fallback when the version file cannot be read
+
+     Raises:
+         ChatbotVersionException: exception raised when the file is not found
+
+     Returns:
+         str: version of the chatbot in YYYYMMDD format
+     """
+     current_year = datetime.now().year
+     target_line_format = (
+         r"- Training data includes information up until (\w{3} \d{2})"
+     )
+     extracted_date = None  # stays None when no line matches the pattern
+
+     try:
+         # Open the input file for reading
+         with open(version_file_path, encoding="utf-8") as infile:
+             # Iterate through each line in the input file
+             for line in infile:
+                 # Use regex to search for the date format in the line
+                 match = re.search(target_line_format, line.strip())
+                 if match:
+                     # If a match is found, extract the date
+                     extracted_date = match.group(1)
+                     break  # Exit the loop after finding the first date
+
+         # Check if a date was extracted and return it
+         if extracted_date:
+             return datetime.strptime(
+                 f"{extracted_date} {current_year}",
+                 "%b %d %Y",
+             ).strftime("%Y%m%d")
+         else:
+             logger.warning(
+                 "Date not found in the file, fallback to deployment date",
+             )
+             return get_datetime_from_file(weight_file_path)
+
+     except FileNotFoundError as exc:
+         logger.error(
+             "File %s not found error",
+             version_file_path,
+             exc_info=exc,
+         )
+         raise ChatbotVersionException("File not found", exc) from exc
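
A quick usage sketch of the token counter above, as the views below use it to enforce the MAX_TOKEN guard (assuming a Django shell in the backend project):

from chatbot.utils import num_tokens_from_messages

# Same call the sync view makes before accepting a question.
n = num_tokens_from_messages(
    "How do I reset my Klever password?",
    model="gpt-3.5-turbo",
)
print(n)  # must stay below the MAX_TOKEN = 200 setting to be answered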
src/backend/TTChatBot/chatbot/views.py ADDED
@@ -0,0 +1,199 @@
+ import json
+ import logging
+
+ from celery.result import AsyncResult
+ from django.conf import settings
+ from rest_framework import (
+     generics,
+     status,
+ )
+ from rest_framework.response import Response
+
+ from .exceptions import (
+     ChatbotVersionException,
+     TTChatBotConnectException,
+     TTChatBotEngineException,
+ )
+ from .serializers import (
+     ConversationSerializer,
+     MessageSerializer,
+     TaskSerializer,
+     VersionSerializer,
+ )
+ from .tasks import (
+     tt_async_chat_klever,
+     tt_async_chat_website,
+     tt_sync_chat_website,
+ )
+ from .utils import (
+     extract_datetime_from_file,
+     num_tokens_from_messages,
+ )
+
+ # add logger
+ logger = logging.getLogger(__name__)
+
+
+ class ConversationSyncView(generics.GenericAPIView):
+     serializer_class = ConversationSerializer
+
+     def post(self, request, *args, **kwargs):
+         data = json.loads(request.body.decode("utf-8"))
+         question = data.get("user_chat", None)
+         max_token = settings.MAX_TOKEN
+
+         try:
+             if (
+                 num_tokens_from_messages(
+                     messages=question,
+                     model=settings.TT_MODEL_NAME,
+                 )
+                 >= max_token
+             ):
+                 logger.warning(
+                     "Maximum token %s reached for user messages: %s",
+                     max_token,
+                     question,
+                 )
+                 res = MessageSerializer(
+                     {
+                         "user_chat": question,
+                         "chatbot_answer": settings.MAX_TOKEN_RESPONSE,
+                     },
+                 )
+             else:
+                 answer = tt_sync_chat_website(question)
+                 res = MessageSerializer(
+                     {
+                         "user_chat": question,
+                         "chatbot_answer": answer,
+                     },
+                 )
+
+         except (TTChatBotConnectException, TTChatBotEngineException) as exc:
+             logger.error("Failed to send request to ChatGPT: %s", exc)
+             res = MessageSerializer(
+                 {
+                     "user_chat": question,
+                     "chatbot_answer": settings.DEFAULT_RESPONSE,
+                 },
+             )
+             return Response(res.data, status=status.HTTP_400_BAD_REQUEST)
+
+         return Response(res.data, status=status.HTTP_200_OK)
+
+
+ class TokyoTechiesConversationAsyncView(generics.GenericAPIView):
+     serializer_class = ConversationSerializer
+
+     def post(self, request, *args, **kwargs):
+         data = json.loads(request.body.decode("utf-8"))
+         question = data.get("user_chat", None)
+
+         try:
+             answer = tt_async_chat_website.delay(question)
+             res = TaskSerializer(
+                 {
+                     "user_chat": question,
+                     "task_id": answer.id,
+                 },
+             )
+
+         except (TTChatBotConnectException, TTChatBotEngineException) as exc:
+             logger.error("Failed to send request to ChatGPT: %s", exc)
+             res = TaskSerializer(
+                 {
+                     "user_chat": question,
+                     "task_id": None,
+                 },
+             )
+             return Response(res.data, status=status.HTTP_400_BAD_REQUEST)
+
+         return Response(res.data, status=status.HTTP_200_OK)
+
+
+ class KleverConversationAsyncView(generics.GenericAPIView):
+     serializer_class = ConversationSerializer
+
+     def post(self, request, *args, **kwargs):
+         data = json.loads(request.body.decode("utf-8"))
+         question = data.get("user_chat", None)
+
+         try:
+             answer = tt_async_chat_klever.delay(question)
+             res = TaskSerializer(
+                 {
+                     "user_chat": question,
+                     "task_id": answer.id,
+                 },
+             )
+
+         except (TTChatBotConnectException, TTChatBotEngineException) as exc:
+             logger.error("Failed to send request to ChatGPT: %s", exc)
+             res = TaskSerializer(
+                 {
+                     "user_chat": question,
+                     "task_id": None,
+                 },
+             )
+             return Response(res.data, status=status.HTTP_400_BAD_REQUEST)
+
+         return Response(res.data, status=status.HTTP_200_OK)
+
+
+ class ChatTaskStatus(generics.GenericAPIView):
+     """
+     Check the status of a ChatGPT task
+     """
+
+     serializer_class = TaskSerializer
+
+     def get(self, request, task_id, *args, **kwargs):
+         task = AsyncResult(task_id)
+
+         if task.ready():
+             response = task.result
+             logger.info("Task response: %s", response)
+             return Response({"status": "READY", "response": response})
+         else:
+             return Response({"status": "PENDING"})
+
+
+ class TTBotVersion(generics.GenericAPIView):
+     """Get version of TTBot"""
+
+     serializer_class = VersionSerializer
+
+     def get(self, *args, **kwargs):
+         try:
+             ttbot_version = extract_datetime_from_file(
+                 version_file_path=settings.TT_TRAINING_VERSION,
+                 weight_file_path=settings.TT_FILEPATH_EMBEDDING,
+             )
+             res = VersionSerializer({"version": ttbot_version})
+             return Response(res.data, status=status.HTTP_200_OK)
+
+         except ChatbotVersionException as exc:
+             logger.error("Failed to check version: %s", exc)
+             res = VersionSerializer({"version": None})
+             return Response(res.data, status=status.HTTP_400_BAD_REQUEST)
+
+
+ class KleverBotVersion(generics.GenericAPIView):
+     """Get version of KleverBot"""
+
+     serializer_class = VersionSerializer
+
+     def get(self, *args, **kwargs):
+         try:
+             kleverbot_version = extract_datetime_from_file(
+                 version_file_path=settings.KLEVER_TRAINING_VERSION,
+                 weight_file_path=settings.KLEVER_FILEPATH_EMBEDDING,
+             )
+             res = VersionSerializer({"version": kleverbot_version})
+             return Response(res.data, status=status.HTTP_200_OK)
+
+         except ChatbotVersionException as exc:
+             logger.error("Failed to check version: %s", exc)
+             res = VersionSerializer({"version": None})
+             return Response(res.data, status=status.HTTP_400_BAD_REQUEST)
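
A matching sketch for the synchronous route, which answers inline with the MessageSerializer fields used above (again assuming a local run behind the api/v1/ prefix):

import requests  # assumed to be installed in the client environment

BASE = "http://localhost:8000/api/v1/"  # assumed local deployment

# The sync view blocks until the answer (or a fallback message) is ready.
reply = requests.post(
    BASE + "chat/sync",
    json={"user_chat": "Who are Tokyo Techies?"},
).json()
print(reply["chatbot_answer"])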
src/backend/TTChatBot/config/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .celery import app as celery_app
+
+ __all__ = ["celery_app"]
src/backend/TTChatBot/config/asgi.py ADDED
@@ -0,0 +1,16 @@
+ """
+ ASGI config for TTChatBot project.
+
+ It exposes the ASGI callable as a module-level variable named ``application``.
+
+ For more information on this file, see
+ https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
+ """
+
+ import os
+
+ from django.core.asgi import get_asgi_application
+
+ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
+
+ application = get_asgi_application()
src/backend/TTChatBot/config/celery.py ADDED
@@ -0,0 +1,24 @@
+ import os
+
+ from celery import Celery
+ from django.conf import settings
+
+ # Set the default Django settings module for the 'celery' program.
+ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
+
+ app = Celery("TTChatBot")
+
+ app.conf.update(
+     broker_connection_retry_on_startup=True,
+     broker_connection_max_retries=10,
+     result_expires=60,
+     task_acks_late=True,
+ )
+
+ app.config_from_object("django.conf:settings", namespace="CELERY")
+ app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
+
+ # TODO: convention celery:
+ # https://qiita.com/hankehly/items/c3e0496eb04327a53ac4
+ # TODO: crontab for celery:
+ # https://www.codingforentrepreneurs.com/blog/celery-redis-django/
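
For completeness, a sketch of exercising a task directly against this Celery app from a Django shell, assuming the Redis broker from BROKER_URL is reachable and a worker has been started (for example with `celery -A config worker`):

from celery.result import AsyncResult

from chatbot.tasks import tt_async_chat_website  # task the async view enqueues

# Hand a question to the broker; a worker picks it up.
result = tt_async_chat_website.delay("What services do you offer?")

# The same readiness check ChatTaskStatus performs per task id.
print(AsyncResult(result.id).ready())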
src/backend/TTChatBot/config/settings/__init__.py ADDED
@@ -0,0 +1,19 @@
+ import os
+
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # Open AI key
+ OPENAI_API_KEY = os.getenv("OPEN_AI_KEY")
+ # settings keys for model; guard against a TypeError when the variable is unset
+ if OPENAI_API_KEY:
+     os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
+
+ env_name = os.getenv("ENV_NAME", "prod")
+
+ if env_name == "local":
+     from .local import *  # noqa
+ elif env_name == "staging":
+     from .staging import *  # noqa
+ else:
+     from .prod import *  # noqa
src/backend/TTChatBot/config/settings/common.py ADDED
@@ -0,0 +1,132 @@
+ import os
+ from pathlib import Path
+
+ BASE_DIR = Path(__file__).resolve().parent.parent.parent
+
+ # Static files (CSS, JavaScript, Images)
+ STORAGE_URL = BASE_DIR / "storage"
+
+ # Swagger HTTPS
+ USE_X_FORWARDED_HOST = True
+ SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
+
+ # Application definition
+
+ DJANGO_APPS = [
+     "django.contrib.admin",
+     "django.contrib.auth",
+     "django.contrib.contenttypes",
+     "django.contrib.sessions",
+     "django.contrib.messages",
+     "django.contrib.staticfiles",
+ ]
+
+ THIRD_PARTY_APPS = [
+     "gunicorn",
+     "rest_framework",
+     "drf_yasg",  # another way to swagger
+     "django_celery_results",  # Store Celery Result and cache
+ ]
+
+ LOCAL_APPS = [
+     "chatbot.apps",
+     # 'users.apps.UsersConfig',
+     # 'site_settings.apps.SiteSettingsConfig',
+     # 'training_model.apps.TrainingModelConfig',
+ ]
+
+ INSTALLED_APPS = DJANGO_APPS + THIRD_PARTY_APPS + LOCAL_APPS
+
+ MIDDLEWARE = [
+     "django.middleware.security.SecurityMiddleware",
+     "django.contrib.sessions.middleware.SessionMiddleware",
+     "django.middleware.common.CommonMiddleware",
+     "django.middleware.csrf.CsrfViewMiddleware",
+     "django.contrib.auth.middleware.AuthenticationMiddleware",
+     "django.contrib.messages.middleware.MessageMiddleware",
+     "django.middleware.clickjacking.XFrameOptionsMiddleware",
+     "whitenoise.middleware.WhiteNoiseMiddleware",
+ ]
+
+
+ SECRET_KEY = os.getenv("DJANGO_SECRET_KEY")
+ ROOT_URLCONF = "config.urls"
+ WSGI_APPLICATION = "config.wsgi.application"
+ ASGI_APPLICATION = "config.asgi.application"
+
+ TEMPLATES = [
+     {
+         "BACKEND": "django.template.backends.django.DjangoTemplates",
+         "DIRS": [BASE_DIR / "templates"],
+         "APP_DIRS": True,
+         "OPTIONS": {
+             "context_processors": [
+                 "django.template.context_processors.debug",
+                 "django.template.context_processors.request",
+                 "django.contrib.auth.context_processors.auth",
+                 "django.contrib.messages.context_processors.messages",
+             ],
+         },
+     },
+ ]
+
+ # Logging
+ LOGGING = {
+     "version": 1,
+     "disable_existing_loggers": False,
+     "formatters": {
+         "default": {
+             "format": "%(asctime)s %(levelname)s: %(message)s",
+         },
+     },
+     "filters": {
+         "require_debug_false": {
+             "()": "django.utils.log.RequireDebugFalse",
+         },
+         "require_debug_true": {
+             "()": "django.utils.log.RequireDebugTrue",
+         },
+     },
+     "handlers": {
+         "console": {
+             "class": "logging.StreamHandler",
+             "formatter": "default",
+             "level": "INFO",
+         },
+         "common": {
+             "class": "logging.FileHandler",
+             "filename": STORAGE_URL / "common.log",
+             "formatter": "default",
+             "level": "INFO",
+         },
+     },
+     "loggers": {
+         "": {
+             "handlers": ["console", "common"],
+             "level": 1,
+         },
+     },
+ }
+
+ # Internationalization
+ # https://docs.djangoproject.com/en/4.2/topics/i18n/
+
+ LANGUAGE_CODE = "en-us"
+
+ TIME_ZONE = "UTC"
+
+ USE_I18N = True
+
+ USE_TZ = True
+
+ # Static files (CSS, JavaScript, Images)
+ # https://docs.djangoproject.com/en/4.2/howto/static-files/
+
+ STATIC_URL = "/static/"
+ STATIC_ROOT = os.path.join(BASE_DIR, "static")
+ STATICFILES_STORAGE = "whitenoise.storage.CompressedManifestStaticFilesStorage"
+
+ # Default primary key field type
+ # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
+
+ DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
src/backend/TTChatBot/config/settings/local.py ADDED
@@ -0,0 +1,98 @@
+ import os
+
+ from .common import *  # noqa
+
+ ALLOWED_HOSTS = ["*"]
+
+ # SECURITY WARNING: don't run with debug turned on in production!
+ DEBUG = os.getenv("DJANGO_DEBUG")
+
+ # Chatbot default answer
+ DEFAULT_RESPONSE = "Sorry, I'm having trouble understanding you."
+ DEFAULT_RESPONSE_JP = "申し訳ございません、ご質問を理解いたしかねます"
+
+ MAX_TOKEN_RESPONSE = (
+     "Sorry, I'm having trouble processing all that information. "
+     "Could you summarize a bit more concisely?"
+ )
+ MAX_TOKEN_RESPONSE_JP = "申し訳ございません、いただいた全ての情報を処理することができません。もう少し簡潔にしてください。"
+
+ # --Postgres--
+ # DATABASES = {
+ #     'default': {
+ #         'ENGINE': 'django.db.backends.postgresql',
+ #         'NAME': os.getenv('DB_NAME'),
+ #         'USER': os.getenv('DB_USER'),
+ #         'PASSWORD': os.getenv('DB_PASSWORD'),
+ #         'HOST': os.getenv('DB_HOST', 'localhost'),
+ #         'PORT': os.getenv('DB_PORT'),
+ #     }
+ # }
+
+ # --Celery--
+ # List of modules to import when celery starts.
+ # --Worker settings--
+ # If you're doing mostly I/O you can have more processes,
+ # but if mostly spending CPU, try to keep it close to the
+ # number of CPUs on your machine. If not set, the number of CPUs/cores
+ # available will be used.
+ CELERY_WORKER_CONCURRENCY = 1
+ # CELERYD_LOG_FILE = "celeryd.log"
+ # CELERYD_LOG_LEVEL = "INFO"
+ REDIS_HOST = os.getenv("REDIS_HOST")
+ REDIS_PORT = os.getenv("REDIS_PORT")
+ BROKER_URL = os.getenv("BROKER_URL")
+
+ CELERY_BROKER_URL = BROKER_URL
+ CELERY_RESULT_BACKEND = BROKER_URL
+ CELERY_ACCEPT_CONTENT = ["application/json"]
+ CELERY_RESULT_SERIALIZER = "json"
+ CELERY_TASK_SERIALIZER = "json"
+
+ # Config for old query methods
+ # TT Websites Models path
+ TT_MODELS_PATH = "../../../models/TokyoTechies/"
+ # text-davinci-003 or gpt-4
+ TT_MODEL_NAME = "text-davinci-003"
+ # Klever Models path
+ KLEVER_MODELS_PATH = "../../../models/Klever/"
+ # text-davinci-003 or gpt-4
+ KLEVER_MODEL_NAME = "text-davinci-003"
+
+
+ # Config for new embedding methods
+ KLEVER_EMBEDDING_MODEL = (
+     "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
+ )
+ KLEVER_EMBEDDING_CHAT_MODEL = "gpt-3.5-turbo"  # "gpt-4"
+ KLEVER_FILEPATH_EMBEDDING = "../../../models/Klever/embeddings/Klever.csv"
+ KLEVER_TRAINING_VERSION = "../../../models/Klever/_version.txt"
+
+ # Config for new embedding methods
+ TT_EMBEDDING_MODEL = (
+     "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
+ )
+ TT_EMBEDDING_CHAT_MODEL = "gpt-3.5-turbo"  # "gpt-4"
+ TT_FILEPATH_EMBEDDING = (
+     "../../../models/TokyoTechies/embeddings/TokyoTechies.csv"
+ )
+ TT_TRAINING_VERSION = "../../../models/TokyoTechies/_version.txt"
+
+ INTRODUCTION_MESSAGE = (
+     "You are a chatbot of {service}. "
+     "Use the below articles on the {service} to answer the subsequent question. "  # noqa: E501
+     "If the answer cannot be found in the articles, write sorry that I cannot answer your request, please contact our support team for further assistance. "  # noqa: E501
+     r'If an answer is found, add embedding title in this format "[Title](URL)" to the end of an answer and ignore the same title.'  # noqa: E501
+ )
+ SYSTEM_CONTENT = "You answer questions about {service}"
+ NEXT_ARTICLE = "\n{service} article section:\n--\n{string}\n--"
+ TOKEN_BUDGET = 4096 - 500
+
+ # max token
+ MAX_TOKEN = 200
+
+ # CELERYD_TASK_SOFT_TIME_LIMIT = 3
+ # Kill anything longer than 10 seconds:
+ # CELERYD_TASK_TIME_LIMIT = 10
+ # After 2 hours remove the task result:
+ # CELERY_TASK_RESULT_EXPIRES = 60 * 60 * 2
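
The prompt templates above (INTRODUCTION_MESSAGE, SYSTEM_CONTENT, NEXT_ARTICLE, TOKEN_BUDGET) suggest how retrieved article sections get stitched into a chat request. Below is a hypothetical sketch of that assembly, for illustration only: the actual logic lives in the AI modules under src/AI/, and build_messages is a made-up helper name.

from django.conf import settings

from chatbot.utils import num_tokens_from_messages


def build_messages(question, articles, service="Klever"):
    """Hypothetical helper: pack article sections under TOKEN_BUDGET."""
    body = settings.INTRODUCTION_MESSAGE.format(service=service)
    for article in articles:
        section = settings.NEXT_ARTICLE.format(service=service, string=article)
        candidate = body + section + "\n\nQuestion: " + question
        if (
            num_tokens_from_messages(
                candidate,
                model=settings.KLEVER_EMBEDDING_CHAT_MODEL,
            )
            > settings.TOKEN_BUDGET
        ):
            break  # stop adding context once the budget would be exceeded
        body += section
    return [
        {
            "role": "system",
            "content": settings.SYSTEM_CONTENT.format(service=service),
        },
        {"role": "user", "content": body + "\n\nQuestion: " + question},
    ]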
src/backend/TTChatBot/config/settings/prod.py ADDED
@@ -0,0 +1,98 @@
+ import os
+
+ from .common import *  # noqa
+
+ ALLOWED_HOSTS = ["*"]
+
+ # SECURITY WARNING: don't run with debug turned on in production!
+ DEBUG = False
+
+ # Chatbot default answer
+ DEFAULT_RESPONSE = "Sorry, I'm having trouble understanding you."
+ DEFAULT_RESPONSE_JP = "申し訳ございません、ご質問を理解いたしかねます"
+
+ MAX_TOKEN_RESPONSE = (
+     "Sorry, I'm having trouble processing all that information. "
+     "Could you summarize a bit more concisely?"
+ )
+ MAX_TOKEN_RESPONSE_JP = "申し訳ございません、いただいた全ての情報を処理することができません。もう少し簡潔にしてください。"
+
+ # --Postgres--
+ # DATABASES = {
+ #     'default': {
+ #         'ENGINE': 'django.db.backends.postgresql',
+ #         'NAME': os.getenv('DB_NAME'),
+ #         'USER': os.getenv('DB_USER'),
+ #         'PASSWORD': os.getenv('DB_PASSWORD'),
+ #         'HOST': os.getenv('DB_HOST', 'localhost'),
+ #         'PORT': os.getenv('DB_PORT'),
+ #     }
+ # }
+
+ # --Celery--
+ # List of modules to import when celery starts.
+ # --Worker settings--
+ # If you're doing mostly I/O you can have more processes,
+ # but if mostly spending CPU, try to keep it close to the
+ # number of CPUs on your machine. If not set, the number of CPUs/cores
+ # available will be used.
+ CELERY_WORKER_CONCURRENCY = 20
+ # CELERYD_LOG_FILE = "celeryd.log"
+ # CELERYD_LOG_LEVEL = "INFO"
+ REDIS_HOST = os.getenv("REDIS_HOST")
+ REDIS_PORT = os.getenv("REDIS_PORT")
+ BROKER_URL = os.getenv("BROKER_URL")
+
+ CELERY_BROKER_URL = BROKER_URL
+ CELERY_RESULT_BACKEND = BROKER_URL
+ CELERY_ACCEPT_CONTENT = ["application/json"]
+ CELERY_RESULT_SERIALIZER = "json"
+ CELERY_TASK_SERIALIZER = "json"
+
+ # Config for old query methods
+ # TT Websites Models path
+ TT_MODELS_PATH = "../../../models/TokyoTechies/"
+ # text-davinci-003 or gpt-4
+ TT_MODEL_NAME = "text-davinci-003"
+ # Klever Models path
+ KLEVER_MODELS_PATH = "../../../models/Klever/"
+ # text-davinci-003 or gpt-4
+ KLEVER_MODEL_NAME = "text-davinci-003"
+
+
+ # Config for new embedding methods
+ KLEVER_EMBEDDING_MODEL = (
+     "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
+ )
+ KLEVER_EMBEDDING_CHAT_MODEL = "gpt-3.5-turbo"  # "gpt-4"
+ KLEVER_FILEPATH_EMBEDDING = "../../../models/Klever/embeddings/Klever.csv"
+ KLEVER_TRAINING_VERSION = "../../../models/Klever/_version.txt"
+
+ # Config for new embedding methods
+ TT_EMBEDDING_MODEL = (
+     "text-embedding-ada-002"  # OpenAI's best embeddings as of Apr 2023
+ )
+ TT_EMBEDDING_CHAT_MODEL = "gpt-3.5-turbo"  # "gpt-4"
+ TT_FILEPATH_EMBEDDING = (
+     "../../../models/TokyoTechies/embeddings/TokyoTechies.csv"
+ )
+ TT_TRAINING_VERSION = "../../../models/TokyoTechies/_version.txt"
+
+ INTRODUCTION_MESSAGE = (
+     "You are a chatbot of {service}. "
+     "Use the below articles on the {service} to answer the subsequent question. "  # noqa: E501
+     "If the answer cannot be found in the articles, write sorry that I cannot answer your request, please contact our support team for further assistance. "  # noqa: E501
+     r'If an answer is found, add embedding title in this format "[Title](URL)" to the end of an answer and ignore the same title.'  # noqa: E501
+ )
+ SYSTEM_CONTENT = "You answer questions about {service}"
+ NEXT_ARTICLE = "\n{service} article section:\n--\n{string}\n--"
+ TOKEN_BUDGET = 4096 - 500
+
+ # max token
+ MAX_TOKEN = 200
+
+ # CELERYD_TASK_SOFT_TIME_LIMIT = 3
+ # Kill anything longer than 10 seconds:
+ # CELERYD_TASK_TIME_LIMIT = 10
+ # After 2 hours remove the task result:
+ # CELERY_TASK_RESULT_EXPIRES = 60 * 60 * 2
src/backend/TTChatBot/config/settings/staging.py ADDED
File without changes
src/backend/TTChatBot/config/urls.py ADDED
@@ -0,0 +1,62 @@
+ """
+ URL configuration for TTChatBot project.
+
+ The `urlpatterns` list routes URLs to views. For more information please see:
+ https://docs.djangoproject.com/en/4.2/topics/http/urls/
+ Examples:
+ Function views
+     1. Add an import:  from my_app import views
+     2. Add a URL to urlpatterns:  path('', views.home, name='home')
+ Class-based views
+     1. Add an import:  from other_app.views import Home
+     2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+ Including another URLconf
+     1. Import the include() function: from django.urls import include, path
+     2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+ """
+ from django.conf import settings
+ from django.conf.urls.static import static
+ from django.urls import (
+     include,
+     path,
+ )
+ from drf_yasg import openapi
+ from drf_yasg.generators import OpenAPISchemaGenerator
+ from drf_yasg.views import get_schema_view
+ from rest_framework import permissions
+
+
+ class BothHttpAndHttpsSchemaGenerator(OpenAPISchemaGenerator):
+     def get_schema(self, request=None, public=False):
+         schema = super().get_schema(request, public)
+         schema.schemes = ["http", "https"]
+         return schema
+
+
+ schema_view = get_schema_view(
+     openapi.Info(
+         title="Tokyo Techies Chatbot",
+         default_version="v1",
+         description="API documentation for Tokyo Techies Chatbot API",
+     ),
+     public=True,
+     generator_class=BothHttpAndHttpsSchemaGenerator,
+     permission_classes=[permissions.AllowAny],
+ )
+
+ urlpatterns = [
+     path("api/v1/", include("chatbot.urls")),
+     # Swagger URLs
+     path(
+         "",
+         schema_view.with_ui("swagger", cache_timeout=0),
+         name="schema-swagger-ui",
+     ),
+ ]
+
+ # Include static files serving only during development
+ if settings.DEBUG:
+     urlpatterns += static(
+         settings.STATIC_URL,
+         document_root=settings.STATIC_ROOT,
+     )
src/backend/TTChatBot/config/wsgi.py ADDED
@@ -0,0 +1,16 @@
+ """
+ WSGI config for TTChatBot project.
+
+ It exposes the WSGI callable as a module-level variable named ``application``.
+
+ For more information on this file, see
+ https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
+ """
+
+ import os
+
+ from django.core.wsgi import get_wsgi_application
+
+ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
+
+ application = get_wsgi_application()
src/backend/TTChatBot/manage.py ADDED
@@ -0,0 +1,22 @@
+ #!/usr/bin/env python
+ """Django's command-line utility for administrative tasks."""
+ import os
+ import sys
+
+
+ def main():
+     """Run administrative tasks."""
+     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")
+     try:
+         from django.core.management import execute_from_command_line
+     except ImportError as exc:
+         raise ImportError(
+             "Couldn't import Django. Are you sure it's installed and "
+             "available on your PYTHONPATH environment variable? Did you "
+             "forget to activate a virtual environment?",
+         ) from exc
+     execute_from_command_line(sys.argv)
+
+
+ if __name__ == "__main__":
+     main()
src/backend/TTChatBot/storage/.gitkeep ADDED
File without changes
src/frontend/.gitkeep ADDED
File without changes
src/frontend/.prettierignore ADDED
@@ -0,0 +1 @@
+ public/**/*
src/frontend/.prettierrc ADDED
@@ -0,0 +1,4 @@
+ {
+   "semi": false,
+   "singleQuote": true
+ }
src/frontend/.sample-env ADDED
@@ -0,0 +1 @@
+ BACKEND_API_URL=http://localhost:8000/
src/frontend/Dockerfile ADDED
@@ -0,0 +1,22 @@
+ ARG NODE_VERSION=18.16.0
+ ARG ALPINE_VERSION=3.17.2
+
+ FROM node:${NODE_VERSION}-alpine AS node
+ FROM alpine:${ALPINE_VERSION}
+
+ COPY --from=node /usr/lib /usr/lib
+ COPY --from=node /usr/local/lib /usr/local/lib
+ COPY --from=node /usr/local/include /usr/local/include
+ COPY --from=node /usr/local/bin /usr/local/bin
+
+ # create destination directory
+ RUN mkdir -p /src/frontend
+ WORKDIR /src/frontend
+
+ # copy the app
+ COPY . /src/frontend
+ RUN npm install
+
+ EXPOSE 3000
+
+ CMD [ "npm", "start" ]
src/frontend/environments/dev/build.args ADDED
@@ -0,0 +1 @@
+ BACKEND_API_URL=https://www.chatbot-api.dev.aws.tokyotechies.co.jp/
src/frontend/environments/prod/build.args ADDED
@@ -0,0 +1 @@
+ BACKEND_API_URL=https://www.chatbot-api.tokyotechies.com/
src/frontend/next-env.d.ts ADDED
@@ -0,0 +1,5 @@
+ /// <reference types="next" />
+ /// <reference types="next/image-types/global" />
+
+ // NOTE: This file should not be edited
+ // see https://nextjs.org/docs/basic-features/typescript for more information.
src/frontend/next.config.js ADDED
@@ -0,0 +1,11 @@
+ /** @type {import('next').NextConfig} */
+ const nextConfig = {
+   reactStrictMode: true,
+   i18n: {
+     locales: ['en', 'ja'],
+     defaultLocale: 'en',
+     localeDetection: false,
+   },
+ }
+
+ module.exports = nextConfig
src/frontend/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
src/frontend/package.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "name": "tokyo-techies-chatbot",
+   "version": "0.1.0",
+   "private": true,
+   "scripts": {
+     "dev": "next dev",
+     "build": "next build",
+     "start": "next start",
+     "lint": "next lint"
+   },
+   "dependencies": {
+     "@types/node": "20.5.3",
+     "@types/react": "18.2.21",
+     "@types/react-dom": "18.2.7",
+     "autoprefixer": "10.4.15",
+     "axios": "^0.27.2",
+     "eslint": "8.47.0",
+     "eslint-config-next": "13.4.19",
+     "eslint-config-prettier": "^8.5.0",
+     "eslint-plugin-prettier": "^4.0.0",
+     "next": "13.4.19",
+     "postcss": "8.4.28",
+     "prettier": "^2.7.0",
+     "react": "18.2.0",
+     "react-dom": "18.2.0",
+     "react-markdown": "^8.0.4",
+     "tailwindcss": "3.3.3",
+     "typescript": "5.1.6"
+   }
+ }
src/frontend/postcss.config.js ADDED
@@ -0,0 +1,6 @@
+ module.exports = {
+   plugins: {
+     tailwindcss: {},
+     autoprefixer: {},
+   },
+ }
src/frontend/public/favicon.webp ADDED
src/frontend/public/locales/en.ts ADDED
@@ -0,0 +1,15 @@
+ export default {
+   name: 'Techie ',
+   askMe: 'Ask me anything',
+   connected: 'You are connected with a virtual assistant',
+   greeting: 'Hi there! 😊 \n' +
+     'I\'m Techie - a virtual assistant here to help you with anything related to Tokyo Techies.\n' +
+     'If you have any questions, need information, or just want to chat, feel free to ask me!\n' +
+     'How can I help you?',
+   placeholder: 'Type your question...',
+   maintenance: 'Sorry, we are under maintenance!',
+   year: '/',
+   month: '/',
+   day: '',
+   edition: ' version'
+ }