diff --git a/.gitignore b/.gitignore index ddee9b537e47686f3d80562ad0addb0832ccffce..dcd7997dd8ab62bfe67ff330fc8da2d21a3b23b8 100644 --- a/.gitignore +++ b/.gitignore @@ -169,5 +169,18 @@ code/.chainlit/translations/ storage/logs/* vectorstores/* -*/.files/* +**/.files/* code/storage/models/ + +**/translations/en-US.json +**/translations/zh-CN.json + + +**/vectorstores/* + +**/private/students.json + +**/apps/*/storage/logs/* +**/apps/*/private/* + +.idea/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 2cd39e3489b4c2e12003dc59d16804cc1ef25128..cc7b53b213e0afe5d067b679e8e39e5a7540a56e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,10 +3,12 @@ FROM python:3.11 WORKDIR /code COPY ./requirements.txt /code/requirements.txt +COPY ./setup.py /code/setup.py RUN pip install --upgrade pip RUN pip install --no-cache-dir -r /code/requirements.txt +RUN pip install -e . COPY . /code @@ -17,12 +19,15 @@ RUN ls -R /code RUN chmod -R 777 /code # Create a logs directory and set permissions -RUN mkdir /code/logs && chmod 777 /code/logs +RUN mkdir /code/apps/ai_tutor/logs && chmod 777 /code/apps/ai_tutor/logs # Create a cache directory within the application's working directory RUN mkdir /.cache && chmod -R 777 /.cache -WORKDIR /code/code +WORKDIR /code/apps/ai_tutor + +# Expose the port the app runs on +EXPOSE 7860 RUN --mount=type=secret,id=HUGGINGFACEHUB_API_TOKEN,mode=0444,required=true RUN --mount=type=secret,id=OPENAI_API_KEY,mode=0444,required=true @@ -35,4 +40,4 @@ RUN --mount=type=secret,id=LITERAL_API_KEY_LOGGING,mode=0444,required=true RUN --mount=type=secret,id=CHAINLIT_AUTH_SECRET,mode=0444,required=true # Default command to run the application -CMD ["sh", "-c", "python -m modules.vectorstore.store_manager && uvicorn app:app --host 0.0.0.0 --port 7860"] +CMD python -m modules.vectorstore.store_manager --config_file config/config.yml --project_config_file config/project_config.yml && python -m uvicorn app:app --host 0.0.0.0 --port 7860 \ No newline 
at end of file diff --git a/Dockerfile.dev b/Dockerfile.dev index fe71dc7ef39e52d8433646a40075ef85d5ff4d07..c63abdafe2434209a44ac26ddef5794c456a207a 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -3,13 +3,18 @@ FROM python:3.11 WORKDIR /code COPY ./requirements.txt /code/requirements.txt +COPY ./setup.py /code/setup.py RUN pip install --upgrade pip RUN pip install --no-cache-dir -r /code/requirements.txt +RUN pip install -e . COPY . /code +# Copy .env file to the application directory +COPY .env /code/apps/ai_tutor/.env + # List the contents of the /code directory to verify files are copied correctly RUN ls -R /code @@ -17,15 +22,17 @@ RUN ls -R /code RUN chmod -R 777 /code # Create a logs directory and set permissions -RUN mkdir /code/logs && chmod 777 /code/logs +RUN mkdir /code/apps/ai_tutor/logs && chmod 777 /code/apps/ai_tutor/logs # Create a cache directory within the application's working directory RUN mkdir /.cache && chmod -R 777 /.cache -WORKDIR /code/code +WORKDIR /code/apps/ai_tutor + +RUN ls -R /code # Expose the port the app runs on -EXPOSE 8000 +EXPOSE 7860 # Default command to run the application -CMD ["sh", "-c", "python -m modules.vectorstore.store_manager && chainlit run main.py --host 0.0.0.0 --port 8000"] \ No newline at end of file +CMD python -m modules.vectorstore.store_manager --config_file config/config.yml --project_config_file config/project_config.yml && python -m uvicorn app:app --host 0.0.0.0 --port 7860 \ No newline at end of file diff --git a/README.md b/README.md index 99455300ca8890b2049f532d39cadad2bac964e6..9f7f4db2d08231133f8317cf70c8cb995b19ed13 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,12 @@ app_port: 7860 --- # DL4DS Tutor 🏃 +![Build Status](https://github.com/DL4DS/dl4ds_tutor/actions/workflows/push_to_hf_space.yml/badge.svg) +![License](https://img.shields.io/github/license/DL4DS/dl4ds_tutor) +![GitHub stars](https://img.shields.io/github/stars/DL4DS/dl4ds_tutor) +![PRs 
Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square) + + Check out the configuration reference at [Hugging Face Spaces Config Reference](https://huggingface.co/docs/hub/spaces-config-reference). You can find a "production" implementation of the Tutor running live at [DL4DS Tutor](https://dl4ds-dl4ds-tutor.hf.space/) from the @@ -31,26 +37,31 @@ Please visit [setup](https://dl4ds.github.io/dl4ds_tutor/guide/setup/) for more git clone https://github.com/DL4DS/dl4ds_tutor ``` -2. **Put your data under the `storage/data` directory** +2. Create your app in the apps folder. (An example is the `apps/ai_tutor` app) + ``` + cd apps + mkdir your_app + ``` + +2. **Put your data under the `apps/your_app/storage/data` directory** - Add URLs in the `urls.txt` file. - - Add other PDF files in the `storage/data` directory. + - Add other PDF files in the `apps/your_app/storage/data` directory. 3. **To test Data Loading (Optional)** ```bash - cd code - python -m modules.dataloader.data_loader --links "your_pdf_link" + cd apps/your_app + python -m modules.dataloader.data_loader --links "your_pdf_link" --config_file config/config.yml --project_config_file config/project_config.yml ``` 4. **Create the Vector Database** ```bash - cd code - python -m modules.vectorstore.store_manager + cd apps/your_app + python -m modules.vectorstore.store_manager --config_file config/config.yml --project_config_file config/project_config.yml ``` - - Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated. 6. **Run the FastAPI App** ```bash - cd code + cd apps/your_app uvicorn app:app --port 7860 ``` @@ -65,7 +76,7 @@ The HuggingFace Space is built using the `Dockerfile` in the repository. To run ```bash docker build --tag dev -f Dockerfile.dev . 
-docker run -it --rm -p 8000:8000 dev +docker run -it --rm -p 7860:7860 dev ``` ## Contributing diff --git a/code/.chainlit/config.toml b/apps/ai_tutor/.chainlit/config.toml similarity index 98% rename from code/.chainlit/config.toml rename to apps/ai_tutor/.chainlit/config.toml index a76128d17ea50c55e41689ebf23f36b400567d04..4ee8911d7816e23f173acaf01b98f158bbc62d1e 100644 --- a/code/.chainlit/config.toml +++ b/apps/ai_tutor/.chainlit/config.toml @@ -69,7 +69,7 @@ github = "https://github.com/DL4DS/dl4ds_tutor" # Specify a CSS file that can be used to customize the user interface. # The CSS file can be served from the public directory or via an external link. -custom_css = "/public/test.css" +custom_css = "/public/files/test.css" # Specify a Javascript file that can be used to customize the user interface. # The Javascript file can be served from the public directory. diff --git a/apps/ai_tutor/README.md b/apps/ai_tutor/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ce60b629a88e9e59e51aec4e503994cd7bc9411f --- /dev/null +++ b/apps/ai_tutor/README.md @@ -0,0 +1,12 @@ +# WIP + + +## Run the encrypt_students script + +- If you don't want the emails to be public, run this script to encrypt the emails of the students. +- This will create a new file in the public/files/ directory. +- Place your file with the students' emails in the private/ directory (do not commit this file to the repository). 
+ +```bash +python encrypt_students.py --students-file private/students.json --encrypted-students-file public/files/students_encrypted.json +``` diff --git a/code/app.py b/apps/ai_tutor/app.py similarity index 83% rename from code/app.py rename to apps/ai_tutor/app.py index 3b9393f4a3232a54c8f97fc834a3a85f743592f5..e26100ec24d6f064eb5915289c8d152ac925a744 100644 --- a/code/app.py +++ b/apps/ai_tutor/app.py @@ -8,24 +8,33 @@ from chainlit.utils import mount_chainlit import secrets import json import base64 -from modules.config.constants import ( +from config.constants import ( OAUTH_GOOGLE_CLIENT_ID, OAUTH_GOOGLE_CLIENT_SECRET, CHAINLIT_URL, - GITHUB_REPO, - DOCS_WEBSITE, - ALL_TIME_TOKENS_ALLOCATED, - TOKENS_LEFT, + EMAIL_ENCRYPTION_KEY, ) from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles -from modules.chat_processor.helpers import ( - get_user_details, +from helpers import ( get_time, reset_tokens_for_user, check_user_cooldown, - update_user_info, ) +from modules.chat_processor.helpers import get_user_details, update_user_info +from config.config_manager import config_manager +import hashlib + +# set config +config = config_manager.get_config().dict() + +# set constants +GITHUB_REPO = config["misc"]["github_repo"] +DOCS_WEBSITE = config["misc"]["docs_website"] +ALL_TIME_TOKENS_ALLOCATED = config["token_config"]["all_time_tokens_allocated"] +TOKENS_LEFT = config["token_config"]["tokens_left"] +COOLDOWN_TIME = config["token_config"]["cooldown_time"] +REGEN_TIME = config["token_config"]["regen_time"] GOOGLE_CLIENT_ID = OAUTH_GOOGLE_CLIENT_ID GOOGLE_CLIENT_SECRET = OAUTH_GOOGLE_CLIENT_SECRET @@ -46,13 +55,8 @@ session_store = {} CHAINLIT_PATH = "/chainlit_tutor" # only admin is given any additional permissions for now -- no limits on tokens -USER_ROLES = { - "tgardos@bu.edu": ["instructor", "bu"], - "xthomas@bu.edu": ["admin", "instructor", "bu"], - "faridkar@bu.edu": ["instructor", "bu"], - "xavierohan1@gmail.com": 
["guest"], - # Add more users and roles as needed -} +with open("public/files/students_encrypted.json", "r") as file: + USER_ROLES = json.load(file) # Create a Google OAuth flow flow = Flow.from_client_config( @@ -80,7 +84,20 @@ flow = Flow.from_client_config( def get_user_role(username: str): - return USER_ROLES.get(username, ["guest"]) # Default to "guest" role + + # Function to deterministically hash emails + def deterministic_hash(email, salt): + return hashlib.pbkdf2_hmac("sha256", email.encode(), salt, 100000).hex() + + # encrypt email (#FIXME: this is not the best way to do this, not really encryption, more like a hash) + encryption_salt = EMAIL_ENCRYPTION_KEY.encode() + encrypted_email = deterministic_hash(username, encryption_salt) + role = USER_ROLES.get(encrypted_email, ["guest"]) + + if "guest" in role: + return "unauthorized" + + return role async def get_user_info_from_cookie(request: Request): @@ -146,6 +163,11 @@ async def login_page(request: Request): # return response +@app.get("/unauthorized", response_class=HTMLResponse) +async def unauthorized(request: Request): + return templates.TemplateResponse("unauthorized.html", {"request": request}) + + @app.get("/login/google") async def login_google(request: Request): # Clear any existing session cookies to avoid conflicts with guest sessions @@ -176,6 +198,9 @@ async def auth_google(request: Request): profile_image = user_info.get("picture", "") role = get_user_role(email) + if role == "unauthorized": + return RedirectResponse("/unauthorized") + session_token = secrets.token_hex(16) session_store[session_token] = { "email": email, @@ -210,7 +235,7 @@ async def cooldown(request: Request): user_details = await get_user_details(user_info["email"]) current_datetime = get_time() cooldown, cooldown_end_time = await check_user_cooldown( - user_details, current_datetime + user_details, current_datetime, COOLDOWN_TIME, TOKENS_LEFT, REGEN_TIME ) print(f"User in cooldown: {cooldown}") print(f"Cooldown end time: 
{cooldown_end_time}") @@ -228,7 +253,11 @@ async def cooldown(request: Request): else: user_details.metadata["in_cooldown"] = False await update_user_info(user_details) - await reset_tokens_for_user(user_details) + await reset_tokens_for_user( + user_details, + config["token_config"]["tokens_left"], + config["token_config"]["regen_time"], + ) return RedirectResponse("/post-signin") @@ -256,13 +285,19 @@ async def post_signin(request: Request): if "last_message_time" in user_details.metadata and "admin" not in get_user_role( user_info["email"] ): - cooldown, _ = await check_user_cooldown(user_details, current_datetime) + cooldown, _ = await check_user_cooldown( + user_details, current_datetime, COOLDOWN_TIME, TOKENS_LEFT, REGEN_TIME + ) if cooldown: user_details.metadata["in_cooldown"] = True return RedirectResponse("/cooldown") else: user_details.metadata["in_cooldown"] = False - await reset_tokens_for_user(user_details) + await reset_tokens_for_user( + user_details, + config["token_config"]["tokens_left"], + config["token_config"]["regen_time"], + ) if user_info: username = user_info["email"] @@ -335,7 +370,11 @@ async def get_tokens_left(request: Request): try: user_info = await get_user_info_from_cookie(request) user_details = await get_user_details(user_info["email"]) - await reset_tokens_for_user(user_details) + await reset_tokens_for_user( + user_details, + config["token_config"]["tokens_left"], + config["token_config"]["regen_time"], + ) tokens_left = user_details.metadata["tokens_left"] return {"tokens_left": tokens_left} except Exception as e: @@ -343,7 +382,7 @@ async def get_tokens_left(request: Request): return {"tokens_left": 0} -mount_chainlit(app=app, target="main.py", path=CHAINLIT_PATH) +mount_chainlit(app=app, target="chainlit_app.py", path=CHAINLIT_PATH) if __name__ == "__main__": import uvicorn diff --git a/code/main.py b/apps/ai_tutor/chainlit_app.py similarity index 86% rename from code/main.py rename to apps/ai_tutor/chainlit_app.py index 
e520efa9a0a2f6b85084978f4b05d1c336beefd4..800b6a5a2428cb39119f728da77c25441b5a364b 100644 --- a/code/main.py +++ b/apps/ai_tutor/chainlit_app.py @@ -1,12 +1,11 @@ import chainlit.data as cl_data import asyncio -from modules.config.constants import ( +from config.constants import ( LITERAL_API_KEY_LOGGING, LITERAL_API_URL, ) from modules.chat_processor.literal_ai import CustomLiteralDataLayer import json -import yaml from typing import Any, Dict, no_type_check import chainlit as cl from modules.chat.llm_tutor import LLMTutor @@ -14,22 +13,24 @@ from modules.chat.helpers import ( get_sources, get_history_chat_resume, get_history_setup_llm, - get_last_config, + # get_last_config, ) from modules.chat_processor.helpers import ( update_user_info, - get_time, + get_user_details, +) +from helpers import ( check_user_cooldown, reset_tokens_for_user, - get_user_details, ) +from helpers import get_time import copy from typing import Optional from chainlit.types import ThreadDict -import time import base64 from langchain_community.callbacks import get_openai_callback from datetime import datetime, timezone +from config.config_manager import config_manager USER_TIMEOUT = 60_000 SYSTEM = "System" @@ -38,8 +39,8 @@ AGENT = "Agent" YOU = "User" ERROR = "Error" -with open("modules/config/config.yml", "r") as f: - config = yaml.safe_load(f) +# set config +config = config_manager.get_config().dict() async def setup_data_layer(): @@ -81,13 +82,6 @@ class Chatbot: """ self.config = config - async def _load_config(self): - """ - Load the configuration from a YAML file. - """ - with open("modules/config/config.yml", "r") as f: - return yaml.safe_load(f) - @no_type_check async def setup_llm(self): """ @@ -95,7 +89,6 @@ class Chatbot: #TODO: Clean this up. 
""" - start_time = time.time() llm_settings = cl.user_session.get("llm_settings", {}) ( @@ -143,8 +136,6 @@ class Chatbot: cl.user_session.set("chain", self.chain) cl.user_session.set("llm_tutor", self.llm_tutor) - print("Time taken to setup LLM: ", time.time() - start_time) - @no_type_check async def update_llm(self, new_settings: Dict[str, Any]): """ @@ -227,32 +218,9 @@ class Chatbot: """ Inform the user about the updated LLM settings and display them as a message. """ - llm_settings: Dict[str, Any] = cl.user_session.get("llm_settings", {}) - llm_tutor = cl.user_session.get("llm_tutor") - settings_dict = { - "model": llm_settings.get("chat_model"), - "retriever": llm_settings.get("retriever_method"), - "memory_window": llm_settings.get("memory_window"), - "num_docs_in_db": ( - len(llm_tutor.vector_db) - if llm_tutor and hasattr(llm_tutor, "vector_db") - else 0 - ), - "view_sources": llm_settings.get("view_sources"), - "follow_up_questions": llm_settings.get("follow_up_questions"), - } - print("Settings Dict: ", settings_dict) await cl.Message( author=SYSTEM, content="LLM settings have been updated. You can continue with your Query!", - # elements=[ - # cl.Text( - # name="settings", - # display="side", - # content=json.dumps(settings_dict, indent=4), - # language="json", - # ), - # ], ).send() async def set_starters(self): @@ -271,24 +239,24 @@ class Chatbot: print(e) return [ cl.Starter( - label="recording on CNNs?", + label="recording on Transformers?", message="Where can I find the recording for the lecture on Transformers?", - icon="/public/adv-screen-recorder-svgrepo-com.svg", + icon="/public/assets/images/starter_icons/adv-screen-recorder-svgrepo-com.svg", ), cl.Starter( - label="where's the slides?", + label="where's the schedule?", message="When are the lectures? 
I can't find the schedule.", - icon="/public/alarmy-svgrepo-com.svg", + icon="/public/assets/images/starter_icons/alarmy-svgrepo-com.svg", ), cl.Starter( label="Due Date?", message="When is the final project due?", - icon="/public/calendar-samsung-17-svgrepo-com.svg", + icon="/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg", ), cl.Starter( label="Explain backprop.", message="I didn't understand the math behind backprop, could you explain it?", - icon="/public/acastusphoton-svgrepo-com.svg", + icon="/public/assets/images/starter_icons/acastusphoton-svgrepo-com.svg", ), ] @@ -305,18 +273,12 @@ class Chatbot: rename_dict = {"Chatbot": LLM} return rename_dict.get(orig_author, orig_author) - async def start(self, config=None): + async def start(self): """ Start the chatbot, initialize settings widgets, and display and load previous conversation if chat logging is enabled. """ - start_time = time.time() - - self.config = ( - await self._load_config() if config is None else config - ) # Reload the configuration on chat resume - await self.make_llm_settings_widgets(self.config) # Reload the settings widgets user = cl.user_session.get("user") @@ -344,8 +306,6 @@ class Chatbot: cl.user_session.set("llm_tutor", self.llm_tutor) cl.user_session.set("chain", self.chain) - print("Time taken to start LLM: ", time.time() - start_time) - async def stream_response(self, response): """ Stream the response from the LLM. @@ -376,8 +336,6 @@ class Chatbot: message: The incoming chat message. 
""" - start_time = time.time() - chain = cl.user_session.get("chain") token_count = 0 # initialize token count if not chain: @@ -386,19 +344,25 @@ class Chatbot: # update user info with last message time user = cl.user_session.get("user") - await reset_tokens_for_user(user) + await reset_tokens_for_user( + user, + self.config["token_config"]["tokens_left"], + self.config["token_config"]["regen_time"], + ) updated_user = await get_user_details(user.identifier) user.metadata = updated_user.metadata cl.user_session.set("user", user) - print("\n\n User Tokens Left: ", user.metadata["tokens_left"]) - # see if user has token credits left # if not, return message saying they have run out of tokens if user.metadata["tokens_left"] <= 0 and "admin" not in user.metadata["role"]: current_datetime = get_time() cooldown, cooldown_end_time = await check_user_cooldown( - user, current_datetime + user, + current_datetime, + self.config["token_config"]["cooldown_time"], + self.config["token_config"]["tokens_left"], + self.config["token_config"]["regen_time"], ) if cooldown: # get time left in cooldown @@ -479,12 +443,9 @@ class Chatbot: ) answer_with_sources = answer_with_sources.replace("$$", "$") - print("Time taken to process the message: ", time.time() - start_time) - actions = [] if self.config["llm_params"]["generate_follow_up"]: - start_time = time.time() cb_follow_up = cl.AsyncLangchainCallbackHandler() config = { "callbacks": ( @@ -514,8 +475,6 @@ class Chatbot: ) ) - print("Time taken to generate questions: ", time.time() - start_time) - # # update user info with token count tokens_left = await update_user_from_chainlit(user, token_count) @@ -530,25 +489,23 @@ class Chatbot: elements=source_elements, author=LLM, actions=actions, - metadata=self.config, ).send() async def on_chat_resume(self, thread: ThreadDict): - thread_config = None + # thread_config = None steps = thread["steps"] k = self.config["llm_params"][ "memory_window" ] # on resume, alwyas use the default memory 
window conversation_list = get_history_chat_resume(steps, k, SYSTEM, LLM) - thread_config = get_last_config( - steps - ) # TODO: Returns None for now - which causes config to be reloaded with default values + # thread_config = get_last_config( + # steps + # ) # TODO: Returns None for now - which causes config to be reloaded with default values cl.user_session.set("memory", conversation_list) - await self.start(config=thread_config) + await self.start() @cl.header_auth_callback def header_auth_callback(headers: dict) -> Optional[cl.User]: - print("\n\n\nI am here\n\n\n") # try: # TODO: Add try-except block after testing # TODO: Implement to get the user information from the headers (not the cookie) cookie = headers.get("cookie") # gets back a str @@ -564,10 +521,6 @@ class Chatbot: ).decode() decoded_user_info = json.loads(decoded_user_info) - print( - f"\n\n USER ROLE: {decoded_user_info['literalai_info']['metadata']['role']} \n\n" - ) - return cl.User( id=decoded_user_info["literalai_info"]["id"], identifier=decoded_user_info["literalai_info"]["identifier"], diff --git a/code/modules/config/config.yml b/apps/ai_tutor/config/config.yml similarity index 90% rename from code/modules/config/config.yml rename to apps/ai_tutor/config/config.yml index 3cdc2581f40daac5c156c5adcf5db213a4f51178..eed8ee7b9bf5e63c79f94af8116656ba48843325 100644 --- a/code/modules/config/config.yml +++ b/apps/ai_tutor/config/config.yml @@ -1,15 +1,15 @@ -log_dir: '../storage/logs' # str -log_chunk_dir: '../storage/logs/chunks' # str +log_dir: 'storage/logs' # str +log_chunk_dir: 'storage/logs/chunks' # str device: 'cpu' # str [cuda, cpu] vectorstore: load_from_HF: True # bool reparse_files: True # bool - data_path: '../storage/data' # str - url_file_path: '../storage/data/urls.txt' # str + data_path: 'storage/data' # str + url_file_path: 'storage/data/urls.txt' # str expand_urls: True # bool db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille, RAPTOR] - db_path : '../vectorstores' # 
str + db_path : 'vectorstores' # str model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002'] search_top_k : 3 # int score_threshold : 0.2 # float diff --git a/apps/ai_tutor/config/config_manager.py b/apps/ai_tutor/config/config_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..df540f9d82c05eef3809ac92421eade5a8ff0516 --- /dev/null +++ b/apps/ai_tutor/config/config_manager.py @@ -0,0 +1,189 @@ +from pydantic import BaseModel, conint, confloat, HttpUrl +from typing import Optional, List +import yaml + + +class FaissParams(BaseModel): + index_path: str = "vectorstores/faiss.index" + index_type: str = "Flat" # Options: [Flat, HNSW, IVF] + index_dimension: conint(gt=0) = 384 + index_nlist: conint(gt=0) = 100 + index_nprobe: conint(gt=0) = 10 + + +class ColbertParams(BaseModel): + index_name: str = "new_idx" + + +class VectorStoreConfig(BaseModel): + load_from_HF: bool = True + reparse_files: bool = True + data_path: str = "storage/data" + url_file_path: str = "storage/data/urls.txt" + expand_urls: bool = True + db_option: str = "RAGatouille" # Options: [FAISS, Chroma, RAGatouille, RAPTOR] + db_path: str = "vectorstores" + model: str = ( + # Options: [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002] + "sentence-transformers/all-MiniLM-L6-v2" + ) + search_top_k: conint(gt=0) = 3 + score_threshold: confloat(ge=0.0, le=1.0) = 0.2 + + faiss_params: Optional[FaissParams] = None + colbert_params: Optional[ColbertParams] = None + + +class OpenAIParams(BaseModel): + temperature: confloat(ge=0.0, le=1.0) = 0.7 + + +class LocalLLMParams(BaseModel): + temperature: confloat(ge=0.0, le=1.0) = 0.7 + repo_id: str = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" # HuggingFace repo id + filename: str = ( + "tinyllama-1.1b-chat-v1.0.Q5_0.gguf" # Specific name of gguf file in the repo + ) + model_path: str = ( + "storage/models/tinyllama-1.1b-chat-v1.0.Q5_0.gguf" # Path to the model file 
+ ) + + +class LLMParams(BaseModel): + llm_arch: str = "langchain" # Options: [langchain] + use_history: bool = True + generate_follow_up: bool = False + memory_window: conint(ge=1) = 3 + llm_style: str = "Normal" # Options: [Normal, ELI5] + llm_loader: str = ( + "gpt-4o-mini" # Options: [local_llm, gpt-3.5-turbo-1106, gpt-4, gpt-4o-mini] + ) + openai_params: Optional[OpenAIParams] = None + local_llm_params: Optional[LocalLLMParams] = None + stream: bool = False + pdf_reader: str = "gpt" # Options: [llama, pymupdf, gpt] + + +class ChatLoggingConfig(BaseModel): + log_chat: bool = True + platform: str = "literalai" + callbacks: bool = True + + +class SplitterOptions(BaseModel): + use_splitter: bool = True + split_by_token: bool = True + remove_leftover_delimiters: bool = True + remove_chunks: bool = False + chunking_mode: str = "semantic" # Options: [fixed, semantic] + chunk_size: conint(gt=0) = 300 + chunk_overlap: conint(ge=0) = 30 + chunk_separators: List[str] = ["\n\n", "\n", " ", ""] + front_chunks_to_remove: Optional[conint(ge=0)] = None + last_chunks_to_remove: Optional[conint(ge=0)] = None + delimiters_to_remove: List[str] = ["\t", "\n", " ", " "] + + +class RetrieverConfig(BaseModel): + retriever_hf_paths: dict[str, str] = {"RAGatouille": "XThomasBU/Colbert_Index"} + + +class MetadataConfig(BaseModel): + metadata_links: List[HttpUrl] = [ + "https://dl4ds.github.io/sp2024/lectures/", + "https://dl4ds.github.io/sp2024/schedule/", + ] + slide_base_link: HttpUrl = "https://dl4ds.github.io" + + +class TokenConfig(BaseModel): + cooldown_time: conint(gt=0) = 60 + regen_time: conint(gt=0) = 180 + tokens_left: conint(gt=0) = 2000 + all_time_tokens_allocated: conint(gt=0) = 1000000 + + +class MiscConfig(BaseModel): + github_repo: HttpUrl = "https://github.com/DL4DS/dl4ds_tutor" + docs_website: HttpUrl = "https://dl4ds.github.io/dl4ds_tutor/" + + +class APIConfig(BaseModel): + timeout: conint(gt=0) = 60 + + +class Config(BaseModel): + log_dir: str = "storage/logs" + 
log_chunk_dir: str = "storage/logs/chunks" + device: str = "cpu" # Options: ['cuda', 'cpu'] + + vectorstore: VectorStoreConfig + llm_params: LLMParams + chat_logging: ChatLoggingConfig + splitter_options: SplitterOptions + retriever: RetrieverConfig + metadata: MetadataConfig + token_config: TokenConfig + misc: MiscConfig + api_config: APIConfig + + +class ConfigManager: + def __init__(self, config_path: str, project_config_path: str): + self.config_path = config_path + self.project_config_path = project_config_path + self.config = self.load_config() + self.validate_config() + + def load_config(self) -> Config: + with open(self.config_path, "r") as f: + config_data = yaml.safe_load(f) + + with open(self.project_config_path, "r") as f: + project_config_data = yaml.safe_load(f) + + # Merge the two configurations + merged_config = {**config_data, **project_config_data} + + return Config(**merged_config) + + def get_config(self) -> Config: + return ConfigWrapper(self.config) + + def validate_config(self): + # If any required fields are missing, raise an error + # required_fields = [ + # "vectorstore", "llm_params", "chat_logging", "splitter_options", + # "retriever", "metadata", "token_config", "misc", "api_config" + # ] + # for field in required_fields: + # if not hasattr(self.config, field): + # raise ValueError(f"Missing required configuration field: {field}") + + # # Validate types of specific fields + # if not isinstance(self.config.vectorstore, VectorStoreConfig): + # raise TypeError("vectorstore must be an instance of VectorStoreConfig") + # if not isinstance(self.config.llm_params, LLMParams): + # raise TypeError("llm_params must be an instance of LLMParams") + pass + + +class ConfigWrapper: + def __init__(self, config: Config): + self._config = config + + def __getitem__(self, key): + return getattr(self._config, key) + + def __getattr__(self, name): + return getattr(self._config, name) + + def dict(self): + return self._config.dict() + + +# Usage 
+config_manager = ConfigManager( + config_path="config/config.yml", project_config_path="config/project_config.yml" +) +# config = config_manager.get_config().dict() diff --git a/code/modules/config/constants.py b/apps/ai_tutor/config/constants.py similarity index 80% rename from code/modules/config/constants.py rename to apps/ai_tutor/config/constants.py index c22b905ba6f720ee0fa8ad5bb7eb68509068bfc3..506d0afc61c05719371979ba0fa60e491c72593f 100644 --- a/code/modules/config/constants.py +++ b/apps/ai_tutor/config/constants.py @@ -3,15 +3,6 @@ import os load_dotenv() -TIMEOUT = 60 -COOLDOWN_TIME = 60 -REGEN_TIME = 180 -TOKENS_LEFT = 2000 -ALL_TIME_TOKENS_ALLOCATED = 1000000 - -GITHUB_REPO = "https://github.com/DL4DS/dl4ds_tutor" -DOCS_WEBSITE = "https://dl4ds.github.io/dl4ds_tutor/" - # API Keys - Loaded from the .env file OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") @@ -20,6 +11,7 @@ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") LITERAL_API_KEY_LOGGING = os.getenv("LITERAL_API_KEY_LOGGING") LITERAL_API_URL = os.getenv("LITERAL_API_URL") CHAINLIT_URL = os.getenv("CHAINLIT_URL") +EMAIL_ENCRYPTION_KEY = os.getenv("EMAIL_ENCRYPTION_KEY") OAUTH_GOOGLE_CLIENT_ID = os.getenv("OAUTH_GOOGLE_CLIENT_ID") OAUTH_GOOGLE_CLIENT_SECRET = os.getenv("OAUTH_GOOGLE_CLIENT_SECRET") diff --git a/apps/ai_tutor/config/project_config.yml b/apps/ai_tutor/config/project_config.yml new file mode 100644 index 0000000000000000000000000000000000000000..cc593aed17ac55608408302c18ed84129bc5efe3 --- /dev/null +++ b/apps/ai_tutor/config/project_config.yml @@ -0,0 +1,20 @@ +retriever: + retriever_hf_paths: + RAGatouille: "XThomasBU/Colbert_Index" + +metadata: + metadata_links: ["https://dl4ds.github.io/sp2024/lectures/", "https://dl4ds.github.io/sp2024/schedule/"] + slide_base_link: "https://dl4ds.github.io" + +token_config: + cooldown_time: 60 + regen_time: 180 + tokens_left: 2000 + all_time_tokens_allocated: 1000000 + +misc: + github_repo: "https://github.com/DL4DS/dl4ds_tutor" + 
docs_website: "https://dl4ds.github.io/dl4ds_tutor/" + +api_config: + timeout: 60 diff --git a/code/modules/config/prompts.py b/apps/ai_tutor/config/prompts.py similarity index 100% rename from code/modules/config/prompts.py rename to apps/ai_tutor/config/prompts.py diff --git a/apps/ai_tutor/encrypt_students.py b/apps/ai_tutor/encrypt_students.py new file mode 100644 index 0000000000000000000000000000000000000000..1eccf5c89a57497aaa6b22549eeef092cd0c9d80 --- /dev/null +++ b/apps/ai_tutor/encrypt_students.py @@ -0,0 +1,53 @@ +import os +import hashlib +import json +import argparse +from dotenv import load_dotenv + + +# Function to deterministically hash emails +def deterministic_hash(email, salt): + return hashlib.pbkdf2_hmac("sha256", email.encode(), salt, 100000).hex() + + +def main(args): + # Load the .env file + load_dotenv() + + # Get the encryption key (salt) + encryption_salt = os.getenv("EMAIL_ENCRYPTION_KEY").encode() + + # Load emails from the specified JSON file + with open(args.students_file, "r") as file: + emails = json.load(file) + + # Replace emails with deterministic hashed emails, {hashed_email: [roles]} + hashed_emails = { + deterministic_hash(email, encryption_salt): roles + for email, roles in emails.items() + } + + # Save hashed emails to the specified encrypted JSON file + with open(args.encrypted_students_file, "w") as file: + json.dump(hashed_emails, file) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Encrypt student emails in a JSON file." 
from datetime import datetime, timedelta, timezone
import tiktoken
from modules.chat_processor.helpers import update_user_info, convert_to_dict


def get_time():
    """Return the current UTC time as an ISO 8601 formatted string."""
    return datetime.now(timezone.utc).isoformat()


async def check_user_cooldown(
    user_info, current_time, COOLDOWN_TIME, TOKENS_LEFT, REGEN_TIME
):
    """Check whether a user must wait before starting a new session.

    Args:
        user_info: User record object; its ``metadata`` mapping holds the
            token and cooldown state.
        current_time: Current time as an ISO 8601 string.
        COOLDOWN_TIME: Cooldown duration in seconds.
        TOKENS_LEFT: Default/maximum token budget used when regenerating.
        REGEN_TIME: Time (seconds) for a full token regeneration.

    Returns:
        Tuple ``(in_cooldown, cooldown_end_time_iso)``: ``(True, <iso str>)``
        while the cooldown is still active, ``(False, None)`` otherwise.
    """
    # Users that still have tokens and are not flagged as cooling down
    # skip the cooldown check entirely.
    tokens_left = user_info.metadata.get("tokens_left", 0)
    if tokens_left > 0 and not user_info.metadata.get("in_cooldown", False):
        return False, None

    user_info = convert_to_dict(user_info)
    last_message_time_str = user_info["metadata"].get("last_message_time")
    if last_message_time_str is None:
        # No recorded activity to cool down from; previously this raised a
        # TypeError inside datetime.fromisoformat(None).
        return False, None

    # Parse ISO strings into timezone-aware datetimes, forcing UTC. NOTE:
    # .replace(tzinfo=...) relabels rather than converts; this is correct
    # only because get_time() always produces UTC timestamps.
    last_message_time = datetime.fromisoformat(last_message_time_str).replace(
        tzinfo=timezone.utc
    )
    current_time = datetime.fromisoformat(current_time).replace(tzinfo=timezone.utc)

    elapsed_time_in_seconds = (current_time - last_message_time).total_seconds()

    # When the cooldown period ends (reported back to the caller in ISO form).
    cooldown_end_time = last_message_time + timedelta(seconds=COOLDOWN_TIME)

    if elapsed_time_in_seconds < COOLDOWN_TIME:
        return True, cooldown_end_time.isoformat()

    # Cooldown has elapsed: clear the flag and regenerate tokens.
    # NOTE(review): the cleared flag is only persisted via the
    # update_user_info() call inside reset_tokens_for_user -- confirm that
    # convert_to_dict() is a no-op on plain dicts so the flag survives.
    user_info["metadata"]["in_cooldown"] = False
    await reset_tokens_for_user(user_info, TOKENS_LEFT, REGEN_TIME)

    return False, None


async def reset_tokens_for_user(user_info, TOKENS_LEFT, REGEN_TIME):
    """Regenerate a user's tokens proportionally to idle time and persist.

    Tokens regenerate linearly so that an empty budget refills completely in
    ``REGEN_TIME`` seconds; a partially full budget refills at a slower rate.

    Args:
        user_info: User record (object or dict) with token metadata.
        TOKENS_LEFT: Default/maximum token budget.
        REGEN_TIME: Time (seconds) for a full regeneration from zero.
    """
    user_info = convert_to_dict(user_info)
    last_message_time_str = user_info["metadata"].get("last_message_time")
    if last_message_time_str is None:
        # Without a last-message timestamp there is no elapsed time to
        # regenerate from; leave the record untouched.
        return

    last_message_time = datetime.fromisoformat(last_message_time_str).replace(
        tzinfo=timezone.utc
    )
    current_time = datetime.fromisoformat(get_time()).replace(tzinfo=timezone.utc)

    # Elapsed time since the last message.
    elapsed_time_in_seconds = (current_time - last_message_time).total_seconds()

    # Current token count (may be negative); never above the default budget.
    current_tokens = user_info["metadata"].get("tokens_left_at_last_message", 0)
    current_tokens = min(current_tokens, TOKENS_LEFT)

    # Maximum tokens that can be regenerated.
    max_tokens = user_info["metadata"].get("max_tokens", TOKENS_LEFT)

    if current_tokens < max_tokens:
        # Linear rate chosen so a fully drained budget refills in REGEN_TIME;
        # the closer current_tokens is to max_tokens, the slower the refill.
        regeneration_rate_per_second = (
            max_tokens - max(current_tokens, 0)
        ) / REGEN_TIME

        # Tokens earned during the elapsed time, truncated to an integer.
        tokens_to_regenerate = int(
            elapsed_time_in_seconds * regeneration_rate_per_second
        )

        # Never exceed the maximum budget.
        new_token_count = min(current_tokens + tokens_to_regenerate, max_tokens)

        user_info["metadata"]["tokens_left"] = new_token_count

    # Persist the (possibly unchanged) record, mirroring the original flow.
    await update_user_info(user_info)


def get_num_tokens(text, model):
    """Return the number of tokens in ``text`` for the given OpenAI model."""
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    return len(tokens)
a/apps/ai_tutor/private/placeholder_students_file.json b/apps/ai_tutor/private/placeholder_students_file.json new file mode 100644 index 0000000000000000000000000000000000000000..61aeed079120abbb24c58aeafbfa317374fd7309 --- /dev/null +++ b/apps/ai_tutor/private/placeholder_students_file.json @@ -0,0 +1,5 @@ +{ + "abc@bu.edu": ["student", "bu"], + "xyz@bu.edu": ["student", "bu"], + "test@bu.edu": ["admin", "instructor", "bu"] +} \ No newline at end of file diff --git a/code/public/avatars/ai_tutor.png b/apps/ai_tutor/public/assets/images/avatars/ai-tutor.png similarity index 100% rename from code/public/avatars/ai_tutor.png rename to apps/ai_tutor/public/assets/images/avatars/ai-tutor.png diff --git a/code/public/logo_dark.png b/apps/ai_tutor/public/assets/images/avatars/ai_tutor.png similarity index 100% rename from code/public/logo_dark.png rename to apps/ai_tutor/public/assets/images/avatars/ai_tutor.png diff --git a/code/public/acastusphoton-svgrepo-com.svg b/apps/ai_tutor/public/assets/images/starter_icons/acastusphoton-svgrepo-com.svg similarity index 100% rename from code/public/acastusphoton-svgrepo-com.svg rename to apps/ai_tutor/public/assets/images/starter_icons/acastusphoton-svgrepo-com.svg diff --git a/code/public/adv-screen-recorder-svgrepo-com.svg b/apps/ai_tutor/public/assets/images/starter_icons/adv-screen-recorder-svgrepo-com.svg similarity index 100% rename from code/public/adv-screen-recorder-svgrepo-com.svg rename to apps/ai_tutor/public/assets/images/starter_icons/adv-screen-recorder-svgrepo-com.svg diff --git a/code/public/alarmy-svgrepo-com.svg b/apps/ai_tutor/public/assets/images/starter_icons/alarmy-svgrepo-com.svg similarity index 100% rename from code/public/alarmy-svgrepo-com.svg rename to apps/ai_tutor/public/assets/images/starter_icons/alarmy-svgrepo-com.svg diff --git a/code/public/calendar-samsung-17-svgrepo-com.svg b/apps/ai_tutor/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg similarity index 100% rename 
from code/public/calendar-samsung-17-svgrepo-com.svg rename to apps/ai_tutor/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg diff --git a/apps/ai_tutor/public/files/students_encrypted.json b/apps/ai_tutor/public/files/students_encrypted.json new file mode 100644 index 0000000000000000000000000000000000000000..4a337d5d0de26119e3d7956b48d774c1b792b4d1 --- /dev/null +++ b/apps/ai_tutor/public/files/students_encrypted.json @@ -0,0 +1 @@ +{"0645db6f7b415e3b04a4fc327151c3c7bbcd25ec546ee0b3604957b571a79bc2": ["instructor", "bu"], "51ebf87ac51618300acfef8bfa9768fdee40e2d3f39cfb4ae8a76722ee336de4": ["admin", "instructor", "bu"], "7810b25bef84317130e2a59da978ee716bb96f6a8a9296c051b7ad4108aa8e6a": ["instructor", "bu"], "a95f36e2700c554639d3522834b47733f5ed1f05c5a43d04ac2575571dd43563": ["student", "bu"]} \ No newline at end of file diff --git a/apps/ai_tutor/public/files/test.css b/apps/ai_tutor/public/files/test.css new file mode 100644 index 0000000000000000000000000000000000000000..dc4787b22a872e4050074bb2854632dd4b0b9e80 --- /dev/null +++ b/apps/ai_tutor/public/files/test.css @@ -0,0 +1,32 @@ +a[href*='https://github.com/Chainlit/chainlit'] { + visibility: hidden; +} + +/* Hide the default avatar image */ +.MuiAvatar-root img.MuiAvatar-img { + display: none; + } + +/* Target the container of the image and set a custom background image */ +.MuiAvatar-root.MuiAvatar-circular.css-m2icte { + background-image: url('/public/assets/images/avatars/ai-tutor.png'); /* Replace with your custom image URL */ + background-size: cover; /* Ensure the image covers the entire container */ + background-position: center; /* Center the image */ + width: 100px; /* Ensure the dimensions match the original */ + height: 100px; /* Ensure the dimensions match the original */ + border-radius: 50%; /* Maintain circular shape */ +} +.MuiAvatar-root.MuiAvatar-circular.css-v72an7 { + background-image: url('/public/assets/images/avatars/ai-tutor.png'); /* Replace with your custom 
image URL */ + background-size: cover; /* Ensure the image covers the entire container */ + background-position: center; /* Center the image */ + width: 40px; /* Ensure the dimensions match the original */ + height: 40px; /* Ensure the dimensions match the original */ + border-radius: 50%; /* Maintain circular shape */ +} + +.MuiStack-root.css-14k6mw7 img { + content: url('/public/assets/images/avatars/ai-tutor.png'); /* Replace with the path to your custom image */ + max-height: 45px; /* Ensure the height remains consistent */ + max-width: 45px; /* Ensure the width remains consistent */ +} \ No newline at end of file diff --git a/code/public/logo_light.png b/apps/ai_tutor/public/logo_dark.png similarity index 100% rename from code/public/logo_light.png rename to apps/ai_tutor/public/logo_dark.png diff --git a/apps/ai_tutor/public/logo_light.png b/apps/ai_tutor/public/logo_light.png new file mode 100644 index 0000000000000000000000000000000000000000..1b9e56d72a9738e8cd4838bfb7dce31fc92cd9ee Binary files /dev/null and b/apps/ai_tutor/public/logo_light.png differ diff --git a/storage/data/urls.txt b/apps/ai_tutor/storage/data/urls.txt similarity index 100% rename from storage/data/urls.txt rename to apps/ai_tutor/storage/data/urls.txt diff --git a/code/templates/cooldown.html b/apps/ai_tutor/templates/cooldown.html similarity index 98% rename from code/templates/cooldown.html rename to apps/ai_tutor/templates/cooldown.html index 099df21662c15b39d44cfa84f7dfd740f15d396c..a735a21a1b2e47d02b7cd0081102f29c588ad98c 100644 --- a/code/templates/cooldown.html +++ b/apps/ai_tutor/templates/cooldown.html @@ -121,7 +121,7 @@
- AI Tutor Avatar + AI Tutor Avatar

Hello, {{ username }}

It seems like you need to wait a bit before starting a new session.

Time remaining until the cooldown period ends:

diff --git a/code/templates/dashboard.html b/apps/ai_tutor/templates/dashboard.html similarity index 97% rename from code/templates/dashboard.html rename to apps/ai_tutor/templates/dashboard.html index a06c54fc633417abc528baf0991035941a1e0f2e..24d67ab67109213e9faee076ca9f0ed3523d0ec6 100644 --- a/code/templates/dashboard.html +++ b/apps/ai_tutor/templates/dashboard.html @@ -123,7 +123,7 @@
- AI Tutor Avatar + AI Tutor Avatar

Welcome, {{ username }}

Ready to start your AI tutoring session?

Tokens Left: {{ tokens_left }}

diff --git a/code/templates/error.html b/apps/ai_tutor/templates/error.html similarity index 100% rename from code/templates/error.html rename to apps/ai_tutor/templates/error.html diff --git a/code/templates/error_404.html b/apps/ai_tutor/templates/error_404.html similarity index 100% rename from code/templates/error_404.html rename to apps/ai_tutor/templates/error_404.html diff --git a/code/templates/login.html b/apps/ai_tutor/templates/login.html similarity index 98% rename from code/templates/login.html rename to apps/ai_tutor/templates/login.html index 934dd32ff0726307299951529c625f0c678d906e..d9551f546070161c5af23182a6bb525adcb3541d 100644 --- a/code/templates/login.html +++ b/apps/ai_tutor/templates/login.html @@ -107,7 +107,7 @@
- AI Tutor Avatar + AI Tutor Avatar

Terrier Tutor

Welcome to the DS598 AI Tutor. Please sign in to continue.

diff --git a/code/templates/logout.html b/apps/ai_tutor/templates/logout.html similarity index 100% rename from code/templates/logout.html rename to apps/ai_tutor/templates/logout.html diff --git a/apps/ai_tutor/templates/unauthorized.html b/apps/ai_tutor/templates/unauthorized.html new file mode 100644 index 0000000000000000000000000000000000000000..423cc12cd2195bc9b44e352ba365d28eee7a1858 --- /dev/null +++ b/apps/ai_tutor/templates/unauthorized.html @@ -0,0 +1,94 @@ + + + + + + Access Restricted + + + +
+ AI Tutor Avatar +

Access Restricted

+

+ We're currently testing things out for the DS598 course. + Access is restricted to students of the course. If you're enrolled in DS598 and seeing this message, + please reach out to us, and we'll help you get access.

+ P.S. Don't forget to use your BU email when logging in! +

+ + + +
+ + diff --git a/apps/chainlit_base/.chainlit/config.toml b/apps/chainlit_base/.chainlit/config.toml new file mode 100644 index 0000000000000000000000000000000000000000..a91c4344a975334ac4a30c6f454ce17cdeb5fc09 --- /dev/null +++ b/apps/chainlit_base/.chainlit/config.toml @@ -0,0 +1,120 @@ +[project] +# Whether to enable telemetry (default: true). No personal data is collected. +enable_telemetry = true + + +# List of environment variables to be provided by each user to use the app. +user_env = [] + +# Duration (in seconds) during which the session is saved when the connection is lost +session_timeout = 3600 + +# Enable third parties caching (e.g LangChain cache) +cache = false + +# Authorized origins +allow_origins = ["*"] + +# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317) +# follow_symlink = false + +[features] +# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript) +unsafe_allow_html = false + +# Process and display mathematical expressions. This can clash with "$" characters in messages. +latex = false + +# Automatically tag threads with the current chat profile (if a chat profile is used) +auto_tag_thread = true + +# Authorize users to spontaneously upload files with messages +[features.spontaneous_file_upload] + enabled = true + accept = ["*/*"] + max_files = 20 + max_size_mb = 500 + +[features.audio] + # Threshold for audio recording + min_decibels = -45 + # Delay for the user to start speaking in MS + initial_silence_timeout = 3000 + # Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop. + silence_timeout = 1500 + # Above this duration (MS), the recording will forcefully stop. 
+ max_duration = 15000 + # Duration of the audio chunks in MS + chunk_duration = 1000 + # Sample rate of the audio + sample_rate = 44100 + +edit_message = true + +[UI] +# Name of the assistant. +name = "Assistant" + +# Description of the assistant. This is used for HTML tags. +# description = "" + +# Large size content are by default collapsed for a cleaner ui +default_collapse_content = true + +# Chain of Thought (CoT) display mode. Can be "hidden", "tool_call" or "full". +cot = "full" + +# Link to your github repo. This will add a github button in the UI's header. +# github = "" + +# Specify a CSS file that can be used to customize the user interface. +# The CSS file can be served from the public directory or via an external link. +custom_css = "/public/files/test.css" + +# Specify a Javascript file that can be used to customize the user interface. +# The Javascript file can be served from the public directory. +# custom_js = "/public/test.js" + +# Specify a custom font url. +# custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap" + +# Specify a custom meta image url. +# custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png" + +# Specify a custom build directory for the frontend. +# This can be used to customize the frontend code. +# Be careful: If this is a relative path, it should not start with a slash. +# custom_build = "./public/build" + +[UI.theme] + default = "dark" + #layout = "wide" + #font_family = "Inter, sans-serif" +# Override default MUI light theme. (Check theme.ts) +[UI.theme.light] + #background = "#FAFAFA" + #paper = "#FFFFFF" + + [UI.theme.light.primary] + #main = "#F80061" + #dark = "#980039" + #light = "#FFE7EB" + [UI.theme.light.text] + #primary = "#212121" + #secondary = "#616161" + +# Override default MUI dark theme. 
(Check theme.ts) +[UI.theme.dark] + #background = "#FAFAFA" + #paper = "#FFFFFF" + + [UI.theme.dark.primary] + #main = "#F80061" + #dark = "#980039" + #light = "#FFE7EB" + [UI.theme.dark.text] + #primary = "#EEEEEE" + #secondary = "#BDBDBD" + +[meta] +generated_by = "1.1.402" diff --git a/apps/chainlit_base/chainlit.md b/apps/chainlit_base/chainlit.md new file mode 100644 index 0000000000000000000000000000000000000000..4507ac4676a6387c4b52a0d1111e94753a102b32 --- /dev/null +++ b/apps/chainlit_base/chainlit.md @@ -0,0 +1,14 @@ +# Welcome to Chainlit! 🚀🤖 + +Hi there, Developer! 👋 We're excited to have you on board. Chainlit is a powerful tool designed to help you prototype, debug and share applications built on top of LLMs. + +## Useful Links 🔗 + +- **Documentation:** Get started with our comprehensive [Chainlit Documentation](https://docs.chainlit.io) 📚 +- **Discord Community:** Join our friendly [Chainlit Discord](https://discord.gg/k73SQ3FyUh) to ask questions, share your projects, and connect with other developers! 💬 + +We can't wait to see what you create with Chainlit! Happy coding! 💻😊 + +## Welcome screen + +To modify the welcome screen, edit the `chainlit.md` file at the root of your project. If you do not want a welcome screen, just leave this file empty. 
diff --git a/code/chainlit_base.py b/apps/chainlit_base/chainlit_base.py similarity index 70% rename from code/chainlit_base.py rename to apps/chainlit_base/chainlit_base.py index 82e8a1711c2615390d5d68a6ebb0d2898a1b0866..7fa537f5be273e6c10f0e93796531e2279217164 100644 --- a/code/chainlit_base.py +++ b/apps/chainlit_base/chainlit_base.py @@ -1,19 +1,15 @@ import chainlit.data as cl_data import asyncio -import yaml from typing import Any, Dict, no_type_check import chainlit as cl from modules.chat.llm_tutor import LLMTutor from modules.chat.helpers import ( get_sources, - get_history_chat_resume, get_history_setup_llm, - get_last_config, ) import copy -from chainlit.types import ThreadDict -import time from langchain_community.callbacks import get_openai_callback +from config.config_manager import config_manager USER_TIMEOUT = 60_000 SYSTEM = "System" @@ -22,22 +18,7 @@ AGENT = "Agent" YOU = "User" ERROR = "Error" -with open("modules/config/config.yml", "r") as f: - config = yaml.safe_load(f) - - -# async def setup_data_layer(): -# """ -# Set up the data layer for chat logging. -# """ -# if config["chat_logging"]["log_chat"]: -# data_layer = CustomLiteralDataLayer( -# api_key=LITERAL_API_KEY_LOGGING, server=LITERAL_API_URL -# ) -# else: -# data_layer = None - -# return data_layer +config = config_manager.get_config().dict() class Chatbot: @@ -47,13 +28,6 @@ class Chatbot: """ self.config = config - async def _load_config(self): - """ - Load the configuration from a YAML file. - """ - with open("modules/config/config.yml", "r") as f: - return yaml.safe_load(f) - @no_type_check async def setup_llm(self): """ @@ -61,7 +35,6 @@ class Chatbot: #TODO: Clean this up. 
""" - start_time = time.time() llm_settings = cl.user_session.get("llm_settings", {}) ( @@ -109,8 +82,6 @@ class Chatbot: cl.user_session.set("chain", self.chain) cl.user_session.set("llm_tutor", self.llm_tutor) - print("Time taken to setup LLM: ", time.time() - start_time) - @no_type_check async def update_llm(self, new_settings: Dict[str, Any]): """ @@ -193,70 +164,38 @@ class Chatbot: """ Inform the user about the updated LLM settings and display them as a message. """ - llm_settings: Dict[str, Any] = cl.user_session.get("llm_settings", {}) - llm_tutor = cl.user_session.get("llm_tutor") - settings_dict = { - "model": llm_settings.get("chat_model"), - "retriever": llm_settings.get("retriever_method"), - "memory_window": llm_settings.get("memory_window"), - "num_docs_in_db": ( - len(llm_tutor.vector_db) - if llm_tutor and hasattr(llm_tutor, "vector_db") - else 0 - ), - "view_sources": llm_settings.get("view_sources"), - "follow_up_questions": llm_settings.get("follow_up_questions"), - } - print("Settings Dict: ", settings_dict) await cl.Message( author=SYSTEM, content="LLM settings have been updated. You can continue with your Query!", - # elements=[ - # cl.Text( - # name="settings", - # display="side", - # content=json.dumps(settings_dict, indent=4), - # language="json", - # ), - # ], ).send() async def set_starters(self): """ Set starter messages for the chatbot. """ - # Return Starters only if the chat is new - - try: - thread = cl_data._data_layer.get_thread( - cl.context.session.thread_id - ) # see if the thread has any steps - if thread.steps or len(thread.steps) > 0: - return None - except Exception as e: - print(e) - return [ - cl.Starter( - label="recording on CNNs?", - message="Where can I find the recording for the lecture on Transformers?", - icon="/public/adv-screen-recorder-svgrepo-com.svg", - ), - cl.Starter( - label="where's the slides?", - message="When are the lectures? 
I can't find the schedule.", - icon="/public/alarmy-svgrepo-com.svg", - ), - cl.Starter( - label="Due Date?", - message="When is the final project due?", - icon="/public/calendar-samsung-17-svgrepo-com.svg", - ), - cl.Starter( - label="Explain backprop.", - message="I didn't understand the math behind backprop, could you explain it?", - icon="/public/acastusphoton-svgrepo-com.svg", - ), - ] + + return [ + cl.Starter( + label="recording on Transformers?", + message="Where can I find the recording for the lecture on Transformers?", + icon="/public/assets/images/starter_icons/adv-screen-recorder-svgrepo-com.svg", + ), + cl.Starter( + label="where's the slides?", + message="When are the lectures? I can't find the schedule.", + icon="/public/assets/images/starter_icons/alarmy-svgrepo-com.svg", + ), + cl.Starter( + label="Due Date?", + message="When is the final project due?", + icon="/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg", + ), + cl.Starter( + label="Explain backprop.", + message="I didn't understand the math behind backprop, could you explain it?", + icon="/public/assets/images/starter_icons/acastusphoton-svgrepo-com.svg", + ), + ] def rename(self, orig_author: str): """ @@ -271,34 +210,19 @@ class Chatbot: rename_dict = {"Chatbot": LLM} return rename_dict.get(orig_author, orig_author) - async def start(self, config=None): + async def start(self): """ Start the chatbot, initialize settings widgets, and display and load previous conversation if chat logging is enabled. 
""" - start_time = time.time() - - self.config = ( - await self._load_config() if config is None else config - ) # Reload the configuration on chat resume - await self.make_llm_settings_widgets(self.config) # Reload the settings widgets - user = cl.user_session.get("user") - # TODO: remove self.user with cl.user_session.get("user") - try: - self.user = { - "user_id": user.identifier, - "session_id": cl.context.session.thread_id, - } - except Exception as e: - print(e) - self.user = { - "user_id": "guest", - "session_id": cl.context.session.thread_id, - } + self.user = { + "user_id": "guest", + "session_id": cl.context.session.thread_id, + } memory = cl.user_session.get("memory", []) self.llm_tutor = LLMTutor(self.config, user=self.user) @@ -310,8 +234,6 @@ class Chatbot: cl.user_session.set("llm_tutor", self.llm_tutor) cl.user_session.set("chain", self.chain) - print("Time taken to start LLM: ", time.time() - start_time) - async def stream_response(self, response): """ Stream the response from the LLM. @@ -342,8 +264,6 @@ class Chatbot: message: The incoming chat message. 
""" - start_time = time.time() - chain = cl.user_session.get("chain") token_count = 0 # initialize token count if not chain: @@ -389,12 +309,9 @@ class Chatbot: ) answer_with_sources = answer_with_sources.replace("$$", "$") - print("Time taken to process the message: ", time.time() - start_time) - actions = [] if self.config["llm_params"]["generate_follow_up"]: - start_time = time.time() cb_follow_up = cl.AsyncLangchainCallbackHandler() config = { "callbacks": ( @@ -424,30 +341,13 @@ class Chatbot: ) ) - print("Time taken to generate questions: ", time.time() - start_time) - print("Total Tokens Used: ", token_count) - await cl.Message( content=answer_with_sources, elements=source_elements, author=LLM, actions=actions, - metadata=self.config, ).send() - async def on_chat_resume(self, thread: ThreadDict): - thread_config = None - steps = thread["steps"] - k = self.config["llm_params"][ - "memory_window" - ] # on resume, alwyas use the default memory window - conversation_list = get_history_chat_resume(steps, k, SYSTEM, LLM) - thread_config = get_last_config( - steps - ) # TODO: Returns None for now - which causes config to be reloaded with default values - cl.user_session.set("memory", conversation_list) - await self.start(config=thread_config) - async def on_follow_up(self, action: cl.Action): user = cl.user_session.get("user") message = await cl.Message( @@ -466,12 +366,9 @@ chatbot = Chatbot(config=config) async def start_app(): - # cl_data._data_layer = await setup_data_layer() - # chatbot.literal_client = cl_data._data_layer.client if cl_data._data_layer else None cl.set_starters(chatbot.set_starters) cl.author_rename(chatbot.rename) cl.on_chat_start(chatbot.start) - cl.on_chat_resume(chatbot.on_chat_resume) cl.on_message(chatbot.main) cl.on_settings_update(chatbot.update_llm) cl.action_callback("follow up question")(chatbot.on_follow_up) diff --git a/apps/chainlit_base/config/config.yml b/apps/chainlit_base/config/config.yml new file mode 100644 index 
0000000000000000000000000000000000000000..eed8ee7b9bf5e63c79f94af8116656ba48843325 --- /dev/null +++ b/apps/chainlit_base/config/config.yml @@ -0,0 +1,60 @@ +log_dir: 'storage/logs' # str +log_chunk_dir: 'storage/logs/chunks' # str +device: 'cpu' # str [cuda, cpu] + +vectorstore: + load_from_HF: True # bool + reparse_files: True # bool + data_path: 'storage/data' # str + url_file_path: 'storage/data/urls.txt' # str + expand_urls: True # bool + db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille, RAPTOR] + db_path : 'vectorstores' # str + model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002'] + search_top_k : 3 # int + score_threshold : 0.2 # float + + faiss_params: # Not used as of now + index_path: 'vectorstores/faiss.index' # str + index_type: 'Flat' # str [Flat, HNSW, IVF] + index_dimension: 384 # int + index_nlist: 100 # int + index_nprobe: 10 # int + + colbert_params: + index_name: "new_idx" # str + +llm_params: + llm_arch: 'langchain' # [langchain] + use_history: True # bool + generate_follow_up: False # bool + memory_window: 3 # int + llm_style: 'Normal' # str [Normal, ELI5] + llm_loader: 'gpt-4o-mini' # str [local_llm, gpt-3.5-turbo-1106, gpt-4, gpt-4o-mini] + openai_params: + temperature: 0.7 # float + local_llm_params: + temperature: 0.7 # float + repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id + filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo + model_path: 'storage/models/tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Path to the model file + stream: False # bool + pdf_reader: 'gpt' # str [llama, pymupdf, gpt] + +chat_logging: + log_chat: True # bool + platform: 'literalai' + callbacks: True # bool + +splitter_options: + use_splitter: True # bool + split_by_token : True # bool + remove_leftover_delimiters: True # bool + remove_chunks: False # bool + chunking_mode: 'semantic' # str [fixed, semantic] + chunk_size : 300 # int + 
from pydantic import BaseModel, conint, confloat, HttpUrl
from typing import Optional, List
import yaml


class FaissParams(BaseModel):
    """FAISS index parameters (not used as of now)."""

    index_path: str = "vectorstores/faiss.index"
    index_type: str = "Flat"  # Options: [Flat, HNSW, IVF]
    index_dimension: conint(gt=0) = 384
    index_nlist: conint(gt=0) = 100
    index_nprobe: conint(gt=0) = 10


class ColbertParams(BaseModel):
    """ColBERT (RAGatouille) index parameters."""

    index_name: str = "new_idx"


class VectorStoreConfig(BaseModel):
    """Vector store construction and retrieval settings."""

    load_from_HF: bool = True
    reparse_files: bool = True
    data_path: str = "storage/data"
    url_file_path: str = "storage/data/urls.txt"
    expand_urls: bool = True
    db_option: str = "RAGatouille"  # Options: [FAISS, Chroma, RAGatouille, RAPTOR]
    db_path: str = "vectorstores"
    model: str = (
        "sentence-transformers/all-MiniLM-L6-v2"  # Options: [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002]
    )
    search_top_k: conint(gt=0) = 3
    score_threshold: confloat(ge=0.0, le=1.0) = 0.2

    faiss_params: Optional[FaissParams] = None
    colbert_params: Optional[ColbertParams] = None


class OpenAIParams(BaseModel):
    """Parameters for hosted OpenAI models."""

    temperature: confloat(ge=0.0, le=1.0) = 0.7


class LocalLLMParams(BaseModel):
    """Parameters for a locally hosted gguf model."""

    temperature: confloat(ge=0.0, le=1.0) = 0.7
    repo_id: str = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"  # HuggingFace repo id
    filename: str = (
        "tinyllama-1.1b-chat-v1.0.Q5_0.gguf"  # Specific name of gguf file in the repo
    )
    model_path: str = (
        "storage/models/tinyllama-1.1b-chat-v1.0.Q5_0.gguf"  # Path to the model file
    )


class LLMParams(BaseModel):
    """LLM pipeline settings."""

    llm_arch: str = "langchain"  # Options: [langchain]
    use_history: bool = True
    generate_follow_up: bool = False
    memory_window: conint(ge=1) = 3
    llm_style: str = "Normal"  # Options: [Normal, ELI5]
    llm_loader: str = (
        "gpt-4o-mini"  # Options: [local_llm, gpt-3.5-turbo-1106, gpt-4, gpt-4o-mini]
    )
    openai_params: Optional[OpenAIParams] = None
    local_llm_params: Optional[LocalLLMParams] = None
    stream: bool = False
    pdf_reader: str = "gpt"  # Options: [llama, pymupdf, gpt]


class ChatLoggingConfig(BaseModel):
    """Chat transcript logging settings."""

    log_chat: bool = True
    platform: str = "literalai"
    callbacks: bool = True


class SplitterOptions(BaseModel):
    """Document chunking/splitting settings."""

    use_splitter: bool = True
    split_by_token: bool = True
    remove_leftover_delimiters: bool = True
    remove_chunks: bool = False
    chunking_mode: str = "semantic"  # Options: [fixed, semantic]
    chunk_size: conint(gt=0) = 300
    chunk_overlap: conint(ge=0) = 30
    chunk_separators: List[str] = ["\n\n", "\n", " ", ""]
    front_chunks_to_remove: Optional[conint(ge=0)] = None
    last_chunks_to_remove: Optional[conint(ge=0)] = None
    delimiters_to_remove: List[str] = ["\t", "\n", " ", " "]


class RetrieverConfig(BaseModel):
    """Pre-built retriever indexes hosted on HuggingFace."""

    retriever_hf_paths: dict[str, str] = {"RAGatouille": "XThomasBU/Colbert_Index"}


class MetadataConfig(BaseModel):
    """Course-site links used for metadata extraction."""

    metadata_links: List[HttpUrl] = [
        "https://dl4ds.github.io/sp2024/lectures/",
        "https://dl4ds.github.io/sp2024/schedule/",
    ]
    slide_base_link: HttpUrl = "https://dl4ds.github.io"


class APIConfig(BaseModel):
    """External API settings."""

    timeout: conint(gt=0) = 60


class Config(BaseModel):
    """Top-level application configuration, merged from two YAML files."""

    log_dir: str = "storage/logs"
    log_chunk_dir: str = "storage/logs/chunks"
    device: str = "cpu"  # Options: ['cuda', 'cpu']

    vectorstore: VectorStoreConfig
    llm_params: LLMParams
    chat_logging: ChatLoggingConfig
    splitter_options: SplitterOptions
    retriever: RetrieverConfig
    metadata: MetadataConfig
    api_config: APIConfig


class ConfigManager:
    """Loads, merges, and validates the base and project YAML configs."""

    def __init__(self, config_path: str, project_config_path: str):
        self.config_path = config_path
        self.project_config_path = project_config_path
        self.config = self.load_config()
        self.validate_config()

    def load_config(self) -> Config:
        """Read both YAML files and merge them into a validated Config.

        The project config is merged on top of the base config, so its
        top-level keys win on collision (shallow merge only).
        """
        with open(self.config_path, "r") as f:
            config_data = yaml.safe_load(f)

        with open(self.project_config_path, "r") as f:
            project_config_data = yaml.safe_load(f)

        # Merge the two configurations (project overrides base, shallowly).
        merged_config = {**config_data, **project_config_data}

        return Config(**merged_config)

    def get_config(self) -> "ConfigWrapper":
        """Return the loaded configuration wrapped for dict-style access.

        BUGFIX: the return annotation previously claimed ``Config`` although
        a ``ConfigWrapper`` is what callers actually receive.
        """
        return ConfigWrapper(self.config)

    def validate_config(self):
        """Placeholder for extra validation beyond pydantic's own checks.

        Pydantic already validates types and required sections when
        ``Config(**merged_config)`` is constructed in load_config(), so no
        additional checks are performed here for now.
        """
        pass


class ConfigWrapper:
    """Adapter exposing a Config via both attribute and item access."""

    def __init__(self, config: Config):
        self._config = config

    def __getitem__(self, key):
        # Allows config["vectorstore"]-style lookups.
        return getattr(self._config, key)

    def __getattr__(self, name):
        # Falls through to the wrapped Config for attribute access.
        return getattr(self._config, name)

    def dict(self):
        # Plain-dict view of the full configuration.
        return self._config.dict()


# Usage
config_manager = ConfigManager(
    config_path="config/config.yml", project_config_path="config/project_config.yml"
)
# config = config_manager.get_config().dict()
rename to apps/chainlit_base/config/project_config.yml index 4a93a45d809ee970503b726bf4857ae5b3183c94..2ffa4671c0221496668d45e311784ae35e9c9083 100644 --- a/code/modules/config/project_config.yml +++ b/apps/chainlit_base/config/project_config.yml @@ -4,4 +4,7 @@ retriever: metadata: metadata_links: ["https://dl4ds.github.io/sp2024/lectures/", "https://dl4ds.github.io/sp2024/schedule/"] - slide_base_link: "https://dl4ds.github.io" \ No newline at end of file + slide_base_link: "https://dl4ds.github.io" + +api_config: + timeout: 60 \ No newline at end of file diff --git a/apps/chainlit_base/config/prompts.py b/apps/chainlit_base/config/prompts.py new file mode 100644 index 0000000000000000000000000000000000000000..bdd6611f333a94637f55e4a604748acec462a8fc --- /dev/null +++ b/apps/chainlit_base/config/prompts.py @@ -0,0 +1,97 @@ +prompts = { + "openai": { + "rephrase_prompt": ( + "You are someone that rephrases statements. Rephrase the student's question to add context from their chat history if relevant, ensuring it remains from the student's point of view. " + "Incorporate relevant details from the chat history to make the question clearer and more specific. " + "Do not change the meaning of the original statement, and maintain the student's tone and perspective. " + "If the question is conversational and doesn't require context, do not rephrase it. " + "Example: If the student previously asked about backpropagation in the context of deep learning and now asks 'what is it', rephrase to 'What is backpropagation.'. " + "Example: Do not rephrase if the user is asking something specific like 'cool, suggest a project with transformers to use as my final project' " + "Chat history: \n{chat_history}\n" + "Rephrase the following question only if necessary: '{input}'" + "Rephrased Question:'" + ), + "prompt_with_history": { + "normal": ( + "You are an AI Tutor for the course DS598, taught by Prof. Thomas Gardos. Answer the user's question using the provided context. 
Only use the context if it is relevant. The context is ordered by relevance. " + "If you don't know the answer, do your best without making things up. Keep the conversation flowing naturally. " + "Use chat history and context as guides but avoid repeating past responses. Provide links from the source_file metadata. Use the source context that is most relevant. " + "Render math equations in LaTeX format between $ or $$ signs, stick to the parameter and variable icons found in your context. Be sure to explain the parameters and variables in the equations." + "Speak in a friendly and engaging manner, like talking to a friend. Avoid sounding repetitive or robotic.\n\n" + "Do not get influenced by the style of conversation in the chat history. Follow the instructions given here." + "Chat History:\n{chat_history}\n\n" + "Context:\n{context}\n\n" + "Answer the student's question below in a friendly, concise, and engaging manner. Use the context and history only if relevant, otherwise, engage in a free-flowing conversation.\n" + "Student: {input}\n" + "AI Tutor:" + ), + "eli5": ( + "You are an AI Tutor for the course DS598, taught by Prof. Thomas Gardos. Your job is to explain things in the simplest and most engaging way possible, just like the 'Explain Like I'm 5' (ELI5) concept." + "If you don't know the answer, do your best without making things up. Keep your explanations straightforward and very easy to understand." + "Use the chat history and context to help you, but avoid repeating past responses. Provide links from the source_file metadata when they're helpful." + "Use very simple language and examples to explain any math equations, and put the equations in LaTeX format between $ or $$ signs." + "Be friendly and engaging, like you're chatting with a young child who's curious and eager to learn. Avoid complex terms and jargon." + "Include simple and clear examples wherever you can to make things easier to understand." 
+ "Do not get influenced by the style of conversation in the chat history. Follow the instructions given here." + "Chat History:\n{chat_history}\n\n" + "Context:\n{context}\n\n" + "Answer the student's question below in a friendly, simple, and engaging way, just like the ELI5 concept. Use the context and history only if they're relevant, otherwise, just have a natural conversation." + "Give a clear and detailed explanation with simple examples to make it easier to understand. Remember, your goal is to break down complex topics into very simple terms, just like ELI5." + "Student: {input}\n" + "AI Tutor:" + ), + "socratic": ( + "You are an AI Tutor for the course DS598, taught by Prof. Thomas Gardos. Engage the student in a Socratic dialogue to help them discover answers on their own. Use the provided context to guide your questioning." + "If you don't know the answer, do your best without making things up. Keep the conversation engaging and inquisitive." + "Use chat history and context as guides but avoid repeating past responses. Provide links from the source_file metadata when relevant. Use the source context that is most relevant." + "Speak in a friendly and engaging manner, encouraging critical thinking and self-discovery." + "Use questions to lead the student to explore the topic and uncover answers." + "Chat History:\n{chat_history}\n\n" + "Context:\n{context}\n\n" + "Answer the student's question below by guiding them through a series of questions and insights that lead to deeper understanding. Use the context and history only if relevant, otherwise, engage in a free-flowing conversation." + "Foster an inquisitive mindset and help the student discover answers through dialogue." + "Student: {input}\n" + "AI Tutor:" + ), + }, + "prompt_no_history": ( + "You are an AI Tutor for the course DS598, taught by Prof. Thomas Gardos. Answer the user's question using the provided context. Only use the context if it is relevant. The context is ordered by relevance. 
" + "If you don't know the answer, do your best without making things up. Keep the conversation flowing naturally. " + "Provide links from the source_file metadata. Use the source context that is most relevant. " + "Speak in a friendly and engaging manner, like talking to a friend. Avoid sounding repetitive or robotic.\n\n" + "Context:\n{context}\n\n" + "Answer the student's question below in a friendly, concise, and engaging manner. Use the context and history only if relevant, otherwise, engage in a free-flowing conversation.\n" + "Student: {input}\n" + "AI Tutor:" + ), + }, + "tiny_llama": { + "prompt_no_history": ( + "system\n" + "Assistant is an intelligent chatbot designed to help students with questions regarding the course DS598, taught by Prof. Thomas Gardos. Answer the user's question using the provided context. Only use the context if it is relevant. The context is ordered by relevance.\n" + "If you don't know the answer, do your best without making things up. Keep the conversation flowing naturally.\n" + "Provide links from the source_file metadata. Use the source context that is most relevant.\n" + "Speak in a friendly and engaging manner, like talking to a friend. Avoid sounding repetitive or robotic.\n" + "\n\n" + "user\n" + "Context:\n{context}\n\n" + "Question: {input}\n" + "\n\n" + "assistant" + ), + "prompt_with_history": ( + "system\n" + "You are an AI Tutor for the course DS598, taught by Prof. Thomas Gardos. Answer the user's question using the provided context. Only use the context if it is relevant. The context is ordered by relevance. " + "If you don't know the answer, do your best without making things up. Keep the conversation flowing naturally. " + "Use chat history and context as guides but avoid repeating past responses. Provide links from the source_file metadata. Use the source context that is most relevant. " + "Speak in a friendly and engaging manner, like talking to a friend. 
Avoid sounding repetitive or robotic.\n" + "\n\n" + "user\n" + "Chat History:\n{chat_history}\n\n" + "Context:\n{context}\n\n" + "Question: {input}\n" + "\n\n" + "assistant" + ), + }, +} diff --git a/apps/chainlit_base/public/assets/images/avatars/ai-tutor.png b/apps/chainlit_base/public/assets/images/avatars/ai-tutor.png new file mode 100644 index 0000000000000000000000000000000000000000..1b9e56d72a9738e8cd4838bfb7dce31fc92cd9ee Binary files /dev/null and b/apps/chainlit_base/public/assets/images/avatars/ai-tutor.png differ diff --git a/apps/chainlit_base/public/assets/images/avatars/ai_tutor.png b/apps/chainlit_base/public/assets/images/avatars/ai_tutor.png new file mode 100644 index 0000000000000000000000000000000000000000..1b9e56d72a9738e8cd4838bfb7dce31fc92cd9ee Binary files /dev/null and b/apps/chainlit_base/public/assets/images/avatars/ai_tutor.png differ diff --git a/apps/chainlit_base/public/assets/images/starter_icons/acastusphoton-svgrepo-com.svg b/apps/chainlit_base/public/assets/images/starter_icons/acastusphoton-svgrepo-com.svg new file mode 100644 index 0000000000000000000000000000000000000000..e8edaf55d695486fbf134388dbfb84f7dbb7ca8c --- /dev/null +++ b/apps/chainlit_base/public/assets/images/starter_icons/acastusphoton-svgrepo-com.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/apps/chainlit_base/public/assets/images/starter_icons/adv-screen-recorder-svgrepo-com.svg b/apps/chainlit_base/public/assets/images/starter_icons/adv-screen-recorder-svgrepo-com.svg new file mode 100644 index 0000000000000000000000000000000000000000..439e638eaf976a2278a2aa043de2feb2d65c3862 --- /dev/null +++ b/apps/chainlit_base/public/assets/images/starter_icons/adv-screen-recorder-svgrepo-com.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/apps/chainlit_base/public/assets/images/starter_icons/alarmy-svgrepo-com.svg b/apps/chainlit_base/public/assets/images/starter_icons/alarmy-svgrepo-com.svg new file mode 100644 index 
0000000000000000000000000000000000000000..d2dee1b557beab61067452d4700fa1d62ba0b0e8 --- /dev/null +++ b/apps/chainlit_base/public/assets/images/starter_icons/alarmy-svgrepo-com.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/apps/chainlit_base/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg b/apps/chainlit_base/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg new file mode 100644 index 0000000000000000000000000000000000000000..8f9caa7ac74d4ea04369169e830e4042b267de89 --- /dev/null +++ b/apps/chainlit_base/public/assets/images/starter_icons/calendar-samsung-17-svgrepo-com.svg @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/apps/chainlit_base/public/files/students_encrypted.json b/apps/chainlit_base/public/files/students_encrypted.json new file mode 100644 index 0000000000000000000000000000000000000000..826570552b1dcbfdf9cc1472a7bb5a3c68d9c2e5 --- /dev/null +++ b/apps/chainlit_base/public/files/students_encrypted.json @@ -0,0 +1 @@ +{"7f1cacca66ee914ddde2ee20e0f2c96651d60cd8aabd310ef25a9e6d88f42df0": ["instructor", "bu"], "f74d264b6b5b2b4c10ce69e4ec16e869e01cb5eb668ed846aa8f6dae5c96cda0": ["admin", "instructor", "bu"], "53401356a874b1539775c73a8564d5e5f4f840441630c9cf649e16d201454f20": ["instructor", "bu"]} \ No newline at end of file diff --git a/code/public/test.css b/apps/chainlit_base/public/files/test.css similarity index 69% rename from code/public/test.css rename to apps/chainlit_base/public/files/test.css index 42cc35e34bcfb7421c4fa007e1906662e4517c9f..b82b60b6ebd358fef85b73389e1d5b1e4d8856b9 100644 --- a/code/public/test.css +++ b/apps/chainlit_base/public/files/test.css @@ -2,20 +2,23 @@ a[href*='https://github.com/Chainlit/chainlit'] { visibility: hidden; } - +/* TODO: Default image is still shown in the starter questions */ +/* Hide the default avatar image */ +.MuiAvatar-root img.MuiAvatar-img { + display: none; + 
} + /* Target the container of the image and set a custom background image */ .MuiAvatar-root.MuiAvatar-circular.css-m2icte { - background-image: url('/public/avatars/ai-tutor.png'); /* Replace with your custom image URL */ + background-image: url('/public/assets/images/avatars/ai-tutor.png'); /* Replace with your custom image URL */ background-size: cover; /* Ensure the image covers the entire container */ background-position: center; /* Center the image */ width: 100px; /* Ensure the dimensions match the original */ height: 100px; /* Ensure the dimensions match the original */ border-radius: 50%; /* Maintain circular shape */ } - - .MuiAvatar-root.MuiAvatar-circular.css-v72an7 { - background-image: url('/public/avatars/ai-tutor.png'); /* Replace with your custom image URL */ + background-image: url('/public/assets/images/avatars/ai-tutor.png'); /* Replace with your custom image URL */ background-size: cover; /* Ensure the image covers the entire container */ background-position: center; /* Center the image */ width: 40px; /* Ensure the dimensions match the original */ diff --git a/apps/chainlit_base/public/logo_dark.png b/apps/chainlit_base/public/logo_dark.png new file mode 100644 index 0000000000000000000000000000000000000000..1b9e56d72a9738e8cd4838bfb7dce31fc92cd9ee Binary files /dev/null and b/apps/chainlit_base/public/logo_dark.png differ diff --git a/apps/chainlit_base/public/logo_light.png b/apps/chainlit_base/public/logo_light.png new file mode 100644 index 0000000000000000000000000000000000000000..1b9e56d72a9738e8cd4838bfb7dce31fc92cd9ee Binary files /dev/null and b/apps/chainlit_base/public/logo_light.png differ diff --git a/code/chainlit.md b/code/chainlit.md deleted file mode 100644 index a0304f6b61506558ada35d2fdc78fbbc797a0674..0000000000000000000000000000000000000000 --- a/code/chainlit.md +++ /dev/null @@ -1,5 +0,0 @@ -# Welcome to DL4DS Tutor! 🚀🤖 - -Hi there, this is an LLM chatbot designed to help answer questions on the course content. 
- -### --- Please wait while the Tutor loads... --- diff --git a/code/public/space.jpg b/code/public/space.jpg deleted file mode 100644 index 6aba262bc616ebb446a8b5c5ebb926dea8eeb60b..0000000000000000000000000000000000000000 --- a/code/public/space.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9ed3f8e7fd9790c394bae59cd0e315742af862ed833e9f42906f36f140abbb07 -size 2677297 diff --git a/code/modules/__init__.py b/modules/__init__.py similarity index 100% rename from code/modules/__init__.py rename to modules/__init__.py diff --git a/code/modules/chat/__init__.py b/modules/chat/__init__.py similarity index 100% rename from code/modules/chat/__init__.py rename to modules/chat/__init__.py diff --git a/code/modules/chat/base.py b/modules/chat/base.py similarity index 100% rename from code/modules/chat/base.py rename to modules/chat/base.py diff --git a/code/modules/chat/chat_model_loader.py b/modules/chat/chat_model_loader.py similarity index 100% rename from code/modules/chat/chat_model_loader.py rename to modules/chat/chat_model_loader.py diff --git a/code/modules/chat/helpers.py b/modules/chat/helpers.py similarity index 72% rename from code/modules/chat/helpers.py rename to modules/chat/helpers.py index cb140bc75e7ffb7221f26bb872359e04a2fd9096..62ac2a52f6c1103c422bb6e543a1ef3463d06d18 100644 --- a/code/modules/chat/helpers.py +++ b/modules/chat/helpers.py @@ -1,4 +1,4 @@ -from modules.config.prompts import prompts +from config.prompts import prompts import chainlit as cl @@ -38,7 +38,7 @@ def get_sources(res, answer, stream=True, view_sources=False): full_answer = "" # Not to include the answer again if streaming if not stream: # First, display the answer if not streaming - full_answer = "**Answer:**\n" + # full_answer = "**Answer:**\n" full_answer += answer if view_sources: @@ -137,31 +137,59 @@ def get_history_chat_resume(steps, k, SYSTEM, LLM): def get_history_setup_llm(memory_list): conversation_list = [] - for message 
in memory_list: - message_dict = message.to_dict() if hasattr(message, "to_dict") else message - - # Check if the type attribute is present as a key or attribute - message_type = ( - message_dict.get("type", None) - if isinstance(message_dict, dict) - else getattr(message, "type", None) + i = 0 + while i < len(memory_list) - 1: + # Process the current and next message + current_message = memory_list[i] + next_message = memory_list[i + 1] + + # Convert messages to dictionary if necessary + current_message_dict = ( + current_message.to_dict() + if hasattr(current_message, "to_dict") + else current_message + ) + next_message_dict = ( + next_message.to_dict() if hasattr(next_message, "to_dict") else next_message ) - # Check if content is present as a key or attribute - message_content = ( - message_dict.get("content", None) - if isinstance(message_dict, dict) - else getattr(message, "content", None) + # Check message type and content for current and next message + current_message_type = ( + current_message_dict.get("type", None) + if isinstance(current_message_dict, dict) + else getattr(current_message, "type", None) + ) + current_message_content = ( + current_message_dict.get("content", None) + if isinstance(current_message_dict, dict) + else getattr(current_message, "content", None) ) - if message_type in ["ai", "ai_message"]: - conversation_list.append({"type": "ai_message", "content": message_content}) - elif message_type in ["human", "user_message"]: + next_message_type = ( + next_message_dict.get("type", None) + if isinstance(next_message_dict, dict) + else getattr(next_message, "type", None) + ) + next_message_content = ( + next_message_dict.get("content", None) + if isinstance(next_message_dict, dict) + else getattr(next_message, "content", None) + ) + + # Check if the current message is user message and the next one is AI message + if current_message_type in ["human", "user_message"] and next_message_type in [ + "ai", + "ai_message", + ]: + 
conversation_list.append( + {"type": "user_message", "content": current_message_content} + ) conversation_list.append( - {"type": "user_message", "content": message_content} + {"type": "ai_message", "content": next_message_content} ) + i += 2 # Skip the next message since it has been paired else: - raise ValueError("Invalid message type") + i += 1 # Move to the next message if not a valid pair (example user message, followed by the cooldown system message) return conversation_list diff --git a/code/modules/chat/langchain/__init__.py b/modules/chat/langchain/__init__.py similarity index 100% rename from code/modules/chat/langchain/__init__.py rename to modules/chat/langchain/__init__.py diff --git a/code/modules/chat/langchain/langchain_rag.py b/modules/chat/langchain/langchain_rag.py similarity index 100% rename from code/modules/chat/langchain/langchain_rag.py rename to modules/chat/langchain/langchain_rag.py diff --git a/code/modules/chat/langchain/utils.py b/modules/chat/langchain/utils.py similarity index 100% rename from code/modules/chat/langchain/utils.py rename to modules/chat/langchain/utils.py diff --git a/code/modules/chat/llm_tutor.py b/modules/chat/llm_tutor.py similarity index 100% rename from code/modules/chat/llm_tutor.py rename to modules/chat/llm_tutor.py diff --git a/code/modules/chat_processor/__init__.py b/modules/chat_processor/__init__.py similarity index 100% rename from code/modules/chat_processor/__init__.py rename to modules/chat_processor/__init__.py diff --git a/code/modules/chat_processor/helpers.py b/modules/chat_processor/helpers.py similarity index 50% rename from code/modules/chat_processor/helpers.py rename to modules/chat_processor/helpers.py index d33246801b020db529412bbf806dc592187894fc..94d481d3a89f06ec7398c21e5c99d703ad7c826a 100644 --- a/code/modules/chat_processor/helpers.py +++ b/modules/chat_processor/helpers.py @@ -1,9 +1,6 @@ import os from literalai import AsyncLiteralClient -from datetime import datetime, timedelta, 
timezone -from modules.config.constants import COOLDOWN_TIME, TOKENS_LEFT, REGEN_TIME from typing_extensions import TypedDict -import tiktoken from typing import Any, Generic, List, Literal, Optional, TypeVar, Union Field = TypeVar("Field") @@ -136,10 +133,6 @@ def convert_to_dict(user_info): return user_info -def get_time(): - return datetime.now(timezone.utc).isoformat() - - async def get_user_details(user_email_id): user_info = await literal_client.api.get_or_create_user(identifier=user_email_id) return user_info @@ -155,91 +148,6 @@ async def update_user_info(user_info): ) -async def check_user_cooldown(user_info, current_time): - # # Check if no tokens left - tokens_left = user_info.metadata.get("tokens_left", 0) - if tokens_left > 0 and not user_info.metadata.get("in_cooldown", False): - return False, None - - user_info = convert_to_dict(user_info) - last_message_time_str = user_info["metadata"].get("last_message_time") - - # Convert from ISO format string to datetime object and ensure UTC timezone - last_message_time = datetime.fromisoformat(last_message_time_str).replace( - tzinfo=timezone.utc - ) - current_time = datetime.fromisoformat(current_time).replace(tzinfo=timezone.utc) - - # Calculate the elapsed time - elapsed_time = current_time - last_message_time - elapsed_time_in_seconds = elapsed_time.total_seconds() - - # Calculate when the cooldown period ends - cooldown_end_time = last_message_time + timedelta(seconds=COOLDOWN_TIME) - cooldown_end_time_iso = cooldown_end_time.isoformat() - - # Debug: Print the cooldown end time - print(f"Cooldown end time (ISO): {cooldown_end_time_iso}") - - # Check if the user is still in cooldown - if elapsed_time_in_seconds < COOLDOWN_TIME: - return True, cooldown_end_time_iso # Return in ISO 8601 format - - user_info["metadata"]["in_cooldown"] = False - # If not in cooldown, regenerate tokens - await reset_tokens_for_user(user_info) - - return False, None - - -async def reset_tokens_for_user(user_info): - user_info = 
convert_to_dict(user_info) - last_message_time_str = user_info["metadata"].get("last_message_time") - - last_message_time = datetime.fromisoformat(last_message_time_str).replace( - tzinfo=timezone.utc - ) - current_time = datetime.fromisoformat(get_time()).replace(tzinfo=timezone.utc) - - # Calculate the elapsed time since the last message - elapsed_time_in_seconds = (current_time - last_message_time).total_seconds() - - # Current token count (can be negative) - current_tokens = user_info["metadata"].get("tokens_left_at_last_message", 0) - current_tokens = min(current_tokens, TOKENS_LEFT) - - # Maximum tokens that can be regenerated - max_tokens = user_info["metadata"].get("max_tokens", TOKENS_LEFT) - - # Calculate how many tokens should have been regenerated proportionally - if current_tokens < max_tokens: - # Calculate the regeneration rate per second based on REGEN_TIME for full regeneration - regeneration_rate_per_second = max_tokens / REGEN_TIME - - # Calculate how many tokens should have been regenerated based on the elapsed time - tokens_to_regenerate = int( - elapsed_time_in_seconds * regeneration_rate_per_second - ) - - # Ensure the new token count does not exceed max_tokens - new_token_count = min(current_tokens + tokens_to_regenerate, max_tokens) - - print( - f"\n\n Adding {tokens_to_regenerate} tokens to the user, Time elapsed: {elapsed_time_in_seconds} seconds, Tokens after regeneration: {new_token_count}, Tokens before: {current_tokens} \n\n" - ) - - # Update the user's token count - user_info["metadata"]["tokens_left"] = new_token_count - - await update_user_info(user_info) - - async def get_thread_step_info(thread_id): step = await literal_client.api.get_step(thread_id) return step - - -def get_num_tokens(text, model): - encoding = tiktoken.encoding_for_model(model) - tokens = encoding.encode(text) - return len(tokens) diff --git a/code/modules/chat_processor/literal_ai.py b/modules/chat_processor/literal_ai.py similarity index 100% rename from 
code/modules/chat_processor/literal_ai.py rename to modules/chat_processor/literal_ai.py diff --git a/code/modules/config/__init__.py b/modules/config/__init__.py similarity index 100% rename from code/modules/config/__init__.py rename to modules/config/__init__.py diff --git a/modules/config/constants.py b/modules/config/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..6c0feb7c4b83c1f437fb06674210de655221fedc --- /dev/null +++ b/modules/config/constants.py @@ -0,0 +1,12 @@ +# from .env setup all constants here + +import os +from dotenv import load_dotenv + +load_dotenv() + +# Required Constants # TODO: MOVE THIS TO APP SPECIFIC DIRECTORY +TIMEOUT = os.getenv("TIMEOUT", 60) +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") +HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") diff --git a/code/modules/dataloader/__init__.py b/modules/dataloader/__init__.py similarity index 100% rename from code/modules/dataloader/__init__.py rename to modules/dataloader/__init__.py diff --git a/code/modules/dataloader/data_loader.py b/modules/dataloader/data_loader.py similarity index 97% rename from code/modules/dataloader/data_loader.py rename to modules/dataloader/data_loader.py index d08d4bd80b205bba328bc1f7c55ebdb8b9b008d1..c2a2fe47848ff41a688e44098c2fabf58cbb7f62 100644 --- a/code/modules/dataloader/data_loader.py +++ b/modules/dataloader/data_loader.py @@ -423,6 +423,15 @@ if __name__ == "__main__": parser.add_argument( "--links", nargs="+", required=True, help="List of links to process." 
) + parser.add_argument( + "--config_file", type=str, help="Path to the main config file", required=True + ) + parser.add_argument( + "--project_config_file", + type=str, + help="Path to the project config file", + required=True, + ) args = parser.parse_args() links_to_process = args.links @@ -430,10 +439,10 @@ if __name__ == "__main__": logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) - with open("../code/modules/config/config.yml", "r") as f: + with open(args.config_file, "r") as f: config = yaml.safe_load(f) - with open("../code/modules/config/project_config.yml", "r") as f: + with open(args.project_config_file, "r") as f: project_config = yaml.safe_load(f) # Combine project config with the main config diff --git a/code/modules/dataloader/helpers.py b/modules/dataloader/helpers.py similarity index 97% rename from code/modules/dataloader/helpers.py rename to modules/dataloader/helpers.py index a9b0d1e8526fdad51b2f1b16cb142f504266ccb9..c7219e04fd10eb2ec3c6ff0041766eaef6dacc4a 100644 --- a/code/modules/dataloader/helpers.py +++ b/modules/dataloader/helpers.py @@ -2,7 +2,9 @@ import requests from bs4 import BeautifulSoup from urllib.parse import urlparse import tempfile -from modules.config.constants import TIMEOUT +from modules.config.constants import ( + TIMEOUT, +) # TODO: MOVE THIS TO APP SPECIFIC DIRECTORY def get_urls_from_file(file_path: str): diff --git a/code/modules/retriever/__init__.py b/modules/dataloader/pdf_readers/__init__.py similarity index 100% rename from code/modules/retriever/__init__.py rename to modules/dataloader/pdf_readers/__init__.py diff --git a/code/modules/dataloader/pdf_readers/base.py b/modules/dataloader/pdf_readers/base.py similarity index 100% rename from code/modules/dataloader/pdf_readers/base.py rename to modules/dataloader/pdf_readers/base.py diff --git a/code/modules/dataloader/pdf_readers/gpt.py b/modules/dataloader/pdf_readers/gpt.py similarity index 100% rename from 
code/modules/dataloader/pdf_readers/gpt.py rename to modules/dataloader/pdf_readers/gpt.py diff --git a/code/modules/dataloader/pdf_readers/llama.py b/modules/dataloader/pdf_readers/llama.py similarity index 100% rename from code/modules/dataloader/pdf_readers/llama.py rename to modules/dataloader/pdf_readers/llama.py diff --git a/code/modules/dataloader/webpage_crawler.py b/modules/dataloader/webpage_crawler.py similarity index 100% rename from code/modules/dataloader/webpage_crawler.py rename to modules/dataloader/webpage_crawler.py diff --git a/code/modules/vectorstore/__init__.py b/modules/retriever/__init__.py similarity index 100% rename from code/modules/vectorstore/__init__.py rename to modules/retriever/__init__.py diff --git a/code/modules/retriever/base.py b/modules/retriever/base.py similarity index 100% rename from code/modules/retriever/base.py rename to modules/retriever/base.py diff --git a/code/modules/retriever/chroma_retriever.py b/modules/retriever/chroma_retriever.py similarity index 100% rename from code/modules/retriever/chroma_retriever.py rename to modules/retriever/chroma_retriever.py diff --git a/code/modules/retriever/colbert_retriever.py b/modules/retriever/colbert_retriever.py similarity index 100% rename from code/modules/retriever/colbert_retriever.py rename to modules/retriever/colbert_retriever.py diff --git a/code/modules/retriever/faiss_retriever.py b/modules/retriever/faiss_retriever.py similarity index 100% rename from code/modules/retriever/faiss_retriever.py rename to modules/retriever/faiss_retriever.py diff --git a/code/modules/retriever/helpers.py b/modules/retriever/helpers.py similarity index 100% rename from code/modules/retriever/helpers.py rename to modules/retriever/helpers.py diff --git a/code/modules/retriever/raptor_retriever.py b/modules/retriever/raptor_retriever.py similarity index 100% rename from code/modules/retriever/raptor_retriever.py rename to modules/retriever/raptor_retriever.py diff --git 
a/code/modules/retriever/retriever.py b/modules/retriever/retriever.py similarity index 100% rename from code/modules/retriever/retriever.py rename to modules/retriever/retriever.py diff --git a/code/modules/vectorstore/helpers.py b/modules/vectorstore/__init__.py similarity index 100% rename from code/modules/vectorstore/helpers.py rename to modules/vectorstore/__init__.py diff --git a/code/modules/vectorstore/base.py b/modules/vectorstore/base.py similarity index 100% rename from code/modules/vectorstore/base.py rename to modules/vectorstore/base.py diff --git a/code/modules/vectorstore/chroma.py b/modules/vectorstore/chroma.py similarity index 100% rename from code/modules/vectorstore/chroma.py rename to modules/vectorstore/chroma.py diff --git a/code/modules/vectorstore/colbert.py b/modules/vectorstore/colbert.py similarity index 100% rename from code/modules/vectorstore/colbert.py rename to modules/vectorstore/colbert.py diff --git a/code/modules/vectorstore/embedding_model_loader.py b/modules/vectorstore/embedding_model_loader.py similarity index 100% rename from code/modules/vectorstore/embedding_model_loader.py rename to modules/vectorstore/embedding_model_loader.py diff --git a/code/modules/vectorstore/faiss.py b/modules/vectorstore/faiss.py similarity index 100% rename from code/modules/vectorstore/faiss.py rename to modules/vectorstore/faiss.py diff --git a/modules/vectorstore/helpers.py b/modules/vectorstore/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/modules/vectorstore/raptor.py b/modules/vectorstore/raptor.py similarity index 100% rename from code/modules/vectorstore/raptor.py rename to modules/vectorstore/raptor.py diff --git a/code/modules/vectorstore/store_manager.py b/modules/vectorstore/store_manager.py similarity index 93% rename from code/modules/vectorstore/store_manager.py rename to modules/vectorstore/store_manager.py index 
933a8878f3d237cdda52f22c7f87b5a4f6a514ae..a3f9bd237c66f440769e59d779212203176fc200 100644 --- a/code/modules/vectorstore/store_manager.py +++ b/modules/vectorstore/store_manager.py @@ -164,10 +164,24 @@ class VectorStoreManager: if __name__ == "__main__": import yaml - - with open("modules/config/config.yml", "r") as f: + import argparse + + # Add argument parsing for config files + parser = argparse.ArgumentParser(description="Load configuration files.") + parser.add_argument( + "--config_file", type=str, help="Path to the main config file", required=True + ) + parser.add_argument( + "--project_config_file", + type=str, + help="Path to the project config file", + required=True, + ) + args = parser.parse_args() + + with open(args.config_file, "r") as f: config = yaml.safe_load(f) - with open("modules/config/project_config.yml", "r") as f: + with open(args.project_config_file, "r") as f: project_config = yaml.safe_load(f) # combine the two configs diff --git a/code/modules/vectorstore/vectorstore.py b/modules/vectorstore/vectorstore.py similarity index 100% rename from code/modules/vectorstore/vectorstore.py rename to modules/vectorstore/vectorstore.py diff --git a/requirements.txt b/requirements.txt index 3416f29b11c8ba2537a57460c3ff781227c8e72a..f6dd81879b2e44b938ddf8804b3f342004ade096 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,3 +34,4 @@ fastapi google-auth google-auth-oauthlib Jinja2 +cryptography diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..1ac55d5706e43e8d363984b377f5f64215a34e47 --- /dev/null +++ b/setup.py @@ -0,0 +1,15 @@ +from setuptools import setup, find_packages + +# Read the contents of requirements.txt +with open("requirements.txt") as f: + requirements = f.read().splitlines() + +setup( + name="dl4ds_tutor", + version="0.1.0", + packages=find_packages(), + package_dir={"modules": "modules"}, + python_requires=">=3.7", + install_requires=requirements, + description="A Deep 
Learning for Data Science Tutor application", +)