sxandie committed
Commit e899e0f · verified · 1 Parent(s): d0803b5

Upload 22 files

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/docs/microsoft-2023.pdf filter=lfs diff=lfs merge=lfs -text
+ data/vectordb/processed/chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.9-slim
+
+ # Set the working directory
+ WORKDIR /frontend
+
+ # Copy the current directory contents into the container at /frontend
+ COPY . /frontend
+
+ # Install any needed packages specified in requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Make port 7860 available to the world outside this container
+ EXPOSE 7860
+
+ # Run the Gradio app when the container launches
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,109 @@
+ """
+ This module uses Gradio to create an interactive web application for a chatbot with various features.
+
+ The application interface is organized into three rows:
+ 1. The first row contains a Chatbot component that simulates a conversation with a language model, along with a
+    reference bar that is hidden initially and can be toggled with a button. The chatbot supports feedback in the
+    form of like and dislike icons.
+
+ 2. The second row consists of a Textbox for user input. Users can enter text or upload PDF/doc files.
+
+ 3. The third row includes buttons for submitting text, toggling the reference bar visibility, uploading PDF/doc files,
+    adjusting the temperature of the model responses, selecting the document type, and clearing the input.
+
+ The application processes user interactions:
+ - Uploaded files trigger the processing of the files, updating the input and chatbot components.
+ - Submitting text triggers the chatbot to respond, considering the selected document type and temperature settings.
+   The response is displayed in the Textbox and Chatbot components, and the reference bar may be updated.
+
+ The application can be run as a standalone script, launching the Gradio interface for users to interact with the chatbot.
+
+ Note: This docstring gives an overview of the module's purpose and functionality; comments within the code explain
+ specific components, interactions, and logic.
+ """
+ import gradio as gr
+ from src.upload_file import UploadFile
+ from src.finbot import ChatBot
+ from src.ui_settings import UISettings
+
+
+ with gr.Blocks() as demo:
+     with gr.Tabs():
+         with gr.TabItem("FinGPT"):
+             # First row: reference sidebar (hidden by default) and the chatbot display.
+             with gr.Row() as row_one:
+                 with gr.Column(visible=False) as reference_bar:
+                     ref_output = gr.Markdown()
+
+                 with gr.Column() as chatbot_output:
+                     chatbot = gr.Chatbot(
+                         [],
+                         elem_id="chatbot",
+                         bubble_full_width=False,
+                         height=500,
+                         avatar_images=("images/user.png", "images/chatbot.png"),
+                     )
+                     chatbot.like(UISettings.feedback, None, None)  # like/dislike feedback
+
+             # Second row: user input and model selection.
+             with gr.Row():
+                 input_txt = gr.Textbox(
+                     lines=4,
+                     scale=8,
+                     placeholder="Hi there! Have a question? Ask away! Or, upload your PDFs to find the answers within them.",
+                     container=False,
+                 )
+                 model_choice = gr.Dropdown(
+                     label="Choose model",
+                     choices=["gpt-3.5-turbo", "llama3-70b-8192", "mixtral-8x7b-32768"],
+                     value="llama3-70b-8192")
+
+             # Third row: controls for submitting, references, uploads, temperature, RAG mode, and clearing.
+             with gr.Row() as row_two:
+                 text_submit_btn = gr.Button(value="Ask FinGPT 🤗")
+                 sidebar_state = gr.State(False)
+                 btn_toggle_sidebar = gr.Button(value="References")
+                 btn_toggle_sidebar.click(UISettings.toggle_sidebar,
+                                          [sidebar_state],
+                                          [reference_bar, sidebar_state])
+                 upload_btn = gr.UploadButton(
+                     "Upload your pdf/doc file 📄",
+                     file_types=['.pdf', '.doc'],
+                     file_count="multiple")
+                 temperature_bar = gr.Slider(minimum=0, maximum=1, value=0, step=0.1,
+                                             label="Temperature", info="0: Coherent mode, 1: Creative mode")
+                 rag_with_dropdown = gr.Dropdown(
+                     label="RAG with",
+                     choices=["Existing database", "Upload new data"],
+                     value="Existing database")
+                 clear_button = gr.ClearButton([input_txt, chatbot])
+
+             # Backend process: wire file uploads and text submission to the chatbot.
+             file_msg = upload_btn.upload(fn=UploadFile.process_uploaded_files,
+                                          inputs=[upload_btn, chatbot, rag_with_dropdown, model_choice],
+                                          outputs=[input_txt, chatbot],
+                                          queue=False)
+
+             txt_msg = input_txt.submit(fn=ChatBot.respond,
+                                        inputs=[chatbot, input_txt, rag_with_dropdown, temperature_bar, model_choice],
+                                        outputs=[input_txt, chatbot, ref_output],
+                                        queue=False).then(lambda: gr.Textbox(interactive=True),
+                                                          None, [input_txt], queue=False)
+
+             txt_msg = text_submit_btn.click(fn=ChatBot.respond,
+                                             inputs=[chatbot, input_txt, rag_with_dropdown, temperature_bar, model_choice],
+                                             outputs=[input_txt, chatbot, ref_output],
+                                             queue=False).then(lambda: gr.Textbox(interactive=True),
+                                                               None, [input_txt], queue=False)
+
+
+ if __name__ == "__main__":
+     demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
configs/app_config.yml ADDED
@@ -0,0 +1,54 @@
+ directories:
+   data_directory: data/docs
+   data_directory_2: data/docs_2
+   persist_directory: data/vectordb/processed/chroma/
+   custom_persist_directory: data/vectordb/uploaded/chroma/
+
+ embedding_model_config:
+   engine: "text-embedding-ada-002"
+   # engine: "Alibaba-NLP/gte-base-en-v1.5"
+
+ llm_config:
+   llm_system_role: "You are The Best Financial Research Analyst. \
+     You are an expert in analyzing financial statements, forecasting financial performance, \
+     valuing the company, assessing investment opportunities and preparing research reports. \
+     You will receive a chat history, retrieved content from the vectorDB based on the user's question, and the source. \
+     Your task is to respond to the user's question using the information \
+     from the vectorDB and chat history, without relying on your own knowledge. \
+     Your output should contain only your response, and if you can't find relevant context say that you don't know. \
+     You will receive an input prompt enclosed in triple backticks:
+
+     # Chat history:\n
+     [user query, response]\n\n
+
+     # Retrieved content number:\n
+     Content\n\n
+     Source\n\n
+
+     # User question:\n
+     New question
+     "
+   gpt_model: "gpt-3.5-turbo"
+   llama3_70bmodel: "llama3-70b-8192"
+   temperature: 0.2
+   max_token: 4096
+
+ splitter_config:
+   chunk_size: 1000
+   chunk_overlap: 200
+
+ # how many relevant documents to return
+ retrieval_config:
+   k: 5
+   num_of_final_doc: 3  # for reranking
+
+ serve:
+   port: 8000
+
+ memory:
+   qa_pair_count: 2
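
For illustration only (not part of this commit), the prompt that src/finbot.py assembles at runtime for the system role above looks roughly like the sketch below; the history entry, retrieved text, source, and page number are placeholders rather than real values.

    # Hypothetical sketch of the prompt layout described by llm_system_role, mirroring ChatBot.respond
    chat_history = "Chat history:\n [('previous user question', 'previous answer')]\n\n"
    retrieved_content = (
        "# Retrieved content 1:\n"
        "...text of a retrieved chunk...\n\n"
        "Source: microsoft-2023.pdf | Page number: 12 | [View PDF](http://localhost:8000/microsoft-2023.pdf)\n\n"
    )
    question = "# User new question:\nHow did operating income change year over year?"
    prompt = f"{chat_history}{retrieved_content}{question}"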
data/docs/alphabet-2023.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/docs/microsoft-2023.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4a3140732d95b86a1c823487787b849e1ca70117edcde8998ba0e8b702f8fd4
+ size 5859293
data/vectordb/processed/chroma/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4be261a97c4be87df854852f8adaf4195b3a07901338aebcb7687820831d0150
+ size 20860928
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7baf11e6838661c73f08811e0be0f61a8c55173dd5f51a585720c3c65cd7fda0
+ size 6284000
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/header.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b690cabe62b8902c35d848b48b407cd535aa5117621502dd552e3abe932aa2a9
+ size 100
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5761cdbcca93f7d658d69fe10a6df3d102b5f911a28dcd53e62d221c418bbb3e
+ size 55974
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/length.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6834a5486bb565f823b15118f39ef0d79879f2f1451948091acb278ac86f7079
+ size 4000
data/vectordb/processed/chroma/f926420f-492c-48a8-a50b-448a3a18a87a/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94968e65bad434c804d4290e2ed54697844880fbd44952bf3dcd303259faffd8
+ size 8148
images/chatbot.png ADDED
images/user.png ADDED
requirements.txt ADDED
@@ -0,0 +1,145 @@
+ gradio
+ ragatouille
+ langchain-groq
+ langchain
+ langchain-community
+ langchain-core
+ langchain-openai
+ langchain-text-splitters
+ aiofiles
+ aiohttp
+ aiosignal
+ altair
+ annotated-types
+ anyio
+ asgiref
+ attr
+ bcrypt
+ build
+ cachetools
+ certifi
+ charset-normalizer
+ chroma-hnswlib
+ chromadb
+ click
+ coloredlogs
+ contourpy
+ cycler
+ dataclasses-json
+ Deprecated
+ distro
+ dnspython
+ email_validator
+ fastapi
+ fastapi-cli
+ ffmpy
+ filelock
+ flatbuffers
+ fonttools
+ frozenlist
+ fsspec
+ google-auth
+ googleapis-common-protos
+ gradio_client
+ grpcio
+ h11
+ httpcore
+ httptools
+ httpx
+ huggingface-hub
+ humanfriendly
+ idna
+ importlib_metadata
+ importlib_resources
+ Jinja2
+ jsonpatch
+ jsonpointer
+ jsonschema
+ jsonschema-specifications
+ kiwisolver
+ kubernetes
+ langsmith
+ markdown-it-py
+ MarkupSafe
+ marshmallow
+ matplotlib
+ mdurl
+ mmh3
+ monotonic
+ mpmath
+ multidict
+ mypy-extensions
+ numpy
+ oauthlib
+ onnxruntime
+ openai
+ opentelemetry-api
+ opentelemetry-exporter-otlp-proto-common
+ opentelemetry-exporter-otlp-proto-grpc
+ opentelemetry-instrumentation
+ opentelemetry-instrumentation-asgi
+ opentelemetry-instrumentation-fastapi
+ opentelemetry-proto
+ opentelemetry-sdk
+ opentelemetry-semantic-conventions
+ opentelemetry-util-http
+ orjson
+ overrides
+ packaging
+ pandas
+ pillow
+ posthog
+ protobuf
+ pyasn1
+ pyasn1_modules
+ pydantic
+ pydantic_core
+ pydub
+ Pygments
+ pyparsing
+ pypdf
+ PyPika
+ pyproject_hooks
+ pyprojroot
+ python-dateutil
+ python-dotenv
+ python-multipart
+ pytz
+ PyYAML
+ referencing
+ regex
+ requests
+ requests-oauthlib
+ rich
+ rpds-py
+ rsa
+ ruff
+ semantic-version
+ setuptools
+ shellingham
+ six
+ sniffio
+ SQLAlchemy
+ starlette
+ sympy
+ tenacity
+ tiktoken
+ tokenizers
+ tomlkit
+ toolz
+ tqdm
+ typer
+ typing-inspect
+ typing_extensions
+ tzdata
+ ujson
+ urllib3
+ uvicorn
+ uvloop
+ watchfiles
+ websocket-client
+ websockets
+ wheel
+ wrapt
+ yarl
+ zipp
src/__init__.py ADDED
File without changes
src/finbot.py ADDED
@@ -0,0 +1,154 @@
+ import os
+ import re
+ import ast
+ import html
+ import time
+ import gradio as gr
+ from openai import OpenAI
+ from typing import List, Tuple
+ from src.load_config import LoadConfig
+ from langchain_core.messages import HumanMessage, SystemMessage
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+ from langchain_groq import ChatGroq
+ from langchain.vectorstores import Chroma
+ from uuid import uuid4
+
+ APP_CONFIG = LoadConfig()
+
+ # LangSmith tracing configuration for RAG evaluation
+ unique_id = uuid4().hex[0:8]
+ os.environ["LANGCHAIN_TRACING_V2"] = "true"
+ os.environ["LANGCHAIN_PROJECT"] = "Ragas_RAG_Eval"
+ os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
+
+
+ class ChatBot:
+     """
+     Class representing a chatbot with document retrieval and response generation capabilities.
+
+     This class provides static methods for responding to user queries, handling feedback, and
+     cleaning references from retrieved documents.
+     """
+     vectordb = None
+
+     @staticmethod
+     def respond(chatbot: List, message: str, data_type: str = "Existing database", temperature: float = 0.0,
+                 model_choice: str = APP_CONFIG.llama3_70bmodel) -> Tuple:
+         """
+         Generate a response to a user query using document retrieval and language model completion.
+
+         Parameters:
+             chatbot (List): List representing the chatbot's conversation history.
+             message (str): The user's query.
+             data_type (str): Type of data used for document retrieval ("Existing database" or "Upload new data").
+             temperature (float): Temperature parameter for language model completion.
+             model_choice (str): Name of the language model selected in the UI.
+
+         Returns:
+             Tuple: A tuple containing an empty string, the updated chat history, and references from retrieved documents.
+         """
+         # Check if the vector database needs to be created
+         if ChatBot.vectordb is None:
+             if data_type == "Existing database":
+                 if os.path.exists(APP_CONFIG.persist_directory):
+                     ChatBot.vectordb = Chroma(persist_directory=APP_CONFIG.persist_directory,
+                                               embedding_function=APP_CONFIG.embedding_model)
+                 else:
+                     chatbot.append(
+                         (message, "VectorDB does not exist. Please first execute the 'upload_data_manually.py' module. For further information please visit the README.md of this repository."))
+                     return "", chatbot, None
+
+             elif data_type == "Upload new data":
+                 if os.path.exists(APP_CONFIG.custom_persist_directory):
+                     ChatBot.vectordb = Chroma(persist_directory=APP_CONFIG.custom_persist_directory,
+                                               embedding_function=APP_CONFIG.embedding_model)
+                 else:
+                     chatbot.append(
+                         (message, "No file uploaded. Please first upload your files using the 'upload' button."))
+                     return "", chatbot, None
+
+         # Single step: embed the user query, search the vectordb, and get the retrieved docs
+         docs = ChatBot.vectordb.similarity_search(message, k=APP_CONFIG.k)
+
+         question = "# User new question:\n" + message
+         retrieved_content = ChatBot.clean_references(docs)
+
+         # Memory: previous Q&A pairs
+         chat_history = f"Chat history:\n {str(chatbot[-APP_CONFIG.qa_pair_count:])}\n\n"
+         prompt = f"{chat_history}{retrieved_content}{question}"
+         print("========================")
+         print(prompt)
+
+         if model_choice == "gpt-3.5-turbo":
+             client = OpenAI()
+             response = client.chat.completions.create(model=model_choice,
+                                                       messages=[
+                                                           {"role": "system", "content": APP_CONFIG.llm_system_role},
+                                                           {"role": "user", "content": prompt}
+                                                       ],
+                                                       temperature=temperature)
+             print(f"Running {model_choice}...", response)
+             chatbot.append((message, response.choices[0].message.content))
+
+         else:
+             chat_llm = ChatGroq(
+                 api_key=os.getenv("GROQ_API_KEY"),
+                 model=model_choice,
+                 temperature=temperature
+             )
+             # Prompt template
+             prompt_template = ChatPromptTemplate.from_messages(
+                 [
+                     ("system", APP_CONFIG.llm_system_role),
+                     ("human", prompt)  # directly using the assembled prompt
+                 ]
+             )
+             chain = prompt_template | chat_llm | StrOutputParser()
+             response = chain.invoke({})
+             print(f"Running {model_choice} via Groq...", response)
+             chatbot.append((message, response))
+
+         time.sleep(2)
+         return "", chatbot, retrieved_content
+
+     @staticmethod
+     def extract_content(input_text):
+         """Split a stringified Document into its page_content and metadata parts."""
+         begin_pattern = r"""page_content='"""
+         end_pattern = r"""'\s*metadata="""
+
+         between_pattern = rf'{begin_pattern}(.*?){end_pattern}'
+         from_end_pattern = rf"{end_pattern}(.*)"
+
+         between_match = re.search(between_pattern, input_text, re.DOTALL)
+         from_end_match = re.search(from_end_pattern, input_text, re.DOTALL)
+
+         between_text = between_match.group(1) if between_match else None
+         from_end_text = from_end_match.group(1) if from_end_match else None
+
+         return between_text, from_end_text
+
+     @staticmethod
+     def clean_references(documents: List) -> str:
+         """Convert retrieved documents into markdown with content, source, page number, and a PDF link."""
+         server_url = "http://localhost:8000"
+         documents = [str(x) + "\n\n" for x in documents]
+         markdown_documents = ""
+         counter = 1
+         for doc in documents:
+             content, metadata = re.match(r"page_content=(.*?)( metadata=\{.*\})", doc).groups()
+             metadata = metadata.split('=', 1)[1]
+             metadata_dict = ast.literal_eval(metadata)
+             content = bytes(content, "utf-8").decode("unicode_escape")
+             content = re.sub(r'\\n', '\n', content)
+             content = re.sub(r'\s*<EOS>\s*<pad>\s*', ' ', content)
+             content = re.sub(r'\s+', ' ', content).strip()
+             content = html.unescape(content)
+             content = content.encode('latin1').decode('utf-8', 'ignore')
+
+             pdf_url = f"{server_url}/{os.path.basename(metadata_dict['source'])}"
+             markdown_documents += f"# Retrieved content {counter}:\n" + content + "\n\n" + \
+                 f"Source: {os.path.basename(metadata_dict['source'])}" + " | " + \
+                 f"Page number: {str(metadata_dict['page'])}" + " | " + \
+                 f"[View PDF]({pdf_url})" + "\n\n"
+             counter += 1
+
+         return markdown_documents
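
Illustrative sketch (not part of this commit): clean_references assumes that str(document) for a retrieved LangChain Document renders roughly as "page_content='...' metadata={...}". The snippet below uses a made-up document string to show how the regex and ast.literal_eval recover the text and the metadata fields used to build the source line.

    import ast
    import re

    # Hypothetical stringified Document; the text, source, and page are placeholders
    doc_str = "page_content='Revenue grew year over year.' metadata={'source': 'data/docs/microsoft-2023.pdf', 'page': 31}"
    content, metadata = re.match(r"page_content=(.*?)( metadata=\{.*\})", doc_str).groups()
    metadata_dict = ast.literal_eval(metadata.split('=', 1)[1])
    print(content)                   # 'Revenue grew year over year.' (still wrapped in quotes)
    print(metadata_dict["source"])   # data/docs/microsoft-2023.pdf
    print(metadata_dict["page"])     # 31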
src/load_config.py ADDED
@@ -0,0 +1,121 @@
+ import openai
+ import os
+ from dotenv import load_dotenv
+ import yaml
+ from langchain.embeddings.openai import OpenAIEmbeddings
+ from pyprojroot import here  # resolves paths from the project root without changing the working directory
+ import shutil
+
+ load_dotenv()
+
+
+ class LoadConfig:
+     """
+     A class for loading configuration settings and managing directories.
+
+     This class loads various configuration settings from the 'app_config.yml' file,
+     including LLM configurations, retrieval configurations, and memory configurations.
+     It also performs directory-related operations such as creating and removing directories.
+
+     Attributes:
+         gpt_model : str
+             The OpenAI chat model name specified in the configuration.
+         llama3_70bmodel : str
+             The Groq-hosted Llama 3 model name specified in the configuration.
+         llm_system_role : str
+             The role of the language model system specified in the configuration.
+         persist_directory : str
+             The path to the persist directory where data is stored.
+         custom_persist_directory : str
+             The path to the custom persist directory.
+         embedding_model : OpenAIEmbeddings
+             An instance of the OpenAIEmbeddings class for language model embeddings.
+         data_directory : str
+             The path to the data directory.
+         k : int
+             The value of 'k' specified in the retrieval configuration.
+         embedding_model_engine : str
+             The engine specified in the embedding model configuration.
+         chunk_size : int
+             The chunk size specified in the splitter configuration.
+         chunk_overlap : int
+             The chunk overlap specified in the splitter configuration.
+         temperature : float
+             The temperature specified in the LLM configuration.
+         qa_pair_count : int
+             The number of question-answer pairs specified in the memory configuration.
+
+     Methods:
+         create_directory(directory_path):
+             Create a directory if it does not exist.
+         remove_directory(directory_path):
+             Remove the specified directory.
+     """
+
+     def __init__(self) -> None:
+         with open(here("configs/app_config.yml")) as cfg:
+             app_config = yaml.load(cfg, Loader=yaml.FullLoader)
+
+         # LLM configs
+         self.gpt_model = app_config["llm_config"]["gpt_model"]
+         self.llama3_70bmodel = app_config["llm_config"]["llama3_70bmodel"]
+         self.llm_system_role = app_config["llm_config"]["llm_system_role"]
+         # Converted to strings because the chromadb backend appends paths such as "/chroma.sqlite3"
+         self.persist_directory = str(here(app_config["directories"]["persist_directory"]))
+         self.custom_persist_directory = str(here(app_config["directories"]["custom_persist_directory"]))
+         self.embedding_model = OpenAIEmbeddings()
+
+         # Retrieval configs
+         self.data_directory = app_config["directories"]["data_directory"]
+         self.k = app_config["retrieval_config"]["k"]
+         self.num_of_final_doc = app_config["retrieval_config"]["num_of_final_doc"]
+         self.embedding_model_engine = app_config["embedding_model_config"]["engine"]
+         self.chunk_size = app_config["splitter_config"]["chunk_size"]
+         self.chunk_overlap = app_config["splitter_config"]["chunk_overlap"]
+
+         self.temperature = app_config["llm_config"]["temperature"]
+
+         # Memory
+         self.qa_pair_count = app_config["memory"]["qa_pair_count"]
+
+         # Load OpenAI credentials
+         # self.load_openai_cfg()
+
+         # Ensure the main vectordb directory exists and clean up the uploaded-docs vectordb if it exists
+         self.create_directory(self.persist_directory)
+         self.remove_directory(self.custom_persist_directory)
+
+     def create_directory(self, directory_path: str):
+         """
+         Create a directory if it does not exist.
+
+         Parameters:
+             directory_path (str): The path of the directory to be created.
+         """
+         if not os.path.exists(directory_path):
+             os.makedirs(directory_path)
+
+     def remove_directory(self, directory_path: str):
+         """
+         Remove the specified directory.
+
+         Parameters:
+             directory_path (str): The path of the directory to be removed.
+
+         Raises:
+             OSError: If an error occurs during the directory removal process.
+
+         Returns:
+             None
+         """
+         if os.path.exists(directory_path):
+             try:
+                 shutil.rmtree(directory_path)
+                 print(
+                     f"The directory '{directory_path}' has been successfully removed.")
+             except OSError as e:
+                 print(f"Error: {e}")
+         else:
+             print(f"The directory '{directory_path}' does not exist.")
src/prepare_bgesmall_vectordb.py ADDED
@@ -0,0 +1,123 @@
+ import os
+ from typing import List
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_community.vectorstores import Chroma
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain.embeddings import HuggingFaceBgeEmbeddings
+
+
+ class PrepareVectorDB:
+     """
+     A class for preparing and saving a VectorDB using HuggingFace BGE embeddings.
+
+     Covers loading documents, chunking them, and creating a VectorDB with BGE
+     embeddings. Contains methods to prepare and save the vectordb.
+
+     Parameters:
+         data_directory (str): Directory or list of directories containing the documents.
+         persist_directory (str): Directory to save the VectorDB.
+         embedding_model_engine (str): The name of the embedding engine.
+         chunk_size (int): The size of the chunks for document processing.
+         chunk_overlap (int): The overlap between chunks.
+     """
+
+     def __init__(
+             self,
+             data_directory: str,
+             persist_directory: str,
+             embedding_model_engine: str,
+             chunk_size: int,
+             chunk_overlap: int) -> None:
+         """
+         Initialize the PrepareVectorDB instance.
+
+         Parameters:
+             data_directory (str): Directory or list of directories containing the documents.
+             persist_directory (str): Directory to save the VectorDB.
+             embedding_model_engine (str): The name of the embedding engine.
+             chunk_size (int): The size of the chunks for document processing.
+             chunk_overlap (int): The overlap between chunks.
+         """
+         self.embedding_model_engine = embedding_model_engine
+         # Other splitter choices: MarkdownHeaderTextSplitter, TokenTextSplitter, etc.
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             separators=[
+                 "\n#{1,6} ",
+                 "```\n",
+                 "\n\\*\\*\\*+\n",
+                 "\n---+\n",
+                 "\n___+\n",
+                 "\n\n",
+                 "\n",
+                 " ",
+                 "",
+             ]
+         )
+         self.data_directory = data_directory
+         self.persist_directory = persist_directory
+         self.embedding = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5",
+                                                   model_kwargs={'device': 'cpu'},
+                                                   encode_kwargs={'normalize_embeddings': True})
+
+     def __load_all_documents(self) -> List:
+         """
+         Load all documents from the specified directory or directories and
+         handle the documents obtained live during chat.
+
+         Returns:
+             List: A list of loaded documents (one entry per page).
+         """
+         if isinstance(self.data_directory, list):
+             print("Loading the uploaded documents...")
+             docs = [doc for doc_dir in self.data_directory
+                     for doc in PyPDFLoader(doc_dir).load()]
+             doc_counter = len(self.data_directory)
+         else:
+             print("Loading documents manually...")
+             document_list = os.listdir(self.data_directory)
+             docs = [doc for doc_name in document_list
+                     for doc in PyPDFLoader(os.path.join(
+                         self.data_directory, doc_name)).load()]
+             doc_counter = len(document_list)
+         print(f"Number of loaded documents: {doc_counter}")
+         print(f"Number of pages: {len(docs)}\n\n")
+
+         return docs
+
+     def __chunk_documents(self, docs: List) -> List:
+         """
+         Chunk the loaded documents using the specified text splitter.
+
+         Parameters:
+             docs (List): The list of loaded documents.
+
+         Returns:
+             List: A list of chunked documents.
+         """
+         print("Chunking documents...")
+         chunked_documents = self.text_splitter.split_documents(docs)
+         print("Number of chunks:", len(chunked_documents), "\n\n")
+         return chunked_documents
+
+     def prepare_and_save_vectordb(self):
+         """
+         Load, chunk, and create a VectorDB with BGE embeddings, and save it.
+
+         Returns:
+             Chroma: The created VectorDB.
+         """
+         docs = self.__load_all_documents()
+         chunked_documents = self.__chunk_documents(docs)
+         print("Preparing vectordb...")
+         vectordb = Chroma.from_documents(
+             documents=chunked_documents,
+             embedding=self.embedding,
+             persist_directory=self.persist_directory
+         )
+         print("Vectordb created and saved!")
+         print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
+         return vectordb
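
Hypothetical usage sketch (not part of this commit): building the local Chroma store with the BGE-small variant above, using the paths and chunking values from configs/app_config.yml. src/upload_data_manually.py performs the same steps with the OpenAI-embeddings variant.

    from prepare_bgesmall_vectordb import PrepareVectorDB

    vectordb = PrepareVectorDB(
        data_directory="data/docs",
        persist_directory="data/vectordb/processed/chroma/",
        embedding_model_engine="BAAI/bge-small-en-v1.5",  # stored for reference; the class hard-codes BGE-small
        chunk_size=1000,
        chunk_overlap=200,
    ).prepare_and_save_vectordb()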
src/prepare_openAIEmbeddings_vectordb.py ADDED
@@ -0,0 +1,120 @@
+ import os
+ from typing import List
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_community.vectorstores import Chroma
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_openai import OpenAIEmbeddings
+
+
+ class PrepareVectorDB:
+     """
+     A class for preparing and saving a VectorDB using OpenAI embeddings.
+
+     Covers loading documents, chunking them, and creating a VectorDB with OpenAI
+     embeddings. Contains methods to prepare and save the vectordb.
+
+     Parameters:
+         data_directory (str): Directory or list of directories containing the documents.
+         persist_directory (str): Directory to save the VectorDB.
+         embedding_model_engine (str): The engine for OpenAI embeddings.
+         chunk_size (int): The size of the chunks for document processing.
+         chunk_overlap (int): The overlap between chunks.
+     """
+
+     def __init__(
+             self,
+             data_directory: str,
+             persist_directory: str,
+             embedding_model_engine: str,
+             chunk_size: int,
+             chunk_overlap: int) -> None:
+         """
+         Initialize the PrepareVectorDB instance.
+
+         Parameters:
+             data_directory (str): Directory or list of directories containing the documents.
+             persist_directory (str): Directory to save the VectorDB.
+             embedding_model_engine (str): The engine for OpenAI embeddings.
+             chunk_size (int): The size of the chunks for document processing.
+             chunk_overlap (int): The overlap between chunks.
+         """
+         self.embedding_model_engine = embedding_model_engine
+         # Other splitter choices: MarkdownHeaderTextSplitter, TokenTextSplitter, etc.
+         self.text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=chunk_size,
+             chunk_overlap=chunk_overlap,
+             separators=[
+                 "\n#{1,6} ",
+                 "```\n",
+                 "\n\\*\\*\\*+\n",
+                 "\n---+\n",
+                 "\n___+\n",
+                 "\n\n",
+                 "\n",
+                 " ",
+                 "",
+             ]
+         )
+         self.data_directory = data_directory
+         self.persist_directory = persist_directory
+         self.embedding = OpenAIEmbeddings()
+
+     def __load_all_documents(self) -> List:
+         """
+         Load all documents from the specified directory or directories and
+         handle the documents obtained live during chat.
+
+         Returns:
+             List: A list of loaded documents (one entry per page).
+         """
+         if isinstance(self.data_directory, list):
+             print("Loading the uploaded documents...")
+             docs = [doc for doc_dir in self.data_directory
+                     for doc in PyPDFLoader(doc_dir).load()]
+             doc_counter = len(self.data_directory)
+         else:
+             print("Loading documents manually...")
+             document_list = os.listdir(self.data_directory)
+             docs = [doc for doc_name in document_list
+                     for doc in PyPDFLoader(os.path.join(
+                         self.data_directory, doc_name)).load()]
+             doc_counter = len(document_list)
+         print(f"Number of loaded documents: {doc_counter}")
+         print(f"Number of pages: {len(docs)}\n\n")
+
+         return docs
+
+     def __chunk_documents(self, docs: List) -> List:
+         """
+         Chunk the loaded documents using the specified text splitter.
+
+         Parameters:
+             docs (List): The list of loaded documents.
+
+         Returns:
+             List: A list of chunked documents.
+         """
+         print("Chunking documents...")
+         chunked_documents = self.text_splitter.split_documents(docs)
+         print("Number of chunks:", len(chunked_documents), "\n\n")
+         return chunked_documents
+
+     def prepare_and_save_vectordb(self):
+         """
+         Load, chunk, and create a VectorDB with OpenAI embeddings, and save it.
+
+         Returns:
+             Chroma: The created VectorDB.
+         """
+         docs = self.__load_all_documents()
+         chunked_documents = self.__chunk_documents(docs)
+         print("Preparing vectordb...")
+         vectordb = Chroma.from_documents(
+             documents=chunked_documents,
+             embedding=self.embedding,
+             persist_directory=self.persist_directory
+         )
+         print("Vectordb created and saved!")
+         print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
+         return vectordb
src/ui_settings.py ADDED
@@ -0,0 +1,35 @@
+ import gradio as gr
+
+
+ class UISettings:
+     """
+     Utility class for managing UI settings.
+
+     This class provides static methods for toggling UI components, such as the
+     reference sidebar, and for handling user feedback.
+     """
+     @staticmethod
+     def toggle_sidebar(state):
+         """
+         Toggle the visibility state of a UI component.
+
+         Parameters:
+             state: The current state of the UI component.
+
+         Returns:
+             Tuple: A tuple containing the updated UI component state and the new state.
+         """
+         state = not state
+         return gr.update(visible=state), state
+
+     @staticmethod
+     def feedback(data: gr.LikeData):
+         """
+         Process user feedback on the generated response.
+
+         Parameters:
+             data (gr.LikeData): Gradio LikeData object containing user feedback.
+         """
+         if data.liked:
+             print("You upvoted this response: " + data.value)
+         else:
+             print("You downvoted this response: " + data.value)
src/upload_data_manually.py ADDED
@@ -0,0 +1,41 @@
+ import os
+
+ # Swap these imports to use BGE embeddings instead of OpenAI embeddings
+ # from prepare_bgesmall_vectordb import PrepareVectorDB
+ from prepare_openAIEmbeddings_vectordb import PrepareVectorDB
+
+ from load_config import LoadConfig
+
+ CONFIG = LoadConfig()
+
+
+ def upload_data_manually() -> None:
+     """
+     Uploads data manually to the VectorDB.
+
+     This function initializes a PrepareVectorDB instance with configuration parameters
+     such as data_directory, persist_directory, embedding_model_engine, chunk_size,
+     and chunk_overlap. It then checks if the VectorDB already exists in the specified
+     persist_directory. If not, it calls the prepare_and_save_vectordb method to
+     create and save the VectorDB. If the VectorDB already exists, a message is printed
+     indicating its presence.
+
+     Returns:
+         None
+     """
+     prepare_vectordb_instance = PrepareVectorDB(
+         data_directory=CONFIG.data_directory,
+         persist_directory=CONFIG.persist_directory,
+         embedding_model_engine=CONFIG.embedding_model_engine,
+         chunk_size=CONFIG.chunk_size,
+         chunk_overlap=CONFIG.chunk_overlap,
+     )
+     if len(os.listdir(CONFIG.persist_directory)) == 0:
+         prepare_vectordb_instance.prepare_and_save_vectordb()
+     else:
+         print(f"VectorDB already exists in {CONFIG.persist_directory}")
+     return None
+
+
+ if __name__ == "__main__":
+     upload_data_manually()
src/upload_file.py ADDED
@@ -0,0 +1,40 @@
+ from src.prepare_openAIEmbeddings_vectordb import PrepareVectorDB
+ from typing import List, Tuple
+ from src.load_config import LoadConfig
+
+ APP_CONFIG = LoadConfig()
+
+
+ class UploadFile:
+     """
+     Utility class for handling file uploads and processing.
+
+     This class provides static methods for checking directories and processing uploaded files
+     to prepare a VectorDB.
+     """
+
+     @staticmethod
+     def process_uploaded_files(files_dir: List, chatbot: List, rag_with_dropdown: str, model_choice: str = None) -> Tuple:
+         """
+         Prepares and saves a VectorDB from uploaded files.
+
+         Parameters:
+             files_dir (List): List of paths to the uploaded files.
+             chatbot (List): The chatbot's conversation history.
+             rag_with_dropdown (str): The selected RAG mode ("Existing database" or "Upload new data").
+             model_choice (str): The selected model; passed through by the Gradio wiring in app.py but unused here.
+
+         Returns:
+             Tuple: A tuple containing an empty string and the updated chatbot history.
+         """
+         if rag_with_dropdown == "Upload new data":
+             prepare_vectordb_instance = PrepareVectorDB(data_directory=files_dir,
+                                                         persist_directory=APP_CONFIG.custom_persist_directory,
+                                                         embedding_model_engine=APP_CONFIG.embedding_model_engine,
+                                                         chunk_size=APP_CONFIG.chunk_size,
+                                                         chunk_overlap=APP_CONFIG.chunk_overlap)
+             prepare_vectordb_instance.prepare_and_save_vectordb()
+             chatbot.append(
+                 (" ", "Uploaded files are ready for querying."))
+         else:
+             chatbot.append(
+                 (" ", "If you want to upload your own PDF, please select 'Upload new data' from the 'RAG with' dropdown."))
+         return "", chatbot