upload model
- .gitattributes +5 -0
- .gitignore +12 -0
- configparser.ini +169 -0
- convo_qa_chain.py +387 -0
- data/ABPI Code of Practice for the Pharmaceutical Industry 2021.pdf +0 -0
- data/Attention Is All You Need.pdf +3 -0
- data/Gradient Descent The Ultimate Optimizer.pdf +3 -0
- data/JP Morgan 2022 Environmental Social Governance Report.pdf +3 -0
- data/Language Models are Few-Shot Learners.pdf +3 -0
- data/Language Models are Unsupervised Multitask Learners.pdf +0 -0
- data/United Nations 2022 Annual Report.pdf +3 -0
- docs2db.py +346 -0
- figs/High_Level_Architecture.png +0 -0
- figs/Sliding_Window_Chunking.png +0 -0
- main.py +150 -0
- requirements.txt +13 -1
- toolkit/___init__.py +0 -0
- toolkit/local_llm.py +193 -0
- toolkit/prompts.py +169 -0
- toolkit/retrivers.py +643 -0
- toolkit/together_api_llm.py +72 -0
- toolkit/utils.py +389 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/Attention[[:space:]]Is[[:space:]]All[[:space:]]You[[:space:]]Need.pdf filter=lfs diff=lfs merge=lfs -text
+data/Gradient[[:space:]]Descent[[:space:]]The[[:space:]]Ultimate[[:space:]]Optimizer.pdf filter=lfs diff=lfs merge=lfs -text
+data/JP[[:space:]]Morgan[[:space:]]2022[[:space:]]Environmental[[:space:]]Social[[:space:]]Governance[[:space:]]Report.pdf filter=lfs diff=lfs merge=lfs -text
+data/Language[[:space:]]Models[[:space:]]are[[:space:]]Few-Shot[[:space:]]Learners.pdf filter=lfs diff=lfs merge=lfs -text
+data/United[[:space:]]Nations[[:space:]]2022[[:space:]]Annual[[:space:]]Report.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,12 @@
.DS_Store
.history
.vscode
__pycache__
Archieve
database_store
IncarnaMind.log
experiments.ipynb
.pylintrc
.flake8
models/
model/
configparser.ini
ADDED
@@ -0,0 +1,169 @@
[tokens]
; Enter one/all of your API keys here.
; E.g., OPENAI_API_KEY = sk-xxxxxxx
OPENAI_API_KEY = sk-proj-2JwvyIn7WoKlkbjPOYVWT3BlbkFJnGAk65YAzvPH6cEVQXmr
ANTHROPIC_API_KEY = xxxxx
TOGETHER_API_KEY = xxxxx
; If you use Meta-Llama models, you may need a Hugging Face token for access.
HUGGINGFACE_TOKEN = xxxxx
VERSION = 1.0.1


[directory]
; Directory for source files.
DOCS_DIR = ./data
; Directory to store embeddings and LangChain documents.
DB_DIR = ./database_store
LOCAL_MODEL_DIR = ./models


; The parameters below are optional to modify:
; --------------------------------------------
[parameters]
; Model name schema: Model Provider|Model Name|Model File. Model File is only valid for the GGUF format; set it to None for other formats.

; For example:
; OpenAI|gpt-3.5-turbo|None
; OpenAI|gpt-4|None
; Anthropic|claude-2.0|None
; Together|togethercomputer/llama-2-70b-chat|None
; HuggingFace|TheBloke/Llama-2-70B-chat-GGUF|llama-2-70b-chat.q4_K_M.gguf
; HuggingFace|meta-llama/Llama-2-70b-chat-hf|None

; The full Together.AI model list can be found at the end of this file; we currently only support quantized GGUF and full Hugging Face local LLMs.
MODEL_NAME = OpenAI|gpt-4-1106-preview|None
; LLM temperature
TEMPURATURE = 0
; Maximum tokens for storing chat history.
MAX_CHAT_HISTORY = 800
; Maximum tokens of LLM context for retrieved information.
MAX_LLM_CONTEXT = 1200
; Maximum tokens for LLM generation.
MAX_LLM_GENERATION = 1000
; Supported embeddings: openAIEmbeddings and hkunlpInstructorLarge.
EMBEDDING_NAME = openAIEmbeddings

; This is dependent on your GPU type.
N_GPU_LAYERS = 100
; This depends on your GPU and CPU RAM when using open-source LLMs.
N_BATCH = 512


; The base (small) chunk size for first-stage document retrieval.
BASE_CHUNK_SIZE = 100
; Set to 0 for no overlap.
CHUNK_OVERLAP = 0
; The final retrieval (medium) chunk size will be BASE_CHUNK_SIZE * CHUNK_SCALE.
CHUNK_SCALE = 3
WINDOW_STEPS = 3
; The number of tokens in a window chunk will be BASE_CHUNK_SIZE * WINDOW_SCALE.
WINDOW_SCALE = 18

; Ratio of BM25 retriever to Chroma vector store retriever.
RETRIEVER_WEIGHTS = 0.5, 0.5
; The number of retrieved chunks will range from FIRST_RETRIEVAL_K to 2*FIRST_RETRIEVAL_K due to the ensemble retriever.
FIRST_RETRIEVAL_K = 3
; The number of retrieved chunks will range from SECOND_RETRIEVAL_K to 2*SECOND_RETRIEVAL_K due to the ensemble retriever.
SECOND_RETRIEVAL_K = 3
; Number of windows (large chunks) for the third retriever.
NUM_WINDOWS = 2
; (The third retrieval gets the final chunks passed to the LLM QA chain. The 'k' value is dynamic (based on MAX_LLM_CONTEXT), depending on the number of rephrased questions and retrieved documents.)


[logging]
; If you do not want to enable logging, set enabled to False.
enabled = True
level = INFO
filename = IncarnaMind.log
format = %(asctime)s [%(levelname)s] %(name)s: %(message)s


; Together.AI supported models:

; 0 Austism/chronos-hermes-13b
; 1 EleutherAI/pythia-12b-v0
; 2 EleutherAI/pythia-1b-v0
; 3 EleutherAI/pythia-2.8b-v0
; 4 EleutherAI/pythia-6.9b
; 5 Gryphe/MythoMax-L2-13b
; 6 HuggingFaceH4/starchat-alpha
; 7 NousResearch/Nous-Hermes-13b
; 8 NousResearch/Nous-Hermes-Llama2-13b
; 9 NumbersStation/nsql-llama-2-7B
; 10 OpenAssistant/llama2-70b-oasst-sft-v10
; 11 OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5
; 12 OpenAssistant/stablelm-7b-sft-v7-epoch-3
; 13 Phind/Phind-CodeLlama-34B-Python-v1
; 14 Phind/Phind-CodeLlama-34B-v2
; 15 SG161222/Realistic_Vision_V3.0_VAE
; 16 WizardLM/WizardCoder-15B-V1.0
; 17 WizardLM/WizardCoder-Python-34B-V1.0
; 18 WizardLM/WizardLM-70B-V1.0
; 19 bigcode/starcoder
; 20 databricks/dolly-v2-12b
; 21 databricks/dolly-v2-3b
; 22 databricks/dolly-v2-7b
; 23 defog/sqlcoder
; 24 garage-bAInd/Platypus2-70B-instruct
; 25 huggyllama/llama-13b
; 26 huggyllama/llama-30b
; 27 huggyllama/llama-65b
; 28 huggyllama/llama-7b
; 29 lmsys/fastchat-t5-3b-v1.0
; 30 lmsys/vicuna-13b-v1.3
; 31 lmsys/vicuna-13b-v1.5-16k
; 32 lmsys/vicuna-13b-v1.5
; 33 lmsys/vicuna-7b-v1.3
; 34 prompthero/openjourney
; 35 runwayml/stable-diffusion-v1-5
; 36 stabilityai/stable-diffusion-2-1
; 37 stabilityai/stable-diffusion-xl-base-1.0
; 38 togethercomputer/CodeLlama-13b-Instruct
; 39 togethercomputer/CodeLlama-13b-Python
; 40 togethercomputer/CodeLlama-13b
; 41 togethercomputer/CodeLlama-34b-Instruct
; 42 togethercomputer/CodeLlama-34b-Python
; 43 togethercomputer/CodeLlama-34b
; 44 togethercomputer/CodeLlama-7b-Instruct
; 45 togethercomputer/CodeLlama-7b-Python
; 46 togethercomputer/CodeLlama-7b
; 47 togethercomputer/GPT-JT-6B-v1
; 48 togethercomputer/GPT-JT-Moderation-6B
; 49 togethercomputer/GPT-NeoXT-Chat-Base-20B
; 50 togethercomputer/Koala-13B
; 51 togethercomputer/LLaMA-2-7B-32K
; 52 togethercomputer/Llama-2-7B-32K-Instruct
; 53 togethercomputer/Pythia-Chat-Base-7B-v0.16
; 54 togethercomputer/Qwen-7B-Chat
; 55 togethercomputer/Qwen-7B
; 56 togethercomputer/RedPajama-INCITE-7B-Base
; 57 togethercomputer/RedPajama-INCITE-7B-Chat
; 58 togethercomputer/RedPajama-INCITE-7B-Instruct
; 59 togethercomputer/RedPajama-INCITE-Base-3B-v1
; 60 togethercomputer/RedPajama-INCITE-Chat-3B-v1
; 61 togethercomputer/RedPajama-INCITE-Instruct-3B-v1
; 62 togethercomputer/alpaca-7b
; 63 togethercomputer/codegen2-16B
; 64 togethercomputer/codegen2-7B
; 65 togethercomputer/falcon-40b-instruct
; 66 togethercomputer/falcon-40b
; 67 togethercomputer/falcon-7b-instruct
; 68 togethercomputer/falcon-7b
; 69 togethercomputer/guanaco-13b
; 70 togethercomputer/guanaco-33b
; 71 togethercomputer/guanaco-65b
; 72 togethercomputer/guanaco-7b
; 73 togethercomputer/llama-2-13b-chat
; 74 togethercomputer/llama-2-13b
; 75 togethercomputer/llama-2-70b-chat
; 76 togethercomputer/llama-2-70b
; 77 togethercomputer/llama-2-7b-chat
; 78 togethercomputer/llama-2-7b
; 79 togethercomputer/mpt-30b-chat
; 80 togethercomputer/mpt-30b-instruct
; 81 togethercomputer/mpt-30b
; 82 togethercomputer/mpt-7b-chat
; 83 togethercomputer/mpt-7b
; 84 togethercomputer/replit-code-v1-3b
; 85 upstage/SOLAR-0-70b-16bit
; 86 wavymulder/Analog-Diffusion
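For reference, a minimal sketch (not part of the commit) of how the chunking parameters above combine, using only the arithmetic stated in the config comments:

# Approximate token budgets implied by the [parameters] section above.
base_chunk_size = 100   # BASE_CHUNK_SIZE, tokens per small chunk
chunk_scale = 3         # CHUNK_SCALE
window_scale = 18       # WINDOW_SCALE

medium_chunk_tokens = base_chunk_size * chunk_scale    # 300 tokens per medium chunk
window_chunk_tokens = base_chunk_size * window_scale   # 1800 tokens per window chunk

print(medium_chunk_tokens, window_chunk_tokens)

So with the defaults, second-stage retrieval works over roughly 300-token chunks, while each sliding window spans roughly 1800 tokens.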
convo_qa_chain.py
ADDED
@@ -0,0 +1,387 @@
"""Conversational QA Chain"""
from __future__ import annotations
import inspect
import logging
from typing import Any, Dict, List, Optional, Tuple
from pydantic import Field

from langchain.schema import BasePromptTemplate, BaseRetriever, Document
from langchain.schema.language_model import BaseLanguageModel
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.conversational_retrieval.base import (
    BaseConversationalRetrievalChain,
)
from langchain.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
    Callbacks,
)

from toolkit.utils import (
    Config,
    _get_chat_history,
    _get_standalone_questions_list,
)
from toolkit.retrivers import MyRetriever
from toolkit.prompts import PromptTemplates

configs = Config("configparser.ini")
logger = logging.getLogger(__name__)

prompt_templates = PromptTemplates()


class ConvoRetrievalChain(BaseConversationalRetrievalChain):
    """Chain for having a conversation based on retrieved documents.

    This chain takes in chat history (a list of messages) and new questions,
    and then returns an answer to that question.
    The algorithm for this chain consists of three parts:

    1. Use the chat history and the new question to create a "standalone question".
    This is done so that this question can be passed into the retrieval step to fetch
    relevant documents. If only the new question was passed in, then relevant context
    may be lacking. If the whole conversation was passed into retrieval, there may
    be unnecessary information there that would distract from retrieval.

    2. This new question is passed to the retriever and relevant documents are
    returned.

    3. The retrieved documents are passed to an LLM along with either the new question
    (default behavior) or the original question and chat history to generate a final
    response.

    Example:
        .. code-block:: python

            from langchain.chains import (
                StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
            )
            from langchain.prompts import PromptTemplate
            from langchain.llms import OpenAI

            combine_docs_chain = StuffDocumentsChain(...)
            vectorstore = ...
            retriever = vectorstore.as_retriever()

            # This controls how the standalone question is generated.
            # Should take `chat_history` and `question` as input variables.
            template = (
                "Combine the chat history and follow up question into "
                "a standalone question. Chat History: {chat_history}"
                "Follow up question: {question}"
            )
            prompt = PromptTemplate.from_template(template)
            llm = OpenAI()
            question_generator_chain = LLMChain(llm=llm, prompt=prompt)
            chain = ConversationalRetrievalChain(
                combine_docs_chain=combine_docs_chain,
                retriever=retriever,
                question_generator=question_generator_chain,
            )
    """

    retriever: MyRetriever = Field(exclude=True)
    """Retriever to use to fetch documents."""
    file_names: List = Field(exclude=True)
    """file_names (List): List of file names used for retrieval."""

    def _get_docs(
        self,
        question: str,
        inputs: Dict[str, Any],
        num_query: int,
        *,
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> List[Document]:
        """Get docs."""
        try:
            docs = self.retriever.get_relevant_documents(
                question, num_query=num_query, run_manager=run_manager
            )
            return docs
        except (IOError, FileNotFoundError) as error:
            logger.error("An error occurred in _get_docs: %s", error)
            return []

    def _retrieve(
        self,
        question_list: List[str],
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Tuple[str, Dict[str, Any]]:
        num_query = len(question_list)
        accepts_run_manager = (
            "run_manager" in inspect.signature(self._get_docs).parameters
        )

        total_results = {}
        for question in question_list:
            docs_dict = (
                self._get_docs(
                    question, inputs, num_query=num_query, run_manager=run_manager
                )
                if accepts_run_manager
                else self._get_docs(question, inputs, num_query=num_query)
            )

            for file_name, docs in docs_dict.items():
                if file_name not in total_results:
                    total_results[file_name] = docs
                else:
                    total_results[file_name].extend(docs)

            logger.info(
                "-----step_done--------------------------------------------------",
            )

        snippets = ""
        redundancy = set()
        for file_name, docs in total_results.items():
            sorted_docs = sorted(docs, key=lambda x: x.metadata["medium_chunk_idx"])
            temp = "\n".join(
                doc.page_content
                for doc in sorted_docs
                if doc.metadata["page_content_md5"] not in redundancy
            )
            redundancy.update(doc.metadata["page_content_md5"] for doc in sorted_docs)
            snippets += f"\nContext about {file_name}:\n{{{temp}}}\n"

        return snippets, docs_dict

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        question = inputs["question"]
        get_chat_history = self.get_chat_history or _get_chat_history
        chat_history_str = get_chat_history(inputs["chat_history"])

        callbacks = _run_manager.get_child()
        new_questions = self.question_generator.run(
            question=question,
            chat_history=chat_history_str,
            database=self.file_names,
            callbacks=callbacks,
        )
        logger.info("new_questions: %s", new_questions)
        new_question_list = _get_standalone_questions_list(new_questions, question)[:3]
        # print("new_question_list:", new_question_list)
        logger.info("user_input: %s", question)
        logger.info("new_question_list: %s", new_question_list)

        snippets, source_docs = self._retrieve(
            new_question_list, inputs, run_manager=_run_manager
        )

        docs = [
            Document(
                page_content=snippets,
                metadata={},
            )
        ]

        new_inputs = inputs.copy()
        new_inputs["chat_history"] = chat_history_str
        answer = self.combine_docs_chain.run(
            input_documents=docs,
            database=self.file_names,
            callbacks=_run_manager.get_child(),
            **new_inputs,
        )
        output: Dict[str, Any] = {self.output_key: answer}
        if self.return_source_documents:
            output["source_documents"] = source_docs
        if self.return_generated_question:
            output["generated_question"] = new_questions

        logger.info("*****response*****: %s", output["answer"])
        logger.info(
            "=====epoch_done============================================================",
        )
        return output

    async def _aget_docs(
        self,
        question: str,
        inputs: Dict[str, Any],
        num_query: int,
        *,
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> List[Document]:
        """Get docs."""
        try:
            docs = await self.retriever.aget_relevant_documents(
                question, num_query=num_query, run_manager=run_manager
            )
            return docs
        except (IOError, FileNotFoundError) as error:
            logger.error("An error occurred in _aget_docs: %s", error)
            return []

    async def _aretrieve(
        self,
        question_list: List[str],
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Tuple[str, Dict[str, Any]]:
        num_query = len(question_list)
        accepts_run_manager = (
            "run_manager" in inspect.signature(self._aget_docs).parameters
        )

        total_results = {}
        for question in question_list:
            docs_dict = (
                await self._aget_docs(
                    question, inputs, num_query=num_query, run_manager=run_manager
                )
                if accepts_run_manager
                else await self._aget_docs(question, inputs, num_query=num_query)
            )

            for file_name, docs in docs_dict.items():
                if file_name not in total_results:
                    total_results[file_name] = docs
                else:
                    total_results[file_name].extend(docs)

            logger.info(
                "-----step_done--------------------------------------------------",
            )

        snippets = ""
        redundancy = set()
        for file_name, docs in total_results.items():
            sorted_docs = sorted(docs, key=lambda x: x.metadata["medium_chunk_idx"])
            temp = "\n".join(
                doc.page_content
                for doc in sorted_docs
                if doc.metadata["page_content_md5"] not in redundancy
            )
            redundancy.update(doc.metadata["page_content_md5"] for doc in sorted_docs)
            snippets += f"\nContext about {file_name}:\n{{{temp}}}\n"

        return snippets, docs_dict

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        question = inputs["question"]
        get_chat_history = self.get_chat_history or _get_chat_history
        chat_history_str = get_chat_history(inputs["chat_history"])

        callbacks = _run_manager.get_child()
        new_questions = await self.question_generator.arun(
            question=question,
            chat_history=chat_history_str,
            database=self.file_names,
            callbacks=callbacks,
        )
        new_question_list = _get_standalone_questions_list(new_questions, question)[:3]
        logger.info("new_questions: %s", new_questions)
        logger.info("new_question_list: %s", new_question_list)

        snippets, source_docs = await self._aretrieve(
            new_question_list, inputs, run_manager=_run_manager
        )

        docs = [
            Document(
                page_content=snippets,
                metadata={},
            )
        ]

        new_inputs = inputs.copy()
        new_inputs["chat_history"] = chat_history_str
        answer = await self.combine_docs_chain.arun(
            input_documents=docs,
            database=self.file_names,
            callbacks=_run_manager.get_child(),
            **new_inputs,
        )
        output: Dict[str, Any] = {self.output_key: answer}
        if self.return_source_documents:
            output["source_documents"] = source_docs
        if self.return_generated_question:
            output["generated_question"] = new_questions

        logger.info("*****response*****: %s", output["answer"])
        logger.info(
            "=====epoch_done============================================================",
        )

        return output

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        retriever: BaseRetriever,
        condense_question_prompt: BasePromptTemplate = prompt_templates.get_refine_qa_template(
            configs.model_name
        ),
        chain_type: str = "stuff",  # only the stuff chain is supported for now
        verbose: bool = False,
        condense_question_llm: Optional[BaseLanguageModel] = None,
        combine_docs_chain_kwargs: Optional[Dict] = None,
        callbacks: Callbacks = None,
        **kwargs: Any,
    ) -> BaseConversationalRetrievalChain:
        """Convenience method to load chain from LLM and retriever.

        This provides some logic to create the `question_generator` chain
        as well as the combine_docs_chain.

        Args:
            llm: The default language model to use at every part of this chain
                (e.g., in both the question generation and the answering).
            retriever: The retriever to use to fetch relevant documents from.
            condense_question_prompt: The prompt to use to condense the chat history
                and new question into standalone question(s).
            chain_type: The chain type to use to create the combine_docs_chain, will
                be sent to `load_qa_chain`.
            verbose: Verbosity flag for logging to stdout.
            condense_question_llm: The language model to use for condensing the chat
                history and new question into standalone question(s). If none is
                provided, will default to `llm`.
            combine_docs_chain_kwargs: Parameters to pass as kwargs to `load_qa_chain`
                when constructing the combine_docs_chain.
            callbacks: Callbacks to pass to all subchains.
            **kwargs: Additional parameters to pass when initializing
                ConversationalRetrievalChain.
        """
        combine_docs_chain_kwargs = combine_docs_chain_kwargs or {
            "prompt": prompt_templates.get_retrieval_qa_template_selector(
                configs.model_name
            ).get_prompt(llm)
        }
        doc_chain = load_qa_chain(
            llm,
            chain_type=chain_type,
            verbose=verbose,
            callbacks=callbacks,
            **combine_docs_chain_kwargs,
        )

        _llm = condense_question_llm or llm
        condense_question_chain = LLMChain(
            llm=_llm,
            prompt=condense_question_prompt,
            verbose=verbose,
            callbacks=callbacks,
        )
        return cls(
            retriever=retriever,
            combine_docs_chain=doc_chain,
            question_generator=condense_question_chain,
            callbacks=callbacks,
            **kwargs,
        )
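A note on the flow above: `question_generator.run(...)` returns the numbered list of one-hop queries requested by `REFINE_QA_TEMPLATE` in toolkit/prompts.py, and `_get_standalone_questions_list` (defined in toolkit/utils.py, which is not expanded in this commit view) turns that text back into a Python list. A minimal, purely illustrative sketch of that kind of parsing, with a hypothetical helper name:

import re
from typing import List

def parse_standalone_questions(llm_output: str, fallback: str) -> List[str]:
    """Pull '1. ...'-style lines out of the generator output; fall back to the
    original user question if nothing matches (hypothetical helper, for illustration)."""
    queries = re.findall(r"^\s*\d+\.\s*(.+)$", llm_output, flags=re.MULTILINE)
    return [q.strip() for q in queries] or [fallback]

print(parse_standalone_questions("1. What is BM25?\n2. What is a Chroma vector store?", "original question"))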
data/ABPI Code of Practice for the Pharmaceutical Industry 2021.pdf
ADDED
Binary file (803 kB).

data/Attention Is All You Need.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b7d72988fd8107d07f7d278bf0ba6621adb6ed47df74be4014fa4a01f03aff6a
size 2215244

data/Gradient Descent The Ultimate Optimizer.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c76077e02756ef3281ce3b1195d080009cb88e00382a8fc225948db339053296
size 1923635

data/JP Morgan 2022 Environmental Social Governance Report.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:80eab2c81a6c82bde9ccff1a8636fddc8ce1457a13c833d8a7f1e374a4bb439f
size 7474626

data/Language Models are Few-Shot Learners.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:97fd272f1fdfc18677462d0292f5fbf26ca86b4d1b485c2dba03269b643a0e83
size 6768044

data/Language Models are Unsupervised Multitask Learners.pdf
ADDED
Binary file (583 kB).

data/United Nations 2022 Annual Report.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b4ee2835c06f98e74ab93aa69a0c026577c464fc6bd3942068f14cba5dcad536
size 36452281
docs2db.py
ADDED
@@ -0,0 +1,346 @@
"""
This module saves documents as embeddings and LangChain Documents.
"""
import os
import glob
import pickle
from typing import List
from multiprocessing import Pool
from collections import deque
import hashlib
import tiktoken

from tqdm import tqdm

from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)
from langchain.document_loaders import (
    PyPDFLoader,
    TextLoader,
)

from toolkit.utils import Config, choose_embeddings, clean_text


# Load the config file
configs = Config("configparser.ini")

os.environ["OPENAI_API_KEY"] = configs.openai_api_key
os.environ["ANTHROPIC_API_KEY"] = configs.anthropic_api_key

embedding_store_path = configs.db_dir
files_path = glob.glob(configs.docs_dir + "/*")

tokenizer_name = tiktoken.encoding_for_model("gpt-3.5-turbo")
tokenizer = tiktoken.get_encoding(tokenizer_name.name)

loaders = {
    "pdf": (PyPDFLoader, {}),
    "txt": (TextLoader, {}),
}


def tiktoken_len(text: str):
    """Calculate the token length of a given text string using tiktoken.

    Args:
        text (str): The text to be tokenized.

    Returns:
        int: The length of the tokenized text.
    """
    tokens = tokenizer.encode(text, disallowed_special=())

    return len(tokens)


def string2md5(text: str):
    """Convert a string to its MD5 hash.

    Args:
        text (str): The text to be hashed.

    Returns:
        str: The MD5 hash of the input string.
    """
    hash_md5 = hashlib.md5()
    hash_md5.update(text.encode("utf-8"))

    return hash_md5.hexdigest()


def load_file(file_path):
    """Load a file and return its content as a Document object.

    Args:
        file_path (str): The path to the file.

    Returns:
        Document: The loaded document.
    """
    ext = file_path.split(".")[-1]

    if ext in loaders:
        loader_type, args = loaders[ext]
        loader = loader_type(file_path, **args)
        doc = loader.load()

        return doc

    raise ValueError(f"Extension {ext} not supported")


def docs2vectorstore(docs: List[Document], embedding_name: str, suffix: str = ""):
    """Convert a list of Documents into a Chroma vector store.

    Args:
        docs (Document): The list of Documents.
        embedding_name (str): The name of the embedding to use.
        suffix (str, optional): Suffix for the embedding. Defaults to "".
    """
    embedding = choose_embeddings(embedding_name)
    name = f"{embedding_name}_{suffix}"
    # if embedding_store_path does not exist, create it
    if not os.path.exists(embedding_store_path):
        os.makedirs(embedding_store_path)
    Chroma.from_documents(
        docs,
        embedding,
        persist_directory=f"{embedding_store_path}/chroma_{name}",
    )


def file_names2pickle(file_names: list, save_name: str = ""):
    """Save the list of file names to a pickle file.

    Args:
        file_names (list): The list of file names.
        save_name (str, optional): The name for the saved pickle file. Defaults to "".
    """
    name = f"{save_name}"
    if not os.path.exists(embedding_store_path):
        os.makedirs(embedding_store_path)
    with open(f"{embedding_store_path}/{name}.pkl", "wb") as file:
        pickle.dump(file_names, file)


def docs2pickle(docs: List[Document], suffix: str = ""):
    """Serializes a list of Document objects to a pickle file.

    Args:
        docs (Document): List of Document objects.
        suffix (str, optional): Suffix for the pickle file. Defaults to "".
    """
    for doc in docs:
        doc.page_content = clean_text(doc.page_content)
    name = f"pickle_{suffix}"
    if not os.path.exists(embedding_store_path):
        os.makedirs(embedding_store_path)
    with open(f"{embedding_store_path}/docs_{name}.pkl", "wb") as file:
        pickle.dump(docs, file)


def split_doc(
    doc: List[Document], chunk_size: int, chunk_overlap: int, chunk_idx_name: str
):
    """Splits a document into smaller chunks based on the provided size and overlap.

    Args:
        doc (Document): Document to be split.
        chunk_size (int): Size of each chunk.
        chunk_overlap (int): Overlap between adjacent chunks.
        chunk_idx_name (str): Metadata key for storing chunk indices.

    Returns:
        list: List of Document objects representing the chunks.
    """
    data_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=tiktoken_len,
    )
    doc_split = data_splitter.split_documents(doc)
    chunk_idx = 0

    for d_split in doc_split:
        d_split.metadata[chunk_idx_name] = chunk_idx
        chunk_idx += 1

    return doc_split


def process_metadata(doc: List[Document]):
    """Processes and updates the metadata for a list of Document objects.

    Args:
        doc (list): List of Document objects.
    """
    # get file name and remove extension
    file_name_with_extension = os.path.basename(doc[0].metadata["source"])
    file_name, _ = os.path.splitext(file_name_with_extension)

    for _, item in enumerate(doc):
        for key, value in item.metadata.items():
            if isinstance(value, list):
                item.metadata[key] = str(value)
        item.metadata["page_content"] = item.page_content
        item.metadata["page_content_md5"] = string2md5(item.page_content)
        item.metadata["source_md5"] = string2md5(item.metadata["source"])
        item.page_content = f"{file_name}\n{item.page_content}"


def add_window(
    doc: Document, window_steps: int, window_size: int, window_idx_name: str
):
    """Adds windowing information to the metadata of each document in the list.

    Args:
        doc (Document): List of Document objects.
        window_steps (int): Step size for windowing.
        window_size (int): Size of each window.
        window_idx_name (str): Metadata key for storing window indices.
    """
    window_id = 0
    window_deque = deque()

    for idx, item in enumerate(doc):
        if idx % window_steps == 0 and idx != 0 and idx < len(doc) - window_size:
            window_id += 1
        window_deque.append(window_id)

        if len(window_deque) > window_size:
            for _ in range(window_steps):
                window_deque.popleft()

        window = set(window_deque)
        item.metadata[f"{window_idx_name}_lower_bound"] = min(window)
        item.metadata[f"{window_idx_name}_upper_bound"] = max(window)


def merge_metadata(dicts_list: dict):
    """Merges a list of metadata dictionaries into a single dictionary.

    Args:
        dicts_list (list): List of metadata dictionaries.

    Returns:
        dict: Merged metadata dictionary.
    """
    merged_dict = {}
    bounds_dict = {}
    keys_to_remove = set()

    for dic in dicts_list:
        for key, value in dic.items():
            if key in merged_dict:
                if value not in merged_dict[key]:
                    merged_dict[key].append(value)
            else:
                merged_dict[key] = [value]

    for key, values in merged_dict.items():
        if len(values) > 1 and all(isinstance(x, (int, float)) for x in values):
            bounds_dict[f"{key}_lower_bound"] = min(values)
            bounds_dict[f"{key}_upper_bound"] = max(values)
            keys_to_remove.add(key)

    merged_dict.update(bounds_dict)

    for key in keys_to_remove:
        del merged_dict[key]

    return {
        k: v[0] if isinstance(v, list) and len(v) == 1 else v
        for k, v in merged_dict.items()
    }


def merge_chunks(doc: Document, scale_factor: int, chunk_idx_name: str):
    """Merges adjacent chunks into larger chunks based on a scaling factor.

    Args:
        doc (Document): List of Document objects.
        scale_factor (int): The number of small chunks to merge into a larger chunk.
        chunk_idx_name (str): Metadata key for storing chunk indices.

    Returns:
        list: List of Document objects representing the merged chunks.
    """
    merged_doc = []
    page_content = ""
    metadata_list = []
    chunk_idx = 0

    for idx, item in enumerate(doc):
        page_content += item.page_content
        metadata_list.append(item.metadata)

        if (idx + 1) % scale_factor == 0 or idx == len(doc) - 1:
            metadata = merge_metadata(metadata_list)
            metadata[chunk_idx_name] = chunk_idx
            merged_doc.append(
                Document(
                    page_content=page_content,
                    metadata=metadata,
                )
            )
            chunk_idx += 1
            page_content = ""
            metadata_list = []

    return merged_doc


def process_files():
    """Main function for processing files. Loads, tokenizes, and saves document data."""
    with Pool() as pool:
        chunks_small = []
        chunks_medium = []
        file_names = []

        with tqdm(total=len(files_path), desc="Processing files", ncols=80) as pbar:
            for doc in pool.imap_unordered(load_file, files_path):
                file_name_with_extension = os.path.basename(doc[0].metadata["source"])
                # file_name, _ = os.path.splitext(file_name_with_extension)

                chunk_split_small = split_doc(
                    doc=doc,
                    chunk_size=configs.base_chunk_size,
                    chunk_overlap=configs.chunk_overlap,
                    chunk_idx_name="small_chunk_idx",
                )
                add_window(
                    doc=chunk_split_small,
                    window_steps=configs.window_steps,
                    window_size=configs.window_scale,
                    window_idx_name="large_chunks_idx",
                )

                chunk_split_medium = merge_chunks(
                    doc=chunk_split_small,
                    scale_factor=configs.chunk_scale,
                    chunk_idx_name="medium_chunk_idx",
                )

                process_metadata(chunk_split_small)
                process_metadata(chunk_split_medium)

                file_names.append(file_name_with_extension)
                chunks_small.extend(chunk_split_small)
                chunks_medium.extend(chunk_split_medium)

                pbar.update()

    file_names2pickle(file_names, save_name="file_names")

    docs2vectorstore(chunks_small, configs.embedding_name, suffix="chunks_small")
    docs2vectorstore(chunks_medium, configs.embedding_name, suffix="chunks_medium")

    docs2pickle(chunks_small, suffix="chunks_small")
    docs2pickle(chunks_medium, suffix="chunks_medium")


if __name__ == "__main__":
    process_files()
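To make the sliding-window bookkeeping in `add_window()` concrete, here is a small self-contained illustration (toy values, not part of the commit) that reproduces the same index arithmetic on plain integers instead of LangChain Documents:

from collections import deque

def window_bounds(num_chunks: int, window_steps: int, window_size: int):
    """Mirror of the loop in add_window(): returns (lower, upper) window ids per chunk."""
    window_id = 0
    window_deque = deque()
    bounds = []
    for idx in range(num_chunks):
        if idx % window_steps == 0 and idx != 0 and idx < num_chunks - window_size:
            window_id += 1
        window_deque.append(window_id)
        if len(window_deque) > window_size:
            for _ in range(window_steps):
                window_deque.popleft()
        window = set(window_deque)
        bounds.append((min(window), max(window)))
    return bounds

# Toy run with a small window; the real config uses WINDOW_STEPS=3, WINDOW_SCALE=18.
print(window_bounds(num_chunks=12, window_steps=3, window_size=6))

Each small chunk ends up tagged with the lowest and highest window id that covers it, which the retriever in toolkit/retrivers.py (not expanded in this commit view) can presumably use to map a hit back to its surrounding windows.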
figs/High_Level_Architecture.png
ADDED
figs/Sliding_Window_Chunking.png
ADDED
main.py
ADDED
@@ -0,0 +1,150 @@
"""Entry point for the conversational QA chain."""
from __future__ import annotations
import os
import re
import time
import logging

from langchain.chat_models import ChatOpenAI, ChatAnthropic
from langchain.memory import ConversationTokenBufferMemory
from convo_qa_chain import ConvoRetrievalChain

from toolkit.together_api_llm import TogetherLLM
from toolkit.retrivers import MyRetriever
from toolkit.local_llm import load_local_llm
from toolkit.utils import (
    Config,
    choose_embeddings,
    load_embedding,
    load_pickle,
    check_device,
)


# Load the config file
configs = Config("configparser.ini")
logger = logging.getLogger(__name__)

os.environ["OPENAI_API_KEY"] = configs.openai_api_key
os.environ["ANTHROPIC_API_KEY"] = configs.anthropic_api_key

embedding = choose_embeddings(configs.embedding_name)
db_store_path = configs.db_dir


# get models
def get_llm(llm_name: str, temperature: float, max_tokens: int):
    """Get the LLM model from the model name."""

    if not os.path.exists(configs.local_model_dir):
        os.makedirs(configs.local_model_dir)

    splits = llm_name.split("|")  # [provider, model_name, model_file]

    if "openai" in splits[0].lower():
        llm_model = ChatOpenAI(
            model=splits[1],
            temperature=temperature,
            max_tokens=max_tokens,
        )

    elif "anthropic" in splits[0].lower():
        llm_model = ChatAnthropic(
            model=splits[1],
            temperature=temperature,
            max_tokens_to_sample=max_tokens,
        )

    elif "together" in splits[0].lower():
        llm_model = TogetherLLM(
            model=splits[1],
            temperature=temperature,
            max_tokens=max_tokens,
        )
    elif "huggingface" in splits[0].lower():
        llm_model = load_local_llm(
            model_id=splits[1],
            model_basename=splits[-1],
            temperature=temperature,
            max_tokens=max_tokens,
            device_type=check_device(),
        )
    else:
        raise ValueError("Invalid Model Name")

    return llm_model


llm = get_llm(configs.model_name, configs.temperature, configs.max_llm_generation)


# load retrieval database
db_embedding_chunks_small = load_embedding(
    store_name=configs.embedding_name,
    embedding=embedding,
    suffix="chunks_small",
    path=db_store_path,
)
db_embedding_chunks_medium = load_embedding(
    store_name=configs.embedding_name,
    embedding=embedding,
    suffix="chunks_medium",
    path=db_store_path,
)

db_docs_chunks_small = load_pickle(
    prefix="docs_pickle", suffix="chunks_small", path=db_store_path
)
db_docs_chunks_medium = load_pickle(
    prefix="docs_pickle", suffix="chunks_medium", path=db_store_path
)
file_names = load_pickle(prefix="file", suffix="names", path=db_store_path)


# Initialize the retriever
my_retriever = MyRetriever(
    llm=llm,
    embedding_chunks_small=db_embedding_chunks_small,
    embedding_chunks_medium=db_embedding_chunks_medium,
    docs_chunks_small=db_docs_chunks_small,
    docs_chunks_medium=db_docs_chunks_medium,
    first_retrieval_k=configs.first_retrieval_k,
    second_retrieval_k=configs.second_retrieval_k,
    num_windows=configs.num_windows,
    retriever_weights=configs.retriever_weights,
)


# Initialize the memory
memory = ConversationTokenBufferMemory(
    llm=llm,
    memory_key="chat_history",
    input_key="question",
    output_key="answer",
    return_messages=True,
    max_token_limit=configs.max_chat_history,
)


# Initialize the QA chain
qa = ConvoRetrievalChain.from_llm(
    llm,
    my_retriever,
    file_names=file_names,
    memory=memory,
    return_source_documents=False,
    return_generated_question=False,
)


if __name__ == "__main__":
    while True:
        user_input = input("Human: ")
        start_time = time.time()
        user_input_ = re.sub(r"^Human: ", "", user_input)
        print("*" * 6)
        resp = qa({"question": user_input_})
        print()
        print(f"AI:{resp['answer']}")
        print(f"Time used: {time.time() - start_time}")
        print("-" * 60)
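Taken together with docs2db.py, the intended workflow appears to be: ingest the ./data folder into ./database_store first, then start this script. A hedged sketch of driving the same objects programmatically (it assumes configparser.ini and the relevant API keys are already in place):

# Hedged end-to-end sketch, not part of the commit:
# 1) build the vector stores once, 2) query the chain programmatically.
from docs2db import process_files

process_files()        # writes Chroma stores and pickles under ./database_store

from main import qa    # importing main constructs the retriever, memory and chain

resp = qa({"question": "What does sliding-window chunking do?"})
print(resp["answer"])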
requirements.txt
CHANGED
@@ -1 +1,13 @@
-
+chromadb==0.4.13
+InstructorEmbedding==1.0.1
+langchain==0.0.308
+openai==0.28.1
+pypdf==3.16.2
+rank-bm25==0.2.2
+sentence-transformers==2.2.2
+tiktoken==0.5.1
+torch==2.0.1
+torchaudio==2.0.2
+torchvision==0.15.2
+together==0.2.4
+tqdm==4.66.1
toolkit/___init__.py
ADDED
File without changes
toolkit/local_llm.py
ADDED
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""The below code is borrowed from: https://github.com/PromtEngineer/localGPT
|
2 |
+
The reason to use gguf/ggml models: https://huggingface.co/TheBloke/wizardLM-7B-GGML/discussions/3"""
|
3 |
+
import logging
|
4 |
+
import torch
|
5 |
+
from huggingface_hub import hf_hub_download
|
6 |
+
from huggingface_hub import login
|
7 |
+
from langchain.llms import LlamaCpp, HuggingFacePipeline
|
8 |
+
from transformers import (
|
9 |
+
AutoModelForCausalLM,
|
10 |
+
AutoTokenizer,
|
11 |
+
LlamaForCausalLM,
|
12 |
+
LlamaTokenizer,
|
13 |
+
GenerationConfig,
|
14 |
+
pipeline,
|
15 |
+
)
|
16 |
+
from toolkit.utils import Config
|
17 |
+
|
18 |
+
|
19 |
+
configs = Config("configparser.ini")
|
20 |
+
logger = logging.getLogger(__name__)
|
21 |
+
|
22 |
+
|
23 |
+
def load_gguf_hf_model(
|
24 |
+
model_id: str,
|
25 |
+
model_basename: str,
|
26 |
+
max_tokens: int,
|
27 |
+
temperature: float,
|
28 |
+
device_type: str,
|
29 |
+
):
|
30 |
+
"""
|
31 |
+
Load a GGUF/GGML quantized model using LlamaCpp.
|
32 |
+
|
33 |
+
This function attempts to load a GGUF/GGML quantized model using the LlamaCpp library.
|
34 |
+
If the model is of type GGML, and newer version of LLAMA-CPP is used which does not support GGML,
|
35 |
+
it logs a message indicating that LLAMA-CPP has dropped support for GGML.
|
36 |
+
|
37 |
+
Parameters:
|
38 |
+
- model_id (str): The identifier for the model on HuggingFace Hub.
|
39 |
+
- model_basename (str): The base name of the model file.
|
40 |
+
- max_tokens (int): The maximum number of tokens to generate in the completion.
|
41 |
+
- temperature (float): The temperature of LLM.
|
42 |
+
- device_type (str): The type of device where the model will run, e.g., 'mps', 'cuda', etc.
|
43 |
+
|
44 |
+
Returns:
|
45 |
+
- LlamaCpp: An instance of the LlamaCpp model if successful, otherwise None.
|
46 |
+
|
47 |
+
Notes:
|
48 |
+
- The function uses the `hf_hub_download` function to download the model from the HuggingFace Hub.
|
49 |
+
- The number of GPU layers is set based on the device type.
|
50 |
+
"""
|
51 |
+
|
52 |
+
try:
|
53 |
+
logger.info("Using Llamacpp for GGUF/GGML quantized models")
|
54 |
+
model_path = hf_hub_download(
|
55 |
+
repo_id=model_id,
|
56 |
+
filename=model_basename,
|
57 |
+
resume_download=True,
|
58 |
+
cache_dir=configs.local_model_dir,
|
59 |
+
)
|
60 |
+
kwargs = {
|
61 |
+
"model_path": model_path,
|
62 |
+
"n_ctx": configs.max_llm_context,
|
63 |
+
"max_tokens": max_tokens,
|
64 |
+
"temperature": temperature,
|
65 |
+
"n_batch": configs.n_batch, # set this based on your GPU & CPU RAM
|
66 |
+
"verbose": False,
|
67 |
+
}
|
68 |
+
if device_type.lower() == "mps":
|
69 |
+
kwargs["n_gpu_layers"] = 1
|
70 |
+
if device_type.lower() == "cuda":
|
71 |
+
kwargs["n_gpu_layers"] = configs.n_gpu_layers # set this based on your GPU
|
72 |
+
|
73 |
+
return LlamaCpp(**kwargs)
|
74 |
+
except:
|
75 |
+
if "ggml" in model_basename:
|
76 |
+
logger.info(
|
77 |
+
"If you were using GGML model, LLAMA-CPP Dropped Support, Use GGUF Instead"
|
78 |
+
)
|
79 |
+
return None
|
80 |
+
|
81 |
+
|
82 |
+
def load_full_hf_model(model_id: str, model_basename: str, device_type: str):
|
83 |
+
"""
|
84 |
+
Load a full model using either LlamaTokenizer or AutoModelForCausalLM.
|
85 |
+
|
86 |
+
This function loads a full model based on the specified device type.
|
87 |
+
If the device type is 'mps' or 'cpu', it uses LlamaTokenizer and LlamaForCausalLM.
|
88 |
+
Otherwise, it uses AutoModelForCausalLM.
|
89 |
+
|
90 |
+
Parameters:
|
91 |
+
- model_id (str): The identifier for the model on HuggingFace Hub.
|
92 |
+
- model_basename (str): The base name of the model file.
|
93 |
+
- device_type (str): The type of device where the model will run.
|
94 |
+
|
95 |
+
Returns:
|
96 |
+
- model (Union[LlamaForCausalLM, AutoModelForCausalLM]): The loaded model.
|
97 |
+
- tokenizer (Union[LlamaTokenizer, AutoTokenizer]): The tokenizer associated with the model.
|
98 |
+
|
99 |
+
Notes:
|
100 |
+
- The function uses the `from_pretrained` method to load both the model and the tokenizer.
|
101 |
+
- Additional settings are provided for NVIDIA GPUs, such as loading in 4-bit and setting the compute dtype.
|
102 |
+
"""
|
103 |
+
if "meta-llama" in model_id.lower():
|
104 |
+
login(token=configs.huggingface_token)
|
105 |
+
|
106 |
+
if device_type.lower() in ["mps", "cpu"]:
|
107 |
+
logger.info("Using LlamaTokenizer")
|
108 |
+
tokenizer = LlamaTokenizer.from_pretrained(
|
109 |
+
model_id,
|
110 |
+
cache_dir=configs.local_model_dir,
|
111 |
+
)
|
112 |
+
model = LlamaForCausalLM.from_pretrained(
|
113 |
+
model_id,
|
114 |
+
cache_dir=configs.local_model_dir,
|
115 |
+
)
|
116 |
+
else:
|
117 |
+
logger.info("Using AutoModelForCausalLM for full models")
|
118 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
119 |
+
model_id, cache_dir=configs.local_model_dir
|
120 |
+
)
|
121 |
+
logger.info("Tokenizer loaded")
|
122 |
+
model = AutoModelForCausalLM.from_pretrained(
|
123 |
+
model_id,
|
124 |
+
device_map="auto",
|
125 |
+
torch_dtype=torch.float16,
|
126 |
+
low_cpu_mem_usage=True,
|
127 |
+
cache_dir=configs.local_model_dir,
|
128 |
+
# trust_remote_code=True, # set these if you are using NVIDIA GPU
|
129 |
+
# load_in_4bit=True,
|
130 |
+
# bnb_4bit_quant_type="nf4",
|
131 |
+
# bnb_4bit_compute_dtype=torch.float16,
|
132 |
+
# max_memory={0: "15GB"} # Uncomment this line with you encounter CUDA out of memory errors
|
133 |
+
)
|
134 |
+
model.tie_weights()
|
135 |
+
return model, tokenizer
|
136 |
+
|
137 |
+
|
138 |
+
def load_local_llm(
|
139 |
+
model_id: str,
|
140 |
+
model_basename: str,
|
141 |
+
temperature: float,
|
142 |
+
max_tokens: int,
|
143 |
+
device_type: str,
|
144 |
+
):
|
145 |
+
"""
|
146 |
+
Select a model for text generation using the HuggingFace library.
|
147 |
+
If you are running this for the first time, it will download a model for you.
|
148 |
+
subsequent runs will use the model from the disk.
|
149 |
+
|
150 |
+
Args:
|
151 |
+
device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU.
|
152 |
+
model_id (str): Identifier of the model to load from HuggingFace's model hub.
|
153 |
+
model_basename (str, optional): Basename of the model if using quantized models.
|
154 |
+
Defaults to None.
|
155 |
+
|
156 |
+
Returns:
|
157 |
+
HuggingFacePipeline: A pipeline object for text generation using the loaded model.
|
158 |
+
|
159 |
+
Raises:
|
160 |
+
ValueError: If an unsupported model or device type is provided.
|
161 |
+
"""
|
162 |
+
logger.info(f"Loading Model: {model_id}, on: {device_type}")
|
163 |
+
logger.info("This action can take a few minutes!")
|
164 |
+
|
165 |
+
if model_basename.lower() != "none":
|
166 |
+
if ".gguf" in model_basename.lower():
|
167 |
+
llm = load_gguf_hf_model(
|
168 |
+
model_id, model_basename, max_tokens, temperature, device_type
|
169 |
+
)
|
170 |
+
return llm
|
171 |
+
|
172 |
+
model, tokenizer = load_full_hf_model(model_id, None, device_type)
|
173 |
+
# Load configuration from the model to avoid warnings
|
174 |
+
generation_config = GenerationConfig.from_pretrained(model_id)
|
175 |
+
# see here for details:
|
176 |
+
# https://huggingface.co/docs/transformers/
|
177 |
+
# main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns
|
178 |
+
|
179 |
+
# Create a pipeline for text generation
|
180 |
+
pipe = pipeline(
|
181 |
+
"text-generation",
|
182 |
+
model=model,
|
183 |
+
tokenizer=tokenizer,
|
184 |
+
max_length=max_tokens,
|
185 |
+
temperature=temperature,
|
186 |
+
# top_p=0.95,
|
187 |
+
repetition_penalty=1.15,
|
188 |
+
generation_config=generation_config,
|
189 |
+
)
|
190 |
+
local_llm = HuggingFacePipeline(pipeline=pipe)
|
191 |
+
logger.info("Local LLM Loaded")
|
192 |
+
|
193 |
+
return local_llm
|
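For reference, a minimal usage sketch of load_local_llm, not part of the committed file. It assumes the function lives in toolkit/local_llm.py, that a full (non-GGUF) model is wanted, and that the model id, device type, and token budget below are illustrative placeholders only.

# Usage sketch (assumptions: module path toolkit.local_llm, illustrative model id and device).
from toolkit.local_llm import load_local_llm

local_llm = load_local_llm(
    model_id="meta-llama/Llama-2-7b-chat-hf",  # hypothetical example id
    model_basename="None",   # "None" -> take the full-model path instead of GGUF
    temperature=0.0,
    max_tokens=2048,
    device_type="cuda",
)
print(local_llm("Summarize retrieval-augmented generation in one sentence."))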
toolkit/prompts.py
ADDED
@@ -0,0 +1,169 @@
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.chains.prompt_selector import ConditionalPromptSelector, is_chat_model

# ================================================================================

REFINE_QA_TEMPLATE = """Break down or rephrase the follow up input into fewer than 3 heterogeneous one-hop queries to be the input of a retrieval tool, if the follow up input is a multi-hop, multi-step, complex or comparative query and is relevant to the Chat History and Document Names. Otherwise keep the follow up input as it is.


The output format should strictly follow the following, and each query can only contain 1 document name:
```
1. One-hop standalone query
...
3. One-hop standalone query
...
```


Document Names in the database:
```
{database}
```


Chat History:
```
{chat_history}
```


Begin:

Follow Up Input: {question}

One-hop standalone query(s):
"""


# ================================================================================

DOCS_SELECTION_TEMPLATE = """Below are some verified sources and a human input. If you think any of them are relevant to the human input, then list all possible context numbers.

```
{snippets}
```

The output format must be like the following, nothing else. If no sources are relevant, output []:
[0, ..., n]

Human Input: {query}
"""


# ================================================================================

RETRIEVAL_QA_SYS = """You are a helpful assistant designed by IncarnaMind.
If you think the below information is relevant to the human input, please respond to the human based on the relevant retrieved sources; otherwise, respond in your own words only about the human input."""


RETRIEVAL_QA_TEMPLATE = """
File Names in the database:
```
{database}
```


Chat History:
```
{chat_history}
```


Verified Sources:
```
{context}
```


User: {question}
"""


RETRIEVAL_QA_CHAT_TEMPLATE = """
File Names in the database:
```
{database}
```


Chat History:
```
{chat_history}
```


Verified Sources:
```
{context}
```
"""


class PromptTemplates:
    """Prompt templates for the IncarnaMind query-refinement, document-selection, and retrieval-QA chains."""

    def __init__(self):
        self.refine_qa_prompt = REFINE_QA_TEMPLATE
        self.docs_selection_prompt = DOCS_SELECTION_TEMPLATE
        self.retrieval_qa_sys = RETRIEVAL_QA_SYS
        self.retrieval_qa_prompt = RETRIEVAL_QA_TEMPLATE
        self.retrieval_qa_chat_prompt = RETRIEVAL_QA_CHAT_TEMPLATE

    def get_refine_qa_template(self, llm: str):
        """Get the refine-QA prompt template."""
        if "llama" in llm.lower():
            temp = f"[INST] {self.refine_qa_prompt} [/INST]"
        else:
            temp = self.refine_qa_prompt

        return PromptTemplate(
            input_variables=["database", "chat_history", "question"],
            template=temp,
        )

    def get_docs_selection_template(self, llm: str):
        """Get the docs-selection prompt template."""
        if "llama" in llm.lower():
            temp = f"[INST] {self.docs_selection_prompt} [/INST]"
        else:
            temp = self.docs_selection_prompt

        return PromptTemplate(
            input_variables=["snippets", "query"],
            template=temp,
        )

    def get_retrieval_qa_template_selector(self, llm: str):
        """Get the retrieval-QA prompt template selector."""
        if "llama" in llm.lower():
            temp = f"[INST] <<SYS>>\n{self.retrieval_qa_sys}\n<</SYS>>\n\n{self.retrieval_qa_prompt} [/INST]"
            messages = [
                SystemMessagePromptTemplate.from_template(
                    f"[INST] <<SYS>>\n{self.retrieval_qa_sys}\n<</SYS>>\n\n{self.retrieval_qa_chat_prompt} [/INST]"
                ),
                HumanMessagePromptTemplate.from_template("{question}"),
            ]
        else:
            temp = f"{self.retrieval_qa_sys}\n{self.retrieval_qa_prompt}"
            messages = [
                SystemMessagePromptTemplate.from_template(
                    f"{self.retrieval_qa_sys}\n{self.retrieval_qa_chat_prompt}"
                ),
                HumanMessagePromptTemplate.from_template("{question}"),
            ]

        prompt_temp = PromptTemplate(
            template=temp,
            input_variables=["database", "chat_history", "context", "question"],
        )
        prompt_temp_chat = ChatPromptTemplate.from_messages(messages)

        return ConditionalPromptSelector(
            default_prompt=prompt_temp,
            conditionals=[(is_chat_model, prompt_temp_chat)],
        )
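For reference, a minimal sketch of how these templates are meant to be consumed, assuming a non-Llama model name so the plain (non-[INST]) variant is returned; the file name, history, and question values are illustrative only.

# Sketch: building and filling the refine-QA prompt (illustrative values).
from toolkit.prompts import PromptTemplates

prompts = PromptTemplates()
refine_prompt = prompts.get_refine_qa_template(llm="gpt-3.5-turbo")
print(
    refine_prompt.format(
        database="Attention Is All You Need.pdf",
        chat_history="Human: hi\nAssistant: Hello, how can I help?",
        question="Compare the attention mechanism with gradient descent tuning.",
    )
)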
toolkit/retrivers.py
ADDED
@@ -0,0 +1,643 @@
"""
This module provides a custom implementation of a document retriever, designed for multi-stage retrieval.
The system uses ensemble methods combining BM25 and Chroma Embeddings to retrieve relevant documents for a given query.
It also utilizes optimizations such as rank fusion and weighted reciprocal rank provided by Langchain.

Classes:
--------
- MyEnsembleRetriever: Custom retriever for BM25 and Chroma Embeddings.
- MyRetriever: Handles multi-stage retrieval.

"""
import re
import ast
import copy
import math
import logging
from typing import Dict, List, Optional
from langchain.chains import LLMChain
from langchain.schema import BaseRetriever, Document
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.callbacks.manager import (
    AsyncCallbackManagerForRetrieverRun,
    CallbackManagerForRetrieverRun,
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
)

from toolkit.utils import Config, clean_text, DocIndexer, IndexerOperator
from toolkit.prompts import PromptTemplates

prompt_templates = PromptTemplates()

configs = Config("configparser.ini")
logger = logging.getLogger(__name__)


class MyEnsembleRetriever(EnsembleRetriever):
    """
    Custom ensemble retriever for BM25 and Chroma Embeddings.
    """

    retrievers: Dict[str, BaseRetriever]

    def rank_fusion(
        self, query: str, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """
        Retrieve the results of the retrievers and use rank_fusion_func to get
        the final result.

        Args:
            query: The query to search for.

        Returns:
            A list of reranked documents.
        """
        # Get the results of all retrievers.
        retriever_docs = []
        for key, retriever in self.retrievers.items():
            if key == "bm25":
                res = retriever.get_relevant_documents(
                    clean_text(query),
                    callbacks=run_manager.get_child(tag=f"retriever_{key}"),
                )
                retriever_docs.append(res)
            else:
                res = retriever.get_relevant_documents(
                    query, callbacks=run_manager.get_child(tag=f"retriever_{key}")
                )
                retriever_docs.append(res)

        # apply rank fusion
        fused_documents = self.weighted_reciprocal_rank(retriever_docs)

        return fused_documents

    async def arank_fusion(
        self, query: str, run_manager: AsyncCallbackManagerForRetrieverRun
    ) -> List[Document]:
        """
        Asynchronously retrieve the results of the retrievers
        and use rank_fusion_func to get the final result.

        Args:
            query: The query to search for.

        Returns:
            A list of reranked documents.
        """

        # Get the results of all retrievers.
        retriever_docs = []
        for key, retriever in self.retrievers.items():
            if key == "bm25":
                res = retriever.get_relevant_documents(
                    clean_text(query),
                    callbacks=run_manager.get_child(tag=f"retriever_{key}"),
                )
                retriever_docs.append(res)
                # print("retriever_docs 1:", res)
            else:
                res = await retriever.aget_relevant_documents(
                    query, callbacks=run_manager.get_child(tag=f"retriever_{key}")
                )
                retriever_docs.append(res)

        # apply rank fusion
        fused_documents = self.weighted_reciprocal_rank(retriever_docs)

        return fused_documents

    def weighted_reciprocal_rank(
        self, doc_lists: List[List[Document]]
    ) -> List[Document]:
        """
        Perform weighted Reciprocal Rank Fusion on multiple rank lists.
        You can find more details about RRF here:
        https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf

        Args:
            doc_lists: A list of rank lists, where each rank list contains unique items.

        Returns:
            list: The final aggregated list of items sorted by their weighted RRF
                scores in descending order.
        """
        if len(doc_lists) != len(self.weights):
            raise ValueError(
                "Number of rank lists must be equal to the number of weights."
            )

        # replace the page_content with the original uncleaned page_content
        doc_lists_ = copy.copy(doc_lists)
        for doc_list in doc_lists_:
            for doc in doc_list:
                doc.page_content = doc.metadata["page_content"]
                # doc.metadata["page_content"] = None

        # Create a union of all unique documents in the input doc_lists
        all_documents = set()
        for doc_list in doc_lists_:
            for doc in doc_list:
                all_documents.add(doc.page_content)

        # Initialize the RRF score dictionary for each document
        rrf_score_dic = {doc: 0.0 for doc in all_documents}

        # Calculate RRF scores for each document
        for doc_list, weight in zip(doc_lists_, self.weights):
            for rank, doc in enumerate(doc_list, start=1):
                rrf_score = weight * (1 / (rank + self.c))
                rrf_score_dic[doc.page_content] += rrf_score

        # Sort documents by their RRF scores in descending order
        sorted_documents = sorted(
            rrf_score_dic.keys(), key=lambda x: rrf_score_dic[x], reverse=True
        )

        # Map the sorted page_content back to the original document objects
        page_content_to_doc_map = {
            doc.page_content: doc for doc_list in doc_lists_ for doc in doc_list
        }
        sorted_docs = [
            page_content_to_doc_map[page_content] for page_content in sorted_documents
        ]

        return sorted_docs


class MyRetriever:
    """
    Retriever class to handle multi-stage retrieval.
    """

    def __init__(
        self,
        llm,
        embedding_chunks_small: List[Document],
        embedding_chunks_medium: List[Document],
        docs_chunks_small: DocIndexer,
        docs_chunks_medium: DocIndexer,
        first_retrieval_k: int,
        second_retrieval_k: int,
        num_windows: int,
        retriever_weights: List[float],
    ):
        """
        Initialize the MyRetriever class.

        Args:
            llm: Language model for retrieval.
            embedding_chunks_small (List[Document]): List of small embedding chunks.
            embedding_chunks_medium (List[Document]): List of medium embedding chunks.
            docs_chunks_small (DocIndexer): Document indexer for small chunks.
            docs_chunks_medium (DocIndexer): Document indexer for medium chunks.
            first_retrieval_k (int): Number of top documents to retrieve in the first retrieval.
            second_retrieval_k (int): Number of top documents to retrieve in the second retrieval.
            num_windows (int): Number of overlapping windows to consider.
            retriever_weights (List[float]): Weights for ensemble retrieval.
        """
        self.llm = llm
        self.embedding_chunks_small = embedding_chunks_small
        self.embedding_chunks_medium = embedding_chunks_medium
        self.docs_index_small = DocIndexer(docs_chunks_small)
        self.docs_index_medium = DocIndexer(docs_chunks_medium)

        self.first_retrieval_k = first_retrieval_k
        self.second_retrieval_k = second_retrieval_k
        self.num_windows = num_windows
        self.retriever_weights = retriever_weights

    def get_retriever(
        self,
        docs_chunks,
        emb_chunks,
        emb_filter=None,
        k=2,
        weights=(0.5, 0.5),
    ):
        """
        Initialize and return a retriever instance with specified parameters.

        Args:
            docs_chunks: The document chunks for the BM25 retriever.
            emb_chunks: The document chunks for the Embedding retriever.
            emb_filter: A filter for the embedding retriever.
            k (int): The number of top documents to return.
            weights (list): Weights for ensemble retrieval.

        Returns:
            MyEnsembleRetriever: An instance of MyEnsembleRetriever.
        """
        bm25_retriever = BM25Retriever.from_documents(docs_chunks)
        bm25_retriever.k = k

        emb_retriever = emb_chunks.as_retriever(
            search_kwargs={
                "filter": emb_filter,
                "k": k,
                "search_type": "mmr",
            }
        )
        return MyEnsembleRetriever(
            retrievers={"bm25": bm25_retriever, "chroma": emb_retriever},
            weights=weights,
        )

    def find_overlaps(self, doc: List[Document]):
        """
        Find overlapping intervals of windows.

        Args:
            doc (List[Document]): Documents whose window intervals are checked for overlaps.

        Returns:
            list: A list of overlapping intervals (their centroids).
        """
        intervals = []
        for item in doc:
            intervals.append(
                (
                    item.metadata["large_chunks_idx_lower_bound"],
                    item.metadata["large_chunks_idx_upper_bound"],
                )
            )
        remaining_intervals, grouped_intervals, centroids = intervals.copy(), [], []

        while remaining_intervals:
            curr_interval = remaining_intervals.pop(0)
            curr_group = [curr_interval]
            subset_interval = None

            for start, end in remaining_intervals.copy():
                for s, e in curr_group:
                    overlap = set(range(s, e + 1)) & set(range(start, end + 1))
                    if overlap:
                        curr_group.append((start, end))
                        remaining_intervals.remove((start, end))
                        if set(range(start, end + 1)).issubset(set(range(s, e + 1))):
                            subset_interval = (start, end)
                        break

            if subset_interval:
                centroid = [math.ceil((subset_interval[0] + subset_interval[1]) / 2)]
            elif len(curr_group) > 2:
                first_overlap = max(
                    set(range(curr_group[0][0], curr_group[0][1] + 1))
                    & set(range(curr_group[1][0], curr_group[1][1] + 1))
                )
                last_overlap_set = set(
                    range(curr_group[-1][0], curr_group[-1][1] + 1)
                ) & set(range(curr_group[-2][0], curr_group[-2][1] + 1))

                if not last_overlap_set:
                    last_overlap = first_overlap  # Fallback if no overlap
                else:
                    last_overlap = min(last_overlap_set)

                step = 1 if first_overlap <= last_overlap else -1
                centroid = list(range(first_overlap, last_overlap + step, step))
            else:
                centroid = [
                    round(
                        sum([math.ceil((s + e) / 2) for s, e in curr_group])
                        / len(curr_group)
                    )
                ]

            grouped_intervals.append(
                curr_group if len(curr_group) > 1 else curr_group[0]
            )
            centroids.extend(centroid)

        return centroids

    def get_filter(self, top_k: int, file_md5: str, doc: List[Document]):
        """
        Create a filter for retrievers based on overlapping intervals.

        Args:
            top_k (int): Number of top intervals to consider.
            file_md5 (str): MD5 hash of the file to filter.
            doc (List[Document]): List of document objects.

        Returns:
            tuple: A tuple containing dictionary filters for the DocIndexer and Chroma retrievers.
        """
        overlaps = self.find_overlaps(doc)
        if len(overlaps) < 1:
            raise ValueError("No overlapping intervals found.")

        overlaps_k = overlaps[:top_k]
        logger.info("windows_at_2nd_retrieval: %s", overlaps_k)
        search_dict_docindexer = {"OR": []}
        search_dict_chroma = {"$or": []}

        for chunk_idx in overlaps_k:
            search_dict_docindexer["OR"].append(
                {
                    "large_chunks_idx_lower_bound": (
                        IndexerOperator.LTE,
                        chunk_idx,
                    ),
                    "large_chunks_idx_upper_bound": (
                        IndexerOperator.GTE,
                        chunk_idx,
                    ),
                    "source_md5": (IndexerOperator.EQ, file_md5),
                }
            )

            if len(overlaps_k) == 1:
                search_dict_chroma = {
                    "$and": [
                        {"large_chunks_idx_lower_bound": {"$lte": overlaps_k[0]}},
                        {"large_chunks_idx_upper_bound": {"$gte": overlaps_k[0]}},
                        {"source_md5": {"$eq": file_md5}},
                    ]
                }
            else:
                search_dict_chroma["$or"].append(
                    {
                        "$and": [
                            {"large_chunks_idx_lower_bound": {"$lte": chunk_idx}},
                            {"large_chunks_idx_upper_bound": {"$gte": chunk_idx}},
                            {"source_md5": {"$eq": file_md5}},
                        ]
                    }
                )

        return search_dict_docindexer, search_dict_chroma

    def get_relevant_doc_ids(self, docs: List[Document], query: str):
        """
        Get relevant document IDs given a query using an LLM.

        Args:
            docs (List[Document]): List of document objects to find relevant IDs in.
            query (str): The query string.

        Returns:
            list: A list of relevant document IDs.
        """
        snippets = "\n\n\n".join(
            [
                f"Context {idx}:\n{{{doc.page_content}}}. {{source: {doc.metadata['source']}}}"
                for idx, doc in enumerate(docs)
            ]
        )
        id_chain = LLMChain(
            llm=self.llm,
            prompt=prompt_templates.get_docs_selection_template(configs.model_name),
            output_key="IDs",
        )
        ids = id_chain.run({"query": query, "snippets": snippets})
        logger.info("relevant doc ids: %s", ids)
        pattern = r"\[\s*\d+\s*(?:,\s*\d+\s*)*\]"
        match = re.search(pattern, ids)
        if match:
            return ast.literal_eval(match.group(0))
        else:
            return []

    def get_relevant_documents(
        self,
        query: str,
        num_query: int,
        *,
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> List[Document]:
        """
        Perform multi-stage retrieval to get relevant documents.

        Args:
            query (str): The query string.
            num_query (int): Number of queries.
            run_manager (Optional[CallbackManagerForChainRun], optional): Callback manager for the chain run.

        Returns:
            List[Document]: A list of relevant documents.
        """
        # ! First retrieval
        first_retriever = self.get_retriever(
            docs_chunks=self.docs_index_small.documents,
            emb_chunks=self.embedding_chunks_small,
            emb_filter=None,
            k=self.first_retrieval_k,
            weights=self.retriever_weights,
        )
        first = first_retriever.get_relevant_documents(
            query, callbacks=run_manager.get_child()
        )
        for doc in first:
            logger.info("----1st retrieval----: %s", doc)
        ids_clean = self.get_relevant_doc_ids(first, query)
        # ids_clean = [0, 1, 2]
        logger.info("relevant cleaned doc ids: %s", ids_clean)
        qa_chunks = {}  # key is file name, value is a list of relevant documents
        # res_chunks = []
        if ids_clean and isinstance(ids_clean, list):
            source_md5_dict = {}
            for ids_c in ids_clean:
                if ids_c < len(first):
                    if ids_c not in source_md5_dict:
                        source_md5_dict[first[ids_c].metadata["source_md5"]] = [
                            first[ids_c]
                        ]
                    # else:
                    #     source_md5_dict[first[ids_c].metadata["source_md5"]].append(
                    #         ids_clean[ids_c]
                    #     )
            if len(source_md5_dict) == 0:
                source_md5_dict[first[0].metadata["source_md5"]] = [first[0]]
            num_docs = len(source_md5_dict.keys())
            third_num_k = max(
                1,
                (
                    int(
                        (
                            configs.max_llm_context
                            / (configs.base_chunk_size * configs.chunk_scale)
                        )
                        // (num_docs * num_query)
                    )
                ),
            )

            for source_md5, docs in source_md5_dict.items():
                logger.info(
                    "selected_docs_at_1st_retrieval: %s", docs[0].metadata["source"]
                )
                second_docs_chunks = self.docs_index_small.retrieve_metadata(
                    {
                        "source_md5": (IndexerOperator.EQ, source_md5),
                    }
                )
                second_retriever = self.get_retriever(
                    docs_chunks=second_docs_chunks,
                    emb_chunks=self.embedding_chunks_small,
                    emb_filter={"source_md5": source_md5},
                    k=self.second_retrieval_k,
                    weights=self.retriever_weights,
                )
                # ! Second retrieval
                second = second_retriever.get_relevant_documents(
                    query, callbacks=run_manager.get_child()
                )
                for doc in second:
                    logger.info("----2nd retrieval----: %s", doc)
                docs.extend(second)
                docindexer_filter, chroma_filter = self.get_filter(
                    self.num_windows, source_md5, docs
                )
                third_docs_chunks = self.docs_index_medium.retrieve_metadata(
                    docindexer_filter
                )
                third_retriever = self.get_retriever(
                    docs_chunks=third_docs_chunks,
                    emb_chunks=self.embedding_chunks_medium,
                    emb_filter=chroma_filter,
                    k=third_num_k,
                    weights=self.retriever_weights,
                )
                # ! Third retrieval
                third_temp = third_retriever.get_relevant_documents(
                    query, callbacks=run_manager.get_child()
                )
                third = third_temp[:third_num_k]
                # chunks = sorted(third, key=lambda x: x.metadata["medium_chunk_idx"])
                for doc in third:
                    logger.info(
                        "----3rd retrieval----page_content: %s", [doc.page_content]
                    )
                    mtdata = doc.metadata
                    mtdata["page_content"] = None
                    logger.info("----3rd retrieval----metadata: %s", mtdata)
                file_name = third[0].metadata["source"].split("/")[-1]
                if file_name not in qa_chunks:
                    qa_chunks[file_name] = third
                else:
                    qa_chunks[file_name].extend(third)

        return qa_chunks

    async def aget_relevant_documents(
        self,
        query: str,
        num_query: int,
        *,
        run_manager: AsyncCallbackManagerForChainRun,
    ) -> List[Document]:
        """
        Asynchronous version of the get_relevant_documents method.

        Args:
            query (str): The query string.
            num_query (int): Number of queries.
            run_manager (AsyncCallbackManagerForChainRun): Callback manager for the asynchronous chain run.

        Returns:
            List[Document]: A list of relevant documents.
        """
        # ! First retrieval
        first_retriever = self.get_retriever(
            docs_chunks=self.docs_index_small.documents,
            emb_chunks=self.embedding_chunks_small,
            emb_filter=None,
            k=self.first_retrieval_k,
            weights=self.retriever_weights,
        )
        first = await first_retriever.aget_relevant_documents(
            query, callbacks=run_manager.get_child()
        )
        for doc in first:
            logger.info("----1st retrieval----: %s", doc)
        ids_clean = self.get_relevant_doc_ids(first, query)
        logger.info("relevant doc ids: %s", ids_clean)
        qa_chunks = {}  # key is file name, value is a list of relevant documents
        # res_chunks = []
        if ids_clean and isinstance(ids_clean, list):
            source_md5_dict = {}
            for ids_c in ids_clean:
                if ids_c < len(first):
                    if ids_c not in source_md5_dict:
                        source_md5_dict[first[ids_c].metadata["source_md5"]] = [
                            first[ids_c]
                        ]
                    # else:
                    #     source_md5_dict[first[ids_c].metadata["source_md5"]].append(
                    #         ids_clean[ids_c]
                    #     )
            if len(source_md5_dict) == 0:
                source_md5_dict[first[0].metadata["source_md5"]] = [first[0]]
            num_docs = len(source_md5_dict.keys())
            third_num_k = max(
                1,
                (
                    int(
                        (
                            configs.max_llm_context
                            / (configs.base_chunk_size * configs.chunk_scale)
                        )
                        // (num_docs * num_query)
                    )
                ),
            )

            for source_md5, docs in source_md5_dict.items():
                logger.info(
                    "selected_docs_at_1st_retrieval: %s", docs[0].metadata["source"]
                )
                second_docs_chunks = self.docs_index_small.retrieve_metadata(
                    {
                        "source_md5": (IndexerOperator.EQ, source_md5),
                    }
                )
                second_retriever = self.get_retriever(
                    docs_chunks=second_docs_chunks,
                    emb_chunks=self.embedding_chunks_small,
                    emb_filter={"source_md5": source_md5},
                    k=self.second_retrieval_k,
                    weights=self.retriever_weights,
                )
                # ! Second retrieval
                second = await second_retriever.aget_relevant_documents(
                    query, callbacks=run_manager.get_child()
                )
                for doc in second:
                    logger.info("----2nd retrieval----: %s", doc)
                docs.extend(second)
                docindexer_filter, chroma_filter = self.get_filter(
                    self.num_windows, source_md5, docs
                )
                third_docs_chunks = self.docs_index_medium.retrieve_metadata(
                    docindexer_filter
                )
                third_retriever = self.get_retriever(
                    docs_chunks=third_docs_chunks,
                    emb_chunks=self.embedding_chunks_medium,
                    emb_filter=chroma_filter,
                    k=third_num_k,
                    weights=self.retriever_weights,
                )
                # ! Third retrieval
                third_temp = await third_retriever.aget_relevant_documents(
                    query, callbacks=run_manager.get_child()
                )
                third = third_temp[:third_num_k]
                # chunks = sorted(third, key=lambda x: x.metadata["medium_chunk_idx"])
                for doc in third:
                    logger.info(
                        "----3rd retrieval----page_content: %s", [doc.page_content]
                    )
                    mtdata = doc.metadata
                    mtdata["page_content"] = None
                    logger.info("----3rd retrieval----metadata: %s", mtdata)
                file_name = third[0].metadata["source"].split("/")[-1]
                if file_name not in qa_chunks:
                    qa_chunks[file_name] = third
                else:
                    qa_chunks[file_name].extend(third)

        return qa_chunks
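A toy, self-contained illustration of the weighted Reciprocal Rank Fusion scoring that weighted_reciprocal_rank applies above: each document is scored as the sum over rankers of weight / (rank + c), and documents are returned in descending score order. The constant c = 60 below is the value used in the RRF paper and is only illustrative here; the class uses its own c attribute and real Document objects rather than plain strings.

# Toy weighted-RRF sketch on plain strings (illustrative, not the class itself).
def weighted_rrf(rank_lists, weights, c=60):
    scores = {}
    for ranking, weight in zip(rank_lists, weights):
        for rank, item in enumerate(ranking, start=1):
            # Accumulate weight / (rank + c) for every list the item appears in.
            scores[item] = scores.get(item, 0.0) + weight / (rank + c)
    return sorted(scores, key=scores.get, reverse=True)

bm25_ranking = ["chunk_a", "chunk_b", "chunk_c"]
chroma_ranking = ["chunk_b", "chunk_a", "chunk_d"]
print(weighted_rrf([bm25_ranking, chroma_ranking], weights=[0.5, 0.5]))
# chunk_a and chunk_b come first because both retrievers rank them highly.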
toolkit/together_api_llm.py
ADDED
@@ -0,0 +1,72 @@
"""The code borrowed from https://colab.research.google.com/drive/1RW2yTxh5b9w7F3IrK00Iz51FTO5W01Rx?usp=sharing#scrollTo=RgbLVmf-o4j7"""
import os
from typing import Any, Dict
import together
from pydantic import Extra, root_validator

from langchain.llms.base import LLM
from langchain.utils import get_from_dict_or_env
from toolkit.utils import Config

configs = Config("configparser.ini")
os.environ["TOGETHER_API_KEY"] = configs.together_api_key

# together.api_key = configs.together_api_key
# models = together.Models.list()
# for idx, model in enumerate(models):
#     print(idx, model["name"])


class TogetherLLM(LLM):
    """Together large language models."""

    model: str = "togethercomputer/llama-2-70b-chat"
    """model endpoint to use"""

    together_api_key: str = os.environ["TOGETHER_API_KEY"]
    """Together API key"""

    temperature: float = 0
    """What sampling temperature to use."""

    max_tokens: int = 512
    """The maximum number of tokens to generate in the completion."""

    class Config:
        extra = "forbid"

    # @root_validator()
    # def validate_environment(cls, values: Dict) -> Dict:
    #     """Validate that the API key is set."""
    #     api_key = get_from_dict_or_env(values, "together_api_key", "TOGETHER_API_KEY")
    #     values["together_api_key"] = api_key
    #     return values

    @property
    def _llm_type(self) -> str:
        """Return type of LLM."""
        return "together"

    def _call(
        self,
        prompt: str,
        **kwargs: Any,
    ) -> str:
        """Call to Together endpoint."""
        together.api_key = self.together_api_key
        output = together.Complete.create(
            prompt,
            model=self.model,
            max_tokens=self.max_tokens,
            temperature=self.temperature,
        )
        text = output["output"]["choices"][0]["text"]
        return text


# if __name__ == "__main__":
#     test_llm = TogetherLLM(
#         model="togethercomputer/llama-2-70b-chat", temperature=0, max_tokens=1000
#     )

#     print(test_llm("What are the olympics? "))
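The commented-out __main__ block above already shows the intended call pattern; for reference, a minimal sketch of the same usage, assuming TOGETHER_API_KEY is set in configparser.ini:

# Sketch: calling the Together endpoint wrapper (mirrors the commented-out test above).
from toolkit.together_api_llm import TogetherLLM

llm = TogetherLLM(
    model="togethercomputer/llama-2-70b-chat",
    temperature=0,
    max_tokens=512,
)
print(llm("What are the Olympics?"))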
toolkit/utils.py
ADDED
@@ -0,0 +1,389 @@
"""
This module defines utility functions for loading data, text cleaning,
and indexing documents, as well as classes for handling document queries
and formatting chat history.
"""
import re
import pickle
import string
import logging
import configparser
from enum import Enum
from typing import List, Tuple, Union
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import torch
import tiktoken
from langchain.vectorstores import Chroma

from langchain.schema import Document, BaseMessage
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings


tokenizer_name = tiktoken.encoding_for_model("gpt-3.5-turbo")
tokenizer = tiktoken.get_encoding(tokenizer_name.name)

# if the nltk stopwords, punkt and wordnet data are not downloaded, download them
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    nltk.download("wordnet")

ChatTurnType = Union[Tuple[str, str], BaseMessage]
_ROLE_MAP = {"human": "Human: ", "ai": "Assistant: "}


class Config:
    """Initializes configs from configparser.ini."""

    def __init__(self, config_file):
        self.config = configparser.ConfigParser(interpolation=None)
        self.config.read(config_file)

        # Tokens
        self.openai_api_key = self.config.get("tokens", "OPENAI_API_KEY")
        self.anthropic_api_key = self.config.get("tokens", "ANTHROPIC_API_KEY")
        self.together_api_key = self.config.get("tokens", "TOGETHER_API_KEY")
        self.huggingface_token = self.config.get("tokens", "HUGGINGFACE_TOKEN")
        self.version = self.config.get("tokens", "VERSION")

        # Directory
        self.docs_dir = self.config.get("directory", "DOCS_DIR")
        self.db_dir = self.config.get("directory", "db_DIR")
        self.local_model_dir = self.config.get("directory", "LOCAL_MODEL_DIR")

        # Parameters
        self.model_name = self.config.get("parameters", "MODEL_NAME")
        self.temperature = self.config.getfloat("parameters", "TEMPURATURE")
        self.max_chat_history = self.config.getint("parameters", "MAX_CHAT_HISTORY")
        self.max_llm_context = self.config.getint("parameters", "MAX_LLM_CONTEXT")
        self.max_llm_generation = self.config.getint("parameters", "MAX_LLM_GENERATION")
        self.embedding_name = self.config.get("parameters", "EMBEDDING_NAME")

        self.n_gpu_layers = self.config.getint("parameters", "N_GPU_LAYERS")
        self.n_batch = self.config.getint("parameters", "N_BATCH")

        self.base_chunk_size = self.config.getint("parameters", "BASE_CHUNK_SIZE")
        self.chunk_overlap = self.config.getint("parameters", "CHUNK_OVERLAP")
        self.chunk_scale = self.config.getint("parameters", "CHUNK_SCALE")
        self.window_steps = self.config.getint("parameters", "WINDOW_STEPS")
        self.window_scale = self.config.getint("parameters", "WINDOW_SCALE")

        self.retriever_weights = [
            float(x.strip())
            for x in self.config.get("parameters", "RETRIEVER_WEIGHTS").split(",")
        ]
        self.first_retrieval_k = self.config.getint("parameters", "FIRST_RETRIEVAL_K")
        self.second_retrieval_k = self.config.getint("parameters", "SECOND_RETRIEVAL_K")
        self.num_windows = self.config.getint("parameters", "NUM_WINDOWS")

        # Logging
        self.logging_enabled = self.config.getboolean("logging", "enabled")
        self.logging_level = self.config.get("logging", "level")
        self.logging_filename = self.config.get("logging", "filename")
        self.logging_format = self.config.get("logging", "format")

        self.configure_logging()

    def configure_logging(self):
        """
        Configure the logger for each .py file.
        """

        if not self.logging_enabled:
            logging.disable(logging.CRITICAL + 1)
            return

        log_level = self.config.get("logging", "level")
        log_filename = self.config.get("logging", "filename")
        log_format = self.config.get("logging", "format")

        logging.basicConfig(level=log_level, filename=log_filename, format=log_format)


def configure_logger():
    """
    Configure the logger for each .py file.
    """
    config = configparser.ConfigParser(interpolation=None)
    config.read("configparser.ini")

    enabled = config.getboolean("logging", "enabled")

    if not enabled:
        logging.disable(logging.CRITICAL + 1)
        return

    log_level = config.get("logging", "level")
    log_filename = config.get("logging", "filename")
    log_format = config.get("logging", "format")

    logging.basicConfig(level=log_level, filename=log_filename, format=log_format)


def tiktoken_len(text):
    """Token length function."""
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)


def check_device():
    """Check if CUDA or MPS is available, else fall back to CPU."""
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    return device


def choose_embeddings(embedding_name):
    """Choose embeddings for a given model name."""
    try:
        if embedding_name == "openAIEmbeddings":
            return OpenAIEmbeddings()
        elif embedding_name == "hkunlpInstructorLarge":
            device = check_device()
            return HuggingFaceInstructEmbeddings(
                model_name="hkunlp/instructor-large", model_kwargs={"device": device}
            )
        else:
            device = check_device()
            return HuggingFaceEmbeddings(model_name=embedding_name, device=device)
    except Exception as error:
        raise ValueError(f"Embedding {embedding_name} not supported") from error


def load_embedding(store_name, embedding, suffix, path):
    """Load Chroma embeddings."""
    vector_store = Chroma(
        persist_directory=f"{path}/chroma_{store_name}_{suffix}",
        embedding_function=embedding,
    )
    return vector_store


def load_pickle(prefix, suffix, path):
    """Load langchain documents from a pickle file.

    Args:
        prefix (str): Prefix of the pickle file name.
        suffix (str): Suffix to append to the file name.
        path (str): The path where the pickle file is stored.

    Returns:
        Document: documents from the pickle file
    """
    with open(f"{path}/{prefix}_{suffix}.pkl", "rb") as file:
        return pickle.load(file)


def clean_text(text):
    """
    Converts text to lowercase, removes punctuation and stopwords, and lemmatizes it
    for the BM25 retriever.

    Parameters:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned and lemmatized text.
    """
    # remove [SEP] in the text
    text = text.replace("[SEP]", "")
    # Tokenization
    tokens = word_tokenize(text)
    # Lowercasing
    tokens = [w.lower() for w in tokens]
    # Remove punctuation
    table = str.maketrans("", "", string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # Keep tokens that are alphabetic, numeric, or contain both.
    words = [
        word
        for word in stripped
        if word.isalpha()
        or word.isdigit()
        or (re.search(r"\d", word) and re.search(r"[a-zA-Z]", word))
    ]
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [w for w in words if w not in stop_words]
    # Lemmatization (or you could use stemming instead)
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(w) for w in words]
    # Convert list of words to a string
    lemmatized_ = " ".join(lemmatized)

    return lemmatized_


class IndexerOperator(Enum):
    """
    Enumeration for different query operators used in indexing.
    """

    EQ = "=="
    GT = ">"
    GTE = ">="
    LT = "<"
    LTE = "<="


class DocIndexer:
    """
    A class to handle indexing and searching of documents.

    Attributes:
        documents (List[Document]): List of documents to be indexed.
    """

    def __init__(self, documents):
        self.documents = documents
        self.index = self.build_index(documents)

    def build_index(self, documents):
        """
        Build an index for the given list of documents.

        Parameters:
            documents (List[Document]): The list of documents to be indexed.

        Returns:
            dict: The built index.
        """
        index = {}
        for doc in documents:
            for key, value in doc.metadata.items():
                if key not in index:
                    index[key] = {}
                if value not in index[key]:
                    index[key][value] = []
                index[key][value].append(doc)
        return index

    def retrieve_metadata(self, search_dict):
        """
        Retrieve documents based on the search criteria provided in search_dict.

        Parameters:
            search_dict (dict): Dictionary specifying the search criteria.
                It can contain "AND" or "OR" operators for complex queries.

        Returns:
            List[Document]: List of documents that match the search criteria.
        """
        if "AND" in search_dict:
            return self._handle_and(search_dict["AND"])
        elif "OR" in search_dict:
            return self._handle_or(search_dict["OR"])
        else:
            return self._handle_single(search_dict)

    def _handle_and(self, search_dicts):
        results = [self.retrieve_metadata(sd) for sd in search_dicts]
        if results:
            intersection = set.intersection(
                *[set(map(self._hash_doc, r)) for r in results]
            )
            return [self._unhash_doc(h) for h in intersection]
        else:
            return []

    def _handle_or(self, search_dicts):
        results = [self.retrieve_metadata(sd) for sd in search_dicts]
        union = set.union(*[set(map(self._hash_doc, r)) for r in results])
        return [self._unhash_doc(h) for h in union]

    def _handle_single(self, search_dict):
        unions = []
        for key, query in search_dict.items():
            operator, value = query
            union = set()
            if operator == IndexerOperator.EQ:
                if key in self.index and value in self.index[key]:
                    union.update(map(self._hash_doc, self.index[key][value]))
            else:
                if key in self.index:
                    for k, v in self.index[key].items():
                        if (
                            (operator == IndexerOperator.GT and k > value)
                            or (operator == IndexerOperator.GTE and k >= value)
                            or (operator == IndexerOperator.LT and k < value)
                            or (operator == IndexerOperator.LTE and k <= value)
                        ):
                            union.update(map(self._hash_doc, v))
            if union:
                unions.append(union)

        if unions:
            intersection = set.intersection(*unions)
            return [self._unhash_doc(h) for h in intersection]
        else:
            return []

    def _hash_doc(self, doc):
        return (doc.page_content, frozenset(doc.metadata.items()))

    def _unhash_doc(self, hashed_doc):
        page_content, metadata = hashed_doc
        return Document(page_content=page_content, metadata=dict(metadata))


def _get_chat_history(chat_history: List[ChatTurnType]) -> str:
    buffer = ""
    for dialogue_turn in chat_history:
        if isinstance(dialogue_turn, BaseMessage):
            role_prefix = _ROLE_MAP.get(dialogue_turn.type, f"{dialogue_turn.type}: ")
            buffer += f"\n{role_prefix}{dialogue_turn.content}"
        elif isinstance(dialogue_turn, tuple):
            human = "Human: " + dialogue_turn[0]
            ai = "Assistant: " + dialogue_turn[1]
            buffer += "\n" + "\n".join([human, ai])
        else:
            raise ValueError(
                f"Unsupported chat history format: {type(dialogue_turn)}."
                f" Full chat history: {chat_history} "
            )
    return buffer


def _get_standalone_questions_list(
    standalone_questions_str: str, original_question: str
) -> List[str]:
    pattern = r"\d+\.\s(.*?)(?=\n\d+\.|\n|$)"

    matches = [
        match.group(1) for match in re.finditer(pattern, standalone_questions_str)
    ]
    if matches:
        return matches

    match = re.search(
        r"(?i)standalone[^\n]*:[^\n](.*)", standalone_questions_str, re.DOTALL
    )
    sentence_source = match.group(1).strip() if match else standalone_questions_str
    sentences = sentence_source.split("\n")

    return [
        re.sub(
            r"^\((\d+)\)\.? ?|^\d+\.? ?\)?|^(\d+)\) ?|^(\d+)\) ?|^[Qq]uery \d+: ?|^[Qq]uery: ?",
            "",
            sentence.strip(),
        )
        for sentence in sentences
        if sentence.strip()
    ]
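A small, self-contained sketch of how DocIndexer and IndexerOperator combine into the window filters built in toolkit/retrivers.py; the documents and metadata values below are invented for illustration only.

# Sketch: metadata filtering with DocIndexer (illustrative documents).
from langchain.schema import Document
from toolkit.utils import DocIndexer, IndexerOperator

docs = [
    Document(
        page_content="chunk one",
        metadata={
            "source_md5": "abc",
            "large_chunks_idx_lower_bound": 0,
            "large_chunks_idx_upper_bound": 2,
        },
    ),
    Document(
        page_content="chunk two",
        metadata={
            "source_md5": "abc",
            "large_chunks_idx_lower_bound": 3,
            "large_chunks_idx_upper_bound": 5,
        },
    ),
]
indexer = DocIndexer(docs)
# Keep documents whose window interval contains index 1 and whose source hash matches.
hits = indexer.retrieve_metadata(
    {
        "large_chunks_idx_lower_bound": (IndexerOperator.LTE, 1),
        "large_chunks_idx_upper_bound": (IndexerOperator.GTE, 1),
        "source_md5": (IndexerOperator.EQ, "abc"),
    }
)
print([d.page_content for d in hits])  # -> ['chunk one']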