bachephysicdun committed · commit 8dbef5d · 1 parent: a605a95

implemented rag and filtered_rag

Files changed:
- app/chains.py +30 -24
- app/data_indexing.py +47 -16
- app/main.py +47 -23
- app/prompts.py +26 -11
- app/schemas.py +1 -1
app/chains.py CHANGED
@@ -10,14 +10,14 @@ from prompts import (
     raw_prompt,
     raw_prompt_formatted,
     history_prompt_formatted,
+    standalone_prompt_formatted,
+    rag_prompt_formatted,
     format_context,
     tokenizer
 )
 from data_indexing import DataIndexer
 
 
-# data_indexer = DataIndexer()
-
 llm = HuggingFaceEndpoint(
     repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
     huggingfacehub_api_token=os.environ['HF_TOKEN'],
@@ -27,31 +27,37 @@ llm = HuggingFaceEndpoint(
 )
 
 simple_chain = (raw_prompt | llm).with_types(input_type=schemas.UserQuestion)
-# %%
 
-
+data_indexer = DataIndexer()
 
-#
+# create formatted_chain by piping raw_prompt_formatted and the LLM endpoint.
 formatted_chain = (raw_prompt_formatted | llm).with_types(input_type=schemas.UserQuestion)
 
-#
+# use history_prompt_formatted and HistoryInput to create the history_chain
 history_chain = (history_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
 
-#
-
-
-#
-
-#
-
-
-
-
-
-
-#
-
-
-
-# filtered_rag_chain
-
+# construct the standalone_chain by piping standalone_prompt_formatted with the LLM
+standalone_chain = (standalone_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
+
+# store the result of the standalone_chain in the variable "new_question", using the variable input_1
+input_1 = RunnablePassthrough.assign(new_question=standalone_chain)
+# store the result of the search and pull new_question into standalone_question
+input_2 = {
+    'context': lambda x: format_context(data_indexer.search(x['new_question'])),
+    'standalone_question': lambda x: x['new_question']
+}
+input_to_rag_chain = input_1 | input_2
+
+# use input_to_rag_chain, rag_prompt_formatted,
+# HistoryInput and the LLM to build the rag_chain.
+rag_chain = (input_to_rag_chain | rag_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
+
+# Implement the filtered_rag_chain. It should be the
+# same as the rag_chain but with hybrid_search=True.
+input_1 = RunnablePassthrough.assign(new_question=standalone_chain)
+input_2 = {
+    'context': lambda x: format_context(data_indexer.search(x['new_question'], hybrid_search=True)),
+    'standalone_question': lambda x: x['new_question']
+}
+input_to_filtered_rag_chain = input_1 | input_2
+filtered_rag_chain = (input_to_filtered_rag_chain | rag_prompt_formatted | llm).with_types(input_type=schemas.HistoryInput)
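For reference, a minimal local smoke test for the new chains could look like the sketch below. It assumes HF_TOKEN and PINECONE_API_KEY are set (importing chains instantiates both the HuggingFaceEndpoint and the DataIndexer) and that schemas.HistoryInput exposes question and chat_history, as app/main.py suggests; the question and history values are made up.

# hypothetical smoke test, not part of the commit
from chains import rag_chain, filtered_rag_chain

history_input = {
    # the two fields of schemas.HistoryInput, mirroring how app/main.py builds rag_input
    "question": "How do I write a custom retriever?",
    "chat_history": "User: What is LangChain?\nAI: A framework for building LLM applications.",
}

# stream tokens as the HuggingFace endpoint produces them
for chunk in rag_chain.stream(history_input):
    print(chunk, end="", flush=True)

# same pipeline, but the context comes from data_indexer.search(..., hybrid_search=True)
print(filtered_rag_chain.invoke(history_input))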
app/data_indexing.py CHANGED
@@ -6,14 +6,19 @@ from pinecone import ServerlessSpec
 from langchain_community.vectorstores import Chroma
 from langchain_openai import OpenAIEmbeddings
 
-
+from dotenv import load_dotenv
+# Specify the path to the .env file two directories up
+env_path = Path(__file__).resolve().parents[2] / '.env'
+load_dotenv(dotenv_path=env_path)
+
 
+current_dir = Path(__file__).resolve().parent
 
 class DataIndexer:
 
     source_file = os.path.join(current_dir, 'sources.txt')
 
-    def __init__(self, index_name='langchain-repo')
+    def __init__(self, index_name='langchain-repo'):
 
         # TODO: choose your embedding model
         # self.embedding_client = InferenceClient(
@@ -25,13 +30,20 @@ class DataIndexer:
         self.pinecone_client = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
 
         if index_name not in self.pinecone_client.list_indexes().names():
+
             # TODO: create your index if it doesn't exist. Use the create_index function.
             # Make sure to choose the dimension that corresponds to your embedding model
-
+            self.pinecone_client.create_index(
+                name=index_name,
+                dimension=1536,
+                metric='cosine',
+                spec=ServerlessSpec(cloud='aws', region='us-east-1')
+            )
+
 
         self.index = self.pinecone_client.Index(self.index_name)
         # TODO: make sure to build the index.
-        self.source_index =
+        self.source_index = self.get_source_index()
 
     def get_source_index(self):
         if not os.path.isfile(self.source_file):
@@ -58,8 +70,8 @@ class DataIndexer:
         for i in range(0, len(docs), batch_size):
             batch = docs[i: i + batch_size]
 
-            #
-            #
+            # create a list of the vector representations of each text data in the batch
+            # based on the selected model, choose how you extract the values
             # values = self.embedding_client.embed_documents([
             #     doc.page_content for doc in batch
             # ])
@@ -67,14 +79,19 @@ class DataIndexer:
             # values = self.embedding_client.feature_extraction([
             #     doc.page_content for doc in batch
             # ])
-
+
+            values = self.embedding_client.embed_documents([
+                doc.page_content for doc in batch
+            ])  # list of vectors -> vector representation of each doc
 
-            #
-            vector_ids =
+            # create a list of unique identifiers for each element in the batch with the uuid package.
+            vector_ids = [str(uuid.uuid4()) for _ in batch]
 
-            #
+            # create a list of dictionaries representing the metadata. Capture the text data
             # with the "text" key, and make sure to capture the rest of the doc.metadata.
-            metadatas =
+            metadatas = [{
+                'text': doc.page_content, **doc.metadata
+            } for doc in batch]
 
             # create a list of dictionaries with keys "id" (the unique identifiers), "values"
             # (the vector representation), and "metadata" (the metadata).
@@ -86,7 +103,7 @@ class DataIndexer:
 
         try:
             # TODO: Use the function upsert to upload the data to the database.
-            upsert_response =
+            upsert_response = self.index.upsert(vectors=vectors)
            print(upsert_response)
         except Exception as e:
             print(e)
@@ -104,16 +121,25 @@ class DataIndexer:
         # TODO: choose your embedding model
         # vector = self.embedding_client.feature_extraction(text_query)
         # vector = self.embedding_client.embed_query(text_query)
-        vector =
+        vector = self.embedding_client.embed_query(text_query)
 
         # TODO: use the vector representation of the text_query to
         # search the database by using the query function.
-        result =
+        result = self.index.query(
+            # namespace=self.index_name,
+            vector=vector,
+            filter=filter,
+            top_k=top_k,
+            include_metadata=True,
+        )
 
         docs = []
         for res in result["matches"]:
             # TODO: From the result's metadata, extract the "text" element.
-
+            metadata = res['metadata']
+            if 'text' in metadata:
+                text = metadata.pop('text')
+                docs.append(text)
 
         return docs
 
@@ -126,12 +152,14 @@ if __name__ == '__main__':
        RecursiveCharacterTextSplitter,
    )
 
+    print('start the GitLoader')
    loader = GitLoader(
        clone_url="https://github.com/langchain-ai/langchain",
        repo_path="./code_data/langchain_repo/",
        branch="master",
    )
 
+    print('perform python splitter')
    python_splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON, chunk_size=10000, chunk_overlap=100
    )
@@ -143,8 +171,11 @@ if __name__ == '__main__':
    for doc in docs:
        doc.page_content = '# {}\n\n'.format(doc.metadata['source']) + doc.page_content
 
+    print('instantiate the data indexer')
    indexer = DataIndexer()
-
+
+    # with open('/app/sources.txt', 'a') as file:
+    with open(indexer.source_file, 'a') as file:
        for doc in docs:
            file.writelines(doc.metadata['source'] + '\n')
    indexer.index_data(docs)
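The comments in index_data describe a vectors list of dictionaries with "id", "values" and "metadata" keys, but the line that assembles it falls outside the hunks shown above. A plausible sketch of that elided step, based only on those comments (the zip over vector_ids, values and metadatas is an assumption, not the commit's actual code):

# hypothetical reconstruction of the elided step inside index_data
vectors = [{
    'id': vector_id,       # unique identifier generated with uuid
    'values': value,       # embedding vector for the chunk
    'metadata': metadata,  # {'text': ..., **doc.metadata}
} for vector_id, value, metadata in zip(vector_ids, values, metadatas)]
# this is the list handed to self.index.upsert(vectors=vectors) in the hunk above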
app/main.py CHANGED
@@ -11,7 +11,7 @@ from typing import List
 from sqlalchemy.orm import Session
 
 import schemas
-from chains import simple_chain, formatted_chain, history_chain
+from chains import simple_chain, formatted_chain, history_chain, rag_chain, filtered_rag_chain
 import crud, models, schemas, prompts
 from database import SessionLocal, engine
 from callbacks import LogResponseCallback
@@ -114,30 +114,54 @@ async def history_stream(request: Request, db: Session = Depends(get_db)):
     ))
 
 
+@app.post("/rag/stream")
+async def rag_stream(request: Request, db: Session = Depends(get_db)):
+    # TODO: Let's implement the "/rag/stream" endpoint. The endpoint should follow these steps:
+    # - The endpoint receives the request
+    # - The request is parsed into a user request
+    # - The user request is used to pull the chat history of the user
+    # - We add as part of the user history the current question by using add_message.
+    # - We create an instance of HistoryInput by using format_chat_history.
+    # - We use the history input within the rag chain.
+
+    data = await request.json()
+    user_request = schemas.UserRequest(**data['input'])
+    chat_history = crud.get_user_chat_history(db=db, username=user_request.username)
+    message = schemas.MessageBase(message=user_request.question, type='User', timestamp=datetime.now())
+    crud.add_message(db, message=message, username=user_request.username)
+    rag_input = schemas.HistoryInput(
+        question=user_request.question,
+        chat_history=prompts.format_chat_history(chat_history)
+    )
+
+    return EventSourceResponse(generate_stream(
+        rag_input, rag_chain, [LogResponseCallback(user_request, db)]
+    ))
 
 
-
-
-#
-#
-#
-#
-#
-#
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
+@app.post("/filtered_rag/stream")
+async def filtered_rag_stream(request: Request, db: Session = Depends(get_db)):
+    # TODO: Let's implement the "/filtered_rag/stream" endpoint. The endpoint should follow these steps:
+    # - The endpoint receives the request
+    # - The request is parsed into a user request
+    # - The user request is used to pull the chat history of the user
+    # - We add as part of the user history the current question by using add_message.
+    # - We create an instance of HistoryInput by using format_chat_history.
+    # - We use the history input within the filtered rag chain.
+
+    data = await request.json()
+    user_request = schemas.UserRequest(**data['input'])
+    chat_history = crud.get_user_chat_history(db=db, username=user_request.username)
+    message = schemas.MessageBase(message=user_request.question, type='User', timestamp=datetime.now())
+    crud.add_message(db, message=message, username=user_request.username)
+    rag_input = schemas.HistoryInput(
+        question=user_request.question,
+        chat_history=prompts.format_chat_history(chat_history)
+    )
+
+    return EventSourceResponse(generate_stream(
+        rag_input, filtered_rag_chain, [LogResponseCallback(user_request, db)]
+    ))
 
 
 # Run From the Parent Directory with Script
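Once the app is running, the new endpoints can be exercised with a small client like the sketch below. It assumes a default local FastAPI address and simply prints the server-sent events emitted by EventSourceResponse; the username and question are placeholders.

# hypothetical client for /rag/stream (swap in /filtered_rag/stream for the hybrid-search variant)
import requests

payload = {"input": {"username": "alice", "question": "How do I load a GitHub repo with GitLoader?"}}
with requests.post("http://localhost:8000/rag/stream", json=payload, stream=True) as response:
    for line in response.iter_lines():
        if line:
            print(line.decode())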
app/prompts.py CHANGED
@@ -53,7 +53,7 @@ def format_context(docs: List[str]):
     # so we need to concatenate that list into a text that can fit into
     # the rag_prompt_formatted. Implement format_context that takes a
     # list of strings and returns the context as one string.
-
+    return '\n\n'.join(docs)
 
 prompt = "{question}"
 
@@ -70,15 +70,29 @@ Follow Up Question: {question}
 helpful answer:
 """
 
-#
+# Create the standalone_prompt that will capture the question and the chat history
 # to generate a standalone question. It needs a {chat_history} placeholder and a {question} placeholder.
-standalone_prompt: str =
+standalone_prompt: str = """
+Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.
 
-
+Chat History:
+{chat_history}
+
+Follow Up Input: {question}
+
+Standalone question:
+"""
+
+# Create the rag_prompt that will capture the context and the standalone question to generate
 # a final answer to the question.
-rag_prompt: str =
+rag_prompt: str = """
+Answer the question based only on the following context:
+{context}
+
+Question: {standalone_question}
+"""
 
-#
+# create raw_prompt_formatted by using format_prompt
 #raw_prompt_formatted = format_prompt(raw_prompt)
 #raw_prompt = PromptTemplate.from_template(raw_prompt)
 
@@ -89,10 +103,11 @@ raw_prompt = PromptTemplate.from_template(prompt)
 raw_prompt_formatted = format_prompt(prompt)
 
 
-#
+# use format_prompt to create history_prompt_formatted
 history_prompt_formatted = format_prompt(history_prompt)
 
-#
-standalone_prompt_formatted
-
-
+# use format_prompt to create standalone_prompt_formatted
+standalone_prompt_formatted = format_prompt(standalone_prompt)
+
+# use format_prompt to create rag_prompt_formatted
+rag_prompt_formatted = format_prompt(rag_prompt)
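To see what the new templates produce, the standalone prompt can be rendered directly with str.format (format_prompt is the app's own wrapper and is not reproduced here); the chat history and question below are invented sample values.

# render the standalone prompt template with sample values
filled = standalone_prompt.format(
    chat_history="User: What is Pinecone?\nAI: A managed vector database.",
    question="How do I create an index there?",
)
print(filled)
# rag_prompt works the same way, with {context} and {standalone_question} placeholders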
app/schemas.py CHANGED
@@ -15,7 +15,7 @@ class UserRequest(BaseModel):
     username: str
     question: str
 
-#
+# implement MessageBase as a schema mapping from the database model to the
 # FastAPI data model. Basically MessageBase should have the same attributes as models.Message
 class MessageBase(BaseModel):
     # id: int