SANDRAMSC committed
Commit 9f023a2 · 2 parents: a5cfd46 3663ccd

Merge branch 'sandramsc-dev' of https://github.com/sandramsc/DocVerifyRAG into sandramsc-dev

.gitignore CHANGED
@@ -6,4 +6,7 @@
  flake.nix
  *__pycache__*
  .idea
+ flake.lock
+ flake.nix
+ docs/
README.md CHANGED
@@ -1,16 +1,3 @@
- ## Higging Face configuration
-
- ---
- title: Docverifyrag
- emoji: 🐠
- colorFrom: indigo
- colorTo: indigo
- sdk: streamlit
- sdk_version: 1.33.0
- app_file: app.py
- pinned: false
- ---
-
  <!-- PROJECT TITLE -->
  <h1 align="center">DocVerifyRAG: Document Verification and Anomaly Detection</h1>
  <div id="header" align="center">
app.py ADDED
@@ -0,0 +1,219 @@
+ import time
+ import streamlit as st
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ import os
+ import pickle
+ from datetime import datetime
+ from backend.generate_metadata import generate_metadata, ingest
+
+
+ css = '''
+ <style>
+ .chat-message {
+     padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
+ }
+ .chat-message.user {
+     background-color: #2b313e
+ }
+ .chat-message.bot {
+     background-color: #475063
+ }
+ .chat-message .avatar {
+     width: 20%;
+ }
+ .chat-message .avatar img {
+     max-width: 78px;
+     max-height: 78px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+ .chat-message .message {
+     width: 80%;
+     padding: 0 1.5rem;
+     color: #fff;
+ }
+ '''
+ bot_template = '''
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png"
+              style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
+ user_template = '''
+ <div class="chat-message user">
+     <div class="avatar">
+         <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
+
+
+ def get_pdf_text(pdf_docs):
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text()
+     return text
+
+
+ def get_text_chunks(text):
+     text_splitter = CharacterTextSplitter(
+         separator="\n",
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len
+     )
+     chunks = text_splitter.split_text(text)
+     return chunks
+
+
+ def get_vectorstore(text_chunks):
+     embeddings = OpenAIEmbeddings()
+     # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+     return vectorstore
+
+
+ def get_conversation_chain(vectorstore):
+     llm = ChatOpenAI()
+     # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
+
+     memory = ConversationBufferMemory(
+         memory_key='chat_history', return_messages=True)
+     conversation_chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=vectorstore.as_retriever(),
+         memory=memory
+     )
+     return conversation_chain
+
+
+ def handle_userinput(user_question):
+     response = st.session_state.conversation({'question': user_question})
+     st.session_state.chat_history = response['chat_history']
+
+     for i, message in enumerate(st.session_state.chat_history):
+         # Display user message
+         if i % 2 == 0:
+             st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+         else:
+             print(message)
+             # Display AI response
+             st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+
+         # THIS DOESNT WORK, SOMEONE PLS FIX
+         # Display source document information if available in the message
+         if hasattr(message, 'source') and message.source:
+             st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
+
+
+ def safe_vec_store():
+     # USE VECTARA INSTEAD
+     os.makedirs('vectorstore', exist_ok=True)
+     filename = 'vectores' + datetime.now().strftime('%Y%m%d%H%M') + '.pkl'
+     file_path = os.path.join('vectorstore', filename)
+     vector_store = st.session_state.vectorstore
+
+     # Serialize and save the entire FAISS object using pickle
+     with open(file_path, 'wb') as f:
+         pickle.dump(vector_store, f)
+
+
+ def main():
+     st.set_page_config(page_title="Doc Verify RAG", page_icon=":hospital:")
+     st.write(css, unsafe_allow_html=True)
+     if "openai_api_key" not in st.session_state:
+         st.session_state.openai_api_key = False
+     if "openai_org" not in st.session_state:
+         st.session_state.openai_org = False
+     if "classify" not in st.session_state:
+         st.session_state.classify = False
+
+     def set_pw():
+         st.session_state.openai_api_key = True
+
+     st.subheader("Your documents")
+     # OPENAI_ORG_ID = st.text_input("OPENAI ORG ID:")
+     OPENAI_API_KEY = st.text_input("OPENAI API KEY:", type="password",
+                                    disabled=st.session_state.openai_api_key, on_change=set_pw)
+     if st.session_state.classify:
+         pdf_doc = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=False)
+     else:
+         pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+         filenames = [file.name for file in pdf_docs if file is not None]
+     if st.button("Process"):
+         with st.spinner("Processing"):
+             if st.session_state.classify:
+                 # THE CLASSIFICATION APP
+                 st.write("Classifying")
+                 plain_text_doc = ingest(pdf_doc.name)
+                 classification_result = generate_metadata(plain_text_doc)
+                 st.write(classification_result)
+             else:
+                 # NORMAL RAG
+                 loaded_vec_store = None
+                 for filename in filenames:
+                     if ".pkl" in filename:
+                         file_path = os.path.join('vectorstore', filename)
+                         with open(file_path, 'rb') as f:
+                             loaded_vec_store = pickle.load(f)
+                 raw_text = get_pdf_text(pdf_docs)
+                 text_chunks = get_text_chunks(raw_text)
+                 vec = get_vectorstore(text_chunks)
+                 if loaded_vec_store:
+                     vec.merge_from(loaded_vec_store)
+                     st.warning("loaded vectorstore")
+                 if "vectorstore" in st.session_state:
+                     vec.merge_from(st.session_state.vectorstore)
+                     st.warning("merged to existing")
+                 st.session_state.vectorstore = vec
+                 st.session_state.conversation = get_conversation_chain(vec)
+                 st.success("data loaded")
+
+
+     if "conversation" not in st.session_state:
+         st.session_state.conversation = None
+     if "chat_history" not in st.session_state:
+         st.session_state.chat_history = None
+
+     st.header("Doc Verify RAG :hospital:")
+     user_question = st.text_input("Ask a question about your documents:")
+     if user_question:
+         handle_userinput(user_question)
+     with st.sidebar:
+
+         st.subheader("Classification Instructions")
+         classifier_docs = st.file_uploader("Upload your instructions here and click on 'Process'", accept_multiple_files=True)
+         filenames = [file.name for file in classifier_docs if file is not None]
+
+         if st.button("Process Classification"):
+             st.session_state.classify = True
+             with st.spinner("Processing"):
+                 st.warning("set classify")
+                 time.sleep(3)
+
+
+         # Save and Load Embeddings
+         if st.button("Save Embeddings"):
+             if "vectorstore" in st.session_state:
+                 safe_vec_store()
+                 # st.session_state.vectorstore.save_local("faiss_index")
+                 st.sidebar.success("saved")
+             else:
+                 st.sidebar.warning("No embeddings to save. Please process documents first.")
+
+         if st.button("Load Embeddings"):
+             st.warning("this function is not in use, just upload the vectorstore")
+
+
+ if __name__ == '__main__':
+     main()
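Note: the "Load Embeddings" button above is left as a stub. As a minimal sketch (not part of this commit), a vectorstore pickled by safe_vec_store() could be re-loaded and merged back into the session roughly as follows; the file path is a hypothetical example:

import pickle

def load_vec_store(file_path):
    # Deserialize a FAISS vectorstore previously saved with pickle.dump in safe_vec_store()
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Hypothetical usage inside main(), once st.session_state.vectorstore exists:
# loaded = load_vec_store('vectorstore/vectores202404190000.pkl')
# st.session_state.vectorstore.merge_from(loaded)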
backend/generate_metadata.py CHANGED
@@ -1,29 +1,16 @@
  import os
-
  import argparse
  import json
  import openai
-
+ import sys
  from dotenv import load_dotenv
  from langchain_community.document_loaders import TextLoader
  from langchain_community.document_loaders import UnstructuredPDFLoader
  from langchain_community.embeddings.fake import FakeEmbeddings
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
- from langchain_community.vectorstores import Vectara
-
- from schema import Metadata, BimDiscipline
-
  load_dotenv()
 
- vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
- vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
- vectara_api_key = os.environ['VECTARA_API_KEY']
-
- vectorstore = Vectara(vectara_customer_id=vectara_customer_id,
-                       vectara_corpus_id=vectara_corpus_id,
-                       vectara_api_key=vectara_api_key)
-
 
  def ingest(file_path):
      extension = file_path.split('.')[-1]
@@ -51,46 +38,52 @@ def ingest(file_path):
          "",
      ])
      docs = text_splitter.split_documents(documents)
-     #print(docs)
 
      return docs
 
 
-     # vectara = Vectara.from_documents(docs, embedding=FakeEmbeddings(size=768))
-     # retriever = vectara.as_retriever()
-
-     # return retriever
-
 
- def extract_metadata(docs):
+ def generate_metadata(docs):
+     prompt_template = """
+ BimDiscipline = ['plumbing', 'network', 'heating', 'electrical', 'ventilation', 'architecture']
+
+ You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the filename, a short description, and the engineering discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
+
+ Analyze the provided document, which could be in either German or English. Extract the filename, its description, and infer the engineering discipline it belongs to. Document:
+ context="
+ """
      # plain text
+     filepath = [doc.metadata for doc in docs][0]['source']
      context = "".join(
          [doc.page_content.replace('\n\n','').replace('..','') for doc in docs])
 
+     prompt = f'{prompt_template}{context}"\nFilepath:{filepath}'
+
+     #print(prompt)
+
      # Create client
      client = openai.OpenAI(
          base_url="https://api.together.xyz/v1",
          api_key=os.environ["TOGETHER_API_KEY"],
+         #api_key=userdata.get('TOGETHER_API_KEY'),
      )
 
      # Call the LLM with the JSON schema
      chat_completion = client.chat.completions.create(
-         model="mistralai/Mixtral-8x7B-Instruct-v0.1",
-         response_format={"type": "json_object", "schema": Metadata.model_json_schema()},
+         model="mistralai/Mixtral-8x7B-Instruct-v0.1",
          messages=[
              {
                  "role": "system",
-                 "content": f"You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
+                 "content": f"You are a helpful assistant that responds in JSON format"
              },
              {
                  "role": "user",
-                 "content": f"Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:\n{context}"
+                 "content": prompt
              }
          ]
      )
 
-     created_user = json.loads(chat_completion.choices[0].message.content)
-     return created_user
+     return json.loads(chat_completion.choices[0].message.content)
+
 
  if __name__ == "__main__":
      parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
@@ -104,5 +97,5 @@ if __name__ == "__main__":
          sys.exit(-1)
 
      docs = ingest(args.document)
-     metadata = extract_metadata(docs)
-     print(json.dumps(metadata, indent=2))
+     metadata = generate_metadata(docs)
+     print(metadata)
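As a minimal usage sketch (not part of this commit), the refactored pipeline can be driven from Python the same way app.py does; 'example.pdf' is a hypothetical file and TOGETHER_API_KEY must be set in the environment:

from backend.generate_metadata import ingest, generate_metadata

docs = ingest('example.pdf')            # load and split the document
metadata = generate_metadata(docs)      # parsed JSON with filename, description and discipline
print(metadata)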
backend/requirements.txt CHANGED
@@ -8,3 +8,4 @@ langchain
  openai
  chromadb
  tiktoken
+ python-poppler
backend/schema.py CHANGED
@@ -5,11 +5,11 @@ from pydantic import BaseModel, Field, conlist
  from enum import Enum
 
  class BimDiscipline(str, Enum):
-     plumbing = 'S - Sanitär'
+     plumbing = 'S - Sanitaer'
      network = 'D - Datennetz'
      heating = 'H - Heizung'
      electrical = 'E - Elektro'
-     ventilation = 'L - Lüftung'
+     ventilation = 'L - Lueftung'
      architecture = 'A - Architektur'
 
  # Define the schema for the output.
flake.lock ADDED
@@ -0,0 +1,27 @@
+ {
+   "nodes": {
+     "nixpkgs": {
+       "locked": {
+         "lastModified": 1713248628,
+         "narHash": "sha256-NLznXB5AOnniUtZsyy/aPWOk8ussTuePp2acb9U+ISA=",
+         "owner": "nixos",
+         "repo": "nixpkgs",
+         "rev": "5672bc9dbf9d88246ddab5ac454e82318d094bb8",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nixos",
+         "ref": "nixos-unstable",
+         "repo": "nixpkgs",
+         "type": "github"
+       }
+     },
+     "root": {
+       "inputs": {
+         "nixpkgs": "nixpkgs"
+       }
+     }
+   },
+   "root": "root",
+   "version": 7
+ }
flake.nix DELETED
@@ -1 +0,0 @@
- /home/salgadev/code/dev-flakes/templates/langchain-rag/flake.nix
flake.nix ADDED
@@ -0,0 +1,46 @@
+ {
+   description = "A LLM backend development flake powered by unstructured and langchain";
+
+   inputs = {
+     nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
+   };
+
+   outputs = {nixpkgs, ...}: let
+     system = "x86_64-linux";
+     # ↑ Swap it for your system if needed
+     # "aarch64-linux" / "x86_64-darwin" / "aarch64-darwin"
+     pkgs = nixpkgs.legacyPackages.${system};
+   in {
+     devShells.${system}.default = pkgs.mkShell {
+       packages = [
+         (pkgs.python311.withPackages (python-pkgs: [
+           python-pkgs.numpy
+           python-pkgs.pandas
+           python-pkgs.scipy
+           python-pkgs.matplotlib
+           python-pkgs.requests
+           python-pkgs.langchain-community
+           python-pkgs.langchain
+           python-pkgs.langchain-text-splitters
+           python-pkgs.unstructured
+           python-pkgs.openai
+           python-pkgs.pydantic
+           python-pkgs.python-dotenv
+           python-pkgs.configargparse
+           python-pkgs.streamlit
+           python-pkgs.pip
+           python-pkgs.lark
+           python-pkgs.jupyter
+           python-pkgs.notebook
+           python-pkgs.sentence-transformers
+           pkgs.unstructured-api
+         ]))
+       ];
+
+       shellHook = ''
+         venv="$(cd $(dirname $(which python)); cd ..; pwd)"
+         ln -Tsf "$venv" .venv
+       '';
+     };
+   };
+ }
html_templates.py ADDED
@@ -0,0 +1,44 @@
+ css = '''
+ <style>
+ .chat-message {
+     padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
+ }
+ .chat-message.user {
+     background-color: #2b313e
+ }
+ .chat-message.bot {
+     background-color: #475063
+ }
+ .chat-message .avatar {
+     width: 20%;
+ }
+ .chat-message .avatar img {
+     max-width: 78px;
+     max-height: 78px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+ .chat-message .message {
+     width: 80%;
+     padding: 0 1.5rem;
+     color: #fff;
+ }
+ '''
+
+ bot_template = '''
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
+
+ user_template = '''
+ <div class="chat-message user">
+     <div class="avatar">
+         <img src="">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
ingest.py DELETED
@@ -1,7 +0,0 @@
- from langchain_community.document_loaders import UnstructuredPDFLoader
-
- def ingest_pdf(path):
-     loader = UnstructuredPDFLoader()
-     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-
-     return data
notebooks/preprocess_dataset.ipynb ADDED
@@ -0,0 +1,467 @@
+ {
+   "cells": [
+     {
+       "cell_type": "code",
+       "execution_count": 106,
+       "metadata": {
+         "id": "f-ERaM64ONeC"
+       },
+       "outputs": [],
+       "source": [
+         "# preprocess csv\n",
+         "import pandas as pd\n",
+         "filename = '/content/U3_Metadaten.csv'\n",
+         "df = pd.read_csv(filename, on_bad_lines='skip')"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 118,
+       "metadata": {
+         "colab": {
+           "base_uri": "https://localhost:8080/",
+           "height": 424
+         },
+         "id": "AYxRURTvQiFb",
+         "outputId": "18bf4139-47ac-4939-e635-9f09f560200c"
+       },
+       "outputs": [
+         {
+           "data": {
+             "application/vnd.google.colaboratory.intrinsic+json": {
+               "summary": "{\n \"name\": \"clean_df\",\n \"rows\": 158,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 158,\n \"samples\": [\n \"ISB-020-U3-W-R-01-B17012-028-000\",\n \"ISB-020-U3-W-L-01-B15100-018-000\",\n \"ISB-020-U3-W-R-01-B17012-034-000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Beschreibung\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 40,\n \"samples\": [\n \"Foto\",\n \"Bodenheizung / Ventileinstellung / FBH AB PM\",\n \"Foto - Novocon S demontiert und Stellenantriebe montiert!\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Disziplin\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"D - Datennetz\",\n \"E - Elektroanlagen\",\n \"S - Sanitaer\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+               "type": "dataframe",
+               "variable_name": "clean_df"
+             },
+             "text/html": [
+               "<!-- Colab interactive dataframe widget (HTML table plus convert / quickchart / generate-code button markup, CSS and JS) omitted; the same data appears in text/plain below -->"
+             ],
+             "text/plain": [
+               " Name \\\n",
+               "0 ISB-020-U3-W-D-01-B07005-001-000 \n",
+               "1 ISB-020-U3-W-D-01-B07005-002-000 \n",
+               "2 ISB-020-U3-W-D-01-B07005-003-000 \n",
+               "3 ISB-020-U3-W-D-01-B07005-004-000 \n",
+               "4 ISB-020-U3-W-D-01-B18012-001-000 \n",
+               ".. ... \n",
+               "153 ISB-020-U3-W-S-01-B17012-008-000 \n",
+               "159 ISB-020-U3-W-S-01-B17012-010-000 \n",
+               "160 ISB-020-U3-W-S-01-B17012-011-000 \n",
+               "161 ISB-020-U3-W-S-01-B18003-001-020 \n",
+               "162 ISB-020-U3-W-S-01-B19009-001-020 \n",
+               "\n",
+               " Beschreibung Disziplin \n",
+               "0 Bauarten und Stuecknachweis SGK D - Datennetz \n",
+               "1 Bauarten und Stuecknachweis SGK D - Datennetz \n",
+               "2 Pruefprotokoll nach DIN EN 61439-1/3 D - Datennetz \n",
+               "3 Pruefprotokoll nach DIN EN 61439-1/3 D - Datennetz \n",
+               "4 Sicherungslegende G-020 U3 779-AS 1 D - Datennetz \n",
+               ".. ... ... \n",
+               "153 Foto S - Sanitaer \n",
+               "159 Foto S - Sanitaer \n",
+               "160 Foto S - Sanitaer \n",
+               "161 Schieber / Hawle / Schieber 4000 + Handrad 780... S - Sanitaer \n",
+               "162 Schieber / Hawle / 4000 Schutzraum S - Sanitaer \n",
+               "\n",
+               "[158 rows x 3 columns]"
+             ]
+           },
+           "execution_count": 118,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "# drop all columns except name, description, discipline\n",
+         "features = ['Name', 'Beschreibung', 'Disziplin']\n",
+         "# Remove rows with NaN values\n",
+         "clean_df = df[features].dropna()\n",
+         "clean_df"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 143,
+       "metadata": {
+         "id": "_PtvbAskQa72"
+       },
+       "outputs": [],
+       "source": [
+         "clean_df.to_csv('name-description-discipline-data.csv')"
+       ]
+     }
+   ],
+   "metadata": {
+     "colab": {
+       "provenance": []
+     },
+     "kernelspec": {
+       "display_name": "Python 3",
+       "name": "python3"
+     },
+     "language_info": {
+       "name": "python"
+     }
+   },
+   "nbformat": 4,
+   "nbformat_minor": 0
+ }
notebooks/vectarize.ipynb ADDED
@@ -0,0 +1,239 @@
+ {
+   "cells": [
+     {
+       "cell_type": "code",
+       "execution_count": 1,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "True"
+             ]
+           },
+           "execution_count": 1,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "import os \n",
+         "from dotenv import load_dotenv\n",
+         "\n",
+         "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
+         "\n",
+         "from langchain_community.vectorstores import Vectara\n",
+         "load_dotenv()"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 2,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "loader = CSVLoader(file_path='/home/salgadev/code/DocVerifyRAG/name-description-discipline-data.csv')\n",
+         "data = loader.load()\n",
+         "\n",
+         "vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']\n",
+         "vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']\n",
+         "vectara_api_key = os.environ['VECTARA_API_KEY']\n",
+         "#hf_token = os.environ['HF_API_TOKEN']\n",
+         "\n",
+         "vectorstore = Vectara(vectara_customer_id=vectara_customer_id,\n",
+         "                      vectara_corpus_id=vectara_corpus_id,\n",
+         "                      vectara_api_key=vectara_api_key)"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 3,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
+         "embeddings = HuggingFaceEmbeddings(model_name=\"intfloat/multilingual-e5-large\")"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 4,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "vectara = Vectara.from_documents(data, embedding=embeddings)"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 5,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "from langchain.chains.qa_with_sources import load_qa_with_sources_chain\n",
+         "\n"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 7,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "summary_config = {\"is_enabled\": True, \"max_results\": 5, \"response_lang\": \"eng\"}\n",
+         "retriever = vectara.as_retriever(\n",
+         "    search_kwargs={\"k\": 3, \"summary_config\": summary_config}\n",
+         ")"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 8,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "def get_sources(documents):\n",
+         "    return documents[:-1]\n",
+         "\n",
+         "\n",
+         "def get_summary(documents):\n",
+         "    return documents[-1].page_content"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 9,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "'The documents related to the electrical discipline include items like ISB-020-U3-W-E-01-B07005-002-020, which pertains to U3 740KV 2 USV, and ISB-020-U3-W-E-01-B07005-002-040 for U3 780KV 4 equipment. These documents are part of the E - Elektroanlagen discipline, focusing on electrical systems and installations [7][11]. Additionally, there are documents specifying different aspects such as AS 1_G010, AS 2_G011, and AS 1_G009, highlighting specific details within the electrical discipline documentation [7][11]. These documents are crucial for ensuring proper electrical planning, design, and implementation within various systems and structures.'"
+             ]
+           },
+           "execution_count": 9,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "query_str = \"Describe document related to the electrical discipline\"\n",
+         "\n",
+         "(retriever | get_summary).invoke(query_str)"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 10,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "[Document(page_content=': 12\\nName: ISB-020-U3-W-E-01-B07005-002-020\\nBeschreibung: E_020 U3 740_KV 2_USV\\nDisziplin: E - Elektroanlagen : 13\\nName: ISB-020-U3-W-E-01-B07005-002-040\\nBeschreibung: E_020 U3 780_KV 4\\nDisziplin: E - Elektroanlagen : 14\\nName: ISB-020-U3-W-E-01-B07005-003-010\\nBeschreibung: G_020 U3 711_AS 2_G011\\nDisziplin: E - Elektroanlagen : 15\\nName: ISB-020-U3-W-E-01-B15100-035-000\\nBeschreibung: Luftmengen Protokoll\\nDisziplin: L - Lueftung : 16\\nName: ISB-020-U3-W-E-01-B15100-036-000\\nBeschreibung: Luftmengen Protokoll\\nDisziplin: L - Lueftung', metadata={'source': 'langchain', 'row': '14', 'lang': 'deu', 'offset': '0', 'len': '110'}),\n",
+               " Document(page_content=': 7\\nName: ISB-020-U3-W-E-01-B07005-001-010\\nBeschreibung: E_020 U3 780_KV 4_E031 E_Ladestationen\\nDisziplin: E - Elektroanlagen : 8\\nName: ISB-020-U3-W-E-01-B07005-001-020\\nBeschreibung: E_020 U3 740_KV 2\\nDisziplin: E - Elektroanlagen : 9\\nName: ISB-020-U3-W-E-01-B07005-001-040\\nBeschreibung: G_020 U3 779_AS 1_G009\\nDisziplin: E - Elektroanlagen : 10\\nName: ISB-020-U3-W-E-01-B07005-001-999\\nBeschreibung: 772 UV 1 G022 / WW 218057\\nDisziplin: E - Elektroanlagen : 11\\nName: ISB-020-U3-W-E-01-B07005-002-010\\nBeschreibung: G_020 U3 711_AS 1_G010\\nDisziplin: E - Elektroanlagen', metadata={'source': 'langchain', 'row': '9', 'lang': 'deu', 'offset': '0', 'len': '109'}),\n",
+               " Document(page_content=': 11\\nName: ISB-020-U3-W-E-01-B07005-002-010\\nBeschreibung: G_020 U3 711_AS 1_G010\\nDisziplin: E - Elektroanlagen : 12\\nName: ISB-020-U3-W-E-01-B07005-002-020\\nBeschreibung: E_020 U3 740_KV 2_USV\\nDisziplin: E - Elektroanlagen : 13\\nName: ISB-020-U3-W-E-01-B07005-002-040\\nBeschreibung: E_020 U3 780_KV 4\\nDisziplin: E - Elektroanlagen : 14\\nName: ISB-020-U3-W-E-01-B07005-003-010\\nBeschreibung: G_020 U3 711_AS 2_G011\\nDisziplin: E - Elektroanlagen : 15\\nName: ISB-020-U3-W-E-01-B15100-035-000\\nBeschreibung: Luftmengen Protokoll\\nDisziplin: L - Lueftung', metadata={'source': 'langchain', 'row': '13', 'lang': 'deu', 'offset': '0', 'len': '105'})]"
+             ]
+           },
+           "execution_count": 10,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "(retriever | get_sources).invoke(query_str)\n",
+         "\n"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 11,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "madeup_metadata = {'filename': 'school_plumbing.txt', 'description': 'This document describes the plumbing system for a typical school building, including potable water supply, fixtures and appliances, drainage waste and vent (DWV) systems, and stormwater management.', 'discipline': 'plumbing'}"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 12,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "prompt_template = \"\"\"Compare the following metadata and return a confidence interval measuring how much the metadata is similar to your available information \n",
+         "\"\"\""
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 13,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "'The returned results did not contain sufficient information to be summarized into a useful answer for your query. Please try a different search or restate your query differently.'"
+             ]
+           },
+           "execution_count": 13,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "query_str = f'{prompt_template}\\nmetadata:{madeup_metadata}'\n",
+         "(retriever | get_summary).invoke(query_str)"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 15,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "query_str = 'What discipline does this description belong to? Description: This document provides instructions for handling, assembly, maintenance, and troubleshooting of Hawle Flanschen-Schieber, primarily used in water supply systems with a maximum operating pressure of 25 bar and temperature of 40°C.'\n"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 16,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "'The description provided pertains to the discipline of Sanitaer (Sanitary), as indicated by search results [159] and [160]. These instructions are related to handling, assembly, maintenance, and troubleshooting of Hawle Flanschen-Schieber, commonly utilized in water supply systems with a maximum operating pressure of 25 bar and temperature of 40°C. The document likely focuses on the proper procedures for managing and servicing these components within sanitary systems.'"
+             ]
+           },
+           "execution_count": 16,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "(retriever | get_summary).invoke(query_str)"
+       ]
+     }
+   ],
+   "metadata": {
+     "kernelspec": {
+       "display_name": "Python 3",
+       "language": "python",
+       "name": "python3"
+     },
+     "language_info": {
+       "codemirror_mode": {
+         "name": "ipython",
+         "version": 3
+       },
+       "file_extension": ".py",
+       "mimetype": "text/x-python",
+       "name": "python",
+       "nbconvert_exporter": "python",
+       "pygments_lexer": "ipython3",
+       "version": "3.11.8"
+     }
+   },
+   "nbformat": 4,
+   "nbformat_minor": 2
+ }