SANDRAMSC committed
Commit 9f023a2 · 2 parents: a5cfd46 3663ccd

Merge branch 'sandramsc-dev' of https://github.com/sandramsc/DocVerifyRAG into sandramsc-dev

.gitignore CHANGED
@@ -6,4 +6,7 @@
  flake.nix
  *__pycache__*
  .idea
+ flake.lock
+ flake.nix
+ docs/
README.md CHANGED
@@ -1,16 +1,3 @@
- ## Higging Face configuration
-
- ---
- title: Docverifyrag
- emoji: 🐠
- colorFrom: indigo
- colorTo: indigo
- sdk: streamlit
- sdk_version: 1.33.0
- app_file: app.py
- pinned: false
- ---
-
  <!-- PROJECT TITLE -->
  <h1 align="center">DocVerifyRAG: Document Verification and Anomaly Detection</h1>
  <div id="header" align="center">
app.py ADDED
@@ -0,0 +1,219 @@
+ import time
+ import streamlit as st
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
+ from langchain.vectorstores import FAISS
+ from langchain.chat_models import ChatOpenAI
+ from langchain.memory import ConversationBufferMemory
+ from langchain.chains import ConversationalRetrievalChain
+ import os
+ import pickle
+ from datetime import datetime
+ from backend.generate_metadata import generate_metadata, ingest
+
+
+ css = '''
+ <style>
+ .chat-message {
+     padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
+ }
+ .chat-message.user {
+     background-color: #2b313e
+ }
+ .chat-message.bot {
+     background-color: #475063
+ }
+ .chat-message .avatar {
+     width: 20%;
+ }
+ .chat-message .avatar img {
+     max-width: 78px;
+     max-height: 78px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+ .chat-message .message {
+     width: 80%;
+     padding: 0 1.5rem;
+     color: #fff;
+ }
+ '''
+ bot_template = '''
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png"
+              style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
+ user_template = '''
+ <div class="chat-message user">
+     <div class="avatar">
+         <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
+
+
+ def get_pdf_text(pdf_docs):
+     text = ""
+     for pdf in pdf_docs:
+         pdf_reader = PdfReader(pdf)
+         for page in pdf_reader.pages:
+             text += page.extract_text()
+     return text
+
+
+ def get_text_chunks(text):
+     text_splitter = CharacterTextSplitter(
+         separator="\n",
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len
+     )
+     chunks = text_splitter.split_text(text)
+     return chunks
+
+
+ def get_vectorstore(text_chunks):
+     embeddings = OpenAIEmbeddings()
+     # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
+     return vectorstore
+
+
+ def get_conversation_chain(vectorstore):
+     llm = ChatOpenAI()
+     # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
+
+     memory = ConversationBufferMemory(
+         memory_key='chat_history', return_messages=True)
+     conversation_chain = ConversationalRetrievalChain.from_llm(
+         llm=llm,
+         retriever=vectorstore.as_retriever(),
+         memory=memory
+     )
+     return conversation_chain
+
+
+ def handle_userinput(user_question):
+     response = st.session_state.conversation({'question': user_question})
+     st.session_state.chat_history = response['chat_history']
+
+     for i, message in enumerate(st.session_state.chat_history):
+         # Display user message
+         if i % 2 == 0:
+             st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+         else:
+             print(message)
+             # Display AI response
+             st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
+
+         # THIS DOESNT WORK, SOMEONE PLS FIX
+         # Display source document information if available in the message
+         if hasattr(message, 'source') and message.source:
+             st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
+
+
+ def safe_vec_store():
+     # USE VECTARA INSTEAD
+     os.makedirs('vectorstore', exist_ok=True)
+     filename = 'vectores' + datetime.now().strftime('%Y%m%d%H%M') + '.pkl'
+     file_path = os.path.join('vectorstore', filename)
+     vector_store = st.session_state.vectorstore
+
+     # Serialize and save the entire FAISS object using pickle
+     with open(file_path, 'wb') as f:
+         pickle.dump(vector_store, f)
+
+
+ def main():
+     st.set_page_config(page_title="Doc Verify RAG", page_icon=":hospital:")
+     st.write(css, unsafe_allow_html=True)
+     if "openai_api_key" not in st.session_state:
+         st.session_state.openai_api_key = False
+     if "openai_org" not in st.session_state:
+         st.session_state.openai_org = False
+     if "classify" not in st.session_state:
+         st.session_state.classify = False
+
+     def set_pw():
+         st.session_state.openai_api_key = True
+
+     st.subheader("Your documents")
+     # OPENAI_ORG_ID = st.text_input("OPENAI ORG ID:")
+     OPENAI_API_KEY = st.text_input("OPENAI API KEY:", type="password",
+                                    disabled=st.session_state.openai_api_key, on_change=set_pw)
+     if st.session_state.classify:
+         pdf_doc = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=False)
+     else:
+         pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
+         filenames = [file.name for file in pdf_docs if file is not None]
+     if st.button("Process"):
+         with st.spinner("Processing"):
+             if st.session_state.classify:
+                 # THE CLASSIFICATION APP
+                 st.write("Classifying")
+                 plain_text_doc = ingest(pdf_doc.name)
+                 classification_result = generate_metadata(plain_text_doc)
+                 st.write(classification_result)
+             else:
+                 # NORMAL RAG
+                 loaded_vec_store = None
+                 for filename in filenames:
+                     if ".pkl" in filename:
+                         file_path = os.path.join('vectorstore', filename)
+                         with open(file_path, 'rb') as f:
+                             loaded_vec_store = pickle.load(f)
+                 raw_text = get_pdf_text(pdf_docs)
+                 text_chunks = get_text_chunks(raw_text)
+                 vec = get_vectorstore(text_chunks)
+                 if loaded_vec_store:
+                     vec.merge_from(loaded_vec_store)
+                     st.warning("loaded vectorstore")
+                 if "vectorstore" in st.session_state:
+                     vec.merge_from(st.session_state.vectorstore)
+                     st.warning("merged to existing")
+                 st.session_state.vectorstore = vec
+                 st.session_state.conversation = get_conversation_chain(vec)
+                 st.success("data loaded")
+
+
+     if "conversation" not in st.session_state:
+         st.session_state.conversation = None
+     if "chat_history" not in st.session_state:
+         st.session_state.chat_history = None
+
+     st.header("Doc Verify RAG :hospital:")
+     user_question = st.text_input("Ask a question about your documents:")
+     if user_question:
+         handle_userinput(user_question)
+     with st.sidebar:
+
+         st.subheader("Classification Instructions")
+         classifier_docs = st.file_uploader("Upload your instructions here and click on 'Process'", accept_multiple_files=True)
+         filenames = [file.name for file in classifier_docs if file is not None]
+
+         if st.button("Process Classification"):
+             st.session_state.classify = True
+             with st.spinner("Processing"):
+                 st.warning("set classify")
+                 time.sleep(3)
+
+
+         # Save and Load Embeddings
+         if st.button("Save Embeddings"):
+             if "vectorstore" in st.session_state:
+                 safe_vec_store()
+                 # st.session_state.vectorstore.save_local("faiss_index")
+                 st.sidebar.success("saved")
+             else:
+                 st.sidebar.warning("No embeddings to save. Please process documents first.")
+
+         if st.button("Load Embeddings"):
+             st.warning("this function is not in use, just upload the vectorstore")
+
+
+ if __name__ == '__main__':
+     main()
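Note: the "Load Embeddings" button above is left as a stub. As a minimal sketch (not part of this commit), a vectorstore pickled by safe_vec_store() could be re-loaded and merged back into the session roughly as follows; the file path is a hypothetical example:

import pickle

def load_vec_store(file_path):
    # Deserialize a FAISS vectorstore previously saved with pickle.dump in safe_vec_store()
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Hypothetical usage inside main(), once st.session_state.vectorstore exists:
# loaded = load_vec_store('vectorstore/vectores202404190000.pkl')
# st.session_state.vectorstore.merge_from(loaded)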
backend/generate_metadata.py CHANGED
@@ -1,29 +1,16 @@
  import os
-
  import argparse
  import json
  import openai
-
+ import sys
  from dotenv import load_dotenv
  from langchain_community.document_loaders import TextLoader
  from langchain_community.document_loaders import UnstructuredPDFLoader
  from langchain_community.embeddings.fake import FakeEmbeddings
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
- from langchain_community.vectorstores import Vectara
-
- from schema import Metadata, BimDiscipline
-
  load_dotenv()
 
- vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
- vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
- vectara_api_key = os.environ['VECTARA_API_KEY']
-
- vectorstore = Vectara(vectara_customer_id=vectara_customer_id,
-                       vectara_corpus_id=vectara_corpus_id,
-                       vectara_api_key=vectara_api_key)
-
 
  def ingest(file_path):
      extension = file_path.split('.')[-1]
@@ -51,46 +38,52 @@ def ingest(file_path):
          "",
      ])
      docs = text_splitter.split_documents(documents)
-     #print(docs)
 
      return docs
 
 
-     # vectara = Vectara.from_documents(docs, embedding=FakeEmbeddings(size=768))
-     # retriever = vectara.as_retriever()
-
-     # return retriever
-
 
- def extract_metadata(docs):
+ def generate_metadata(docs):
+     prompt_template = """
+ BimDiscipline = ['plumbing', 'network', 'heating', 'electrical', 'ventilation', 'architecture']
+
+ You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the filename, a short description, and the engineering discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
+
+ Analyze the provided document, which could be in either German or English. Extract the filename, its description, and infer the engineering discipline it belongs to. Document:
+ context="
+ """
      # plain text
+     filepath = [doc.metadata for doc in docs][0]['source']
      context = "".join(
          [doc.page_content.replace('\n\n','').replace('..','') for doc in docs])
 
+     prompt = f'{prompt_template}{context}"\nFilepath:{filepath}'
+
+     #print(prompt)
+
      # Create client
      client = openai.OpenAI(
          base_url="https://api.together.xyz/v1",
          api_key=os.environ["TOGETHER_API_KEY"],
+         #api_key=userdata.get('TOGETHER_API_KEY'),
      )
 
      # Call the LLM with the JSON schema
      chat_completion = client.chat.completions.create(
-         model="mistralai/Mixtral-8x7B-Instruct-v0.1",
-         response_format={"type": "json_object", "schema": Metadata.model_json_schema()},
+         model="mistralai/Mixtral-8x7B-Instruct-v0.1",
          messages=[
              {
                  "role": "system",
-                 "content": f"You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the title, a brief one-sentence summary, and the discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
+                 "content": f"You are a helpful assistant that responds in JSON format"
              },
              {
                  "role": "user",
-                 "content": f"Analyze the provided document, which could be in either German or English. Extract the title, summarize it briefly in one sentence, and infer the discipline. Document:\n{context}"
+                 "content": prompt
              }
          ]
      )
 
-     created_user = json.loads(chat_completion.choices[0].message.content)
-     return created_user
+     return json.loads(chat_completion.choices[0].message.content)
+
 
  if __name__ == "__main__":
      parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
@@ -104,5 +97,5 @@ if __name__ == "__main__":
          sys.exit(-1)
 
      docs = ingest(args.document)
-     metadata = extract_metadata(docs)
-     print(json.dumps(metadata, indent=2))
+     metadata = generate_metadata(docs)
+     print(metadata)
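As a minimal usage sketch (not part of this commit), the refactored pipeline can be driven from Python the same way app.py does; 'example.pdf' is a hypothetical file and TOGETHER_API_KEY must be set in the environment:

from backend.generate_metadata import ingest, generate_metadata

docs = ingest('example.pdf')            # load and split the document
metadata = generate_metadata(docs)      # parsed JSON with filename, description and discipline
print(metadata)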
backend/requirements.txt CHANGED
@@ -8,3 +8,4 @@ langchain
  openai
  chromadb
  tiktoken
+ python-poppler
backend/schema.py CHANGED
@@ -5,11 +5,11 @@ from pydantic import BaseModel, Field, conlist
  from enum import Enum
 
  class BimDiscipline(str, Enum):
-     plumbing = 'S - Sanitär'
+     plumbing = 'S - Sanitaer'
      network = 'D - Datennetz'
      heating = 'H - Heizung'
      electrical = 'E - Elektro'
-     ventilation = 'L - Lüftung'
+     ventilation = 'L - Lueftung'
      architecture = 'A - Architektur'
 
  # Define the schema for the output.
flake.lock ADDED
@@ -0,0 +1,27 @@
+ {
+   "nodes": {
+     "nixpkgs": {
+       "locked": {
+         "lastModified": 1713248628,
+         "narHash": "sha256-NLznXB5AOnniUtZsyy/aPWOk8ussTuePp2acb9U+ISA=",
+         "owner": "nixos",
+         "repo": "nixpkgs",
+         "rev": "5672bc9dbf9d88246ddab5ac454e82318d094bb8",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nixos",
+         "ref": "nixos-unstable",
+         "repo": "nixpkgs",
+         "type": "github"
+       }
+     },
+     "root": {
+       "inputs": {
+         "nixpkgs": "nixpkgs"
+       }
+     }
+   },
+   "root": "root",
+   "version": 7
+ }
flake.nix DELETED
@@ -1 +0,0 @@
- /home/salgadev/code/dev-flakes/templates/langchain-rag/flake.nix
flake.nix ADDED
@@ -0,0 +1,46 @@
+ {
+   description = "A LLM backend development flake powered by unstructured and langchain";
+
+   inputs = {
+     nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
+   };
+
+   outputs = {nixpkgs, ...}: let
+     system = "x86_64-linux";
+     # ↑ Swap it for your system if needed
+     # "aarch64-linux" / "x86_64-darwin" / "aarch64-darwin"
+     pkgs = nixpkgs.legacyPackages.${system};
+   in {
+     devShells.${system}.default = pkgs.mkShell {
+       packages = [
+         (pkgs.python311.withPackages (python-pkgs: [
+           python-pkgs.numpy
+           python-pkgs.pandas
+           python-pkgs.scipy
+           python-pkgs.matplotlib
+           python-pkgs.requests
+           python-pkgs.langchain-community
+           python-pkgs.langchain
+           python-pkgs.langchain-text-splitters
+           python-pkgs.unstructured
+           python-pkgs.openai
+           python-pkgs.pydantic
+           python-pkgs.python-dotenv
+           python-pkgs.configargparse
+           python-pkgs.streamlit
+           python-pkgs.pip
+           python-pkgs.lark
+           python-pkgs.jupyter
+           python-pkgs.notebook
+           python-pkgs.sentence-transformers
+           pkgs.unstructured-api
+         ]))
+       ];
+
+       shellHook = ''
+         venv="$(cd $(dirname $(which python)); cd ..; pwd)"
+         ln -Tsf "$venv" .venv
+       '';
+     };
+   };
+ }
html_templates.py ADDED
@@ -0,0 +1,44 @@
+ css = '''
+ <style>
+ .chat-message {
+     padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
+ }
+ .chat-message.user {
+     background-color: #2b313e
+ }
+ .chat-message.bot {
+     background-color: #475063
+ }
+ .chat-message .avatar {
+     width: 20%;
+ }
+ .chat-message .avatar img {
+     max-width: 78px;
+     max-height: 78px;
+     border-radius: 50%;
+     object-fit: cover;
+ }
+ .chat-message .message {
+     width: 80%;
+     padding: 0 1.5rem;
+     color: #fff;
+ }
+ '''
+
+ bot_template = '''
+ <div class="chat-message bot">
+     <div class="avatar">
+         <img src="" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
+
+ user_template = '''
+ <div class="chat-message user">
+     <div class="avatar">
+         <img src="">
+     </div>
+     <div class="message">{{MSG}}</div>
+ </div>
+ '''
ingest.py DELETED
@@ -1,7 +0,0 @@
- from langchain_community.document_loaders import UnstructuredPDFLoader
-
- def ingest_pdf(path):
-     loader = UnstructuredPDFLoader()
-     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-
-     return data
notebooks/preprocess_dataset.ipynb ADDED
@@ -0,0 +1,467 @@
+ {
+   "cells": [
+     {
+       "cell_type": "code",
+       "execution_count": 106,
+       "metadata": {
+         "id": "f-ERaM64ONeC"
+       },
+       "outputs": [],
+       "source": [
+         "# preprocess csv\n",
+         "import pandas as pd\n",
+         "filename = '/content/U3_Metadaten.csv'\n",
+         "df = pd.read_csv(filename, on_bad_lines='skip')"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 118,
+       "metadata": {
+         "colab": {
+           "base_uri": "https://localhost:8080/",
+           "height": 424
+         },
+         "id": "AYxRURTvQiFb",
+         "outputId": "18bf4139-47ac-4939-e635-9f09f560200c"
+       },
+       "outputs": [
+         {
+           "data": {
+             "application/vnd.google.colaboratory.intrinsic+json": {
+               "summary": "{\n \"name\": \"clean_df\",\n \"rows\": 158,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 158,\n \"samples\": [\n \"ISB-020-U3-W-R-01-B17012-028-000\",\n \"ISB-020-U3-W-L-01-B15100-018-000\",\n \"ISB-020-U3-W-R-01-B17012-034-000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Beschreibung\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 40,\n \"samples\": [\n \"Foto\",\n \"Bodenheizung / Ventileinstellung / FBH AB PM\",\n \"Foto - Novocon S demontiert und Stellenantriebe montiert!\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Disziplin\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"D - Datennetz\",\n \"E - Elektroanlagen\",\n \"S - Sanitaer\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}",
+               "type": "dataframe",
+               "variable_name": "clean_df"
+             },
+             "text/html": [
+               "<!-- Colab interactive dataframe widget (HTML table plus convert / quickchart / generate-code button markup, CSS and JS) omitted; the same data appears in text/plain below -->"
+             ],
+             "text/plain": [
+               " Name \\\n",
+               "0 ISB-020-U3-W-D-01-B07005-001-000 \n",
+               "1 ISB-020-U3-W-D-01-B07005-002-000 \n",
+               "2 ISB-020-U3-W-D-01-B07005-003-000 \n",
+               "3 ISB-020-U3-W-D-01-B07005-004-000 \n",
+               "4 ISB-020-U3-W-D-01-B18012-001-000 \n",
+               ".. ... \n",
+               "153 ISB-020-U3-W-S-01-B17012-008-000 \n",
+               "159 ISB-020-U3-W-S-01-B17012-010-000 \n",
+               "160 ISB-020-U3-W-S-01-B17012-011-000 \n",
+               "161 ISB-020-U3-W-S-01-B18003-001-020 \n",
+               "162 ISB-020-U3-W-S-01-B19009-001-020 \n",
+               "\n",
+               " Beschreibung Disziplin \n",
+               "0 Bauarten und Stuecknachweis SGK D - Datennetz \n",
+               "1 Bauarten und Stuecknachweis SGK D - Datennetz \n",
+               "2 Pruefprotokoll nach DIN EN 61439-1/3 D - Datennetz \n",
+               "3 Pruefprotokoll nach DIN EN 61439-1/3 D - Datennetz \n",
+               "4 Sicherungslegende G-020 U3 779-AS 1 D - Datennetz \n",
+               ".. ... ... \n",
+               "153 Foto S - Sanitaer \n",
+               "159 Foto S - Sanitaer \n",
+               "160 Foto S - Sanitaer \n",
+               "161 Schieber / Hawle / Schieber 4000 + Handrad 780... S - Sanitaer \n",
+               "162 Schieber / Hawle / 4000 Schutzraum S - Sanitaer \n",
+               "\n",
+               "[158 rows x 3 columns]"
+             ]
+           },
+           "execution_count": 118,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "# drop all columns except name, description, discipline\n",
+         "features = ['Name', 'Beschreibung', 'Disziplin']\n",
+         "# Remove rows with NaN values\n",
+         "clean_df = df[features].dropna()\n",
+         "clean_df"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 143,
+       "metadata": {
+         "id": "_PtvbAskQa72"
+       },
+       "outputs": [],
+       "source": [
+         "clean_df.to_csv('name-description-discipline-data.csv')"
+       ]
+     }
+   ],
+   "metadata": {
+     "colab": {
+       "provenance": []
+     },
+     "kernelspec": {
+       "display_name": "Python 3",
+       "name": "python3"
+     },
+     "language_info": {
+       "name": "python"
+     }
+   },
+   "nbformat": 4,
+   "nbformat_minor": 0
+ }
notebooks/vectarize.ipynb ADDED
@@ -0,0 +1,239 @@
+ {
+   "cells": [
+     {
+       "cell_type": "code",
+       "execution_count": 1,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "True"
+             ]
+           },
+           "execution_count": 1,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "import os \n",
+         "from dotenv import load_dotenv\n",
+         "\n",
+         "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
+         "\n",
+         "from langchain_community.vectorstores import Vectara\n",
+         "load_dotenv()"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 2,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "loader = CSVLoader(file_path='/home/salgadev/code/DocVerifyRAG/name-description-discipline-data.csv')\n",
+         "data = loader.load()\n",
+         "\n",
+         "vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']\n",
+         "vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']\n",
+         "vectara_api_key = os.environ['VECTARA_API_KEY']\n",
+         "#hf_token = os.environ['HF_API_TOKEN']\n",
+         "\n",
+         "vectorstore = Vectara(vectara_customer_id=vectara_customer_id,\n",
+         "                      vectara_corpus_id=vectara_corpus_id,\n",
+         "                      vectara_api_key=vectara_api_key)"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 3,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "from langchain_community.embeddings import HuggingFaceEmbeddings\n",
+         "embeddings = HuggingFaceEmbeddings(model_name=\"intfloat/multilingual-e5-large\")"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 4,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "vectara = Vectara.from_documents(data, embedding=embeddings)"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 5,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "from langchain.chains.qa_with_sources import load_qa_with_sources_chain\n",
+         "\n"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 7,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "summary_config = {\"is_enabled\": True, \"max_results\": 5, \"response_lang\": \"eng\"}\n",
+         "retriever = vectara.as_retriever(\n",
+         "    search_kwargs={\"k\": 3, \"summary_config\": summary_config}\n",
+         ")"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 8,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "def get_sources(documents):\n",
+         "    return documents[:-1]\n",
+         "\n",
+         "\n",
+         "def get_summary(documents):\n",
+         "    return documents[-1].page_content"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 9,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "'The documents related to the electrical discipline include items like ISB-020-U3-W-E-01-B07005-002-020, which pertains to U3 740KV 2 USV, and ISB-020-U3-W-E-01-B07005-002-040 for U3 780KV 4 equipment. These documents are part of the E - Elektroanlagen discipline, focusing on electrical systems and installations [7][11]. Additionally, there are documents specifying different aspects such as AS 1_G010, AS 2_G011, and AS 1_G009, highlighting specific details within the electrical discipline documentation [7][11]. These documents are crucial for ensuring proper electrical planning, design, and implementation within various systems and structures.'"
+             ]
+           },
+           "execution_count": 9,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "query_str = \"Describe document related to the electrical discipline\"\n",
+         "\n",
+         "(retriever | get_summary).invoke(query_str)"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 10,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "[Document(page_content=': 12\\nName: ISB-020-U3-W-E-01-B07005-002-020\\nBeschreibung: E_020 U3 740_KV 2_USV\\nDisziplin: E - Elektroanlagen : 13\\nName: ISB-020-U3-W-E-01-B07005-002-040\\nBeschreibung: E_020 U3 780_KV 4\\nDisziplin: E - Elektroanlagen : 14\\nName: ISB-020-U3-W-E-01-B07005-003-010\\nBeschreibung: G_020 U3 711_AS 2_G011\\nDisziplin: E - Elektroanlagen : 15\\nName: ISB-020-U3-W-E-01-B15100-035-000\\nBeschreibung: Luftmengen Protokoll\\nDisziplin: L - Lueftung : 16\\nName: ISB-020-U3-W-E-01-B15100-036-000\\nBeschreibung: Luftmengen Protokoll\\nDisziplin: L - Lueftung', metadata={'source': 'langchain', 'row': '14', 'lang': 'deu', 'offset': '0', 'len': '110'}),\n",
+               " Document(page_content=': 7\\nName: ISB-020-U3-W-E-01-B07005-001-010\\nBeschreibung: E_020 U3 780_KV 4_E031 E_Ladestationen\\nDisziplin: E - Elektroanlagen : 8\\nName: ISB-020-U3-W-E-01-B07005-001-020\\nBeschreibung: E_020 U3 740_KV 2\\nDisziplin: E - Elektroanlagen : 9\\nName: ISB-020-U3-W-E-01-B07005-001-040\\nBeschreibung: G_020 U3 779_AS 1_G009\\nDisziplin: E - Elektroanlagen : 10\\nName: ISB-020-U3-W-E-01-B07005-001-999\\nBeschreibung: 772 UV 1 G022 / WW 218057\\nDisziplin: E - Elektroanlagen : 11\\nName: ISB-020-U3-W-E-01-B07005-002-010\\nBeschreibung: G_020 U3 711_AS 1_G010\\nDisziplin: E - Elektroanlagen', metadata={'source': 'langchain', 'row': '9', 'lang': 'deu', 'offset': '0', 'len': '109'}),\n",
+               " Document(page_content=': 11\\nName: ISB-020-U3-W-E-01-B07005-002-010\\nBeschreibung: G_020 U3 711_AS 1_G010\\nDisziplin: E - Elektroanlagen : 12\\nName: ISB-020-U3-W-E-01-B07005-002-020\\nBeschreibung: E_020 U3 740_KV 2_USV\\nDisziplin: E - Elektroanlagen : 13\\nName: ISB-020-U3-W-E-01-B07005-002-040\\nBeschreibung: E_020 U3 780_KV 4\\nDisziplin: E - Elektroanlagen : 14\\nName: ISB-020-U3-W-E-01-B07005-003-010\\nBeschreibung: G_020 U3 711_AS 2_G011\\nDisziplin: E - Elektroanlagen : 15\\nName: ISB-020-U3-W-E-01-B15100-035-000\\nBeschreibung: Luftmengen Protokoll\\nDisziplin: L - Lueftung', metadata={'source': 'langchain', 'row': '13', 'lang': 'deu', 'offset': '0', 'len': '105'})]"
+             ]
+           },
+           "execution_count": 10,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "(retriever | get_sources).invoke(query_str)\n",
+         "\n"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 11,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "madeup_metadata = {'filename': 'school_plumbing.txt', 'description': 'This document describes the plumbing system for a typical school building, including potable water supply, fixtures and appliances, drainage waste and vent (DWV) systems, and stormwater management.', 'discipline': 'plumbing'}"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 12,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "prompt_template = \"\"\"Compare the following metadata and return a confidence interval measuring how much the metadata is similar to your available information \n",
+         "\"\"\""
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 13,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "'The returned results did not contain sufficient information to be summarized into a useful answer for your query. Please try a different search or restate your query differently.'"
+             ]
+           },
+           "execution_count": 13,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "query_str = f'{prompt_template}\\nmetadata:{madeup_metadata}'\n",
+         "(retriever | get_summary).invoke(query_str)"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 15,
+       "metadata": {},
+       "outputs": [],
+       "source": [
+         "query_str = 'What discipline does this description belong to? Description: This document provides instructions for handling, assembly, maintenance, and troubleshooting of Hawle Flanschen-Schieber, primarily used in water supply systems with a maximum operating pressure of 25 bar and temperature of 40°C.'\n"
+       ]
+     },
+     {
+       "cell_type": "code",
+       "execution_count": 16,
+       "metadata": {},
+       "outputs": [
+         {
+           "data": {
+             "text/plain": [
+               "'The description provided pertains to the discipline of Sanitaer (Sanitary), as indicated by search results [159] and [160]. These instructions are related to handling, assembly, maintenance, and troubleshooting of Hawle Flanschen-Schieber, commonly utilized in water supply systems with a maximum operating pressure of 25 bar and temperature of 40°C. The document likely focuses on the proper procedures for managing and servicing these components within sanitary systems.'"
+             ]
+           },
+           "execution_count": 16,
+           "metadata": {},
+           "output_type": "execute_result"
+         }
+       ],
+       "source": [
+         "(retriever | get_summary).invoke(query_str)"
+       ]
+     }
+   ],
+   "metadata": {
+     "kernelspec": {
+       "display_name": "Python 3",
+       "language": "python",
+       "name": "python3"
+     },
+     "language_info": {
+       "codemirror_mode": {
+         "name": "ipython",
+         "version": 3
+       },
+       "file_extension": ".py",
+       "mimetype": "text/x-python",
+       "name": "python",
+       "nbconvert_exporter": "python",
+       "pygments_lexer": "ipython3",
+       "version": "3.11.8"
+     }
+   },
+   "nbformat": 4,
+   "nbformat_minor": 2
+ }