tabesink92 commited on
Commit
bfb179c
·
1 Parent(s): 2b4d491
.gitattributes copy ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11
# Run as a non-root user (required for HF Spaces docker runtime).
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app
# Copy requirements first so the dependency layer is cached independently of
# code edits. (The previous `COPY ./requirements.txt ~/app/...` wrote to a
# literal `~` directory: Dockerfile COPY does not expand `~`.)
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install -r requirements.txt
# Pin pydantic after the bulk install to override the transitive version.
RUN pip install pydantic==2.10.1
# Copy the application code last; this replaces the redundant double copy
# (`COPY --chown=user . $HOME/app` followed by `COPY . .`).
COPY --chown=user . $HOME/app
CMD ["chainlit", "run", "app.py", "--port", "7860"]
README copy.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Mg Alloy LLM App
3
+ emoji: 🏆
4
+ colorFrom: yellow
5
+ colorTo: gray
6
+ sdk: docker
7
+ pinned: false
8
+ license: openrail
9
+ short_description: AIE5 Midterm Project
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
README.md CHANGED
@@ -1,11 +1,12 @@
1
  ---
2
- title: Mg Alloy Knowledgebase V2
3
- emoji: 🏆
4
  colorFrom: blue
5
- colorTo: red
6
  sdk: docker
7
  pinned: false
8
- short_description: 'Mg alloy LLM app '
 
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: AIE5 Mg Alloy LLM App
3
+ emoji: 🦀
4
  colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
  pinned: false
8
+ license: openrail
9
+ short_description: 'LLM application - Midterm submission '
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # You can find this code for Chainlit python streaming here (https://docs.chainlit.io/concepts/streaming/python)
2
+ import os
3
+ import chainlit as cl # importing chainlit for our app
4
+ from typing import Annotated, List
5
+ from dotenv import load_dotenv
6
+ from typing_extensions import List, TypedDict
7
+
8
+ from langchain_huggingface import HuggingFaceEmbeddings
9
+ from langchain.prompts import ChatPromptTemplate
10
+ from langchain_openai import ChatOpenAI
11
+ from langchain_core.documents import Document
12
+ from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
13
+ from langchain_cohere import CohereRerank
14
+ from langgraph.graph import START, StateGraph, END
15
+ from langchain_core.messages import HumanMessage
16
+ from langchain_core.tools import tool
17
+ from langchain_community.tools import TavilySearchResults
18
+ from langgraph.prebuilt.tool_node import ToolNode
19
+ from langgraph.graph.message import add_messages
20
+ from langchain_community.vectorstores import FAISS
21
+ from vectorstore import VectorStore
22
+
23
+ load_dotenv()
24
+
25
+ # Using OpenAI API for embeddings/llms
26
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
27
+ os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
28
+
29
+ COHERE_API_KEY = os.getenv("COHERE_API_KEY")
30
+ os.environ["COHERE_API_KEY"] = COHERE_API_KEY
31
+
32
+
33
+ # ------- Models/Tools ------- #
34
+ embed_model = HuggingFaceEmbeddings(
35
+ model_name="Snowflake/snowflake-arctic-embed-l",
36
+ model_kwargs={'device': 'cpu'},
37
+ encode_kwargs={'normalize_embeddings': True}
38
+ )
39
+
40
+ llm_sml = ChatOpenAI(
41
+ model="gpt-4o-mini",
42
+ temperature=0,
43
+ )
44
+
45
+ # ------- Prompts ------- #
46
+ rag_prompt = ChatPromptTemplate.from_template("""\
47
+ You are a helpful assistant who answers questions based on provided context. You must only use the provided context. Do NOT use your own knowledge.
48
+ if you don't know the answer, say so.
49
+ ### Question
50
+ {question}
51
+ ### Context
52
+ {context}
53
+ """)
54
+
55
+ # load documents and create vector store
56
+ vectorstore = VectorStore(
57
+ collection_name="mg_alloy_collection_snowflake",
58
+ )
59
+ #documents = VectorStore.load_chunks_as_documents("data/contextual_chunks")
60
+ #vectorstore.add_documents(documents)
61
+ retriever = vectorstore.as_retriever(k=5)
62
+
63
+ # ------- Pydantic Models ------- #
64
# State flowing through the retrieve -> generate RAG graph.
class State(TypedDict):
    question: str  # user question fed to the retriever and the RAG prompt
    context: List[Document]  # documents returned by the reranking retriever
    response: str  # final answer text produced by the LLM
68
+
69
+
70
+ # ------- Functions ------- #
71
+
72
def generate(state):
    """Answer the question in *state* using only the retrieved context.

    Formats the module-level RAG prompt with the question and the joined
    page contents, invokes the chat model, and returns the answer text
    under the "response" key.
    """
    joined_context = "\n\n".join(doc.page_content for doc in state["context"])
    prompt_messages = rag_prompt.format_messages(
        question=state["question"],
        context=joined_context,
    )
    llm_reply = llm_sml.invoke(prompt_messages)
    return {"response": llm_reply.content}
77
+
78
+
79
def retrieve_adjusted(state: State):
    """Retrieve candidate documents for the question, reranked by Cohere.

    Wraps the module-level MMR retriever in a ContextualCompressionRetriever
    using the rerank-v3.5 model, and stores the reranked documents under
    "context".
    """
    compressor = CohereRerank(model="rerank-v3.5")
    # NOTE(review): `search_kwargs` is not an obvious field of
    # ContextualCompressionRetriever — it looks like it is ignored here;
    # confirm against the langchain API and consider removing it.
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever, search_kwargs={"k": 5}
    )
    retrieved_docs = compression_retriever.invoke(state["question"])
    return {"context": retrieved_docs}
86
+
87
+
88
def should_continue(state):
    """Route to the tool node if the last message contains tool calls.

    NOTE(review): this function is redefined verbatim later in the module
    (before `add_conditional_edges`); this first definition is dead code
    and could be removed.
    """
    last_message = state["messages"][-1]

    if last_message.tool_calls:
        return "action"

    return END
95
+
96
+ # ------- Runnables ------- #
97
+
98
+ # retrieve graph
99
+ graph_builder = StateGraph(State).add_sequence([retrieve_adjusted, generate])
100
+ graph_builder.add_edge(START, "retrieve_adjusted")
101
+ graph = graph_builder.compile()
102
+
103
+
104
@tool
def ai_rag_tool(question: str) -> dict:
    """Useful for when you need to answer questions about magnesium alloys. Input should be a fully formed question."""
    # Run the retrieve -> generate RAG graph on the question.
    response = graph.invoke({"question": question})
    # Returns an agent-state update (messages + retrieved context) rather than
    # a plain string, so the ToolNode can merge the context into AgentState.
    # (Annotation corrected from `-> str`, which did not match the dict return.)
    return {
        "messages": [HumanMessage(content=response["response"])],
        "context": response["context"]
    }
112
+
113
+
114
+ # ------------------------------------------------ #
115
+ tool_belt = [
116
+ ai_rag_tool
117
+ ]
118
+
119
+
120
# Top-level agent state: the chat history plus the documents most recently
# surfaced by the RAG tool.
class AgentState(TypedDict):
    messages: Annotated[list, add_messages]  # history; add_messages appends instead of replacing
    context: List[Document]  # documents returned by ai_rag_tool
123
+
124
+ tool_node = ToolNode(tool_belt)
125
+
126
+ uncompiled_graph = StateGraph(AgentState)
127
+
128
def call_model(state):
    """Invoke the chat model on the running message history.

    Passes the existing "context" through unchanged (defaulting to an
    empty list when absent) so downstream nodes keep seeing it.
    """
    model_reply = llm_sml.invoke(state["messages"])
    return {
        "messages": [model_reply],
        "context": state.get("context", []),
    }
135
+
136
+ uncompiled_graph.add_node("agent", call_model)
137
+ uncompiled_graph.add_node("action", tool_node)
138
+ uncompiled_graph.set_entry_point("agent")
139
+
140
def should_continue(state):
    """Route to the tool node when the model requested a tool, else finish."""
    if state["messages"][-1].tool_calls:
        return "action"
    return END
147
+
148
+ uncompiled_graph.add_conditional_edges(
149
+ "agent",
150
+ should_continue
151
+ )
152
+
153
+ uncompiled_graph.add_edge("action", "agent")
154
+
155
+ compiled_graph = uncompiled_graph.compile()
156
+
157
+
158
+ # ------- Chainlit ------- #
159
@cl.on_chat_start
async def start():
    """Stash the compiled agent graph in the per-user Chainlit session."""
    cl.user_session.set(
        "graph", compiled_graph)
163
+
164
@cl.on_message
async def handle(message: cl.Message):
    """Run the agent graph on the incoming message and send back the reply."""
    graph = cl.user_session.get("graph")
    # Seed the graph with a single HumanMessage; add_messages accumulates the rest.
    state = {"messages": [HumanMessage(content=message.content)]}
    response = await graph.ainvoke(state)
    # The last message in the final state is the model's answer.
    await cl.Message(content=response["messages"][-1].content).send()
chainlit.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Magnesium Alloy Knowledge Base 🤖
2
+
3
+ Hi there, the model has been preloaded with academic papers and technical documents on magnesium alloys. The model is capable of answering questions about magnesium alloys.
4
+
5
+ ## Example Questions
6
+
7
+ - What is magnesium alloy?
8
+
9
+ - What are the properties of magnesium alloy?
10
+
11
+ - Describe how and why fatigue life in different regions of an AZ80 alloy wheel may be different, and what aspects of the metal microstructure are most influential in fracture.
12
+
13
+ - If fatigue life were to be maximized in particular locations, what microstructure morphology would be most desirable and how can that be achieved? Also, what would be the best processing and post-processing conditions required?
14
+
data/contextual_chunks/A_Method_for_Comparing_the_Fatigue_Performance_of_Forged.pkl ADDED
Binary file (72.4 kB). View file
 
data/contextual_chunks/Characterization_of_forged_magnesium_alloys.pkl ADDED
Binary file (185 kB). View file
 
data/contextual_chunks/Fatigue_of_Forged_AZ80_Magnesium_Alloy.pkl ADDED
Binary file (581 kB). View file
 
data/contextual_chunks/Microstructure_and_texture_evolution_during_hot_deformation_of_AZ80_magnesium_alloy.pkl ADDED
Binary file (462 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ chainlit==0.7.700
2
+ langchain-huggingface==0.1.2
3
+ langchain==0.3.19
4
+ langchain_openai==0.3.7
5
+ langchain_cohere==0.4.2
6
+ langgraph==0.2.73
7
+ qdrant-client==1.13.2
8
+ langchain-qdrant==0.2.0
9
+ fastembed==0.6.0
10
+ grpcio==1.67.1
vectorstore.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import getpass
2
+ import os
3
+ import pickle
4
+ import tqdm
5
+ import yaml
6
+ import sys
7
+ from langchain_core.documents import Document
8
+
9
+ from langchain_openai import ChatOpenAI
10
+ from dotenv import load_dotenv
11
+ from qdrant_client import QdrantClient, models
12
+ from langchain_qdrant import FastEmbedSparse, QdrantVectorStore, RetrievalMode
13
+ from langchain_openai import OpenAIEmbeddings
14
+ from sentence_transformers import SentenceTransformer
15
+ from langchain_huggingface import HuggingFaceEmbeddings
16
+
17
+
18
+ # Load environment variables from .env
19
+ from dotenv import load_dotenv
20
+ load_dotenv()
21
+
22
+ # Initialize embedding model
23
+ model_id = "Snowflake/snowflake-arctic-embed-l"
24
+ EMBEDDINGS = HuggingFaceEmbeddings(
25
+ model_name=model_id,
26
+ model_kwargs={'device': 'cpu'},
27
+ encode_kwargs={'normalize_embeddings': True}
28
+ )
29
+ SPARSE_EMBEDDINGS = FastEmbedSparse(model_name="Qdrant/BM25")
30
+
31
class VectorStore:
    """Hybrid dense+sparse Qdrant vector store over pickled document chunks.

    Persists an embedded Qdrant collection under ``data/collections`` and
    exposes helpers to add chunk documents (deduplicated by source filename),
    list stored documents, and build an MMR retriever.
    """

    def __init__(self, collection_name="testCollection"):
        self.collection_name = collection_name
        self.collections_path = "data/collections"

        # Best-effort removal of a stale lock file left by a crashed process;
        # the embedded Qdrant client refuses to open a locked directory.
        try:
            lock_file = os.path.join(self.collections_path, ".lock")
            if os.path.exists(lock_file):
                os.remove(lock_file)
        except OSError:
            # Narrowed from a bare `except:` — only filesystem errors are
            # expected here; anything else should surface.
            pass

        self.client = QdrantClient(path=self.collections_path)

        # Create the collection on first use: one named dense vector (cosine,
        # 1024-dim to match snowflake-arctic-embed-l) plus one named sparse
        # vector for BM25.
        if not self.client.collection_exists(collection_name):
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config={
                    "dense_vector": models.VectorParams(
                        size=1024, distance=models.Distance.COSINE  # arctic embed dim: 1024
                    )
                },
                sparse_vectors_config={
                    "sparse_vector": models.SparseVectorParams(
                        index=models.SparseIndexParams(
                            on_disk=False,
                        )
                    )
                },
            )
            print(f"\nCollection {collection_name} created")
        else:
            print(f"\nLoading existing collection: {collection_name}")
        self._vector_store = self._as_vector_store(collection_name)

    def get_collection_documents(self):
        """Return every chunk in the collection as a langchain Document."""
        # Points are stored with sequential integer ids starting at 1.
        records = self._vector_store.client.retrieve(
            ids=list(range(1, self.client.count(self.collection_name).count + 1)),
            collection_name=self.collection_name,
            with_payload=True
        )
        documents = []
        for record in records:
            documents.append(
                Document(
                    page_content=record.payload['page_content'],
                    metadata=record.payload['metadata'],
                )
            )
        return documents

    def _as_vector_store(self, collection_name):
        """Wrap the client in a langchain QdrantVectorStore in hybrid mode."""
        return QdrantVectorStore(
            client=self.client,
            collection_name=collection_name,
            embedding=EMBEDDINGS,
            sparse_embedding=SPARSE_EMBEDDINGS,
            retrieval_mode=RetrievalMode.HYBRID,
            vector_name="dense_vector",
            sparse_vector_name="sparse_vector",
        )

    def as_retriever(self, k=3):
        """Return an MMR retriever over the collection.

        Args:
            k: number of documents to return per query.
        """
        return self._vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={"k": k, "lambda_mult": 0.5},
        )

    def add_documents(self, documents, batch_size=4):
        """Add documents to the collection, skipping files already stored.

        Deduplication is by ``metadata['filename']``. ``batch_size`` is
        currently unused and kept only for interface compatibility.
        """
        # Skip if no documents to add
        if not documents:
            return

        point_count = self.client.count(self.collection_name)

        # Fetch payloads of all existing points. BUG FIX: the upper bound was
        # `point_count.count` (exclusive), which silently dropped the last
        # stored point and could let its source file be uploaded again;
        # `inspect_collection` already used the correct `+ 1` bound.
        records = self._vector_store.client.retrieve(
            ids=list(range(1, point_count.count + 1)),
            collection_name=self.collection_name,
            with_payload=True
        )

        # Unique source filenames already present in the collection.
        existing_docs = {record.payload['metadata']['filename'] for record in records}

        # Keep only documents whose source file is not yet stored.
        documents = [doc for doc in documents if doc.metadata["filename"] not in existing_docs]
        if not documents:
            print("All documents already exist in collection. Skipping upload.")
            return

        # Assign fresh sequential ids after the current highest point id.
        ids = list(range(point_count.count + 1, point_count.count + len(documents) + 1))
        self._vector_store.add_documents(documents=documents, ids=ids)

    @staticmethod
    def load_chunks_as_documents(path):
        """Load pickled chunk files from a single .pkl file or a directory.

        Each pickle is expected to hold ``{"filename": ..., "chunks": [...]}``;
        the chunks are converted to langchain Documents.
        """
        file_list = []
        if os.path.isfile(path) and path.endswith('.pkl'):
            # Single pkl file
            file_list.append(path)
        elif os.path.isdir(path):
            # Directory of pkl files
            for filename in os.listdir(path):
                if filename.endswith('.pkl'):
                    file_list.append(os.path.join(path, filename))

        # NOTE: pickle.load is only safe on trusted, locally produced files.
        loaded_chunk_data = {}
        for file in file_list:
            with open(file, 'rb') as f:
                data = pickle.load(f)
            loaded_chunk_data[data["filename"]] = data["chunks"]

        print(f"Loaded {len(loaded_chunk_data)} documents from {path}:")
        for i, doc_name in enumerate(loaded_chunk_data.keys()):
            print(f" {i+1}. {doc_name}")

        # Convert the chunks to langchain documents
        documents = []
        for fname in loaded_chunk_data.keys():
            for chunk in loaded_chunk_data[fname]:
                documents.append(
                    Document(
                        page_content=chunk.page_content,
                        metadata=chunk.metadata
                    )
                )
        return documents

    def inspect_collection(self):
        """Print a summary of the collection and its source filenames."""
        print(f"Collection {self.collection_name} has {self.client.count(self.collection_name).count} documents")

        # Retrieve all points (sequential ids from 1) to read their payloads.
        point_count = self.client.count(self.collection_name)
        ids = list(range(1, point_count.count + 1))

        records = self._vector_store.client.retrieve(
            ids=ids,
            collection_name=self.collection_name,
            with_payload=True
        )

        # Extract unique source filenames from metadata.
        existing_docs = list(set([record.payload['metadata']['filename'] for record in records]))
        print(f"Documents in collection:")
        for i, doc_name in enumerate(existing_docs):
            print(f" {i+1}. {doc_name}")
191
+
192
+ """ def main():
193
+ collection_name = input("\nEnter a collection name to add documents:").strip()
194
+ if not collection_name:
195
+ collection_name = "testCollection"
196
+
197
+ # Load the documents
198
+ if not os.path.exists(configs["CONTEXTUAL_CHUNKS_FOLDER"]):
199
+ print(f"Error: {configs['CONTEXTUAL_CHUNKS_FOLDER']} does not exist")
200
+ sys.exit(1)
201
+
202
+ documents = VectorStore.load_chunks_as_documents(configs["CONTEXTUAL_CHUNKS_FOLDER"])
203
+
204
+ # Initialize the vector store
205
+ vector_store = VectorStore(collection_name)
206
+
207
+ # Add the documents to the vector store
208
+ vector_store.add_documents(documents) """