tabesink92 commited on
Commit
bfb179c
·
1 Parent(s): 2b4d491
.gitattributes copy ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11
# Run as a non-root user (required for HF Spaces docker runtime).
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app
# Copy requirements first so the dependency layer is cached independently of
# code edits. (The previous `COPY ./requirements.txt ~/app/...` wrote to a
# literal `~` directory: Dockerfile COPY does not expand `~`.)
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt
RUN pip install -r requirements.txt
# Pin pydantic after the bulk install to override the transitive version.
RUN pip install pydantic==2.10.1
# Copy the application code last; this replaces the redundant double copy
# (`COPY --chown=user . $HOME/app` followed by `COPY . .`).
COPY --chown=user . $HOME/app
CMD ["chainlit", "run", "app.py", "--port", "7860"]
README copy.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Mg Alloy LLM App
3
+ emoji: 🏆
4
+ colorFrom: yellow
5
+ colorTo: gray
6
+ sdk: docker
7
+ pinned: false
8
+ license: openrail
9
+ short_description: AIE5 Midterm Project
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
README.md CHANGED
@@ -1,11 +1,12 @@
1
  ---
2
- title: Mg Alloy Knowledgebase V2
3
- emoji: 🏆
4
  colorFrom: blue
5
- colorTo: red
6
  sdk: docker
7
  pinned: false
8
- short_description: 'Mg alloy LLM app '
 
9
  ---
10
 
11
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: AIE5 Mg Alloy LLM App
3
+ emoji: 🦀
4
  colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
  pinned: false
8
+ license: openrail
9
+ short_description: 'LLM application - Midterm submission '
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # You can find this code for Chainlit python streaming here (https://docs.chainlit.io/concepts/streaming/python)
2
+ import os
3
+ import chainlit as cl # importing chainlit for our app
4
+ from typing import Annotated, List
5
+ from dotenv import load_dotenv
6
+ from typing_extensions import List, TypedDict
7
+
8
+ from langchain_huggingface import HuggingFaceEmbeddings
9
+ from langchain.prompts import ChatPromptTemplate
10
+ from langchain_openai import ChatOpenAI
11
+ from langchain_core.documents import Document
12
+ from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
13
+ from langchain_cohere import CohereRerank
14
+ from langgraph.graph import START, StateGraph, END
15
+ from langchain_core.messages import HumanMessage
16
+ from langchain_core.tools import tool
17
+ from langchain_community.tools import TavilySearchResults
18
+ from langgraph.prebuilt.tool_node import ToolNode
19
+ from langgraph.graph.message import add_messages
20
+ from langchain_community.vectorstores import FAISS
21
+ from vectorstore import VectorStore
22
+
23
+ load_dotenv()
24
+
25
+ # Using OpenAI API for embeddings/llms
26
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
27
+ os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
28
+
29
+ COHERE_API_KEY = os.getenv("COHERE_API_KEY")
30
+ os.environ["COHERE_API_KEY"] = COHERE_API_KEY
31
+
32
+
33
+ # ------- Models/Tools ------- #
34
+ embed_model = HuggingFaceEmbeddings(
35
+ model_name="Snowflake/snowflake-arctic-embed-l",
36
+ model_kwargs={'device': 'cpu'},
37
+ encode_kwargs={'normalize_embeddings': True}
38
+ )
39
+
40
+ llm_sml = ChatOpenAI(
41
+ model="gpt-4o-mini",
42
+ temperature=0,
43
+ )
44
+
45
+ # ------- Prompts ------- #
46
+ rag_prompt = ChatPromptTemplate.from_template("""\
47
+ You are a helpful assistant who answers questions based on provided context. You must only use the provided context. Do NOT use your own knowledge.
48
+ if you don't know the answer, say so.
49
+ ### Question
50
+ {question}
51
+ ### Context
52
+ {context}
53
+ """)
54
+
55
+ # load documents and create vector store
56
+ vectorstore = VectorStore(
57
+ collection_name="mg_alloy_collection_snowflake",
58
+ )
59
+ #documents = VectorStore.load_chunks_as_documents("data/contextual_chunks")
60
+ #vectorstore.add_documents(documents)
61
+ retriever = vectorstore.as_retriever(k=5)
62
+
63
+ # ------- Pydantic Models ------- #
64
# State flowing through the retrieve -> generate RAG graph.
class State(TypedDict):
    question: str  # user question fed to the retriever and the RAG prompt
    context: List[Document]  # documents returned by the reranking retriever
    response: str  # final answer text produced by the LLM
68
+
69
+
70
+ # ------- Functions ------- #
71
+
72
def generate(state):
    """Answer the question in *state* using only the retrieved context.

    Formats the module-level RAG prompt with the question and the joined
    page contents, invokes the chat model, and returns the answer text
    under the "response" key.
    """
    joined_context = "\n\n".join(doc.page_content for doc in state["context"])
    prompt_messages = rag_prompt.format_messages(
        question=state["question"],
        context=joined_context,
    )
    llm_reply = llm_sml.invoke(prompt_messages)
    return {"response": llm_reply.content}
77
+
78
+
79
def retrieve_adjusted(state: State):
    """Retrieve candidate documents for the question, reranked by Cohere.

    Wraps the module-level MMR retriever in a ContextualCompressionRetriever
    using the rerank-v3.5 model, and stores the reranked documents under
    "context".
    """
    compressor = CohereRerank(model="rerank-v3.5")
    # NOTE(review): `search_kwargs` is not an obvious field of
    # ContextualCompressionRetriever — it looks like it is ignored here;
    # confirm against the langchain API and consider removing it.
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever, search_kwargs={"k": 5}
    )
    retrieved_docs = compression_retriever.invoke(state["question"])
    return {"context": retrieved_docs}
86
+
87
+
88
def should_continue(state):
    """Route to the tool node if the last message contains tool calls.

    NOTE(review): this function is redefined verbatim later in the module
    (before `add_conditional_edges`); this first definition is dead code
    and could be removed.
    """
    last_message = state["messages"][-1]

    if last_message.tool_calls:
        return "action"

    return END
95
+
96
+ # ------- Runnables ------- #
97
+
98
+ # retrieve graph
99
+ graph_builder = StateGraph(State).add_sequence([retrieve_adjusted, generate])
100
+ graph_builder.add_edge(START, "retrieve_adjusted")
101
+ graph = graph_builder.compile()
102
+
103
+
104
@tool
def ai_rag_tool(question: str) -> dict:
    """Useful for when you need to answer questions about magnesium alloys. Input should be a fully formed question."""
    # Run the retrieve -> generate RAG graph on the question.
    response = graph.invoke({"question": question})
    # Returns an agent-state update (messages + retrieved context) rather than
    # a plain string, so the ToolNode can merge the context into AgentState.
    # (Annotation corrected from `-> str`, which did not match the dict return.)
    return {
        "messages": [HumanMessage(content=response["response"])],
        "context": response["context"]
    }
112
+
113
+
114
+ # ------------------------------------------------ #
115
+ tool_belt = [
116
+ ai_rag_tool
117
+ ]
118
+
119
+
120
# Top-level agent state: the chat history plus the documents most recently
# surfaced by the RAG tool.
class AgentState(TypedDict):
    messages: Annotated[list, add_messages]  # history; add_messages appends instead of replacing
    context: List[Document]  # documents returned by ai_rag_tool
123
+
124
+ tool_node = ToolNode(tool_belt)
125
+
126
+ uncompiled_graph = StateGraph(AgentState)
127
+
128
def call_model(state):
    """Invoke the chat model on the running message history.

    Passes the existing "context" through unchanged (defaulting to an
    empty list when absent) so downstream nodes keep seeing it.
    """
    model_reply = llm_sml.invoke(state["messages"])
    return {
        "messages": [model_reply],
        "context": state.get("context", []),
    }
135
+
136
+ uncompiled_graph.add_node("agent", call_model)
137
+ uncompiled_graph.add_node("action", tool_node)
138
+ uncompiled_graph.set_entry_point("agent")
139
+
140
def should_continue(state):
    """Route to the tool node when the model requested a tool, else finish."""
    if state["messages"][-1].tool_calls:
        return "action"
    return END
147
+
148
+ uncompiled_graph.add_conditional_edges(
149
+ "agent",
150
+ should_continue
151
+ )
152
+
153
+ uncompiled_graph.add_edge("action", "agent")
154
+
155
+ compiled_graph = uncompiled_graph.compile()
156
+
157
+
158
+ # ------- Chainlit ------- #
159
@cl.on_chat_start
async def start():
    """Stash the compiled agent graph in the per-user Chainlit session."""
    cl.user_session.set(
        "graph", compiled_graph)
163
+
164
@cl.on_message
async def handle(message: cl.Message):
    """Run the agent graph on the incoming message and send back the reply."""
    graph = cl.user_session.get("graph")
    # Seed the graph with a single HumanMessage; add_messages accumulates the rest.
    state = {"messages": [HumanMessage(content=message.content)]}
    response = await graph.ainvoke(state)
    # The last message in the final state is the model's answer.
    await cl.Message(content=response["messages"][-1].content).send()
chainlit.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Magnesium Alloy Knowledge Base 🤖
2
+
3
+ Hi there, the model has been preloaded with academic papers and technical documents on magnesium alloys. The model is capable of answering questions about magnesium alloys.
4
+
5
+ ## Example Questions
6
+
7
+ - What is magnesium alloy?
8
+
9
+ - What are the properties of magnesium alloy?
10
+
11
+ - Describe how and why fatigue life in different regions of an AZ80 alloy wheel may be different, and what aspects of the metal microstructure are most influential in fracture.
12
+
13
+ - If fatigue life were to be maximized in particular locations, what microstructure morphology would be most desirable and how can that be achieved? Also, what would be the best processing and post-processing conditions required?
14
+
data/contextual_chunks/A_Method_for_Comparing_the_Fatigue_Performance_of_Forged.pkl ADDED
Binary file (72.4 kB). View file
 
data/contextual_chunks/Characterization_of_forged_magnesium_alloys.pkl ADDED
Binary file (185 kB). View file
 
data/contextual_chunks/Fatigue_of_Forged_AZ80_Magnesium_Alloy.pkl ADDED
Binary file (581 kB). View file
 
data/contextual_chunks/Microstructure_and_texture_evolution_during_hot_deformation_of_AZ80_magnesium_alloy.pkl ADDED
Binary file (462 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ chainlit==0.7.700
2
+ langchain-huggingface==0.1.2
3
+ langchain==0.3.19
4
+ langchain_openai==0.3.7
5
+ langchain_cohere==0.4.2
6
+ langgraph==0.2.73
7
+ qdrant-client==1.13.2
8
+ langchain-qdrant==0.2.0
9
+ fastembed==0.6.0
10
+ grpcio==1.67.1
vectorstore.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import getpass
2
+ import os
3
+ import pickle
4
+ import tqdm
5
+ import yaml
6
+ import sys
7
+ from langchain_core.documents import Document
8
+
9
+ from langchain_openai import ChatOpenAI
10
+ from dotenv import load_dotenv
11
+ from qdrant_client import QdrantClient, models
12
+ from langchain_qdrant import FastEmbedSparse, QdrantVectorStore, RetrievalMode
13
+ from langchain_openai import OpenAIEmbeddings
14
+ from sentence_transformers import SentenceTransformer
15
+ from langchain_huggingface import HuggingFaceEmbeddings
16
+
17
+
18
+ # Load environment variables from .env
19
+ from dotenv import load_dotenv
20
+ load_dotenv()
21
+
22
+ # Initialize embedding model
23
+ model_id = "Snowflake/snowflake-arctic-embed-l"
24
+ EMBEDDINGS = HuggingFaceEmbeddings(
25
+ model_name=model_id,
26
+ model_kwargs={'device': 'cpu'},
27
+ encode_kwargs={'normalize_embeddings': True}
28
+ )
29
+ SPARSE_EMBEDDINGS = FastEmbedSparse(model_name="Qdrant/BM25")
30
+
31
class VectorStore:
    """Hybrid dense+sparse Qdrant vector store over pickled document chunks.

    Persists an embedded Qdrant collection under ``data/collections`` and
    exposes helpers to add chunk documents (deduplicated by source filename),
    list stored documents, and build an MMR retriever.
    """

    def __init__(self, collection_name="testCollection"):
        self.collection_name = collection_name
        self.collections_path = "data/collections"

        # Best-effort removal of a stale lock file left by a crashed process;
        # the embedded Qdrant client refuses to open a locked directory.
        try:
            lock_file = os.path.join(self.collections_path, ".lock")
            if os.path.exists(lock_file):
                os.remove(lock_file)
        except OSError:
            # Narrowed from a bare `except:` — only filesystem errors are
            # expected here; anything else should surface.
            pass

        self.client = QdrantClient(path=self.collections_path)

        # Create the collection on first use: one named dense vector (cosine,
        # 1024-dim to match snowflake-arctic-embed-l) plus one named sparse
        # vector for BM25.
        if not self.client.collection_exists(collection_name):
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config={
                    "dense_vector": models.VectorParams(
                        size=1024, distance=models.Distance.COSINE  # arctic embed dim: 1024
                    )
                },
                sparse_vectors_config={
                    "sparse_vector": models.SparseVectorParams(
                        index=models.SparseIndexParams(
                            on_disk=False,
                        )
                    )
                },
            )
            print(f"\nCollection {collection_name} created")
        else:
            print(f"\nLoading existing collection: {collection_name}")
        self._vector_store = self._as_vector_store(collection_name)

    def get_collection_documents(self):
        """Return every chunk in the collection as a langchain Document."""
        # Points are stored with sequential integer ids starting at 1.
        records = self._vector_store.client.retrieve(
            ids=list(range(1, self.client.count(self.collection_name).count + 1)),
            collection_name=self.collection_name,
            with_payload=True
        )
        documents = []
        for record in records:
            documents.append(
                Document(
                    page_content=record.payload['page_content'],
                    metadata=record.payload['metadata'],
                )
            )
        return documents

    def _as_vector_store(self, collection_name):
        """Wrap the client in a langchain QdrantVectorStore in hybrid mode."""
        return QdrantVectorStore(
            client=self.client,
            collection_name=collection_name,
            embedding=EMBEDDINGS,
            sparse_embedding=SPARSE_EMBEDDINGS,
            retrieval_mode=RetrievalMode.HYBRID,
            vector_name="dense_vector",
            sparse_vector_name="sparse_vector",
        )

    def as_retriever(self, k=3):
        """Return an MMR retriever over the collection.

        Args:
            k: number of documents to return per query.
        """
        return self._vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={"k": k, "lambda_mult": 0.5},
        )

    def add_documents(self, documents, batch_size=4):
        """Add documents to the collection, skipping files already stored.

        Deduplication is by ``metadata['filename']``. ``batch_size`` is
        currently unused and kept only for interface compatibility.
        """
        # Skip if no documents to add
        if not documents:
            return

        point_count = self.client.count(self.collection_name)

        # Fetch payloads of all existing points. BUG FIX: the upper bound was
        # `point_count.count` (exclusive), which silently dropped the last
        # stored point and could let its source file be uploaded again;
        # `inspect_collection` already used the correct `+ 1` bound.
        records = self._vector_store.client.retrieve(
            ids=list(range(1, point_count.count + 1)),
            collection_name=self.collection_name,
            with_payload=True
        )

        # Unique source filenames already present in the collection.
        existing_docs = {record.payload['metadata']['filename'] for record in records}

        # Keep only documents whose source file is not yet stored.
        documents = [doc for doc in documents if doc.metadata["filename"] not in existing_docs]
        if not documents:
            print("All documents already exist in collection. Skipping upload.")
            return

        # Assign fresh sequential ids after the current highest point id.
        ids = list(range(point_count.count + 1, point_count.count + len(documents) + 1))
        self._vector_store.add_documents(documents=documents, ids=ids)

    @staticmethod
    def load_chunks_as_documents(path):
        """Load pickled chunk files from a single .pkl file or a directory.

        Each pickle is expected to hold ``{"filename": ..., "chunks": [...]}``;
        the chunks are converted to langchain Documents.
        """
        file_list = []
        if os.path.isfile(path) and path.endswith('.pkl'):
            # Single pkl file
            file_list.append(path)
        elif os.path.isdir(path):
            # Directory of pkl files
            for filename in os.listdir(path):
                if filename.endswith('.pkl'):
                    file_list.append(os.path.join(path, filename))

        # NOTE: pickle.load is only safe on trusted, locally produced files.
        loaded_chunk_data = {}
        for file in file_list:
            with open(file, 'rb') as f:
                data = pickle.load(f)
            loaded_chunk_data[data["filename"]] = data["chunks"]

        print(f"Loaded {len(loaded_chunk_data)} documents from {path}:")
        for i, doc_name in enumerate(loaded_chunk_data.keys()):
            print(f" {i+1}. {doc_name}")

        # Convert the chunks to langchain documents
        documents = []
        for fname in loaded_chunk_data.keys():
            for chunk in loaded_chunk_data[fname]:
                documents.append(
                    Document(
                        page_content=chunk.page_content,
                        metadata=chunk.metadata
                    )
                )
        return documents

    def inspect_collection(self):
        """Print a summary of the collection and its source filenames."""
        print(f"Collection {self.collection_name} has {self.client.count(self.collection_name).count} documents")

        # Retrieve all points (sequential ids from 1) to read their payloads.
        point_count = self.client.count(self.collection_name)
        ids = list(range(1, point_count.count + 1))

        records = self._vector_store.client.retrieve(
            ids=ids,
            collection_name=self.collection_name,
            with_payload=True
        )

        # Extract unique source filenames from metadata.
        existing_docs = list(set([record.payload['metadata']['filename'] for record in records]))
        print(f"Documents in collection:")
        for i, doc_name in enumerate(existing_docs):
            print(f" {i+1}. {doc_name}")
191
+
192
+ """ def main():
193
+ collection_name = input("\nEnter a collection name to add documents:").strip()
194
+ if not collection_name:
195
+ collection_name = "testCollection"
196
+
197
+ # Load the documents
198
+ if not os.path.exists(configs["CONTEXTUAL_CHUNKS_FOLDER"]):
199
+ print(f"Error: {configs['CONTEXTUAL_CHUNKS_FOLDER']} does not exist")
200
+ sys.exit(1)
201
+
202
+ documents = VectorStore.load_chunks_as_documents(configs["CONTEXTUAL_CHUNKS_FOLDER"])
203
+
204
+ # Initialize the vector store
205
+ vector_store = VectorStore(collection_name)
206
+
207
+ # Add the documents to the vector store
208
+ vector_store.add_documents(documents) """