{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/larawehbe/Documents/fakkerai/sehatech/venv/lib/python3.13/site-packages/pinecone/data/index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from tqdm.autonotebook import tqdm\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "import pinecone\n",
    "from langchain.document_loaders import PyPDFLoader\n",
    "from langchain.embeddings import OpenAIEmbeddings # Adjust to your embedding model\n",
    "from langchain.vectorstores import Pinecone\n",
    "from langchain.chains import RetrievalQA\n",
    "from langchain.llms import OpenAI # Replace with the LLM of your choice\n",
    "from dotenv import load_dotenv\n",
    "load_dotenv()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize Pinecone\n",
    "\n",
    "pc = pinecone.Pinecone(api_key=os.getenv(\"PINECONE_API_KEY\"))\n",
    "index_name = \"clec16a-study\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "spec = pinecone.ServerlessSpec(cloud='aws',region=\"us-east-1\")\n",
    "\n",
    "# Create the index if it doesn't exist.\n",
    "# list_indexes() yields index descriptions rather than bare name strings, so the\n",
    "# membership test must run against .names(); otherwise the check never matches and\n",
    "# create_index is attempted again for an index that already exists.\n",
    "if index_name not in pc.list_indexes().names():\n",
    "    pc.create_index(index_name, dimension=1536, spec=spec) # Adjust dimension as needed\n",
    "# Connect to the index\n",
    "index = pc.Index(index_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import openai \n",
    "openai.api_key = os.getenv(\"OPENAI_API_KEY\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/qt/8nj7tb591mx9xtqkgz7mjyjh0000gn/T/ipykernel_17808/4087293823.py:7: 
LangChainDeprecationWarning: The class `OpenAIEmbeddings` was deprecated in LangChain 0.0.9 and will be removed in 1.0. An updated version of the class exists in the :class:`~langchain-openai package and should be used instead. To use it run `pip install -U :class:`~langchain-openai` and import as `from :class:`~langchain_openai import OpenAIEmbeddings``.\n",
      " embedding_model = OpenAIEmbeddings(model=MODEL)\n"
     ]
    }
   ],
   "source": [
    "MODEL = 'text-embedding-ada-002' \n",
    "pdf_path = \"../data/main.pdf\" # Replace with your actual PDF path\n",
    "loader = PyPDFLoader(pdf_path)\n",
    "documents = loader.load()\n",
    "\n",
    "# Initialize embedding model\n",
    "embedding_model = OpenAIEmbeddings(model=MODEL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define function to create or connect to an existing index\n",
    "def create_or_connect_index(index_name, dimension):\n",
    "    \"\"\"Return a handle to a Pinecone index, creating the index first if it is missing.\n",
    "\n",
    "    Args:\n",
    "        index_name: Name of the Pinecone index to look up or create.\n",
    "        dimension: Vector dimension used when the index has to be created.\n",
    "\n",
    "    Returns:\n",
    "        The object returned by `pc.Index(index_name)`.\n",
    "    \"\"\"\n",
    "    spec = pinecone.ServerlessSpec(cloud='aws',region=\"us-east-1\")\n",
    "    if index_name not in pc.list_indexes().names():\n",
    "        pc.create_index(\n",
    "            name=index_name,\n",
    "            dimension=dimension,\n",
    "            metric='cosine', # You can use 'dotproduct' or other metrics if needed\n",
    "            spec=spec\n",
    "        )\n",
    "    return pc.Index(index_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sampleembedding: 1536\n"
     ]
    }
   ],
   "source": [
    "sample_embedding = embedding_model.embed_query(\"Test\")\n",
    "index = create_or_connect_index(index_name, dimension=len(sample_embedding))\n",
    "print(f'sample embedding: {len(sample_embedding)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Embed all pages with one batched call instead of one embedding request per page\n",
    "page_texts = [doc.page_content for doc in documents]\n",
    "page_embeddings = embedding_model.embed_documents(page_texts)\n",
    "\n",
    "# One (id, values, metadata) tuple per page; the page text is stored under the\n",
    "# 'text' metadata key, exactly as before\n",
    "vectors = [\n",
    "    (f\"page-{i}\", embedding, {\"text\": text})\n",
    "    for i, (text, embedding) in enumerate(zip(page_texts, page_embeddings))\n",
    "]\n",
    "\n",
    "# Send the vectors in bounded batches so a long PDF does not become one oversized request\n",
    "UPSERT_BATCH_SIZE = 100\n",
    "for start in range(0, len(vectors), UPSERT_BATCH_SIZE):\n",
    "    index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_openai import ChatOpenAI\n",
    "\n",
    "\n",
    "vector_store = Pinecone.from_existing_index(index_name=index_name, embedding=embedding_model)\n",
    "\n",
    "# Set up RetrievalQA chain for querying using a chat-based model for better responses\n",
    "llm = ChatOpenAI(model=\"gpt-4\", openai_api_key=openai.api_key) # Replace with the chat model of choice\n",
    "qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vector_store.as_retriever())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Question: What role does CLEC16A play in mitochondrial quality control, and why is this important for cellular health?\n",
      "Answer: CLEC16A is an E3 ubiquitin ligase that plays a significant role in mitochondrial quality control through a process called mitophagy. Mitophagy is a type of autophagy where damaged mitochondria are selectively eliminated from the cell. CLEC16A regulates mitophagy by forming a tripartite complex with another E3 ubiquitin ligase, RNF41, and a ubiquitin-specific peptidase, USP8. This complex controls the activity of the mitophagy regulator PRKN/Parkin. \n",
      "\n",
      "Maintaining mitochondrial quality control is crucial for cellular health as damaged mitochondria can lead to a decrease in energy production, increase in harmful reactive oxygen species, and potential induction of cell death. Therefore, the role of CLEC16A in mitochondrial quality control is important for maintaining cellular health and function. 
It is also noteworthy that the gene for CLEC16A is associated with over 20 human diseases, including diabetes, cardiovascular disease, stroke, multiple sclerosis, arthritis, and Crohn's disease, further underscoring its importance in cellular health.\n", "\n", "Question: How does the intrinsically disordered protein region (IDPR) within CLEC16A impact its stability and interaction with RNF41?\n", "Answer: The intrinsically disordered protein region (IDPR) within CLEC16A plays a significant role in its stability and its interaction with RNF41. The IDPR facilitates CLEC16A's turnover and degradation, with mutations in this region leading to increased stability of the protein. This region also contributes to the interaction between CLEC16A and RNF41, a process that is essential for the assembly of the CLEC16A-RNF41-USP8 mitophagy complex. \n", "\n", "Furthermore, the IDPR within CLEC16A is required for RNF41-mediated turnover of CLEC16A, as the removal or shuffling of the IDPR prevents RNF41 from reducing CLEC16A protein levels. This suggests the internal IDPR destabilizes CLEC16A and that this action depends upon the IDPR's amino acid sequence order.\n", "\n", "Moreover, the lysine residues within the IDPR are crucial for both CLEC16A turnover and for RNF41 to act upon CLEC16A. However, simply retaining the lysine residues in their original positions within a shuffled IDPR does not restore CLEC16A turnover or RNF41 action, indicating that the entire IDPR sequence needs to be intact for RNF41 to destabilize CLEC16A. \n", "\n", "Overall, the internal IDPR within CLEC16A plays a key role in the protein's stability, its interaction with RNF41, and its regulation within the cellular environment.\n", "\n", "Question: What is the significance of the CLEC16A-RNF41 complex in the regulation of mitophagy?\n", "Answer: The CLEC16A-RNF41 complex plays a crucial role in the regulation of mitophagy, a process for eliminating damaged mitochondria. 
The CLEC16A gene encodes an E3 ubiquitin ligase that helps maintain mitochondrial health through selective mitochondrial autophagy (mitophagy). CLEC16A forms a complex with another E3 ligase, RNF41, and a ubiquitin-specific peptidase, USP8, to control the activity of the mitophagy regulator PRKN/Parkin. CLEC16A directly binds and ubiquitinates RNF41 to promote assembly and stability of the tripartite mitophagy complex. The study found that an intrinsically disordered protein region (IDPR) within CLEC16A is crucial for its function and turnover. The IDPR is essential to control the reciprocal regulatory balance between CLEC16A and RNF41, a balance which could possibly be targeted to improve mitochondrial health in disease.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Define the list of questions\n",
    "questions = [\n",
    "    \"What role does CLEC16A play in mitochondrial quality control, and why is this important for cellular health?\",\n",
    "    \"How does the intrinsically disordered protein region (IDPR) within CLEC16A impact its stability and interaction with RNF41?\",\n",
    "    \"What is the significance of the CLEC16A-RNF41 complex in the regulation of mitophagy?\",\n",
    "    # Add more questions as needed\n",
    "]\n",
    "\n",
    "# Query each question and print the answers\n",
    "for question in questions:\n",
    "    answer = qa_chain.run(question)\n",
    "    print(f\"Question: {question}\")\n",
    "    print(f\"Answer: {answer}\\n\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Now, turn it into a chat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "chat_llm = ChatOpenAI(model=\"gpt-4o\", openai_api_key=openai.api_key)\n",
    "# Build the chat chain on chat_llm (gpt-4o); passing the earlier `llm` here would\n",
    "# silently keep using gpt-4 and leave chat_llm unused.\n",
    "# k=3 limits retrieval to the three most similar pages per question.\n",
    "chat_qa_chain = RetrievalQA.from_chain_type(llm=chat_llm, retriever=vector_store.as_retriever(search_kwargs={\"k\" : 3}))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Welcome to the CLEC16A Chat System! 
Ask any question, or type 'exit' to quit.\n", "AI: The document is a scientific article discussing research on the regulation of CLEC16A stability through an intrinsically disordered protein region (IDPR) and its implications in various diseases. It includes detailed methodologies such as statistical analysis using Prism software, data availability, acknowledgments, and contributions from various authors supported by institutions like the University of Michigan and the NIH. The research explores genetic associations with diseases like type 1 diabetes, multiple sclerosis, and myocardial infarction. The document also discusses experimental procedures including protein purification, nuclear magnetic resonance (NMR), circular dichroism, and cell culture techniques. Additionally, it highlights the significance of intrinsically disordered proteins in cellular functions and diseases. The article contains references to previous studies and provides a comprehensive overview of the research conducted on CLEC16A and its regulatory mechanisms.\n", "\n", "AI: 1. **Research and Data Availability**: The study involves biophysical studies of recombinant proteins, and all data are contained within the manuscript. Supporting information is available in the referenced article.\n", "\n", "2. **Funding and Support**: The research received significant support from institutions such as the University of Michigan Center for Structural Biology and various grants from organizations including the NIH, JDRF, and the Department of Veterans Affairs. Specific grants and awards are mentioned, highlighting the financial backing that facilitated the study.\n", "\n", "3. **Contributions and Acknowledgments**: The research involved multiple contributors, with specific roles such as conceptualization, investigation, formal analysis, and writing. Acknowledgments are given to individuals and facilities that provided assistance, such as the University of Michigan BioNMR Core for help with NMR studies. 
The authors declare no conflicts of interest.\n", "\n", "AI: The document you provided seems to be a scientific research article about the protein CLEC16A and its intrinsically disordered protein region (IDPR), as well as its regulation and structural characteristics. As an internal medicine doctor, the direct application of this specific research to your practice may not be immediately clear unless the findings relate to a particular medical condition or treatment relevant to your patients.\n", "\n", "However, staying informed about the latest scientific research can be beneficial in several ways:\n", "\n", "1. **Understanding Disease Mechanisms**: Research into proteins like CLEC16A can provide insights into the mechanisms of diseases, especially if these proteins are implicated in conditions that you treat.\n", "\n", "2. **Potential for New Treatments**: Understanding the regulation and stability of proteins might lead to the development of new therapeutic targets or drugs in the future.\n", "\n", "3. **Educating Patients**: Knowledge of ongoing research allows you to provide patients with the most current information about their conditions and potential future therapies.\n", "\n", "4. **Interdisciplinary Collaboration**: Familiarity with cutting-edge research can facilitate collaboration with specialists, researchers, or clinical trials that might benefit your patients.\n", "\n", "5. **Continuing Education**: Engaging with scientific literature is a part of lifelong learning and can help you stay current with medical advancements and innovations.\n", "\n", "If the study is related to a specific condition that you encounter, it would be worthwhile to explore how these findings might translate into clinical practices over time.\n", "\n", "AI: I'm sorry, but I don't have enough information to answer your question. 
Could you please provide more context or clarify your inquiry?\n",
      "\n",
      "AI: The document appears to be a scientific article related to biochemistry and molecular biology, specifically focusing on protein interactions and intrinsically disordered protein regions (IDPR) in the context of diseases like diabetes and autoimmune disorders. As an AI engineer, you might not directly benefit from the specific scientific content unless your work involves bioinformatics, computational biology, or the development of AI models for analyzing biological data. If your work involves these areas, you could gain insights into the types of data and analyses that are relevant in this field, which could inform the development of AI tools or models. Otherwise, the document may not be directly relevant to your work as an AI engineer.\n",
      "\n",
      "AI: Goodbye! If you have any more questions in the future, feel free to ask. Have a great day!\n",
      "\n",
      "Exiting the chat. Goodbye!\n"
     ]
    }
   ],
   "source": [
    "def chat_system():\n",
    "    \"\"\"Run an interactive question-answering loop over the indexed document.\n",
    "\n",
    "    Reads questions from standard input, sends each one to `chat_qa_chain`,\n",
    "    and prints the answer. Typing 'exit' or 'quit' ends the loop.\n",
    "    \"\"\"\n",
    "    print(\"Welcome to the CLEC16A Chat System! Ask any question, or type 'exit' to quit.\")\n",
    "    while True:\n",
    "        # Strip surrounding whitespace so input such as ' exit ' still ends the session\n",
    "        question = input(\"You: \").strip()\n",
    "        if question.lower() in ['exit', 'quit']:\n",
    "            print(\"Exiting the chat. Goodbye!\")\n",
    "            break\n",
    "        if not question:\n",
    "            # Blank input: skip the retrieval and LLM round trip\n",
    "            continue\n",
    "        answer = chat_qa_chain.run(question)\n",
    "        print(f\"AI: {answer}\\n\")\n",
    "\n",
    "# Run the chat system\n",
    "chat_system()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now, I want to add a prompt template"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Welcome to the CLEC16A Chat System! Type 'exit' to quit.\n",
      "AI (Prompt): Sure, I can help with that. 
Please go ahead with your questions.\n", "\n", "AI: The article investigates the role of an internal intrinsically disordered protein region (IDPR) within the CLEC16A protein, which is an E3 ubiquitin ligase involved in mitochondrial quality control through mitophagy. CLEC16A forms a complex with other proteins, RNF41 and USP8, to regulate mitochondrial health. The study highlights that the internal IDPR of CLEC16A is crucial for the protein's function and turnover. It is essential for the binding and ubiquitination of RNF41, which promotes the stability and assembly of the CLEC16A–RNF41–USP8 complex. Disruption of this IDPR prevents CLEC16A turnover and destabilizes the mitophagy complex. The presence of the IDPR in CLEC16A was confirmed using NMR and CD spectroscopy. This research suggests that targeting the IDPR could improve mitochondrial health in diseases associated with CLEC16A, such as diabetes, cardiovascular disease, and multiple sclerosis.\n", "\n", "AI: Based on the document's content, here are three benefits related to mitochondrial quality control facilitated by CLEC16A:\n", "\n", "1. **Mitophagy and Mitochondrial Health**: CLEC16A, as an E3 ubiquitin ligase, regulates mitochondrial quality control through the process of mitophagy, which eliminates damaged mitochondria. This helps maintain mitochondrial health.\n", "\n", "2. **Tripartite Complex Formation**: CLEC16A forms a complex with RNF41 and USP8, which together regulate the activity of the mitophagy regulator PRKN/Parkin. This complex plays a crucial role in maintaining mitochondrial function.\n", "\n", "3. **Disease Prevention and Cellular Function**: Proper functioning of CLEC16A in mitochondrial quality control is crucial for preventing cellular dysfunction and diseases such as diabetes, cardiovascular disease, and multiple sclerosis, as it is associated with over 20 human diseases. 
This highlights the importance of maintaining mitochondrial integrity and function in various cell types, including pancreatic β-cells, sensory neurons, and immune cells.\n",
      "\n",
      "AI: I'm sorry, but I don't have access to real-time information, including current weather conditions. I recommend checking a weather app or website for the most up-to-date information.\n",
      "\n",
      "AI: Goodbye! If you have any more questions in the future, feel free to ask.\n",
      "\n",
      "Exiting the chat. Goodbye!\n"
     ]
    }
   ],
   "source": [
    "from langchain.prompts import PromptTemplate\n",
    "\n",
    "# Define the initial system prompt\n",
    "initial_prompt = (\n",
    "    \"You are an AI assistant specializing in CLEC16A-related research, focusing on mitochondrial quality control, \"\n",
    "    \"the role of intrinsically disordered protein regions, and disease implications. \"\n",
    "    \"Answer the following questions based on the document's content.\"\n",
    ")\n",
    "\n",
    "# Put the instructions into the answer-generation prompt rather than into the query:\n",
    "# the retriever then embeds only the user's question, and no LLM call is spent on\n",
    "# sending the instructions by themselves.\n",
    "qa_prompt = PromptTemplate(\n",
    "    template=initial_prompt + \"\\n\\nContext:\\n{context}\\n\\nQuestion: {question}\\nAnswer:\",\n",
    "    input_variables=[\"context\", \"question\"],\n",
    ")\n",
    "\n",
    "prompted_qa_chain = RetrievalQA.from_chain_type(\n",
    "    llm=chat_llm,\n",
    "    retriever=vector_store.as_retriever(search_kwargs={\"k\": 3}),\n",
    "    chain_type_kwargs={\"prompt\": qa_prompt},\n",
    ")\n",
    "\n",
    "# Define the chat function with prompt (replaces the earlier chat_system definition)\n",
    "def chat_system():\n",
    "    \"\"\"Run an interactive chat loop whose answers follow `initial_prompt`.\n",
    "\n",
    "    Reads questions from standard input, sends each one to `prompted_qa_chain`,\n",
    "    and prints the answer. Typing 'exit' or 'quit' ends the loop.\n",
    "    \"\"\"\n",
    "    print(\"Welcome to the CLEC16A Chat System! Type 'exit' to quit.\")\n",
    "\n",
    "    # Start the chat loop\n",
    "    while True:\n",
    "        # Strip surrounding whitespace so input such as ' exit ' still ends the session\n",
    "        question = input(\"You: \").strip()\n",
    "        if question.lower() in ['exit', 'quit']:\n",
    "            print(\"Exiting the chat. Goodbye!\")\n",
    "            break\n",
    "        if not question:\n",
    "            # Blank input: skip the retrieval and LLM round trip\n",
    "            continue\n",
    "        answer = prompted_qa_chain.run(question)\n",
    "        print(f\"AI: {answer}\\n\")\n",
    "\n",
    "# Run the chat system\n",
    "chat_system()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}