{ "cells": [ { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from langchain_community.document_loaders import PyPDFLoader\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import getpass\n", "import os\n", "from dotenv import load_dotenv\n", "\n", "# Pull settings from a local .env file into the process environment, if one exists.\n", "load_dotenv()\n", "\n", "# os.environ only accepts str values, so assigning a missing key (None) raises TypeError.\n", "# Prompt for the key instead when it is not already configured.\n", "if not os.environ.get(\"OPENAI_API_KEY\"):\n", "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API key: \")\n", "\n", "from langchain_openai import ChatOpenAI\n", "\n", "llm = ChatOpenAI(model=\"gpt-4o\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from langchain_core.vectorstores import InMemoryVectorStore\n", "from langchain_openai import OpenAIEmbeddings\n", "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'input': 'What is this paper about?',\n", " 'context': [Document(id='de54a167-b052-4340-8c9d-c96f3b20b1c8', metadata={'source': '../data/main.pdf', 'page': 12}, page_content='Dunnett’s post hoc multiple comparisons test. A 5% sig-\\nnificance level was used for all statistical tests. All statis-\\ntical analysis was performed using Prism software\\n(GraphPad software, LLC).\\nData availability\\nAll data are contained within the manuscript.\\nSupporting information—This article contains supporting informa-\\ntion (62).\\nAcknowledgments—Recombinant protein for biophysical studies\\nreported in this publication was generated with supported from the\\nUniversity of Michigan Center for Structural Biology (CSB). The\\nCSB acknowledges support from the U-M Life Sciences Institute,\\nthe U-M Rogel Cancer Center, the U-M Medical School Endow-\\nment for Basic Sciences, and grants from the NIH. 
We thank the\\nUniversity of Michigan BioNMR Core fand for assistance per-\\nforming, analyzing, and interpreting NMR studies. The University\\nof Michigan BioNMR Core is supported by the U-M College of\\nLiterature, Sciences and Arts, Life Sciences Institute, College of'),\n", " Document(id='6a25243d-4b08-4b3d-8db6-d900d6a616ed', metadata={'source': '../data/main.pdf', 'page': 10}, page_content='PAQDVPRSSAKPSIRCFIKPTETLERSLEMNKHKGKKRM\\nQKRPNYKNVGEEEDEERGSAEDAQEDAEKTKGTEGGSKS\\nMKTSGEREEIEMVIMKLGKLSEVAAAGTSVQEQNTTDEE\\nKSAATNSEN\\nShuffle IDPR:\\nMARDKMESNNKTSACSEITGEPETQASREQKVDESEQA\\nEKTDGPNDEMSEAIVAKVLRKNVKKPFKKTREEELLKMN\\nMGASRITNQHKKAYSSLGEEPIGGEARRKGESAPETEKDG\\nEETGSQSTV\\nIDPR K-to-R:\\nPAQDVPRSSARPSIRCFIRPTETLERSLEMNRHRGRRRM\\nQRRPNYRNVGEEEDEERGSAEDAQEDAERTRGTEGGSRS\\nMRTSGEREEIEMVIMRLGRLSEVAAAGTSVQEQNTTDEE\\nRSAATNSEN\\nShuffle IDPR retain K:\\nRNF41 regulates CLEC16A stabilityvia an IDPR\\nJ. Biol. Chem. (2023) 299(4) 103057 11'),\n", " Document(id='eda80cfe-dd68-40e5-84e1-b64db1e9e93d', metadata={'source': '../data/main.pdf', 'page': 12}, page_content='forming, analyzing, and interpreting NMR studies. The University\\nof Michigan BioNMR Core is supported by the U-M College of\\nLiterature, Sciences and Arts, Life Sciences Institute, College of\\nPharmacy, and the Medical School along with the U-M Biosciences\\nInitiative. We thank Drs. H. Popelka, P. Arvan, D. Fingar, and\\nmembers of the Soleimanpour laboratory for helpful advice.\\nAuthor contributions—M. A. G. and S. A. S. conceptualization; M.\\nA. G., J. Z., B. C., M. P. V., N. X., V. S., and D. S. investigation;\\nM. A. G., M. P. V., and D. S. formal analysis; M. A. G. data curation;\\nM. A. G. and S. A. S. writing– original draft; M. A. G. and S. A. S.\\nfunding acquisition; J. Z., B. C., M. P. V., V. S., N. A. K., D. S., D. J.\\nK., S. S., and S. A. S. writing– review and editing; N. A. K., D. S., D. J.\\nK., S. S., and S. A. S. resources; N. A. K., D. S., D. J. K., S. S., and S. A.\\nS. 
supervision; S. A. S. visualization.\\nFunding and additional information—M. A. G. was supported by'),\n", " Document(id='fa72767a-90cc-4ca0-9889-d4eaf81549e5', metadata={'source': '../data/main.pdf', 'page': 12}, page_content='Protein Sci. 25, 1767–1785\\n20. Guharoy, M., Bhowmick, P., and Tompa, P. (2016) Design principles\\ninvolving protein disorder facilitate specific substrate selection and degra-\\ndation by the ubiquitin-proteasome system.J. Biol. Chem.291,6 7 2 3–6731\\n21. Bhowmick, P., Pancsa, R., Guharoy, M., and Tompa, P. (2013) Functional\\ndiversity and structural disorder in the human ubiquitination pathway.\\nPLoS One8, e65443\\n22. Tunyasuvunakool, K., Adler, J., Wu, Z., Green, T., Zielinski, M.,/C20Zídek, A.,\\net al. (2021) Highly accurate protein structure prediction for the human\\nproteome. Nature 596, 590–596\\n23. Varadi, M., Anyango, S., Deshpande, M., Nair, S., Natassia, C., Yorda-\\nnova, G., et al. (2022) AlphaFold protein structure database: Massively\\nRNF41 regulates CLEC16A stabilityvia an IDPR\\nJ. Biol. Chem. (2023) 299(4) 103057 13')],\n", " 'answer': 'This paper investigates the regulation of CLEC16A stability by RNF41 through an intrinsically disordered protein region (IDPR). It explores the mechanisms by which protein disorder contributes to substrate selection and degradation processes in the ubiquitin-proteasome system. The study involves biophysical and structural analyses supported by various University of Michigan resources.'}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from langchain.chains import create_retrieval_chain\n", "from langchain.chains.combine_documents import create_stuff_documents_chain\n", "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "# Index the PDF and build the retriever used below; no other cell defines `retriever`,\n", "# so without this step the cell raises NameError on a fresh kernel.\n", "PDF_PATH = \"../data/main.pdf\"\n", "docs = PyPDFLoader(PDF_PATH).load()\n", "splits = text_splitter.split_documents(docs)\n", "vectorstore = InMemoryVectorStore.from_documents(documents=splits, embedding=OpenAIEmbeddings())\n", "retriever = vectorstore.as_retriever()\n", "\n", "# System instructions; the retrieved chunks are substituted for {context}.\n", "system_prompt = (\n", "    \"You are an assistant for question-answering tasks. \"\n", "    \"Use the following pieces of retrieved context to answer \"\n", "    \"the question. 
If you don't know the answer, say that you \"\n", "    \"don't know. Use three sentences maximum and keep the \"\n", "    \"answer concise.\"\n", "    \"\\n\\n\"\n", "    \"{context}\"\n", ")\n", "\n", "prompt = ChatPromptTemplate.from_messages(\n", "    [\n", "        (\"system\", system_prompt),\n", "        (\"human\", \"{input}\"),\n", "    ]\n", ")\n", "\n", "\n", "# Combine the retriever with the LLM answering step into a single RAG chain.\n", "question_answer_chain = create_stuff_documents_chain(llm, prompt)\n", "rag_chain = create_retrieval_chain(retriever, question_answer_chain)\n", "\n", "results = rag_chain.invoke({\"input\": \"What is this paper about?\"})\n", "\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "This paper investigates the regulation of CLEC16A stability by RNF41 through an intrinsically disordered protein region (IDPR). It explores the mechanisms by which protein disorder contributes to substrate selection and degradation processes in the ubiquitin-proteasome system. The study involves biophysical and structural analyses supported by various University of Michigan resources.\n" ] } ], "source": [ "print(results['answer'])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Question: What role does CLEC16A play in mitochondrial quality control, and why is this important for cellular health?\n", "Answer: CLEC16A is an E3 ubiquitin ligase that regulates mitochondrial quality control by facilitating mitophagy, a process that eliminates damaged mitochondria. It forms a complex with RNF41 and USP8 to control the activity of the mitophagy regulator PRKN/Parkin. 
This function is crucial for cellular health as it maintains mitochondrial integrity, which is vital for energy production and preventing cellular damage, especially in cell types like pancreatic β-cells, sensory neurons, and immune cells.\n", "\n", "Question: How does the intrinsically disordered protein region (IDPR) within CLEC16A impact its stability and interaction with RNF41?\n", "Answer: The intrinsically disordered protein region (IDPR) within CLEC16A is crucial for its stability and interaction with RNF41, as it regulates CLEC16A turnover and is the site where RNF41 acts to destabilize CLEC16A. The IDPR is essential for the enzymatic function and molecular interactions of CLEC16A, including the assembly of the CLEC16A–RNF41–USP8 mitophagy complex. Loss of this IDPR impairs CLEC16A's ability to ubiquitinate RNF41, affecting the overall stability and function of the protein complex.\n", "\n", "Question: What is the significance of the CLEC16A-RNF41 complex in the regulation of mitophagy?\n", "Answer: The CLEC16A-RNF41 complex is significant in the regulation of mitophagy as it promotes the assembly and stability of the tripartite mitophagy complex, which includes CLEC16A, RNF41, and USP8. This complex plays a crucial role in mitochondrial quality control, and the loss of CLEC16A impairs mitochondrial health and function in various cell types. Additionally, RNF41 might have a more central role in mitophagy than previously thought, as it can lead to the degradation of key mitophagy regulators.\n", "\n", "Question: How does RNF41 influence the turnover of CLEC16A, and what are the molecular mechanisms involved?\n", "Answer: RNF41 influences the turnover of CLEC16A by ubiquitinating and destabilizing it, an action that requires RNF41's ubiquitin ligase activity. Overexpression of RNF41 decreases CLEC16A protein levels and increases its ubiquitination, but a ligase-dead RNF41 mutant does not affect CLEC16A levels. 
The internal IDPR of CLEC16A is crucial for this process, as altering this region prevents RNF41 from reducing CLEC16A levels.\n", "\n", "Question: Which diseases are associated with dysregulation of CLEC16A, and what implications does this have for potential treatments?\n", "Answer: Dysregulation of CLEC16A is associated with over 20 human diseases, including diabetes, cardiovascular disease, stroke, multiple sclerosis, arthritis, Crohn's disease, and other inflammatory diseases. The implications for potential treatments involve targeting the intrinsic disordered protein region (IDPR) within CLEC16A to prevent its turnover, thereby increasing protein levels to enhance its function. This approach could potentially treat or prevent diseases associated with reduced CLEC16A levels by improving mitochondrial health through enhanced mitophagy.\n", "\n", "Question: What techniques were used in this study to confirm the presence and function of the IDPR in CLEC16A?\n", "Answer: The study used in silico computational techniques, including AlphaFold for protein structure prediction and IUPred2 for disorder prediction, to identify the putative internal IDPR in CLEC16A. Experimentally, they used NMR spectroscopy to examine the structural conformation of the IDPR and Western blot analysis to assess the impact of the IDPR on protein stability. They also introduced mutations and compared protein levels and stability in HEK293T cells to determine the function of the IDPR in regulating CLEC16A stability.\n", "\n", "Question: How does the disruption of CLEC16A’s IDPR affect its ubiquitination and degradation?\n", "Answer: Disruption of CLEC16A's internal IDPR prevents its turnover and reduces self-ubiquitination in vitro. The IDPR promotes CLEC16A destabilization, and its absence results in higher stability and protein levels compared to the wild-type CLEC16A. 
Additionally, RNF41 promotes the ubiquitination and destabilization of CLEC16A, and this process is impaired when the IDPR is disrupted.\n", "\n", "Question: Why might the IDPR in CLEC16A be considered a therapeutic target for diseases related to mitochondrial dysfunction?\n", "Answer: The IDPR in CLEC16A is considered a therapeutic target because it regulates CLEC16A turnover, and its destabilization can impact mitochondrial health. By blocking access to this region, it is possible to prevent CLEC16A turnover, potentially increasing protein levels and enhancing its function. This could help treat or prevent diseases associated with mitochondrial dysfunction.\n", "\n", "Question: How do mutations within CLEC16A’s IDPR affect the protein's ability to form complexes with RNF41 and USP8?\n", "Answer: Mutations within CLEC16A's IDPR impair the protein's ability to bind and ubiquitinate RNF41, which is essential for forming the CLEC16A–RNF41–USP8 mitophagy complex. Truncating or shuffling the residues within the IDPR reduces its binding to RNF41 and disrupts the assembly of the tripartite complex. This suggests that the integrity and specific sequence of the IDPR are crucial for the proper interaction and complex formation with RNF41 and USP8.\n", "\n", "Question: What did biophysical analyses reveal about the structural properties of CLEC16A’s IDPR, and how do these properties contribute to its function?\n", "Answer: Biophysical analyses revealed that CLEC16A's internal IDPR is predicted to lack secondary structure and is enriched in charged, polar residues like glutamic acid and lysine, which promote intrinsic disorder. These structural properties contribute to the protein's function by regulating CLEC16A turnover, as lysine residues in the IDPR are essential for this process. 
The IDPR's role in turnover is significant because it affects CLEC16A stability and function, which is pertinent to its involvement in human diseases.\n", "\n" ] } ], "source": [ "# List of questions based on the PDF content for testing\n", "questions = [\n", "    \"What role does CLEC16A play in mitochondrial quality control, and why is this important for cellular health?\",\n", "    \"How does the intrinsically disordered protein region (IDPR) within CLEC16A impact its stability and interaction with RNF41?\",\n", "    \"What is the significance of the CLEC16A-RNF41 complex in the regulation of mitophagy?\",\n", "    \"How does RNF41 influence the turnover of CLEC16A, and what are the molecular mechanisms involved?\",\n", "    \"Which diseases are associated with dysregulation of CLEC16A, and what implications does this have for potential treatments?\",\n", "    \"What techniques were used in this study to confirm the presence and function of the IDPR in CLEC16A?\",\n", "    \"How does the disruption of CLEC16A’s IDPR affect its ubiquitination and degradation?\",\n", "    \"Why might the IDPR in CLEC16A be considered a therapeutic target for diseases related to mitochondrial dysfunction?\",\n", "    \"How do mutations within CLEC16A’s IDPR affect the protein's ability to form complexes with RNF41 and USP8?\",\n", "    \"What did biophysical analyses reveal about the structural properties of CLEC16A’s IDPR, and how do these properties contribute to its function?\"\n", "]\n", "\n", "# Loop through each question, invoke the RAG chain, and print each answer\n", "for question in questions:\n", "    result = rag_chain.invoke({\"input\": question})\n", "    print(f\"Question: {question}\")\n", "    # Single quotes inside the f-string: reusing the outer double quote is a SyntaxError before Python 3.12.\n", "    print(f\"Answer: {result['answer']}\\n\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 
3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.0" } }, "nbformat": 4, "nbformat_minor": 2 }