In [1]:
import os
import pinecone
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings # Adjust to your embedding model
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI # Replace with the LLM of your choice
from dotenv import load_dotenv
load_dotenv()



 from tqdm.autonotebook import tqdm


True

In [4]:
# Initialize Pinecone

pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "clec16a-study"


In [7]:
spec = pinecone.ServerlessSpec(cloud='aws',region="us-east-1")

# Create the index if it doesn't exist
if index_name not in pc.list_indexes():
 pc.create_index(index_name, dimension=1536, spec=spec) # Adjust dimension as needed
# Connect to the index
index = pc.Index(index_name)

In [8]:
import openai 
openai.api_key = os.getenv("OPENAI_API_KEY")

In [9]:
MODEL = 'text-embedding-ada-002' 
pdf_path = "../data/main.pdf" # Replace with your actual PDF path
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Initialize embedding model
embedding_model = OpenAIEmbeddings(model=MODEL)

 embedding_model = OpenAIEmbeddings(model=MODEL)


In [12]:
# Define function to create or connect to an existing index
def create_or_connect_index(index_name, dimension):
 spec = pinecone.ServerlessSpec(cloud='aws',region="us-east-1")
 if index_name not in pc.list_indexes().names():
 pc.create_index(
 name=index_name,
 dimension=dimension,
 metric='cosine', # You can use 'dotproduct' or other metrics if needed
 spec=spec
 )
 return pc.Index(index_name)

In [None]:
sample_embedding = embedding_model.embed_query("Test")
index = create_or_connect_index(index_name, dimension=len(sample_embedding))
print(f'sample embedding: {len(sample_embedding)}')

sampleembedding: 1536


In [23]:
for i, doc in enumerate(documents):
 embedding = embedding_model.embed_query(doc.page_content)
 pinecone_id = f"page-{i}"
 metadata = {"text": doc.page_content} # Include a 'text' snippet in metadata
 index.upsert([(pinecone_id, embedding, metadata)]) # Upsert embedding with metadata

In [24]:
from langchain_openai import ChatOpenAI


vector_store = Pinecone.from_existing_index(index_name=index_name, embedding=embedding_model)

# Set up RetrievalQA chain for querying using a chat-based model for better responses
llm = ChatOpenAI(model="gpt-4", openai_api_key=openai.api_key) # Replace with the chat model of choice
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vector_store.as_retriever())

In [25]:
# Define the list of questions
questions = [
 "What role does CLEC16A play in mitochondrial quality control, and why is this important for cellular health?",
 "How does the intrinsically disordered protein region (IDPR) within CLEC16A impact its stability and interaction with RNF41?",
 "What is the significance of the CLEC16A-RNF41 complex in the regulation of mitophagy?",
 # Add more questions as needed
]

# Query each question and print the answers
for question in questions:
 answer = qa_chain.run(question)
 print(f"Question: {question}")
 print(f"Answer: {answer}\n")


Question: What role does CLEC16A play in mitochondrial quality control, and why is this important for cellular health?
Answer: CLEC16A is an E3 ubiquitin ligase that plays a significant role in mitochondrial quality control through a process called mitophagy. Mitophagy is a type of autophagy where damaged mitochondria are selectively eliminated from the cell. CLEC16A regulates mitophagy by forming a tripartite complex with another E3 ubiquitin ligase, RNF41, and a ubiquitin-specific peptidase, USP8. This complex controls the activity of the mitophagy regulator PRKN/Parkin. 

Maintaining mitochondrial quality control is crucial for cellular health as damaged mitochondria can lead to a decrease in energy production, increase in harmful reactive oxygen species, and potential induction of cell death. Therefore, the role of CLEC16A in mitochondrial quality control is important for maintaining cellular health and function. It is also noteworthy that the gene for CLEC16A is associated with ov

## Now, turn it into a chat

In [30]:
chat_llm = ChatOpenAI(model="gpt-4o", openai_api_key=openai.api_key)
chat_qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vector_store.as_retriever(search_kwargs={"k" : 3}))


In [32]:
def chat_system():
 print("Welcome to the CLEC16A Chat System! Ask any question, or type 'exit' to quit.")
 while True:
 question = input("You: ")
 if question.lower() in ['exit', 'quit']:
 print("Exiting the chat. Goodbye!")
 break
 answer = chat_qa_chain.run(question)
 print(f"AI: {answer}\n")

# Run the chat system
chat_system()


Welcome to the CLEC16A Chat System! Ask any question, or type 'exit' to quit.
AI: The document is a scientific article discussing research on the regulation of CLEC16A stability through an intrinsically disordered protein region (IDPR) and its implications in various diseases. It includes detailed methodologies such as statistical analysis using Prism software, data availability, acknowledgments, and contributions from various authors supported by institutions like the University of Michigan and the NIH. The research explores genetic associations with diseases like type 1 diabetes, multiple sclerosis, and myocardial infarction. The document also discusses experimental procedures including protein purification, nuclear magnetic resonance (NMR), circular dichroism, and cell culture techniques. Additionally, it highlights the significance of intrinsically disordered proteins in cellular functions and diseases. The article contains references to previous studies and provides a comprehensiv

### Now, i want to add a prompt template

In [33]:
# Define the initial system prompt
initial_prompt = (
 "You are an AI assistant specializing in CLEC16A-related research, focusing on mitochondrial quality control, "
 "the role of intrinsically disordered protein regions, and disease implications. "
 "Answer the following questions based on the document's content."
)

# Define the chat function with prompt
def chat_system():
 print("Welcome to the CLEC16A Chat System! Type 'exit' to quit.")
 
 # Send the initial prompt
 response = chat_qa_chain.run(initial_prompt)
 print(f"AI (Prompt): {response}\n")

 # Start the chat loop
 while True:
 question = input("You: ")
 if question.lower() in ['exit', 'quit']:
 print("Exiting the chat. Goodbye!")
 break
 # Prepend initial prompt to each question
 full_prompt = f"{initial_prompt}\n\n{question}"
 answer = chat_qa_chain.run(full_prompt)
 print(f"AI: {answer}\n")

# Run the chat system
chat_system()


Welcome to the CLEC16A Chat System! Type 'exit' to quit.
AI (Prompt): Sure, I can help with that. Please go ahead with your questions.

AI: The article investigates the role of an internal intrinsically disordered protein region (IDPR) within the CLEC16A protein, which is an E3 ubiquitin ligase involved in mitochondrial quality control through mitophagy. CLEC16A forms a complex with other proteins, RNF41 and USP8, to regulate mitochondrial health. The study highlights that the internal IDPR of CLEC16A is crucial for the protein's function and turnover. It is essential for the binding and ubiquitination of RNF41, which promotes the stability and assembly of the CLEC16A–RNF41–USP8 complex. Disruption of this IDPR prevents CLEC16A turnover and destabilizes the mitophagy complex. The presence of the IDPR in CLEC16A was confirmed using NMR and CD spectroscopy. This research suggests that targeting the IDPR could improve mitochondrial health in diseases associated with CLEC16A, such as diabe