In [1]:
## Getting to Main directory
import os
os.chdir("../")

In [2]:
# loading secret key
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from llama_index.core import VectorStoreIndex
from llama_index.core import ServiceContext
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
import google.generativeai as genai
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
gemini_api_key=os.getenv("GEMINI_API_KEY")
pinecone_api_key=os.getenv("PINECONE_API_KEY")

### Data ingestion - Taking pdf documents and Cleaning and Transforming  Data into vector index

In [6]:
documents=SimpleDirectoryReader("Data").load_data()

In [7]:
len(documents)

34

In [8]:
documents

[Document(id_='2c29fa85-a1fa-479c-8cdc-6c366889be7e', embedding=None, metadata={'page_label': '1', 'file_name': 'peft.pdf', 'file_path': 'e:\\projects\\AI research assistant\\Data\\peft.pdf', 'file_type': 'application/pdf', 'file_size': 562785, 'creation_date': '2024-03-30', 'last_modified_date': '2024-03-30'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Few-Shot Parameter-Efﬁcient Fine-Tuning is Better\nand Cheaper than In-Context Learning\nHaokun Liu∗Derek Tam∗Mohammed Muqeeth∗\nJay Mohta Tenghao Huang Mohit Bansal Colin Raffel\nDepartment of Computer Science\nUniversity of North Carolina at Chapel Hill\n{haokunl,dtredsox,muqeeth,craffel}@cs.unc.edu\nAbstract\nFew-shot in-context learning (ICL) enables pre-trained language models to per-\nform a pr

In [9]:
documents[0].text

'Few-Shot Parameter-Efﬁcient Fine-Tuning is Better\nand Cheaper than In-Context Learning\nHaokun Liu∗Derek Tam∗Mohammed Muqeeth∗\nJay Mohta Tenghao Huang Mohit Bansal Colin Raffel\nDepartment of Computer Science\nUniversity of North Carolina at Chapel Hill\n{haokunl,dtredsox,muqeeth,craffel}@cs.unc.edu\nAbstract\nFew-shot in-context learning (ICL) enables pre-trained language models to per-\nform a previously-unseen task without any gradient-based training by feeding a\nsmall number of training examples as part of the input. ICL incurs substantial\ncomputational, memory, and storage costs because it involves processing all of the\ntraining examples every time a prediction is made. Parameter-efﬁcient ﬁne-tuning\n(PEFT) (e.g. adapter modules, prompt tuning, sparse update methods, etc.) offers\nan alternative paradigm where a small set of parameters are trained to enable a\nmodel to perform the new task. In this paper, we rigorously compare few-shot\nICL and PEFT and demonstrate that the 

In [10]:
# Clean up our Documents' content
import re

def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    :param content: Text input.
    
    :return: Cleaned version of original text input.
    """

    # Fix hyphenated words broken by newline
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # Remove specific unwanted patterns and characters
    unwanted_patterns = [
        "\\n", "  —", "——————————", "—————————", "—————",
        r'\\u[\dA-Fa-f]{4}', r'\uf075', r'\uf0b7'
    ]
    for pattern in unwanted_patterns:
        content = re.sub(pattern, "", content)

    # Fix improperly spaced hyphenated words and normalize whitespace
    content = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', content)
    content = re.sub(r'\s+', ' ', content)

    return content

# Call function
cleaned_docs = []
for d in documents: 
    cleaned_text = clean_up_text(d.text)
    d.text = cleaned_text
    cleaned_docs.append(d)


In [11]:
# Inspect output
cleaned_docs[0].get_content()


'Few-Shot Parameter-Efﬁcient Fine-Tuning is Betterand Cheaper than In-Context LearningHaokun Liu∗Derek Tam∗Mohammed Muqeeth∗Jay Mohta Tenghao Huang Mohit Bansal Colin RaffelDepartment of Computer ScienceUniversity of North Carolina at Chapel Hill{haokunl,dtredsox,muqeeth,craffel}@cs.unc.eduAbstractFew-shot in-context learning (ICL) enables pre-trained language models to perform a previously-unseen task without any gradient-based training by feeding asmall number of training examples as part of the input. ICL incurs substantialcomputational, memory, and storage costs because it involves processing all of thetraining examples every time a prediction is made. Parameter-efﬁcient ﬁne-tuning(PEFT) (e.g. adapter modules, prompt tuning, sparse update methods, etc.) offersan alternative paradigm where a small set of parameters are trained to enable amodel to perform the new task. In this paper, we rigorously compare few-shotICL and PEFT and demonstrate that the latter offers better accuracy as 

In [12]:
cleaned_docs[0].metadata


{'page_label': '1',
 'file_name': 'peft.pdf',
 'file_path': 'e:\\projects\\AI research assistant\\Data\\peft.pdf',
 'file_type': 'application/pdf',
 'file_size': 562785,
 'creation_date': '2024-03-30',
 'last_modified_date': '2024-03-30'}

In [13]:
len(documents)

34

### Configuring Gemini model and GeminiEmbedding

In [14]:
genai.configure(api_key=gemini_api_key)

In [15]:
gemini_embed_model = GeminiEmbedding(model_name="models/embedding-001")

In [17]:
# Setting Tempreture to 0.3 for getting low risk results

model = Gemini(models="gemini-pro",api_key=gemini_api_key,temperature=0.3)

In [18]:
import os

from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.ingestion import IngestionPipeline

# This will be the model we use both for Node parsing and for vectorization
embed_model =gemini_embed_model

# Define the initial pipeline
pipeline = IngestionPipeline(
    transformations=[
        SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95, 
            embed_model=embed_model,
            ),
        embed_model,
        ],
    )


#### Setting up the `Settings` module to have the informantion about our llm and embedding models and also chunk size distribution of document files

#### As LLMPredictor is depriciated, we are using Settings.llm to define our base LLM Model

In [19]:
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter


Settings.llm = model
Settings.embed_model = gemini_embed_model
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 512
Settings.context_window = 3900

### Using Pinecone as our vector database taking the index to save them 

In [20]:
from llama_index.vector_stores.pinecone import PineconeVectorStore

In [21]:
from pinecone import Pinecone

pc = Pinecone(api_key=pinecone_api_key)
pinecone_index = pc.Index("ai-research-assistant") # `ai-research-assistant` is the index name

### Nowing indexing and upserting indexes to pinecone

In [22]:
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [23]:
%%capture 
# Our pipeline with the addition of our PineconeVectorStore
pipeline = IngestionPipeline(
    transformations=[
        SemanticSplitterNodeParser(
            buffer_size=1,
            breakpoint_percentile_threshold=95, 
            embed_model=embed_model,
            ),
        embed_model,
        ],
        vector_store=vector_store  # Our new addition
    )

# Now we run our pipeline!
pipeline.run(documents=cleaned_docs)


In [24]:
pinecone_index.describe_index_stats()

# >>> {'dimension': 1536,
# >>> 'index_fullness': 0.0,
# >>> 'namespaces': {'': {'vector_count': 46}},
# >>> 'total_vector_count': 46}


{'dimension': 768,
 'index_fullness': 0.00176,
 'namespaces': {'': {'vector_count': 176}},
 'total_vector_count': 176}

### Simply querying from the index

In [26]:
# from llama_index import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever

# Instantiate VectorStoreIndex object from your vector_store object
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Grab 5 search results
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)

# Query vector DB
answer = retriever.retrieve('generate a summary based on the information you have')

# Inspect results
print([i.get_content() for i in answer])

# >>> ['some relevant search result 1', 'some relevant search result 1'...]


['If your candidate doesn’t know the answer to the above questions and you’re hiring for a ML intern position, then they’re obviously not a great fit.', 'correct answer. Rank classiﬁcation evaluation is compatible with both classiﬁcation and multiplechoice tasks. Since model performance can vary signiﬁcantly depending on the prompt template used,we report the median accuracy across all prompt templates from P3 and across few-shot data subsetsfor each dataset. For all datasets, we report the accuracy on the test set or validation set when the testlabels are not public (e.g. SuperGLUE datasets). ', 'correct answer. Rank classiﬁcation evaluation is compatible with both classiﬁcation and multiplechoice tasks. Since model performance can vary signiﬁcantly depending on the prompt template used,we report the median accuracy across all prompt templates from P3 and across few-shot data subsetsfor each dataset. For all datasets, we report the accuracy on the test set or validation set when the t

### Adding proper prompt templates for the query engine

In [27]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import PromptTemplate
from llama_index.core.postprocessor import SimilarityPostprocessor


# Pass in your retriever from above, which is configured to return the top 5 results
query_engine = RetrieverQueryEngine(retriever=retriever)

postprocessor=SimilarityPostprocessor(similarity_cutoff=0.70)

query_engine=RetrieverQueryEngine(retriever=retriever,
                                  node_postprocessors=[postprocessor])

# Now you query:
llm_query = query_engine.query('generate a summary based on the information you have')
# llm_query = query_engine.query('tell me about ML questions')

llm_query.response

'Empty Response'

In [28]:
def get_full_prompt_template(cur_instr: str, prompt_tmpl):
    tmpl_str = prompt_tmpl.get_template()
    new_tmpl_str = cur_instr + "\n" + tmpl_str
    new_tmpl = PromptTemplate(new_tmpl_str)
    return new_tmpl

In [29]:
QA_PROMPT_KEY = "response_synthesizer:text_qa_template"

# get the base qa prompt (without any instruction prefix)
base_qa_prompt = query_engine.get_prompts()[QA_PROMPT_KEY]


initial_instr = """\
You are a QA assistant specifically designed to help in RESEARCH WORK as a RESEARCH ASSISTANT.
Context information is below. Given the context information and not prior knowledge, \
answer the query. \
"""

# this is the "initial" prompt template
# implicitly used in the first stage of the loop during prompt optimization
# here we explicitly capture it so we can use it for evaluation
old_qa_prompt = get_full_prompt_template(initial_instr, base_qa_prompt)


In [30]:
old_qa_prompt

PromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template='You are a QA assistant specifically designed to help in RESEARCH WORK as a RESEARCH ASSISTANT.\nContext information is below. Given the context information and not prior knowledge, answer the query. \nContext information is below.\n---------------------\n{context_str}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {query_str}\nAnswer: ')

In [31]:
# Use the custom prompt when querying
query_engine = vector_index.as_query_engine(text_qa_template=old_qa_prompt)
response = query_engine.query("generate a summary based on the information you have")
# response = query_engine.query('tell me about Few-shot in-context learning')
print(response)


I apologize, but the provided context does not contain sufficient information to generate a meaningful summary.


In [32]:
from llama_index.core.response.pprint_utils import pprint_response
pprint_response(response,show_source=True)
print(response)

Final Response: I apologize, but the provided context does not contain
sufficient information to generate a meaningful summary.
______________________________________________________________________
Source Node 1/2
Node ID: 444a52bb-805b-4698-9e41-6817dcfe1fa1
Similarity: 0.481024384
Text: If your candidate doesn’t know the answer to the above questions
and you’re hiring for a ML intern position, then they’re obviously not
a great fit.
______________________________________________________________________
Source Node 2/2
Node ID: 6c5664e4-5e90-491d-8b80-857852760395
Similarity: 0.492449284
Text: correct answer. Rank classiﬁcation evaluation is compatible with
both classiﬁcation and multiplechoice tasks. Since model performance
can vary signiﬁcantly depending on the prompt template used,we report
the median accuracy across all prompt templates from P3 and across
few-shot data subsetsfor each dataset. For all datasets, we report the
accur...
I apologize, but the provided context does not

### Correct Prompts and the Right question is important to get the desired respose

##### "generate a summary based on the information you have about peft" is a better query in this case than 'tell me about the T-Few Recipe' as in the later case the souce node is fetching irrelevant data like the table content which is apperently more similar through vector search

In [33]:
# Use the custom prompt when querying
query_engine = vector_index.as_query_engine(text_qa_template=old_qa_prompt)
# response = query_engine.query("generate a summary based on the information you have about peft")
response = query_engine.query('tell me about t few recipe')
print(response)


The provided context does not mention anything about recipes, so I cannot answer this question from the provided context.


In [34]:
from llama_index.core.response.pprint_utils import pprint_response
pprint_response(response,show_source=True)
print(response)

Final Response: The provided context does not mention anything about
recipes, so I cannot answer this question from the provided context.
______________________________________________________________________
Source Node 1/2
Node ID: 401a275e-9caa-44dd-bbab-3b46c526adc7
Similarity: 0.553759813
Text: Interview Questions to Ask a ML intern| Xobin [Downloaded]8
Prepared and Curated by Xobin Team
______________________________________________________________________
Source Node 2/2
Node ID: dda36b9b-cba5-4017-9baf-06326c029b8e
Similarity: 0.575024486
Text: Interview Questions to Ask a ML intern| Xobin
[Downloaded]1Interview Questions to Ask a ML intern| Xobin
[Downloaded]We at Xobin reached out to over 70+ Hiring teams to curate
the best interview questions. W e didn't stop there. We went ahead to
understand what type of answers dif ferentiated the top candidate from
the rest.
The provided context does not mention anything about recipes, so I cannot answer this question from the provided c