In [19]:
# Install the notebook's dependencies.
# `%pip` installs into the environment of the running kernel, whereas `!pip`
# runs whichever `pip` happens to be first on the shell PATH.
%pip install -U langchain-community tiktoken langchain-openai langchainhub langchain langgraph duckduckgo-search langchain-groq langchain-huggingface sentence_transformers tavily-python crawl4ai docling easyocr FlagEmbedding "chonkie[semantic]" pinecone streamlit

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
   25l   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 0.0/9.1 MB ? eta -:--:--━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.1/9.1 MB 189.4 MB/s eta 0:00:00
[?25hInstalling collected packages: streamlit
  Attempting uninstall: streamlit
    Found existing installation: streamlit 1.27.2
    Uninstalling streamlit-1.27.2:
      Successfully uninstalled streamlit-1.27.2
Successfully installed streamlit-1.41.1


In [2]:
# One-off post-install step of crawl4ai (installs the Playwright browsers, see output below).
!crawl4ai-setup
# `!export ...` runs in a throw-away subshell, so the variable never reached the
# kernel. `%env` sets it in the kernel process and in subprocesses started later.
%env PYTHONPATH=.

[36m[INIT].... → Running post-installation setup...[0m
[36m[INIT].... → Installing Playwright browsers...[0m
You are using a frozen webkit browser which does not receive updates anymore on ubuntu20.04-x64. Please update to the latest version of your operating system to test up-to-date browsers.
╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Please install them with the following command:      ║
║                                                      ║
║     sudo playwright install-deps                     ║
║                                                      ║
║ Alternatively, use apt:                              ║
║     sudo apt-get install libxslt1.1\                 ║
║         libwoff1\                                    ║
║         libwebpdemux2\                               ║
║         libenchant-2-2\                              ║
║         libhyphen0\                                  ║
║         libgle

In [3]:
from pathlib import Path
from typing import List, Union
import logging
from dataclasses import dataclass

from langchain_core.documents import Document as LCDocument
from langchain_core.document_loaders import BaseLoader
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat, ConversionStatus
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    EasyOcrOptions
)

# Configure the root logger at INFO level so INFO records show up in the cell output.
logging.basicConfig(level=logging.INFO)
# Logger used by the loader code in this cell for its warnings, errors and summary.
_log = logging.getLogger(__name__)

@dataclass
class ProcessingResult:
    """Running tally of conversion outcomes for one pass over a set of files."""
    success_count: int = 0
    failure_count: int = 0
    partial_success_count: int = 0
    # ``None`` is only a sentinel here; it is swapped for a fresh list below.
    failed_files: List[str] = None

    def __post_init__(self):
        # Give each instance its own list rather than sharing one mutable default.
        self.failed_files = [] if self.failed_files is None else self.failed_files

class MultiFormatDocumentLoader(BaseLoader):
    """Convert local files with docling and expose them as LangChain documents.

    Each input file that converts (fully or partially) becomes one
    ``LCDocument`` whose ``page_content`` is the docling markdown export.
    Missing files, failed conversions and exceptions are logged and counted
    but never abort the run.
    """

    def __init__(
        self,
        file_paths: Union[str, Path, List[Union[str, Path]]],
        enable_ocr: bool = True,
        enable_tables: bool = True
    ):
        """
        Args:
            file_paths: A single path or a list of paths (``str`` or ``Path``).
            enable_ocr: Turn on OCR in the PDF pipeline.
            enable_tables: Turn on table-structure recognition (with cell
                matching) in the PDF pipeline.
        """
        # A lone path is wrapped so the rest of the class can always iterate.
        # ``Path`` must be checked too: it is not iterable, so an unwrapped
        # Path would blow up in ``lazy_load``.
        if isinstance(file_paths, (str, Path)):
            file_paths = [file_paths]
        self._file_paths = list(file_paths)
        self._enable_ocr = enable_ocr
        self._enable_tables = enable_tables
        self._converter = self._setup_converter()

    def _setup_converter(self):
        """Build the docling ``DocumentConverter`` used for every file."""
        # These pipeline options are registered for PDF input only (see
        # ``format_options`` below).
        pipeline_options = PdfPipelineOptions(
            do_ocr=bool(self._enable_ocr),
            do_table_structure=bool(self._enable_tables),
            ocr_options=EasyOcrOptions(force_full_page_ocr=True),
        )
        if self._enable_tables:
            pipeline_options.table_structure_options.do_cell_matching = True

        return DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
                InputFormat.MD,
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                )
            },
        )

    def _convert_one(self, file_path, results):
        """Convert one file, update ``results`` and return a document or ``None``.

        ``None`` means the file was missing or docling reported a failure; both
        cases are already counted and logged here. Exceptions propagate to the
        caller, which records them as failures.
        """
        path = Path(file_path)
        if not path.exists():
            _log.warning(f"File not found: {file_path}")
            results.failure_count += 1
            results.failed_files.append(str(file_path))
            return None

        conversion_result = self._converter.convert(path)
        status = conversion_result.status
        if status not in (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS):
            results.failure_count += 1
            results.failed_files.append(str(file_path))
            _log.error(f"Failed to convert {file_path}")
            return None

        is_partial = status == ConversionStatus.PARTIAL_SUCCESS
        metadata = {
            'source': str(path),
            'file_type': path.suffix,
        }
        if is_partial:
            _log.warning(f"Partial conversion for {file_path}")
            metadata['conversion_status'] = 'partial'

        text = conversion_result.document.export_to_markdown()
        document = LCDocument(page_content=text, metadata=metadata)

        # Count the file only once the export has succeeded; otherwise an
        # export error would register the file as both success and failure.
        if is_partial:
            results.partial_success_count += 1
        else:
            results.success_count += 1
        return document

    @staticmethod
    def _log_summary(results):
        """Log the per-status totals and the list of failed files."""
        total = results.success_count + results.partial_success_count + results.failure_count
        _log.info(
            f"Processed {total} documents:\n"
            f"- Successfully converted: {results.success_count}\n"
            f"- Partially converted: {results.partial_success_count}\n"
            f"- Failed: {results.failure_count}"
        )
        if results.failed_files:
            _log.info("Failed files:")
            for file in results.failed_files:
                _log.info(f"- {file}")

    def lazy_load(self):
        """Convert the configured files one by one and yield LangChain documents.

        Yields:
            LCDocument: one per file that converted fully or partially, with
            ``source`` and ``file_type`` metadata (plus
            ``conversion_status='partial'`` for partial conversions).

        The summary is logged only after the generator has been exhausted.
        """
        results = ProcessingResult()

        for file_path in self._file_paths:
            try:
                document = self._convert_one(file_path, results)
            except Exception as e:
                _log.error(f"Error processing {file_path}: {str(e)}")
                results.failure_count += 1
                results.failed_files.append(str(file_path))
                continue

            # Yield outside the try block so that an exception raised into the
            # generator by the consumer is not mistaken for a conversion error.
            if document is not None:
                yield document

        self._log_summary(results)
                
                
# if __name__ == '__main__':
#     # Load documents from a list of file paths
#     loader = MultiFormatDocumentLoader(
#         file_paths=[
#             # './data/2404.19756v1.pdf',
#             # './data/OD429347375590223100.pdf',
#             './data/Project Report Format.docx',
#             # './data/UNIT 2 GENDER BASED VIOLENCE.pptx'
#         ],
#         enable_ocr=False,
#         enable_tables=True
#     )
#     for doc in loader.lazy_load():
#         print(doc.page_content)
#         print(doc.metadata)
#         # save document in .md file 
#         with open('output.md', 'w') as f:
#             f.write(doc.page_content)

In [4]:
from typing import List
import numpy as np
from chonkie.embeddings import BaseEmbeddings
from FlagEmbedding import BGEM3FlagModel
from chonkie import SDPMChunker as SDPMChunker

class BGEM3Embeddings(BaseEmbeddings):
    """chonkie embeddings adapter backed by FlagEmbedding's ``BGEM3FlagModel``.

    Only the dense vectors are requested; sparse and ColBERT outputs are
    switched off on every encode call.
    """

    def __init__(self, model_name):
        """Load ``model_name`` with ``use_fp16=True``."""
        self.model = BGEM3FlagModel(model_name, use_fp16=True)
        # NOTE(review): not read anywhere in this class; purpose unclear.
        self.task = "separation"

    @property
    def dimension(self):
        """Embedding width reported to callers (hard-coded to 1024)."""
        return 1024

    def embed(self, text: str):
        """Encode one string and return the model's ``dense_vecs`` output as-is.

        The text is wrapped in a one-element list before encoding.
        NOTE(review): the result is presumably a (1, dim) array rather than a
        flat vector - confirm before relying on its shape.
        """
        encoded = self.model.encode(
            [text],
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False,
        )
        return encoded['dense_vecs']

    def embed_batch(self, texts: List[str]):
        """Encode several strings in one call and return their ``dense_vecs``."""
        encoded = self.model.encode(
            texts,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False,
        )
        return encoded['dense_vecs']

    def count_tokens(self, text: str):
        """Number of ids the model's tokenizer produces for ``text``."""
        token_ids = self.model.tokenizer.encode(text)
        return len(token_ids)

    def count_tokens_batch(self, texts: List[str]):
        """Per-text length of ``input_ids`` after tokenizing ``texts`` together."""
        batch_ids = self.model.tokenizer(texts)["input_ids"]
        return [len(ids) for ids in batch_ids]

    def get_tokenizer_or_token_counter(self):
        """Expose the underlying model tokenizer."""
        return self.model.tokenizer

    def similarity(self, u: "np.ndarray", v: "np.ndarray"):
        """Return ``u @ v.T``.

        This is an inner product; it equals cosine similarity only when both
        inputs are unit-length.
        """
        return u @ v.T

    @classmethod
    def is_available(cls):
        """Always reports the backend as available."""
        return True

    def __repr__(self):
        return "bgem3"


# def main():
#     # Initialize the BGE M3 embeddings model
#     embedding_model = BGEM3Embeddings(
#         model_name="BAAI/bge-m3"
#     )

#     # Initialize the SDPM chunker
#     chunker = SDPMChunker(
#         embedding_model=embedding_model,
#         chunk_size=256,
#         threshold=0.7,
#         skip_window=2
#     )

#     with open('./output.md', 'r') as file:
#         text = file.read()

#     # Generate chunks
#     chunks = chunker.chunk(text)

#     # Print the chunks
#     for i, chunk in enumerate(chunks, 1):
#         print(f"\nChunk {i}:")
#         print(f"Text: {chunk.text}")
#         print(f"Token count: {chunk.token_count}")
#         print(f"Start index: {chunk.start_index}")
#         print(f"End index: {chunk.end_index}")
#         print(f"no of sentences: {len(chunk.sentences)}")
#         print("-" * 80)

# if __name__ == "__main__":
#     main()

INFO:datasets:PyTorch version 2.5.1 available.


In [7]:
# from data_processing.loader import MultiFormatDocumentLoader
# from data_processing.chunker import SDPMChunker, BGEM3Embeddings

import pandas as pd
from typing import List, Dict, Any
from pinecone import Pinecone, ServerlessSpec
import time
from tqdm import tqdm
from dotenv import load_dotenv
import os


load_dotenv()

# API Keys
# Prefer a key supplied through the environment or the .env file loaded above;
# only fall back to an interactive prompt when none is set. getpass does not
# echo the secret, unlike input().
from getpass import getpass

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Enter your Pinecone API key: ")

# Shared embedding model used by the chunking, ingestion and retrieval helpers below.
embedding_model = BGEM3Embeddings(model_name="BAAI/bge-m3")


def load_documents(file_paths: List[str], output_path='./output.md'):
    """
    Convert ``file_paths`` and write them, back to back, into one markdown file.

    Every document is preceded by a ``---`` delimited header that lists its
    metadata as ``key: value`` lines. An existing file at ``output_path`` is
    overwritten, and the parent directory is created when missing.

    Args:
        file_paths: Paths of the source documents.
        output_path: Destination markdown file (written as UTF-8).

    Returns:
        ``output_path``, unchanged.
    """
    loader = MultiFormatDocumentLoader(
        file_paths=file_paths,
        enable_ocr=False,
        enable_tables=True
    )

    # open() does not create intermediate directories.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8: converted documents routinely contain characters that a
    # platform's default encoding cannot represent.
    with open(output_path, 'w', encoding='utf-8') as f:
        for doc in loader.lazy_load():
            header = ''.join(f'{key}: {value}\n' for key, value in doc.metadata.items())
            f.write(f'---\n{header}---\n\n')
            f.write(doc.page_content)
            f.write('\n\n')

    return output_path

def process_chunks(markdown_path: str, chunk_size: int = 256, 
                  threshold: float = 0.7, skip_window: int = 2):
    """
    Split the combined markdown file into chunks with ``SDPMChunker``.

    Args:
        markdown_path: File produced by ``load_documents`` (read as UTF-8).
        chunk_size: Forwarded to ``SDPMChunker``.
        threshold: Forwarded to ``SDPMChunker``.
        skip_window: Forwarded to ``SDPMChunker``.

    Returns:
        A list of dicts, one per chunk, with the keys ``text``,
        ``token_count``, ``start_index``, ``end_index`` and ``num_sentences``.
    """
    chunker = SDPMChunker(
        embedding_model=embedding_model,
        chunk_size=chunk_size,
        threshold=threshold,
        skip_window=skip_window
    )

    # Must match the encoding used by load_documents when writing the file.
    with open(markdown_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Flatten each chunk object into a plain record that pandas can store.
    return [
        {
            'text': chunk.text,
            'token_count': chunk.token_count,
            'start_index': chunk.start_index,
            'end_index': chunk.end_index,
            'num_sentences': len(chunk.sentences),
        }
        for chunk in chunker.chunk(text)
    ]

def save_to_parquet(chunks: List[Dict[str, Any]], output_path='./data/chunks.parquet'):
    """
    Save processed chunks to a Parquet file.

    Args:
        chunks: Chunk records; each dict becomes one row.
        output_path: Destination file. Its parent directory is created when
            missing, so the default ``./data/...`` works in a fresh checkout.

    Returns:
        ``output_path``, unchanged.
    """
    # DataFrame.to_parquet does not create intermediate directories.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    df = pd.DataFrame(chunks)
    print(f"Saving to Parquet: {output_path}")
    df.to_parquet(output_path)
    print(f"Saved to Parquet: {output_path}")
    return output_path


class PineconeRetriever:
    """Minimal retriever: embed a question, query one Pinecone namespace."""

    def __init__(
        self,
        pinecone_client: "Pinecone",
        index_name: str,
        namespace: str,
        embedding_generator: "BGEM3Embeddings"
    ):
        """Initialize the retriever with Pinecone client and embedding generator.
        
        Args:
            pinecone_client: Initialized Pinecone client
            index_name: Name of the Pinecone index
            namespace: Namespace in the index
            embedding_generator: BGEM3Embeddings instance
        """
        self.pinecone = pinecone_client
        self.index = self.pinecone.Index(index_name)
        self.namespace = namespace
        self.embedding_generator = embedding_generator
    
    def invoke(self, question: str, top_k: int = 5):
        """Retrieve similar documents for a question.
        
        Args:
            question: Query string
            top_k: Number of results to return
            
        Returns:
            List of dicts with the keys ``page_content`` (the ``text`` metadata
            of the match) and ``score``.
        """
        question_embedding = self.embedding_generator.embed(question)
        # embed() encodes a one-element batch, so ``.tolist()`` on its result
        # would give a nested list. The query needs one flat vector; reshape
        # is a no-op if the embedding is already one-dimensional.
        query_vector = question_embedding.reshape(-1).tolist()

        results = self.index.query(
            namespace=self.namespace,
            vector=query_vector,
            top_k=top_k,
            include_values=False,
            include_metadata=True
        )
        
        # Keep only the stored text and the similarity score of each match.
        return [
            {"page_content": match.metadata["text"], "score": match.score}
            for match in results.matches
        ]

def ingest_data(
    pc,
    parquet_path: str,
    text_column: str,
    pinecone_client: Pinecone,
    index_name= "vector-index",
    namespace= "rag",
    batch_size: int = 100
):
    """Embed the rows of a Parquet file and upsert them into a Pinecone index.

    The index is created (serverless, AWS us-east-1, cosine, 1024 dims) when it
    does not exist yet, and the function blocks until it reports ready.

    Args:
        pc: Not used by this function; kept so existing callers keep working.
        parquet_path: Path to the Parquet file
        text_column: Name of the column containing text data
        pinecone_client: Initialized Pinecone client
        index_name: Name of the Pinecone index
        namespace: Namespace in the index
        batch_size: Number of rows embedded and upserted per request
    """
    print(f"Reading Parquet file: {parquet_path}")
    df = pd.read_parquet(parquet_path)
    print(f"Total records: {len(df)}")

    index_exists = pinecone_client.has_index(index_name)
    if not index_exists:
        serverless_spec = ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
        pinecone_client.create_index(
            name=index_name,
            dimension=1024,  # BGE-M3 dimension
            metric="cosine",
            spec=serverless_spec
        )

        # Poll once per second until the new index reports ready.
        while True:
            if pinecone_client.describe_index(index_name).status['ready']:
                break
            time.sleep(1)

    index = pinecone_client.Index(index_name)

    for start in tqdm(range(0, len(df), batch_size)):
        batch_df = df.iloc[start:start + batch_size]

        # One embedding call per batch, using the module-level embedding model.
        texts = batch_df[text_column].tolist()
        embeddings = embedding_model.embed_batch(texts)
        print(f"embeddings for batch: {start}")

        # The record id is the DataFrame index label of the row.
        records = [
            {
                "id": str(row.name),
                "values": embeddings[position],
                "metadata": {"text": row[text_column]}
            }
            for position, (_, row) in enumerate(batch_df.iterrows())
        ]

        index.upsert(vectors=records, namespace=namespace)

        # Short pause between batches to stay under rate limits.
        time.sleep(0.5)

def get_retriever(
    pinecone_client: Pinecone,
    index_name= "vector-index",
    namespace= "rag"
):
    """Build a ``PineconeRetriever`` bound to the module-level embedding model.

    Args:
        pinecone_client: Initialized Pinecone client
        index_name: Name of the Pinecone index
        namespace: Namespace in the index

    Returns:
        Configured PineconeRetriever instance
    """
    retriever_options = {
        "pinecone_client": pinecone_client,
        "index_name": index_name,
        "namespace": namespace,
        "embedding_generator": embedding_model,
    }
    return PineconeRetriever(**retriever_options)
    
def main():
    """Run the ingestion pipeline: load, chunk, save to Parquet, upsert to Pinecone.

    Any exception raised by a pipeline step is caught and printed rather than
    propagated.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Input documents (the commented-out entries are alternative samples).
    file_paths = [
        # './data/2404.19756v1.pdf',
        # './data/OD429347375590223100.pdf',
        # './data/Project Report Format.docx',
        '/teamspace/studios/this_studio/adaptive_rag/data/UNIT 2 GENDER BASED VIOLENCE.pptx'
    ]
    # Intermediate artefacts written by the pipeline.
    md_file_path = '/teamspace/studios/this_studio/adaptive_rag/data/output.md'
    parquet_file_path = '/teamspace/studios/this_studio/adaptive_rag/data/chunks.parquet'
    chunking_options = {"chunk_size": 256, "threshold": 0.7, "skip_window": 2}

    try:
        print("Loading documents...")
        markdown_path = load_documents(file_paths, output_path=md_file_path)

        print("Processing chunks...")
        chunks = process_chunks(markdown_path, **chunking_options)

        print("Saving to Parquet...")
        parquet_path = save_to_parquet(chunks, output_path=parquet_file_path)

        print("Ingesting into Pinecone...")
        ingest_data(
            pc,
            parquet_path=parquet_path,
            text_column="text",
            pinecone_client=pc,
        )
    except Exception as e:
        print(f"Error in pipeline: {e!s}")

# Run the ingestion pipeline when this code executes as the top-level module.
if __name__ == "__main__":
    main()

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

INFO:FlagEmbedding.finetune.embedder.encoder_only.m3.runner:loading existing colbert_linear and sparse_linear---------
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document UNIT 2 GENDER BASED VIOLENCE.pptx
INFO:docling.document_converter:Finished converting document UNIT 2 GENDER BASED VIOLENCE.pptx in 0.05 sec.
INFO:__main__:Processed 1 documents:
- Successfully converted: 1
- Partially converted: 0
- Failed: 0
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followe

Loading documents...
Processing chunks...
Saving to Parquet...
Saving to Parquet: /teamspace/studios/this_studio/adaptive_rag/data/chunks.parquet
Saved to Parquet: /teamspace/studios/this_studio/adaptive_rag/data/chunks.parquet
Ingesting into Pinecone...
Reading Parquet file: /teamspace/studios/this_studio/adaptive_rag/data/chunks.parquet
Total records: 13


INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
  0%|          | 0/1 [00:00<?, ?it/s]

embeddings for batch: 0


100%|██████████| 1/1 [00:18<00:00, 18.44s/it]


In [9]:
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List

# Structured-output schema passed to `llm.with_structured_output` by the
# "document_relevance" grader.
# NOTE(review): the docstring and Field description are presumably forwarded to
# the LLM as part of the schema - treat them as prompt text; confirm before rewording.
class DocumentRelevance(BaseModel):
    """Binary score for relevance check on retrieved documents."""
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# Structured-output schema passed to `llm.with_structured_output` by the
# "hallucination" grader.
# NOTE(review): the docstring and Field description are presumably forwarded to
# the LLM as part of the schema - treat them as prompt text; confirm before rewording.
class HallucinationCheck(BaseModel):
    """Binary score for hallucination present in generation answer."""
    binary_score: str = Field(
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )

# Structured-output schema passed to `llm.with_structured_output` by the
# "answer_quality" grader.
# NOTE(review): the docstring and Field description are presumably forwarded to
# the LLM as part of the schema - treat them as prompt text; confirm before rewording.
class AnswerQuality(BaseModel):
    """Binary score to assess answer addresses question."""
    binary_score: str = Field(
        description="Answer addresses the question, 'yes' or 'no'"
    )

def create_llm_grader(grader_type: str, llm):
    """
    Build a grading chain: a chat prompt piped into a structured-output LLM.

    Args:
        grader_type (str): One of "document_relevance", "hallucination" or
            "answer_quality".
        llm: Chat model; must provide ``with_structured_output``.

    Returns:
        The composed ``prompt | structured_llm`` chain. Its input keys depend
        on the grader: ``document``/``question``, ``documents``/``generation``
        or ``question``/``generation``.

    Raises:
        ValueError: If ``grader_type`` is not one of the three known names.
    """
    # Reject unknown names up front, before the LLM is touched.
    if grader_type not in ("document_relevance", "hallucination", "answer_quality"):
        raise ValueError(f"Unknown grader type: {grader_type}")

    # Each branch only selects the output schema and the two prompt messages;
    # the chain itself is assembled once, below.
    if grader_type == "document_relevance":
        schema = DocumentRelevance
        system = """You are a grader assessing relevance of a retrieved document to a user question. 
        If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. 
        It does not need to be a stringent test. The goal is to filter out erroneous retrievals. 
        Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
        human = "Retrieved document: \n\n {document} \n\n User question: {question}"
    elif grader_type == "hallucination":
        schema = HallucinationCheck
        system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. 
        Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""
        human = "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"
    else:
        schema = AnswerQuality
        system = """You are a grader assessing whether an answer addresses / resolves a question. 
        Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question."""
        human = "User question: \n\n {question} \n\n LLM generation: {generation}"

    structured_llm_grader = llm.with_structured_output(schema)
    grading_prompt = ChatPromptTemplate.from_messages([
        ("system", system),
        ("human", human),
    ])
    return grading_prompt | structured_llm_grader

def grade_document_relevance(question: str, document: str, llm):
    """
    Ask the "document_relevance" grader whether a document fits a question.

    Args:
        question (str): User's question
        document (str): Retrieved document content
        llm: Chat model handed to ``create_llm_grader``

    Returns:
        str: The grader's ``binary_score`` ('yes' or 'no' per the prompt)
    """
    payload = {"question": question, "document": document}
    verdict = create_llm_grader("document_relevance", llm).invoke(payload)
    return verdict.binary_score

def check_hallucination(documents: List[str], generation: str, llm):
    """
    Ask the "hallucination" grader whether an answer is backed by the documents.

    Args:
        documents (List[str]): Source documents, passed to the prompt as given
        generation (str): LLM generated answer
        llm: Chat model handed to ``create_llm_grader``

    Returns:
        str: The grader's ``binary_score`` ('yes' or 'no' per the prompt)
    """
    payload = {"documents": documents, "generation": generation}
    verdict = create_llm_grader("hallucination", llm).invoke(payload)
    return verdict.binary_score

def grade_answer_quality(question: str, generation: str, llm):
    """
    Ask the "answer_quality" grader whether an answer resolves the question.

    Args:
        question (str): User's original question
        generation (str): LLM generated answer
        llm: Chat model handed to ``create_llm_grader``

    Returns:
        str: The grader's ``binary_score`` ('yes' or 'no' per the prompt)
    """
    payload = {"question": question, "generation": generation}
    verdict = create_llm_grader("answer_quality", llm).invoke(payload)
    return verdict.binary_score

# if __name__ == "__main__":
#     # Example usage
#     test_question = "What are the types of agent memory?"
#     test_document = "Agent memory can be classified into different types such as episodic, semantic, and working memory."
#     test_generation = "Agent memory includes episodic memory for storing experiences, semantic memory for general knowledge, and working memory for immediate processing."
#     llm = ChatOllama(model = "llama3.2", temperature = 0.1, num_predict = 256, top_p=0.5)
    
#     print("Document Relevance:", grade_document_relevance(test_question, test_document, llm))
#     print("Hallucination Check:", check_hallucination([test_document], test_generation, llm))
#     print("Answer Quality:", grade_answer_quality(test_question, test_generation, llm))

In [13]:
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

def create_query_rewriter(llm):
    """
    Build the chain that rewrites a question for vectorstore retrieval.

    Args:
        llm: Chat model placed between the prompt and the string parser.

    Returns:
        The composed ``prompt | llm | StrOutputParser()`` chain; it expects a
        ``question`` input key.
    """
    system = """You are a question re-writer that converts an input question to a better version that is optimized 
    for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
    human = "Here is the initial question: \n\n {question} \n Formulate an improved question."

    messages = [
        ("system", system),
        ("human", human),
    ]
    re_write_prompt = ChatPromptTemplate.from_messages(messages)

    # Prompt, then model, then plain-string extraction.
    chain = re_write_prompt | llm
    return chain | StrOutputParser()

def rewrite_query(question: str, llm):
    """
    Rewrite a question for retrieval, falling back to the original on error.

    Args:
        question (str): Original user question
        llm: Chat model handed to ``create_query_rewriter``

    Returns:
        str: The rewritten query, or ``question`` unchanged when invoking the
        rewriter raises (the error is printed).
    """
    # Building the chain happens outside the try block, so construction
    # errors still propagate to the caller.
    rewriter = create_query_rewriter(llm)
    try:
        return rewriter.invoke({"question": question})
    except Exception as e:
        print(f"Query rewriting error: {e}")
    return question

# if __name__ == "__main__":
#     # Example usage
#     test_queries = [
#         "Tell me about AI agents",
#         "What do we know about memory in AI systems?",
#         "Bears draft strategy"
#     ]
#     llm = ChatOllama(model = "llama3.2", temperature = 0.1, num_predict = 256, top_p=0.5)
    
#     for query in test_queries:
#         rewritten = rewrite_query(query, llm)
#         print(f"Original: {query}")
#         print(f"Rewritten: {rewritten}\n")

In [14]:
import os
import sys
import asyncio
from typing import List, Dict, Optional

from langchain_community.tools import DuckDuckGoSearchResults
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from dotenv import load_dotenv

# Load variables from a local .env file, if present, into the process environment.
load_dotenv()

class AdvancedWebCrawler:
    """Search the web with DuckDuckGo and crawl the hits with crawl4ai."""

    def __init__(self, 
                 max_search_results: int = 5, 
                 word_count_threshold: int = 50,
                 content_filter_type: str = 'pruning',
                 filter_threshold: float = 0.48):
        """
        Initialize the Advanced Web Crawler
        
        Args:
            max_search_results (int): Maximum number of search results to process
            word_count_threshold (int): Minimum word count for crawled content
            content_filter_type (str): 'bm25' selects the BM25 filter; any other
                value selects the pruning filter
            filter_threshold (float): Threshold for content filtering
        """
        self.max_search_results = max_search_results
        self.word_count_threshold = word_count_threshold
        self.content_filter_type = content_filter_type
        self.filter_threshold = filter_threshold

    def _create_web_search_tool(self):
        """
        Create a web search tool using DuckDuckGo
        
        Returns:
            DuckDuckGoSearchResults: Web search tool configured for list output
        """
        return DuckDuckGoSearchResults(max_results=self.max_search_results, output_format="list")

    def _create_content_filter(self, user_query: Optional[str] = None):
        """
        Create content filter based on specified type
        
        Args:
            user_query (Optional[str]): Query to use for BM25 filtering
        
        Returns:
            A BM25 filter when the type is 'bm25' and a query is given,
            otherwise a pruning filter.
        """
        if self.content_filter_type == 'bm25' and user_query:
            return BM25ContentFilter(
                user_query=user_query, 
                bm25_threshold=self.filter_threshold
            )
        return PruningContentFilter(
            threshold=self.filter_threshold, 
            threshold_type="fixed", 
            min_word_threshold=self.word_count_threshold
        )

    @staticmethod
    def _summarize_result(result):
        """Flatten one crawl result into a plain dict.

        Failed results may carry ``None`` for ``metadata``, ``links`` or
        ``media``. They are treated as empty so that one failed URL cannot
        raise and discard the results of all the others.
        """
        if result.success:
            markdown = result.markdown_v2.raw_markdown
            content, word_count = markdown, len(markdown.split())
        else:
            content, word_count = result.error_message, 0

        metadata = result.metadata or {}
        links = result.links or {}
        media = result.media or {}
        return {
            "url": result.url,
            "success": result.success,
            "title": metadata.get('title', 'N/A'),
            "content": content,
            "word_count": word_count,
            "links": {
                "internal": len(links.get('internal', [])),
                "external": len(links.get('external', []))
            },
            "images": len(media.get('images', []))
        }

    async def crawl_urls(self, urls: List[str], user_query: Optional[str] = None):
        """
        Crawl multiple URLs with content filtering
        
        Args:
            urls (List[str]): List of URLs to crawl
            user_query (Optional[str]): Query used for BM25 content filtering
        
        Returns:
            List of dicts, one per crawl result, with the keys ``url``,
            ``success``, ``title``, ``content`` (markdown on success, the
            error message otherwise), ``word_count``, ``links`` and ``images``.
        """
        async with AsyncWebCrawler(
            browser_type="chromium", 
            headless=True, 
            verbose=True
        ) as crawler:
            content_filter = self._create_content_filter(user_query)
            
            results = await crawler.arun_many(
                urls=urls,
                word_count_threshold=self.word_count_threshold,
                bypass_cache=True,
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=content_filter
                ),
                cache_mode=CacheMode.DISABLED,
                exclude_external_links=True,
                remove_overlay_elements=True,
                simulate_user=True,
                magic=True
            )
            
            return [self._summarize_result(result) for result in results]

    async def search_and_crawl(self, query: str) -> List[Dict]:
        """
        Perform web search and crawl the results
        
        Args:
            query (str): Search query
        
        Returns:
            List of crawled content results; an empty list when the search
            finds nothing or any step raises (the error is printed).
        """
        search_tool = self._create_web_search_tool()
        try:
            search_results = search_tool.invoke({"query": query})
            
            urls = [result['link'] for result in search_results]
            print(f"Found {len(urls)} URLs for query: {query}")

            # Nothing to crawl: skip launching a browser.
            if not urls:
                return []
            
            return await self.crawl_urls(urls, user_query=query)
        
        except Exception as e:
            print(f"Web search and crawl error: {e}")
            return []

# def main():
#     # Example usage
#     crawler = AdvancedWebCrawler(
#         max_search_results=5,
#         word_count_threshold=50,
#         content_filter_type='f',
#         filter_threshold=0.48
#     )
    
#     test_queries = [
#         "Latest developments in AI agents",
#         "Today's weather forecast in Kolkata",
#     ]
    
#     for query in test_queries:
#         # Run search and crawl asynchronously
#         results = asyncio.run(crawler.search_and_crawl(query))
        
#         print(f"\nResults for query: {query}")
#         for result in results:
#             print(f"URL: {result['url']}")
#             print(f"Success: {result['success']}")
#             print(f"Title: {result['title']}")
#             print(f"Word Count: {result['word_count']}")
#             print(f"Content Preview: {result['content'][:500]}...\n")

# if __name__ == "__main__":
#     main()

In [15]:
from typing import List, TypedDict
from langchain_core.documents.base import Document

class GraphState(TypedDict):
    """
    Represents the state of our adaptive RAG graph.

    Graph nodes read these keys from the state and return dicts keyed with
    the same names to update them.

    Attributes:
        question (str): Original user question
        generation (str, optional): LLM generated answer
        documents (List[Document], optional): Retrieved or searched documents
    """
    # The question currently being answered.
    question: str
    # The generated answer; None when no answer has been produced.
    generation: str | None
    # NOTE(review): annotated as Document objects, but the workflow nodes in
    # this file build and index these entries as plain dicts with
    # 'page_content' / 'metadata' keys -- confirm which shape is intended.
    documents: List[Document]


In [16]:
from langgraph.graph import END, StateGraph, START
from langchain_core.prompts import PromptTemplate
import asyncio
from langchain_core.output_parsers import StrOutputParser

def perform_web_search(question: str):
    """
    Perform web search using the AdvancedWebCrawler.

    The crawler's coroutine is driven to completion synchronously. When the
    calling thread already has a running event loop (the normal situation
    inside a Jupyter kernel), ``asyncio.run`` would raise ``RuntimeError``,
    so the coroutine is executed on a short-lived worker thread that owns
    its own event loop instead.

    Args:
        question (str): User's input question

    Returns:
        List: Web search results, exactly as returned by
        ``AdvancedWebCrawler.search_and_crawl``
    """
    # Local import: only the running-loop fallback below needs it.
    from concurrent.futures import ThreadPoolExecutor

    # Initialize web crawler
    crawler = AdvancedWebCrawler(
        max_search_results=5,
        word_count_threshold=50,
        content_filter_type='f',
        filter_threshold=0.48
    )

    def _crawl():
        # The coroutine is created on the thread that runs it, so it is never
        # left un-awaited if scheduling fails.
        return asyncio.run(crawler.search_and_crawl(question))

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running on this thread: asyncio.run is safe here.
        return _crawl()

    # A loop is already running on this thread; run the crawl on a separate
    # thread and block until it finishes, re-raising any error it produced.
    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(_crawl).result()


def create_adaptive_rag_workflow(retriever, llm, top_k=5, enable_websearch=False):
    """
    Create the adaptive RAG workflow graph.

    Graph wiring:
        START            -> web_search | vectorstore   (by ``enable_websearch``)
        web_search       -> generate
        vectorstore      -> grade_documents
        grade_documents  -> generate | rewrite_query   (any documents kept?)
        rewrite_query    -> vectorstore
        generate         -> END | rewrite_query        (by answer quality)

    Args:
        retriever: Vector store retriever; called as
            ``retriever.invoke(question, top_k)``.
        llm: Model used for answer generation and handed to the grading and
            query-rewriting helpers.
        top_k (int): Passed to the retriever as its second argument.
        enable_websearch (bool): When True the first hop is the web search
            node instead of the vector store.

    Returns:
        Compiled LangGraph workflow
    """
    def _page_content(doc):
        """Return a document's text for dict entries and Document-like objects."""
        # The nodes below store plain dicts, while GraphState is annotated
        # with Document objects; accept both so either kind of retriever works.
        if isinstance(doc, dict):
            return doc["page_content"]
        return doc.page_content

    def retrieve(state: GraphState):
        """Retrieve documents from vectorstore."""
        print("---RETRIEVE---")
        question = state['question']
        documents = retriever.invoke(question, top_k)
        print(f"Retrieved {len(documents)} documents.")
        print(documents)
        return {"documents": documents, "question": question}

    def route_to_datasource(state: GraphState):
        """Route question to web search or vectorstore."""
        print("---ROUTE QUESTION---")
        # The route depends only on the enable_websearch flag; the question
        # itself is not inspected.
        if enable_websearch:
            print("---ROUTE TO WEB SEARCH---")
            return "web_search"
        else:
            print("---ROUTE TO RAG---")
            return "vectorstore"

    def generate_answer(state: GraphState):
        """Generate answer using retrieved documents."""
        print("---GENERATE---")
        question = state['question']
        documents = state['documents']

        # Prepare context
        context = "\n\n".join(_page_content(doc) for doc in documents)
        prompt_template = PromptTemplate.from_template("""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
        Question: {question}
        Context: {context}
        Answer:""")
        # Generate answer
        rag_chain = prompt_template | llm | StrOutputParser()

        generation = rag_chain.invoke({"context": context, "question": question})

        return {"generation": generation, "documents": documents, "question": question}

    def grade_documents(state: GraphState):
        """Keep only the documents the relevance grader scores as "yes"."""
        print("---GRADE DOCUMENTS---")
        question = state['question']
        documents = state['documents']

        # Filter documents
        filtered_docs = [
            doc for doc in documents
            if grade_document_relevance(question, _page_content(doc), llm) == "yes"
        ]

        return {"documents": filtered_docs, "question": question}

    def web_search(state: GraphState):
        """Perform web search and wrap each result as a document dict."""
        print("---WEB SEARCH---")
        question = state['question']

        # Perform web search
        results = perform_web_search(question)
        web_documents = [
            {
                "page_content": result['content'],
                "metadata": {"source": result['url']}
            } for result in results
        ]

        return {"documents": web_documents, "question": question}

    def rewrite_question(state: GraphState):
        """Rewrite the question and clear documents/generation for the retry."""
        return {
            "question": rewrite_query(state['question'], llm),
            "documents": [],
            "generation": None
        }

    def decide_after_grading(state: GraphState):
        """Generate when any document survived grading, otherwise rewrite."""
        return "generate" if state['documents'] else "rewrite_query"

    def check_generation_quality(state: GraphState):
        """Return "end" when the answer is graded "yes", otherwise "rewrite"."""
        print("---ASSESS GENERATION---")
        question = state['question']
        generation = state['generation']

        # No hallucination / grounding check is performed here, so the former
        # unconditional "Generation is not hallucinated" log line was removed:
        # it reported a result that was never computed.
        quality_score = grade_answer_quality(question, generation, llm)
        if quality_score == "yes":
            print("---Answer quality is good.---")
            return "end"
        print("---Answer quality is poor.---")
        return "rewrite"

    # Create workflow
    workflow = StateGraph(GraphState)

    # Add nodes
    workflow.add_node("vectorstore", retrieve)
    workflow.add_node("web_search", web_search)
    workflow.add_node("grade_documents", grade_documents)
    workflow.add_node("generate", generate_answer)
    workflow.add_node("rewrite_query", rewrite_question)

    # Define edges
    workflow.add_conditional_edges(
        START,
        route_to_datasource,
        {
            "web_search": "web_search",
            "vectorstore": "vectorstore"
        }
    )

    workflow.add_edge("web_search", "generate")
    workflow.add_edge("vectorstore", "grade_documents")

    workflow.add_conditional_edges(
        "grade_documents",
        decide_after_grading,
        {
            "generate": "generate",
            "rewrite_query": "rewrite_query"
        }
    )

    # NOTE(review): a rewritten question always goes to the vector store, even
    # when the first hop was web search -- confirm this is intended.
    workflow.add_edge("rewrite_query", "vectorstore")

    workflow.add_conditional_edges(
        "generate",
        check_generation_quality,
        {
            "end": END,
            # "regenerate" is never returned by check_generation_quality at
            # present; the mapping is kept so the graph shape is unchanged.
            "regenerate": "generate",
            "rewrite": "rewrite_query"
        }
    )

    # Compile the workflow
    app = workflow.compile()
    return app

def run_adaptive_rag(retriever, question: str, llm, top_k=5, enable_websearch=False, recursion_limit=5):
    """
    Run the adaptive RAG workflow for a given question.

    Args:
        retriever: Vector store retriever
        question (str): User's input question
        llm: Model forwarded to the workflow builder
        top_k (int): Forwarded to the workflow builder
        enable_websearch (bool): Forwarded to the workflow builder
        recursion_limit (int): Step budget passed to the workflow's
            ``stream`` call via ``config``; defaults to 5, the value that
            was previously hard-coded.

    Returns:
        str: The ``generation`` value of the last node update, or
        'No answer could be generated.' when the stream produced no update
        or the last update carries no generation.

    Exceptions raised while streaming the workflow are not caught here and
    propagate to the caller.
    """
    fallback = 'No answer could be generated.'

    # Create workflow
    workflow = create_adaptive_rag_workflow(retriever, llm, top_k, enable_websearch=enable_websearch)

    # Run workflow
    final_state = None
    for output in workflow.stream({"question": question}, config={"recursion_limit": recursion_limit}):
        for key, value in output.items():
            print(f"Node '{key}':")
            # Record the update inside the loop so an empty chunk can never
            # leave us reading a stale or undefined loop variable.
            final_state = value

    # Nothing streamed, or the last node returned something that is not a
    # state update: there is no answer to report.
    if not isinstance(final_state, dict):
        return fallback

    generation = final_state.get('generation')
    # A last update with generation explicitly set to None (as the rewrite
    # node does) also means no answer was produced.
    return fallback if generation is None else generation

# if __name__ == "__main__":
#     # Example usage
#     from vectorstore.pinecone_db import PINECONE_API_KEY, ingest_data,  get_retriever, load_documents, process_chunks, save_to_parquet
#     from pinecone import Pinecone
    
#     # Load and prepare documents
#     pc = Pinecone(api_key=PINECONE_API_KEY)
    
#     # Define input files
#     file_paths=[
#         # './data/2404.19756v1.pdf',
#         # './data/OD429347375590223100.pdf',
#         # './data/Project Report Format.docx',
#         './data/UNIT 2 GENDER BASED VIOLENCE.pptx'
#     ]

#     # Process pipeline
#     try:
#         # Step 1: Load and combine documents
#         print("Loading documents...")
#         markdown_path = load_documents(file_paths)
        
#         # Step 2: Process into chunks with embeddings
#         print("Processing chunks...")
#         chunks = process_chunks(markdown_path)
        
#         # Step 3: Save to Parquet
#         print("Saving to Parquet...")
#         parquet_path = save_to_parquet(chunks)
        
#         # Step 4: Ingest into Pinecone
#         print("Ingesting into Pinecone...")
#         ingest_data(pc,
#             parquet_path=parquet_path,
#             text_column="text",
#             pinecone_client=pc,
#         )
        
#         # Step 5: Test retrieval
#         print("\nTesting retrieval...")
#         retriever = get_retriever(
#             pinecone_client=pc,
#             index_name="vector-index",
#             namespace="rag"
#         )
        
#     except Exception as e:
#         print(f"Error in pipeline: {str(e)}")    

#     llm = ChatOllama(model = "llama3.2", temperature = 0.1, num_predict = 256, top_p=0.5)
    
#     # Test questions
#     test_questions = [
#         # "What are the key components of AI agent memory?",
#         # "Explain prompt engineering techniques",
#         # "What are recent advancements in adversarial attacks on LLMs?"
#         "what are the trending papers that are published in NeurIPS 2024?"
#     ]
    
#     # Run workflow for each test question
#     for question in test_questions:
#         print(f"\n--- Processing Question: {question} ---")
#         answer = run_adaptive_rag(retriever, question, llm)
#         print("\nFinal Answer:", answer)

In [18]:
from pinecone import Pinecone
from langchain_openai import ChatOpenAI
from langgraph.pregel import GraphRecursionError
import tempfile
import os
from pathlib import Path

def initialize_pinecone(api_key):
    """Build a Pinecone client for ``api_key``; print the error and yield None on failure."""
    client = None
    try:
        client = Pinecone(api_key=api_key)
    except Exception as err:
        # Construction failed: report it and fall through with no client.
        print(f"Error initializing Pinecone: {str(err)}")
    return client

def initialize_llm(api_key, model="gpt-3.5-turbo"):
    """
    Initialize OpenAI LLM.

    Args:
        api_key: OpenAI API key passed to ``ChatOpenAI``.
        model (str): Chat model name passed to ``ChatOpenAI``; defaults to
            "gpt-3.5-turbo", the value that was previously hard-coded.

    Returns:
        The ``ChatOpenAI`` instance, or None when construction raised (the
        error is printed).
    """
    try:
        return ChatOpenAI(api_key=api_key, model=model)
    except Exception as e:
        print(f"Error initializing OpenAI: {str(e)}")
        return None

def process_documents(file_paths, pc):
    """
    Process documents and store in Pinecone.

    Pipeline: convert the input files to one markdown file, chunk it, save
    the chunks to Parquet, ingest the Parquet file into Pinecone, then build
    a retriever. Intermediate files live in a fresh temporary directory that
    is removed before returning, whether or not the pipeline succeeded.

    Args:
        file_paths: Paths of the documents to process.
        pc: Pinecone client, forwarded to the ingest and retriever helpers.

    Returns:
        The retriever on success; None when ``file_paths`` is empty or any
        pipeline step raised (the error is printed).
    """
    # Local import: only the cleanup step needs it.
    import shutil

    if not file_paths:
        print("No documents provided.")
        return None

    print("Processing documents...")
    temp_dir = tempfile.mkdtemp()
    markdown_path = Path(temp_dir) / "combined.md"
    parquet_path = Path(temp_dir) / "documents.parquet"

    try:
        markdown_path = load_documents(file_paths, output_path=markdown_path)
        chunks = process_chunks(markdown_path, chunk_size=256, threshold=0.6)
        parquet_path = save_to_parquet(chunks, parquet_path)

        ingest_data(
            pc=pc,
            parquet_path=parquet_path,
            text_column="text",
            pinecone_client=pc
        )

        retriever = get_retriever(pc)
        print("Documents processed successfully!")
        return retriever

    except Exception as e:
        print(f"Error processing documents: {str(e)}")
        return None
    finally:
        # Best-effort cleanup of the whole temp directory in one step. The
        # previous file-by-file removal stopped at the first missing file
        # (e.g. when loading failed early) and leaked the directory.
        # Assumes the helpers write their outputs at the paths given above,
        # i.e. inside temp_dir -- TODO confirm.
        shutil.rmtree(temp_dir, ignore_errors=True)

def main():
    """
    Run an interactive console session over the user's documents.

    Steps: read the Pinecone and OpenAI API keys, initialise both clients,
    collect document paths, process the documents into a retriever, then
    answer questions in a loop until the user types 'exit'. Returns early
    (None) when a client cannot be initialised or no retriever is produced.
    """
    # Local import: getpass reads the keys without echoing them, so they do
    # not end up in the terminal scrollback or the saved notebook output.
    import getpass

    # Get API keys (surrounding whitespace from copy/paste is dropped)
    pinecone_api_key = getpass.getpass("Enter your Pinecone API key: ").strip()
    openai_api_key = getpass.getpass("Enter your OpenAI API key: ").strip()

    # Initialize clients
    pc = initialize_pinecone(pinecone_api_key)
    if not pc:
        return

    llm = initialize_llm(openai_api_key)
    if not llm:
        return

    # Get document paths
    print("\nEnter the paths to your documents (one per line).")
    print("Press Enter twice when done:")

    file_paths = []
    while True:
        path = input().strip()
        # A blank line ends path entry.
        if not path:
            break
        if os.path.exists(path):
            file_paths.append(path)
        else:
            print(f"Warning: File {path} does not exist")

    # Process documents
    retriever = process_documents(file_paths, pc)
    if not retriever:
        return

    # Chat loop
    print("\nChat with your documents! Type 'exit' to quit.")
    while True:
        question = input("\nYou: ").strip()

        # Ignore blank input instead of spending LLM calls on it.
        if not question:
            continue

        if question.lower() == 'exit':
            print("Goodbye!")
            break

        try:
            response = run_adaptive_rag(
                retriever=retriever,
                question=question,
                llm=llm,
                top_k=5,
                enable_websearch=False
            )
            print("\nAssistant:", response)

        except GraphRecursionError:
            # The workflow hit its recursion limit without reaching an
            # accepted answer; tell the user rather than showing the error.
            print("\nAssistant: I cannot find a sufficient answer to your question in the provided documents. Please try rephrasing your question or ask something else about the content of the documents.")

        except Exception as e:
            # Keep the session alive on any other failure.
            print(f"\nError: {str(e)}")

# Launch the interactive session when this code is executed directly
# (as opposed to being imported as a module).
if __name__ == "__main__":
    main()

INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone



Enter the paths to your documents (one per line).
Press Enter twice when done:
Processing documents...


INFO:docling.document_converter:Going to convert document batch...
INFO:docling.pipeline.base_pipeline:Processing document Project Report Format.docx
INFO:docling.document_converter:Finished converting document Project Report Format.docx in 0.44 sec.
INFO:__main__:Processed 1 documents:
- Successfully converted: 1
- Partially converted: 0
- Failed: 0
pre tokenize: 100%|██████████| 2/2 [00:00<00:00, 133.11it/s]
Inference Embeddings: 100%|██████████| 2/2 [01:36<00:00, 48.32s/it]


Saving to Parquet: /tmp/tmpwx9hgq_7/documents.parquet
Saved to Parquet: /tmp/tmpwx9hgq_7/documents.parquet
Reading Parquet file: /tmp/tmpwx9hgq_7/documents.parquet
Total records: 26


INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
  0%|          | 0/1 [00:00<?, ?it/s]

embeddings for batch: 0


100%|██████████| 1/1 [00:34<00:00, 34.48s/it]
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference


Documents processed successfully!

Chat with your documents! Type 'exit' to quit.
---ROUTE QUESTION---
---ROUTE TO RAG---
---RETRIEVE---
Retrieved 5 documents.
[{'page_content': ' In the process of detecting  wild  animals,  further  improvement  in  accuracy  is  needed. Further, there is an opportunity to develop approaches that are proficient\n\nin  working  well  in  a  generalized  approach  under  both  day  and  night conditions with background variations for detecting human-animal conflict.  The  YOLOv5  model,  with  certain  modifications  and  additions,  is found  to  be  suitable  for  developing  a  generalized  framework  for  the detection of human-animal conflict under both day and night conditions with background variations. Especially, the addition of attention layers as part of the primary detection network helps not only to focus on key areas of the scene under study but also provides optimization in the training and enhanced accuracy. In view of the above, in this

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Node 'grade_documents':


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Node 'rewrite_query':
---RETRIEVE---
Retrieved 5 documents.
[{'page_content': ' In the process of detecting  wild  animals,  further  improvement  in  accuracy  is  needed. Further, there is an opportunity to develop approaches that are proficient\n\nin  working  well  in  a  generalized  approach  under  both  day  and  night conditions with background variations for detecting human-animal conflict.  The  YOLOv5  model,  with  certain  modifications  and  additions,  is found  to  be  suitable  for  developing  a  generalized  framework  for  the detection of human-animal conflict under both day and night conditions with background variations. Especially, the addition of attention layers as part of the primary detection network helps not only to focus on key areas of the scene under study but also provides optimization in the training and enhanced accuracy. In view of the above, in this work, a SENet attention layer (Hu et al., 2019) is added to YOLOv5 for detecting human-animal confl

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Node 'grade_documents':

Assistant: I cannot find a sufficient answer to your question in the provided documents. Please try rephrasing your question or ask something else about the content of the documents.
---ROUTE QUESTION---
---ROUTE TO RAG---
---RETRIEVE---
Retrieved 5 documents.
[{'page_content': '|      |      |     |     |\n\na A is admonishment coefficient of total population (Times New Roman 10)\n\nb B is Bombardment coefficient of the mean population (Times New Roman 10)\n\n- Motivation of the study\n\nAlarming rate of climate change, sea level rise and other natural disasters are to be managed efficiently. Assessment and management of green house gases thus become very much essential..\n\n1 Adapted from Monika and Ram, 2008 (Times New Roman 10)\n\nSample sheet 11\n\n##### The satellite image as given in Figure 1.1 shows the area from where samples are collected.\n\n<!-- image -->\n\nFigure 1.1 Title of the figure (Times New Roman 11)\n\n### REFERENCES\n\n- Attanas, D.B. and

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Node 'grade_documents':


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Node 'rewrite_query':
---RETRIEVE---
Retrieved 5 documents.
[{'page_content': ' There are three main methods of feature extraction, namely local, holistic, and hybrid. For example, in the local approach entire face is divided into some small regions and then features are extracted from each small region and then during detection, those extracted features are applied. That is why after changing the images slightly from the original one, either  by rotating  the image or by changing its contrast, the trained network can work for detecting images.\n\n## 5.4. Impact analysis\n\nThe key novelty of the system is the use of an AI-based automated approach,  which  provides  higher  accuracy  in  detecting  human-wild animal conflicts and alarms forest officials and the public continuously throughout the day and night. Forest officials are not required to stand along the boundary of the KNP and monitor the movements of wild animals constantly. Instead, they can attend as notified by the system.

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Node 'grade_documents':

Assistant: I cannot find a sufficient answer to your question in the provided documents. Please try rephrasing your question or ask something else about the content of the documents.
---ROUTE QUESTION---
---ROUTE TO RAG---
---RETRIEVE---
Retrieved 5 documents.
[{'page_content': '---\nsource: /teamspace/studios/this_studio/adaptive_rag/data/Project Report Format.docx\nfile_type: .docx\n---\n\nGUIDELINES FOR B.TECH PROJECT REPORT PREPARATION\n\nIntroduction\n\nThis document is intended to provide a set of specific and uniform guidelines to the B. Tech students in the preparation of the project report. The content of the report, which is submitted to the University in partial fulfillment for the award of the degree of Bachelor of Technology, is very much important. It is also imperative that the report, to be acceptable by the University, should essentially meet a uniform format emphasizing readability, concordance with ethical standards and University-wide homoge

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Node 'grade_documents':
---GENERATE---


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


---ASSESS GENERATION---
---Generation is not hallucinated.---


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


---Answer quality is good.---
Node 'generate':

Assistant: The project introduction should provide specific guidelines for B.Tech students in preparing their project report. It should emphasize the importance of the report for fulfilling the requirements of the Bachelor of Technology degree and highlight the need for uniform format, readability, and ethical standards. It should set the tone for the rest of the report and provide an overview of the project.
Goodbye!
