anindya-hf-2002 commited on
Commit
b6d19d9
·
verified ·
1 Parent(s): 53a8618

Upload 21 files

Browse files
.gitignore ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ data/
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
app.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from src.vectorstore.pinecone_db import ingest_data, get_retriever, load_documents, process_chunks, save_to_parquet
3
+ from pinecone import Pinecone
4
+ from langchain_openai import ChatOpenAI
5
+ from langchain_ollama import ChatOllama
6
+ from src.agents.workflow import run_adaptive_rag
7
+ from langgraph.pregel import GraphRecursionError
8
+ import tempfile
9
+ import os
10
+ import time
11
+ from pathlib import Path
12
+
13
+ # Page config
14
+ st.set_page_config(page_title="RAG Chat Assistant", layout="wide")
15
+
16
+ # Initialize session states
17
+ if "messages" not in st.session_state:
18
+ st.session_state.messages = []
19
+ if "documents_processed" not in st.session_state:
20
+ st.session_state.documents_processed = False
21
+ if "retriever" not in st.session_state:
22
+ st.session_state.retriever = None
23
+ if "pinecone_client" not in st.session_state:
24
+ st.session_state.pinecone_client = None
25
+
26
+ def initialize_pinecone(api_key):
27
+ """Initialize Pinecone client with API key."""
28
+ try:
29
+ return Pinecone(api_key=api_key)
30
+ except Exception as e:
31
+ st.error(f"Error initializing Pinecone: {str(e)}")
32
+ return None
33
+
34
+ def initialize_llm(llm_option, openai_api_key=None):
35
+ """Initialize LLM based on user selection."""
36
+ if llm_option == "OpenAI":
37
+ if not openai_api_key:
38
+ st.sidebar.warning("Please enter OpenAI API key.")
39
+ return None
40
+ return ChatOpenAI(api_key=openai_api_key, model="gpt-3.5-turbo")
41
+ else:
42
+ return ChatOllama(model="llama3.2", temperature=0.3, num_predict=512, top_p=0.6)
43
+
44
+ def clear_pinecone_index(pc, index_name="vector-index"):
45
+ """Clear the Pinecone index."""
46
+ try:
47
+ pc.delete_index(index_name)
48
+ st.session_state.documents_processed = False
49
+ st.session_state.retriever = None
50
+ st.success("Database cleared successfully!")
51
+ except Exception as e:
52
+ st.error(f"Error clearing database: {str(e)}")
53
+
54
+ def process_documents(uploaded_files, pc):
55
+ """Process uploaded documents and store in Pinecone."""
56
+ if not uploaded_files:
57
+ st.warning("Please upload at least one document.")
58
+ return False
59
+
60
+ with st.spinner("Processing documents..."):
61
+ temp_dir = tempfile.mkdtemp()
62
+ file_paths = []
63
+ markdown_path = Path(temp_dir) / "combined.md"
64
+ parquet_path = Path(temp_dir) / "documents.parquet"
65
+
66
+ for uploaded_file in uploaded_files:
67
+ file_path = Path(temp_dir) / uploaded_file.name
68
+ with open(file_path, "wb") as f:
69
+ f.write(uploaded_file.getvalue())
70
+ file_paths.append(str(file_path))
71
+
72
+ try:
73
+ markdown_path = load_documents(file_paths, output_path=markdown_path)
74
+ chunks = process_chunks(markdown_path, chunk_size=256, threshold=0.6)
75
+ print(f"Processed chunks: {chunks}")
76
+ parquet_path = save_to_parquet(chunks, parquet_path)
77
+
78
+ ingest_data(
79
+ pc=pc,
80
+ parquet_path=parquet_path,
81
+ text_column="text",
82
+ pinecone_client=pc
83
+ )
84
+
85
+ st.session_state.retriever = get_retriever(pc)
86
+ st.session_state.documents_processed = True
87
+
88
+ return True
89
+
90
+ except Exception as e:
91
+ st.error(f"Error processing documents: {str(e)}")
92
+ return False
93
+ finally:
94
+ for file_path in file_paths:
95
+ try:
96
+ os.remove(file_path)
97
+ except:
98
+ pass
99
+ try:
100
+ os.rmdir(temp_dir)
101
+ except:
102
+ pass
103
+
104
+ def run_rag_with_streaming(retriever, question, llm, enable_web_search=False):
105
+ """Run RAG workflow and yield streaming results."""
106
+ try:
107
+ response = run_adaptive_rag(
108
+ retriever=retriever,
109
+ question=question,
110
+ llm=llm,
111
+ top_k=5,
112
+ enable_websearch=enable_web_search
113
+ )
114
+
115
+ for word in response.split():
116
+ yield word + " "
117
+ time.sleep(0.03)
118
+
119
+ except GraphRecursionError:
120
+ response = "I apologize, but I cannot find a sufficient answer to your question in the provided documents. Please try rephrasing your question or ask something else about the content of the documents."
121
+ for word in response.split():
122
+ yield word + " "
123
+ time.sleep(0.03)
124
+
125
+ except Exception as e:
126
+ yield f"I encountered an error while processing your question: {str(e)}"
127
+
128
+ def main():
129
+ st.title("🤖 RAG Chat Assistant")
130
+
131
+ # Sidebar configuration
132
+ st.sidebar.title("Configuration")
133
+
134
+ # API Keys in sidebar
135
+ pinecone_api_key = st.sidebar.text_input("Enter Pinecone API Key:", type="password")
136
+
137
+ # LLM Selection
138
+ llm_option = st.sidebar.selectbox("Select Language Model:", ["OpenAI", "Ollama"])
139
+ openai_api_key = None
140
+ if llm_option == "OpenAI":
141
+ openai_api_key = st.sidebar.text_input("Enter OpenAI API Key:", type="password")
142
+
143
+ # Web search tool in sidebar
144
+ st.sidebar.markdown("---")
145
+ st.sidebar.markdown("### Tools")
146
+ use_web_search = st.sidebar.checkbox("Web search")
147
+
148
+ # Initialize Pinecone
149
+ if pinecone_api_key:
150
+ if st.session_state.pinecone_client is None:
151
+ st.session_state.pinecone_client = initialize_pinecone(pinecone_api_key)
152
+ else:
153
+ st.sidebar.warning("Please enter Pinecone API key to continue.")
154
+ st.stop()
155
+
156
+ # Initialize LLM
157
+ llm = initialize_llm(llm_option, openai_api_key)
158
+ if llm is None:
159
+ st.stop()
160
+
161
+ # Clear DB Button
162
+ st.sidebar.markdown("---")
163
+ if st.sidebar.button("Clear Database"):
164
+ if st.session_state.pinecone_client:
165
+ clear_pinecone_index(st.session_state.pinecone_client)
166
+ st.session_state.messages = [] # Clear chat history
167
+
168
+ # Document upload section
169
+ if not st.session_state.documents_processed:
170
+ st.header("📄 Document Upload")
171
+ uploaded_files = st.file_uploader(
172
+ "Upload your documents",
173
+ accept_multiple_files=True,
174
+ type=["pdf", "docx", "txt", "pptx", "md"]
175
+ )
176
+
177
+ if st.button("Process Documents"):
178
+ if process_documents(uploaded_files, st.session_state.pinecone_client):
179
+ st.success("Documents processed successfully!")
180
+
181
+ # Chat interface
182
+ if st.session_state.documents_processed:
183
+ st.header("💬 Chat")
184
+
185
+ # Display chat history
186
+ for message in st.session_state.messages:
187
+ with st.chat_message(message["role"]):
188
+ st.markdown(message["content"])
189
+
190
+ # Chat input
191
+ if prompt := st.chat_input("Ask a question about your documents..."):
192
+ # Display user message
193
+ with st.chat_message("user"):
194
+ if use_web_search:
195
+ st.markdown(prompt.strip() + ''' :red-background[Web Search]''')
196
+ else:
197
+ st.markdown(prompt)
198
+ st.session_state.messages.append({"role": "user", "content": prompt})
199
+
200
+ # Generate and stream response
201
+ with st.chat_message("assistant"):
202
+ response_container = st.empty()
203
+ full_response = ""
204
+
205
+ # Show spinner while processing
206
+ with st.spinner("Thinking..."):
207
+ # Stream the response
208
+ for chunk in run_rag_with_streaming(
209
+ retriever=st.session_state.retriever,
210
+ question=prompt,
211
+ llm=llm,
212
+ enable_web_search=use_web_search
213
+ ):
214
+ full_response += chunk
215
+ response_container.markdown(full_response + "▌")
216
+
217
+ # Final update without cursor
218
+ response_container.markdown(full_response)
219
+
220
+ # Save to chat history
221
+ st.session_state.messages.append(
222
+ {"role": "assistant", "content": full_response}
223
+ )
224
+
225
+ if __name__ == "__main__":
226
+ main()
main.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pinecone import Pinecone
2
+ from langchain_openai import ChatOpenAI
3
+ from src.vectorstore.pinecone_db import ingest_data, get_retriever, load_documents, process_chunks, save_to_parquet
4
+ from src.agents.workflow import run_adaptive_rag
5
+ from langgraph.pregel import GraphRecursionError
6
+ import tempfile
7
+ import os
8
+ from pathlib import Path
9
+
10
+ def initialize_pinecone(api_key):
11
+ """Initialize Pinecone client with API key."""
12
+ try:
13
+ return Pinecone(api_key=api_key)
14
+ except Exception as e:
15
+ print(f"Error initializing Pinecone: {str(e)}")
16
+ return None
17
+
18
+ def initialize_llm(api_key):
19
+ """Initialize OpenAI LLM."""
20
+ try:
21
+ return ChatOpenAI(api_key=api_key, model="gpt-3.5-turbo")
22
+ except Exception as e:
23
+ print(f"Error initializing OpenAI: {str(e)}")
24
+ return None
25
+
26
+ def process_documents(file_paths, pc):
27
+ """Process documents and store in Pinecone."""
28
+ if not file_paths:
29
+ print("No documents provided.")
30
+ return None
31
+
32
+ print("Processing documents...")
33
+ temp_dir = tempfile.mkdtemp()
34
+ markdown_path = Path(temp_dir) / "combined.md"
35
+ parquet_path = Path(temp_dir) / "documents.parquet"
36
+
37
+ try:
38
+ markdown_path = load_documents(file_paths, output_path=markdown_path)
39
+ chunks = process_chunks(markdown_path, chunk_size=256, threshold=0.6)
40
+ parquet_path = save_to_parquet(chunks, parquet_path)
41
+
42
+ ingest_data(
43
+ pc=pc,
44
+ parquet_path=parquet_path,
45
+ text_column="text",
46
+ pinecone_client=pc
47
+ )
48
+
49
+ retriever = get_retriever(pc)
50
+ print("Documents processed successfully!")
51
+ return retriever
52
+
53
+ except Exception as e:
54
+ print(f"Error processing documents: {str(e)}")
55
+ return None
56
+ finally:
57
+ try:
58
+ os.remove(markdown_path)
59
+ os.remove(parquet_path)
60
+ os.rmdir(temp_dir)
61
+ except:
62
+ pass
63
+
64
+ def main():
65
+ # Get API keys
66
+ pinecone_api_key = input("Enter your Pinecone API key: ")
67
+ openai_api_key = input("Enter your OpenAI API key: ")
68
+
69
+ # Initialize clients
70
+ pc = initialize_pinecone(pinecone_api_key)
71
+ if not pc:
72
+ return
73
+
74
+ llm = initialize_llm(openai_api_key)
75
+ if not llm:
76
+ return
77
+
78
+ # Get document paths
79
+ print("\nEnter the paths to your documents (one per line).")
80
+ print("Press Enter twice when done:")
81
+
82
+ file_paths = []
83
+ while True:
84
+ path = input()
85
+ if not path:
86
+ break
87
+ if os.path.exists(path):
88
+ file_paths.append(path)
89
+ else:
90
+ print(f"Warning: File {path} does not exist")
91
+
92
+ # Process documents
93
+ retriever = process_documents(file_paths, pc)
94
+ if not retriever:
95
+ return
96
+
97
+ # Chat loop
98
+ print("\nChat with your documents! Type 'exit' to quit.")
99
+ while True:
100
+ question = input("\nYou: ")
101
+
102
+ if question.lower() == 'exit':
103
+ print("Goodbye!")
104
+ break
105
+
106
+ try:
107
+ response = run_adaptive_rag(
108
+ retriever=retriever,
109
+ question=question,
110
+ llm=llm,
111
+ top_k=5,
112
+ enable_websearch=False
113
+ )
114
+ print("\nAssistant:", response)
115
+
116
+ except GraphRecursionError:
117
+ print("\nAssistant: I cannot find a sufficient answer to your question in the provided documents. Please try rephrasing your question or ask something else about the content of the documents.")
118
+
119
+ except Exception as e:
120
+ print(f"\nError: {str(e)}")
121
+
122
+ if __name__ == "__main__":
123
+ main()
notebooks/adaptive_rag.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/app.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from src.vectorstore.pinecone_db import ingest_data, get_retriever, load_documents, process_chunks, save_to_parquet
3
+ from pinecone import Pinecone
4
+ from langchain_openai import ChatOpenAI
5
+ from langchain_ollama import ChatOllama
6
+ from src.agents.workflow import run_adaptive_rag
7
+ from langgraph.pregel import GraphRecursionError
8
+ import tempfile
9
+ import os
10
+ import time
11
+ from pathlib import Path
12
+
13
+ # Page config
14
+ st.set_page_config(page_title="RAG Chat Assistant", layout="wide")
15
+
16
+ # Initialize session states
17
+ if "messages" not in st.session_state:
18
+ st.session_state.messages = []
19
+ if "documents_processed" not in st.session_state:
20
+ st.session_state.documents_processed = False
21
+ if "retriever" not in st.session_state:
22
+ st.session_state.retriever = None
23
+ if "pinecone_client" not in st.session_state:
24
+ st.session_state.pinecone_client = None
25
+
26
+ def initialize_pinecone(api_key):
27
+ """Initialize Pinecone client with API key."""
28
+ try:
29
+ return Pinecone(api_key=api_key)
30
+ except Exception as e:
31
+ st.error(f"Error initializing Pinecone: {str(e)}")
32
+ return None
33
+
34
+ def initialize_llm(llm_option, openai_api_key=None):
35
+ """Initialize LLM based on user selection."""
36
+ if llm_option == "OpenAI":
37
+ if not openai_api_key:
38
+ st.sidebar.warning("Please enter OpenAI API key.")
39
+ return None
40
+ return ChatOpenAI(api_key=openai_api_key, model="gpt-3.5-turbo")
41
+ else:
42
+ return ChatOllama(model="llama3.2", temperature=0.3, num_predict=512, top_p=0.6)
43
+
44
+ def clear_pinecone_index(pc, index_name="vector-index"):
45
+ """Clear the Pinecone index."""
46
+ try:
47
+ pc.delete_index(index_name)
48
+ st.session_state.documents_processed = False
49
+ st.session_state.retriever = None
50
+ st.success("Database cleared successfully!")
51
+ except Exception as e:
52
+ st.error(f"Error clearing database: {str(e)}")
53
+
54
+ def process_documents(uploaded_files, pc):
55
+ """Process uploaded documents and store in Pinecone."""
56
+ if not uploaded_files:
57
+ st.warning("Please upload at least one document.")
58
+ return False
59
+
60
+ with st.spinner("Processing documents..."):
61
+ temp_dir = tempfile.mkdtemp()
62
+ file_paths = []
63
+ markdown_path = Path(temp_dir) / "combined.md"
64
+ parquet_path = Path(temp_dir) / "documents.parquet"
65
+
66
+ for uploaded_file in uploaded_files:
67
+ file_path = Path(temp_dir) / uploaded_file.name
68
+ with open(file_path, "wb") as f:
69
+ f.write(uploaded_file.getvalue())
70
+ file_paths.append(str(file_path))
71
+
72
+ try:
73
+ markdown_path = load_documents(file_paths, output_path=markdown_path)
74
+ chunks = process_chunks(markdown_path, chunk_size=256, threshold=0.6)
75
+ print(f"Processed chunks: {chunks}")
76
+ parquet_path = save_to_parquet(chunks, parquet_path)
77
+
78
+ ingest_data(
79
+ pc=pc,
80
+ parquet_path=parquet_path,
81
+ text_column="text",
82
+ pinecone_client=pc
83
+ )
84
+
85
+ st.session_state.retriever = get_retriever(pc)
86
+ st.session_state.documents_processed = True
87
+
88
+ return True
89
+
90
+ except Exception as e:
91
+ st.error(f"Error processing documents: {str(e)}")
92
+ return False
93
+ finally:
94
+ for file_path in file_paths:
95
+ try:
96
+ os.remove(file_path)
97
+ except:
98
+ pass
99
+ try:
100
+ os.rmdir(temp_dir)
101
+ except:
102
+ pass
103
+
104
+ def run_rag_with_streaming(retriever, question, llm, enable_web_search=False):
105
+ """Run RAG workflow and yield streaming results."""
106
+ try:
107
+ response = run_adaptive_rag(
108
+ retriever=retriever,
109
+ question=question,
110
+ llm=llm,
111
+ top_k=5,
112
+ enable_websearch=enable_web_search
113
+ )
114
+
115
+ for word in response.split():
116
+ yield word + " "
117
+ time.sleep(0.03)
118
+
119
+ except GraphRecursionError:
120
+ response = "I apologize, but I cannot find a sufficient answer to your question in the provided documents. Please try rephrasing your question or ask something else about the content of the documents."
121
+ for word in response.split():
122
+ yield word + " "
123
+ time.sleep(0.03)
124
+
125
+ except Exception as e:
126
+ yield f"I encountered an error while processing your question: {str(e)}"
127
+
128
+ def main():
129
+ st.title("🤖 RAG Chat Assistant")
130
+
131
+ # Sidebar configuration
132
+ st.sidebar.title("Configuration")
133
+
134
+ # API Keys in sidebar
135
+ pinecone_api_key = st.sidebar.text_input("Enter Pinecone API Key:", type="password")
136
+
137
+ # LLM Selection
138
+ llm_option = st.sidebar.selectbox("Select Language Model:", ["OpenAI", "Ollama"])
139
+ openai_api_key = None
140
+ if llm_option == "OpenAI":
141
+ openai_api_key = st.sidebar.text_input("Enter OpenAI API Key:", type="password")
142
+
143
+ # Web search tool in sidebar
144
+ st.sidebar.markdown("---")
145
+ st.sidebar.markdown("### Tools")
146
+ use_web_search = st.sidebar.checkbox("Web search")
147
+
148
+ # Initialize Pinecone
149
+ if pinecone_api_key:
150
+ if st.session_state.pinecone_client is None:
151
+ st.session_state.pinecone_client = initialize_pinecone(pinecone_api_key)
152
+ else:
153
+ st.sidebar.warning("Please enter Pinecone API key to continue.")
154
+ st.stop()
155
+
156
+ # Initialize LLM
157
+ llm = initialize_llm(llm_option, openai_api_key)
158
+ if llm is None:
159
+ st.stop()
160
+
161
+ # Clear DB Button
162
+ st.sidebar.markdown("---")
163
+ if st.sidebar.button("Clear Database"):
164
+ if st.session_state.pinecone_client:
165
+ clear_pinecone_index(st.session_state.pinecone_client)
166
+ st.session_state.messages = [] # Clear chat history
167
+
168
+ # Document upload section
169
+ if not st.session_state.documents_processed:
170
+ st.header("📄 Document Upload")
171
+ uploaded_files = st.file_uploader(
172
+ "Upload your documents",
173
+ accept_multiple_files=True,
174
+ type=["pdf", "docx", "txt", "pptx", "md"]
175
+ )
176
+
177
+ if st.button("Process Documents"):
178
+ if process_documents(uploaded_files, st.session_state.pinecone_client):
179
+ st.success("Documents processed successfully!")
180
+
181
+ # Chat interface
182
+ if st.session_state.documents_processed:
183
+ st.header("💬 Chat")
184
+
185
+ # Display chat history
186
+ for message in st.session_state.messages:
187
+ with st.chat_message(message["role"]):
188
+ st.markdown(message["content"])
189
+
190
+ # Chat input
191
+ if prompt := st.chat_input("Ask a question about your documents..."):
192
+ # Display user message
193
+ with st.chat_message("user"):
194
+ if use_web_search:
195
+ st.markdown(prompt.strip() + ''' :red-background[Web Search]''')
196
+ else:
197
+ st.markdown(prompt)
198
+ st.session_state.messages.append({"role": "user", "content": prompt})
199
+
200
+ # Generate and stream response
201
+ with st.chat_message("assistant"):
202
+ response_container = st.empty()
203
+ full_response = ""
204
+
205
+ # Show spinner while processing
206
+ with st.spinner("Thinking..."):
207
+ # Stream the response
208
+ for chunk in run_rag_with_streaming(
209
+ retriever=st.session_state.retriever,
210
+ question=prompt,
211
+ llm=llm,
212
+ enable_web_search=use_web_search
213
+ ):
214
+ full_response += chunk
215
+ response_container.markdown(full_response + "▌")
216
+
217
+ # Final update without cursor
218
+ response_container.markdown(full_response)
219
+
220
+ # Save to chat history
221
+ st.session_state.messages.append(
222
+ {"role": "assistant", "content": full_response}
223
+ )
224
+
225
+ if __name__ == "__main__":
226
+ main()
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain-community
2
+ tiktoken
3
+ langchain-openai
4
+ langchainhub
5
+ chromadb
6
+ langchain
7
+ langgraph
8
+ duckduckgo-search
9
+ langchain-groq
10
+ langchain-huggingface
11
+ sentence_transformers
12
+ tavily-python
13
+ langchain-ollama
14
+ ollama
15
+ crawl4ai
16
+ docling
17
+ easyocr
18
+ FlagEmbedding
19
+ chonkie[semantic]
20
+ pinecone
21
+ streamlit
src/__init__.py ADDED
File without changes
src/agents/__init__.py ADDED
File without changes
src/agents/router.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.prompts import ChatPromptTemplate
2
+ from langchain_ollama import ChatOllama
3
+ from pydantic import BaseModel, Field
4
+ from typing import Literal
5
+
6
+ class RouteQuery(BaseModel):
7
+ """Route a user query to the most relevant datasource."""
8
+ datasource: Literal["vectorstore", "web_search"] = Field(
9
+ description="Route question to web search or vectorstore retrieval"
10
+ )
11
+
12
+ def create_query_router():
13
+ """
14
+ Create a query router to determine data source for a given question.
15
+
16
+ Returns:
17
+ Callable: Query router function
18
+ """
19
+ # LLM with function call
20
+ llm = ChatOllama(model = "llama3.2", temperature = 0.1, num_predict = 256, top_p=0.5)
21
+ structured_llm_router = llm.with_structured_output(RouteQuery)
22
+
23
+ # Prompt
24
+ system = """You are an expert at routing a user question to a vectorstore or web search.
25
+ The vectorstore contains documents related to agents, prompt engineering, and adversarial attacks.
26
+ Use the vectorstore for questions on these topics. Otherwise, use web-search."""
27
+
28
+ route_prompt = ChatPromptTemplate.from_messages([
29
+ ("system", system),
30
+ ("human", "{question}"),
31
+ ])
32
+
33
+ return route_prompt | structured_llm_router
34
+
35
+ def route_query(question: str):
36
+ """
37
+ Route a specific query to its appropriate data source.
38
+
39
+ Args:
40
+ question (str): User's input question
41
+
42
+ Returns:
43
+ str: Recommended data source
44
+ """
45
+ router = create_query_router()
46
+ result = router.invoke({"question": question})
47
+ return result.datasource
48
+
49
+ if __name__ == "__main__":
50
+ # Example usage
51
+ test_questions = [
52
+ "Who will the Bears draft first in the NFL draft?",
53
+ "What are the types of agent memory?"
54
+ ]
55
+
56
+ for q in test_questions:
57
+ source = route_query(q)
58
+ print(f"Question: {q}")
59
+ print(f"Routed to: {source}\n")
src/agents/state.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, TypedDict
2
+ from langchain_core.documents.base import Document
3
+
4
+ class GraphState(TypedDict):
5
+ """
6
+ Represents the state of our adaptive RAG graph.
7
+
8
+ Attributes:
9
+ question (str): Original user question
10
+ generation (str, optional): LLM generated answer
11
+ documents (List[Document], optional): Retrieved or searched documents
12
+ """
13
+ question: str
14
+ generation: str | None
15
+ documents: List[Document]
src/agents/workflow.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langgraph.graph import END, StateGraph, START
2
+ from langchain_core.prompts import PromptTemplate
3
+ from agents.state import GraphState
4
+ # from agents.router import route_query
5
+ import asyncio
6
+ from vectorstore.pinecone_db import get_retriever
7
+ from tools.web_search import AdvancedWebCrawler
8
+ from llm.graders import (
9
+ grade_document_relevance,
10
+ check_hallucination,
11
+ grade_answer_quality
12
+ )
13
+ from langchain_core.output_parsers import StrOutputParser
14
+ from llm.query_rewriter import rewrite_query
15
+ from langchain_ollama import ChatOllama
16
+
17
+ def perform_web_search(question: str):
18
+ """
19
+ Perform web search using the AdvancedWebCrawler.
20
+
21
+ Args:
22
+ question (str): User's input question
23
+
24
+ Returns:
25
+ List: Web search results
26
+ """
27
+ # Initialize web crawler
28
+ crawler = AdvancedWebCrawler(
29
+ max_search_results=5,
30
+ word_count_threshold=50,
31
+ content_filter_type='f',
32
+ filter_threshold=0.48
33
+ )
34
+ results = asyncio.run(crawler.search_and_crawl(question))
35
+
36
+ return results
37
+
38
+
39
+ def create_adaptive_rag_workflow(retriever, llm, top_k=5, enable_websearch=False):
40
+ """
41
+ Create the adaptive RAG workflow graph.
42
+
43
+ Args:
44
+ retriever: Vector store retriever
45
+
46
+ Returns:
47
+ Compiled LangGraph workflow
48
+ """
49
+ def retrieve(state: GraphState):
50
+ """Retrieve documents from vectorstore."""
51
+ print("---RETRIEVE---")
52
+ question = state['question']
53
+ documents = retriever.invoke(question, top_k)
54
+ print(f"Retrieved {len(documents)} documents.")
55
+ print(documents)
56
+ return {"documents": documents, "question": question}
57
+
58
+ def route_to_datasource(state: GraphState):
59
+ """Route question to web search or vectorstore."""
60
+ print("---ROUTE QUESTION---")
61
+ # question = state['question']
62
+ # source = route_query(question)
63
+
64
+ if enable_websearch:
65
+ print("---ROUTE TO WEB SEARCH---")
66
+ return "web_search"
67
+ else:
68
+ print("---ROUTE TO RAG---")
69
+ return "vectorstore"
70
+
71
+ def generate_answer(state: GraphState):
72
+ """Generate answer using retrieved documents."""
73
+ print("---GENERATE---")
74
+ question = state['question']
75
+ documents = state['documents']
76
+
77
+ # Prepare context
78
+ context = "\n\n".join([doc["page_content"] for doc in documents])
79
+ prompt_template = PromptTemplate.from_template("""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
80
+ Question: {question}
81
+ Context: {context}
82
+ Answer:""")
83
+ # Generate answer
84
+ rag_chain = prompt_template | llm | StrOutputParser()
85
+
86
+ generation = rag_chain.invoke({"context": context, "question": question})
87
+
88
+ return {"generation": generation, "documents": documents, "question": question}
89
+
90
+ def grade_documents(state: GraphState):
91
+ """Filter relevant documents."""
92
+ print("---GRADE DOCUMENTS---")
93
+ question = state['question']
94
+ documents = state['documents']
95
+
96
+ # Filter documents
97
+ filtered_docs = []
98
+ for doc in documents:
99
+ score = grade_document_relevance(question, doc["page_content"], llm)
100
+ if score == "yes":
101
+ filtered_docs.append(doc)
102
+
103
+ return {"documents": filtered_docs, "question": question}
104
+
105
+ def web_search(state: GraphState):
106
+ """Perform web search."""
107
+ print("---WEB SEARCH---")
108
+ question = state['question']
109
+
110
+ # Perform web search
111
+ results = perform_web_search(question)
112
+ web_documents = [
113
+ {
114
+ "page_content": result['content'],
115
+ "metadata": {"source": result['url']}
116
+ } for result in results
117
+ ]
118
+
119
+ return {"documents": web_documents, "question": question}
120
+
121
+ def check_generation_quality(state: GraphState):
122
+ """Check the quality of generated answer."""
123
+ print("---ASSESS GENERATION---")
124
+ question = state['question']
125
+ documents = state['documents']
126
+ generation = state['generation']
127
+
128
+
129
+ print("---Generation is not hallucinated.---")
130
+ # Check answer quality
131
+ quality_score = grade_answer_quality(question, generation, llm)
132
+ if quality_score == "yes":
133
+ print("---Answer quality is good.---")
134
+ else:
135
+ print("---Answer quality is poor.---")
136
+ return "end" if quality_score == "yes" else "rewrite"
137
+
138
+ # Create workflow
139
+ workflow = StateGraph(GraphState)
140
+
141
+ # Add nodes
142
+ workflow.add_node("vectorstore", retrieve)
143
+ workflow.add_node("web_search", web_search)
144
+ workflow.add_node("grade_documents", grade_documents)
145
+ workflow.add_node("generate", generate_answer)
146
+ workflow.add_node("rewrite_query", lambda state: {
147
+ "question": rewrite_query(state['question'], llm),
148
+ "documents": [],
149
+ "generation": None
150
+ })
151
+
152
+ # Define edges
153
+ workflow.add_conditional_edges(
154
+ START,
155
+ route_to_datasource,
156
+ {
157
+ "web_search": "web_search",
158
+ "vectorstore": "vectorstore"
159
+ }
160
+ )
161
+
162
+ workflow.add_edge("web_search", "generate")
163
+ workflow.add_edge("vectorstore", "grade_documents")
164
+
165
+ workflow.add_conditional_edges(
166
+ "grade_documents",
167
+ lambda state: "generate" if state['documents'] else "rewrite_query"
168
+ )
169
+
170
+ workflow.add_edge("rewrite_query", "vectorstore")
171
+
172
+ workflow.add_conditional_edges(
173
+ "generate",
174
+ check_generation_quality,
175
+ {
176
+ "end": END,
177
+ "regenerate": "generate",
178
+ "rewrite": "rewrite_query"
179
+ }
180
+ )
181
+
182
+ # Compile the workflow
183
+ app = workflow.compile()
184
+ return app
185
+
186
+ def run_adaptive_rag(retriever, question: str, llm, top_k=5, enable_websearch=False):
187
+ """
188
+ Run the adaptive RAG workflow for a given question.
189
+
190
+ Args:
191
+ retriever: Vector store retriever
192
+ question (str): User's input question
193
+
194
+ Returns:
195
+ str: Generated answer
196
+ """
197
+ # Create workflow
198
+ workflow = create_adaptive_rag_workflow(retriever, llm, top_k, enable_websearch=enable_websearch)
199
+
200
+ # Run workflow
201
+ final_state = None
202
+ for output in workflow.stream({"question": question}, config={"recursion_limit": 5}):
203
+ for key, value in output.items():
204
+ print(f"Node '{key}':")
205
+ # Optionally print state details
206
+ # print(value)
207
+ final_state = value
208
+
209
+ return final_state.get('generation', 'No answer could be generated.')
210
+
211
+ if __name__ == "__main__":
212
+ # Example usage
213
+ from vectorstore.pinecone_db import PINECONE_API_KEY, ingest_data, get_retriever, load_documents, process_chunks, save_to_parquet
214
+ from pinecone import Pinecone
215
+
216
+ # Load and prepare documents
217
+ pc = Pinecone(api_key=PINECONE_API_KEY)
218
+
219
+ # Define input files
220
+ file_paths=[
221
+ # './data/2404.19756v1.pdf',
222
+ # './data/OD429347375590223100.pdf',
223
+ # './data/Project Report Format.docx',
224
+ './data/UNIT 2 GENDER BASED VIOLENCE.pptx'
225
+ ]
226
+
227
+ # Process pipeline
228
+ try:
229
+ # Step 1: Load and combine documents
230
+ print("Loading documents...")
231
+ markdown_path = load_documents(file_paths)
232
+
233
+ # Step 2: Process into chunks with embeddings
234
+ print("Processing chunks...")
235
+ chunks = process_chunks(markdown_path)
236
+
237
+ # Step 3: Save to Parquet
238
+ print("Saving to Parquet...")
239
+ parquet_path = save_to_parquet(chunks)
240
+
241
+ # Step 4: Ingest into Pinecone
242
+ print("Ingesting into Pinecone...")
243
+ ingest_data(pc,
244
+ parquet_path=parquet_path,
245
+ text_column="text",
246
+ pinecone_client=pc,
247
+ )
248
+
249
+ # Step 5: Test retrieval
250
+ print("\nTesting retrieval...")
251
+ retriever = get_retriever(
252
+ pinecone_client=pc,
253
+ index_name="vector-index",
254
+ namespace="rag"
255
+ )
256
+
257
+ except Exception as e:
258
+ print(f"Error in pipeline: {str(e)}")
259
+
260
+ llm = ChatOllama(model = "llama3.2", temperature = 0.1, num_predict = 256, top_p=0.5)
261
+
262
+ # Test questions
263
+ test_questions = [
264
+ # "What are the key components of AI agent memory?",
265
+ # "Explain prompt engineering techniques",
266
+ # "What are recent advancements in adversarial attacks on LLMs?"
267
+ "what are the trending papers that are published in NeurIPS 2024?"
268
+ ]
269
+
270
+ # Run workflow for each test question
271
+ for question in test_questions:
272
+ print(f"\n--- Processing Question: {question} ---")
273
+ answer = run_adaptive_rag(retriever, question, llm)
274
+ print("\nFinal Answer:", answer)
src/data_processing/__init__.py ADDED
File without changes
src/data_processing/chunker.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import numpy as np
3
+ from chonkie.embeddings import BaseEmbeddings
4
+ from FlagEmbedding import BGEM3FlagModel
5
+ from chonkie import SDPMChunker as SDPMChunker
6
+
7
+ class BGEM3Embeddings(BaseEmbeddings):
8
+ def __init__(self, model_name):
9
+ self.model = BGEM3FlagModel(model_name, use_fp16=True)
10
+ self.task = "separation"
11
+
12
+ @property
13
+ def dimension(self):
14
+ return 1024
15
+
16
+ def embed(self, text: str):
17
+ e = self.model.encode([text], return_dense=True, return_sparse=False, return_colbert_vecs=False)['dense_vecs']
18
+ # print(e)
19
+ return e
20
+
21
+ def embed_batch(self, texts: List[str]):
22
+ embeddings = self.model.encode(texts, return_dense=True, return_sparse=False, return_colbert_vecs=False
23
+ )
24
+ # print(embeddings['dense_vecs'])
25
+ return embeddings['dense_vecs']
26
+
27
+ def count_tokens(self, text: str):
28
+ l = len(self.model.tokenizer.encode(text))
29
+ # print(l)
30
+ return l
31
+
32
+ def count_tokens_batch(self, texts: List[str]):
33
+ encodings = self.model.tokenizer(texts)
34
+ # print([len(enc) for enc in encodings["input_ids"]])
35
+ return [len(enc) for enc in encodings["input_ids"]]
36
+
37
+ def get_tokenizer_or_token_counter(self):
38
+ return self.model.tokenizer
39
+
40
+ def similarity(self, u: "np.ndarray", v: "np.ndarray"):
41
+ """Compute cosine similarity between two embeddings."""
42
+ s = ([email protected])#.item()
43
+ # print(s)
44
+ return s
45
+
46
+ @classmethod
47
+ def is_available(cls):
48
+ return True
49
+
50
+ def __repr__(self):
51
+ return "bgem3"
52
+
53
+
54
+ def main():
55
+ # Initialize the BGE M3 embeddings model
56
+ embedding_model = BGEM3Embeddings(
57
+ model_name="BAAI/bge-m3"
58
+ )
59
+
60
+ # Initialize the SDPM chunker
61
+ chunker = SDPMChunker(
62
+ embedding_model=embedding_model,
63
+ chunk_size=256,
64
+ threshold=0.7,
65
+ skip_window=2
66
+ )
67
+
68
+ with open('./output.md', 'r') as file:
69
+ text = file.read()
70
+
71
+ # Generate chunks
72
+ chunks = chunker.chunk(text)
73
+
74
+ # Print the chunks
75
+ for i, chunk in enumerate(chunks, 1):
76
+ print(f"\nChunk {i}:")
77
+ print(f"Text: {chunk.text}")
78
+ print(f"Token count: {chunk.token_count}")
79
+ print(f"Start index: {chunk.start_index}")
80
+ print(f"End index: {chunk.end_index}")
81
+ print(f"no of sentences: {len(chunk.sentences)}")
82
+ print("-" * 80)
83
+
84
+ if __name__ == "__main__":
85
+ main()
src/data_processing/loader.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import List, Union
3
+ import logging
4
+ from dataclasses import dataclass
5
+
6
+ from langchain_core.documents import Document as LCDocument
7
+ from langchain_core.document_loaders import BaseLoader
8
+ from docling.document_converter import DocumentConverter, PdfFormatOption
9
+ from docling.datamodel.base_models import InputFormat, ConversionStatus
10
+ from docling.datamodel.pipeline_options import (
11
+ PdfPipelineOptions,
12
+ EasyOcrOptions
13
+ )
14
+
15
+ logging.basicConfig(level=logging.INFO)
16
+ _log = logging.getLogger(__name__)
17
+
18
+ @dataclass
19
+ class ProcessingResult:
20
+ """Store results of document processing"""
21
+ success_count: int = 0
22
+ failure_count: int = 0
23
+ partial_success_count: int = 0
24
+ failed_files: List[str] = None
25
+
26
+ def __post_init__(self):
27
+ if self.failed_files is None:
28
+ self.failed_files = []
29
+
30
+ class MultiFormatDocumentLoader(BaseLoader):
31
+ """Loader for multiple document formats that converts to LangChain documents"""
32
+
33
+ def __init__(
34
+ self,
35
+ file_paths: Union[str, List[str]],
36
+ enable_ocr: bool = True,
37
+ enable_tables: bool = True
38
+ ):
39
+ self._file_paths = [file_paths] if isinstance(file_paths, str) else file_paths
40
+ self._enable_ocr = enable_ocr
41
+ self._enable_tables = enable_tables
42
+ self._converter = self._setup_converter()
43
+
44
+ def _setup_converter(self):
45
+ """Set up the document converter with appropriate options"""
46
+ # Configure pipeline options
47
+ pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=False, ocr_options=EasyOcrOptions(
48
+ force_full_page_ocr=True
49
+ ))
50
+ if self._enable_ocr:
51
+ pipeline_options.do_ocr = True
52
+ if self._enable_tables:
53
+ pipeline_options.do_table_structure = True
54
+ pipeline_options.table_structure_options.do_cell_matching = True
55
+
56
+ # Create converter with supported formats
57
+ return DocumentConverter(
58
+ allowed_formats=[
59
+ InputFormat.PDF,
60
+ InputFormat.IMAGE,
61
+ InputFormat.DOCX,
62
+ InputFormat.HTML,
63
+ InputFormat.PPTX,
64
+ InputFormat.ASCIIDOC,
65
+ InputFormat.MD,
66
+ ],
67
+ format_options={
68
+ InputFormat.PDF: PdfFormatOption(
69
+ pipeline_options=pipeline_options,
70
+ )}
71
+ )
72
+
73
+ def lazy_load(self):
74
+ """Convert documents and yield LangChain documents"""
75
+ results = ProcessingResult()
76
+
77
+ for file_path in self._file_paths:
78
+ try:
79
+ path = Path(file_path)
80
+ if not path.exists():
81
+ _log.warning(f"File not found: {file_path}")
82
+ results.failure_count += 1
83
+ results.failed_files.append(file_path)
84
+ continue
85
+
86
+ conversion_result = self._converter.convert(path)
87
+
88
+ if conversion_result.status == ConversionStatus.SUCCESS:
89
+ results.success_count += 1
90
+ text = conversion_result.document.export_to_markdown()
91
+ metadata = {
92
+ 'source': str(path),
93
+ 'file_type': path.suffix,
94
+ }
95
+ yield LCDocument(
96
+ page_content=text,
97
+ metadata=metadata
98
+ )
99
+ elif conversion_result.status == ConversionStatus.PARTIAL_SUCCESS:
100
+ results.partial_success_count += 1
101
+ _log.warning(f"Partial conversion for {file_path}")
102
+ text = conversion_result.document.export_to_markdown()
103
+ metadata = {
104
+ 'source': str(path),
105
+ 'file_type': path.suffix,
106
+ 'conversion_status': 'partial'
107
+ }
108
+ yield LCDocument(
109
+ page_content=text,
110
+ metadata=metadata
111
+ )
112
+ else:
113
+ results.failure_count += 1
114
+ results.failed_files.append(file_path)
115
+ _log.error(f"Failed to convert {file_path}")
116
+
117
+ except Exception as e:
118
+ _log.error(f"Error processing {file_path}: {str(e)}")
119
+ results.failure_count += 1
120
+ results.failed_files.append(file_path)
121
+
122
+ # Log final results
123
+ total = results.success_count + results.partial_success_count + results.failure_count
124
+ _log.info(
125
+ f"Processed {total} documents:\n"
126
+ f"- Successfully converted: {results.success_count}\n"
127
+ f"- Partially converted: {results.partial_success_count}\n"
128
+ f"- Failed: {results.failure_count}"
129
+ )
130
+ if results.failed_files:
131
+ _log.info("Failed files:")
132
+ for file in results.failed_files:
133
+ _log.info(f"- {file}")
134
+
135
+
136
+ if __name__ == '__main__':
137
+ # Load documents from a list of file paths
138
+ loader = MultiFormatDocumentLoader(
139
+ file_paths=[
140
+ # './data/2404.19756v1.pdf',
141
+ # './data/OD429347375590223100.pdf',
142
+ './data/Project Report Format.docx',
143
+ # './data/UNIT 2 GENDER BASED VIOLENCE.pptx'
144
+ ],
145
+ enable_ocr=False,
146
+ enable_tables=True
147
+ )
148
+ for doc in loader.lazy_load():
149
+ print(doc.page_content)
150
+ print(doc.metadata)
151
+ # save document in .md file
152
+ with open('output.md', 'w') as f:
153
+ f.write(doc.page_content)
src/llm/__init__.py ADDED
File without changes
src/llm/graders.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.prompts import ChatPromptTemplate
2
+ from langchain_ollama import ChatOllama
3
+ from pydantic import BaseModel, Field
4
+ from typing import List
5
+
6
+ class DocumentRelevance(BaseModel):
7
+ """Binary score for relevance check on retrieved documents."""
8
+ binary_score: str = Field(
9
+ description="Documents are relevant to the question, 'yes' or 'no'"
10
+ )
11
+
12
+ class HallucinationCheck(BaseModel):
13
+ """Binary score for hallucination present in generation answer."""
14
+ binary_score: str = Field(
15
+ description="Answer is grounded in the facts, 'yes' or 'no'"
16
+ )
17
+
18
+ class AnswerQuality(BaseModel):
19
+ """Binary score to assess answer addresses question."""
20
+ binary_score: str = Field(
21
+ description="Answer addresses the question, 'yes' or 'no'"
22
+ )
23
+
24
+ def create_llm_grader(grader_type: str, llm):
25
+ """
26
+ Create an LLM grader based on the specified type.
27
+
28
+ Args:
29
+ grader_type (str): Type of grader to create
30
+
31
+ Returns:
32
+ Callable: LLM grader function
33
+ """
34
+ # Initialize LLM
35
+
36
+ # Select grader type and create structured output
37
+ if grader_type == "document_relevance":
38
+ structured_llm_grader = llm.with_structured_output(DocumentRelevance)
39
+ system = """You are a grader assessing relevance of a retrieved document to a user question.
40
+ If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
41
+ It does not need to be a stringent test. The goal is to filter out erroneous retrievals.
42
+ Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
43
+
44
+ prompt = ChatPromptTemplate.from_messages([
45
+ ("system", system),
46
+ ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
47
+ ])
48
+
49
+ elif grader_type == "hallucination":
50
+ structured_llm_grader = llm.with_structured_output(HallucinationCheck)
51
+ system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts.
52
+ Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""
53
+
54
+ prompt = ChatPromptTemplate.from_messages([
55
+ ("system", system),
56
+ ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
57
+ ])
58
+
59
+ elif grader_type == "answer_quality":
60
+ structured_llm_grader = llm.with_structured_output(AnswerQuality)
61
+ system = """You are a grader assessing whether an answer addresses / resolves a question.
62
+ Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question."""
63
+
64
+ prompt = ChatPromptTemplate.from_messages([
65
+ ("system", system),
66
+ ("human", "User question: \n\n {question} \n\n LLM generation: {generation}"),
67
+ ])
68
+
69
+ else:
70
+ raise ValueError(f"Unknown grader type: {grader_type}")
71
+
72
+ return prompt | structured_llm_grader
73
+
74
+ def grade_document_relevance(question: str, document: str, llm):
75
+ """
76
+ Grade the relevance of a document to a given question.
77
+
78
+ Args:
79
+ question (str): User's question
80
+ document (str): Retrieved document content
81
+
82
+ Returns:
83
+ str: Binary score ('yes' or 'no')
84
+ """
85
+ grader = create_llm_grader("document_relevance", llm)
86
+ result = grader.invoke({"question": question, "document": document})
87
+ return result.binary_score
88
+
89
+ def check_hallucination(documents: List[str], generation: str, llm):
90
+ """
91
+ Check if the generation is grounded in the provided documents.
92
+
93
+ Args:
94
+ documents (List[str]): List of source documents
95
+ generation (str): LLM generated answer
96
+
97
+ Returns:
98
+ str: Binary score ('yes' or 'no')
99
+ """
100
+ grader = create_llm_grader("hallucination", llm)
101
+ result = grader.invoke({"documents": documents, "generation": generation})
102
+ return result.binary_score
103
+
104
+ def grade_answer_quality(question: str, generation: str, llm):
105
+ """
106
+ Grade the quality of the answer in addressing the question.
107
+
108
+ Args:
109
+ question (str): User's original question
110
+ generation (str): LLM generated answer
111
+
112
+ Returns:
113
+ str: Binary score ('yes' or 'no')
114
+ """
115
+ grader = create_llm_grader("answer_quality", llm)
116
+ result = grader.invoke({"question": question, "generation": generation})
117
+ return result.binary_score
118
+
119
+ if __name__ == "__main__":
120
+ # Example usage
121
+ test_question = "What are the types of agent memory?"
122
+ test_document = "Agent memory can be classified into different types such as episodic, semantic, and working memory."
123
+ test_generation = "Agent memory includes episodic memory for storing experiences, semantic memory for general knowledge, and working memory for immediate processing."
124
+ llm = ChatOllama(model = "llama3.2", temperature = 0.1, num_predict = 256, top_p=0.5)
125
+
126
+ print("Document Relevance:", grade_document_relevance(test_question, test_document, llm))
127
+ print("Hallucination Check:", check_hallucination([test_document], test_generation, llm))
128
+ print("Answer Quality:", grade_answer_quality(test_question, test_generation, llm))
src/llm/query_rewriter.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.prompts.chat import ChatPromptTemplate
2
+ from langchain_ollama import ChatOllama
3
+ from langchain_core.output_parsers import StrOutputParser
4
+
5
+ def create_query_rewriter(llm):
6
+ """
7
+ Create a query rewriter to optimize retrieval.
8
+
9
+ Returns:
10
+ Callable: Query rewriter function
11
+ """
12
+
13
+ # Prompt for query rewriting
14
+ system = """You are a question re-writer that converts an input question to a better version that is optimized
15
+ for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
16
+
17
+ re_write_prompt = ChatPromptTemplate.from_messages([
18
+ ("system", system),
19
+ ("human", "Here is the initial question: \n\n {question} \n Formulate an improved question."),
20
+ ])
21
+
22
+ # Create query rewriter chain
23
+ return re_write_prompt | llm | StrOutputParser()
24
+
25
+ def rewrite_query(question: str, llm):
26
+ """
27
+ Rewrite a given query to optimize retrieval.
28
+
29
+ Args:
30
+ question (str): Original user question
31
+
32
+ Returns:
33
+ str: Rewritten query
34
+ """
35
+ query_rewriter = create_query_rewriter(llm)
36
+ try:
37
+ rewritten_query = query_rewriter.invoke({"question": question})
38
+ return rewritten_query
39
+ except Exception as e:
40
+ print(f"Query rewriting error: {e}")
41
+ return question
42
+
43
+ if __name__ == "__main__":
44
+ # Example usage
45
+ test_queries = [
46
+ "Tell me about AI agents",
47
+ "What do we know about memory in AI systems?",
48
+ "Bears draft strategy"
49
+ ]
50
+ llm = ChatOllama(model = "llama3.2", temperature = 0.1, num_predict = 256, top_p=0.5)
51
+
52
+ for query in test_queries:
53
+ rewritten = rewrite_query(query, llm)
54
+ print(f"Original: {query}")
55
+ print(f"Rewritten: {rewritten}\n")
src/tools/__init__.py ADDED
File without changes
src/tools/web_search.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import asyncio
4
+ from typing import List, Dict, Optional
5
+
6
+ from langchain_community.tools import DuckDuckGoSearchResults
7
+ from crawl4ai import AsyncWebCrawler, CacheMode
8
+ from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
9
+ from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
10
+ from dotenv import load_dotenv
11
+
12
+ load_dotenv()
13
+
14
+ class AdvancedWebCrawler:
15
+ def __init__(self,
16
+ max_search_results: int = 5,
17
+ word_count_threshold: int = 50,
18
+ content_filter_type: str = 'pruning',
19
+ filter_threshold: float = 0.48):
20
+ """
21
+ Initialize the Advanced Web Crawler
22
+
23
+ Args:
24
+ max_search_results (int): Maximum number of search results to process
25
+ word_count_threshold (int): Minimum word count for crawled content
26
+ content_filter_type (str): Type of content filter ('pruning' or 'bm25')
27
+ filter_threshold (float): Threshold for content filtering
28
+ """
29
+ self.max_search_results = max_search_results
30
+ self.word_count_threshold = word_count_threshold
31
+ self.content_filter_type = content_filter_type
32
+ self.filter_threshold = filter_threshold
33
+
34
+ def _create_web_search_tool(self):
35
+ """
36
+ Create a web search tool using DuckDuckGo
37
+
38
+ Returns:
39
+ DuckDuckGoSearchResults: Web search tool
40
+ """
41
+ return DuckDuckGoSearchResults(max_results=self.max_search_results, output_format="list")
42
+
43
+ def _create_content_filter(self, user_query: Optional[str] = None):
44
+ """
45
+ Create content filter based on specified type
46
+
47
+ Args:
48
+ user_query (Optional[str]): Query to use for BM25 filtering
49
+
50
+ Returns:
51
+ Content filter strategy
52
+ """
53
+ if self.content_filter_type == 'bm25' and user_query:
54
+ return BM25ContentFilter(
55
+ user_query=user_query,
56
+ bm25_threshold=self.filter_threshold
57
+ )
58
+ else:
59
+ return PruningContentFilter(
60
+ threshold=self.filter_threshold,
61
+ threshold_type="fixed",
62
+ min_word_threshold=self.word_count_threshold
63
+ )
64
+
65
+ async def crawl_urls(self, urls: List[str], user_query: Optional[str] = None):
66
+ """
67
+ Crawl multiple URLs with content filtering
68
+
69
+ Args:
70
+ urls (List[str]): List of URLs to crawl
71
+ user_query (Optional[str]): Query used for BM25 content filtering
72
+
73
+ Returns:
74
+ List of crawl results
75
+ """
76
+ async with AsyncWebCrawler(
77
+ browser_type="chromium",
78
+ headless=True,
79
+ verbose=True
80
+ ) as crawler:
81
+ # Create appropriate content filter
82
+ content_filter = self._create_content_filter(user_query)
83
+
84
+ # Run crawling for multiple URLs
85
+ results = await crawler.arun_many(
86
+ urls=urls,
87
+ word_count_threshold=self.word_count_threshold,
88
+ bypass_cache=True,
89
+ markdown_generator=DefaultMarkdownGenerator(
90
+ content_filter=content_filter
91
+ ),
92
+ cache_mode=CacheMode.DISABLED,
93
+ exclude_external_links=True,
94
+ remove_overlay_elements=True,
95
+ simulate_user=True,
96
+ magic=True
97
+ )
98
+
99
+ # Process and return crawl results
100
+ processed_results = []
101
+ for result in results:
102
+ crawl_result = {
103
+ "url": result.url,
104
+ "success": result.success,
105
+ "title": result.metadata.get('title', 'N/A'),
106
+ "content": result.markdown_v2.raw_markdown if result.success else result.error_message,
107
+ "word_count": len(result.markdown_v2.raw_markdown.split()) if result.success else 0,
108
+ "links": {
109
+ "internal": len(result.links.get('internal', [])),
110
+ "external": len(result.links.get('external', []))
111
+ },
112
+ "images": len(result.media.get('images', []))
113
+ }
114
+ processed_results.append(crawl_result)
115
+
116
+ return processed_results
117
+
118
+ async def search_and_crawl(self, query: str) -> List[Dict]:
119
+ """
120
+ Perform web search and crawl the results
121
+
122
+ Args:
123
+ query (str): Search query
124
+
125
+ Returns:
126
+ List of crawled content results
127
+ """
128
+ # Perform web search
129
+ search_tool = self._create_web_search_tool()
130
+ try:
131
+ search_results = search_tool.invoke({"query": query})
132
+
133
+ # Extract URLs from search results
134
+ urls = [result['link'] for result in search_results]
135
+ print(f"Found {len(urls)} URLs for query: {query}")
136
+
137
+ # Crawl URLs
138
+ crawl_results = await self.crawl_urls(urls, user_query=query)
139
+
140
+ return crawl_results
141
+
142
+ except Exception as e:
143
+ print(f"Web search and crawl error: {e}")
144
+ return []
145
+
146
+ def main():
147
+ # Example usage
148
+ crawler = AdvancedWebCrawler(
149
+ max_search_results=5,
150
+ word_count_threshold=50,
151
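+ # valid values: 'pruning' (query-agnostic) or 'bm25' (query-aware, scores pages against the search query)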
+ content_filter_type='bm25',
152
+ filter_threshold=0.48
153
+ )
154
+
155
+ test_queries = [
156
+ "Latest developments in AI agents",
157
+ "Today's weather forecast in Kolkata",
158
+ ]
159
+
160
+ for query in test_queries:
161
+ # Run search and crawl asynchronously
162
+ results = asyncio.run(crawler.search_and_crawl(query))
163
+
164
+ print(f"\nResults for query: {query}")
165
+ for result in results:
166
+ print(f"URL: {result['url']}")
167
+ print(f"Success: {result['success']}")
168
+ print(f"Title: {result['title']}")
169
+ print(f"Word Count: {result['word_count']}")
170
+ print(f"Content Preview: {result['content'][:500]}...\n")
171
+
172
+ if __name__ == "__main__":
173
+ main()
src/vectorstore/__init__.py ADDED
File without changes
src/vectorstore/pinecone_db.py ADDED
@@ -0,0 +1,282 @@
1
+ from data_processing.loader import MultiFormatDocumentLoader
2
+ from data_processing.chunker import SDPMChunker, BGEM3Embeddings
3
+
4
+ import pandas as pd
5
+ from typing import List, Dict, Any
6
+ from pinecone import Pinecone, ServerlessSpec
7
+ import time
8
+ from tqdm import tqdm
9
+ from dotenv import load_dotenv
10
+ import os
11
+
12
+
13
+ load_dotenv()
14
+
15
+ # API Keys
16
+ PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
17
+
18
+ embedding_model = BGEM3Embeddings(model_name="BAAI/bge-m3")
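+ # BGE-M3 produces 1024-dimensional dense vectors, which must match the Pinecone index dimension configured below.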
19
+
20
+
21
+ def load_documents(file_paths: List[str], output_path='./data/output.md'):
22
+ """
23
+ Load documents from multiple sources and combine them into a single markdown file
24
+ """
25
+ loader = MultiFormatDocumentLoader(
26
+ file_paths=file_paths,
27
+ enable_ocr=False,
28
+ enable_tables=True
29
+ )
30
+
31
+ # Append all documents to the markdown file
32
+ with open(output_path, 'w') as f:
33
+ for doc in loader.lazy_load():
34
+ # Add metadata as YAML frontmatter
35
+ f.write('---\n')
36
+ for key, value in doc.metadata.items():
37
+ f.write(f'{key}: {value}\n')
38
+ f.write('---\n\n')
39
+ f.write(doc.page_content)
40
+ f.write('\n\n')
41
+
42
+ return output_path
43
+
44
+ def process_chunks(markdown_path: str, chunk_size: int = 256,
45
+ threshold: float = 0.7, skip_window: int = 2):
46
+ """
47
+ Process the markdown file into chunks and prepare for vector storage
48
+ """
49
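+ # Assuming a chonkie-style SDPM (semantic double-pass merging) chunker: adjacent chunks - and, via skip_window, chunks a few positions apart - are merged when their embedding similarity exceeds the threshold.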
+ chunker = SDPMChunker(
50
+ embedding_model=embedding_model,
51
+ chunk_size=chunk_size,
52
+ threshold=threshold,
53
+ skip_window=skip_window
54
+ )
55
+
56
+ # Read the markdown file
57
+ with open(markdown_path, 'r') as file:
58
+ text = file.read()
59
+
60
+ # Generate chunks
61
+ chunks = chunker.chunk(text)
62
+
63
+ # Prepare data for Parquet
64
+ processed_chunks = []
65
+ for chunk in chunks:
66
+
67
+ processed_chunks.append({
68
+ 'text': chunk.text,
69
+ 'token_count': chunk.token_count,
70
+ 'start_index': chunk.start_index,
71
+ 'end_index': chunk.end_index,
72
+ 'num_sentences': len(chunk.sentences),
73
+ })
74
+
75
+ return processed_chunks
76
+
77
+ def save_to_parquet(chunks: List[Dict[str, Any]], output_path='./data/chunks.parquet'):
78
+ """
79
+ Save processed chunks to a Parquet file
80
+ """
81
+ df = pd.DataFrame(chunks)
82
+ print(f"Saving to Parquet: {output_path}")
83
+ df.to_parquet(output_path)
84
+ print(f"Saved to Parquet: {output_path}")
85
+ return output_path
86
+
87
+
88
+ class PineconeRetriever:
89
+ def __init__(
90
+ self,
91
+ pinecone_client: Pinecone,
92
+ index_name: str,
93
+ namespace: str,
94
+ embedding_generator: BGEM3Embeddings
95
+ ):
96
+ """Initialize the retriever with Pinecone client and embedding generator.
97
+
98
+ Args:
99
+ pinecone_client: Initialized Pinecone client
100
+ index_name: Name of the Pinecone index
101
+ namespace: Namespace in the index
102
+ embedding_generator: BGEM3Embeddings instance
103
+ """
104
+ self.pinecone = pinecone_client
105
+ self.index = self.pinecone.Index(index_name)
106
+ self.namespace = namespace
107
+ self.embedding_generator = embedding_generator
108
+
109
+ def invoke(self, question: str, top_k: int = 5):
110
+ """Retrieve similar documents for a question.
111
+
112
+ Args:
113
+ question: Query string
114
+ top_k: Number of results to return
115
+
116
+ Returns:
117
+ List of dictionaries containing retrieved documents
118
+ """
119
+ # Generate embedding for the question
120
+ question_embedding = self.embedding_generator.embed(question)
121
+ question_embedding = question_embedding.tolist()
122
+ # Query Pinecone
123
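+ # include_metadata=True returns the stored chunk text, so results can be turned into page_content without a second lookup.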
+ results = self.index.query(
124
+ namespace=self.namespace,
125
+ vector=question_embedding,
126
+ top_k=top_k,
127
+ include_values=False,
128
+ include_metadata=True
129
+ )
130
+
131
+ # Format results
132
+ retrieved_docs = [
133
+ {"page_content": match.metadata["text"], "score": match.score}
134
+ for match in results.matches
135
+ ]
136
+
137
+ return retrieved_docs
138
+
139
+ def ingest_data(
140
+ pc,
141
+ parquet_path: str,
142
+ text_column: str,
143
+ pinecone_client: Pinecone,
144
+ index_name= "vector-index",
145
+ namespace= "rag",
146
+ batch_size: int = 100
147
+ ):
148
+ """Ingest data from a Parquet file into Pinecone.
149
+
150
+ Args:
151
+ parquet_path: Path to the Parquet file
152
+ text_column: Name of the column containing text data
153
+ pinecone_client: Initialized Pinecone client
154
+ index_name: Name of the Pinecone index
155
+ namespace: Namespace in the index
156
+ batch_size: Batch size for processing
157
+ """
158
+ # Read Parquet file
159
+ print(f"Reading Parquet file: {parquet_path}")
160
+ df = pd.read_parquet(parquet_path)
161
+ print(f"Total records: {len(df)}")
162
+ # Create or get index
163
+ if not pinecone_client.has_index(index_name):
164
+ pinecone_client.create_index(
165
+ name=index_name,
166
+ dimension=1024, # BGE-M3 dimension
167
+ metric="cosine",
168
+ spec=ServerlessSpec(
169
+ cloud='aws',
170
+ region='us-east-1'
171
+ )
172
+ )
173
+
174
+ # Wait for index to be ready
175
+ while not pinecone_client.describe_index(index_name).status['ready']:
176
+ time.sleep(1)
177
+
178
+ index = pinecone_client.Index(index_name)
179
+
180
+ # Process in batches
181
+ for i in tqdm(range(0, len(df), batch_size)):
182
+ batch_df = df.iloc[i:i+batch_size]
183
+
184
+ # Generate embeddings for batch
185
+ texts = batch_df[text_column].tolist()
186
+ embeddings = embedding_model.embed_batch(texts)
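+ # embed_batch is assumed to return one 1024-dim vector per text; convert numpy arrays to plain Python lists if needed before upserting.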
187
+ print(f"embeddings for batch: {i}")
188
+ # Prepare records for upsert
189
+ records = []
190
+ for idx, (_, row) in enumerate(batch_df.iterrows()):
191
+ records.append({
192
+ "id": str(row.name), # Using DataFrame index as ID
193
+ "values": embeddings[idx],
194
+ "metadata": {"text": row[text_column]}
195
+ })
196
+
197
+ # Upsert to Pinecone
198
+ index.upsert(vectors=records, namespace=namespace)
199
+
200
+ # Small delay to handle rate limits
201
+ time.sleep(0.5)
202
+
203
+ def get_retriever(
204
+ pinecone_client: Pinecone,
205
+ index_name= "vector-index",
206
+ namespace= "rag"
207
+ ):
208
+ """Create and return a PineconeRetriever instance.
209
+
210
+ Args:
211
+ pinecone_client: Initialized Pinecone client
212
+ index_name: Name of the Pinecone index
213
+ namespace: Namespace in the index
214
+
215
+ Returns:
216
+ Configured PineconeRetriever instance
217
+ """
218
+ return PineconeRetriever(
219
+ pinecone_client=pinecone_client,
220
+ index_name=index_name,
221
+ namespace=namespace,
222
+ embedding_generator=embedding_model
223
+ )
224
+
225
+ def main():
226
+ # Initialize Pinecone client
227
+ pc = Pinecone(api_key=PINECONE_API_KEY)
228
+
229
+ # Define input files
230
+ file_paths=[
231
+ # './data/2404.19756v1.pdf',
232
+ # './data/OD429347375590223100.pdf',
233
+ # './data/Project Report Format.docx',
234
+ './data/UNIT 2 GENDER BASED VIOLENCE.pptx'
235
+ ]
236
+
237
+ # Process pipeline
238
+ try:
239
+ # Step 1: Load and combine documents
240
+ # print("Loading documents...")
241
+ # markdown_path = load_documents(file_paths)
242
+
243
+ # # Step 2: Process into chunks with embeddings
244
+ # print("Processing chunks...")
245
+ # chunks = process_chunks(markdown_path)
246
+
247
+ # # Step 3: Save to Parquet
248
+ # print("Saving to Parquet...")
249
+ # parquet_path = save_to_parquet(chunks)
250
+
251
+ # # Step 4: Ingest into Pinecone
252
+ # print("Ingesting into Pinecone...")
253
+ # ingest_data(
254
+ # pc,
255
+ # parquet_path=parquet_path,
256
+ # text_column="text",
257
+ # pinecone_client=pc,
258
+ # )
259
+
260
+ # Step 5: Test retrieval
261
+ print("\nTesting retrieval...")
262
+ retriever = get_retriever(
263
+ pinecone_client=pc,
264
+ index_name="vector-index",
265
+ namespace="rag"
266
+ )
267
+
268
+ results = retriever.invoke(
269
+ question="describe the gender based violence",
270
+ top_k=5
271
+ )
272
+
273
+ for i, doc in enumerate(results, 1):
274
+ print(f"\nResult {i}:")
275
+ print(f"Content: {doc['page_content']}...")
276
+ print(f"Score: {doc['score']}")
277
+
278
+ except Exception as e:
279
+ print(f"Error in pipeline: {str(e)}")
280
+
281
+ if __name__ == "__main__":
282
+ main()