Daniel Foley committed
Commit · b296661
1 Parent(s): 9b667a3

loading scripts and app stuff
Co-authored-by: Daniel [email protected]
Co-authored-by: Brandon [email protected]
Co-authored-by: Enrico [email protected]
Co-authored-by: Jinanshi [email protected]
- Dockerfile +3 -76
- RAG.py +156 -0
- load_pinecone.py +97 -0
- load_script.py +140 -0
- streamlit_app.py +148 -0
Dockerfile
CHANGED
@@ -1,111 +1,38 @@
-FROM python:3.
+FROM python:3.12.4

 # Create a non-root user
 RUN useradd -m -u 1000 user
 USER user

 # Set PATH to include user's local bin
 ENV PATH="/home/user/.local/bin:$PATH"

 # Set working directory
 WORKDIR /app

 # Copy requirements file with appropriate ownership
 COPY --chown=user ./requirements.txt requirements.txt

 # Install dependencies
 RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install rank_bm25

 # Copy application files with appropriate ownership
 COPY --chown=user . /app

 # Set environment variables for Streamlit
 ENV HOST=0.0.0.0
 ENV PORT=7860
 ENV STREAMLIT_SERVER_PORT=7860
 ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0

 # Run the app with Streamlit
-CMD ["streamlit", "run", "
+CMD ["streamlit", "run", "streamlit_app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
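Note: the extra RUN pip install rank_bm25 layer is presumably there because the BM25 reranking added in RAG.py (via langchain_community's BM25Retriever) needs the rank_bm25 package at runtime; it could equally be folded into requirements.txt.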
RAG.py
ADDED
@@ -0,0 +1,156 @@
import getpass
import os
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from dotenv import load_dotenv
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
import re
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever
import requests
from typing import Dict, Any, Optional, List, Tuple
import json
import logging

def retrieve(index_name: str, query: str, embeddings, k: int = 1000) -> Tuple[List[Document], List[float]]:
    load_dotenv()
    pinecone_api_key = os.getenv("PINECONE_API_KEY")
    pc = Pinecone(api_key=pinecone_api_key)

    index = pc.Index(index_name)
    vector_store = PineconeVectorStore(index=index, embedding=embeddings)
    results = vector_store.similarity_search_with_score(
        query,
        k=k,
    )
    documents = []
    scores = []
    for res, score in results:
        documents.append(res)
        scores.append(score)
    return documents, scores

def safe_get_json(url: str) -> Optional[Dict]:
    """Safely fetch and parse JSON from a URL."""
    print("Fetching JSON")
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        logging.error(f"Error fetching from {url}: {str(e)}")
        return None

def extract_text_from_json(json_data: Dict) -> str:
    """Extract text content from JSON response."""
    if not json_data:
        return ""

    text_parts = []

    # Handle direct text fields
    text_fields = ["title_info_primary_tsi", "abstract_tsi", "subject_geographic_sim", "genre_specific_ssim"]
    for field in text_fields:
        if field in json_data['data']['attributes'] and json_data['data']['attributes'][field]:
            text_parts.append(str(json_data['data']['attributes'][field]))

    return " ".join(text_parts) if text_parts else "No content available"

def rerank(documents: List[Document], query: str) -> List[Document]:
    """Rerank documents using BM25, with proper error handling."""
    if not documents:
        return []

    full_docs = []
    for doc in documents:
        if not doc.metadata.get('source'):
            continue

        url = f"https://www.digitalcommonwealth.org/search/{doc.metadata['source']}"
        json_data = safe_get_json(f"{url}.json")

        if json_data:
            text_content = extract_text_from_json(json_data)
            if text_content:  # Only add documents with actual content
                full_docs.append(Document(page_content=text_content, metadata={"source": doc.metadata['source'], "field": doc.metadata['field'], "URL": url}))

    # If no valid documents were processed, return empty list
    if not full_docs:
        return []

    # Create BM25 retriever with the processed documents
    reranker = BM25Retriever.from_documents(full_docs, k=min(10, len(full_docs)))
    reranked_docs = reranker.invoke(query)
    return reranked_docs

def parse_xml_and_check(xml_string: str) -> str:
    """Parse XML-style tags and handle validation."""
    if not xml_string:
        return "No response generated."

    pattern = r"<(\w+)>(.*?)</\1>"
    matches = re.findall(pattern, xml_string, re.DOTALL)
    parsed_response = dict(matches)

    if parsed_response.get('VALID') == 'NO':
        return "Sorry, I was unable to find any documents relevant to your query."

    return parsed_response.get('RESPONSE', "No response found in the output")

def RAG(llm: Any, query: str, index_name: str, embeddings: Any, top: int = 10, k: int = 100) -> Tuple[str, List[Document]]:
    """Main RAG function with improved error handling and validation."""
    try:
        # Retrieve initial documents
        retrieved, _ = retrieve(index_name=index_name, query=query, embeddings=embeddings, k=k)
        if not retrieved:
            return "No documents found for your query.", []

        # Rerank documents
        reranked = rerank(documents=retrieved, query=query)
        if not reranked:
            return "Unable to process the retrieved documents.", []

        # Prepare context from reranked documents
        context = "\n\n".join(doc.page_content for doc in reranked[:top] if doc.page_content)
        if not context.strip():
            return "No relevant content found in the documents.", []

        # Prepare prompt
        prompt_template = PromptTemplate.from_template(
            """Pretend you are a professional librarian. Please summarize the following context as though you had retrieved it for a patron:
            Context: {context}
            Make sure to answer in the following format.
            First, reason about the answer between <REASONING></REASONING> headers;
            based on the context, determine if there is sufficient material for answering the exact question,
            and return either <VALID>YES</VALID> or <VALID>NO</VALID>;
            then return a response between <RESPONSE></RESPONSE> headers.
            Here is an example:
            <EXAMPLE>
            <QUERY>Are pineapples a good fuel for cars?</QUERY>
            <CONTEXT>Cars use gasoline for fuel. Some cars use electricity for fuel. Tesla stock has increased by 10 percent over the last quarter.</CONTEXT>
            <REASONING>Based on the context, pineapples have not been explored as a fuel for cars. The context discusses gasoline, electricity, and Tesla stock, therefore it is not relevant to the query about pineapples for fuel.</REASONING>
            <VALID>NO</VALID>
            <RESPONSE>Pineapples are not a good fuel for cars, however with further research they might be.</RESPONSE>
            </EXAMPLE>
            Now it's your turn:
            <QUERY>
            {query}
            </QUERY>"""
        )

        # Generate response
        prompt = prompt_template.invoke({"context": context, "query": query})
        print(prompt)
        response = llm.invoke(prompt)

        # Parse and return response
        parsed = parse_xml_and_check(response.content)
        return parsed, reranked

    except Exception as e:
        logging.error(f"Error in RAG function: {str(e)}")
        return f"An error occurred while processing your query: {str(e)}", []
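For reference, a minimal sketch of how RAG is meant to be called (mirroring streamlit_app.py below; the query string is illustrative, and PINECONE_API_KEY / OPENAI_API_KEY must be available via the environment or a .env file):

from langchain_openai import ChatOpenAI
from langchain_huggingface import HuggingFaceEmbeddings
from RAG import RAG

llm = ChatOpenAI(model="gpt-4", temperature=0)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# "bpl-rag" is the index name streamlit_app.py uses; the query is hypothetical.
answer, sources = RAG(llm=llm, query="maps of Boston Harbor", index_name="bpl-rag", embeddings=embeddings)
print(answer)
for doc in sources[:3]:
    print(doc.metadata.get("URL"))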
load_pinecone.py
ADDED
@@ -0,0 +1,97 @@
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from uuid import uuid4
import json
import os
from dotenv import load_dotenv
import sys
import time

load_dotenv()

BEGIN = int(sys.argv[1])
END = int(sys.argv[2])
PATH = sys.argv[3]

# Pinecone setup
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = sys.argv[4]
index = pc.Index(INDEX_NAME)

print("Loading JSON...")
meta = json.load(open(PATH))

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cuda'}  # defined but not passed to HuggingFaceEmbeddings below
encode_kwargs = {'normalize_embeddings': False}

print("Initializing Pinecone index...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

fields = ['abstract_tsi', 'title_info_primary_tsi', 'title_info_primary_subtitle_tsi', 'title_info_alternative_tsim']

print("Beginning Embeddings...")
start = time.time()

full_data = []
for page in meta:
    content = page['data']
    full_data += content

# Slice the accumulated records (assumption: the range indexes the full dataset;
# the committed code sliced `content`, which holds only the last page read).
if BEGIN > END:
    records = full_data[BEGIN:]
else:
    records = full_data[BEGIN:END]

num = 0

for item in records:
    id = item["id"]
    item_data = item["attributes"]
    print(id, time.time())
    documents = []
    for field in item_data:
        if (field in fields) or ("note" in field):
            entry = str(item_data[field])
            if len(entry) > 1000:
                chunks = text_splitter.split_text(entry)
                for chunk in chunks:
                    documents.append(Document(page_content=chunk, metadata={"source": id, "field": field}))
            else:
                documents.append(Document(page_content=entry, metadata={"source": id, "field": field}))

    if num % 1000 == 0:
        print(num, f"Added vectors to vectorstore at {time.time()} on id {id}")
        print(documents)
    uuids = [str(uuid4()) for _ in range(len(documents))]
    vector_store.add_documents(documents=documents, ids=uuids)
    num += 1

end = time.time()
print(f"Embedded all documents in {end-start} seconds...")
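The loader takes its arguments positionally: python load_pinecone.py BEGIN END PATH INDEX_NAME. A quick way to sanity-check what was ingested is a small similarity search against the same index; a sketch, with an illustrative query:

import os
from dotenv import load_dotenv
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings

load_dotenv()
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("bpl-rag")  # the index name used by the app in this commit
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
store = PineconeVectorStore(index=index, embedding=embeddings)

# Print score, source id, and field for the top hits (query is hypothetical).
for doc, score in store.similarity_search_with_score("whaling logbooks", k=3):
    print(round(score, 3), doc.metadata.get("source"), doc.metadata.get("field"))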
load_script.py
ADDED
@@ -0,0 +1,140 @@
import json
import time
import os
import sys
import requests

def fetch_digital_commonwealth():
    start = time.time()
    BASE_URL = "https://www.digitalcommonwealth.org/search.json?search_field=all_fields&per_page=100&q="
    PAGE = sys.argv[1]
    END_PAGE = sys.argv[2]
    file_name = f"out{PAGE}_{END_PAGE}.json"
    FINAL_PAGE = 13038
    output = []
    file_path = f"./{file_name}"
    # file_path = './output.json'
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            output = json.load(file)
        if int(PAGE) < (len(output) + 1):
            PAGE = len(output) + 1

    if int(PAGE) >= int(END_PAGE):
        return None
    print(f'Reading page {PAGE} up to page {END_PAGE}')
    retries = 0
    while True:
        try:
            response = requests.get(f"{BASE_URL}&page={PAGE}")
            response.raise_for_status()
            data = response.json()

            # Append current page data to the output list
            output.append(data)

            # Save the entire output to a JSON file after each iteration
            with open(file_path, 'w') as f:
                json.dump(output, f)

            # Check if there's a next page
            if data['meta']['pages']['next_page']:
                if data['meta']['pages']['next_page'] == int(END_PAGE):
                    print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                    break
                elif data['meta']['pages']['next_page'] == FINAL_PAGE:
                    print(f"finished page {PAGE}")
                    PAGE = FINAL_PAGE
                else:
                    print(f"finished page {PAGE}")
                    PAGE = data['meta']['pages']['next_page']
            else:
                print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
                break

            retries = 0
            # Optional: Add a small delay to avoid overwhelming the API
            # time.sleep(0.5)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            print(f"Processed and saved page {PAGE}. Total pages saved: {len(output)}")
            retries += 1
            if retries >= 5:
                break
    end = time.time()
    print(f"Timer: {end - start}")
    print(f"Finished processing all pages. Total pages saved: {len(output)}")

if __name__ == "__main__":
    fetch_digital_commonwealth()
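Each element of the saved list is one full search-results page, which is the shape load_pinecone.py consumes (page['data'] is a list of records with id and attributes). A small sketch for inspecting a finished dump (the file name is illustrative):

import json

with open("out1_200.json") as f:  # hypothetical output of load_script.py
    pages = json.load(f)

records = [item for page in pages for item in page["data"]]
print(f"{len(pages)} pages, {len(records)} records")
print(records[0]["id"], list(records[0]["attributes"])[:5])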
streamlit_app.py
ADDED
@@ -0,0 +1,148 @@
import streamlit as st
import os
from typing import List, Tuple, Optional
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv
from RAG import RAG
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Page configuration
st.set_page_config(
    page_title="Boston Public Library Chatbot",
    page_icon="🤖",
    layout="wide"
)

def initialize_models() -> Tuple[Optional[ChatOpenAI], HuggingFaceEmbeddings]:
    """Initialize the language model and embeddings."""
    try:
        load_dotenv()

        # Initialize OpenAI model
        llm = ChatOpenAI(
            model="gpt-4",  # Changed from gpt-4o-mini, which appears to be a typo
            temperature=0,
            timeout=60,  # Added reasonable timeout
            max_retries=2
        )

        # Initialize embeddings
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        return llm, embeddings

    except Exception as e:
        logger.error(f"Error initializing models: {str(e)}")
        st.error(f"Failed to initialize models: {str(e)}")
        return None, None

def process_message(
    query: str,
    llm: ChatOpenAI,
    index_name: str,
    embeddings: HuggingFaceEmbeddings
) -> Tuple[str, List]:
    """Process the user message using the RAG system."""
    try:
        response, sources = RAG(
            query=query,
            llm=llm,
            index_name=index_name,
            embeddings=embeddings
        )
        return response, sources
    except Exception as e:
        logger.error(f"Error in process_message: {str(e)}")
        return f"Error processing message: {str(e)}", []

def display_sources(sources: List) -> None:
    """Display sources in expandable sections with proper formatting."""
    if not sources:
        st.info("No sources available for this response.")
        return

    st.subheader("Sources")
    for i, doc in enumerate(sources, 1):
        try:
            with st.expander(f"Source {i}"):
                if hasattr(doc, 'page_content'):
                    st.markdown(f"**Content:** {doc.page_content}")
                    if hasattr(doc, 'metadata'):
                        for key, value in doc.metadata.items():
                            st.markdown(f"**{key.title()}:** {value}")
                else:
                    st.markdown(f"**Content:** {str(doc)}")
        except Exception as e:
            logger.error(f"Error displaying source {i}: {str(e)}")
            st.error(f"Error displaying source {i}")

def main():
    st.title("RAG Chatbot")

    # Initialize session state
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Initialize models
    llm, embeddings = initialize_models()
    if not llm or not embeddings:
        st.error("Failed to initialize the application. Please check the logs.")
        return

    # Constants
    INDEX_NAME = 'bpl-rag'

    # Display chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat input
    user_input = st.chat_input("Type your message here...")
    if user_input:
        # Display user message
        with st.chat_message("user"):
            st.markdown(user_input)
        st.session_state.messages.append({"role": "user", "content": user_input})

        # Process and display assistant response
        with st.chat_message("assistant"):
            with st.spinner("Let Me Think..."):
                response, sources = process_message(
                    query=user_input,
                    llm=llm,
                    index_name=INDEX_NAME,
                    embeddings=embeddings
                )

                if isinstance(response, str):
                    st.markdown(response)
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": response
                    })

                    # Display sources
                    display_sources(sources)
                else:
                    st.error("Received an invalid response format")

    # Footer
    st.markdown("---")
    st.markdown(
        "Built with ❤️ using Streamlit + LangChain + OpenAI",
        help="An AI-powered chatbot with RAG capabilities"
    )

if __name__ == "__main__":
    main()
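Locally the app starts with the same entry point the Dockerfile uses, streamlit run streamlit_app.py; on Spaces the CMD additionally pins the server to port 7860 on 0.0.0.0, which the ENV block earlier in the Dockerfile mirrors.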