Spaces:
Running
Running
import streamlit as st | |
import os | |
from typing import List, Tuple, Optional | |
from pinecone import Pinecone | |
from langchain_pinecone import PineconeVectorStore | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_openai import ChatOpenAI | |
from langchain_core.prompts import PromptTemplate | |
from dotenv import load_dotenv | |
from RAG import RAG | |
import logging | |
from image_scraper import DigitalCommonwealthScraper | |
import shutil | |
# Logging: emit INFO-and-above records; this module logs under its own name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Streamlit page settings (browser tab title, icon, wide layout).
_PAGE_SETTINGS = {
    "page_title": "Boston Public Library Chatbot",
    "page_icon": "🤖",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)
def initialize_models() -> Tuple[Optional[ChatOpenAI], Optional[HuggingFaceEmbeddings]]:
    """Create the chat model, embedding model and Pinecone vector store.

    Each object is stored in ``st.session_state`` under the keys ``"llm"``,
    ``"embeddings"`` and ``"pinecone"``. A key that is already present is
    left untouched, so repeated calls reuse the existing objects.

    Returns:
        ``(llm, embeddings)`` taken from the session state on success, or
        ``(None, None)`` when initialization raised. In the failure case the
        error is logged and shown in the UI via ``st.error``.
    """
    try:
        load_dotenv()

        if "llm" not in st.session_state:
            # OpenAI chat model used to generate answers.
            st.session_state.llm = ChatOpenAI(
                model="gpt-4",
                temperature=0,
                timeout=60,
                max_retries=2,
            )

        if "embeddings" not in st.session_state:
            st.session_state.embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )

        if "pinecone" not in st.session_state:
            pinecone_api_key = os.getenv("PINECONE_API_KEY")
            # Fail here with a clear message instead of deeper inside the client.
            if not pinecone_api_key:
                raise ValueError("PINECONE_API_KEY is not set")
            index_name = "bpl-rag"
            pc = Pinecone(api_key=pinecone_api_key)
            index = pc.Index(index_name)
            st.session_state.pinecone = PineconeVectorStore(
                index=index,
                embedding=st.session_state.embeddings,
            )

        # Honour the declared return type on the success path as well.
        return st.session_state.llm, st.session_state.embeddings
    except Exception as e:
        logger.exception("Error initializing models: %s", e)
        st.error(f"Failed to initialize models: {str(e)}")
        return None, None
def process_message(
    query: str,
    llm: ChatOpenAI,
    vectorstore: PineconeVectorStore,
) -> Tuple[str, List]:
    """Answer *query* by delegating to the project's ``RAG`` function.

    Returns:
        The ``(response, sources)`` pair produced by ``RAG``. If anything
        raises, the error is logged and an error-message string is returned
        together with an empty source list.
    """
    try:
        answer, retrieved = RAG(query=query, llm=llm, vectorstore=vectorstore)
    except Exception as e:
        detail = str(e)
        logger.error(f"Error in process_message: {detail}")
        return f"Error processing message: {detail}", []
    return answer, retrieved
def _display_source_image(url: str) -> None:
    """Scrape *url* and show its first image in the current Streamlit container."""
    scraper = DigitalCommonwealthScraper()
    # Only the first image found on the page is shown.
    images = scraper.extract_images(url)[:1]
    if not images:
        st.warning("No images found on the page.")
        return

    # Remove images left over from a previous source before downloading.
    # NOTE(review): presumably this is the directory download_images writes
    # to by default - confirm against image_scraper.
    output_dir = "downloaded_images"
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    downloaded_files = scraper.download_images(images)
    # Caption each image with its alt text, falling back to a numbered label.
    captions = [
        img.get("alt", f"Image {n + 1}") for n, img in enumerate(images)
    ]
    st.image(downloaded_files, width=400, caption=captions)


def display_sources(sources: List) -> None:
    """Render every source in its own expander, with a preview image.

    For an object exposing ``page_content`` the first 100 characters are
    shown, followed by each metadata key/value pair and, when the metadata
    has a ``"URL"`` entry, the first image scraped from that URL. Any other
    object is rendered with ``str()``.

    A failure while rendering one source is logged and reported in the UI;
    the remaining sources are still rendered.

    Args:
        sources: Retrieved source objects. An empty list shows an info
            notice and nothing else.
    """
    if not sources:
        st.info("No sources available for this response.")
        return

    st.subheader("Sources")
    for i, doc in enumerate(sources, 1):
        try:
            with st.expander(f"Source {i}"):
                if not hasattr(doc, "page_content"):
                    st.markdown(f"**Content:** {str(doc)}")
                    continue

                st.markdown(f"**Content:** {doc.page_content[0:100] + ' ...'}")
                if not hasattr(doc, "metadata"):
                    continue

                for key, value in doc.metadata.items():
                    st.markdown(f"**{key.title()}:** {value}")

                # A source without a URL simply has no image to show. The
                # image logic lives in a helper so that "no image" ends only
                # this source instead of returning from the whole function.
                url = doc.metadata.get("URL")
                if url:
                    _display_source_image(url)
        except Exception as e:
            logger.exception("Error displaying source %d: %s", i, e)
            st.error(f"Error displaying source {i}")
def main():
    """Run the chat UI: history, query input, RAG answer, sources and footer."""
    st.title("Digital Commonwealth RAG")

    # Chat history lives in the session state.
    if "messages" not in st.session_state:
        st.session_state.messages = []

    initialize_models()

    # Replay the conversation so far.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    user_input = st.chat_input("Type your query here...")
    if user_input:
        with st.chat_message("user"):
            st.markdown(user_input)
        st.session_state.messages.append({"role": "user", "content": user_input})

        with st.chat_message("assistant"):
            # initialize_models() leaves these keys unset when it fails;
            # reading them unguarded would raise AttributeError.
            if "llm" not in st.session_state or "pinecone" not in st.session_state:
                st.error(
                    "The chatbot is unavailable because its models could not "
                    "be initialized."
                )
            else:
                with st.spinner("Thinking... Please be patient, I'm a little slow right now..."):
                    response, sources = process_message(
                        query=user_input,
                        llm=st.session_state.llm,
                        vectorstore=st.session_state.pinecone,
                    )
                    if isinstance(response, str):
                        st.markdown(response)
                        st.session_state.messages.append({
                            "role": "assistant",
                            "content": response,
                        })
                        display_sources(sources)
                    else:
                        st.error("Received an invalid response format")

    # Footer
    st.markdown("---")
    st.markdown(
        "Built with Langchain + Streamlit + Pinecone",
        help="Natural Language Querying for Digital Commonwealth",
    )
    st.markdown(
        "The Digital Commonwealth site provides access to photographs, manuscripts, "
        "books, audio recordings, and other materials of historical interest that "
        "have been\ndigitized and made available by members of Digital Commonwealth, "
        "a statewide consortium of libraries, museums, archives, and historical "
        "societies from across Massachusetts."
    )
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()