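# Financial Virtual Assistant: a Streamlit chat front end for a finance-tuned
# LLM served by vLLM on Modal, with a simple login gate, model selection, a
# configurable system prompt, and streaming responses.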
import streamlit as st
from openai import OpenAI
import os

# Streamlit page configuration: set_page_config must be the first Streamlit
# command in the script, so it is applied here (this also styles the login page)
st.set_page_config(page_title="Financial Virtual Assistant", layout="wide")


# Authentication function
def authenticate():
    st.title("Financial Virtual Assistant")
    st.subheader("Login")

    username = st.text_input("Username")
    password = st.text_input("Password", type="password")

    if st.button("Login"):
        if username == os.getenv('username') and password == os.getenv('password'):
            st.session_state.authenticated = True
            return True
        else:
            st.error("Invalid username or password")
    return False
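
# Note: the credential check above compares against lowercase "username" and
# "password" environment variables, which must be set for login to succeed.
# For deployed apps, Streamlit's secrets store is a common alternative
# (assuming a .streamlit/secrets.toml defines the same keys):
#   username == st.secrets["username"] and password == st.secrets["password"]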


# Check authentication state
if "authenticated" not in st.session_state:
    st.session_state.authenticated = False

if not st.session_state.authenticated:
    if authenticate():
        st.rerun()
else:

    # Initialize session state for chat history and API client
    if "messages" not in st.session_state:
        st.session_state.messages = []
    if "client" not in st.session_state:
        base_url = f"https://brandontoews--vllm-openai-compatible-serve.modal.run/v1"

        # Initialize OpenAI client
        st.session_state.client = OpenAI(
            api_key=os.getenv('openai_api_key'),  # Replace with your API key or use modal.Secret
            base_url=base_url
        )
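
        # The vLLM server speaks the standard OpenAI REST API, so the same
        # endpoint can be probed directly, e.g.:
        #   curl https://brandontoews--vllm-openai-compatible-serve.modal.run/v1/models \
        #        -H "Authorization: Bearer $openai_api_key"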
    if "models" not in st.session_state:
        # Fetch available models from the server
        try:
            models = st.session_state.client.models.list().data
            st.session_state.models = [model.id for model in models]
        except Exception as e:
            st.session_state.models = ["neuralmagic/Mistral-7B-Instruct-v0.3-quantized.w8a16"]  # Fallback if fetch fails
            st.warning(f"Failed to fetch models: {e}. Using default model.")


    # Function to estimate token count (heuristic: ~4 chars per token)
    def estimate_token_count(messages):
        total_chars = sum(len(message["content"]) for message in messages)
        return total_chars // 4  # Approximate: 4 characters per token
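
    # The ~4 chars/token heuristic is rough. For an exact count, one could use
    # the model's own tokenizer (assuming the transformers package is installed):
    #   from transformers import AutoTokenizer
    #   tok = AutoTokenizer.from_pretrained("neuralmagic/Mistral-7B-Instruct-v0.3-quantized.w8a16")
    #   tokens = sum(len(tok.encode(m["content"])) for m in messages)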


    # Function to truncate messages if token count exceeds limit
    def truncate_messages(messages, max_tokens=2048, keep_last_n=5):
        # Always keep the system prompt (if present) and the last N messages
        system_prompt = [msg for msg in messages if msg["role"] == "system"]
        non_system_messages = [msg for msg in messages if msg["role"] != "system"]

        # Estimate current token count
        current_tokens = estimate_token_count(messages)

        # If under the limit, no truncation needed
        if current_tokens <= max_tokens:
            return messages

        # Keep only the last N non-system messages; Python slicing returns the
        # whole list unchanged when it has fewer than keep_last_n items
        truncated_non_system_messages = non_system_messages[-keep_last_n:]

        # Reconstruct messages: system prompt (if any) + truncated non-system messages
        return system_prompt + truncated_non_system_messages
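
    # Example: with keep_last_n=5, an over-budget history of
    #   [system, u1, a1, u2, a2, u3, a3]
    # becomes
    #   [system, a1, u2, a2, u3, a3]
    # This is best-effort: the kept messages may still exceed max_tokens if
    # they are individually long.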


    # Function to get completion from vLLM server
    def get_completion(client, model_id, messages, stream=True, temperature=0.2, top_p=0.85, max_tokens=512):
        completion_args = {
            "model": model_id,
            "messages": messages,
            "temperature": temperature,
            "top_p": top_p,
            "max_tokens": max_tokens,
            "stream": stream,
        }
        try:
            response = client.chat.completions.create(**completion_args)
            return response
        except Exception as e:
            st.error(f"Error during API call: {e}")
            return None
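
    # With stream=True the call returns an iterator of chunks, consumed in the
    # chat loop below; with stream=False it would return a single completion
    # whose text is at response.choices[0].message.content.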


    # Sidebar for configuration
    with st.sidebar:
        st.header("Chat Settings")

        # Model selection dropdown
        model_id = st.selectbox(
            "Select Model",
            options=st.session_state.models,
            index=0,
            help="Choose a model available on the vLLM server"
        )

        # System prompt input
        system_prompt = st.text_area(
            "System Prompt",
            value="You are a finance expert, providing clear, accurate, and concise answers to financial questions.",
            height=100,
            help="Enter a system prompt to guide the model's behavior (optional)"
        )
        if st.button("Apply System Prompt"):
            if st.session_state.messages and st.session_state.messages[0]["role"] == "system":
                st.session_state.messages[0] = {"role": "system", "content": system_prompt}
            else:
                st.session_state.messages.insert(0, {"role": "system", "content": system_prompt})
            st.success("System prompt updated!")

        # Other settings
        temperature = st.slider("Temperature", 0.0, 1.0, 0.2, help="Controls randomness of responses")
        top_p = st.slider("Top P", 0.0, 1.0, 0.85, help="Controls diversity via nucleus sampling")
        max_tokens = st.number_input("Max Tokens", min_value=1, value=512, help="Maximum number of tokens in the generated response")
        if st.button("Clear Chat"):
            st.session_state.messages = (
                [{"role": "system", "content": system_prompt}] if system_prompt else []
            )

    # Main chat interface
    st.title("Financial Virtual Assistant")
    st.write("Chat with a finance-tuned LLM powered by vLLM on Modal. Select a model and customize your system prompt!")

    # Display chat history
    for message in st.session_state.messages:
        if message["role"] == "system":
            with st.expander("System Prompt", expanded=False):
                st.markdown(f"**System**: {message['content']}")
        else:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

    # User input
    if prompt := st.chat_input("Type your message here..."):
        # Add user message to history
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Truncate messages if necessary to stay under token limit
        st.session_state.messages = truncate_messages(
            st.session_state.messages,
            max_tokens=2048 - max_tokens,  # Reserve space for the output
            keep_last_n=5
        )
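        # e.g. with the default max_tokens=512, the prompt budget above is
        # 2048 - 512 = 1536 estimated tokens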
        # Debug: Log token count and messages
        current_tokens = estimate_token_count(st.session_state.messages)
        st.write(f"Debug: Current token count: {current_tokens}")
        st.write(f"Debug: Messages sent to model: {st.session_state.messages}")

        # Get and display assistant response
        with st.chat_message("assistant"):
            response = get_completion(
                st.session_state.client,
                model_id,
                st.session_state.messages,
                stream=True,
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens
            )
            if response:
                # Stream the response
                placeholder = st.empty()
                assistant_message = ""
                for chunk in response:
                    if chunk.choices and chunk.choices[0].delta.content:
                        assistant_message += chunk.choices[0].delta.content
                        placeholder.markdown(assistant_message + "▌")  # Cursor effect
                placeholder.markdown(assistant_message)  # Final message without cursor
                st.session_state.messages.append({"role": "assistant", "content": assistant_message})
            else:
                st.error("Failed to get a response from the server.")

    # Instructions
    st.caption("Built with Streamlit and vLLM on Modal. Adjust settings in the sidebar and chat away!")