import os
import random
import gradio as gr
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain_core.messages import SystemMessage
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain_groq import ChatGroq
from langchain_google_genai import ChatGoogleGenerativeAI

# API keys are expected in the environment: GROQ_API_KEY and GOOGLE_API_KEY.

# Initialize memory to manage the chat history, so the AI remembers
# the most recent k interactions (here, k=8).
memory = ConversationBufferWindowMemory(k=8, memory_key="chat_history", return_messages=True)


def generate_response(user_input, history, model, temperature, max_tokens, top_p, seed):
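    """Generate one assistant reply.

    Routes the request to Gemini or Groq depending on the selected model,
    builds the prompt with the stored chat history, and saves the new turn
    back to memory. Note that `seed` is received from the UI but is not
    forwarded to the model clients.
    """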
    print( "Model =", model)

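    # Route to the matching provider client: Gemini models go through
    # ChatGoogleGenerativeAI, everything else through ChatGroq.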
    if model.startswith("gemini"):
        chat = ChatGoogleGenerativeAI(
            google_api_key=os.environ.get("GOOGLE_API_KEY"),
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            timeout=None,
            max_retries=2,
            top_p=top_p,
        )
    else:
        chat = ChatGroq(
            groq_api_key=os.environ.get("GROQ_API_KEY"),
            model_name=model,
            temperature=temperature,
            max_tokens=max_tokens,
            request_timeout=None,
            max_retries=2,
            top_p=top_p,
        )

    prompt = ChatPromptTemplate.from_messages(
        [
            # This persistent system prompt sets the initial context for the AI.
            SystemMessage(content='You are a helpful AI assistant.'),
            # This placeholder will take care of chat history.
            MessagesPlaceholder(variable_name="chat_history"),
            # This template is where the user's current input will be injected into the prompt.
            HumanMessagePromptTemplate.from_template("{human_input}"),
        ]
    )    

    # Create a conversation sequence using RunnableSequence
    conversation = prompt | chat

    # Load chat_history
    chat_history = memory.load_memory_variables({})["chat_history"]
    
    # The chatbot's answer is generated by sending the full prompt to the LLM
    response = conversation.invoke({"human_input": user_input, "chat_history": chat_history})

    # Update the memory with the new interaction
    memory.save_context({"input": user_input}, {"output": response.content})
    
    return response.content

# Define additional inputs and examples if needed
additional_inputs = [
    gr.Dropdown(choices=["llama-3.1-70b-versatile", "llama-3.1-8b-instant", "llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma2-9b-it", "gemma-7b-it","gemini-1.5-pro", "gemini-1.5-flash", "gemini-1.5-flash-8b", "gemini-2.0-flash-exp"], value="llama-3.1-70b-versatile", label="Model"),
    gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.5, label="Temperature", info="Controls diversity of the generated text. Lower is more deterministic, higher is more creative."),
    gr.Slider(minimum=1, maximum=8000, step=1, value=8000, label="Max Tokens", info="The maximum number of tokens that the model can process in a single response.<br>Maximums: 8k for gemma 7b it, gemma2 9b it, llama 7b & 70b, 32k for mixtral 8x7b, 132k for llama 3.1."),
    gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=0.5, label="Top P", info="A method of text generation where a model will only consider the most probable next tokens that make up the probability p."),
    gr.Number(precision=0, value=0, label="Seed", info="A starting point to initiate generation, use 0 for random")
]
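# The values of these controls are appended to each generate_response call,
# in order, after (user_input, history).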

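# These follow-up questions deliberately omit context so the examples
# exercise the conversation memory (each refers back to an earlier turn).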
example1 = [
            ["What's the distance from Tokyo to New York?"],    
            ["What to San Francisco?"],        
            ["Then what to Beijing?"],
            ["And what to Kyoto?"],
            ["What from Beijing to New York?"]
        ]

# # Create the Gradio interface
# interface = gr.ChatInterface(
#     fn=generate_response, 
#     chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
#     additional_inputs=additional_inputs,
#     examples=example1,
#     cache_examples=False,
# )

# # Launch the app
# interface.launch()

def clear_chat():
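    """Reset the LangChain conversation memory when the chat is cleared."""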
    # print("Clear chat history")
    memory.clear()
    return None

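# Build the UI: a ChatInterface inside Blocks, with its clear button wired to
# clear_chat so the stored memory is reset along with the visible chat.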
with gr.Blocks(fill_width=True, fill_height=True) as demo:
    ci = gr.ChatInterface(
        fn=generate_response, 
        chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
        additional_inputs=additional_inputs,
        examples=example1,
        cache_examples=False,
    )
    ci.clear_btn.click(clear_chat)
    # clear_button = gr.Button("Clear chat history and Start a new chat")
    # clear_button.click(clear_chat, inputs=None, outputs=None)

demo.launch()