import gradio as gr
from huggingface_hub import InferenceClient
import faiss
from sentence_transformers import SentenceTransformer

# Documents converted into FAISS vectors
documents = [
    "The class starts at 2PM Wednesday.",
    "Python is our main programming language.",
    "Our university is located in Szeged.",
    "We are making things with RAG, Rasa and LLMs.",
    "Gabor Toth is the author of this chatbot.",
]

# Embed the documents once at startup and build an exact L2 index over them.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
document_embeddings = embedding_model.encode(documents)
index = faiss.IndexFlatL2(document_embeddings.shape[1])
index.add(document_embeddings)

client = InferenceClient("meta-llama/Llama-3.2-3B-Instruct")


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Retrieve the two documents closest to the query embedding.
    query_embedding = embedding_model.encode([message])
    distances, indices = index.search(query_embedding, k=2)
    relevant_documents = [documents[i] for i in indices[0]]

    # Build the prompt: system message, retrieved context, then chat history.
    context = "\n".join(relevant_documents)
    messages = [
        {"role": "system", "content": system_message},
        {"role": "system", "content": f"context: {context}"},
    ]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    # Stream the completion, yielding the growing partial response.
    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # some streamed chunks carry no content
            response += token
        yield response


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
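
# Optional retrieval sanity check (a minimal sketch, not part of the app):
# uncomment and run to confirm the FAISS index surfaces the expected document
# for a sample question before wiring it into the chat loop. `sample_query`
# is an illustrative name introduced here, not something the app defines.
#
# sample_query = "Who is the author of this chatbot?"
# q_emb = embedding_model.encode([sample_query])
# dists, idxs = index.search(q_emb, k=2)
# for rank, (dist, i) in enumerate(zip(dists[0], idxs[0]), start=1):
#     print(f"{rank}. dist={dist:.3f} -> {documents[i]}")
# # If the embeddings behave as expected, the top hit should be
# # "Gabor Toth is the author of this chatbot."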