import gradio as gr
import pandas as pd
import transformers
import torch
from sentence_transformers import SentenceTransformer, util

# Load the SBERT model used for retrieval
sbert_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# Initialize the Llama 3 8B Instruct text-generation pipeline
def initiate_pipeline():
    model = "meta-llama/Meta-Llama-3-8B-Instruct"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    return transformers.pipeline(
        "text-generation",
        model=model,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device=device,
    )


# Initialize the model
llama_model = initiate_pipeline()

# Load the Q&A pairs from the CSV
qa_data = pd.read_csv("/content/gdrive/MyDrive/Colab Notebooks/rag_juri_cv.csv")


# Retrieve the top k relevant Q&A pairs using Sentence-BERT
def retrieve_top_k(query, k=5):
    # Combine the questions from the CSV into a list
    questions = qa_data['QUESTION'].tolist()

    # Encode the questions and the query using Sentence-BERT
    question_embeddings = sbert_model.encode(questions, convert_to_tensor=True)
    query_embedding = sbert_model.encode(query, convert_to_tensor=True)

    # Compute cosine similarities between the query and all questions
    cosine_scores = util.pytorch_cos_sim(query_embedding, question_embeddings).flatten()

    # Get the indices of the top k most similar questions (as a NumPy array for iloc)
    top_k_indices = torch.topk(cosine_scores, k=k).indices.cpu().numpy()

    # Retrieve the corresponding Q&A pairs
    top_k_qa = qa_data.iloc[top_k_indices]
    return top_k_qa


def chatbot(query):
    # Retrieve the top 5 relevant Q&A pairs
    top_k_qa = retrieve_top_k(query)

    # Build the prompt: instruction plus retrieved context, wrapped in Llama 3 chat tags
    prefix = (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
        "You are a chatbot specialized in answering questions about Juri Grosjean's CV. "
        "Please only use the information provided in the context to answer the question. "
        "Here is the question to answer: " + query + "\n\n"
    )

    context = "This is the context information to answer the question:\n"
    for index, row in top_k_qa.iterrows():
        context += f"Information {index}: {row['ANSWER']}\n\n"

    suffix = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

    prompt = prefix + context + suffix

    # Generate a response
    outputs = llama_model(
        prompt,
        max_new_tokens=500,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    # The pipeline returns the prompt plus the completion; keep only the text after
    # the final "assistant" header tag
    output = outputs[0]["generated_text"]
    return output.split("assistant")[-1].strip()


# Set up the Gradio interface
demo = gr.Interface(
    fn=chatbot,
    inputs=gr.Textbox(lines=5, placeholder="Ask a question about Juri Grosjean's CV"),
    outputs="text",
)

if __name__ == "__main__":
    demo.launch()
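

# --- Optional retrieval sanity check (illustrative sketch, not part of the original app) ---
# Calling this helper prints the retrieved QUESTION/ANSWER pairs for a query without
# launching Gradio, which makes it easier to verify the SBERT retrieval step in isolation.
# The function name and the sample query in the docstring are assumptions added for
# illustration; use any question that is covered by the CSV.
def preview_retrieval(query: str, k: int = 5) -> None:
    """Print the top-k retrieved Q&A pairs for a query, e.g.
    preview_retrieval("What is Juri Grosjean's current role?")"""
    matches = retrieve_top_k(query, k=k)
    print(matches[["QUESTION", "ANSWER"]].to_string(index=False))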