import torch
import gradio as gr

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from langchain_community.embeddings import HuggingFaceEmbeddings

# Load the documents to index from the local 'files' directory.
documents = SimpleDirectoryReader('files').load_data()

system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided.
"""

# Embedding model used to build and query the vector index.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Local HuggingFace LLM served through LlamaIndex.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    system_prompt=system_prompt,
    tokenizer_name="anasmkh/new_customized_llama3.1_8b",
    model_name="anasmkh/new_customized_llama3.1_8b",
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16},
)

# Register the models and global defaults with LlamaIndex.
Settings.llm = llm
Settings.embed_model = embed_model
# Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
Settings.num_output = 250
Settings.context_window = 3900

# Build the vector index and expose it as a query engine.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
query_engine = index.as_query_engine(llm=llm)


def chat(message, history):
    """Answer a user message with the query engine and append it to the chat history."""
    history = history or []
    history.append({"role": "user", "content": message})
    response = query_engine.query(message)
    # query() returns a Response object, so convert it to plain text before displaying it.
    history.append({"role": "assistant", "content": str(response)})
    return history


with gr.Blocks() as demo:
    # type="messages" lets the Chatbot component accept the role/content dicts built above.
    chatbot = gr.Chatbot(type="messages")
    message = gr.Textbox()
    clear = gr.ClearButton([message, chatbot])

    message.submit(chat, [message, chatbot], chatbot)
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()