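# app.py: a Gradio chat interface for document Q&A. LlamaIndex indexes the
# files in the local "files" directory and a fine-tuned Llama 3.1 8B model
# answers queries against them.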
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import torch
import gradio as gr
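# Load every document found in the local "files" directory.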
documents = SimpleDirectoryReader('files').load_data()
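# System prompt that steers the model toward grounded, context-based answers.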
system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as
accurately as possible based on the instructions and context provided.
"""
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.1, "do_sample": True},
    system_prompt=system_prompt,
    tokenizer_name="anasmkh/customized_llama3.1_8b",
    model_name="anasmkh/customized_llama3.1_8b",
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16},
)
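# Sentence-transformers model used to embed both documents and queries.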
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-mpnet-base-v2")
# Register the models globally so every LlamaIndex component uses them by default.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.num_output = 250
# Keep the retrieval window below the model's 4096-token limit to leave room for the answer.
Settings.context_window = 3900
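# Build an in-memory vector index over the loaded documents.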
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
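# Query engine that retrieves relevant chunks and asks the LLM to answer.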
query_engine = index.as_query_engine(llm=llm)
def chat(message, history):
    """Answer a user message with the query engine and append both turns to the history."""
    history = history or []
    history.append({"role": "user", "content": message})
    # query() returns a Response object, so convert it to plain text for display.
    response = query_engine.query(message)
    history.append({"role": "assistant", "content": str(response)})
    # Return an empty string to clear the textbox along with the updated history.
    return "", history
with gr.Blocks() as demo:
    # type="messages" lets the Chatbot render the role/content dicts built in chat().
    chatbot = gr.Chatbot(type="messages")
    message = gr.Textbox()
    # ClearButton resets both the textbox and the chat history in one click.
    clear = gr.ClearButton([message, chatbot])
    message.submit(chat, [message, chatbot], [message, chatbot])

demo.launch()