# -*- coding: utf-8 -*-
"""RAG_with_MircosoftPhi2_and_HF_Embeddings.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/github/sumant1122/RAG-Phi2-LlamaIndex/blob/main/RAG_with_MircosoftPhi2_and_HF_Embeddings.ipynb
"""
!pip install -q pypdf
!pip install -q python-dotenv
!pip install -q llama-index
!pip install -q llama-index-llms-huggingface
!pip install -q llama-index-embeddings-huggingface
!pip install -q gradio
!pip install -q einops
!pip install -q accelerate
!pip install -q llama-cpp-python
!pip install -q llama-index-llms-llama-cpp llama-index-embeddings-huggingface
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
import torch

# Load every file placed under /content/rag (PDFs are parsed via pypdf).
documents = SimpleDirectoryReader("/content/rag").load_data()
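# Quick sanity check (my addition, not in the original notebook): confirm the
# reader actually picked up files from /content/rag before building the index.
print(f"Loaded {len(documents)} document(s) from /content/rag")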
"""New sectiond""" | |
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.llms.llama_cpp import LlamaCPP

system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."

# This wraps the default prompts that are internal to llama-index.
# Note: system_prompt and query_wrapper_prompt are defined here but are not
# passed to the LlamaCPP constructor below, so they are effectively unused.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # Pass the URL of a GGUF model to download it automatically
    model_url=model_url,
    # optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # Llama 2 has a context window of 4096 tokens; set it lower for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use the GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)
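# Optional smoke test (my addition, not in the original notebook): run a single
# completion to confirm the GGUF model downloaded and loaded correctly before
# building the index. LlamaCPP follows the standard llama-index LLM interface,
# so complete() returns a response whose .text holds the generation.
test_completion = llm.complete("Briefly explain retrieval-augmented generation.")
print(test_completion.text)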
"""HuggingFace Embeddings""" | |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
# loads BAAI/bge-small-en-v1.5 | |
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5") | |
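# Optional check (my addition): embed a short string to confirm the embedding
# model loaded; BAAI/bge-small-en-v1.5 produces 384-dimensional vectors.
sample_embedding = embed_model.get_text_embedding("hello world")
print(len(sample_embedding))  # expected: 384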
service_context = ServiceContext.from_defaults(
    chunk_size=256,
    llm=llm,
    embed_model=embed_model,
)
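# Note (my addition): ServiceContext is deprecated in llama-index 0.10+ and
# removed in later releases. If the import above fails, the rough equivalent is
# the global Settings object (a sketch, not verified against the versions
# installed by this notebook):
#
# from llama_index.core import Settings
# Settings.llm = llm
# Settings.embed_model = embed_model
# Settings.chunk_size = 256
# index = VectorStoreIndex.from_documents(documents)  # then drop service_context below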
"""predict""" | |
index = VectorStoreIndex.from_documents(documents, service_context=service_context) | |
query_engine = index.as_query_engine() | |
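# Optional manual check (my addition): run one query directly against the engine
# before wiring it into Gradio. The question is a placeholder for whatever the
# PDFs under /content/rag actually cover.
sample_answer = query_engine.query("What is this document about?")
print(sample_answer)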
def predict(message, history):
    # history is required by gr.ChatInterface but unused here; each query is stateless.
    response = query_engine.query(message)
    return str(response)
"""Gradio""" | |
import gradio as gr | |
gr.ChatInterface(predict).launch(share=True) |