# -*- coding: utf-8 -*-
"""RAG_with_MircosoftPhi2_and_HF_Embeddings.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/github/sumant1122/RAG-Phi2-LlamaIndex/blob/main/RAG_with_MircosoftPhi2_and_HF_Embeddings.ipynb
"""
!pip install -q pypdf
!pip install -q python-dotenv
!pip install -q llama-index
!pip install -q llama-index-llms-huggingface
!pip install -q llama-index-embeddings-huggingface
!pip install -q gradio
!pip install -q einops
!pip install -q accelerate
!pip install -q llama-cpp-python
!pip install -q llama-index-llms-llama-cpp llama-index-embeddings-huggingface
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
import torch

# Load every file placed under /content/rag (PDFs are parsed via pypdf).
documents = SimpleDirectoryReader("/content/rag").load_data()
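# Quick sanity check (my addition, not in the original notebook): confirm the
# reader actually picked up files from /content/rag before building the index.
print(f"Loaded {len(documents)} document(s) from /content/rag")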
"""New sectiond""" | |
from llama_index.core.prompts.prompts import SimpleInputPrompt
from llama_index.llms.llama_cpp import LlamaCPP

system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."

# This wraps the default prompts that are internal to llama-index.
# Note: system_prompt and query_wrapper_prompt are defined here but are not
# passed to the LlamaCPP constructor below, so they are effectively unused.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

# model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin"
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # Pass the URL of a GGUF model to download it automatically
    model_url=model_url,
    # optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # Llama 2 has a context window of 4096 tokens; set it lower for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use the GPU
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)
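# Optional smoke test (my addition, not in the original notebook): run a single
# completion to confirm the GGUF model downloaded and loaded correctly before
# building the index. LlamaCPP follows the standard llama-index LLM interface,
# so complete() returns a response whose .text holds the generation.
test_completion = llm.complete("Briefly explain retrieval-augmented generation.")
print(test_completion.text)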
"""HuggingFace Embeddings""" | |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
# loads BAAI/bge-small-en-v1.5 | |
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5") | |
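# Optional check (my addition): embed a short string to confirm the embedding
# model loaded; BAAI/bge-small-en-v1.5 produces 384-dimensional vectors.
sample_embedding = embed_model.get_text_embedding("hello world")
print(len(sample_embedding))  # expected: 384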
service_context = ServiceContext.from_defaults(
    chunk_size=256,
    llm=llm,
    embed_model=embed_model,
)
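# Note (my addition): ServiceContext is deprecated in llama-index 0.10+ and
# removed in later releases. If the import above fails, the rough equivalent is
# the global Settings object (a sketch, not verified against the versions
# installed by this notebook):
#
# from llama_index.core import Settings
# Settings.llm = llm
# Settings.embed_model = embed_model
# Settings.chunk_size = 256
# index = VectorStoreIndex.from_documents(documents)  # then drop service_context below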
"""predict""" | |
index = VectorStoreIndex.from_documents(documents, service_context=service_context) | |
query_engine = index.as_query_engine() | |
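# Optional manual check (my addition): run one query directly against the engine
# before wiring it into Gradio. The question is a placeholder for whatever the
# PDFs under /content/rag actually cover.
sample_answer = query_engine.query("What is this document about?")
print(sample_answer)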
def predict(message, history):
    # history is required by gr.ChatInterface but unused here; each query is stateless.
    response = query_engine.query(message)
    return str(response)
"""Gradio""" | |
import gradio as gr | |
gr.ChatInterface(predict).launch(share=True) |