# NOTE(review): removed extraction artifacts that made this file invalid Python
# (a "File size" banner, commit hashes, and a copied line-number gutter).
# --- Bootstrap: install dependencies, then configure caches before heavy imports ---
import os
import sys
import subprocess

# Install required packages with the *current* interpreter's pip.
# os.system("pip install ...") may invoke a pip belonging to a different
# Python and silently ignores failures; `python -m pip` + check=True is the
# officially recommended invocation.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "python-dotenv", "llama-index", "llama-index-llms-ollama",
        "llama-index-packs-ragatouille-retriever",
        "llama-index-packs-code-hierarchy",
        "llama-index-vector-stores-qdrant",
        "llama-index-embeddings-fastembed",
        "langchain", "llama-index-embeddings-langchain",
        "-U", "langchain-community", "sentence-transformers",
        "unstructured", "gradio", "ipython",
    ],
    check=True,
)

# Cache HuggingFace / torch model weights in ./weights instead of ~/.cache.
# Must be set before any library that reads these env vars is imported.
os.environ["HF_HOME"] = "weights"
os.environ["TORCH_HOME"] = "weights"

import gc
import re
import uuid
import textwrap
import nest_asyncio
from dotenv import load_dotenv
from IPython.display import Markdown, display
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core import PromptTemplate
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.core.storage.storage_context import StorageContext
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
from rag_101.retriever import (
    load_embedding_model,
    load_reranker_model,
)

# allow nested access to the event loop (needed when running inside a notebook)
nest_asyncio.apply()

# LLM served locally through Ollama
llm = Ollama(model="mistral", request_timeout=60.0)

# embedding model: wrap the LangChain embedder so llama-index can use it
lc_embedding_model = load_embedding_model()
embed_model = LangchainEmbedding(lc_embedding_model)
# utility functions
def parse_github_url(url):
    """Extract ``(owner, repo)`` from a GitHub repository URL.

    Accepts plain repo URLs, an optional trailing ``.git`` suffix, a
    trailing slash, and deeper paths (``.../tree/main``). Stripping
    ``.git`` matters because callers use the repo name as the directory
    ``git clone`` creates, which never carries the suffix.

    Returns:
        tuple: ``(owner, repo)`` on a match, ``(None, None)`` otherwise.
    """
    pattern = r"https://github\.com/([^/]+)/([^/]+?)(?:\.git)?(?:/.*)?$"
    match = re.match(pattern, url)
    return match.groups() if match else (None, None)
def clone_github_repo(repo_url):
    """Clone *repo_url* into the current working directory.

    Returns:
        True on success, None on failure. (The original always returned
        None — the successful ``subprocess.run`` result was discarded —
        and callers ignore the return value, so returning a success flag
        is backward compatible.)
    """
    print('Cloning the repo ...')
    try:
        subprocess.run(
            ["git", "clone", repo_url],
            check=True, text=True, capture_output=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # CalledProcessError: git ran but failed (bad URL, auth, ...).
        # FileNotFoundError: git binary not installed — previously escaped uncaught.
        print(f"Failed to clone repository: {e}")
        return None
    return True
def validate_owner_repo(owner, repo):
    """Return True only when both *owner* and *repo* are truthy (non-empty, non-None)."""
    return all((owner, repo))
# Setup a query engine
def setup_query_engine(github_url):
    """Build a RAG query engine over the contents of a GitHub repository.

    Clones the repo into ``./<repo>`` (skipped if the directory already
    exists), indexes its source files with the module-level ``embed_model``,
    and wires the module-level ``llm`` into a llama-index query engine with
    a custom QA prompt.

    Args:
        github_url: URL of the GitHub repository to index.

    Returns:
        A llama-index query engine on success, or None when the URL is
        invalid or loading/indexing raises.
    """
    owner, repo = parse_github_url(github_url)
    # guard clause: bail out early on an unparseable URL
    if not validate_owner_repo(owner, repo):
        print('Invalid github repo, try again!')
        return None

    # `git clone` creates a directory named after the repo
    input_dir_path = f"{repo}"
    if not os.path.exists(input_dir_path):
        clone_github_repo(github_url)

    loader = SimpleDirectoryReader(
        input_dir=input_dir_path,
        required_exts=[".py", ".ipynb", ".js", ".ts", ".md"],
        recursive=True,
    )
    try:
        docs = loader.load_data()

        # ====== Create vector store and upload data ======
        Settings.embed_model = embed_model
        index = VectorStoreIndex.from_documents(docs, show_progress=True)
        # TODO: try async index creation for faster embedding generation
        # and persist the index, e.g.:
        # index = VectorStoreIndex(docs, use_async=True)

        # ====== Setup a query engine ======
        Settings.llm = llm
        query_engine = index.as_query_engine(similarity_top_k=4)

        # ====== Customise prompt template ======
        qa_prompt_tmpl_str = (
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Given the context information above I want you to think step by step to answer the query in a crisp manner, in case you don't know the answer say 'I don't know!'.\n"
            "Query: {query_str}\n"
            "Answer: "
        )
        qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)
        query_engine.update_prompts(
            {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
        )

        if docs:
            print("Data loaded successfully!!")
            print("Ready to chat!!")
        else:
            print("No data found, check if the repository is not empty!")
        return query_engine
    except Exception as e:
        # Broad catch is deliberate: surface any load/index failure to the
        # user and return None instead of crashing the script.
        print(f"An error occurred: {e}")
        return None
# Provide url to the repository you want to chat with
github_url = "https://github.com/Aniket23160/Pose-Graph-SLAM"
query_engine = setup_query_engine(github_url=github_url)

print("----------------------------------------------------------------")
query = 'What is this repo about?'
print(f"Question: {query}")
# setup_query_engine returns None on an invalid URL or a failed load;
# guard so we don't crash with AttributeError on `.query`.
if query_engine is not None:
    response = query_engine.query(query)
    print(f"Answer: {response}")
else:
    print("Query engine could not be created; see errors above.")