# blackbox / app.py — Hugging Face Space application
# (page header from the hosting site: author "anikettty", commit 0de368b verified)
import os
# Install runtime dependencies at import time (Hugging Face Space pattern).
# NOTE(review): os.system + pip at import is fragile; a requirements.txt would be safer — confirm Space config.
os.system('pip install python-dotenv llama-index llama-index-llms-ollama llama-index-packs-ragatouille-retriever llama-index-packs-code-hierarchy llama-index-vector-stores-qdrant llama-index-embeddings-fastembed langchain llama-index-embeddings-langchain -U langchain-community sentence-transformers unstructured gradio ipython')
# Redirect Hugging Face and torch model caches into a local "weights" directory.
os.environ["HF_HOME"] = "weights"
os.environ["TORCH_HOME"] = "weights"
import gc
import re
import uuid
import textwrap
import subprocess
import nest_asyncio
from dotenv import load_dotenv
from IPython.display import Markdown, display
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.core import PromptTemplate
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
from llama_index.core.storage.storage_context import StorageContext
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding
# Project-local helpers that build the embedding / reranker models.
from rag_101.retriever import (
    load_embedding_model,
    load_reranker_model
)
# allows nested access to the event loop
nest_asyncio.apply()
# setting up the llm
# NOTE(review): presumably a local Ollama server is running with "mistral" pulled — confirm.
llm=Ollama(model="mistral", request_timeout=60.0)
# setting up the embedding model
# Wrap the langchain embedding model so llama-index can consume it.
lc_embedding_model = load_embedding_model()
embed_model = LangchainEmbedding(lc_embedding_model)
# utility functions
def parse_github_url(url):
    """Extract ``(owner, repo)`` from a GitHub repository URL.

    Strips a trailing ``.git`` suffix from the repo name so it matches the
    directory name that ``git clone`` creates (otherwise the existence check
    in ``setup_query_engine`` never finds the checkout).

    Returns ``(None, None)`` when *url* is not a GitHub repository URL.
    """
    pattern = r"https://github\.com/([^/]+)/([^/]+)"
    match = re.match(pattern, url)
    if not match:
        return (None, None)
    owner, repo = match.groups()
    # git clone drops ".git" when naming the checkout directory.
    if repo.endswith(".git"):
        repo = repo[:-4]
    return owner, repo
def clone_github_repo(repo_url):
    """Clone *repo_url* into the current working directory.

    Returns the completed ``subprocess.CompletedProcess`` on success, or
    ``None`` when the clone fails (bad URL, network error, or the target
    directory already exists). The original version discarded the result
    and returned ``None`` on every path, so callers could not tell
    success from failure.
    """
    try:
        print('Cloning the repo ...')
        return subprocess.run(
            ["git", "clone", repo_url],
            check=True, text=True, capture_output=True,
        )
    except subprocess.CalledProcessError as e:
        # Best-effort: report the failure and let the caller decide what to do.
        print(f"Failed to clone repository: {e}")
        return None
def validate_owner_repo(owner, repo):
    """Return True only when both the owner and the repo name are non-empty."""
    if not owner:
        return False
    return bool(repo)
# Setup a query engine
def setup_query_engine(github_url):
    """Build a RAG query engine over the source code of a GitHub repository.

    Clones the repository if a local checkout is not already present, loads
    its source files, indexes them into an in-memory vector store, and
    returns a llama-index query engine configured with a custom QA prompt.

    Returns ``None`` when the URL is not a valid GitHub repo URL or when
    loading/indexing fails.
    """
    owner, repo = parse_github_url(github_url)
    if not validate_owner_repo(owner, repo):
        print('Invalid github repo, try again!')
        return None
    # Clone the GitHub repo into a directory named after the repo,
    # unless a previous run already left a checkout behind.
    input_dir_path = f"{repo}"
    if not os.path.exists(input_dir_path):
        clone_github_repo(github_url)
    loader = SimpleDirectoryReader(
        input_dir=input_dir_path,
        required_exts=[".py", ".ipynb", ".js", ".ts", ".md"],
        recursive=True,
    )
    try:
        docs = loader.load_data()
        # ====== Create vector store and upload data ======
        Settings.embed_model = embed_model
        index = VectorStoreIndex.from_documents(docs, show_progress=True)
        # TODO try async index creation for faster embedding generation & persist it to memory!
        # index = VectorStoreIndex(docs, use_async=True)
        # ====== Setup a query engine ======
        Settings.llm = llm
        query_engine = index.as_query_engine(similarity_top_k=4)
        # ====== Customise prompt template ======
        qa_prompt_tmpl_str = (
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Given the context information above I want you to think step by step to answer the query in a crisp manner, in case you don't know the answer say 'I don't know!'.\n"
            "Query: {query_str}\n"
            "Answer: "
        )
        qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)
        query_engine.update_prompts(
            {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
        )
        if docs:
            print("Data loaded successfully!!")
            print("Ready to chat!!")
        else:
            print("No data found, check if the repository is not empty!")
        return query_engine
    except Exception as e:
        # Broad catch is deliberate: loading/indexing can fail for many
        # third-party reasons; report and return None instead of crashing.
        print(f"An error occurred: {e}")
        return None
# Provide url to the repository you want to chat with
github_url = "https://github.com/Aniket23160/Pose-Graph-SLAM"
query_engine = setup_query_engine(github_url=github_url)
print("----------------------------------------------------------------")
# setup_query_engine returns None on invalid URLs or indexing failures;
# guard so we don't crash with AttributeError on query_engine.query(...).
if query_engine is not None:
    query = 'What is this repo about?'
    print(f"Question: {query}")
    response = query_engine.query(query)
    print(f"Answer: {response}")
else:
    print("Query engine setup failed; cannot answer questions.")