gitbot / main.py
S0ham075's picture
first
b7064d3
import os
import shutil
import git
from urllib.parse import urlparse
# Base directory under which the "staging" folder for cloned repositories
# lives; captured once, at import time, from the current working directory.
local_dir = os.getcwd()
# HEAD reference of the most recently cloned repository. Assigned by
# clone_repo() and passed to GitLoader by load_repo(); None until a clone
# has succeeded in this process.
branch = None
def get_repo_name(url: str) -> str:
    """Return the repository name encoded in a Git remote URL.

    The name is the last component of the URL path, with a trailing
    ".git" suffix removed when (and only when) it is present.

    Args:
        url: Repository URL, e.g. "https://github.com/owner/project.git".

    Returns:
        The bare repository name ("project" for the example above), or an
        empty string when the URL has no path component.
    """
    parsed_url = urlparse(url)
    # Drop trailing slashes first so ".../project/" still yields "project"
    # rather than an empty basename.
    repo_name = os.path.basename(parsed_url.path.rstrip("/"))
    # Strip the extension only if it is really there: slicing off four
    # characters unconditionally mangles URLs that do not end in ".git".
    if repo_name.endswith(".git"):
        repo_name = repo_name[:-4]
    return repo_name
def clone_repo(url):
    """Clone the repository at *url* into ``<local_dir>/staging/<repo name>``.

    On a successful clone the module-level ``branch`` is set to the HEAD
    reference of the new working copy.

    Returns:
        True when a fresh clone was created. False when the target directory
        already exists, or when any step raised (the error is printed, not
        propagated).
    """
    global branch
    try:
        repo_name = get_repo_name(url)
        target = os.path.join(local_dir, "staging", repo_name)
        if not os.path.exists(target):
            cloned = git.Repo.clone_from(url, target)
            branch = cloned.head.reference
            print(f"{repo_name} cloned succesfully")
            return True
        # An existing staging directory is treated as "already handled".
        print(f"{repo_name} already added in db")
        return False
    except Exception as err:
        print(f"Error cloning the git repository: {err}")
        return False
def delete_cloned_repo(url):
    """Remove the staged clone of *url* from ``<local_dir>/staging``.

    Deletion is best-effort and never raises for filesystem failures; the
    outcome is reported on stdout.

    Args:
        url: Repository URL; only used to derive the staging directory name.

    Returns:
        None.
    """
    local_path = os.path.join(local_dir, "staging", get_repo_name(url))
    try:
        if not os.path.exists(local_path):
            print(f"Repository at {local_path} does not exist.")
            return
        # ignore_errors=True lets rmtree continue past entries it cannot
        # delete, so as much of the tree as possible is removed ...
        shutil.rmtree(local_path, ignore_errors=True)
        # ... but it also hides every failure, so check the result instead
        # of announcing success unconditionally.
        if os.path.exists(local_path):
            print(f"Error deleting repository: {local_path} could not be fully removed.")
        else:
            print(f"Repository at {local_path} successfully deleted.")
    except Exception as e:
        print(f"Error deleting repository: {e}")
from langchain_community.document_loaders import GitLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
import qdrant_client
# Splitter applied to the loaded documents in load_repo(): chunk size 1000
# with an overlap of 20 between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
chunk_size = 1000,
chunk_overlap = 20,
)
# Alternative embedding backend, currently disabled.
# NOTE(review): a literal API key used to be committed in this comment; it is
# redacted here and should be treated as leaked and rotated. Read the key from
# the environment if this backend is ever re-enabled.
# from langchain_together.embeddings import TogetherEmbeddings
# embeddings2 = TogetherEmbeddings(model="togethercomputer/m2-bert-80M-8k-retrieval",together_api_key=os.getenv("TOGETHER_API_KEY"))
# Qdrant connection, created at import time. Host and API key come from the
# QDRANT_HOST / QDRANT_API_KEY environment variables (None when unset).
client = qdrant_client.QdrantClient(
os.getenv("QDRANT_HOST"),
api_key=os.getenv("QDRANT_API_KEY")
)
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
# Embedding model handed to the Qdrant vector store in load_repo(); the
# collection created there is configured for vectors of size 384.
embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5")
# Placeholder: nothing in the visible code assigns this module-level name
# (load_repo() keeps its store in a local variable).
# NOTE(review): confirm whether it is still needed.
vectorstore = None
def load_repo(url):
    """Index the staged clone of *url* into a Qdrant collection.

    The clone is read from ``<local_dir>/staging/<repo name>``, split into
    chunks with the module-level ``text_splitter``, and stored in a
    collection named after the repository. An existing collection with that
    name is replaced.

    Args:
        url: Repository URL; only used to derive the repository name.

    Returns:
        True when the documents were indexed, False when any step failed
        (the error is printed, not propagated).
    """
    try:
        repo_name = get_repo_name(url)

        # Load and split BEFORE touching Qdrant: recreate_collection drops any
        # existing collection of the same name, so a repository that fails to
        # load must not cost us previously indexed data.
        loader = GitLoader(
            repo_path=os.path.join(local_dir, "staging", repo_name),
            branch=branch,
            # Skip npm lock files (presumably noise for retrieval).
            file_filter=lambda file_path: not file_path.endswith("package-lock.json"),
        )
        chunks = text_splitter.split_documents(loader.load())
        print("chunks created")

        collection_config = qdrant_client.http.models.VectorParams(
            # Must match the embedding model's output width
            # (768 for instructor-xl, 1536 for OpenAI).
            size=384,
            distance=qdrant_client.http.models.Distance.COSINE,
        )
        client.recreate_collection(
            collection_name=repo_name,
            vectors_config=collection_config,
        )
        # Kept in a local: like the original, this function does not update
        # the module-level `vectorstore`.
        store = Qdrant(
            client=client,
            collection_name=repo_name,
            embeddings=embeddings,
        )
        print("collection created")

        store.add_documents(chunks)
        return True
    except Exception as e:
        # Covers loader, splitter and Qdrant failures alike so callers always
        # get a boolean outcome.
        print(f"Error loading and indexing repository: {e}")
        return False
def repository_loader(url):
    """Clone the repository at *url*, index it, and remove the staged clone.

    Nothing happens beyond clone_repo()'s own reporting when the clone step
    fails or the staging directory already exists.

    Args:
        url: Git URL of the repository to ingest.

    Returns:
        None.
    """
    if not clone_repo(url):
        return
    try:
        load_repo(url)
    finally:
        # Remove the clone we just created whether or not indexing worked.
        # A leftover staging directory would make clone_repo() reject every
        # retry for this repository as "already added".
        delete_cloned_repo(url)
# Runs at import time; presumably a marker showing that the module started
# inside the container.
print('HELLO FROM CONTAINER')
# Disabled manual smoke-test calls. answer_query is not defined in the visible
# part of this file, and delete_cloned_repo() requires a url argument.
#answer_query("How is the routing done in this project and what are the routes used",'https://github.com/s0ham075/Google-Docs-Frontend.git')
# delete_cloned_repo()