|
import os |
|
import shutil |
|
import git |
|
from urllib.parse import urlparse |
|
|
|
# Root directory for the "staging" folder that holds cloned repositories;
# captured once, from the process working directory, when the module loads.
local_dir = os.getcwd()

# Head reference of the most recent clone. clone_repo() assigns it and
# load_repo() passes it to GitLoader; it stays None until a clone succeeds.
branch = None
|
|
|
|
|
def get_repo_name(url):
    """Return the repository name encoded in a Git remote URL.

    The name is the last component of the URL path, with a trailing ``.git``
    suffix removed when one is present.

    Args:
        url: Remote URL, e.g. ``https://host/owner/repo.git`` or
            ``git@host:owner/repo.git``.

    Returns:
        The repository name (``"repo"`` for the examples above), or an empty
        string when the URL has no path component.
    """
    # Drop trailing slashes first; basename() of ".../repo/" would be "".
    url_path = urlparse(url).path.rstrip("/")
    repo_name = os.path.basename(url_path)

    # Remove the suffix only when it is really there. An unconditional
    # four-character slice corrupts URLs given without ".git"
    # (".../myrepo" would become "my").
    suffix = ".git"
    if repo_name.endswith(suffix):
        repo_name = repo_name[:-len(suffix)]
    return repo_name
|
|
|
|
|
def clone_repo(url):
    """Clone the repository at ``url`` into ``<local_dir>/staging/<repo name>``.

    After a successful clone the module-level ``branch`` is set to the head
    reference of the new working copy.

    Args:
        url: Remote URL of the Git repository.

    Returns:
        True when the repository was cloned. False when the target directory
        already exists (nothing is cloned) or when any exception occurred;
        errors are printed, never propagated.
    """
    global branch

    try:
        repo_name = get_repo_name(url)
        target_dir = os.path.join(local_dir, "staging", repo_name)

        if not os.path.exists(target_dir):
            cloned = git.Repo.clone_from(url, target_dir)
            # NOTE(review): this stores a GitPython reference object, not a
            # plain branch-name string; confirm that is what consumers of
            # `branch` expect.
            branch = cloned.head.reference
            print(f"{repo_name} cloned succesfully")
            return True

        # An existing staging directory is treated as "already handled".
        print(f"{repo_name} already added in db")
        return False
    except Exception as e:
        print(f"Error cloning the git repository: {e}")
        return False
|
|
|
def delete_cloned_repo(url):
    """Remove the staging clone of the repository at ``url``, if present.

    The directory ``<local_dir>/staging/<repo name>`` is deleted on a
    best-effort basis and the outcome is printed.

    Args:
        url: Remote URL of the repository; only used to derive the name of
            the staging directory.

    Returns:
        None. Failures inside the removal step are reported on stdout rather
        than raised.
    """
    local_path = os.path.join(local_dir, "staging", get_repo_name(url))

    try:
        if not os.path.exists(local_path):
            print(f"Repository at {local_path} does not exist.")
            return

        # Keep the best-effort removal (individual failures do not abort the
        # walk), but verify the result afterwards: with ignore_errors=True a
        # failed removal is silent, so success must not be assumed.
        shutil.rmtree(local_path, ignore_errors=True)

        if os.path.exists(local_path):
            print(f"Error deleting repository: {local_path} could not be fully removed")
        else:
            print(f"Repository at {local_path} successfully deleted.")
    except Exception as e:
        print(f"Error deleting repository: {e}")
|
|
|
from langchain_community.document_loaders import GitLoader |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_community.vectorstores import Qdrant |
|
import qdrant_client |
|
|
|
# Shared splitter used by load_repo() to cut loaded documents into chunks
# (chunk_size=1000, chunk_overlap=20).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
|
|
|
|
|
|
|
|
|
# Shared Qdrant client. Its location and API key are read from the
# QDRANT_HOST and QDRANT_API_KEY environment variables when the module loads;
# either value is None if the variable is unset.
client = qdrant_client.QdrantClient(
    os.environ.get("QDRANT_HOST"),
    api_key=os.environ.get("QDRANT_API_KEY"),
)
|
|
|
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings |
|
# Embedding model handed to the Qdrant vector store in load_repo().
embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-small-en-v1.5")

# Module-level placeholder.
# NOTE(review): nothing visible in this file rebinds this global; confirm
# whether load_repo() was meant to publish its vector store here.
vectorstore = None
|
|
|
def load_repo(url):
    """Index the staged clone of ``url`` into a Qdrant collection.

    The collection is named after the repository and is recreated on every
    call. Documents are read from ``<local_dir>/staging/<repo name>`` with
    ``GitLoader`` (skipping ``package-lock.json`` files), split with the
    module-level ``text_splitter`` and added to the collection.

    Args:
        url: Remote URL of the repository; only used to derive the repository
            name, which names both the collection and the staging directory.

    Returns:
        True when every chunk was added. False when any step failed,
        including collection setup; the error is printed, not propagated.
    """
    # Collection setup lives inside the try block as well, so that a Qdrant
    # or network failure follows the same print-and-return-False contract as
    # the loading steps instead of escaping to the caller.
    try:
        repo_name = get_repo_name(url)

        # NOTE(review): `size` has to equal the output dimension of
        # `embeddings`; 384 is presumably that of BAAI/bge-small-en-v1.5.
        # Confirm before changing the embedding model.
        collection_config = qdrant_client.http.models.VectorParams(
            size=384,
            distance=qdrant_client.http.models.Distance.COSINE,
        )
        client.recreate_collection(
            collection_name=repo_name,
            vectors_config=collection_config,
        )

        # Local handle only: the module-level `vectorstore` is not updated
        # here. NOTE(review): confirm whether it was meant to be.
        store = Qdrant(
            client=client,
            collection_name=repo_name,
            embeddings=embeddings,
        )
        print("collection created")

        loader = GitLoader(
            repo_path=os.path.join(local_dir, "staging", repo_name),
            branch=branch,
            file_filter=lambda file_path: not file_path.endswith("package-lock.json"),
        )
        data = loader.load()
        chunks = text_splitter.split_documents(data)
        print("chunks created")

        store.add_documents(chunks)
        return True
    except Exception as e:
        print(f"Error loading and indexing repository: {e}")
        return False
|
|
|
def repository_loader(url):
    """Clone the repository at ``url``, index it, then remove the clone.

    Args:
        url: Remote URL of the Git repository.

    Returns:
        The result of ``load_repo(url)`` when the clone succeeded (True on a
        successful index), otherwise False.
    """
    if not clone_repo(url):
        return False

    try:
        return load_repo(url)
    finally:
        # Remove the staging clone whether or not indexing succeeded.
        # clone_repo() refuses to clone into an existing directory, so a
        # leftover clone from a failed load would block every later retry.
        delete_cloned_repo(url)
|
|
|
|
|
|
|
# Start-up marker; runs as a module-level statement whenever this file is
# executed or imported.
print("HELLO FROM CONTAINER")
|
|
|
|
|
|