# Spaces status: Runtime error
import streamlit as st | |
import os | |
from dotenv import load_dotenv | |
from langchain.document_loaders import GithubFileLoader | |
# from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from langchain_text_splitters import CharacterTextSplitter | |
# Pull settings from the .env file into the process environment.
load_dotenv()

# GitHub access token taken from the environment; None when it is not set.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN")

# Prefix used when building browser links to files on GitHub.
GITHUB_BASE_URL = "https://github.com/"
def get_hugging_face_model():
    """Return a HuggingFaceEmbeddings instance for the CodeBERT checkpoint.

    The checkpoint name is fixed to "mchochlov/codebert-base-cd-ft".
    """
    return HuggingFaceEmbeddings(model_name="mchochlov/codebert-base-cd-ft")
def get_similar_files(query, db, embeddings=None):
    """Return the documents in ``db`` most similar to ``query``, with scores.

    Args:
        query: Text (here, a code snippet) to search for.
        db: Vector store exposing ``similarity_search_with_score(query)``.
        embeddings: Unused. Kept so existing three-argument callers keep
            working; the store embeds the query itself.

    Returns:
        Whatever ``db.similarity_search_with_score(query)`` returns; the
        caller iterates it as ``(document, score)`` pairs.
    """
    return db.similarity_search_with_score(query)
# STREAMLIT INTERFACE
# Collects a GitHub repository, a set of file extensions and a code sample,
# then shows the repository file chunk most similar to that sample.
st.title("Find Similar Code")

USER = st.text_input("Enter the Github User", value="heaversm")
REPO = st.text_input("Enter the Github Repository", value="gdrive-docker")
FILE_TYPES_TO_LOAD = st.multiselect(
    "Select File Types",
    [".py", ".ts", ".js", ".css", ".html"],
    default=[".py"],
)
text_input = st.text_area(
    "Enter a Code Example",
    value="""
def create_app():
    app = connexion.FlaskApp(__name__, specification_dir="../.openapi")
    app.add_api(
        API_VERSION, resolver=connexion.resolver.RelativeResolver("provider.app")
    )
""",
    height=330,
)
button = st.button("Find Similar Code")

if not button:
    st.info("Please Submit a Code Sample to Find Similar Code")
elif not FILE_TYPES_TO_LOAD:
    # str.endswith(()) is always False, so an empty selection would load
    # nothing and the search below would fail on an empty result list.
    st.warning("Please select at least one file type.")
elif not text_input.strip():
    st.warning("Please enter a code example to search for.")
else:
    # Build the suffix tuple once instead of on every file_filter call.
    suffixes = tuple(FILE_TYPES_TO_LOAD)
    loader = GithubFileLoader(
        repo=f"{USER}/{REPO}",
        access_token=GITHUB_ACCESS_TOKEN,
        github_api_url="https://api.github.com",
        file_filter=lambda file_path: file_path.endswith(suffixes),
    )
    documents = loader.load()

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    if not docs:
        # Nothing to index: building the vector store or indexing the first
        # result would raise, so report it to the user instead.
        st.warning(
            f"No files matching {', '.join(FILE_TYPES_TO_LOAD)} "
            f"were found in {USER}/{REPO}."
        )
    else:
        embedding_vector = get_hugging_face_model()
        db = FAISS.from_documents(docs, embedding_vector)
        results_with_scores = get_similar_files(text_input, db, embedding_vector)

        if not results_with_scores:
            st.warning("No similar code was found.")
        else:
            # Show every match in the page; print() only reached the
            # server log, where the user never sees it.
            for doc, score in results_with_scores:
                st.write(f"Path: {doc.metadata['path']}, Score: {score}")

            top_doc, top_file_score = results_with_scores[0]
            top_file_path = top_doc.metadata["path"]
            top_file_content = top_doc.page_content
            # NOTE(review): the link assumes the files live on the "main"
            # branch; confirm this matches the branch the loader reads.
            top_file_link = (
                f"{GITHUB_BASE_URL}{USER}/{REPO}/blob/main/{top_file_path}"
            )

            # write a clickable link in streamlit
            st.markdown(f"[Top file link]({top_file_link})")
            # NOTE(review): presumably a distance (lower = closer) with the
            # default FAISS index; confirm before labelling it for users.
            st.write(f"Top file score: {top_file_score}")
            st.code(top_file_content)