Codebert-Repo-Analyzer / github_st.py
heaversm's picture
User specified username, repo, and file type selector. Need to deal with access token
65ef3b6
raw
history blame
2.73 kB
import streamlit as st
import os
from dotenv import load_dotenv
from langchain.document_loaders import GithubFileLoader
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
# Load settings from a local .env file before any of them are read.
load_dotenv()

# Token passed to the GitHub loader further down; None when the variable is unset.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN")

# Prefix used when building browser links to repository files.
GITHUB_BASE_URL = "https://github.com/"
@st.cache_resource
def get_hugging_face_model():
    """Return the embedding model used to index and query code.

    Wraps the ``mchochlov/codebert-base-cd-ft`` checkpoint in a
    ``HuggingFaceEmbeddings`` object. The function is decorated with
    ``st.cache_resource`` so Streamlit can reuse the constructed object
    rather than rebuilding it every time the script reruns.
    """
    return HuggingFaceEmbeddings(model_name="mchochlov/codebert-base-cd-ft")
def get_similar_files(query, db, embeddings):
    """Search *db* for the entries most similar to *query*.

    Args:
        query: Text of the code sample to look up.
        db: Object exposing ``similarity_search_with_score(query)``.
        embeddings: Not used by this function; kept in the signature so
            existing callers that pass it keep working.

    Returns:
        Whatever ``db.similarity_search_with_score(query)`` returns. The
        caller in this file iterates it as ``(document, score)`` pairs.
    """
    # The raw query text goes straight to the store; nothing is embedded here.
    return db.similarity_search_with_score(query)
st.title("Find Similar Code")

# Repository coordinates, pre-filled with a sample repository.
USER = st.text_input("Enter the Github User", value="heaversm")
REPO = st.text_input("Enter the Github Repository", value="gdrive-docker")

# File extensions offered to the user; only ".py" is selected by default.
_EXTENSION_CHOICES = [".py", ".ts", ".js", ".css", ".html"]
FILE_TYPES_TO_LOAD = st.multiselect(
    "Select File Types",
    _EXTENSION_CHOICES,
    default=[".py"],
)

# Sample query shown in the text area until the user replaces it.
_EXAMPLE_SNIPPET = """
def create_app():
app = connexion.FlaskApp(__name__, specification_dir="../.openapi")
app.add_api(
API_VERSION, resolver=connexion.resolver.RelativeResolver("provider.app")
)
"""
text_input = st.text_area(
    "Enter a Code Example",
    value=_EXAMPLE_SNIPPET,
    height=330,
)

# True only on the rerun triggered by clicking the button.
button = st.button("Find Similar Code")
if button:
    # Runs on the rerun triggered by the button: load the repository's files,
    # index them, and link to the file that best matches the code sample.
    owner = USER.strip()
    repo_name = REPO.strip()

    # Guard clauses: report bad input with a readable message and halt this
    # run, instead of letting the loader or the index fail with a traceback.
    if not GITHUB_ACCESS_TOKEN:
        st.error(
            "GITHUB_ACCESS_TOKEN is not set. Add it to your .env file or "
            "environment and reload the app."
        )
        st.stop()
    if not owner or not repo_name:
        st.error("Please enter both a Github user and a repository.")
        st.stop()
    if not FILE_TYPES_TO_LOAD:
        # str.endswith(()) is always False, so an empty selection would
        # silently load nothing.
        st.warning("Please select at least one file type.")
        st.stop()

    # Build the suffix tuple once rather than on every file_filter call.
    extensions = tuple(FILE_TYPES_TO_LOAD)
    loader = GithubFileLoader(
        # repo is USER/REPO
        repo=f"{owner}/{repo_name}",
        access_token=GITHUB_ACCESS_TOKEN,
        github_api_url="https://api.github.com",
        file_filter=lambda file_path: file_path.endswith(extensions),
    )
    documents = loader.load()

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    if not docs:
        # Nothing to index: no file matched the selected types, or the
        # matching files produced no chunks.
        st.warning(
            f"No indexable files of the selected types were found in "
            f"{owner}/{repo_name}."
        )
        st.stop()

    embedding_model = get_hugging_face_model()
    db = FAISS.from_documents(docs, embedding_model)
    results_with_scores = get_similar_files(text_input, db, embedding_model)
    if not results_with_scores:
        st.warning("No similar code was found.")
        st.stop()

    # Console-only diagnostics; the page itself shows just the top match.
    for doc, score in results_with_scores:
        print(f"Metadata: {doc.metadata}, Score: {score}")

    # The first result is treated as the best match.
    top_doc, _ = results_with_scores[0]
    top_file_path = top_doc.metadata['path']
    # NOTE(review): the link hard-codes the "main" branch; presumably this
    # matches the branch the loader reads by default - confirm for repos
    # whose default branch is named differently.
    top_file_link = f"{GITHUB_BASE_URL}{owner}/{repo_name}/blob/main/{top_file_path}"
    # write a clickable link in streamlit
    st.markdown(f"[Top file link]({top_file_link})")
else:
    st.info("Please Submit a Code Sample")