heaversm's picture
update main app file with new github streamlit code
65772d2
raw
history blame
2.52 kB
import streamlit as st
import os
from dotenv import load_dotenv
from langchain.document_loaders import GithubFileLoader
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
# Read a local .env file (when present) so its entries show up as environment
# variables before any setting below is looked up.
load_dotenv()

# Token handed to the GitHub loader as its access token; None when the
# GITHUB_ACCESS_TOKEN variable is not set.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN")

# Prefix used when building browser links to files on GitHub.
GITHUB_BASE_URL = "https://github.com/"
@st.cache_resource
def get_hugging_face_model():
    """Return the HuggingFace embedding model used to vectorize code.

    The function is wrapped in ``st.cache_resource``, which per Streamlit's
    documentation keeps the returned object cached across script reruns, so
    the model is not rebuilt on every button press.

    Returns:
        A ``HuggingFaceEmbeddings`` instance backed by the
        ``mchochlov/codebert-base-cd-ft`` model.
    """
    return HuggingFaceEmbeddings(model_name="mchochlov/codebert-base-cd-ft")
def get_similar_files(query, db, embeddings, k=None):
    """Return the documents in *db* most similar to *query*, with scores.

    Args:
        query: The code snippet (plain text) to search for.
        db: A vector store exposing ``similarity_search_with_score``.
        embeddings: Unused. Kept so existing callers that pass the embedding
            model positionally keep working.
        k: Optional number of hits to request. When ``None`` (the default)
            the store's own default result count is used, which matches the
            previous behaviour of this function.

    Returns:
        Whatever ``db.similarity_search_with_score`` returns; the caller in
        this file iterates it as ``(document, score)`` pairs.
    """
    if k is None:
        # Do not pass ``k`` at all so the store applies its own default.
        return db.similarity_search_with_score(query)
    return db.similarity_search_with_score(query, k=k)
# STREAMLIT INTERFACE
# Page layout: repository selectors, a code sample box and a search button.
st.title("Find Similar Code")

USER = st.text_input("Enter the Github User", value="heaversm")
REPO = st.text_input("Enter the Github Repository", value="gdrive-docker")
FILE_TYPES_TO_LOAD = st.multiselect(
    "Select File Types",
    [".py", ".ts", ".js", ".css", ".html"],
    default=[".py"],
)

text_input = st.text_area(
    "Enter a Code Example",
    value="""
def create_app():
app = connexion.FlaskApp(__name__, specification_dir="../.openapi")
app.add_api(
API_VERSION, resolver=connexion.resolver.RelativeResolver("provider.app")
)
""",
    height=330,
)

button = st.button("Find Similar Code")

if not button:
    st.info("Please Submit a Code Sample to Find Similar Code")
elif not FILE_TYPES_TO_LOAD:
    # str.endswith(()) is always False, so an empty selection would filter
    # out every file in the repository; tell the user instead of searching.
    st.warning("Please select at least one file type to search.")
else:
    # Build the suffix tuple once instead of on every file_filter call.
    suffixes = tuple(FILE_TYPES_TO_LOAD)
    loader = GithubFileLoader(
        repo=f"{USER}/{REPO}",
        access_token=GITHUB_ACCESS_TOKEN,
        github_api_url="https://api.github.com",
        file_filter=lambda file_path: file_path.endswith(suffixes),
    )
    documents = loader.load()

    # Split the loaded files into chunks of at most ~1000 characters with no
    # overlap; each chunk is embedded and searched independently.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)

    if not docs:
        # Nothing to index or rank, so skip building the vector store.
        st.warning(f"No matching files were found in {USER}/{REPO}.")
    else:
        embedding_vector = get_hugging_face_model()
        db = FAISS.from_documents(docs, embedding_vector)
        results_with_scores = get_similar_files(text_input, db, embedding_vector)

        # Log every hit to the process's stdout (not shown on the page).
        for doc, score in results_with_scores:
            print(f"Path: {doc.metadata['path']}, Score: {score}")

        if not results_with_scores:
            # Indexing [0] on an empty result list would raise IndexError.
            st.warning("No similar code was found.")
        else:
            top_file_path = results_with_scores[0][0].metadata['path']
            # NOTE(review): the link hard-codes the "main" branch, and the
            # loader is created without an explicit branch - confirm both
            # refer to the same branch for repositories that use another
            # default.
            top_file_link = f"{GITHUB_BASE_URL}{USER}/{REPO}/blob/main/{top_file_path}"
            # write a clickable link in streamlit
            st.markdown(f"[Top file link]({top_file_link})")