File size: 2,879 Bytes
138d490
65772d2
 
 
 
 
 
 
138d490
65772d2
 
 
 
 
138d490
 
 
 
 
 
 
 
65772d2
 
 
138d490
65772d2
 
138d490
91b8fbc
 
 
65772d2
 
 
162428e
90b6409
64dce86
65772d2
 
 
 
 
69b0a93
64dce86
65772d2
 
 
 
69b0a93
65772d2
 
 
 
 
 
 
 
 
 
 
 
 
138d490
65772d2
 
 
 
 
 
 
 
 
 
138d490
 
65772d2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import streamlit as st
import os
from dotenv import load_dotenv
from langchain.document_loaders import GithubFileLoader
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter

# Load variables from the .env file into the process environment.
load_dotenv()

# Base URL used when building browser links to files in a repository.
GITHUB_BASE_URL = "https://github.com/"
# Token passed to the GitHub loader; None when the variable is not set.
GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN")


@st.cache_resource
def get_hugging_face_model():
  """Return a HuggingFaceEmbeddings instance for `mchochlov/codebert-base-cd-ft`.

  The function is wrapped in `st.cache_resource`, so the embeddings object
  is meant to be built once and reused rather than recreated on each call.
  """
  return HuggingFaceEmbeddings(model_name="mchochlov/codebert-base-cd-ft")

def get_similar_files(query, db, embeddings):
  """Look up the entries in `db` that are most similar to `query`.

  Args:
    query: Text to search for.
    db: Object exposing `similarity_search_with_score(query)`.
    embeddings: Accepted for interface compatibility; not used here.

  Returns:
    Whatever `db.similarity_search_with_score(query)` returns (the caller
    in this file iterates it as `(document, score)` pairs).
  """
  return db.similarity_search_with_score(query)

# STREAMLIT INTERFACE
# Page header and description.
st.title("Find Similar Code")
st.markdown(
  "This app takes a code sample you provide, and finds similar code in a Github repository."
)
st.markdown(
  "This functionality could ideally be implemented across multiple repos to allow you to find helpful examples of how to implement the code you are working on writing, or identify other code contributors who could help you resolve your issues"
)

# File extensions offered in the multiselect.
_FILE_TYPE_OPTIONS = [".py", ".ts", ".js", ".css", ".html"]

# Code sample pre-filled in the text area.
_DEFAULT_CODE_SAMPLE = """
def create_app():
    app = connexion.FlaskApp(__name__, specification_dir="../.openapi")
    app.add_api(
        API_VERSION, resolver=connexion.resolver.RelativeResolver("provider.app")
    )
"""

# Repository coordinates and the file types to load from it.
USER = st.text_input("Enter the Github User", value="heaversm")
REPO = st.text_input("Enter the Github Repository", value="gdrive-docker")
FILE_TYPES_TO_LOAD = st.multiselect(
  "Select File Types",
  _FILE_TYPE_OPTIONS,
  default=[".py"],
)

# The code example that is later used as the similarity query.
text_input = st.text_area(
  "Enter a Code Example",
  value=_DEFAULT_CODE_SAMPLE,
  height=330,
)

# True only on the script run triggered by clicking the button.
button = st.button("Find Similar Code")


# Search flow: load the repo files, split them into chunks, embed them into a
# FAISS index, and link to the file containing the best-matching chunk.
if button:
  loader = GithubFileLoader(
    repo=f"{USER}/{REPO}",
    access_token=GITHUB_ACCESS_TOKEN,
    github_api_url="https://api.github.com",
    # str.endswith accepts a tuple; an empty selection matches no file.
    file_filter=lambda file_path: file_path.endswith(
      tuple(FILE_TYPES_TO_LOAD)
    )
  )
  documents = loader.load()
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
  docs = text_splitter.split_documents(documents)

  if not docs:
    # Nothing to index (no file types selected, or no matching files in the
    # repository). Report it instead of building an index from an empty list
    # and indexing into an empty result set below.
    st.warning("No files matching the selected file types were found in this repository.")
  else:
    embedding_vector = get_hugging_face_model()
    db = FAISS.from_documents(docs, embedding_vector)
    results_with_scores = get_similar_files(text_input, db, embedding_vector)

    # Log every hit to stdout for debugging.
    for doc, score in results_with_scores:
      print(f"Path: {doc.metadata['path']}, Score: {score}")

    if results_with_scores:
      # The first entry is treated as the best match.
      top_doc, _ = results_with_scores[0]
      top_file_path = top_doc.metadata['path']
      # NOTE(review): the link is hard-coded to the `main` branch; confirm
      # this matches the branch the loader reads from.
      top_file_link = f"{GITHUB_BASE_URL}{USER}/{REPO}/blob/main/{top_file_path}"
      # write a clickable link in streamlit
      st.markdown(f"[Top file link]({top_file_link})")
    else:
      st.warning("No similar code was found.")


else:
  st.info("Please Submit a Code Sample to Find Similar Code")