File size: 4,552 Bytes
138d490
65772d2
 
04e99d1
 
65772d2
 
 
 
04e99d1
 
138d490
65772d2
 
 
 
 
138d490
04e99d1
 
 
 
138d490
 
 
 
 
 
 
65772d2
 
 
138d490
04e99d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65772d2
 
138d490
91b8fbc
 
 
04e99d1
 
 
 
 
 
 
 
 
 
 
 
162428e
90b6409
64dce86
04e99d1
69b0a93
64dce86
65772d2
04e99d1
65772d2
04e99d1
 
65772d2
 
 
 
 
 
 
 
 
 
 
 
 
138d490
65772d2
04e99d1
65772d2
04e99d1
 
 
138d490
04e99d1
 
 
 
 
 
 
 
 
138d490
65772d2
04e99d1
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import streamlit as st
import os
from dotenv import load_dotenv
# from langchain.document_loaders import GithubFileLoader
from langchain_community.document_loaders import GithubFileLoader
# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from github import Github
from github import Auth

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Get the GITHUB_ACCESS_TOKEN from the environment / .env file.
GITHUB_ACCESS_TOKEN = os.getenv("GITHUB_ACCESS_TOKEN")
GITHUB_BASE_URL = "https://github.com/"

# Auth.Token rejects a missing/empty token with a bare AssertionError, which
# tells the user nothing. Report the real problem in the page and halt instead.
if not GITHUB_ACCESS_TOKEN:
  st.error(
    "GITHUB_ACCESS_TOKEN is not set. "
    "Add it to your environment or to a .env file and reload the app."
  )
  st.stop()

# Initialize the authenticated GitHub client shared by the helpers below.
auth = Auth.Token(GITHUB_ACCESS_TOKEN)
g = Github(auth=auth)


@st.cache_resource
def get_hugging_face_model():
  """Create the HuggingFace embedding model used to vectorise code.

  The function is decorated with ``st.cache_resource``, so Streamlit is
  expected to hand back the same object on later calls instead of
  constructing the model again.

  Returns:
    A ``HuggingFaceEmbeddings`` instance for the
    ``mchochlov/codebert-base-cd-ft`` model.
  """
  return HuggingFaceEmbeddings(model_name="mchochlov/codebert-base-cd-ft")

def get_similar_files(query, db, embeddings, k=4):
  """Return the stored chunks most similar to ``query``, with their scores.

  Args:
    query: The text (code sample) to search for.
    db: A vector store exposing ``similarity_search_with_score(query, k=...)``.
    embeddings: Unused. Kept in the signature so existing callers that pass
      it positionally keep working.
    k: Maximum number of hits to request from the store. The default of 4
      mirrors LangChain's own default, so callers that do not pass ``k``
      get the same results as before.

  Returns:
    Whatever ``db.similarity_search_with_score`` returns; the caller in this
    file iterates it as ``(document, score)`` pairs.
  """
  return db.similarity_search_with_score(query, k=k)

def fetch_repos(username):
  """List the repository names of a GitHub user.

  Args:
    username: GitHub login passed to ``g.get_user``.

  Returns:
    A list with the ``name`` of every repository yielded by
    ``user.get_repos()``. If anything raises along the way, the error is
    shown with ``st.error`` and an empty list is returned instead.
  """
  print(f"Fetching repositories for user: {username}")
  repo_names = []
  try:
    user = g.get_user(username)
    print(f"User: {user}")
    # Iterate inside the try block: a failure while listing repositories
    # must be reported the same way as a failed user lookup.
    for repo in user.get_repos():
      repo_names.append(repo.name)
  except Exception as e:
    st.error(f"Error fetching repositories: {e}")
    return []
  return repo_names

def get_file_contributors(repo_name, file_path):
    """Count commits per author for one file of a repository.

    The repository owner is taken from the module-level ``USER`` value.

    Args:
        repo_name: Repository name (without the owner prefix).
        file_path: Path of the file inside the repository; only commits
            touching this path are requested.

    Returns:
        A dict mapping author login to number of commits. Commits without
        an associated author are counted under ``"Unknown"``. On any error
        the message is shown with ``st.error`` and ``{}`` is returned.
    """
    try:
        repository = g.get_repo(f"{USER}/{repo_name}")
        commit_counts = {}
        for commit in repository.get_commits(path=file_path):
            if commit.author:
                author = commit.author.login
            else:
                author = "Unknown"
            commit_counts[author] = commit_counts.get(author, 0) + 1
        return commit_counts
    except Exception as e:
        st.error(f"Error fetching contributors: {e}")
        return {}

# Initialize session state for repositories
# The repository selectbox below reads its options from
# st.session_state.repos, so the key has to exist before the first fetch.
if "repos" not in st.session_state:
    st.session_state.repos = []

# STREAMLIT INTERFACE
# Widgets are declared in the order they should appear on the page.
st.title("Find Similar Code")

st.markdown("This app takes a code sample you provide, and finds similar code in a Github repository.")
st.markdown("This functionality could ideally be implemented across multiple repos to allow you to find helpful examples of how to implement the code you are working on writing, or identify other code contributors who could help you resolve your issues")

# GitHub login whose repositories are searched. get_file_contributors also
# reads this module-level value to build the "<owner>/<repo>" name.
USER = st.text_input("Enter the Github User", value = "Satttoshi")

fetch_repos_button = st.button("Fetch Repositories")

# Replace the stored repository list only when the fetch button was pressed;
# fetch_repos returns [] on error, which empties the selectbox below.
if fetch_repos_button:
    st.session_state.repos = fetch_repos(USER)


# Repository name chosen from the list stored in session state.
REPO = st.selectbox("Select a Github Repository", options=st.session_state.repos)


# File-name suffixes; the search step loads only files ending with one of them.
FILE_TYPES_TO_LOAD = st.multiselect("Select File Types", [".py", ".ts",".js",".css",".html"], default = [".ts"])

# Code sample used as the similarity query. The default value is just two
# newline characters, i.e. an effectively empty box.
text_input = st.text_area("Enter a Code Example", value =
"""

""", height = 330
)

# Pressing this button triggers the search step further down.
find_similar_code_button = st.button("Find Similar Code")

# Upper bound on the number of matches rendered in the page.
_MAX_RESULTS = 5


def _show_similar_code(query):
  """Index the selected repository and render the chunks closest to ``query``.

  Reads the module-level widget values ``USER``, ``REPO`` and
  ``FILE_TYPES_TO_LOAD``. Each early return leaves a message in the page
  explaining what is missing, instead of failing deep inside the loader or
  the vector store.

  Args:
    query: The code sample entered by the user.
  """
  if not REPO:
    # The selectbox has no value while the repository list is still empty.
    st.warning("Please fetch and select a repository first.")
    return
  if not FILE_TYPES_TO_LOAD:
    # With no suffixes the file filter would reject every file.
    st.warning("Please select at least one file type.")
    return
  if not query.strip():
    st.warning("Please enter a code example to search for.")
    return

  full_name = f"{USER}/{REPO}"
  print(f"Searching for similar code in {full_name}")

  # Both the loader and the links below must point at the repository's real
  # default branch; assuming "main" breaks repositories that use another name.
  try:
    branch = g.get_repo(full_name).default_branch
  except Exception as e:
    st.error(f"Error fetching repository details: {e}")
    return

  # Build the suffix tuple once instead of on every filter call.
  extensions = tuple(FILE_TYPES_TO_LOAD)
  loader = GithubFileLoader(
    repo=full_name,
    branch=branch,
    access_token=GITHUB_ACCESS_TOKEN,
    github_api_url="https://api.github.com",
    file_filter=lambda file_path: file_path.endswith(extensions),
  )
  documents = loader.load()
  text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
  docs = text_splitter.split_documents(documents)
  if not docs:
    # Building a FAISS index from an empty document list fails, so stop here.
    st.warning("No files with the selected file types were found in this repository.")
    return

  embedding_vector = get_hugging_face_model()
  db = FAISS.from_documents(docs, embedding_vector)
  results_with_scores = get_similar_files(query, db, embedding_vector)
  # NOTE(review): the vector store's default hit count (4 in LangChain) is
  # below this cap, so the slice only matters if the helper requests more.
  results_with_scores = results_with_scores[:_MAX_RESULTS]

  # Several chunks can come from the same file; look up its contributors once.
  contributors_by_path = {}
  for doc, score in results_with_scores:
    path = doc.metadata['path']
    score = round(float(score), 2)
    if path not in contributors_by_path:
      contributors_by_path[path] = get_file_contributors(REPO, path)
    contributors = contributors_by_path[path]
    print(f"Path: {path}, Score: {score}, Contributors: {contributors}")
    file_link = f"{GITHUB_BASE_URL}{full_name}/blob/{branch}/{path}"
    st.markdown(f"[{path}]({file_link})")
    for contributor, count in contributors.items():
      st.write(f"* Contributor: [{contributor}](https://github.com/{contributor}), Commits: {count}")


if find_similar_code_button:
  _show_similar_code(text_input)
else:
  st.info("Please Submit a Code Sample to Find Similar Code")

#https://github.com/heaversm/gdrive-docker/blob/main/gdrive/provider/__init__.py
#https://github.com/heaversm/gdrive-docker/blob/main/gdrive/provider/__init__.py