Spaces:

Lazyhope
/

RepoSnipy

Runtime error

App Files Files Community

Lazyhope commited on Jun 2, 2023

Commit

18247d7

1 Parent(s): 71a5415

Use docarray.index.InMemoryExactNNIndex instead of DocList as index

Browse files

Files changed (1) hide show

app.py +62 -50

app.py CHANGED Viewed

@@ -4,25 +4,34 @@ from typing import List, Optional
 import pandas as pd
 import streamlit as st
-from docarray import BaseDoc, DocList
 from docarray.typing import TorchTensor
-from docarray.utils.find import find
 from transformers import pipeline
-DATASET_PATH = Path(__file__).parent.joinpath("data/index.bin")
 @st.cache_resource(show_spinner="Loading dataset...")
 def load_index():
     class RepoDoc(BaseDoc):
         name: str
-        topics: list  # TODO: List[str]
         stars: int
         license: str
         code_embedding: Optional[TorchTensor[768]]
         doc_embedding: Optional[TorchTensor[768]]
-    return DocList[RepoDoc].load_binary(DATASET_PATH)
 @st.cache_resource(show_spinner="Loading RepoSim pipeline...")
@@ -31,7 +40,6 @@ def load_model():
         model="Lazyhope/RepoSim",
         trust_remote_code=True,
         device_map="auto",
-        use_auth_token=st.secrets.hf_token,  # TODO: delete this line when the pipeline is public
     )
@@ -52,8 +60,8 @@ def run_model(_model, repo_name, github_token):
 def run_search(index, query, search_field, limit):
-    top_matches, scores = find(
-        index=index, query=query, search_field=search_field, limit=limit
     )
     search_results = top_matches.to_dataframe()
@@ -62,7 +70,7 @@ def run_search(index, query, search_field, limit):
     return search_results
-index = load_index()
 model = load_model()
 with st.sidebar:
@@ -81,12 +89,14 @@ with st.sidebar:
         value=10,
         step=1,
         key="search_results_limit",
     )
     st.multiselect(
         label="Display columns",
         options=["scores", "name", "topics", "stars", "license"],
-        default=["scores", "name", "topics"],
         key="display_columns",
     )
@@ -107,7 +117,7 @@ st.checkbox(
     label="Add/Update this repo to the index",
     value=False,
     key="update_index",
-    help="Update index by generating embeddings for the latest version of this repo",
 )
@@ -117,55 +127,57 @@ if search:
     if match_res is not None:
         repo_name = f"{match_res.group('owner')}/{match_res.group('repo')}"
-        doc_index = -1
-        update_index = st.session_state.update_index
-        try:
-            doc_index = index.name.index(repo_name)
-            assert update_index is False
-            repo_doc = index[doc_index]
-        except (ValueError, AssertionError):
             repo_info = run_model(model, repo_name, st.session_state.github_token)
             if repo_info is None:
                 st.error("Repo not found or invalid GitHub token!")
                 st.stop()
-            repo_doc = index.doc_type(
-                name=repo_info["name"],
-                topics=repo_info["topics"],
-                stars=repo_info["stars"],
-                license=repo_info["license"],
-                code_embedding=repo_info["mean_code_embedding"],
-                doc_embedding=repo_info["mean_doc_embedding"],
-            )
-        if update_index:
-            if not repo_doc.license:
-                st.warning("License is missing in this repo!")
-            if doc_index == -1:
-                index.append(repo_doc)
-                st.success("Repo added to the index!")
             else:
-                index[doc_index] = repo_doc
                 st.success("Repo updated in the index!")
-        st.session_state["query"] = repo_doc
     else:
         st.error("Invalid input!")
-if "query" in st.session_state:
-    query = st.session_state.query
     limit = st.session_state.search_results_limit
     st.dataframe(
         pd.DataFrame(
             [
                 {
-                    "name": query.name,
-                    "topics": query.topics,
-                    "stars": query.stars,
-                    "license": query.license,
                 }
             ],
         )
@@ -174,14 +186,14 @@ if "query" in st.session_state:
     display_columns = st.session_state.display_columns
     code_sim_tab, doc_sim_tab = st.tabs(["Code Similarity", "Docstring Similarity"])
-    if query.code_embedding is not None:
-        code_sim_res = run_search(index, query, "code_embedding", limit)
         code_sim_tab.dataframe(code_sim_res[display_columns])
     else:
-        code_sim_tab.error("No code was extracted for this repo!")
-    if query.doc_embedding is not None:
-        doc_sim_res = run_search(index, query, "doc_embedding", limit)
         doc_sim_tab.dataframe(doc_sim_res[display_columns])
     else:
-        doc_sim_tab.error("No docstring was extracted for this repo!")

 import pandas as pd
 import streamlit as st
+from docarray import BaseDoc
 from docarray.typing import TorchTensor
+from docarray.index import InMemoryExactNNIndex
 from transformers import pipeline
+INDEX_PATH = Path(__file__).parent.joinpath("data/index.bin")
 @st.cache_resource(show_spinner="Loading dataset...")
 def load_index():
     class RepoDoc(BaseDoc):
         name: str
+        topics: list  # List[str]
         stars: int
         license: str
         code_embedding: Optional[TorchTensor[768]]
         doc_embedding: Optional[TorchTensor[768]]
+    default_doc = RepoDoc(
+        name="",
+        topics=[],
+        stars=0,
+        license="",
+        code_embedding=None,
+        doc_embedding=None,
+    )
+    return InMemoryExactNNIndex[RepoDoc](index_file_path=INDEX_PATH), default_doc
 @st.cache_resource(show_spinner="Loading RepoSim pipeline...")
         model="Lazyhope/RepoSim",
         trust_remote_code=True,
         device_map="auto",
     )
 def run_search(index, query, search_field, limit):
+    top_matches, scores = index.find(
+        query=query, search_field=search_field, limit=limit
     )
     search_results = top_matches.to_dataframe()
     return search_results
+index, default_doc = load_index()
 model = load_model()
 with st.sidebar:
         value=10,
         step=1,
         key="search_results_limit",
+        help="Limit the number of search results",
     )
     st.multiselect(
         label="Display columns",
         options=["scores", "name", "topics", "stars", "license"],
+        default=["scores", "name", "topics", "stars", "license"],
+        help="Select columns to display in the search results",
         key="display_columns",
     )
     label="Add/Update this repo to the index",
     value=False,
     key="update_index",
+    help="Encode the latest version of this repo and add/update it to the index",
 )
     if match_res is not None:
         repo_name = f"{match_res.group('owner')}/{match_res.group('repo')}"
+        records = index.filter({"name": {"$eq": repo_name}})
+        query_doc = default_doc.copy() if not records else records[0]
+        if st.session_state.update_index or not records:
             repo_info = run_model(model, repo_name, st.session_state.github_token)
             if repo_info is None:
                 st.error("Repo not found or invalid GitHub token!")
                 st.stop()
+            # Update document inplace
+            query_doc.name = repo_info["name"]
+            query_doc.topics = repo_info["topics"]
+            query_doc.stars = repo_info["stars"]
+            query_doc.license = repo_info["license"]
+            query_doc.code_embedding = repo_info["mean_code_embedding"]
+            query_doc.doc_embedding = repo_info["mean_doc_embedding"]
+        if st.session_state.update_index:
+            if not records:
+                if not query_doc.license:
+                    st.warning(
+                        "License is missing in this repo and will not be persisted!"
+                    )
+                elif (
+                    query_doc.code_embedding is None and query_doc.doc_embedding is None
+                ):
+                    st.warning(
+                        "This repo has no function code or docstring extracted and will not be persisted!"
+                    )
+                else:
+                    index.index(query_doc)
+                    st.success("Repo added to the index!")
             else:
                 st.success("Repo updated in the index!")
+            index.persist(file=INDEX_PATH)
+        st.session_state["query_doc"] = query_doc
     else:
         st.error("Invalid input!")
+if "query_doc" in st.session_state:
+    query_doc = st.session_state.query_doc
     limit = st.session_state.search_results_limit
     st.dataframe(
         pd.DataFrame(
             [
                 {
+                    "name": query_doc.name,
+                    "topics": query_doc.topics,
+                    "stars": query_doc.stars,
+                    "license": query_doc.license,
                 }
             ],
         )
     display_columns = st.session_state.display_columns
     code_sim_tab, doc_sim_tab = st.tabs(["Code Similarity", "Docstring Similarity"])
+    if query_doc.code_embedding is not None:
+        code_sim_res = run_search(index, query_doc, "code_embedding", limit)
         code_sim_tab.dataframe(code_sim_res[display_columns])
     else:
+        code_sim_tab.error("No function code was extracted for this repo!")
+    if query_doc.doc_embedding is not None:
+        doc_sim_res = run_search(index, query_doc, "doc_embedding", limit)
         doc_sim_tab.dataframe(doc_sim_res[display_columns])
     else:
+        doc_sim_tab.error("No function docstring was extracted for this repo!")