heaversm's picture
retitle, remove balloons
d2115ee
raw
history blame
2.16 kB
import streamlit as st
from bs4 import BeautifulSoup
from langchain.embeddings import HuggingFaceEmbeddings
import pickle
import torch
import io
from langchain.vectorstores import FAISS
class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that maps pickled torch storages onto the CPU.

    A lookup of ``torch.storage._load_from_bytes`` is answered with a
    replacement loader that calls ``torch.load`` with ``map_location='cpu'``.
    Every other lookup is delegated unchanged to ``pickle.Unpickler``.
    """

    def find_class(self, module, name):
        """Resolve the global that the pickle stream names ``module.name``."""
        if (module, name) != ('torch.storage', '_load_from_bytes'):
            return super().find_class(module, name)

        def load_on_cpu(raw_bytes):
            # Deserialize the raw bytes with the target device forced to CPU.
            # NOTE(review): presumably this lets a pickle written on a GPU
            # machine be read on a CPU-only host -- confirm.
            return torch.load(io.BytesIO(raw_bytes), map_location='cpu')

        return load_on_cpu
@st.cache_resource
def get_hugging_face_model():
    """Build the HuggingFaceEmbeddings wrapper for the code-similarity model.

    The function is decorated with ``st.cache_resource``, so Streamlit manages
    caching of the returned object.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        "mchochlov/codebert-base-cd-ft" model name.
    """
    return HuggingFaceEmbeddings(model_name="mchochlov/codebert-base-cd-ft")
@st.cache_resource
def get_db():
    """Unpickle and return the object stored in "codesearchdb.pickle".

    The file is read through ``CPU_Unpickler`` rather than plain
    ``pickle.load``. The function is decorated with ``st.cache_resource``, so
    Streamlit manages caching of the returned object.

    NOTE(review): unpickling executes code embedded in the file; this assumes
    "codesearchdb.pickle" is a trusted, locally produced artifact -- confirm.
    """
    with open("codesearchdb.pickle", "rb") as pickle_file:
        return CPU_Unpickler(pickle_file).load()
def get_similar_links(query, db, embeddings, k=10):
    """Return the links found in the stored documents most similar to ``query``.

    Args:
        query: Text to search for; passed to ``embeddings.embed_query``.
        db: Vector store exposing ``similarity_search_by_vector``.
        embeddings: Embedding model exposing ``embed_query``.
        k: Number of documents to request from the store. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        A flat list holding the ``href`` value of every ``<a href=...>`` tag
        found in the ``page_content`` of each matched document, in the order
        the documents were returned. Duplicates are kept; callers
        de-duplicate if they need to.
    """
    query_vector = embeddings.embed_query(query)
    # The results are used as plain documents (``.page_content``), not as
    # (document, score) pairs.
    matches = db.similarity_search_by_vector(query_vector, k=k)

    links = []
    for match in matches:
        # Each document's page_content is parsed as an HTML fragment.
        soup = BeautifulSoup(match.page_content, 'html.parser')
        # href=True skips anchors without an href, so the lookup cannot fail.
        links.extend(anchor['href'] for anchor in soup.find_all('a', href=True))
    return links
@st.cache_resource
def _load_code_index():
    """Load the FAISS index stored in the local "code_sim_index" folder.

    Wrapped in ``st.cache_resource`` (like the model loader) so the index is
    not re-read from disk every time Streamlit re-executes this script.
    """
    # NOTE(security): allow_dangerous_deserialization=True opts in to
    # deserializing data stored with the index. Only point this at index
    # files you produced yourself or otherwise trust.
    return FAISS.load_local(
        "code_sim_index",
        get_hugging_face_model(),
        allow_dangerous_deserialization=True,
    )


# NOTE: despite its name, this holds the embeddings model object (it is later
# passed to get_similar_links as ``embeddings``), not an embedding vector.
embedding_vector = get_hugging_face_model()
db = _load_code_index()
st.title("Find Similar Code")
# Code input box, pre-filled with a sample snippet (a "subsets" solution built
# on a nested backtrack helper). Its current text is what gets searched when
# the button below is pressed.
# NOTE(review): the sample snippet appears without indentation in this view;
# confirm the literal in the real file is indented so the default example is
# valid Python.
text_input = st.text_area("Enter a Code Example", value =
"""
class Solution:
def subsets(self, nums: List[int]) -> List[List[int]]:
outputs = []
def backtrack(k, index, subSet):
if index == k:
outputs.append(subSet[:])
return
for i in range(index, len(nums)):
backtrack(k, i + 1, subSet + [nums[i]])
for j in range(len(nums) + 1):
backtrack(j, 0, [])
return outputs
""", height = 330
)
button = st.button("Find Similar Questions")
# Search only when the button was clicked AND the box holds something other
# than whitespace; a click on an empty box now gets the hint below instead of
# running a search on blank text.
if button and text_input.strip():
    query = text_input
    answer = get_similar_links(query, db, embedding_vector)
    # dict.fromkeys drops duplicate links while keeping first-seen order, so
    # links are shown in the order the search returned them. The previous
    # set() de-duplicated too, but displayed the links in arbitrary order.
    for link in dict.fromkeys(answer):
        st.write(link)
else:
    st.info("Please Input Valid Text")