import streamlit as st
import numpy as np
import pickle
from typing import Dict, List, Any, Optional
import random
from sentence_transformers import SentenceTransformer
from qdrant_client import models, QdrantClient
import emoji as em
import warnings

warnings.filterwarnings('ignore')


# A function to load the emoji dictionary
@st.cache_data(show_spinner=False)
def load_dictionary(file_path: str) -> Dict[str, Dict[str, Any]]:
    """Load the emoji dictionary from a pickle file."""
    with open(file_path, 'rb') as file:
        emoji_dict = pickle.load(file)
    return emoji_dict


# A function to load the sentence encoder model
@st.cache_resource(show_spinner=False)
def load_encoder(model_name: str) -> SentenceTransformer:
    """Load a sentence encoder model from Hugging Face Hub."""
    sentence_encoder = SentenceTransformer(model_name)
    # st.session_state.sentence_encoder = sentence_encoder
    return sentence_encoder
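
# A hedged usage sketch (an assumption, not the original app flow, which comes
# later in this file): the cached loaders above and the helpers defined below
# are intended to be wired together roughly like this. The pickle file name
# and model name are taken from the commented-out load_resources() variant
# further down; the query string is purely illustrative.
#
#   emoji_dict = load_dictionary('emoji_embeddings_dict.pkl')
#   encoder = load_encoder('paraphrase-multilingual-MiniLM-L12-v2')
#   vector_db = load_qdrant_client(emoji_dict)
#   emojis = retrieve_relevant_emojis(encoder, vector_db, 'birthday party')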
""" # Setup the Qdrant client and populate the database vector_DB_client = QdrantClient(":memory:") embedding_dict = { emoji: np.array(metadata['embedding']) for emoji, metadata in emoji_dict.items() } # Remove the embeddings from the dictionary so it can be used # as payload in Qdrant for emoji in list(emoji_dict): del emoji_dict[emoji]['embedding'] embedding_dim = next(iter(embedding_dict.values())).shape[0] # Create collection in Qdrant vector_DB_client.create_collection( collection_name="EMOJIS", vectors_config=models.VectorParams( size=embedding_dim, distance=models.Distance.COSINE ), ) # Upload points to the collection vector_DB_client.upload_points( collection_name="EMOJIS", points=[ models.PointStruct( id=idx, vector=embedding_dict[emoji].tolist(), payload=emoji_dict[emoji] ) for idx, emoji in enumerate(emoji_dict) ], ) #st.session_state.vector_DB_client = vector_DB_client return vector_DB_client # for the offline version this code was faster, but resulted in a resource # limits error from online streamlit app # it seems that each user has its own session, thus caching does not help # much here, and the resources are loaded for each user # def load_resources(): # if ('vector_DB_client' not in st.session_state # or 'sentence_encoder' not in st.session_state): # # Load emoji dictionary # with open('emoji_embeddings_dict.pkl', 'rb') as file: # emoji_dict = pickle.load(file) # # Load sentence encoder # embedding_model = 'paraphrase-multilingual-MiniLM-L12-v2' # sentence_encoder = SentenceTransformer(embedding_model) # st.session_state.sentence_encoder = sentence_encoder # # Setup the Qdrant client and populate the database # vector_DB_client = QdrantClient(":memory:") # embedding_dict = { # emoji: np.array(data['embedding']) # for emoji, data in emoji_dict.items() # } # for emoji in list(emoji_dict): # del emoji_dict[emoji]['embedding'] # embedding_dim = next(iter(embedding_dict.values())).shape[0] # # Create collection in Qdrant # vector_DB_client.create_collection( # collection_name="EMOJIS", # vectors_config=models.VectorParams( # size=embedding_dim, # distance=models.Distance.COSINE # ), # ) # # Upload points to the collection # vector_DB_client.upload_points( # collection_name="EMOJIS", # points=[ # models.PointStruct( # id=idx, # vector=embedding_dict[emoji].tolist(), # payload=emoji_dict[emoji] # ) # for idx, emoji in enumerate(emoji_dict) # ], # ) # st.session_state.vector_DB_client = vector_DB_client def retrieve_relevant_emojis( embedding_model: SentenceTransformer, vector_DB_client: QdrantClient, query: str) -> List[str]: """ Return similar emojis to the query using the sentence encoder and Qdrant. """ # Embed the query query_vector = embedding_model.encode(query).tolist() hits = vector_DB_client.search( collection_name="EMOJIS", query_vector=query_vector, limit=50, ) search_emojis = [] # only add to list if it is not already an item in the list for hit in hits: if hit.payload['Emoji'] not in search_emojis: search_emojis.append(hit.payload['Emoji']) return search_emojis def render_results( embedding_model: SentenceTransformer, vector_DB_client: QdrantClient, query: str, emojis_to_render: List[str] = None,) -> None: """ Render the search results in the Streamlit app. """ # Retrieve relevant emojis if emojis_to_render is None: emojis_to_render = retrieve_relevant_emojis( embedding_model, vector_DB_client, query ) #with st.empty(): # Display results as HTML #placeholder = st.empty() if emojis_to_render: st.markdown( '