import json from pathlib import Path import gradio as gr import numpy as np from app.models.text_encoder import TextEncoder SUMMARY_DIR = Path("data/summaries") EMBEDDING_DIR = Path("data/embeddings") NAME_MAP_FILE = Path("data/name_map.json") def search_memes(prompt, top_k=10): """ Search for memes based on the input prompt. Args: prompt: The text prompt to search for top_k: Number of results to return Returns: List of dictionaries containing search results """ # Initialize results list results = [] # Get the embedding file paths embedding_paths = list(EMBEDDING_DIR.glob("*.npy")) # Load the embeddings embeddings = [np.load(path) for path in embedding_paths] # Load the text encoder text_encoder = TextEncoder() # Generate embeddings for the prompt prompt_embedding = text_encoder.encode(prompt) # Calculate similarities similarities = np.dot(embeddings, prompt_embedding) / ( np.linalg.norm(embeddings, axis=1) * np.linalg.norm(prompt_embedding) ) # Get the top k indices top_k_indices = np.argsort(similarities)[-top_k:] # Load the summaries summaries = [] for path in SUMMARY_DIR.glob("*.txt"): with open(path, "r", encoding="utf-8") as f: summaries.append(f.read()) # Load the name map with open(NAME_MAP_FILE, "r") as f: name_map = json.load(f) # Process the top k results for i, index in enumerate(top_k_indices[::-1]): try: result = { "rank": i + 1, "similarity": round(float(similarities[index]), 3), "filename": embedding_paths[index].stem, "original_filename": name_map.get( embedding_paths[index].stem, "Unknown" ), "summary": summaries[index] if index < len(summaries) else "No summary available", } results.append(result) except (IndexError, KeyError) as e: results.append( {"rank": i + 1, "error": f"Error processing result {i + 1}: {str(e)}"} ) return results def format_results(results): """Format the results for display in the Gradio interface""" html_output = "" for result in results: if "error" in result: html_output += "
Rank {result['rank']}: {result['error']}
" html_output += "Rank {result['rank']} (Similarity: {result['similarity']})
" html_output += f"File: {result['original_filename']}
" html_output += f"Summary: {result['summary']}
" html_output += "