"""Gradio demo: semantic search over a hotel-review dataset.

Each review row is flattened into one text string, embedded with a
sentence-transformers model, and ranked against the query embedding by
cosine similarity. The best-matching rows are grouped by hotel and
returned as JSON.
"""
import re
from functools import lru_cache

import gradio as gr
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

# Load your embedding model
# NOTE(review): the model card for this checkpoint describes task prefixes
# (e.g. "search_query: " / "search_document: "). None are applied here, which
# matches the original behavior -- confirm whether they should be added.
embedder = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

DATASET_NAME = "traversaal-ai-hackathon/hotel_datasets"

# Number of best-matching review rows considered per query.
TOP_K = 5

# Maximum number of reviews reported for a single hotel.
MAX_REVIEWS_PER_HOTEL = 3

# Columns folded into the searchable text, in the order they are concatenated.
_TEXT_COLUMNS = (
    "hotel_name",
    "hotel_description",
    "country",
    "locality",
    "price_range",
    "street_address",
    "review_title",
    "review_text",
    "rating_value",
)

# Raw string: "\s" inside a plain string literal is an invalid escape sequence.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9\s]")


def _build_combined_text(df):
    """Return a Series holding the cleaned "column: value; ..." text per row."""
    labelled = [
        f"{column}: " + df[column].astype(str).str.strip()
        for column in _TEXT_COLUMNS
    ]
    combined = labelled[0].str.cat(labelled[1:], sep="; ")
    # Keep only ASCII letters, digits and whitespace (this also strips the
    # ":" / ";" / "_" characters introduced by the labels above).
    return combined.str.replace(_NON_ALNUM_RE, "", regex=True)


@lru_cache(maxsize=1)
def _load_corpus():
    """Load the dataset and embed every row once; later calls reuse the result.

    Returns:
        A ``(df, embeddings)`` tuple where ``df`` is the train split as a
        DataFrame and ``embeddings`` holds one embedding per row of ``df``,
        in the same order. Callers must treat both as read-only because the
        objects are shared between calls.
    """
    dataset = load_dataset(DATASET_NAME)
    df = pd.DataFrame(dataset["train"])
    texts = _build_combined_text(df).tolist()
    # A single batched encode call instead of one model call per row.
    embeddings = embedder.encode(texts)
    return df, embeddings


def _to_native(value):
    """Convert a NumPy scalar to the equivalent Python object, if it is one."""
    return value.item() if hasattr(value, "item") else value


def search(query):
    """Return the hotels whose reviews best match ``query``.

    The ``TOP_K`` most similar review rows are selected and grouped by hotel
    name, so the result contains at most ``TOP_K`` hotels and fewer when
    several of the top rows belong to the same hotel.

    Args:
        query: Free-text search string.

    Returns:
        A list of dicts ordered by descending best similarity, each with
        ``name``, ``score`` (best similarity among the hotel's selected
        rows), ``rating`` (maximum ``rating_value`` among them) and
        ``relevant_reviews`` (up to ``MAX_REVIEWS_PER_HOTEL`` review texts).
        An empty or whitespace-only query yields an empty list.
    """
    if not query or not query.strip():
        return []

    df, corpus_embeddings = _load_corpus()

    # Embed the query and score every row in one vectorised call.
    query_embedding = embedder.encode(query)
    scores = cosine_similarity(corpus_embeddings, [query_embedding]).ravel()

    # Rank via a standalone Series so the cached DataFrame is never mutated.
    best = (
        pd.Series(scores, index=df.index)
        .sort_values(ascending=False)
        .head(TOP_K)
    )
    results = df.loc[best.index].assign(similarity=best)

    hotels = []
    # sort=False keeps hotels in order of their best-scoring row. Rows inside
    # each group are already ordered by descending similarity. Rows with a
    # missing hotel name are skipped by groupby.
    for hotel_name, rows in results.groupby("hotel_name", sort=False):
        top_rows = rows.head(MAX_REVIEWS_PER_HOTEL)
        hotels.append(
            {
                "name": hotel_name,
                "score": float(top_rows["similarity"].iloc[0]),
                "rating": _to_native(top_rows["rating_value"].max()),
                "relevant_reviews": top_rows["review_text"].tolist(),
            }
        )
    return hotels


# Create the Gradio interface
demo = gr.Interface(fn=search, inputs="text", outputs="json")

# Only start the web server when executed as a script, so importing this
# module (e.g. to reuse ``search``) has no side effect beyond loading the model.
if __name__ == "__main__":
    demo.launch()