# Spaces: Sleeping
import functools
import re

import gradio as gr
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
# Sentence-embedding model, constructed once at import time and used by
# `search` for both the query and the hotel texts.
embedder = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5",
    trust_remote_code=True,
)
# Number of top-scoring review rows kept for each query.
_TOP_ROWS = 5
# Upper bound on the number of reviews reported per hotel.
_MAX_REVIEWS_PER_HOTEL = 3
# Dataset columns concatenated (in this order) into the text that is embedded.
_TEXT_COLUMNS = (
    "hotel_name",
    "hotel_description",
    "country",
    "locality",
    "price_range",
    "street_address",
    "review_title",
    "review_text",
    "rating_value",
)
# Everything except ASCII letters, digits and whitespace is stripped from the
# combined text before it is embedded.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9\s]")


@functools.lru_cache(maxsize=1)
def _load_corpus():
    """Load the hotel dataset and embed every row, once per process.

    The dataset download, the text preparation and the corpus embedding do
    not depend on the query, so the result is cached and shared by all calls
    to `search`.

    Returns:
        A ``(df, embeddings)`` pair: the ``train`` split as a DataFrame and
        the array returned by ``embedder.encode`` for the cleaned, combined
        text of each row (same row order as ``df``). Callers must treat both
        as read-only because they are shared between queries.
    """
    dataset = load_dataset("traversaal-ai-hackathon/hotel_datasets")
    df = pd.DataFrame(dataset["train"])

    # Build "col: value; col: value; ..." for every row.
    labelled = [
        column + ": " + df[column].astype(str).str.strip()
        for column in _TEXT_COLUMNS
    ]
    combined = labelled[0].str.cat(labelled[1:], sep="; ")
    combined = combined.map(lambda text: _NON_ALNUM_RE.sub("", text))

    # One batched call instead of one model invocation per row.
    # NOTE(review): the nomic-embed model card describes task prefixes
    # ("search_document: " / "search_query: "); none are applied here, which
    # matches the previous behaviour -- confirm whether they should be added.
    embeddings = embedder.encode(combined.tolist())
    return df, embeddings


def _to_native(value):
    """Return *value* as a built-in Python scalar when it is a NumPy scalar."""
    return value.item() if hasattr(value, "item") else value


def search(query):
    """Return the hotels whose reviews best match *query*.

    The query is embedded with the module-level ``embedder`` and compared by
    cosine similarity against the embedding of every dataset row. The
    ``_TOP_ROWS`` best rows are kept and then grouped by hotel name, so the
    result can contain fewer than ``_TOP_ROWS`` hotels when several of the
    best rows belong to the same hotel.

    Args:
        query: Free-text search string.

    Returns:
        A list of dicts ordered by decreasing similarity, one per distinct
        hotel, with the keys ``name``, ``score`` (similarity of the hotel's
        best row), ``rating`` (maximum ``rating_value`` among the hotel's
        selected rows) and ``relevant_reviews`` (at most
        ``_MAX_REVIEWS_PER_HOTEL`` review texts). An empty list is returned
        for an empty or whitespace-only query.
    """
    if not query or not query.strip():
        return []

    df, embeddings = _load_corpus()
    query_embedding = embedder.encode(query)

    # Single vectorised comparison of the query against the whole corpus.
    scores = cosine_similarity(embeddings, [query_embedding]).ravel()
    top_scores = pd.Series(scores, index=df.index).nlargest(_TOP_ROWS)
    # `.assign` returns a new frame, so the cached DataFrame is never mutated.
    results = df.loc[top_scores.index].assign(similarity=top_scores)

    hits = []
    seen = set()
    for name in results["hotel_name"]:
        if name in seen:  # report each hotel once, at its best-ranked row
            continue
        seen.add(name)
        rows = results.loc[results["hotel_name"] == name].head(
            _MAX_REVIEWS_PER_HOTEL
        )
        hits.append(
            {
                "name": name,
                # Native Python numbers keep the payload JSON-serialisable.
                "score": float(rows["similarity"].iloc[0]),
                "rating": _to_native(rows["rating_value"].max()),
                "relevant_reviews": rows["review_text"].tolist(),
            }
        )
    return hits
# Wire `search` into a Gradio interface (text box in, JSON out) and start it.
demo = gr.Interface(
    fn=search,
    inputs="text",
    outputs="json",
)
demo.launch()