# Sara-ALharbi / app.py
# sara04ia's picture
# Update app.py
# 82534a5 verified
import gradio as gr
import pandas as pd
import re
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
# Load the embedding model once, at import time, so every call to search()
# reuses the same SentenceTransformer instance.
# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run locally — confirm this is acceptable for the deployment.
embedder = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
# Matches every character that is not an ASCII letter, digit or whitespace.
# Raw string: the original non-raw '\s' is an invalid escape sequence.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9\s]")

# Columns folded into the searchable text, in concatenation order.
_TEXT_COLUMNS = (
    "hotel_name",
    "hotel_description",
    "country",
    "locality",
    "price_range",
    "street_address",
    "review_title",
    "review_text",
    "rating_value",
)

# Lazily-built (DataFrame, embedding matrix) pair shared by all queries.
_corpus_cache = None


def _build_corpus():
    """Load the hotel dataset and embed one cleaned text blob per row."""
    dataset = load_dataset("traversaal-ai-hackathon/hotel_datasets")
    df = pd.DataFrame(dataset["train"])

    # "<column>: <value>" per column, joined with "; " — the same text the
    # per-column string concatenation produced before cleaning.
    labelled = [
        f"{col}: " + df[col].astype(str).str.strip() for col in _TEXT_COLUMNS
    ]
    combined = labelled[0].str.cat(labelled[1:], sep="; ")
    df["combined"] = combined.str.replace(_NON_ALNUM_RE, "", regex=True)

    # One batched call instead of one encode() call per row.
    # NOTE(review): the nomic-embed model card describes task prefixes
    # ("search_document: " / "search_query: "); none are applied here, as in
    # the original code — confirm whether they should be.
    embeddings = embedder.encode(df["combined"].tolist())
    return df, embeddings


def _get_corpus():
    """Return the cached corpus, building it on first use."""
    global _corpus_cache
    if _corpus_cache is None:
        # Without this cache every query re-downloaded/re-parsed the dataset
        # and re-embedded every row.
        _corpus_cache = _build_corpus()
    return _corpus_cache


def search(query, n=5, max_reviews=3):
    """Return the hotels whose reviews best match a free-text query.

    The ``n`` most similar dataset rows (by cosine similarity between the
    query embedding and each row's combined-text embedding) are selected
    and then grouped by hotel, so the result holds at most ``n`` hotels
    and may hold fewer when several top rows belong to the same hotel.

    Args:
        query: Free-text search string.
        n: Number of top-scoring rows to consider.
        max_reviews: Maximum number of rows kept per hotel.

    Returns:
        A list of dicts ordered by descending best score, each with keys
        ``name``, ``score`` (similarity of the hotel's best row),
        ``rating`` (max ``rating_value`` over the kept rows) and
        ``relevant_reviews`` (``review_text`` of the kept rows).
        An empty list is returned for a missing or blank query.
    """
    # A blank query has nothing to match; skip the model entirely.
    if query is None or not str(query).strip():
        return []

    df, embeddings = _get_corpus()
    query_embedding = embedder.encode(query)

    # One vectorised similarity computation for the whole corpus.
    scores = cosine_similarity(embeddings, [query_embedding]).ravel()
    best = pd.Series(scores, index=df.index).nlargest(n)

    # .loc returns a new frame, so the cached DataFrame is never mutated.
    results = df.loc[best.index].assign(similarity=best)

    hits = []
    # sort=False keeps hotels in order of their best-scoring row.
    for name, group in results.groupby("hotel_name", sort=False):
        kept = group.head(max_reviews)
        hits.append(
            {
                "name": name,
                "score": group["similarity"].iloc[0],
                "rating": kept["rating_value"].max(),
                "relevant_reviews": kept["review_text"].tolist(),
            }
        )
    return hits
# Create the Gradio interface: one text box in, the JSON list from search() out.
demo = gr.Interface(fn=search, inputs="text", outputs="json")

# Start the server only when run as a script, so importing this module
# (tests, tooling) does not launch it as a side effect. `demo` stays at
# module level for anything that looks it up by name.
if __name__ == "__main__":
    demo.launch()