File size: 2,811 Bytes
721a830
82534a5
721a830
cf7f465
b836067
91baa9b
 
b836067
91baa9b
b836067
91baa9b
721a830
b836067
 
91baa9b
b836067
91baa9b
 
b836067
91baa9b
 
b836067
91baa9b
 
 
 
 
 
 
 
 
 
b836067
91baa9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf7f465
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76



import functools
import re

import gradio as gr
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

# Shared sentence-embedding model, instantiated once when the module is imported.
# trust_remote_code=True is passed through to the model loader.
embedder = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5",
    trust_remote_code=True,
)

def search(query):
    # Load the dataset
    dataset = load_dataset("traversaal-ai-hackathon/hotel_datasets")
    
    # Convert the dataset to a DataFrame
    df = pd.DataFrame(dataset['train'])

    # Combine relevant columns into a single column for text search
    df["combined"] = (
        "hotel_name: " + df.hotel_name.astype(str).str.strip() + 
        "; hotel_description: " + df.hotel_description.astype(str).str.strip() +
        "; country: " + df.country.astype(str).str.strip() +
        "; locality: " + df.locality.astype(str).str.strip() +
        "; price_range: " + df.price_range.astype(str).str.strip() +
        "; street_address: " + df.street_address.astype(str).str.strip() +
        "; review_title: " + df.review_title.astype(str).str.strip() +
        "; review_text: " + df.review_text.astype(str).str.strip() +
        "; rating_value: " + df.rating_value.astype(str).str.strip()
    )

    # Clean the combined text data
    df['combined'] = df['combined'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', '', str(x)))

    # Define the number of results to return
    n = 5

    # Embed the query using the embedding model
    query_embedding = embedder.encode(query)

    # Generate embeddings for the combined text data
    df['embedding'] = df['combined'].apply(lambda x: embedder.encode(x))

    # Calculate similarity between the query and each row's combined text
    df["similarity"] = df['embedding'].apply(lambda x: cosine_similarity([x], [query_embedding]).flatten()[0])

    # Sort the results by similarity and select the top 'n' results
    results = df.sort_values("similarity", ascending=False).head(n)

    # Prepare the results to be returned
    resultlist = []
    hlist = []
    for r in results.index:
        if results.hotel_name[r] not in hlist:  # Ensure unique hotel names in the results
            smalldf = results.loc[results.hotel_name == results.hotel_name[r]]
            if smalldf.shape[0] > 3:
                smalldf = smalldf[:3]

            resultlist.append(
                {
                    "name": results.hotel_name[r],
                    "score": smalldf.similarity.iloc[0],
                    "rating": smalldf.rating_value.max(),
                    "relevant_reviews": smalldf.review_text.tolist()
                }
            )
            hlist.append(results.hotel_name[r])

    return resultlist

# Wire `search` into a Gradio UI (one text input, JSON output) and start it.
demo = gr.Interface(
    fn=search,
    inputs="text",
    outputs="json",
)
demo.launch()