import gradio as gr
import pickle
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the pre-trained embedding model (SentenceTransformer).
# nomic-embed-text-v1 ships custom modeling code, so trust_remote_code is required.
model = SentenceTransformer('nomic-ai/nomic-embed-text-v1', trust_remote_code=True)

# Load the BART summarization model and its tokenizer from Hugging Face Transformers
model_bart = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Load normalized embeddings from the pkl file
with open('normalized_embeddings.pkl', 'rb') as f:
    normalized_embeddings = pickle.load(f)

# Load the hotel dataset (processed)
df_copy_first_1000 = pd.read_csv('hotel_dataset_processed.csv')
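
# NOTE (assumption): 'normalized_embeddings.pkl' is expected to hold L2-normalized
# embeddings of the same rows as the CSV, produced offline with the same encoder.
# A minimal sketch of that preprocessing step (the column choice is an assumption):
#
#   texts = (df_copy_first_1000['hotel_description'].fillna('') + ' ' +
#            df_copy_first_1000['review_text'].fillna('')).tolist()
#   embeddings = model.encode(texts)
#   normalized_embeddings = normalize(embeddings, norm='l2')
#   with open('normalized_embeddings.pkl', 'wb') as f:
#       pickle.dump(normalized_embeddings, f)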

# Function to summarize text using BART
def summarize_text(text):
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model_bart.generate(inputs, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Function to search hotels
def search_hotels(query_text, k=1):
    try:
        # Encode the query text using SentenceTransformer
        query_embedding = model.encode(query_text, convert_to_tensor=True)
        query_embedding = query_embedding.cpu().numpy().reshape(1, -1)
        query_embedding = normalize(query_embedding, norm='l2')

        # Compute cosine similarity between query embedding and stored embeddings
        similarities = cosine_similarity(query_embedding, normalized_embeddings)

        # Get indices of the top k similar hotels
        top_indices = similarities[0].argsort()[-k:][::-1]

        # Retrieve the top k similar hotels
        top_hotels = df_copy_first_1000.iloc[top_indices]

        # Prepare results
        results = []
        for _, row in top_hotels.iterrows():
            # Create a summary for the hotel details
            summary_text = (
                f"Description: {row['hotel_description']}\n"
                f"Review Title: {row['review_title']}\n"
                f"Review Text: {row['review_text']}\n"
                f"Review Count: {row['review_count']}"
            )
            summary = summarize_text(summary_text)

            result = (
                f"Hotel Name: {row['hotel_name']}\n"
                f"Locality: {row['locality']}\n"
                f"Price Range: {row['price_range']}\n"
                f"Rate: {row['rate']}\n"
                f"\nSummary:\n{summary}\n"
            )
            results.append(result)

        return "\n\n".join(results)

    except Exception as e:
        return f"An error occurred during the search: {e}"

# Gradio Interface
iface = gr.Interface(
    fn=search_hotels,
    inputs=gr.Textbox(label="Enter your search query"),
    outputs="text",
    title="Hotel Search Engine",
    description="Enter a query to search for hotels and get details about the top results."
)
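
# Optional tweak (assumption, not part of the original app): k could be exposed in the UI
# with a second input so users control how many hotels are returned, e.g.
#   inputs=[gr.Textbox(label="Enter your search query"),
#           gr.Slider(1, 10, value=3, step=1, label="Number of results")],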

# Launch Gradio Interface
iface.launch()