File size: 4,615 Bytes
a0c9ed0
 
1d1ff21
 
 
 
 
 
 
7b908e9
3a69eda
4a05925
3a69eda
03fa2bd
 
 
 
 
 
3a69eda
1d1ff21
8b75361
a0c9ed0
4e275b4
228303a
1d1ff21
 
228303a
 
 
 
 
 
 
1d1ff21
 
228303a
 
 
 
 
1d1ff21
228303a
 
 
 
 
 
 
1d1ff21
228303a
 
 
 
 
 
 
 
 
a0c9ed0
1d1ff21
efd1d8f
1d1ff21
 
 
 
 
 
228303a
 
1d1ff21
 
fb31e25
1d1ff21
fb31e25
 
 
229c21c
228303a
1d1ff21
 
 
3a69eda
 
 
 
 
 
 
 
 
228303a
3a69eda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d1ff21
0f7292c
3a69eda
 
 
0f7292c
 
3a69eda
 
 
0f7292c
a0c9ed0
1d1ff21
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import gradio as gr
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
import pandas as pd
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
from openai.embeddings_utils import cosine_similarity
import spacy
from spacy.cli import download
import os
import openai

# Load the spaCy English pipeline for named-entity recognition (used in
# get_hotel_info to pull a city name out of the query); auto-download on
# first run if the model is not installed.
try:
    nlp = spacy.load('en_core_web_sm')
except IOError:
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

# API key is read from the environment; if unset this is None and the
# ChatCompletion call in generate_answer will fail at request time.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Tokenizer data required by nltk.word_tokenize (used in bm25_rank).
nltk.download('punkt')
nltk.download('punkt_tab')

# Pre-built hotel corpus, one row per review. Columns used below:
# 'combined', 'embeddings', 'hotel_name', 'hotel_image', 'rate',
# 'hotel_description', 'review_text', 'locality' -- TODO confirm schema.
df = pd.read_pickle("hotels_data.pkl")
# Dense embedding model for the re-ranking stage in search().
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

def bm25_rank(query, df, n=15):
    """Return the top-*n* rows of *df* ranked by BM25 relevance to *query*.

    Args:
        query: Free-text search query.
        df: DataFrame with a 'combined' text column to score against.
        n: Number of top-scoring rows to keep (default 15).

    Returns:
        A DataFrame of the n best-matching rows, with an added
        'bm25_scores' column.
    """
    # Work on a copy: the original wrote 'bm25_scores' straight into the
    # caller's DataFrame (often a filtered slice of the module-level df),
    # mutating shared state and raising SettingWithCopyWarning.
    df = df.copy()
    # NOTE(review): the corpus index is rebuilt on every query; acceptable
    # for small corpora, consider caching per-DataFrame if it grows.
    tokenized_corpus = [word_tokenize(doc.lower()) for doc in df['combined']]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = word_tokenize(query.lower())
    df['bm25_scores'] = bm25.get_scores(tokenized_query)
    return df.nlargest(n, 'bm25_scores')

def search(query, df):
    """Hybrid search: BM25 lexical pre-filter, then dense cosine re-ranking.

    Args:
        query: Free-text user query.
        df: Hotel DataFrame (one row per review) with 'combined',
            'embeddings', 'hotel_name', 'hotel_image', 'rate',
            'hotel_description' and 'review_text' columns.

    Returns:
        A list of dicts, one per distinct hotel among the top rows, each
        with the hotel's name, image URL, score, description and up to
        three relevant reviews.
    """
    n = 5  # number of rows to keep after dense re-ranking
    query_embedding = model.encode(query)
    # Cheap lexical pre-filter before the dense similarity pass.
    df = bm25_rank(query, df)
    # NOTE(review): reshape(768, -1) hard-codes a 768-dim embedding;
    # nomic-embed-text-v1.5 emits 768 dims, but this breaks if the model
    # is swapped -- confirm / derive from query_embedding.shape instead.
    df["similarity"] = df.embeddings.apply(
        lambda x: cosine_similarity(x, query_embedding.reshape(768, -1))
    )
    results = df.sort_values("similarity", ascending=False).head(n)

    resultlist = []
    seen_hotels = []
    for r in results.index:
        hotel = results.hotel_name[r]
        if hotel in seen_hotels:
            continue
        # All rows (reviews) for this hotel among the top results.
        smalldf = results.loc[results.hotel_name == hotel]
        # BUG FIX: the original tested shape[1] (column count, a constant),
        # so the intended cap of 3 reviews never applied; shape[0] is rows.
        if smalldf.shape[0] > 3:
            smalldf = smalldf[:3]

        resultlist.append({
            "hotel_name": hotel,
            # Scalar fields are read via `results`: row r is guaranteed
            # present there, whereas it may have been truncated out of
            # smalldf above (original risked a KeyError).
            "image_url": results.hotel_image[r],
            "score": results.rate[r],
            "description": results.hotel_description[r],
            "relevant_reviews": [smalldf.review_text[s] for s in smalldf.index],
        })
        seen_hotels.append(hotel)
    return resultlist

def get_hotel_info(query):
    """Answer a hotel query: filter by detected city, then run hybrid search.

    Args:
        query: Free-text user query, possibly naming a city.

    Returns:
        A list of hotel-info dicts as produced by search(); may be empty.
    """
    doc = nlp(query)
    if doc.ents:
        # Treat the first named entity as the city. BUG FIX: title() keeps
        # multi-word cities correct ("new york" -> "New York"), where the
        # original capitalize() produced "New york" and never matched.
        city = str(doc.ents[0]).lower().title()
        df_filtered = df[df['locality'] == city]
        # Fall back to the whole corpus when the detected city is unknown.
        if df_filtered.shape[0] == 0:
            df_filtered = df
    else:
        df_filtered = df
    # search() already returns dicts with exactly the keys callers consume
    # (image_url, hotel_name, score, description, relevant_reviews), so the
    # original field-by-field repackaging loop was a no-op and is removed.
    return search(query, df_filtered)

def generate_answer(query, context):
    """Produce a travel-agent style reply grounded in the retrieved hotels.

    Args:
        query: The user's original question.
        context: Concatenated hotel details used as grounding material.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    prompt = f"""
    Based on the following query from a user, please generate a detailed answer based on the context
    focusing on which is the top hotel based on the query. You should respond as if you are a travel agent and are conversing with the
    user in a nice cordial way. Remove any special characters and (\\n), make the output clean and concise.
    ###########
    query:
    "{query}"
    ########
    context:
    "{context}"
    #####
    Return in Markdown format with each hotel highlighted.
    """

    completion = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        max_tokens=1500,
        n=1,
        stop=None,
        temperature=0.2,  # low temperature keeps the answer grounded in context
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    # Legacy (pre-1.0) OpenAI client: message content is a mapping field.
    return completion.choices[0].message['content'].strip()

def chatbot_response(message, history):
    """Gradio click handler: answer *message* and record the exchange.

    Args:
        message: The user's chat input.
        history: Running list of (user, bot) message tuples; mutated in place.

    Returns:
        The updated history twice (once for the Chatbot display, once for
        the state output, matching the click() wiring below).
    """
    hotel_infos = get_hotel_info(message)
    if not hotel_infos:
        response = "No results found."
    else:
        # Flatten each hotel into one context line for the LLM prompt.
        context_lines = []
        for info in hotel_infos:
            context_lines.append(
                f"Hotel Name: {info['hotel_name']}, Score: {info['score']}, Description: {info['description']}, Reviews: {info['relevant_reviews']}"
            )
        response = generate_answer(message, "\n".join(context_lines))

    history.append((message, response))
    return history, history

# Assemble the Gradio UI: a chat transcript, a query text box, and a send
# button wired to chatbot_response.
with gr.Blocks() as interface:
    chatbot = gr.Chatbot(label="Hotel Search Chatbot")
    query_input = gr.Textbox(label="Ask me about hotels!")
    submit_button = gr.Button("Send")

    # `chatbot` serves both as an input (current history) and as both
    # outputs (display and state), matching chatbot_response's
    # (message, history) -> (history, history) signature.
    submit_button.click(
        fn=chatbot_response,
        inputs=[query_input, chatbot],
        outputs=[chatbot, chatbot]
    )

# Start the web app (blocking call).
interface.launch()