# -*- coding: utf-8 -*-
"""[Nicole_Lovold_Egar]Assignment 2.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1j5EomK04JF-H3FM_TdzSDk_US55nJd36
"""
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
from google.colab import files
files.upload()  # upload kaggle.json (Kaggle API token)
!mkdir ~/.kaggle/
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets list
!kaggle datasets download -d hamzafarooq50/hotel-listings-and-reviews -f HotelListInLondon__en2019100120191005.csv
# The Kaggle CLI may deliver the file zipped; unzip it if so
!unzip -o HotelListInLondon__en2019100120191005.csv.zip || true
!ls
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
import os
nlp = spacy.load("en_core_web_sm")
from spacy import displacy
# Quick sanity check: parse a sample text and render its named entities
text = """this is a base text"""
doc = nlp(text)
sentence_spans = list(doc.sents)
displacy.render(doc, jupyter=True, style="ent")
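# A minimal sketch (not in the original) inspecting the parse directly:
# the same sentence spans and entities that displacy renders above.
for sent in sentence_spans:
    print("sentence:", sent.text)
for ent in doc.ents:
    print("entity:", ent.text, ent.label_)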
# Rebinds the name `stopwords` (imported above from nltk.corpus) to spaCy's list
stopwords = list(STOP_WORDS)
punctuation = punctuation + '\n'
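# A hedged sketch of how these lists are typically used (the notebook does not
# use them again below): filter stop words and punctuation out of tokens.
sample_tokens = [tok.text.lower() for tok in nlp("The hotel has a lovely spa, near the station.")]
content_tokens = [t for t in sample_tokens if t not in stopwords and t not in punctuation]
print(content_tokens)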
import pandas as pd
import scipy.spatial
import pickle as pkl
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('all-MiniLM-L6-v2')
#embedder = SentenceTransformer('bert-base-nli-mean-tokens')
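# A quick sketch (assumed example sentence) of what the embedder produces:
# one fixed-size dense vector per sentence, 384-dimensional for all-MiniLM-L6-v2.
print(embedder.encode("A quiet hotel near the river").shape)  # (384,)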
# The CSV was downloaded and unzipped above
df = pd.read_csv("/content/HotelListInLondon__en2019100120191005.csv", sep=",", encoding='cp1252')
df.head()
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
df['hotel_name'].value_counts()
df['hotel_name'].drop_duplicates()
# Collapse the multiple review rows per hotel into one combined feature string
df_combined = df.sort_values(['hotel_name']).groupby('hotel_name', sort=False).hotel_features.apply(''.join).reset_index(name='hotel_features')
df.iloc[6].hotel_features
df_combined.head().T
# Strip every character that is not a letter, digit, or whitespace
df_combined['hotel_features'] = df_combined['hotel_features'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
def lower_case(input_str):
    input_str = input_str.lower()
    return input_str

df_combined['hotel_features'] = df_combined['hotel_features'].apply(lambda x: lower_case(x))
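# The same cleaning applied to an assumed sample string, for illustration:
sample = "Free Wi-Fi & Spa, 5 min from St Pancras!"
print(lower_case(re.sub(r'[^a-zA-Z0-9\s]', '', sample)))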
df = df_combined
df_sentences = df_combined.set_index("hotel_features")
df_sentences = df_sentences["hotel_name"].to_dict()
df_sentences_list = list(df_sentences.keys())
len(df_sentences_list)
list(df_sentences.keys())[:5]
df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
# Encode each hotel's combined review text into one embedding per hotel
corpus = df_sentences_list
corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
corpus_embeddings[0].shape
queries = ['Hotel near St Pancras Station that have a spa']
query_embeddings = embedder.encode(queries,show_progress_bar=True)
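# A hand-rolled check (not in the original) of what the cosine-similarity
# ranking below computes, using numpy on the raw embedding arrays:
import numpy as np
q = query_embeddings[0]
sims = corpus_embeddings @ q / (np.linalg.norm(corpus_embeddings, axis=1) * np.linalg.norm(q))
print("closest hotel description:", corpus[int(np.argmax(sims))][:80])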
import torch
# Query sentences:
queries = ['Hotel in london close to buckingham palace',
'Hotel in london less than 200 per night']
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    # Use cosine similarity and torch.topk to find the highest-scoring hotels
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")
    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))
        row_dict = df.loc[df['hotel_features'] == corpus[idx]]
        print("hotel_name: ", row_dict['hotel_name'], "\n")
# A multilingual paraphrase model, for comparison with MiniLM above
model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1')
embeddings = model.encode(corpus)
#print(embeddings)
# Query sentences:
queries = ['Hotel near Kings Cross and good restaurants']
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)
    # Use cosine similarity and torch.topk to find the highest-scoring hotels
    cos_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")
    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))
        row_dict = df.loc[df['hotel_features'] == corpus[idx]]
        print("hotel_name: ", row_dict['hotel_name'], "\n")
# util.semantic_search performs the same ranking in a single call
hits = util.semantic_search(query_embedding, embeddings, top_k=5)
hits = hits[0]  # Get the hits for the first query
for hit in hits:
    print(hit)
    print("(Score: {:.4f})".format(hit['score']))
    print(corpus[hit['corpus_id']])
    row_dict = df.loc[df['hotel_features'] == corpus[hit['corpus_id']]]
    print("hotel_name: ", row_dict['hotel_name'], "\n")
query_embedding.shape
df
"""End"""