RMakushkin committed on
Commit
ef6dece
·
1 Parent(s): c060c61

Upload 2 files

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. app.py +34 -0
  3. dataset.csv +3 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  embs.txt filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  embs.txt filter=lfs diff=lfs merge=lfs -text
37
+ dataset.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertTokenizer, BertModel
2
+ import torch
3
+ from sklearn.metrics.pairwise import cosine_similarity
4
+ import pandas as pd
5
+ import numpy as np
6
+ import time
7
+
8
# Module-level setup: runs once at import time.
# The encoder and tokenizer are read from the paths 'model' and 'tokenizer'
# (presumably local directories shipped next to this file -- confirm).
loaded_model = BertModel.from_pretrained('model')
loaded_tokenizer = BertTokenizer.from_pretrained('tokenizer')

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11
+
12
+
13
def filter_by_ganre(df: pd.DataFrame, ganre_list: list):
    """Keep the rows of ``df`` whose ``'ganres'`` cell shares an item with ``ganre_list``.

    Each cell of the ``'ganres'`` column is iterated, and the row is kept when
    at least one of its items is contained in ``ganre_list``.

    Args:
        df: Frame with a ``'ganres'`` column. Cells are presumably lists of
            genre names -- confirm against the loader. A plain string cell is
            iterated character by character.
        ganre_list: Values to look for.

    Returns:
        The matching rows of ``df``, with the original index and all columns
        preserved. An empty ``df`` comes back empty with its columns intact.
    """
    def _has_wanted_ganre(row_ganres) -> bool:
        # Missing cells (e.g. NaN floats) are not iterable; treat them as
        # "no match" instead of letting the whole filter raise TypeError.
        try:
            candidates = iter(row_ganres)
        except TypeError:
            return False
        return any(g in ganre_list for g in candidates)

    # Build an explicitly boolean mask. Series.apply on an empty column yields
    # an object-dtype Series, which is not recognised as a boolean indexer.
    mask = pd.Series(
        [_has_wanted_ganre(cell) for cell in df['ganres']],
        index=df.index,
        dtype=bool,
    )
    return df[mask]
16
+
17
+
18
# NOTE(review): module-level timestamp taken at import time; nothing visible
# in this file reads it -- looks like leftover timing code, confirm before removing.
+ end_time = time.time()
19
+
20
+
21
def recommendation(df: pd.DataFrame, embeddings:np.array, user_text: str, n=10):
    """Return the ``n`` rows of ``embeddings`` most similar to ``user_text``.

    The text is tokenized and encoded with the module-level BERT model; the
    last hidden state is averaged over the token axis to obtain one vector,
    which is compared against every row of ``embeddings`` by cosine
    similarity.

    Args:
        df: Accepted for interface compatibility; not used by this function.
        embeddings: 2-D array with one embedding per row (presumably aligned
            with the rows of ``df`` -- confirm against the caller).
        user_text: Query text to embed.
        n: Number of top results to keep.

    Returns:
        dict mapping the positional row index in ``embeddings`` to its cosine
        similarity with the query, ordered from most to least similar.
    """
    tokens = loaded_tokenizer(user_text, return_tensors="pt", padding=True, truncation=True)
    loaded_model.to(device)
    loaded_model.eval()
    with torch.no_grad():
        # Inputs must live on the same device as the model weights.
        tokens = {key: value.to(loaded_model.device) for key, value in tokens.items()}
        outputs = loaded_model(**tokens)
        # Fix: the original read the undefined name `output` here, which
        # raised NameError on every call.
        user_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().detach().numpy()

    # cosine_similarity expects 2-D inputs, hence the (1, dim) reshape.
    cosine_similarities = cosine_similarity(embeddings, user_embedding.reshape(1, -1))
    df_res = pd.DataFrame(cosine_similarities.ravel(), columns=['cos_sim']).sort_values('cos_sim', ascending=False)
    dict_topn = df_res.iloc[:n, :].cos_sim.to_dict()
    return dict_topn
dataset.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c10dbf7a899fbf0553bf6cab5fd11abf35cf224e4e6e4f7843fdd19144c550
3
+ size 19266108