import os
import pprint
import tempfile
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs  # scann 1.2.7 + tensorflow-recommenders 0.7.0 + TF 2.8.0
import unidecode
from nltk import word_tokenize
import re
import pandas as pd
from nltk.util import ngrams
import base64
import hashlib
import gradio as gr
import scann

df = pd.read_csv("/home/user/app/Dubai_translated_best_2500.csv", sep=",", header=0)

# Sanity check: length of every raw "requisito" (requirements) field.
for i in range(len(df["requisito"])):
    print(len(df["requisito"].iloc[i]))

df = df.drop_duplicates()
df = df.dropna()
df["nome_vaga"] = df["nome_vaga"].map(lambda x: x.lower().title())
df["requisito"] = df["requisito"].map(lambda x: x[0:1000].lower())

# The first 90% of the rows feed the retrieval model; the last 10% are kept
# as a "blind" (cego) set the model never sees during training.
my_dict = dict(df.iloc[0:int(df.shape[0] * 0.9), :])
my_dict_cego = dict(df.iloc[int(df.shape[0] * 0.9):, :])

ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(lambda x: {
    "code": x["code"],
    "nome_vaga": x["nome_vaga"],
    "requisito": tf.strings.split(x["requisito"], maxsplit=106),
})

l = []
for x in ratings.as_numpy_iterator():
    pprint.pprint(len(x["requisito"]))
    l.append(len(x["requisito"]))
print(min(l))  # shortest tokenized "requisito"

movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(lambda x: {
    "code": x["code"],
    "nome_vaga": x["nome_vaga"],
})
for x in movies.take(1).as_numpy_iterator():
    pprint.pprint(x)
movies = movies.map(lambda x: x["code"])

for x in ratings.take(5).as_numpy_iterator():
    pprint.pprint(x)
for x in movies.take(5).as_numpy_iterator():
    pprint.pprint(x)

ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(lambda x: {
    "code": x["code"],
    "requisito": tf.strings.split(x["requisito"], maxsplit=106),
})

tf.random.set_seed(42)
shuffled = ratings.shuffle(int(df.shape[0] * 0.9), seed=42, reshuffle_each_iteration=False)
shuffled2 = ratings_cego.shuffle(int(df.shape[0] * 0.1), seed=42, reshuffle_each_iteration=False)

# Note: "test" is taken from the same shuffled pool as "train" (the first 10%
# of the same shuffle), so it overlaps the training data; the truly unseen
# rows live in "cego".
train = shuffled.take(int(df.shape[0] * 0.9))
test = shuffled.take(int(df.shape[0] * 0.1))
cego = shuffled2

for x in train.take(1).as_numpy_iterator():
    pprint.pprint(x)
for x in test.take(5).as_numpy_iterator():
    pprint.pprint(x)

movie_titles = movies  # candidate vacancy codes
user_ids = ratings.map(lambda x: x["requisito"])

xx = []
for x in user_ids.as_numpy_iterator():
    xx.append(x)

unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
unique_user_ids = np.unique(np.concatenate(xx))
user_ids = user_ids.batch(int(df.shape[0] * 0.9))

layer = tf.keras.layers.StringLookup(vocabulary=unique_user_ids)
for x in ratings.take(1).as_numpy_iterator():
    pprint.pprint(x["requisito"])
for x in ratings.take(5).as_numpy_iterator():
    pprint.pprint(np.array(layer(x["requisito"])))

print(unique_movie_titles[:10])

embedding_dimension = 768

# Query tower: token vocabulary lookup followed by an embedding table.
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
    # We add an additional embedding to account for unknown tokens.
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
])
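# Illustrative sketch (not part of the original pipeline): how a free-text
# "requisito" string becomes a single query vector. Each whitespace token is
# embedded by the query tower and the token embeddings are summed into one
# bag-of-words vector, which is what the retrieval loss and the predict()
# function below compare against the vacancy-code embeddings. The sample
# string is a made-up example, not a row from the CSV.
_sample_requisito = tf.constant("experience with python and sql")
_sample_tokens = tf.strings.split(_sample_requisito)                    # (num_tokens,)
_sample_token_embeddings = user_model(_sample_tokens)                   # (num_tokens, 768)
_sample_query_vector = tf.reduce_sum(_sample_token_embeddings, axis=0)  # (768,)
print("sample query vector shape:", _sample_query_vector.shape)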
for x in train.take(5).as_numpy_iterator():
    pprint.pprint(np.array(user_model(x["requisito"])).shape)

# Candidate tower: embeds each vacancy code.
movie_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),
    tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension),
])
for x in train.take(5).as_numpy_iterator():
    pprint.pprint(np.array(movie_model(x["code"])).shape)

metrics = tfrs.metrics.FactorizedTopK(
    candidates=movies.batch(df.shape[0]).map(movie_model)
)

task = tfrs.tasks.Retrieval(
    metrics=metrics
)


class MovielensModel(tfrs.Model):

    def __init__(self, user_model, movie_model):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["requisito"])
        # And pick out the movie features and pass them into the movie model,
        # getting embeddings back.
        positive_movie_embeddings = self.movie_model(features["code"])
        # The task computes the loss and the metrics; the per-token embeddings
        # are summed into a single query vector before scoring.
        return self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)


class NoBaseClassMovielensModel(tf.keras.Model):

    def __init__(self, user_model, movie_model):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        # Set up a gradient tape to record gradients.
        with tf.GradientTape() as tape:
            # Loss computation: sum the token embeddings into one query vector,
            # as in MovielensModel.compute_loss.
            user_embeddings = tf.reduce_sum(self.user_model(features["requisito"]), axis=1)
            positive_movie_embeddings = self.movie_model(features["code"])
            loss = self.task(user_embeddings, positive_movie_embeddings)

            # Handle regularization losses as well.
            regularization_loss = sum(self.losses)
            total_loss = loss + regularization_loss

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics

    def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        # Loss computation (same query reduction as in train_step).
        user_embeddings = tf.reduce_sum(self.user_model(features["requisito"]), axis=1)
        positive_movie_embeddings = self.movie_model(features["code"])
        loss = self.task(user_embeddings, positive_movie_embeddings)

        # Handle regularization losses as well.
        regularization_loss = sum(self.losses)
        total_loss = loss + regularization_loss

        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics
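# Illustrative sketch (not part of the training pipeline): the Retrieval task
# scores every query embedding against every candidate embedding in the batch
# and treats the matching (diagonal) pairs as positives, so even two small
# random matrices yield a scalar loss. The shapes below are arbitrary
# assumptions for demonstration only.
_demo_queries = tf.random.normal([4, embedding_dimension])
_demo_candidates = tf.random.normal([4, embedding_dimension])
_demo_loss = task(_demo_queries, _demo_candidates, compute_metrics=False)
print("demo in-batch retrieval loss:", float(_demo_loss))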
model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))

cached_train = train.shuffle(int(df.shape[0] * 0.9)).batch(int(df.shape[0] * 0.9)).cache()
cached_test = test.batch(int(df.shape[0] * 0.1)).cache()

path = os.path.join("/", "model/")

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=path,
    verbose=1,
    save_weights_only=True,
    save_freq=2)

model.fit(cached_train, callbacks=[cp_callback], epochs=200)

# Embed every vacancy code with the trained candidate tower to build the ScaNN index.
index = df["code"].map(lambda x: [model.movie_model(tf.constant(x))])

from sklearn.metrics.pairwise import cosine_similarity

indice = []
for i in range(0, 1633):  # hard-coded number of rows to index
    indice.append(np.array(index)[i][0])

searcher = scann.scann_ops_pybind.builder(np.array(indice), 10, "dot_product").tree(
    num_leaves=1500, num_leaves_to_search=500, training_sample_size=df.shape[0]).score_brute_force(
    quantize=True).build()


def predict(text):
    # Lower-case the free-text competences, embed each whitespace token with the
    # query tower and sum the token embeddings into a single query vector.
    campos = str(text).lower()
    query = np.sum([model.user_model(tf.constant(token)) for token in campos.split()], axis=0)
    neighbors, distances = searcher.search_batched([query])
    # Map the nearest candidate rows back to their vacancy titles.
    xx = df.iloc[neighbors[0], :].nome_vaga
    return xx


demo = gr.Interface(
    fn=predict,
    inputs=gr.inputs.Textbox(label='CANDIDATE COMPETENCES - Click *Clear* before adding new input'),
    outputs=gr.outputs.Textbox(label='SUGGESTED VACANCIES'),
    css='div {margin-left: auto; margin-right: auto; width: 100%; '
        'background-image: url("https://drive.google.com/uc?export=view&id=1ZAvzQXQ7_xnMWfmy-UiR5zlCrnfLstoX"); repeat 0 0;}'
).launch(auth=("dubai777", "Pa$$w0rd123"), share=False)
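# A quick way to sanity-check the retriever without the web UI is to call
# predict() directly before launch() (which blocks); the input below is a
# hypothetical example, not data from the CSV:
#   print(predict("python sql machine learning nlp"))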