import os
import re
import warnings
import streamlit as st
import pandas as pd
from joblib import dump, load
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from datasets import Dataset
from huggingface_hub import hf_hub_download

warnings.filterwarnings('ignore')

# Load the data files from the Hugging Face dataset repo
def load_files_from_huggingface():
    repo_id = "GMARTINEZMILLA/deepsinisghtz_dataset"
    # The CSV and joblib files are stored as raw files in the dataset repo,
    # so they are fetched with hf_hub_download rather than indexed out of a
    # load_dataset() split (a split exposes tabular columns, not repo files)
    cestas_file = hf_hub_download(repo_id=repo_id, filename="cestas_final.csv", repo_type="dataset")
    cestas = pd.read_csv(cestas_file)
    # Load the fitted count matrix and vectorizer
    count_matrix_file = hf_hub_download(repo_id=repo_id, filename="count_matrix_0001.joblib", repo_type="dataset")
    count_vectorizer_file = hf_hub_download(repo_id=repo_id, filename="count_vectorizer_0001.joblib", repo_type="dataset")
    tf_matrix = load(count_matrix_file)
    count_vectorizer = load(count_vectorizer_file)
    return cestas, tf_matrix, count_vectorizer
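
# A cached wrapper around the loader is a common Streamlit pattern, so the
# files are only downloaded once per process instead of on every rerun.
# This helper is a minimal sketch and is not called by the original app:
@st.cache_resource
def cached_load_files():
    """Cached variant of load_files_from_huggingface (illustrative helper)."""
    return load_files_from_huggingface()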

# Save updated files back to Hugging Face Datasets
def save_files_to_huggingface(cestas, tf_matrix, count_vectorizer):
    # Save the updated CSV locally
    cestas.to_csv('cestas_final.csv', index=False)
    # Push the updated baskets to the Hub as a dataset
    dataset = Dataset.from_pandas(cestas)
    dataset.push_to_hub("GMARTINEZMILLA/deepsinisghtz_dataset")
    # Save new versions of the joblib files locally (version suffix incremented)
    dump(tf_matrix, 'count_matrix_0002.joblib')
    dump(count_vectorizer, 'count_vectorizer_0002.joblib')
    # push_to_hub only uploads the tabular data; the joblib files must be added
    # to the dataset repo manually in the Hugging Face interface, or uploaded
    # programmatically as sketched below
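
# A minimal sketch of that programmatic upload, assuming the Space exposes a
# write token as the HF_TOKEN environment variable (the token name and this
# helper are illustrative assumptions, not part of the original app):
def upload_joblib_to_hub(local_path):
    """Upload a local joblib file to the dataset repo (illustrative helper)."""
    from huggingface_hub import HfApi
    api = HfApi(token=os.environ.get("HF_TOKEN"))  # HF_TOKEN is an assumed secret name
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=local_path,
        repo_id="GMARTINEZMILLA/deepsinisghtz_dataset",
        repo_type="dataset",
    )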

def get_next_version(file_prefix):
    """Return the filename for the next version of a joblib file (e.g. 0002, 0003, ...)."""
    # Instead of hardcoding the suffix, scan the working directory for existing
    # versions of this file and increment the highest one found (version 0001
    # is the baseline on the Hub, so the first local version is 0002)
    pattern = re.compile(rf"{re.escape(file_prefix)}_(\d{{4}})\.joblib$")
    versions = [int(m.group(1)) for f in os.listdir('.') if (m := pattern.match(f))]
    return f"{file_prefix}_{max(versions, default=1) + 1:04d}.joblib"

def recomienda_tf(new_basket, cestas, productos):
    # Load the initial saved versions of the matrix and vectorizer
    tf_matrix_file = 'count_matrix_0001.joblib'
    count_vectorizer_file = 'count_vectorizer_0001.joblib'
    tf_matrix = load(tf_matrix_file)
    count_vectorizer = load(count_vectorizer_file)
    # Convert the new basket into TF (term frequency) format
    new_basket_str = ' '.join(new_basket)
    new_basket_vector = count_vectorizer.transform([new_basket_str])
    new_basket_tf = normalize(new_basket_vector, norm='l1')  # L1-normalize the counts for the current basket
    # Compare the new basket with the historical ones
    similarities = cosine_similarity(new_basket_tf, tf_matrix)
    # Get the indices of the four most similar baskets
    similar_indices = similarities.argsort()[0][-4:]
    # Accumulate similarity-weighted scores per recommended product
    recommendations_count = {}
    total_similarity = 0
    for idx in similar_indices:
        sim_score = similarities[0][idx]
        total_similarity += sim_score  # running sum of similarities
        products = cestas.iloc[idx]['Cestas'].split()
        unique_products = set(products)  # deduplicate products within a basket
        for product in unique_products:
            product = product.strip()
            # Skip items that are already in the new basket
            if product not in new_basket:
                recommendations_count[product] = recommendations_count.get(product, 0) + sim_score
    # Convert the accumulated scores into relative probabilities
    recommendations_with_prob = []
    if total_similarity > 0:
        recommendations_with_prob = [(product, score / total_similarity)
                                     for product, score in recommendations_count.items()]
    else:
        print("Not enough similarities were found to compute probabilities.")
    # Sort recommendations by relevance score, highest first
    recommendations_with_prob.sort(key=lambda x: x[1], reverse=True)
    # Build a DataFrame with product code, description, and relevance
    recommendations_data = []
    for product, score in recommendations_with_prob:
        # Look up the product description in the catalogue DataFrame
        description = productos.loc[productos['ARTICULO'] == product, 'DESCRIPCION']
        if not description.empty:
            recommendations_data.append({
                'ARTICULO': product,
                'DESCRIPCION': description.values[0],
                'RELEVANCIA': score
            })
    recommendations_df = pd.DataFrame(recommendations_data)
    # Return the five most relevant recommendations
    return recommendations_df.head(5)
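
# Illustrative call, assuming a `productos` catalogue DataFrame with ARTICULO
# and DESCRIPCION columns; the file name and product codes are placeholders,
# not values from the original app:
# cestas, tf_matrix, count_vectorizer = load_files_from_huggingface()
# productos = pd.read_csv('productos.csv')
# recomendaciones = recomienda_tf(['A123', 'B456'], cestas, productos)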

def retroalimentacion(cestas, cesta_nueva):
    # Convert the basket from a list of products to a single string
    cesta_unida = ' '.join(cesta_nueva)
    # Debugging message
    st.write(f"DEBUG: The new basket is {cesta_unida}")
    # Only proceed if the new basket is not already in the history
    if cestas['Cestas'].isin([cesta_unida]).any():
        st.warning("⚠️ The basket already exists in the DataFrame.")
        return None
    cestas.loc[len(cestas)] = cesta_unida
    st.success("✓ Basket added to the DataFrame.")
    # Re-vectorize the updated basket history
    count_vectorizer = CountVectorizer()
    count_vectorizer.fit(cestas['Cestas'])
    count_matrix = count_vectorizer.transform(cestas['Cestas'])
    tf_matrix = normalize(count_matrix, norm='l1')
    # Save new local versions of the vectorizer and matrix
    count_vectorizer_file = get_next_version('count_vectorizer')
    tf_matrix_file = get_next_version('count_matrix')
    dump(count_vectorizer, count_vectorizer_file)
    dump(tf_matrix, tf_matrix_file)
    # Debugging messages
    st.write(f"DEBUG: New version of count_vectorizer generated: {count_vectorizer_file}")
    st.write(f"DEBUG: New version of tf_matrix generated: {tf_matrix_file}")
    # Push the updated DataFrame and fitted objects back to Hugging Face Datasets;
    # this must happen after re-vectorizing, so the freshly fitted objects exist
    save_files_to_huggingface(cestas, tf_matrix, count_vectorizer)
    st.write("DEBUG: Files saved to Hugging Face Datasets.")
    return None
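
# Illustrative feedback flow tying the functions above together; the basket
# contents are placeholders, not real data:
# cestas, tf_matrix, count_vectorizer = load_files_from_huggingface()
# retroalimentacion(cestas, ['A123', 'B456', 'C789'])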