import os
import re
import warnings

import pandas as pd
import streamlit as st
from datasets import Dataset
from huggingface_hub import hf_hub_download
from joblib import dump, load
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

warnings.filterwarnings('ignore')

# Load the dataset files from the Hugging Face Hub.
# Note: datasets.load_dataset cannot return arbitrary repo files (such as the
# joblib artifacts), so hf_hub_download is used to fetch each file directly.
def load_files_from_huggingface():
    """Download the baskets CSV plus the fitted vectorizer and count matrix."""
    repo_id = "GMARTINEZMILLA/deepsinisghtz_dataset"

    # Load the CSV file with the historical baskets
    cestas_file = hf_hub_download(repo_id=repo_id, filename="cestas_final.csv", repo_type="dataset")
    cestas = pd.read_csv(cestas_file)

    # Load the joblib files (fitted CountVectorizer and normalized count matrix)
    count_matrix_file = hf_hub_download(repo_id=repo_id, filename="count_matrix_0001.joblib", repo_type="dataset")
    count_vectorizer_file = hf_hub_download(repo_id=repo_id, filename="count_vectorizer_0001.joblib", repo_type="dataset")
    tf_matrix = load(count_matrix_file)
    count_vectorizer = load(count_vectorizer_file)

    return cestas, tf_matrix, count_vectorizer

# Save updated files back to the Hugging Face dataset repo
def save_files_to_huggingface(cestas, tf_matrix, count_vectorizer):
    """Persist the updated baskets CSV and new versions of the joblib artifacts."""
    # Save the updated CSV file locally
    cestas.to_csv('cestas_final.csv', index=False)

    # Rebuild the dataset from the DataFrame and push it to the Hub
    dataset = Dataset.from_pandas(cestas)
    dataset.push_to_hub("GMARTINEZMILLA/deepsinisghtz_dataset")

    # Save new, versioned copies of the joblib files locally
    count_vectorizer_file = get_next_version('count_vectorizer')
    tf_matrix_file = get_next_version('count_matrix')
    dump(count_vectorizer, count_vectorizer_file)
    dump(tf_matrix, tf_matrix_file)

    # Debugging messages
    st.write(f"DEBUG: New count_vectorizer version generated: {count_vectorizer_file}")
    st.write(f"DEBUG: New tf_matrix version generated: {tf_matrix_file}")
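    # A hedged sketch of pushing the joblib artifacts to the same dataset repo,
    # assuming write access through a configured Hugging Face token:
    # from huggingface_hub import HfApi
    # api = HfApi()
    # for fname in (count_vectorizer_file, tf_matrix_file):
    #     api.upload_file(path_or_fileobj=fname, path_in_repo=fname,
    #                     repo_id="GMARTINEZMILLA/deepsinisghtz_dataset",
    #                     repo_type="dataset")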

def get_next_version(file_prefix):
    """Return the next available versioned filename (e.g. count_matrix_0002.joblib).

    Scans the working directory for existing versions; assumes four-digit suffixes.
    """
    pattern = re.compile(rf"{re.escape(file_prefix)}_(\d{{4}})\.joblib")
    versions = [int(m.group(1)) for f in os.listdir('.') if (m := pattern.fullmatch(f))]
    return f"{file_prefix}_{max(versions, default=0) + 1:04d}.joblib"

def recomienda_tf(new_basket, cestas, productos):
    """Recommend products for a new basket based on the most similar past baskets."""
    # Load the current vectorizer and matrix from the dataset repo
    # (the caller already supplies the baskets DataFrame, so the CSV is discarded)
    _, tf_matrix, count_vectorizer = load_files_from_huggingface()

    # Convert the new basket into TF (term frequency) format
    new_basket_str = ' '.join(new_basket)
    new_basket_vector = count_vectorizer.transform([new_basket_str])
    new_basket_tf = normalize(new_basket_vector, norm='l1')  # L1-normalize counts into term frequencies

    # Compare the new basket with previous ones
    similarities = cosine_similarity(new_basket_tf, tf_matrix)
    
    # Get the indices of the most similar baskets
    similar_indices = similarities.argsort()[0][-4:]  # Top 4 most similar baskets
    
    # Create a dictionary to count recommendations
    recommendations_count = {}
    total_similarity = 0
    
    # Recommend products from similar baskets
    for idx in similar_indices:
        sim_score = similarities[0][idx]
        total_similarity += sim_score  # Sum of similarities
        products = cestas.iloc[idx]['Cestas'].split()
        
        unique_products = set(products)  # Use a set to get unique products
        
        for product in unique_products:
            if product.strip() not in new_basket:  # Avoid recommending items already in the basket
                recommendations_count[product.strip()] = recommendations_count.get(product.strip(), 0) + sim_score
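
    # Example of the weighting: if product 'X' appears in two of the similar baskets
    # with similarities 0.8 and 0.6, its accumulated score is 1.4; dividing by the
    # sum of all retained similarities turns these scores into relative probabilities.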
    
    # Calculate the relative probability of each recommended product
    recommendations_with_prob = []
    if total_similarity > 0:
        recommendations_with_prob = [(product, score / total_similarity) for product, score in recommendations_count.items()]
    else:
        st.warning("Not enough similar baskets were found to compute probabilities.")
     
    # Sort recommendations by relevance score
    recommendations_with_prob.sort(key=lambda x: x[1], reverse=True)
    
    # Create a new DataFrame to store recommendations
    recommendations_data = []
    
    for product, score in recommendations_with_prob:
        # Search for the product description in the products DataFrame
        description = productos.loc[productos['ARTICULO'] == product, 'DESCRIPCION']
        if not description.empty:
            recommendations_data.append({
                'ARTICULO': product,
                'DESCRIPCION': description.values[0],
                'RELEVANCIA': score
            })
    recommendations_df = pd.DataFrame(recommendations_data)
    
    return recommendations_df.head(5)

def retroalimentacion(cestas, cesta_nueva):
    """Add a new basket to the history and re-fit the vectorizer on the updated corpus."""
    # Convert the basket from a list of products to a single string
    cesta_unida = ' '.join(cesta_nueva)

    # Debugging message
    st.write(f"DEBUG: The new basket is {cesta_unida}")

    # Add the new basket to the historical baskets only if it doesn't already exist
    if cestas['Cestas'].isin([cesta_unida]).any():
        st.warning("⚠️ The basket already exists in the DataFrame.")
        return None

    cestas.loc[len(cestas)] = cesta_unida
    st.success("✓ Basket added to the DataFrame.")

    # Re-vectorize the updated basket corpus so the new basket becomes searchable
    count_vectorizer = CountVectorizer()
    count_matrix = count_vectorizer.fit_transform(cestas['Cestas'])
    tf_matrix = normalize(count_matrix, norm='l1')

    # Save the updated DataFrame and the new joblib versions back to Hugging Face
    # (this also dumps the versioned artifacts, so they are not re-dumped here)
    save_files_to_huggingface(cestas, tf_matrix, count_vectorizer)
    st.write("DEBUG: Files have been saved to Hugging Face Datasets.")

    return None
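
# --- Usage sketch (illustrative, not part of the original app) ---
# A minimal example of how these pieces fit together, assuming a `productos`
# DataFrame with 'ARTICULO' and 'DESCRIPCION' columns and hypothetical product
# codes; in the real app these inputs come from the Streamlit UI, and the
# st.* calls only render when run via `streamlit run`.
if __name__ == "__main__":
    cestas, tf_matrix, count_vectorizer = load_files_from_huggingface()
    productos = pd.DataFrame({'ARTICULO': ['A123', 'B456'],
                              'DESCRIPCION': ['Product A', 'Product B']})

    nueva_cesta = ['A123', 'B456']  # hypothetical basket
    print(recomienda_tf(nueva_cesta, cestas, productos))

    # Feed the basket back into the history and re-fit the model
    retroalimentacion(cestas, nueva_cesta)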