GMARTINEZMILLA committed on
Commit
cc75240
·
verified ·
1 Parent(s): 426fa6d

Update utils.py

Browse files

Volvemos a versión que funciona visualmente, aunque no se pueda editar dentro del propio repositorio.

Files changed (1) hide show
  1. utils.py +43 -51
utils.py CHANGED
@@ -1,5 +1,5 @@
 
1
  import os
2
- import streamlit as st
3
  import pandas as pd
4
  import numpy as np
5
  import warnings
@@ -9,56 +9,57 @@ from sklearn.metrics.pairwise import cosine_similarity
9
  from joblib import dump, load
10
  from sklearn.preprocessing import normalize
11
  import re
12
- from datasets import load_dataset, Dataset
13
 
14
- # Load the dataset from Hugging Face Datasets
15
- def load_files_from_huggingface():
16
- dataset = load_dataset("GMARTINEZMILLA/deepsinisghtz_dataset", split="train")
 
 
 
 
 
17
 
18
- # Load CSV file
19
- cestas_file = dataset['cestas_final.csv']
20
- cestas = pd.read_csv(cestas_file)
21
 
22
- # Load joblib files
23
- count_matrix_file = dataset['count_matrix_0001.joblib']
24
- count_vectorizer_file = dataset['count_vectorizer_0001.joblib']
25
- tf_matrix = load(count_matrix_file)
26
- count_vectorizer = load(count_vectorizer_file)
27
 
28
- return cestas, tf_matrix, count_vectorizer
29
-
30
- # Save updated files back to Hugging Face Datasets
31
- def save_files_to_huggingface(cestas, tf_matrix, count_vectorizer):
32
- # Save updated CSV file
33
- cestas.to_csv('cestas_final.csv', index=False)
34
-
35
- # Create new dataset and push to Hugging Face
36
- dataset = Dataset.from_pandas(cestas)
37
- dataset.push_to_hub("GMARTINEZMILLA/deepsinisghtz_dataset")
38
 
39
- # Save updated joblib files
40
- dump(tf_matrix, 'count_matrix_0002.joblib') # Increment version
41
- dump(count_vectorizer, 'count_vectorizer_0002.joblib') # Increment version
 
 
 
 
 
42
 
43
- # Optionally, push joblib files back to Hugging Face Datasets (if supported)
44
- # You can manually add these files to the dataset in the Hugging Face interface if needed
 
 
 
 
 
 
45
 
46
- def get_next_version(file_prefix):
47
- """Return the next version number for joblib files."""
48
- # You can hardcode or generate a new version name (e.g., 0002, 0003, etc.)
49
- return f"{file_prefix}_0002.joblib"
50
 
51
  def recomienda_tf(new_basket, cestas, productos):
52
- # Load the latest versions of the matrix and vectorizer
53
- tf_matrix_file = 'count_matrix_0001.joblib'
54
- count_vectorizer_file = 'count_vectorizer_0001.joblib'
 
55
 
 
56
  tf_matrix = load(tf_matrix_file)
57
- count_vectorizer = load(count_vectorizer_file)
58
 
59
  # Convert the new basket into TF (Term Frequency) format
60
  new_basket_str = ' '.join(new_basket)
61
- new_basket_vector = count_vectorizer.transform([new_basket_str])
62
  new_basket_tf = normalize(new_basket_vector, norm='l1') # Normalize the count matrix for the current basket
63
 
64
  # Compare the new basket with previous ones
@@ -113,20 +114,15 @@ def retroalimentacion(cestas, cesta_nueva):
113
  # Convert basket from list to string
114
  cesta_unida = ' '.join(cesta_nueva)
115
 
116
- # Debugging message
117
- st.write(f"DEBUG: La nueva cesta es {cesta_unida}")
118
-
119
  # Add the new basket to the historical baskets if it doesn't already exist
120
  if not cestas['Cestas'].isin([cesta_unida]).any():
121
  cestas.loc[len(cestas)] = cesta_unida
122
- st.success("Cesta añadida al DataFrame.")
123
 
124
- # Save the updated DataFrame and joblib files back to Hugging Face Datasets
125
- save_files_to_huggingface(cestas, tf_matrix, count_vectorizer)
126
-
127
- st.write("DEBUG: Los archivos se han guardado en Hugging Face Datasets.")
128
  else:
129
- st.warning("⚠️ La cesta ya existe en el DataFrame.")
130
 
131
  # Re-vectorize the basket DataFrame
132
  count_vectorizer = CountVectorizer()
@@ -136,13 +132,9 @@ def retroalimentacion(cestas, cesta_nueva):
136
 
137
  # Save new versions of the vectorizer and matrix
138
  count_vectorizer_file = get_next_version('count_vectorizer')
139
- tf_matrix_file = get_next_version('count_matrix')
140
 
141
  dump(count_vectorizer, count_vectorizer_file)
142
  dump(tf_matrix, tf_matrix_file)
143
-
144
- # Debugging messages
145
- st.write(f"DEBUG: Se ha generado la nueva versión del count_vectorizer: {count_vectorizer_file}")
146
- st.write(f"DEBUG: Se ha generado la nueva versión del tf_matrix: {tf_matrix_file}")
147
 
148
  return None
 
1
+ parte de utils.py
2
  import os
 
3
  import pandas as pd
4
  import numpy as np
5
  import warnings
 
9
  from joblib import dump, load
10
  from sklearn.preprocessing import normalize
11
  import re
 
12
 
13
def get_next_version(file_prefix, folder='RecommendationFiles/'):
    """Return the filename for the next version of a versioned joblib file.

    Scans *folder* for files named '<file_prefix>_NNNN.joblib', takes the
    highest version number found, and returns the path for NNNN+1
    (version 0001 when no versioned file exists yet).

    Args:
        file_prefix: Base name of the file family (e.g. 'count_vectorizer').
        folder: Directory holding the versioned files; created if missing.

    Returns:
        os.path.join(folder, '<file_prefix>_NNNN.joblib') for the next
        version, zero-padded to 4 digits.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)  # Ensure the folder exists

    # fullmatch anchors the whole filename (plain match() also accepted
    # e.g. 'foo_0001.joblib.bak'); re.escape guards against regex
    # metacharacters in the prefix. The walrus binds the match once per
    # file instead of matching twice.
    pattern = re.compile(rf"{re.escape(file_prefix)}_(\d+)\.joblib")
    versions = [
        int(m.group(1))
        for f in os.listdir(folder)
        if (m := pattern.fullmatch(f))
    ]

    # Determine the next version number (start at 1 when none exist)
    next_version = max(versions) + 1 if versions else 1

    # Return the next version filename with the folder path
    return os.path.join(folder, f"{file_prefix}_{next_version:04d}.joblib")
 
 
 
 
 
 
 
 
30
 
31
def get_latest_version(file_prefix, folder='RecommendationFiles/'):
    """Return the path of the most recent version of a versioned joblib file.

    Args:
        file_prefix: Base name of the file family (e.g. 'count_matrix').
        folder: Directory holding the versioned files.

    Returns:
        os.path.join(folder, '<file_prefix>_NNNN.joblib') with the highest
        NNNN found in *folder*.

    Raises:
        FileNotFoundError: If *folder* does not exist, or no versioned file
            for *file_prefix* is present in it.
    """
    if not os.path.exists(folder):
        raise FileNotFoundError(f"Folder '{folder}' does not exist")

    # fullmatch anchors the whole filename; re.escape guards against regex
    # metacharacters in the prefix. The walrus binds each match once.
    pattern = re.compile(rf"{re.escape(file_prefix)}_(\d+)\.joblib")
    versions = [
        int(m.group(1))
        for f in os.listdir(folder)
        if (m := pattern.fullmatch(f))
    ]

    if not versions:
        raise FileNotFoundError(f"No versions found for {file_prefix} in folder '{folder}'")

    latest_version = max(versions)
    return os.path.join(folder, f"{file_prefix}_{latest_version:04d}.joblib")
48
 
 
 
 
 
49
 
50
  def recomienda_tf(new_basket, cestas, productos):
51
+
52
+ # Get the latest versions of the matrix and vectorizer from the folder
53
+ tf_matrix_file = get_latest_version('count_matrix')
54
+ count_vectorizer_file = get_latest_version('count_vectorizer')
55
 
56
+ # Load the matrix TF and the vectorizer
57
  tf_matrix = load(tf_matrix_file)
58
+ count = load(count_vectorizer_file)
59
 
60
  # Convert the new basket into TF (Term Frequency) format
61
  new_basket_str = ' '.join(new_basket)
62
+ new_basket_vector = count.transform([new_basket_str])
63
  new_basket_tf = normalize(new_basket_vector, norm='l1') # Normalize the count matrix for the current basket
64
 
65
  # Compare the new basket with previous ones
 
114
  # Convert basket from list to string
115
  cesta_unida = ' '.join(cesta_nueva)
116
 
 
 
 
117
  # Add the new basket to the historical baskets if it doesn't already exist
118
  if not cestas['Cestas'].isin([cesta_unida]).any():
119
  cestas.loc[len(cestas)] = cesta_unida
120
+ print("Cesta añadida.")
121
 
122
+ # Re-save the updated baskets DataFrame
123
+ cestas.to_csv('RecommendationFiles/cestas_final.csv', index=False)
 
 
124
  else:
125
+ print("La cesta ya existe en el DataFrame.")
126
 
127
  # Re-vectorize the basket DataFrame
128
  count_vectorizer = CountVectorizer()
 
132
 
133
  # Save new versions of the vectorizer and matrix
134
  count_vectorizer_file = get_next_version('count_vectorizer')
135
+ tf_matrix_file = get_next_version('tf_matrix')
136
 
137
  dump(count_vectorizer, count_vectorizer_file)
138
  dump(tf_matrix, tf_matrix_file)
 
 
 
 
139
 
140
  return None