File size: 1,931 Bytes
dd81387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from typing import List, Dict
from utils.config import Config
import os


# The dataset location is whatever Config.read('app', 'dataset') returns.
dataset_path = Config.read('app', 'dataset')

# Fail fast with an explicit message when nothing exists at that path.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The dataset file at {dataset_path} was not found.")

# NOTE(review): unpickling can execute code embedded in the file, so the
# configured path should only ever point at trusted data.
data = pd.read_pickle(dataset_path)

# Columns that tfidf_model() reads when it builds its result dictionaries.
required_columns = ['asin', 'title', 'brand', 'medium_image_url']
for col in required_columns:
    if col in data.columns:
        continue
    raise ValueError(f"Missing required column: {col} in the dataset")

# Fit the TF-IDF vocabulary on the 'title' column once, at import time, so
# that each query only has to be transformed into the same feature space.
tfidf_title_vectorizer = TfidfVectorizer(min_df=0.0)
tfidf_title_features = tfidf_title_vectorizer.fit_transform(data['title'])

# Function to calculate the tf-idf model and return closest matches
def tfidf_model(input_text: str, num_results: int) -> List[Dict]:
    """Return the dataset rows whose titles are closest to *input_text*.

    The query is projected into the TF-IDF space fitted on the dataset's
    ``title`` column, and rows are ranked by their distance to the query
    (Euclidean, the ``pairwise_distances`` default), closest first.

    Args:
        input_text: Free text to match against the dataset titles.
        num_results: Maximum number of matches to return. Values less than
            or equal to zero yield an empty list.

    Returns:
        A list of at most ``num_results`` dictionaries, each with the keys
        ``'asin'``, ``'brand'``, ``'title'`` and ``'url'`` (the latter taken
        from the ``medium_image_url`` column), ordered by increasing distance.
    """
    # A negative count would make the slice below drop rows from the end
    # instead of limiting the result, so treat it the same as "no results".
    if num_results <= 0:
        return []

    # Transform the input text to the same TF-IDF feature space.
    query_vec = tfidf_title_vectorizer.transform([input_text])

    # One distance per dataset row, flattened from an (n_rows, 1) matrix.
    distances = pairwise_distances(tfidf_title_features, query_vec).ravel()

    # Row positions of the `num_results` smallest distances, closest first.
    nearest = np.argsort(distances)[:num_results]

    # argsort yields row *positions*, so select positionally: this stays
    # correct (one scalar per field) even if the index labels are not unique.
    matches = data.iloc[nearest]

    return [
        {'asin': asin, 'brand': brand, 'title': title, 'url': url}
        for asin, brand, title, url in zip(
            matches['asin'],
            matches['brand'],
            matches['title'],
            matches['medium_image_url'],
        )
    ]