Spaces:

ishantvivek
/

recommendation_system

Sleeping

App Files Files Community

recommendation_system / utils /BagOfWords.py

ishantvivek

recommendation-system: Add backend server of fastapi and controllers

dd81387 6 months ago

raw

history blame contribute delete

1.84 kB

	import numpy as np
	import pandas as pd
	from sklearn.metrics.pairwise import pairwise_distances
	from sklearn.feature_extraction.text import CountVectorizer
	from typing import List, Dict
	import os

	from utils.config import Config

	# Look up the dataset location under the 'app' section of the project config.
	dataset_path = Config.read('app', 'dataset')

	# Fail fast at import time when nothing exists at the configured location.
	if not os.path.exists(dataset_path):
	    raise FileNotFoundError(f"The dataset file at {dataset_path} was not found.")

	# Deserialize the pickled dataset (presumably a pandas DataFrame, since
	# `.columns` is inspected below -- confirm against the file's producer).
	data = pd.read_pickle(dataset_path)

	# These columns are read when search results are assembled; report the
	# first one that is absent, checking in the order listed.
	required_columns = ['asin', 'title', 'brand', 'medium_image_url']
	missing_column = next(
	    (name for name in required_columns if name not in data.columns),
	    None,
	)
	if missing_column is not None:
	    raise ValueError(f"Missing required column: {missing_column} in the dataset")

	# Learn the vocabulary from the titles and build their count matrix once,
	# so every query can be compared against the same feature space.
	title_vectorizer = CountVectorizer()
	title_features = title_vectorizer.fit_transform(data['title'])

	def bag_of_words_model(query: str, num_results: int) -> List[Dict]:
	    """Return the dataset items whose titles are closest to ``query``.

	    The query is mapped into the feature space of the module-level
	    ``title_vectorizer`` and compared with every row of ``title_features``
	    using cosine distance. The rows with the smallest distance are
	    returned, closest first.

	    Args:
	        query: Free-text search string.
	        num_results: Maximum number of matches to return.

	    Returns:
	        A list of at most ``num_results`` dicts with the keys ``asin``,
	        ``brand``, ``title`` and ``url`` (the latter taken from the
	        ``medium_image_url`` column). The list is empty when
	        ``num_results`` is zero or negative.
	    """
	    # A negative count would turn the slice below into "everything except
	    # the last |n| rows" rather than a limit, so any non-positive request
	    # yields no results (and skips the distance computation entirely).
	    if num_results <= 0:
	        return []

	    # Transform the input query to the same feature space as the titles.
	    query_vec = title_vectorizer.transform([query])

	    # One distance per dataset row: shape (n_rows, 1), flattened for sorting.
	    pairwise_dist = pairwise_distances(title_features, query_vec, metric='cosine')

	    # Positional indices of the closest rows, nearest first.
	    indices = np.argsort(pairwise_dist.flatten())[:num_results]

	    return [
	        {
	            'asin': data['asin'].iloc[idx],
	            'brand': data['brand'].iloc[idx],
	            'title': data['title'].iloc[idx],
	            'url': data['medium_image_url'].iloc[idx],
	        }
	        for idx in indices
	    ]