# Fetch_App / offer_pipeline.py
# Calvin
# final touches
# f94a42e
import streamlit as st
from transformers import pipeline
import pickle
import os
import pandas as pd
import ast
import string
import re
from sentence_transformers import SentenceTransformer, util
st.set_page_config(
    page_title="Offer Recommender",
    layout="wide",
)


# Streamlit re-executes this script on every user interaction, so building the
# models with plain module-level calls would reload them each time.
# st.cache_resource keeps a single instance alive across reruns.
@st.cache_resource
def _load_zero_shot_pipeline():
    '''
    Build the zero-shot classification pipeline once and cache it
    Returns:
    zero-shot classification pipeline backed by valhalla/distilbart-mnli-12-3
    '''
    return pipeline(task="zero-shot-classification", model="valhalla/distilbart-mnli-12-3")


@st.cache_resource
def _load_sentence_encoder():
    '''
    Build the sentence embedding model once and cache it
    Returns:
    SentenceTransformer for sentence-transformers/all-MiniLM-L6-v2
    '''
    return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# Download (on first use) and cache models
pipe = _load_zero_shot_pipeline()
model = _load_sentence_encoder()

# Directory of csv files
dire = "DS_NLP_search_data"
# Use Streamlit caching to load data once
@st.cache_data
def get_processed_offers():
    '''
    Read the categorized offers CSV and cache the resulting frame
    The CATEGORY column is stored as text in the CSV, so each cell is parsed
    back into a Python literal with ast.literal_eval.
    Returns:
    processed_offers (pd.DataFrame) : zero-shot categorized offers
    '''
    csv_path = os.path.join(dire, "processed_offers.csv")
    frame = pd.read_csv(csv_path)
    parsed_categories = frame["CATEGORY"].map(ast.literal_eval)
    frame["CATEGORY"] = parsed_categories
    return frame
@st.cache_data
def get_categories_data():
    '''
    Read categories.csv from the data directory and cache the frame
    Returns:
    cats (pd.DataFrame) : raw category data
    '''
    csv_path = os.path.join(dire, "categories.csv")
    return pd.read_csv(csv_path)
@st.cache_data
def get_offers_data():
    '''
    Read offer_retailer.csv from the data directory and cache the frame
    Returns:
    offers (pd.DataFrame) : raw offers data
    '''
    csv_path = os.path.join(dire, "offer_retailer.csv")
    return pd.read_csv(csv_path)
@st.cache_data
def get_categories(cats_):
    '''
    Collect the distinct parent categories and cache the list
    Parameters:
    cats_ (pd.DataFrame) : raw categories data
    Returns:
    categories (List) : unique IS_CHILD_CATEGORY_TO values, minus the excluded ones
    '''
    # Categories that are never offered as zero-shot candidates
    excluded = ("Mature",)
    distinct_parents = cats_["IS_CHILD_CATEGORY_TO"].unique()
    return [parent for parent in distinct_parents if parent not in excluded]
def _match_tokens(text):
    '''
    Lower-case text, drop punctuation and non-alphanumeric characters, split on whitespace
    Parameters:
    text (string) : raw offer text or user query
    Returns:
    tokens (List) : normalized words, in order
    '''
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))
    # Drops whatever survives the punctuation pass but is not a-z, 0-9 or
    # whitespace (e.g. trademark symbols)
    cleaned = re.sub(r'[^a-z0-9\s]', '', cleaned)
    return cleaned.split()


def check_in_offer(search_str, offer_rets):
    '''
    Determine if the input text is directly in the offer with basic string matching
    The query and every offer are normalized the same way, and an offer matches
    when the query words appear in it as a contiguous run of whole words. This
    lets multi-word and punctuated queries match. An empty query matches nothing.
    Parameters:
    search_str (string) : user text input
    offer_rets (pd.DataFrame) : raw offer data, must contain an "OFFER" column
    Returns:
    df (pd.DataFrame) : single "OFFER" column with the matching offers, in input order
    '''
    query_tokens = _match_tokens(search_str)
    if not query_tokens:
        return pd.DataFrame({"OFFER": []})

    # Padding both sides with spaces turns a substring test into a
    # whole-word phrase test ("tar" must not match "target").
    needle = " " + " ".join(query_tokens) + " "

    matches = []
    for offer_str in offer_rets["OFFER"]:
        # Missing offers come back from pandas as NaN floats; skip them
        if not isinstance(offer_str, str):
            continue
        haystack = " " + " ".join(_match_tokens(offer_str)) + " "
        if needle in haystack:
            matches.append(offer_str)
    return pd.DataFrame({"OFFER": matches})
def is_retailer(search_str, threshold=0.5):
    '''
    Determine if the text input is highly likely to be a retailer
    The text is classified zero-shot against "brand", "retailer" and "item".
    It counts as a retailer only when "retailer" is the top label and its
    score is above the threshold.
    Parameters:
    search_str (string) : user text input
    threshold (float) : minimum score the "retailer" label must exceed
    Returns:
    is_ret (boolean) : true if retailer, false otherwise (always false for blank input)
    '''
    query = search_str.strip()
    if not query:
        # The zero-shot pipeline rejects an empty sequence, and blank text
        # cannot be a retailer anyway.
        return False

    # capitalize() upper-cases the first character and lower-cases the rest
    result = pipe(
        query.capitalize(),
        candidate_labels=["brand", "retailer", "item"],
    )
    # Index 0 is treated as the most likely label and its score
    top_label = result["labels"][0]
    top_score = result["scores"][0]
    return bool(top_label == "retailer" and top_score > threshold)
def perform_cat_inference(search_str, categories, cats, processed_offers):
    '''
    Run zero-shot classification in two stages and return the offers tagged with the best child categories
    Stage one scores the parent categories; stage two scores only the child
    categories that belong to the three best parents. Offers whose CATEGORY
    shares at least one entry with the three best children are returned.
    Parameters:
    search_str (string) : user text input
    categories (List) : parent category names used as stage-one candidates
    cats (pd.DataFrame) : raw category data
    processed_offers (pd.DataFrame) : processed_offer_data
    Returns:
    offers (pd.DataFrame) : relevant offers (reset_index keeps the former index as an "index" column)
    labels (dict) : parent categories and their probability scores
    labels_2 (dict) : child categories and their probability scores
    '''
    # Stage 1: score every parent category
    parent_result = pipe(search_str, candidate_labels=categories)
    best_parents = parent_result["labels"][:3]

    # Stage 2: score only the children of the best parents
    belongs_to_best_parent = cats["IS_CHILD_CATEGORY_TO"].isin(best_parents)
    child_candidates = list(cats[belongs_to_best_parent]["PRODUCT_CATEGORY"].unique())
    child_result = pipe(search_str, candidate_labels=child_candidates)
    best_children = set(child_result["labels"][:3])

    def _has_best_child(offer_categories):
        # True when the offer carries at least one of the best child categories
        return bool(set(offer_categories) & best_children)

    keep_mask = processed_offers["CATEGORY"].apply(_has_best_child)
    matching_offers = processed_offers[keep_mask]["OFFER"].reset_index()
    return matching_offers, parent_result, child_result
def sort_by_similarity(search_str, related_offers, top_k=20):
    '''
    Use sentence embeddings to evaluate the similarity of relevant offers to the text input
    Duplicate offers are scored once. All offers are embedded in a single
    batched encode call instead of one call per offer.
    Parameters:
    search_str (string) : user text input
    related_offers (pd.DataFrame) : relevant offers, must contain an "OFFER" column
    top_k (int) : maximum number of offers to return
    Returns:
    df (pd.DataFrame) : up to top_k offers ("OFFER") and their cosine similarity ("scores"), highest first
    '''
    # dict.fromkeys de-duplicates while keeping first-seen order
    unique_offers = list(dict.fromkeys(related_offers["OFFER"]))
    if not unique_offers:
        # Nothing to rank; skip the model entirely
        return pd.DataFrame({"OFFER": [], "scores": []})

    query_embedding = model.encode(search_str, convert_to_tensor=True)
    offer_embeddings = model.encode(unique_offers, convert_to_tensor=True)

    # One query against N offers gives a 1 x N similarity matrix; take its only row
    similarities = util.pytorch_cos_sim(query_embedding, offer_embeddings)[0].tolist()

    ranked = sorted(
        zip(unique_offers, similarities),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_k]
    return pd.DataFrame({
        "OFFER": [offer for offer, _ in ranked],
        "scores": [score for _, score in ranked],
    })
def main():
    '''
    Render the search page: read the query, then show direct and related offers
    Direct matches come from plain text matching. Related offers are ranked by
    sentence-embedding similarity, over every offer when the query looks like
    a retailer, or over the zero-shot category matches otherwise.
    '''
    # The third column is created only to keep the three-column layout
    col_1, col_2, col_3 = st.columns(3)
    # Strip first so stray whitespace cannot defeat capitalize() or the
    # empty-input check below
    search_str = col_1.text_input("Enter a retailer, brand, or category").strip().capitalize()

    # Load and cache data
    processed_offers = get_processed_offers()
    cats = get_categories_data()
    offer_rets = get_offers_data()
    categories = get_categories(cats)

    if not col_1.button("Search", type="primary"):
        return

    if not search_str:
        # The zero-shot pipeline raises on an empty sequence; stop early
        col_2.warning("Please enter a search term.")
        return

    # Check offers where the text is directly in it
    retail = is_retailer(search_str)
    direct_offers = check_in_offer(search_str, offer_rets)
    already_shown = list(direct_offers["OFFER"])

    col_2.write("Directly related offers")
    if direct_offers.empty:
        col_2.write("None found")
    else:
        col_2.table(direct_offers)

    if retail:
        # If retail, we directly compare every offer using sentence embeddings
        related_offers = offer_rets[~offer_rets["OFFER"].isin(already_shown)]
    else:
        # Otherwise, we use zero shot learning with processed offers to narrow down our search
        related_offers, labels_1, labels_2 = perform_cat_inference(
            search_str, categories, cats, processed_offers
        )
        related_offers = related_offers[~related_offers["OFFER"].isin(already_shown)]
        col_2.write("Parent categories probabilities")
        col_2.table(pd.DataFrame({"labels": labels_1["labels"][:5], "scores": labels_1["scores"][:5]}))
        col_2.write("Child categories probabilities")
        col_2.table(pd.DataFrame({"labels": labels_2["labels"][:5], "scores": labels_2["scores"][:5]}))

    col_2.write("Other related offers")
    sorted_offers = sort_by_similarity(search_str, related_offers)
    if sorted_offers.empty:
        col_2.write("None found")
    else:
        col_2.table(sorted_offers)


if __name__ == "__main__":
    main()