import os
import os.path as osp
import gzip
import pickle
import json
import zipfile

import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from huggingface_hub import hf_hub_download
from ogb.utils.url import download_url

from src.benchmarks.semistruct.knowledge_base import SemiStructureKB
from src.tools.process_text import clean_data, compact_text
from src.tools.node import df_row_to_dict, Node, register_node
from src.tools.io import save_files, load_files

PROCESSED_DATASET = {
    "repo": "snap-stanford/STaRK-Dataset",
    "file": "amazon_processed.zip",
}

class AmazonSemiStruct(SemiStructureKB):

    REVIEW_CATEGORIES = set(['Amazon_Fashion', 'All_Beauty', 'Appliances',
                             'Arts_Crafts_and_Sewing', 'Automotive', 'Books',
                             'CDs_and_Vinyl', 'Cell_Phones_and_Accessories',
                             'Clothing_Shoes_and_Jewelry', 'Digital_Music',
                             'Electronics', 'Gift_Cards', 'Grocery_and_Gourmet_Food',
                             'Home_and_Kitchen', 'Industrial_and_Scientific', 'Kindle_Store',
                             'Luxury_Beauty', 'Magazine_Subscriptions', 'Movies_and_TV',
                             'Musical_Instruments', 'Office_Products', 'Patio_Lawn_and_Garden',
                             'Pet_Supplies', 'Prime_Pantry', 'Software', 'Sports_and_Outdoors',
                             'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games'])

    # categories with single-answer QA data
    QA_CATEGORIES = set(['Appliances', 'Arts_Crafts_and_Sewing', 'Automotive',
                         'Baby', 'Beauty', 'Cell_Phones_and_Accessories',
                         'Clothing_Shoes_and_Jewelry', 'Electronics',
                         'Grocery_and_Gourmet_Food', 'Health_and_Personal_Care',
                         'Home_and_Kitchen', 'Musical_Instruments', 'Office_Products',
                         'Patio_Lawn_and_Garden', 'Pet_Supplies', 'Sports_and_Outdoors',
                         'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games'])

    COMMON = set(['Appliances', 'Arts_Crafts_and_Sewing', 'Automotive',
                  'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry', 'Electronics',
                  'Grocery_and_Gourmet_Food', 'Home_and_Kitchen', 'Musical_Instruments',
                  'Office_Products', 'Patio_Lawn_and_Garden', 'Pet_Supplies', 'Sports_and_Outdoors',
                  'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games'])

    link_columns = ['also_buy', 'also_view']
    review_columns = ['reviewerID', 'summary', 'reviewText', 'vote', 'overall', 'verified', 'reviewTime']
    qa_columns = ['questionType', 'answerType', 'question', 'answer', 'answerTime']
    meta_columns = ['asin', 'title', 'global_category', 'category', 'price', 'brand', 'feature',
                    'rank', 'details', 'description']
    candidate_types = ['product']
    node_attr_dict = {'product': ['title', 'dimensions', 'weight', 'description', 'features', 'reviews', 'Q&A'],
                      'brand': ['brand_name']}

    def __init__(self,
                 root,
                 categories: list,
                 meta_link_types=['brand'],
                 max_entries=25,
                 download_processed=True,
                 **kwargs):
        '''
        Args:
            root (str): root directory in which the data is stored
            categories (list): product categories to include
            meta_link_types (list): entries in the node info used to construct
                meta links, e.g. ['category', 'brand'] constructs category and
                brand entity nodes linked to the corresponding product nodes
            max_entries (int): maximum number of review & QA entries to show
                in a node description
            download_processed (bool): download the preprocessed dataset
                instead of processing the raw dumps locally

        See the usage sketch at the bottom of this file for a minimal example.
        '''
        self.root = root
        self.max_entries = max_entries
        self.raw_data_dir = osp.join(root, 'raw')
        self.processed_data_dir = osp.join(root, 'processed')
        os.makedirs(self.raw_data_dir, exist_ok=True)
        os.makedirs(self.processed_data_dir, exist_ok=True)

        # construct the graph based on link info in the raw data
        cache_path = None if meta_link_types is None else \
            osp.join(self.processed_data_dir, 'cache', '-'.join(meta_link_types))

        if not osp.exists(osp.join(self.processed_data_dir, 'node_info.pkl')) and download_processed:
            print('Downloading processed data...')
            processed_path = hf_hub_download(
                PROCESSED_DATASET["repo"],
                PROCESSED_DATASET["file"],
                repo_type="model",
            )
            with zipfile.ZipFile(processed_path, 'r') as zip_ref:
                zip_ref.extractall(self.root)
            os.remove(processed_path)
            print('Downloaded processed data!')

        if cache_path is not None and osp.exists(cache_path):
            print(f'Load cached graph with meta link types {meta_link_types}')
            processed_data = load_files(cache_path)
        else:
            processed_data = self._process_raw(categories)
            if meta_link_types:
                # customize the graph by adding meta links
                processed_data = self.post_process(processed_data, meta_link_types=meta_link_types, cache_path=cache_path)
        super(AmazonSemiStruct, self).__init__(**processed_data, **kwargs)

    def __getitem__(self, idx):
        idx = int(idx)
        node_info = self.node_info[idx]
        node = Node()
        register_node(node, node_info)
        # derive separate dimension/weight fields from the raw details entry when present
        try:
            dimensions, weight = node.details.dictionary.product_dimensions.split(' ; ')
            node_info['dimensions'], node_info['weight'] = dimensions, weight
        except Exception:
            pass
        return node

    def get_chunk_info(self, idx, attribute):
        if not hasattr(self[idx], attribute):
            return ''
        node_attr = getattr(self[idx], attribute)

        if 'feature' in attribute:
            features = []
            if len(node_attr):
                for feature in node_attr:
                    if feature == '':
                        continue
                    if 'asin' in feature.lower():
                        continue
                    features.append(feature)
            chunk = ' '.join(features)

        elif 'review' in attribute:
            chunk = ''
            if len(node_attr):
                # rank reviews by helpfulness votes (missing votes count as 0)
                scores = [0 if pd.isnull(review['vote']) else int(review['vote'].replace(",", ""))
                          for review in node_attr]
                ranks = np.argsort(-np.array(scores))
                for rank_i, review_idx in enumerate(ranks):
                    review = node_attr[review_idx]
                    chunk += 'The review "' + str(review['summary']) + '" '
                    chunk += 'states that "' + str(review['reviewText']) + '". '
                    if rank_i > self.max_entries:
                        break

        elif 'qa' in attribute:
            chunk = ''
            if len(node_attr):
                for qa_i, question in enumerate(node_attr):
                    chunk += 'The question is "' + str(question['question']) + '", '
                    chunk += 'and the answer is "' + str(question['answer']) + '". '
                    if qa_i > self.max_entries:
                        break

        elif 'description' in attribute and len(node_attr):
            chunk = " ".join(node_attr)

        else:
            chunk = node_attr

        return chunk

    def get_doc_info(self, idx,
                     add_rel=True,
                     compact=False):

        if self.node_type_dict[int(self.node_types[idx])] == 'brand':
            return f'brand name: {self[idx].brand_name}'

        node = self[idx]
        doc = f'- product: {node.title}\n'
        if hasattr(node, 'brand'):
            doc += f'- brand: {node.brand}\n'
        # the raw details field packs dimensions and weight into one string
        try:
            dimensions, weight = node.details.dictionary.product_dimensions.split(' ; ')
            doc += (f'- dimensions: {dimensions}\n'
                    f'- weight: {weight}\n')
        except Exception:
            pass

        if len(node.description):
            description = " ".join(node.description).strip(" ")
            if len(description) > 0:
                doc += f'- description: {description}\n'

        feature_text = '- features: \n'
        if len(node.feature):
            for feature_idx, feature in enumerate(node.feature):
                if feature == '':
                    continue
                if 'asin' in feature.lower():
                    continue
                feature_text += f'#{feature_idx + 1}: {feature}\n'
        else:
            feature_text = ''

        if len(node.review):
            review_text = '- reviews: \n'
            # rank reviews by helpfulness votes (missing votes count as 0)
            scores = [0 if pd.isnull(review['vote']) else int(review['vote'].replace(",", ""))
                      for review in node.review]
            ranks = np.argsort(-np.array(scores))
            for i, review_idx in enumerate(ranks):
                review = node.review[review_idx]
                review_text += (f'#{review_idx + 1}:\n'
                                f'summary: {review["summary"]}\n'
                                f'text: "{review["reviewText"]}"\n')
                if i > self.max_entries:
                    break
        else:
            review_text = ''

        if len(node.qa):
            qa_text = '- Q&A: \n'
            for qa_idx, qa in enumerate(node.qa):
                qa_text += (f'#{qa_idx + 1}:\n'
                            f'question: "{qa["question"]}"\n'
                            f'answer: "{qa["answer"]}"\n')
                if qa_idx > self.max_entries:
                    break
        else:
            qa_text = ''

        doc += feature_text + review_text + qa_text

        if add_rel:
            doc += self.get_rel_info(idx)
        if compact:
            doc = compact_text(doc)

        return doc

    def get_rel_info(self, idx, rel_types=None, n_rel=-1):
        doc = ''
        rel_types = self.rel_type_lst() if rel_types is None else rel_types

        n_also_buy = self.get_neighbor_nodes(idx, 'also_buy')
        n_also_view = self.get_neighbor_nodes(idx, 'also_view')
        n_has_brand = self.get_neighbor_nodes(idx, 'has_brand')

        str_also_buy = ''.join([f"#{i + 1}: " + self[node_id].title + '\n'
                                for i, node_id in enumerate(n_also_buy)])
        str_also_view = ''.join([f"#{i + 1}: " + self[node_id].title + '\n'
                                 for i, node_id in enumerate(n_also_view)])
        str_has_brand = ''
        if len(n_has_brand):
            str_has_brand = f' brand: {self[n_has_brand[0]].brand_name}\n'

        if len(str_also_buy):
            doc += f' products also purchased: \n{str_also_buy}'
        if len(str_also_view):
            doc += f' products also viewed: \n{str_also_view}'
        if len(n_has_brand):
            doc += str_has_brand

        if len(doc):
            doc = '- relations:\n' + doc
        return doc

    def _process_raw(self, categories):
        if 'all' in categories:
            review_categories = self.REVIEW_CATEGORIES
            qa_categories = self.QA_CATEGORIES
        else:
            qa_categories = review_categories = categories
            assert len(set(categories) - self.COMMON) == 0, \
                f'invalid categories: {set(categories) - self.COMMON}'

        if osp.exists(osp.join(self.processed_data_dir, 'node_info.pkl')):
            print(f'Load processed data from {self.processed_data_dir}')
            loaded_files = load_files(self.processed_data_dir)
            loaded_files.update(
                {'node_types': torch.zeros(len(loaded_files['node_info'])),
                 'node_type_dict': {0: 'product'}})
            return loaded_files

        print('Check data downloading...')
        review_header = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2'
        for category in review_categories:
            download_url(f'{review_header}/categoryFiles/{category}.json.gz', self.raw_data_dir)
            download_url(f'{review_header}/metaFiles2/meta_{category}.json.gz', self.raw_data_dir)
        qa_header = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon/qa'
        for category in qa_categories:
            download_url(f'{qa_header}/qa_{category}.json.gz', self.raw_data_dir)

        print('Loading data... It might take a while')
        # read amazon QA data
        df_qa = pd.concat([read_qa(osp.join(self.raw_data_dir, f'qa_{category}.json.gz'))
                           for category in qa_categories])[['asin'] + self.qa_columns]
        # read amazon review data
        df_review = pd.concat([read_review(osp.join(self.raw_data_dir, f'{category}.json.gz'))
                               for category in review_categories])[['asin'] + self.review_columns]
        # read amazon meta data per category
        meta_df_lst = []
        for category in review_categories:
            cat_meta = read_review(osp.join(self.raw_data_dir, f'meta_{category}.json.gz'))
            cat_meta.insert(0, 'global_category', category.replace('_', ' '))
            meta_df_lst.append(cat_meta)
        df_ucsd_meta = pd.concat(meta_df_lst)

        print('Preprocessing data...')
        df_ucsd_meta = df_ucsd_meta.drop_duplicates(subset='asin', keep='first')
        df_meta = df_ucsd_meta[self.meta_columns + self.link_columns]

        # merge dataframes
        df_review_meta = df_review.merge(df_meta, on='asin')
        unique_asin = np.unique(np.array(df_review_meta['asin']))

        # filter items that have both meta and review data
        df_qa_reduced = df_qa[df_qa['asin'].isin(unique_asin)]
        df_review_reduced = df_review[df_review['asin'].isin(unique_asin)]
        df_meta_reduced = df_meta[df_meta['asin'].isin(unique_asin)].reset_index()

        def get_map(df):
            asin2id, id2asin = {}, {}
            for idx in range(len(df)):
                asin2id[df['asin'][idx]] = idx
                id2asin[idx] = df['asin'][idx]
            return asin2id, id2asin

        print('Construct node info and graph...')
        # get the mapping from asin to node id and its reverse mapping
        self.asin2id, self.id2asin = get_map(df_meta_reduced)
        node_info = self.construct_raw_node_info(df_meta_reduced, df_review_reduced, df_qa_reduced)
        edge_index, edge_types = self.create_raw_product_graph(df_meta_reduced,
                                                               columns=self.link_columns)
        edge_type_dict = {0: 'also_buy', 1: 'also_view'}
        processed_data = {
            'node_info': node_info,
            'edge_index': edge_index,
            'edge_types': edge_types,
            'edge_type_dict': edge_type_dict}

        print(f'Saving to {self.processed_data_dir}...')
        save_files(save_path=self.processed_data_dir, **processed_data)

        processed_data.update({'node_types': torch.zeros(len(processed_data['node_info'])),
                               'node_type_dict': {0: 'product'}})
        return processed_data

    def post_process(self, raw_info, meta_link_types, cache_path=None):
        print(f'Adding meta link types {meta_link_types}')
        node_info = raw_info['node_info']
        edge_type_dict = raw_info['edge_type_dict']
        node_type_dict = raw_info['node_type_dict']
        node_types = raw_info['node_types'].tolist()
        edge_index = raw_info['edge_index'].tolist()
        edge_types = raw_info['edge_types'].tolist()
        n_e_types, n_n_types = len(edge_type_dict), len(node_type_dict)

        for i, link_type in enumerate(meta_link_types):
            values = np.array([self._process_brand(node_info_i[link_type])
                               for node_info_i in node_info.values()
                               if link_type in node_info_i.keys()])
            indices = np.array([idx for idx, node_info_i in enumerate(node_info.values())
                                if link_type in node_info_i.keys()])
            cur_n_nodes = len(node_info)
            node_type_dict[n_n_types + i] = link_type
            edge_type_dict[n_e_types + i] = "has_" + link_type

            # add one entity node per unique value and link it to the matching products
            unique = np.unique(values)
            for j, unique_j in enumerate(unique):
                node_info[cur_n_nodes + j] = {link_type + '_name': unique_j}
                ids = indices[np.array(values == unique_j)]
                edge_index[0].extend(list(ids))
                edge_index[1].extend([cur_n_nodes + j for _ in range(len(ids))])
                edge_types.extend([i + n_e_types for _ in range(len(ids))])
            node_types.extend([n_n_types + i for _ in range(len(unique))])

        edge_index = torch.LongTensor(edge_index)
        edge_types = torch.LongTensor(edge_types)
        node_types = torch.LongTensor(node_types)

        files = {'node_info': node_info,
                 'edge_index': edge_index,
                 'edge_types': edge_types,
                 'edge_type_dict': edge_type_dict,
                 'node_type_dict': node_type_dict,
                 'node_types': node_types}
        if cache_path is not None:
            save_files(cache_path, **files)
        return files

    def _process_brand(self, brand):
        # strip surrounding punctuation/whitespace and common boilerplate from brand strings
        brand = brand.strip(" \".*+,-_!@#$%^&*();/|<>\'\t\n\r\\")
        if len(brand) > 3 and brand[:3] == 'by ':
            brand = brand[3:]
        if len(brand) > 4 and brand[-4:] == '.com':
            brand = brand[:-4]
        if len(brand) > 4 and brand[:4] == 'www.':
            brand = brand[4:]
        if len(brand) > 100:
            brand = brand.split(' ')[0]
        return brand

    def construct_raw_node_info(self, df_meta, df_review, df_qa):
        node_info = {}
        for idx, asin in self.id2asin.items():
            node_info[idx] = {}
            node_info[idx]['review'] = []
            node_info[idx]['qa'] = []

        for i in tqdm(range(len(df_meta))):
            df_meta_i = df_meta.iloc[i]
            asin = df_meta_i['asin']
            idx = self.asin2id[asin]
            for column in self.meta_columns:
                if column == 'brand':
                    brand = self._process_brand(clean_data(df_meta_i[column]))
                    if len(brand) > 1:
                        node_info[idx]['brand'] = brand
                else:
                    node_info[idx][column] = clean_data(df_meta_i[column])

        for name, df in zip(['review', 'qa'], [df_review, df_qa]):
            for i in tqdm(range(len(df))):
                df_i = df.iloc[i]
                asin = df_i['asin']
                idx = self.asin2id[asin]
                node_info[idx][name].append(
                    df_row_to_dict(df_i, colunm_names=self.review_columns
                                   if name == 'review' else self.qa_columns))
        return node_info

    def create_raw_product_graph(self, df, columns):
        edge_types = []
        edge_index = [[], []]
        for idx in range(len(df)):
            out_node = self.asin2id[df['asin'].iloc[idx]]
            for edge_type_id, edge_type in enumerate(columns):
                in_nodes = []
                if not isinstance(df[edge_type].iloc[idx], list):
                    continue
                for i in df[edge_type].iloc[idx]:
                    try:
                        in_nodes.append(self.asin2id[i])
                    except KeyError:
                        # skip linked products that are not part of the KB
                        continue
                edge_types.extend([edge_type_id for _ in range(len(in_nodes))])
                edge_index[0].extend([out_node for _ in range(len(in_nodes))])
                edge_index[1].extend(in_nodes)
        return torch.LongTensor(edge_index), torch.LongTensor(edge_types)

    def has_brand(self, idx, brand):
        try:
            b = self[idx].brand
            if len(b) > 4 and b[-4:] == '.com':
                b = b[:-4]
            if len(brand) > 4 and brand[-4:] == '.com':
                brand = brand[:-4]
            return b.lower().strip("\"") == brand.lower().strip("\"")
        except Exception:
            return False

    def has_also_buy(self, idx, also_buy_item):
        try:
            also_buy_lst = self.get_neighbor_nodes(idx, 'also_buy')
            return also_buy_item in also_buy_lst
        except Exception:
            return False

    def has_also_view(self, idx, also_view_item):
        try:
            also_view_lst = self.get_neighbor_nodes(idx, 'also_view')
            return also_view_item in also_view_lst
        except Exception:
            return False

# read review / metadata files (gzipped JSON lines)
def read_review(path):
    def parse(path):
        with gzip.open(path, 'rb') as g:
            for line in g:
                yield json.loads(line)

    def getDF(path):
        df = {}
        for i, d in enumerate(parse(path)):
            df[i] = d
        return pd.DataFrame.from_dict(df, orient='index')

    return getDF(path)

# read QA files (gzipped lines of Python dict literals, not strict JSON)
def read_qa(path):
    def parse(path):
        with gzip.open(path, 'rb') as g:
            for line in g:
                # the raw QA dumps use single-quoted dicts, so json.loads fails;
                # eval mirrors the original loader for this dataset
                yield eval(line)

    def getDF(path):
        df = {}
        for i, d in enumerate(parse(path)):
            df[i] = d
        return pd.DataFrame.from_dict(df, orient='index')

    return getDF(path)
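

# A minimal usage sketch (not executed on import). The root path, category, and
# node index below are assumptions for illustration only; any directory and any
# category in AmazonSemiStruct.COMMON (or 'all') should work, and the processed
# graph is fetched from the Hugging Face Hub when download_processed=True.
if __name__ == '__main__':
    kb = AmazonSemiStruct(root='data/amazon',              # hypothetical data directory
                          categories=['Sports_and_Outdoors'],
                          meta_link_types=['brand'],
                          download_processed=True)
    # node 0 is a product node in the default graph (products are added first)
    print(kb.get_doc_info(0, add_rel=True, compact=True))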