# Source: Hugging Face Space "snap-stanford/SKB-Explorer" (running on CPU Upgrade).
# File size: 21,778 bytes.

import os
import os.path as osp
import gzip
import pickle
import json
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from huggingface_hub import hf_hub_download
import zipfile
from ogb.utils.url import download_url
from src.benchmarks.semistruct.knowledge_base import SemiStructureKB
from src.tools.process_text import clean_data, compact_text
from src.tools.node import df_row_to_dict, Node, register_node
from src.tools.io import save_files, load_files

PROCESSED_DATASET = {
    "repo": "snap-stanford/STaRK-Dataset",
    "file": "amazon_processed.zip",
}


class AmazonSemiStruct(SemiStructureKB):
    '''
        Semi-structured knowledge base built from Amazon product metadata,
        reviews and Q&A data. Products are nodes; `also_buy` / `also_view`
        are edges; optional meta links (e.g. brand) add extra entity nodes.
    '''
    
    # Categories for which review + meta files are downloaded when 'all' is requested.
    REVIEW_CATEGORIES = set(['Amazon_Fashion','All_Beauty','Appliances',
                             'Arts_Crafts_and_Sewing','Automotive','Books',
                             'CDs_and_Vinyl','Cell_Phones_and_Accessories',
                             'Clothing_Shoes_and_Jewelry','Digital_Music',
                             'Electronics','Gift_Cards','Grocery_and_Gourmet_Food',
                             'Home_and_Kitchen','Industrial_and_Scientific', 'Kindle_Store',
                             'Luxury_Beauty','Magazine_Subscriptions', 'Movies_and_TV',
                             'Musical_Instruments', 'Office_Products','Patio_Lawn_and_Garden',
                             'Pet_Supplies','Prime_Pantry','Software','Sports_and_Outdoors',
                             'Tools_and_Home_Improvement','Toys_and_Games','Video_Games'])
    
    # single answers
    # Categories for which Q&A files are downloaded when 'all' is requested.
    QA_CATEGORIES = set(['Appliances','Arts_Crafts_and_Sewing', 'Automotive',
                         'Baby','Beauty','Cell_Phones_and_Accessories',
                         'Clothing_Shoes_and_Jewelry','Electronics',
                        'Grocery_and_Gourmet_Food','Health_and_Personal_Care',
                        'Home_and_Kitchen','Musical_Instruments','Office_Products',
                        'Patio_Lawn_and_Garden','Pet_Supplies','Sports_and_Outdoors',
                        'Tools_and_Home_Improvement','Toys_and_Games','Video_Games'])
    
    # Categories present in both sets above; explicit (non-'all') category
    # requests are validated against this set in `_process_raw`.
    COMMON = set(['Appliances', 'Arts_Crafts_and_Sewing', 'Automotive', 
                  'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry', 'Electronics', 
                  'Grocery_and_Gourmet_Food', 'Home_and_Kitchen', 'Musical_Instruments', 
                  'Office_Products', 'Patio_Lawn_and_Garden', 'Pet_Supplies', 'Sports_and_Outdoors', 
                  'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games'])
    
    # Meta columns holding lists of linked ASINs; the position in this list is the edge type id.
    link_columns = ['also_buy', 'also_view']
    # Columns kept from the raw review / Q&A / meta records.
    review_columns = ['reviewerID', 'summary', 'reviewText', 'vote', 'overall', 'verified', 'reviewTime']
    qa_columns = ['questionType', 'answerType', 'question', 'answer', 'answerTime']
    meta_columns = ['asin', 'title', 'global_category', 'category', 'price', 'brand', 'feature',
                    'rank', 'details', 'description']
    candidate_types = ['product']
    # NOTE(review): nodes store these fields as 'feature', 'review' and 'qa'
    # (see construct_raw_node_info), so 'features', 'reviews' and 'Q&A' will not
    # match `hasattr` in get_chunk_info -- confirm how callers use these names.
    node_attr_dict = {'product': ['title', 'dimensions', 'weight', 'description', 'features', 'reviews', 'Q&A'],
                       'brand': ['brand_name']}

    def __init__(self, 
                 root,
                 categories: list, 
                 meta_link_types=['brand'],
                 max_entries=25,
                 download_processed=True,
                 **kwargs):
        '''
            Build (or load) the Amazon knowledge base.

            Args: 
                root (str): root directory to store the data; `raw/` and
                            `processed/` sub-directories are created inside it
                categories (list): product categories; passed to `_process_raw`,
                                   where 'all' selects every known category
                meta_link_types (list): a list which may contain entries in node info 
                                        that are used to construct meta links, e.g. ['brand'] 
                                        will construct entity nodes of brand which link 
                                        to corresponding nodes. None or an empty list 
                                        disables meta links (and the graph cache).
                                        The default list is never mutated here.
                max_entries (int): maximum number of review & qa entries to show in the description
                download_processed (bool): download the pre-processed archive from the
                                           Hugging Face Hub when `processed/node_info.pkl` is missing
                **kwargs: forwarded to the parent class constructor
        '''

        self.root = root
        self.max_entries = max_entries 
        self.raw_data_dir = osp.join(root, 'raw')
        self.processed_data_dir = osp.join(root, 'processed')
        os.makedirs(self.raw_data_dir, exist_ok=True)
        os.makedirs(self.processed_data_dir, exist_ok=True)

        # The cache of the post-processed graph is keyed by the meta link types.
        # An empty list must not produce a path: '-'.join([]) == '' would make
        # the path collapse to the shared 'cache/' directory itself, which would
        # then be loaded as if it were a cached graph.
        cache_path = None
        if meta_link_types:
            cache_path = osp.join(self.processed_data_dir, 'cache', '-'.join(meta_link_types))
        
        if not osp.exists(osp.join(self.processed_data_dir, 'node_info.pkl')) and download_processed:
            print('Downloading processed data...')
            processed_path = hf_hub_download(
                PROCESSED_DATASET["repo"],
                PROCESSED_DATASET["file"],
                repo_type="model",
            )
            with zipfile.ZipFile(processed_path, 'r') as zip_ref:
                zip_ref.extractall(self.root)
            # the archive is only needed for extraction
            os.remove(processed_path)
            print('Downloaded processed data!')

        if cache_path is not None and osp.exists(cache_path):
            print(f'Load cached graph with meta link types {meta_link_types}')
            processed_data = load_files(cache_path)
        else:
            # construct the graph based on link info in the raw data
            processed_data = self._process_raw(categories)
            if meta_link_types: 
                # customize the graph by adding meta links
                processed_data = self.post_process(processed_data, meta_link_types=meta_link_types, cache_path=cache_path)
        super(AmazonSemiStruct, self).__init__(**processed_data, **kwargs)
    
    def __getitem__(self, idx):
        '''
            Return the node with id `idx` as a `Node` object populated from
            `self.node_info[idx]`.

            When the node carries `details.dictionary.product_dimensions` in the
            form '<dimensions> ; <weight>', the two parts are additionally exposed
            as `node.dimensions` and `node.weight`. The stored node info is not
            modified.
        '''
        idx = int(idx)
        # The node must be registered *before* its details are read: reading
        # `node` ahead of its assignment raised UnboundLocalError on every call,
        # which the former bare `except` silently swallowed.
        node = Node()
        register_node(node, self.node_info[idx])
        try:
            dimensions, weight = node.details.dictionary.product_dimensions.split(' ; ')
        except (AttributeError, ValueError, TypeError):
            # best effort: no details, or not exactly two ' ; '-separated parts
            return node
        node.dimensions, node.weight = dimensions, weight
        return node
        
    def get_chunk_info(self, idx, attribute):
        '''
            Return the text chunk of one attribute of node `idx`.

            Args:
                idx (int): node id
                attribute (str): attribute name; names containing 'feature',
                                 'review', 'qa' or 'description' get a dedicated
                                 textual rendering, any other attribute is
                                 returned as stored
            Returns:
                '' when the node has no such attribute, otherwise the chunk.
                At most `self.max_entries` reviews (highest vote first) or
                Q&A pairs are included.
        '''
        # build the node once instead of once per access
        node = self[idx]
        if not hasattr(node, attribute):
            return ''
        node_attr = getattr(node, attribute)
        # The old `idx > max_entries` break ran after appending and therefore
        # emitted max_entries + 2 entries; slicing enforces the documented cap.
        limit = max(self.max_entries, 0)

        if 'feature' in attribute:
            features = [feature for feature in node_attr
                        if feature != '' and 'asin' not in feature.lower()]
            chunk = ' '.join(features)

        elif 'review' in attribute:
            parts = []
            if len(node_attr):
                # votes are strings such as '1,024'; missing votes count as 0
                scores = [0 if pd.isnull(review['vote']) else int(review['vote'].replace(",", ""))
                          for review in node_attr]
                ranks = np.argsort(-np.array(scores))
                for review_idx in ranks[:limit]:
                    review = node_attr[review_idx]
                    parts.append('The review "' + str(review['summary']) + '" '
                                 'states that "' + str(review['reviewText']) + '". ')
            chunk = ''.join(parts)

        elif 'qa' in attribute:
            parts = []
            if len(node_attr):
                for question in node_attr[:limit]:
                    parts.append('The question is "' + str(question['question']) + '", '
                                 'and the answer is "' + str(question['answer']) + '". ')
            chunk = ''.join(parts)

        elif 'description' in attribute and len(node_attr):
            chunk = " ".join(node_attr)

        else:
            chunk = node_attr
        return chunk 
    
    def get_doc_info(self, idx, 
                     add_rel=True, 
                     compact=False):
        '''
            Render node `idx` as a textual document.

            Args:
                idx (int): node id
                add_rel (bool): append the relation summary from `get_rel_info`
                compact (bool): pass the document through `compact_text`
            Returns:
                str: for brand nodes only 'brand name: ...'; for products the
                     title, brand, dimensions/weight, description, features,
                     reviews (highest vote first) and Q&A. At most
                     `self.max_entries` reviews and Q&A pairs are listed.
        '''
        if self.node_type_dict[int(self.node_types[idx])] == 'brand':
            return f'brand name: {self[idx].brand_name}'
        
        node = self[idx]
        # The old `i > max_entries` break ran after appending and therefore
        # listed max_entries + 2 entries; slicing enforces the documented cap.
        limit = max(self.max_entries, 0)

        doc = f'- product: {node.title}\n'
        if hasattr(node, 'brand'):
            doc += f'- brand: {node.brand}\n'
        try:
            dimensions, weight = node.details.dictionary.product_dimensions.split(' ; ')
        except (AttributeError, ValueError, TypeError):
            # best effort: no details, or not exactly two ' ; '-separated parts
            pass
        else:
            doc += (f'- dimensions: {dimensions}\n'
                    f'- weight: {weight}\n')
        if len(node.description):
            description = " ".join(node.description).strip(" ")
            if len(description) > 0:
                doc += f'- description: {description}\n'
        
        feature_text = ''
        if len(node.feature):
            feature_text = '- features: \n'
            for feature_idx, feature in enumerate(node.feature):
                # skip empty entries and entries that mention an ASIN
                if feature == '' or 'asin' in feature.lower():
                    continue
                feature_text += f'#{feature_idx + 1}: {feature}\n'
        
        review_text = ''
        if len(node.review):
            review_text = '- reviews: \n'
            # votes are strings such as '1,024'; missing votes count as 0
            scores = [0 if pd.isnull(review['vote']) else int(review['vote'].replace(",", ""))
                      for review in node.review]
            ranks = np.argsort(-np.array(scores))
            for review_idx in ranks[:limit]:
                review = node.review[review_idx]
                # the label is the review's original position, not its rank
                review_text += (f'#{review_idx + 1}:\n'
                                f'summary: {review["summary"]}\n'
                                f'text: "{review["reviewText"]}"\n')
        
        qa_text = ''
        if len(node.qa):
            qa_text = '- Q&A: \n'
            for qa_idx, qa in enumerate(node.qa[:limit]):
                qa_text += (f'#{qa_idx + 1}:\n'
                            f'question: "{qa["question"]}"\n'
                            f'answer: "{qa["answer"]}"\n')
        
        doc += feature_text + review_text + qa_text
        
        if add_rel:
            doc += self.get_rel_info(idx)
        if compact: 
            doc = compact_text(doc)
        return doc
    
    def get_rel_info(self, idx, rel_types=None, n_rel=-1):
        doc = ''
        rel_types = self.rel_type_lst() if rel_types is None else rel_types
        n_also_buy = self.get_neighbor_nodes(idx, 'also_buy')
        n_also_view = self.get_neighbor_nodes(idx, 'also_view')
        n_has_brand = self.get_neighbor_nodes(idx, 'has_brand')

        str_also_buy = [f"#{idx + 1}: " + self[i].title + '\n' for idx, i in enumerate(n_also_buy)]
        str_also_view = [f"#{idx + 1}: " + self[i].title  + '\n' for idx, i in enumerate(n_also_view)]
        
        if len(str_also_buy) == 0: str_also_buy = ''
        if len(str_also_view) == 0: str_also_view = ''
        str_has_brand = ''
        if len(n_has_brand): 
            str_has_brand = f'  brand: {self[n_has_brand[0]].brand_name}\n'
            
        str_also_buy = ''.join(str_also_buy)
        str_also_view = ''.join(str_also_view)

        if len(str_also_buy):
            doc += f'  products also purchased: \n{str_also_buy}'
        if len(str_also_view):
            doc += f'  products also viewed: \n{str_also_view}'
        if len(n_has_brand):
            doc += str_has_brand
            
        if len(doc): 
            doc = '- relations:\n' + doc
        return doc
    
    def _process_raw(self, categories):
        '''
            Load the processed product graph, building it from the raw Amazon
            dumps when `processed/node_info.pkl` does not exist yet.

            Args:
                categories (list): product categories; 'all' selects
                                   REVIEW_CATEGORIES / QA_CATEGORIES, otherwise
                                   every entry must be in COMMON
            Returns:
                dict with 'node_info', 'edge_index', 'edge_types',
                'edge_type_dict', 'node_types' (all zeros) and
                'node_type_dict' ({0: 'product'}).
            Raises:
                ValueError: if an explicit category is not in COMMON.

            Note: `self.asin2id` / `self.id2asin` are only set when the graph is
            built from raw data, not when it is loaded from disk.
        '''
        if 'all' in categories:
            review_categories = self.REVIEW_CATEGORIES
            qa_categories = self.QA_CATEGORIES
        else:
            qa_categories = review_categories = categories
            # explicit check instead of `assert`, which is stripped under `python -O`
            invalid = set(categories) - self.COMMON
            if invalid:
                raise ValueError(f'invalid categories exist: {sorted(invalid)}')
        
        if osp.exists(osp.join(self.processed_data_dir, 'node_info.pkl')):
            print(f'Load processed data from {self.processed_data_dir}')
            loaded_files = load_files(self.processed_data_dir)
            loaded_files.update(
                {'node_types': torch.zeros(len(loaded_files['node_info'])),
                 'node_type_dict': {0: 'product'}})
            return loaded_files
        
        print(f'Check data downloading...')
        review_header = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2'
        for category in review_categories:
            download_url(f'{review_header}/categoryFiles/{category}.json.gz', self.raw_data_dir)
            download_url(f'{review_header}/metaFiles2/meta_{category}.json.gz', self.raw_data_dir)
        qa_header = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon/qa'
        for category in qa_categories:
            download_url(f'{qa_header}/qa_{category}.json.gz', self.raw_data_dir)
            
        # node_info.pkl is known to be missing here (the early return above
        # covers the other case), so the graph is always built from raw data.
        print('Loading data... It might take a while')
        # read amazon QA data
        df_qa = pd.concat([read_qa(osp.join(self.raw_data_dir, f'qa_{category}.json.gz'))
                           for category in qa_categories])[['asin'] + self.qa_columns]
        
        # read amazon review data
        df_review = pd.concat([read_review(osp.join(self.raw_data_dir, f'{category}.json.gz')) 
                               for category in review_categories])[['asin'] + self.review_columns]
        # read amazon meta data (same JSON-lines format as the review files)
        meta_df_lst = []
        for category in review_categories:
            cat_meta = read_review(osp.join(self.raw_data_dir, f'meta_{category}.json.gz'))
            cat_meta.insert(0, 'global_category', category.replace('_', ' '))
            meta_df_lst.append(cat_meta)
        df_ucsd_meta = pd.concat(meta_df_lst)
        
        print('Preprocessing data...')
        df_ucsd_meta = df_ucsd_meta.drop_duplicates(subset='asin', keep='first')
        df_meta = df_ucsd_meta[self.meta_columns + self.link_columns]
        
        # Keep only items that have both meta and review data. Only the shared
        # ASINs are needed, so intersect the two key sets instead of
        # materialising the full review x meta merge.
        shared_asin = set(df_review['asin']) & set(df_meta['asin'])
        df_qa_reduced = df_qa[df_qa['asin'].isin(shared_asin)]
        df_review_reduced = df_review[df_review['asin'].isin(shared_asin)]
        df_meta_reduced = df_meta[df_meta['asin'].isin(shared_asin)].reset_index()

        print('Construct node info and graph...')
        # node id == row position in the reduced meta table
        asins = df_meta_reduced['asin'].tolist()
        self.asin2id = {asin: node_id for node_id, asin in enumerate(asins)}
        self.id2asin = dict(enumerate(asins))
        node_info = self.construct_raw_node_info(df_meta_reduced, df_review_reduced, df_qa_reduced)
        edge_index, edge_types = self.create_raw_product_graph(df_meta_reduced, 
                                                               columns=self.link_columns)
        edge_type_dict = {0: 'also_buy', 1: 'also_view'}
        processed_data = {
            'node_info': node_info, 
            'edge_index': edge_index, 
            'edge_types': edge_types,
            'edge_type_dict': edge_type_dict}
        
        print(f'Saving to {self.processed_data_dir}...')
        save_files(save_path=self.processed_data_dir, **processed_data)

        processed_data.update({'node_types': torch.zeros(len(processed_data['node_info'])),
                               'node_type_dict': {0: 'product'}})
        return processed_data
    
    def post_process(self, raw_info, meta_link_types, cache_path=None):
        print(f'Adding meta link types {meta_link_types}')
        node_info = raw_info['node_info']
        edge_type_dict = raw_info['edge_type_dict']
        node_type_dict = raw_info['node_type_dict']
        node_types = raw_info['node_types'].tolist()
        edge_index = raw_info['edge_index'].tolist()
        edge_types = raw_info['edge_types'].tolist()
        
        n_e_types, n_n_types = len(edge_type_dict), len(node_type_dict)
        for i, link_type in enumerate(meta_link_types):
            values = np.array([self._process_brand(node_info_i[link_type]) for node_info_i in node_info.values() if link_type in node_info_i.keys()])
            indices = np.array([idx for idx, node_info_i in enumerate(node_info.values()) if link_type in node_info_i.keys()])
            
            cur_n_nodes = len(node_info)
            node_type_dict[n_n_types + i] = link_type
            edge_type_dict[n_e_types + i] = "has_" + link_type
            unique = np.unique(values)
            for j, unique_j in enumerate(unique):
                node_info[cur_n_nodes + j] = {link_type + '_name': unique_j}
                ids = indices[np.array(values == unique_j)]
                edge_index[0].extend(list(ids))
                edge_index[1].extend([cur_n_nodes + j for _ in range(len(ids))])
                edge_types.extend([i + n_e_types for _ in range(len(ids))])
            node_types.extend([n_n_types + i for _ in range(len(unique))])
        edge_index = torch.LongTensor(edge_index)
        edge_types = torch.LongTensor(edge_types)
        node_types = torch.LongTensor(node_types)
        files = {'node_info': node_info, 
                 'edge_index': edge_index, 
                 'edge_types': edge_types, 
                 'edge_type_dict': edge_type_dict,
                 'node_type_dict': node_type_dict,
                 'node_types': node_types
                 }
        if cache_path is not None:
            save_files(cache_path, **files)
        return files
    
    def _process_brand(self, brand):
        brand = brand.strip(" \".*+,-_!@#$%^&*();\/|<>\'\t\n\r\\")
        if len(brand) > 3 and brand[:3] == 'by ':
            brand = brand[3:]
        if len(brand) > 4 and brand[-4:] == '.com':
            brand = brand[:-4]
        if len(brand) > 4 and brand[:4] == 'www.':
            brand = brand[4:]
        if len(brand) > 100: 
            brand = brand.split(' ')[0]
        return brand
    
    def construct_raw_node_info(self, df_meta, df_review, df_qa):
        '''
            Assemble the per-node info dict from the meta, review and Q&A tables.

            Every node id in `self.id2asin` gets an entry with empty 'review'
            and 'qa' lists; meta columns are then copied in (cleaned), and each
            review / Q&A row is appended to the node of its ASIN.

            Returns:
                dict: node id -> dict of attributes
        '''
        node_info = {node_id: {'review': [], 'qa': []} for node_id in self.id2asin}

        for row_pos in tqdm(range(len(df_meta))):
            meta_row = df_meta.iloc[row_pos]
            entry = node_info[self.asin2id[meta_row['asin']]]
            for column in self.meta_columns:
                cleaned = clean_data(meta_row[column])
                if column != 'brand':
                    entry[column] = cleaned
                    continue
                # brands are normalised and only kept when longer than one character
                brand = self._process_brand(cleaned)
                if len(brand) > 1:
                    entry['brand'] = brand

        sources = (('review', df_review, self.review_columns),
                   ('qa', df_qa, self.qa_columns))
        for name, frame, column_names in sources:
            for row_pos in tqdm(range(len(frame))):
                row = frame.iloc[row_pos]
                node_id = self.asin2id[row['asin']]
                node_info[node_id][name].append(
                    df_row_to_dict(row, colunm_names=column_names))
        return node_info

    def create_raw_product_graph(self, df, columns):
        edge_types = []
        edge_index = [[], []]
        for idx in range(len(df)):
            out_node = self.asin2id[df['asin'].iloc[idx]]
            for edge_type_id, edge_type in enumerate(columns):
                in_nodes = []
                if not isinstance(df[edge_type].iloc[idx], list):
                    continue
                for i in df[edge_type].iloc[idx]:
                    try:
                        in_nodes.append(self.asin2id[i])
                    except KeyError:
                        continue
                edge_types.extend([edge_type_id for _ in range(len(in_nodes))])
                edge_index[0].extend([out_node for _ in range(len(in_nodes))])
                edge_index[1].extend(in_nodes)
        return torch.LongTensor(edge_index), torch.LongTensor(edge_types)

    def has_brand(self, idx, brand):
        try: 
            b = self[idx].brand
            if len(b) > 4 and b[-4:] == '.com': b = b[:-4]
            if len(brand) > 4 and brand[-4:] == '.com': brand = brand[:-4]
            return b.lower().strip("\"") == brand.lower().strip("\"")
        except:
            return False

    def has_also_buy(self, idx, also_buy_item):
        try: 
            also_buy_lst = self.get_neighbor_nodes(idx, 'also_buy') 
            return also_buy_item in also_buy_lst
        except:
            return False
        
    def has_also_view(self, idx, also_view_item):
        try: 
            also_buy_lst = self.get_neighbor_nodes(idx, 'also_view') 
            return also_view_item in also_buy_lst
        except:
            return False
    
# read review files
def read_review(path):
    '''
        Read a gzip-compressed JSON-lines file into a DataFrame.

        Args:
            path (str): path to the .json.gz file, one JSON object per line
        Returns:
            pd.DataFrame: one row per line, indexed 0..n-1 in file order
    '''
    records = {}
    # the context manager closes the gzip handle, which was previously leaked
    with gzip.open(path, 'rb') as g:
        for i, line in enumerate(g):
            records[i] = json.loads(line)
    return pd.DataFrame.from_dict(records, orient='index')


# read qa files
def read_qa(path):
    '''
        Read a gzip-compressed Q&A file into a DataFrame.

        Each line is evaluated as a Python expression that is expected to
        yield one record dict.

        Args:
            path (str): path to the .json.gz file, one record per line
        Returns:
            pd.DataFrame: one row per line, indexed 0..n-1 in file order
    '''
    records = {}
    # the context manager closes the gzip handle, which was previously leaked
    with gzip.open(path, 'rb') as g:
        for i, line in enumerate(g):
            # SECURITY NOTE(review): `eval` executes arbitrary code found in the
            # downloaded file. If every record is a plain literal,
            # `ast.literal_eval(line.decode('utf-8'))` would be a safe
            # replacement -- confirm against the data before switching.
            records[i] = eval(line)
    return pd.DataFrame.from_dict(records, orient='index')