|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import numpy as np |
|
|
|
import pyarrow as pa |
|
try: |
|
import pyarrow.parquet as pq |
|
except ImportError: |
|
pq = None |
|
from pyarrow.tests.util import rands |
|
|
|
|
|
class ParquetWriteBinary(object): |
|
|
|
def setup(self): |
|
nuniques = 100000 |
|
value_size = 50 |
|
length = 1000000 |
|
num_cols = 10 |
|
|
|
unique_values = np.array([rands(value_size) for |
|
i in range(nuniques)], dtype='O') |
|
values = unique_values[np.random.randint(0, nuniques, size=length)] |
|
self.table = pa.table([pa.array(values) for i in range(num_cols)], |
|
names=['f{}'.format(i) for i in range(num_cols)]) |
|
self.table_df = self.table.to_pandas() |
|
|
|
def time_write_binary_table(self): |
|
out = pa.BufferOutputStream() |
|
pq.write_table(self.table, out) |
|
|
|
def time_write_binary_table_uncompressed(self): |
|
out = pa.BufferOutputStream() |
|
pq.write_table(self.table, out, compression='none') |
|
|
|
def time_write_binary_table_no_dictionary(self): |
|
out = pa.BufferOutputStream() |
|
pq.write_table(self.table, out, use_dictionary=False) |
|
|
|
def time_convert_pandas_and_write_binary_table(self): |
|
out = pa.BufferOutputStream() |
|
pq.write_table(pa.table(self.table_df), out) |
|
|
|
|
|
def generate_dict_strings(string_size, nunique, length, random_order=True): |
|
uniques = np.array([rands(string_size) for i in range(nunique)], dtype='O') |
|
if random_order: |
|
indices = np.random.randint(0, nunique, size=length).astype('i4') |
|
else: |
|
indices = np.arange(nunique).astype('i4').repeat(length // nunique) |
|
return pa.DictionaryArray.from_arrays(indices, uniques) |
|
|
|
|
|
def generate_dict_table(num_cols, string_size, nunique, length, |
|
random_order=True): |
|
data = generate_dict_strings(string_size, nunique, length, |
|
random_order=random_order) |
|
return pa.table([ |
|
data for i in range(num_cols) |
|
], names=['f{}'.format(i) for i in range(num_cols)]) |
|
|
|
|
|
class ParquetWriteDictionaries(object): |
|
|
|
param_names = ('nunique',) |
|
params = [(1000), (100000)] |
|
|
|
def setup(self, nunique): |
|
self.num_cols = 10 |
|
self.value_size = 32 |
|
self.nunique = nunique |
|
self.length = 10000000 |
|
|
|
self.table = generate_dict_table(self.num_cols, self.value_size, |
|
self.nunique, self.length) |
|
self.table_sequential = generate_dict_table(self.num_cols, |
|
self.value_size, |
|
self.nunique, self.length, |
|
random_order=False) |
|
|
|
def time_write_random_order(self, nunique): |
|
pq.write_table(self.table, pa.BufferOutputStream()) |
|
|
|
def time_write_sequential(self, nunique): |
|
pq.write_table(self.table_sequential, pa.BufferOutputStream()) |
|
|
|
|
|
class ParquetManyColumns(object): |
|
|
|
total_cells = 10000000 |
|
param_names = ('num_cols',) |
|
params = [100, 1000, 10000] |
|
|
|
def setup(self, num_cols): |
|
num_rows = self.total_cells // num_cols |
|
self.table = pa.table({'c' + str(i): np.random.randn(num_rows) |
|
for i in range(num_cols)}) |
|
|
|
out = pa.BufferOutputStream() |
|
pq.write_table(self.table, out) |
|
self.buf = out.getvalue() |
|
|
|
def time_write(self, num_cols): |
|
out = pa.BufferOutputStream() |
|
pq.write_table(self.table, out) |
|
|
|
def time_read(self, num_cols): |
|
pq.read_table(self.buf) |
|
|