# NOTE: the three lines below were scraper residue from the hosting page
# (author avatar caption "bullm's picture", commit message "new test",
# commit hash e2f925b); kept here as a comment so the module parses.
import datetime as dt
import pandas as pd
import itertools
import pymongo # requires dnspython
import configparser
from .mongo_credentials import credentials
# Module-level credential mapping; bound lazily by _MongoAbstract._set_credentials().
CREDENTIALS = credentials
class _MongoAbstract(object):
_instance = None
_credentials = None
_name = None
_key_ls = None
_del_many_limit = 50
_override_boolean_key = None
def __init__(self, instance='remote', **query):
self._set_instance(instance)
self._set_credentials()
if not isinstance(self._name, str) or not isinstance(self._instance, str) or not isinstance(self._key_ls, list):
raise Exception('Instance (str), collection _name (str) and _key_ls (list) need to be properly declared!')
db_obj = self._get_db_object()
collection_names_ls = db_obj.list_collection_names()
idx_tup_ls = [(i, pymongo.ASCENDING) for i in self._key_ls]
if self._name not in collection_names_ls:
print(f"Creating new {self._instance} collection: {self._name}\n")
collection = db_obj[self._name] # this will create the collection
collection.create_index(idx_tup_ls, unique=True, name=self._name) # this will index it
else:
collection = db_obj[self._name] # collection already exists
self._db_obj = db_obj
self._collection = collection
query = self._override_boolean_keys(**query)
self._is_adv_query = sum([isinstance(v, list) for v in query.values()])
self._set_query(query)
self._set_advanced_query()
self._out = None
self._cursor = self._get_cursor()
self._full_key_ls = self._key_ls + ['values']
def _set_credentials(self):
if self._credentials is None:
self._credentials = CREDENTIALS
def _get_credentials(self):
if self._credentials is None:
self._set_credentials()
return self._credentials
def _set_instance(self, instance):
self._instance = instance
def _override_boolean_keys(self, **query):
if self._get_boolean_keys() and query:
for key in self._get_boolean_keys():
if key in query.keys():
if isinstance(query[key], (tuple, list)):
query[key] = [str(int(i)) if isinstance(i, bool) else i for i in query[key]]
else:
if isinstance(query[key], bool):
query[key] = str(int(query[key]))
return query
def _get_boolean_keys(self):
if self._override_boolean_key is not None:
if isinstance(self._override_boolean_key, str):
return [self._override_boolean_key]
elif isinstance(self._override_boolean_key, list):
return self._override_boolean_key
else:
raise Exception(f'_override_boolean_key not recognized {self._override_boolean_key}')
else:
return None
def __repr__(self):
print_str = f'database: {self._get_db_name()}'
print_str += f'\ninstance: {self._get_instance()}'
print_str += f'\ncollection: {self._get_name()}'
key_str = ', '.join([i for i in self.get_keys() if i != 'values'])
print_str += f'\nkeys: [{key_str}]'
storage_size = self._db_obj.command('collstats', self._get_name())['size'] / 1e6
print_str += f'\nstorage size: {storage_size:.1f}MB'
num_docs = self._get_cursor().count()
print_str += f'\ndocuments: {num_docs}'
return print_str
def _get_collection(self):
return self._collection
def _get_instance(self):
return self._instance
def _get_name(self):
return self._name
def _get_host_and_port(self):
user = self._get_credentials()['user']
password = self._get_credentials()['password']
host = self._get_credentials()['host']
port = self._get_credentials()['port']
full_hostname = f"mongodb+srv://{user}:{password}@{host}/{port}"
return full_hostname, port
def _get_db_object(self):
client = pymongo.MongoClient(host=self._get_host_and_port()[0], port=self._get_host_and_port()[1])
return client[self._get_db_name()]
def _get_db_name(self):
return self._get_credentials()['dbname']
def _set_advanced_query(self):
if self._is_adv_query:
and_ls = []
ls_dd = self._product_dict(**self._query)
for dd in ls_dd:
and_ls.extend([{'$and': [dd]}])
or_dd = {'$or': and_ls}
self._query = or_dd
def _get_query(self):
return self._query
def _get_cursor(self):
if self._out is None:
self._cursor = self._collection.find(self._query)
return self._cursor
@staticmethod
def _product_dict(**kwargs):
for key, val in kwargs.items(): # make sure adv query values are all lists
if not isinstance(val, list):
kwargs.update({key: [val]})
keys = kwargs.keys()
vals = kwargs.values()
for instance in itertools.product(*vals):
yield dict(zip(keys, instance))
def _set_query(self, query):
query_dd = dict()
for k, v in query.items():
if v is not None: # drop None from queries, but allow it to pass
if isinstance(v, str) and '~' in v:
query_dd[k] = {'$ne': v.replace('~', '')}
elif isinstance(v, list):
new_ls = list()
neg_ls = list()
for i in v:
if isinstance(i, str) and '~' in i:
neg_ls.append(i.replace('~', ''))
else:
new_ls.append(i)
if len(neg_ls) > 0:
new_ls.append({'$nin': neg_ls})
query_dd[k] = new_ls
else:
query_dd[k] = v
self._query = query_dd
def drop_collection(self, confirm=''):
if confirm != 'confirm':
print('drop_collection not confirmed. Make sure you know what you are doing. Pass "confirm" to drop.')
else:
db_obj = self._get_db_object()
db_obj.drop_collection(self._name)
print('Database: %s' % self._get_db_name())
print('Collection dropped: %s' % self._name)
def _reindex(self, confirm=''):
if confirm != 'confirm':
print('reindex not confirmed. Nothing done. Pass "confirm" to reindex.')
else:
collection = self._get_collection()
collection.drop_indexes()
idx_tup_ls = [(i, pymongo.ASCENDING) for i in self._key_ls]
collection.create_index(idx_tup_ls, unique=True, name=self._name) # this will index it
print('collection %s successfully re-indexed' % self._name)
def count(self):
print(f'Database: {self._get_db_name()}')
print(f'Collection: {self._name}')
print(f'Query: {self._query}')
print(f'Documents: {self._collection.count_documents(self._query)}')
def replace_field_value(self, existing_field=True, **kwargs):
"""
make sure you declare the appropriate key=value arguments to be modified in your query!
"""
if len(kwargs) != 1:
raise Exception('please rename only one field at a time')
field = list(kwargs.keys())[0]
value = kwargs[field]
if field not in self.get_keys() and existing_field: # check init that everything is ok to reindex
raise Exception('new field needs to be declared in collection keys before replace / re-index')
if field == 'values':
raise Exception('values cannot be replace with this method. Use .update_values() instead')
if existing_field:
if not self._query:
raise Exception('Cannot replace key value without filters.')
if self._get_boolean_keys() and field in self._get_boolean_keys():
value = str(int(value))
cursor = self._get_cursor()
request_ls = []
for doc in cursor:
if existing_field:
doc.pop(field)
update_dd = {'$set': {field: value}}
request = pymongo.UpdateOne(doc, update_dd, upsert=False)
request_ls.append(request)
if len(request_ls) > 0:
self._get_collection().bulk_write(request_ls)
print(f'collection field {field} successfully updated to {value}')
if not existing_field:
self._reindex('confirm')
else:
print('no operations to handle')
@classmethod
def get_keys(self):
return [i for i in self._key_ls] # pass a copy
def get_key_values(self):
match_dd = {'$match': self._query}
key_dd = {k: f'${k}' for k in self._key_ls}
group_dd = {'$group': {'_id': key_dd}}
cursor = self._collection.aggregate([match_dd, group_dd])
ls = [i['_id'] for i in list(cursor)]
keys_df = pd.DataFrame(ls)
if not keys_df.empty:
keys_df = keys_df.sort_values(by=self._key_ls)
keys_df = keys_df.reset_index(drop=True)
return keys_df
def _is_local_instance(self):
return self._get_instance() == 'local'
def delete_many(self, confirm=''):
if not self._query:
raise Exception('Cannot delete_many without filters. Use drop_collection() instead for a full drop.')
lmt = self._del_many_limit
cnt = self._collection.count_documents(self._query)
if (cnt > lmt) and confirm != 'confirm':
raise Exception(f'confirm delete_many() if more than {lmt} documents. Potentially deleted: {cnt}')
before = self._collection.count_documents({})
self._get_collection().delete_many(self._query)
after = self._collection.count_documents({})
self._out = None
print(f'{before - after} document(s) deleted')
def distinct(self, field):
if isinstance(field, str):
field = field
cursor = self._get_cursor()
return cursor.distinct(field) # faster
elif isinstance(field, list):
group_dd = {"_id": {i: f'${i}' for i in field}}
if len(self._query) > 0:
cursor = self._collection.aggregate([{'$match': self._query},
{"$group": group_dd}])
else:
cursor = self._collection.aggregate([{"$group": group_dd}])
dd = [doc['_id'] for doc in cursor]
return pd.DataFrame().from_dict(dd)
class _MongoSeries(_MongoAbstract):
    """Collection of time series, one document per series, stored as
    ``{'values': [{'date': 'YYYYMMDD', 'value': ...}, ...]}``."""
    _drop_weekends = None  # subclass flag: drop Sat/Sun rows in query()

    def __init__(self, instance='remote', **query):
        _MongoAbstract.__init__(self, instance=instance, **query)

    def _get_drop_weekends(self):
        return self._drop_weekends

    @staticmethod
    def _to_date_str(value):
        """Normalize a datetime/date or 'YYYY-MM-DD' string to 'YYYYMMDD'."""
        if isinstance(value, (dt.datetime, dt.date)):
            value = value.strftime('%Y%m%d')
        return value.replace('-', '')

    def _update_query_date_range(self, start, end):
        """Add a values.date range condition to the stored query.

        Bug fix: the start-only branch previously matched equality instead of
        '$gte', unlike the end-only / both branches (and unlike the intent
        shown by the old commented-out line).
        """
        if start is not None and end is None:
            self._query['values.date'] = {'$gte': start}
        elif start is None and end is not None:
            self._query['values.date'] = {'$lte': end}
        elif start is not None and end is not None:
            self._query['values.date'] = {'$gte': start, '$lte': end}
        else:  # both None: nothing to add
            pass

    def _get_query_cursor(self, start, end):
        """Cursor over matching documents with 'values' filtered to the
        [start, end] date window (either bound may be None = open)."""
        if start is None and end is None:
            return self._get_cursor()  # simple case: no date filtering
        # normalize both bounds (previously only some branches accepted
        # dt.date, and an end passed as dt.date crashed on str.replace)
        if start is not None:
            start = self._to_date_str(start)
        if end is not None:
            end = self._to_date_str(end)
        if end is None:
            cond = {'$gte': ['$$vals.date', start]}
        elif start is None:
            cond = {'$lte': ['$$vals.date', end]}
        else:  # both bounds defined
            cond = {'$and': [{'$gte': ['$$vals.date', start]},
                             {'$lte': ['$$vals.date', end]}]}
        filter_dd = {'$filter': {'input': '$values', 'as': 'vals', 'cond': cond}}
        key_dd = {i: 1 for i in self.get_keys()}
        key_dd['values'] = filter_dd
        pipeline = [{'$match': self._query}, {'$project': key_dd}]
        return self._get_collection().aggregate(pipeline)

    def insert(self, pd_obj, col_key):
        """Overwrite the 'values' array of each series column of pd_obj.

        :param pd_obj: Series/DataFrame indexed by dates; column names are
            '.'-joined key values (one element per key field).
        :param col_key: key field name(s) matching _key_ls, in column order.
        """
        if pd_obj.empty:
            print('pd_obj empty, no data to insert')
            return
        if isinstance(pd_obj, pd.Series):
            pd_obj = pd.DataFrame(pd_obj)
        # check no dups
        if pd_obj.columns.has_duplicates:
            raise Exception('inserted object cannot have duplicated keys!')
        # initial checks
        if isinstance(col_key, str):
            col_key = [col_key]
        if len(self._key_ls) != len(col_key):
            name = self._name
            full = len(self._key_ls)
            given = len(col_key)
            raise Exception(f'insert error: {name}. col_key must have {full} elements. provided was {given}')
        df = pd_obj.copy()
        df.index = df.index.map(lambda i: i.strftime('%Y%m%d'))
        # integrity check: every column must split into one value per key
        for col in pd_obj.columns:
            col_val_ls = col.split('.')
            if len(self._full_key_ls) - 1 != len(col_val_ls):
                raise Exception(f'all columns must have {len(col_key)} elements. provided was {col_val_ls}')
        # do not store full NaN rows or columns
        df = df.dropna(how='all', axis=0)
        df = df.dropna(how='all', axis=1)
        request_ls = []
        for col in df.columns:
            query_dd = dict(zip(col_key, col.split('.')))
            values_dd = df[col].dropna().to_dict()
            values_ls = [{'date': k, 'value': v} for k, v in values_dd.items()]
            update_dd = {'$set': {'values': values_ls}}
            request_ls.append(pymongo.UpdateOne(query_dd, update_dd, upsert=True))
        if request_ls:  # bulk_write raises InvalidOperation on an empty list
            self._get_collection().bulk_write(request_ls)

    def last_update(self, how='each', order='min'):
        """Last stored date per series ('each' -> DataFrame) or aggregated
        across series ('all' with order 'min'/'max' -> Timestamp)."""
        match_dd = {'$match': self._query}
        key_dd = {k: f'${k}' for k in self._key_ls}
        group_dd = {
            '$group':
                {'_id':
                    {'_id': key_dd},
                 'max': {'$max': "$values.date"}
                 }
        }
        sort_dd = {'$sort': {'values.date': 1}}
        cursor = self._get_collection().aggregate([match_dd, group_dd, sort_dd])
        last_df = pd.DataFrame()
        for doc in cursor:
            dd = doc['_id']['_id']
            # NOTE(review): '$values.date' resolves to the per-document date
            # array, so doc['max'] is an array and [-1] assumes dates are
            # stored in ascending order -- confirm against insert() ordering.
            dd['last_update'] = doc['max'][-1]
            tmp_df = pd.DataFrame.from_dict(dd, orient='index').T
            last_df = pd.concat([last_df, tmp_df], axis=0, sort=True)
        last_df = last_df[self._key_ls + ['last_update']]
        last_df['last_update'] = last_df['last_update'].apply(pd.to_datetime)
        days_f = lambda x: (dt.datetime.today() - x).days
        last_df['last_update_days'] = last_df['last_update'].apply(days_f)
        if how == 'all':
            if order == 'max':
                return last_df['last_update'].max()
            elif order == 'min':
                return last_df['last_update'].min()
            else:
                raise Exception(f'argument order not recognized {order}')
        elif how == 'each':
            return last_df
        else:
            raise Exception(f'argument how not recognized: {how}')

    def query(self, start=None, end=None, rename=None, expand=False):
        """
        Return the matching series as a DataFrame (Series/scalar when the
        result has one column/row). Cached in self._out after the first call.
        """
        if self._out is None:
            cursor = self._get_query_cursor(start, end)
            if rename is not None:
                if isinstance(rename, str):
                    rename = [rename]
            exclude_ls = ['_id', 'values']
            df = pd.DataFrame()
            for doc in cursor:
                if len(doc['values']) > 0:
                    if rename is not None:
                        name = '.'.join([doc[i] for i in rename])
                    else:
                        name = '.'.join([doc[i] for i in sorted(doc.keys()) if i not in exclude_ls])
                    doc_df = pd.DataFrame().from_dict(doc['values']).set_index('date')
                    doc_df.columns = [name]
                    df = pd.concat([df, doc_df], axis=1, sort=True)
            if df.empty:
                return df
            df.index = df.index.map(pd.to_datetime)
            if self._get_drop_weekends():
                df = df.loc[df.index.weekday < 5].copy()
            if expand:
                df.columns = df.columns.str.split('.', expand=True)
            # return series if only one column or row
            if len(df.columns) == 1:
                df = df[df.columns[0]].copy()
            if len(df.index) == 1:
                df = df.iloc[-1].copy()
            self._out = df.copy()
        return self._out

    def drop_datapoint(self, date_str):
        """Blank out the value at date_str in the single matching series."""
        if self._collection.count_documents(self._query) == 0:
            raise Exception('no documents found with given filters')
        if self._collection.count_documents(self._query) > 1:
            raise Exception('you can only drop datapoints one series at a time')
        if isinstance(date_str, dt.datetime):
            date_str = date_str.strftime('%Y%m%d')
        request_ls = []
        for doc in self._get_cursor():
            doc.pop('_id')
            doc.pop('values')
            doc['values.date'] = date_str  # positional match on the dated element
            update_dd = {'$unset': {'values.$.value': ""}}
            request_ls.append(pymongo.UpdateOne(doc, update_dd, upsert=False))
        self._get_collection().bulk_write(request_ls)

    def update_values(self, pd_obj, col_key):
        """Upsert datapoints: $pull the affected dates, then $push the new
        datapoints, in one ordered bulk write per call."""
        if pd_obj.empty:
            print('pd_obj empty, no data to update')
            return
        # initial checks
        if isinstance(col_key, str):
            col_key = [col_key]
        if len(self._key_ls) != len(col_key):
            name = self._name
            full = len(self._key_ls)
            given = len(col_key)
            raise Exception(f'update error: {name}. col_key must have {full} elements. provided was {given}')
        if isinstance(pd_obj, pd.Series):
            df = pd.DataFrame(pd_obj)
        else:
            df = pd_obj.copy()
        # check no dups
        if df.columns.has_duplicates:
            raise Exception('inserted object cannot have duplicated keys!')
        # integrity check: every column must split into one value per key
        for col in df.columns:
            col_val_ls = col.split('.')
            if len(self._full_key_ls) - 1 != len(col_val_ls):
                raise Exception(f'all columns must have {len(col_key)} elements. provided was {col_val_ls}')
        df.index = df.index.map(lambda i: i.strftime('%Y%m%d'))
        df = df.dropna(axis=0, how='all')
        df = df.dropna(axis=1, how='all')
        request_ls = []
        for col in df.columns:
            query_dd = dict(zip(col_key, col.split('.')))
            values_dd = df[col].dropna().to_dict()
            # pull any existing datapoints on the affected dates
            date_ls = list(values_dd.keys())
            pull_dd = {'$pull': {'values': {'date': {'$in': date_ls}}}}
            request_ls.append(pymongo.UpdateMany(query_dd, pull_dd, upsert=True))
            # then push the fresh datapoints
            new_ls = [{'date': k, 'value': v} for k, v in values_dd.items()]
            push_dd = {'$push': {'values': {'$each': new_ls}}}
            request_ls.append(pymongo.UpdateMany(query_dd, push_dd, upsert=True))
        if request_ls:  # bulk_write raises InvalidOperation on an empty list
            self._get_collection().bulk_write(request_ls, ordered=True)
class _MongoDoc(_MongoAbstract):
    """Collection storing one flat dict per document under 'values'."""

    def __init__(self, instance='remote', **query):
        _MongoAbstract.__init__(self, instance=instance, **query)

    def insert(self, dd):
        """Upsert dict dd as the 'values' payload of the fully-keyed document.

        :param dd: mapping; keys are stringified before storage.
        :raises Exception: if the stored query does not pin every key field,
            or if two keys collapse to the same string representation.
        """
        if not isinstance(dd, dict):
            print('insert dict empty, no data to insert')
            return
        # initial checks: the filter must pin every key field exactly
        if sorted(list(self._get_query().keys())) != sorted(self.get_keys()):
            # message previously named _MongoTable (copy-paste)
            raise Exception(f'specify a full query to insert into a _MongoDoc! Keys are: {str(self.get_keys())}')
        cast_key_ls = set([str(k) for k in dd.keys()])
        if len(cast_key_ls) != len(dd.keys()):
            raise Exception('identical string representations of keys are not allowed')
        insert_dd = {str(k): v for k, v in dd.items()}
        filter_dd = self._get_query()
        update_dd = {'$set': {'values': insert_dd}}
        self._get_collection().update_one(filter_dd, update=update_dd, upsert=True)

    def query(self, expand=False, rename=None):
        """
        Return matching docs as a DataFrame, one column per document
        (Series/scalar when the result has one column/row); cached in
        self._out after the first call.
        """
        if rename:
            if isinstance(rename, str):
                rename = [rename]
        if self._out is None:
            cursor = self._get_cursor()  # simple case
            df = pd.DataFrame()
            for doc in cursor:
                if doc['values']:
                    doc.pop('_id')
                    val_dd = doc.pop('values')
                    if rename:
                        name = '.'.join([str(doc[k]) for k in rename])
                    else:
                        name = '.'.join([str(doc[k]) for k in sorted(doc.keys())])
                    s = pd.Series(val_dd, name=name, dtype=object)
                    df = pd.concat([df, s], axis=1, sort=True)
            # empty guard (present in the sibling query() implementations):
            # without it expand=True on an empty result raises on .str
            if df.empty:
                return df
            if expand:
                df.columns = df.columns.str.split('.', expand=True)
            # return series if only one column or row
            if len(df.columns) == 1:
                df = df[df.columns[0]].copy()
            if len(df.index) == 1:
                df = df.iloc[-1].copy()
            self._out = df
        return self._out
class _MongoTable(_MongoAbstract):
    """Collection storing one table (list of record dicts) per document."""

    def __init__(self, instance='remote', **query):
        _MongoAbstract.__init__(self, instance=instance, **query)

    def insert(self, pd_obj):
        """Upsert pd_obj (Series/DataFrame) as the 'values' records of the
        fully-keyed document. MultiIndex rows/columns are flattened to
        '.'-joined strings; all labels are stringified."""
        if pd_obj.empty:
            print('pd_obj empty, no data to insert')
            return
        if isinstance(pd_obj, pd.Series):
            pd_obj = pd.DataFrame(pd_obj)
        # check no dups
        if pd_obj.columns.has_duplicates:
            raise Exception('inserted object cannot have duplicated columns!')
        # initial checks: the filter must pin every key field exactly
        if sorted(list(self._get_query().keys())) != sorted(self.get_keys()):
            raise Exception(f'specify a full query to insert into a _MongoTable! Keys are: {str(self.get_keys())}')
        # drop nan
        df = pd_obj.copy()
        df = df.dropna(how='all', axis=0)
        df = df.dropna(how='all', axis=1)
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = ['.'.join([i for i in col]) for col in df.columns]
        if isinstance(df.index, pd.MultiIndex):
            # join the INDEX tuples (previously iterated df.columns by mistake)
            df.index = ['.'.join([i for i in idx]) for idx in df.index]
        df.columns = df.columns.map(str)
        df.index = df.index.map(str)
        values_dd = df.to_dict(orient='records')
        filter_dd = self._get_query()
        update_dd = {'$set': {'values': values_dd}}
        self._get_collection().update_one(filter_dd, update=update_dd, upsert=True)

    def query(self, rename=None, expand=False):
        """
        Return matching tables stacked into one DataFrame, indexed by the
        '.'-joined key values when more than one document matched
        (Series/scalar when the result has one column/row); cached in
        self._out after the first call.
        """
        if self._out is None:
            cursor = self._get_cursor()  # simple case
            if rename is not None:
                if isinstance(rename, str):
                    rename = [rename]
            exclude_ls = ['_id', 'values']
            out_df = pd.DataFrame()
            cnt = 0
            for num, doc in enumerate(cursor):
                if len(doc['values']) > 0:
                    df = pd.DataFrame().from_records(doc['values'])
                    if rename is not None:
                        name = '.'.join([doc[i] for i in rename])
                    else:
                        name = '.'.join([doc[i] for i in sorted(doc.keys()) if i not in exclude_ls])
                    df['_id'] = [name] * df.shape[0]
                    out_df = pd.concat([out_df, df], axis=0, sort=True)
                    cnt += 1
            if out_df.empty:
                return out_df
            if cnt > 1:
                # several documents: index by the joined key names
                out_df = out_df.reset_index(drop=False)
                out_df = out_df.set_index('_id', drop=True)
                out_df.index.name = None
            else:
                # single document: drop the synthetic id column
                out_df = out_df.drop('_id', axis=1)
            if expand:
                out_df.index = out_df.index.str.split('.', expand=True)
            # return series if only one column or row
            if len(out_df.columns) == 1:
                out_df = out_df[out_df.columns[0]].copy()
            if len(out_df.index) == 1:
                out_df = out_df.iloc[-1].copy()
            self._out = out_df.copy()
        return self._out
class _MongoLog(_MongoAbstract):
    """Collection of log rows pushed per (section, iso_date) document."""

    def __init__(self, instance='remote', **query):
        _MongoAbstract.__init__(self, instance=instance, **query)

    def _log(self, df):
        """Append one row per (section, iso_date) index entry of df to the
        matching documents' 'values' arrays (upserting as needed)."""
        request_ls = []
        if not df.empty:
            for (section, iso_dt), row_s in df.iterrows():
                doc_id = {'section': section,
                          'iso_date': iso_dt}
                update_dd = {'$push': {'values': row_s.to_dict()}}
                request_ls.append(pymongo.UpdateOne(doc_id, update_dd, upsert=True))
        # guard: bulk_write raises InvalidOperation on an empty request list
        # (previously hit whenever df was empty)
        if request_ls:
            self._get_collection().bulk_write(request_ls)

    def query(self, rename=None, expand=False):
        """
        Return matching log documents stacked into one DataFrame, indexed by
        the '.'-joined key values (Series/scalar when the result has one
        column/row); cached in self._out after the first call.
        """
        if self._out is None:
            cursor = self._get_cursor()  # simple case
            if rename is not None:
                if isinstance(rename, str):
                    rename = [rename]
            exclude_ls = ['_id', 'values']
            df = pd.DataFrame()
            for doc in cursor:
                if len(doc['values']) > 0:
                    if rename is not None:
                        name = '.'.join([doc[i] for i in rename])
                    else:
                        name = '.'.join([doc[i] for i in sorted(doc.keys()) if i not in exclude_ls])
                    doc_df = pd.DataFrame().from_records(doc['values'])
                    doc_df.index = [name] * len(doc_df.index)
                    df = pd.concat([df, doc_df], axis=0, sort=True)
            if df.empty:
                return df
            if expand:
                df.index = df.index.str.split('.', expand=True)
            # return series if only one column or row
            if len(df.columns) == 1:
                df = df[df.columns[0]].copy()
            if len(df.index) == 1:
                df = df.iloc[-1].copy()
            self._out = df.copy()
        return self._out