antonlabate
ver 1.3
d758c99
raw
history blame contribute delete
5.52 kB
import os
import sys
import json
import sqlite3
from os import listdir, makedirs
from os.path import isfile, isdir, join, split, exists, splitext
from nltk import word_tokenize, tokenize
import traceback
EXIST = {"atis", "geo", "advising", "yelp", "restaurants", "imdb", "academic"}
def convert_fk_index(data):
fk_holder = []
for fk in data["foreign_keys"]:
tn, col, ref_tn, ref_col = fk[0][0], fk[0][1], fk[1][0], fk[1][1]
ref_cid, cid = None, None
try:
tid = data['table_names_original'].index(tn)
ref_tid = data['table_names_original'].index(ref_tn)
for i, (tab_id, col_org) in enumerate(data['column_names_original']):
if tab_id == ref_tid and ref_col == col_org:
ref_cid = i
elif tid == tab_id and col == col_org:
cid = i
if ref_cid and cid:
fk_holder.append([cid, ref_cid])
except:
traceback.print_exc()
print("table_names_original: ", data['table_names_original'])
print("finding tab name: ", tn, ref_tn)
sys.exit()
return fk_holder
def dump_db_json_schema(db, f):
'''read table and column info'''
conn = sqlite3.connect(db)
conn.execute('pragma foreign_keys=ON')
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
data = {'db_id': f,
'table_names_original': [],
'table_names': [],
'column_names_original': [(-1, '*')],
'column_names': [(-1, '*')],
'column_types': ['text'],
'primary_keys': [],
'foreign_keys': []}
fk_holder = []
for i, item in enumerate(cursor.fetchall()):
table_name = item[0]
data['table_names_original'].append(table_name)
data['table_names'].append(table_name.lower().replace("_", ' '))
fks = conn.execute("PRAGMA foreign_key_list('{}') ".format(table_name)).fetchall()
#print("db:{} table:{} fks:{}".format(f,table_name,fks))
fk_holder.extend([[(table_name, fk[3]), (fk[2], fk[4])] for fk in fks])
cur = conn.execute("PRAGMA table_info('{}') ".format(table_name))
for j, col in enumerate(cur.fetchall()):
data['column_names_original'].append((i, col[1]))
data['column_names'].append((i, col[1].lower().replace("_", " ")))
#varchar, '' -> text, int, numeric -> integer,
col_type = col[2].lower()
if 'char' in col_type or col_type == '' or 'text' in col_type or 'var' in col_type:
data['column_types'].append('text')
elif 'int' in col_type or 'numeric' in col_type or 'decimal' in col_type or 'number' in col_type\
or 'id' in col_type or 'real' in col_type or 'double' in col_type or 'float' in col_type:
data['column_types'].append('number')
elif 'date' in col_type or 'time' in col_type or 'year' in col_type:
data['column_types'].append('time')
elif 'boolean' in col_type:
data['column_types'].append('boolean')
else:
data['column_types'].append('others')
if col[5] == 1:
data['primary_keys'].append(len(data['column_names'])-1)
data["foreign_keys"] = fk_holder
data['foreign_keys'] = convert_fk_index(data)
return data
if __name__ == '__main__':
if len(sys.argv) < 2:
print("Usage: python get_tables.py [dir includes many subdirs containing database.sqlite files] [output file name e.g. output.json] [existing tables.json file to be inherited]")
sys.exit()
input_dir = sys.argv[1]
output_file = sys.argv[2]
ex_tab_file = sys.argv[3]
all_fs = [df for df in listdir(input_dir) if exists(join(input_dir, df, df+'.sqlite'))]
with open(ex_tab_file, encoding='utf8') as f:
ex_tabs = json.load(f)
#for tab in ex_tabs:
# tab["foreign_keys"] = convert_fk_index(tab)
ex_tabs = {tab["db_id"]: tab for tab in ex_tabs if tab["db_id"] in all_fs}
print("precessed file num: ", len(ex_tabs))
not_fs = [df for df in listdir(input_dir) if not exists(join(input_dir, df, df+'.sqlite'))]
for d in not_fs:
print("no sqlite file found in: ", d)
db_files = [(df+'.sqlite', df) for df in listdir(input_dir) if exists(join(input_dir, df, df+'.sqlite'))]
tables = []
for f, df in db_files:
#if df in ex_tabs.keys():
#print 'reading old db: ', df
# tables.append(ex_tabs[df])
db = join(input_dir, df, f)
print('\nreading new db: ', df)
table = dump_db_json_schema(db, df)
prev_tab_num = len(ex_tabs[df]["table_names"])
prev_col_num = len(ex_tabs[df]["column_names"])
cur_tab_num = len(table["table_names"])
cur_col_num = len(table["column_names"])
if df in ex_tabs.keys() and prev_tab_num == cur_tab_num and prev_col_num == cur_col_num and prev_tab_num != 0 and len(ex_tabs[df]["column_names"]) > 1:
table["table_names"] = ex_tabs[df]["table_names"]
table["column_names"] = ex_tabs[df]["column_names"]
else:
print("\n----------------------------------problem db: ", df)
tables.append(table)
print("final db num: ", len(tables))
with open(output_file, 'wt', encoding='utf8') as out:
json.dump(tables, out, sort_keys=True, indent=2, separators=(',', ': '), ensure_ascii=False)