Spaces:
Build error
Build error
# -*- coding: utf-8 -*- | |
''' | |
@Author : Jiangjie Chen | |
@Time : 2018/11/15 17:08 | |
@Contact: [email protected] | |
''' | |
import re | |
import datetime | |
import os | |
import argparse | |
import logging | |
import traceback | |
try: | |
import ujson as json | |
except: | |
import json | |
HADOOP_BIN = 'PATH=/usr/bin/:$PATH hdfs' | |
FOR_PUBLIC = True | |
def LengthStats(filename): | |
len_list = [] | |
thresholds = [0.8, 0.9, 0.95, 0.99, 0.999] | |
with open(filename) as f: | |
for line in f: | |
len_list.append(len(line.strip().split())) | |
stats = { | |
'Max': max(len_list), | |
'Min': min(len_list), | |
'Avg': round(sum(len_list) / len(len_list), 4), | |
} | |
len_list.sort() | |
for t in thresholds: | |
stats[f"Top-{t}"] = len_list[int(len(len_list) * t)] | |
for k in stats: | |
print(f"- {k}: {stats[k]}") | |
return stats | |
class AttrDict(dict): | |
def __init__(self, *args, **kwargs): | |
super(AttrDict, self).__init__(*args, **kwargs) | |
self.__dict__ = self | |
def TraceBack(error_msg): | |
exc = traceback.format_exc() | |
msg = f'[Error]: {error_msg}.\n[Traceback]: {exc}' | |
return msg | |
def Now(): | |
return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") | |
def AbsParentDir(file, parent='..', postfix=None): | |
ppath = os.path.abspath(file) | |
parent_level = parent.count('.') | |
while parent_level > 0: | |
ppath = os.path.dirname(ppath) | |
parent_level -= 1 | |
if postfix is not None: | |
return os.path.join(ppath, postfix) | |
else: | |
return ppath | |
def init_logger(log_file=None, log_file_level=logging.NOTSET, from_scratch=False): | |
from coloredlogs import ColoredFormatter | |
import tensorflow as tf | |
fmt = "[%(asctime)s %(levelname)s] %(message)s" | |
log_format = ColoredFormatter(fmt=fmt) | |
# log_format = logging.Formatter() | |
logger = logging.getLogger() | |
logger.setLevel(log_file_level) | |
console_handler = logging.StreamHandler() | |
console_handler.setFormatter(log_format) | |
logger.handlers = [console_handler] | |
if log_file and log_file != '': | |
if from_scratch and tf.io.gfile.exists(log_file): | |
logger.warning('Removing previous log file: %s' % log_file) | |
tf.io.gfile.remove(log_file) | |
path = os.path.dirname(log_file) | |
os.makedirs(path, exist_ok=True) | |
file_handler = logging.FileHandler(log_file) | |
file_handler.setLevel(log_file_level) | |
file_handler.setFormatter(log_format) | |
logger.addHandler(file_handler) | |
return logger | |
def OverWriteCjjPy(root='.'): | |
# import difflib | |
# diff = difflib.HtmlDiff() | |
cnt = 0 | |
golden_cjjpy = os.path.join(root, 'cjjpy.py') | |
# golden_content = open(golden_cjjpy).readlines() | |
for dir, folder, file in os.walk(root): | |
for f in file: | |
if f == 'cjjpy.py': | |
cjjpy = '%s/%s' % (dir, f) | |
# content = open(cjjpy).readlines() | |
# d = diff.make_file(golden_content, content) | |
cnt += 1 | |
print('[%d]: %s' % (cnt, cjjpy)) | |
os.system('cp %s %s' % (golden_cjjpy, cjjpy)) | |
def ChangeFileFormat(filename, new_fmt): | |
assert type(filename) is str and type(new_fmt) is str | |
spt = filename.split('.') | |
if len(spt) == 0: | |
return filename | |
else: | |
return filename.replace('.' + spt[-1], new_fmt) | |
def CountLines(fname): | |
with open(fname, 'rb') as f: | |
count = 0 | |
last_data = '\n' | |
while True: | |
data = f.read(0x400000) | |
if not data: | |
break | |
count += data.count(b'\n') | |
last_data = data | |
if last_data[-1:] != b'\n': | |
count += 1 # Remove this if a wc-like count is needed | |
return count | |
def GetDate(): | |
return str(datetime.datetime.now())[5:10].replace('-', '') | |
def TimeClock(seconds): | |
sec = int(seconds) | |
hour = int(sec / 3600) | |
min = int((sec - hour * 3600) / 60) | |
ssec = float(seconds) - hour * 3600 - min * 60 | |
# return '%dh %dm %.2fs' % (hour, min, ssec) | |
return '{0:>2d}h{1:>3d}m{2:>6.2f}s'.format(hour, min, ssec) | |
def StripAll(text): | |
return text.strip().replace('\t', '').replace('\n', '').replace(' ', '') | |
def GetBracket(text, bracket, en_br=False): | |
# input should be aa(bb)cc, True for bracket, False for text | |
if bracket: | |
try: | |
return re.findall('\((.*?)\)', text.strip())[-1] | |
except: | |
return '' | |
else: | |
if en_br: | |
text = re.sub('\(.*?\)', '', text.strip()) | |
return re.sub('(.*?)', '', text.strip()) | |
def CharLang(uchar, lang): | |
assert lang.lower() in ['en', 'cn', 'zh'] | |
if lang.lower() in ['cn', 'zh']: | |
if uchar >= '\u4e00' and uchar <= '\u9fa5': | |
return True | |
else: | |
return False | |
elif lang.lower() == 'en': | |
if (uchar <= 'Z' and uchar >= 'A') or (uchar <= 'z' and uchar >= 'a'): | |
return True | |
else: | |
return False | |
else: | |
raise NotImplementedError | |
def WordLang(word, lang): | |
for i in word.strip(): | |
if i.isspace(): continue | |
if not CharLang(i, lang): | |
return False | |
return True | |
def SortDict(_dict, reverse=True): | |
assert type(_dict) is dict | |
return sorted(_dict.items(), key=lambda d: d[1], reverse=reverse) | |
def lark(content='test'): | |
print(content) | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser() | |
parser.add_argument('--diff', nargs=2, | |
help='show difference between two files, shown in downloads/diff.html') | |
parser.add_argument('--de_unicode', action='store_true', default=False, | |
help='remove unicode characters') | |
parser.add_argument('--link_entity', action='store_true', default=False, | |
help='') | |
parser.add_argument('--max_comm_len', action='store_true', default=False, | |
help='') | |
parser.add_argument('--search', nargs=2, | |
help='search key from file, 2 args: file name & key') | |
parser.add_argument('--email', nargs=2, | |
help='sending emails, 2 args: subject & content') | |
parser.add_argument('--overwrite', action='store_true', default=None, | |
help='overwrite all cjjpy under given *dir* based on *dir*/cjjpy.py') | |
parser.add_argument('--replace', nargs=3, | |
help='replace char, 3 args: file name & replaced char & replacer char') | |
parser.add_argument('--lark', nargs=1) | |
parser.add_argument('--get_hdfs', nargs=2, | |
help='easy copy from hdfs to local fs, 2 args: remote_file/dir & local_dir') | |
parser.add_argument('--put_hdfs', nargs=2, | |
help='easy put from local fs to hdfs, 2 args: local_file/dir & remote_dir') | |
parser.add_argument('--length_stats', nargs=1, | |
help='simple token lengths distribution of a line-by-line file') | |
args = parser.parse_args() | |
if args.overwrite: | |
print('* Overwriting cjjpy...') | |
OverWriteCjjPy() | |
if args.lark: | |
try: | |
content = args.lark[0] | |
except: | |
content = 'running complete' | |
print(f'* Larking "{content}"...') | |
lark(content) | |
if args.length_stats: | |
file = args.length_stats[0] | |
print(f'* Working on {file} lengths statistics...') | |
LengthStats(file) | |