Spaces:

jiangjiechen
/

loren-fact-checking

Build error

App Files Files Community

loren-fact-checking / src /qg_client /cjjpy.py

jiangjiechen

init loren for spaces

7f7285f almost 3 years ago

raw

history blame

7.41 kB

	# -*- coding: utf-8 -*-

	'''
	@Author : Jiangjie Chen
	@Time : 2018/11/15 17:08
	@Contact: [email protected]
	'''

	import re
	import datetime
	import os
	import argparse
	import logging
	import traceback

	# Prefer the faster third-party ujson when it is installed; otherwise fall
	# back to the standard-library json module under the same name.
	try:
	    import ujson as json
	except ImportError:
	    import json

	# Shell command prefix that runs the `hdfs` CLI with /usr/bin/ prepended to PATH.
	# NOTE(review): not referenced by any code visible in this file.
	HADOOP_BIN = 'PATH=/usr/bin/:$PATH hdfs'
	# NOTE(review): presumably flags this copy as the public release of the
	# module; no visible code reads it -- confirm before relying on it.
	FOR_PUBLIC = True


	def LengthStats(filename):
	    """Print and return token-length statistics of a line-by-line text file.

	    The length of a line is the number of whitespace-separated tokens on it.

	    Args:
	        filename: Path of the text file to read.

	    Returns:
	        A dict with 'Max', 'Min', 'Avg' (rounded to 4 decimals) and one
	        'Top-<t>' entry per threshold, holding the length found at index
	        int(n * t) of the ascending-sorted lengths (n = number of lines).

	    Raises:
	        ValueError: If the file contains no lines at all.
	    """
	    thresholds = [0.8, 0.9, 0.95, 0.99, 0.999]
	    with open(filename) as f:
	        # str.split() without arguments already ignores surrounding whitespace.
	        len_list = sorted(len(line.split()) for line in f)

	    if not len_list:
	        # Fail with a clear message instead of an opaque max()/division error.
	        raise ValueError(f'{filename} is empty; cannot compute length statistics')

	    total = len(len_list)
	    stats = {
	        'Max': len_list[-1],
	        'Min': len_list[0],
	        'Avg': round(sum(len_list) / total, 4),
	    }
	    for t in thresholds:
	        # Every threshold is < 1, so int(total * t) is always a valid index.
	        stats[f"Top-{t}"] = len_list[int(total * t)]

	    for k, v in stats.items():
	        print(f"- {k}: {v}")
	    return stats


	class AttrDict(dict):
	    """A dict whose items can also be read and written as attributes."""

	    def __init__(self, *args, **kwargs):
	        """Accept exactly the same arguments as the built-in ``dict``."""
	        super(AttrDict, self).__init__(*args, **kwargs)
	        # Use the dict itself as the instance namespace so that d.key and
	        # d['key'] always refer to the same storage.
	        self.__dict__ = self


	def TraceBack(error_msg):
	    """Return ``error_msg`` combined with the currently handled traceback.

	    The traceback text comes from ``traceback.format_exc()``, so this is
	    meant to be called from inside an ``except`` block.
	    """
	    return '[Error]: {}.\n[Traceback]: {}'.format(error_msg, traceback.format_exc())


	def Now():
	    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
	    current = datetime.datetime.now()
	    return f"{current:%Y-%m-%d %H:%M:%S}"


	def AbsParentDir(file, parent='..', postfix=None):
	    """Return an ancestor directory of ``file`` as an absolute path.

	    Args:
	        file: Path to start from; it is made absolute first.
	        parent: String whose number of '.' characters gives how many
	            directory levels to climb ('.' -> 1 level, '..' -> 2 levels).
	        postfix: Optional path component joined onto the result.

	    Returns:
	        The ancestor path, with ``postfix`` appended when it is given.
	    """
	    target = os.path.abspath(file)
	    for _ in range(parent.count('.')):
	        target = os.path.dirname(target)
	    return target if postfix is None else os.path.join(target, postfix)


	def init_logger(log_file=None, log_file_level=logging.NOTSET, from_scratch=False):
	    """Configure the root logger with colored console output and an optional file.

	    Any handlers already attached to the root logger are replaced by a single
	    console handler. When ``log_file`` is given, a file handler using the same
	    format is added as well.

	    Args:
	        log_file: Path of the log file; falsy values disable file logging.
	        log_file_level: Level set on the root logger and on the file handler.
	        from_scratch: If true, an existing ``log_file`` is deleted first.

	    Returns:
	        The configured root logger.
	    """
	    from coloredlogs import ColoredFormatter

	    fmt = "[%(asctime)s %(levelname)s] %(message)s"
	    log_format = ColoredFormatter(fmt=fmt)
	    logger = logging.getLogger()
	    logger.setLevel(log_file_level)

	    console_handler = logging.StreamHandler()
	    console_handler.setFormatter(log_format)
	    logger.handlers = [console_handler]

	    if log_file:
	        if from_scratch:
	            # TensorFlow is only needed for the optional clean-up, so import
	            # it lazily instead of requiring it for every logger set-up.
	            import tensorflow as tf

	            if tf.io.gfile.exists(log_file):
	                logger.warning('Removing previous log file: %s' % log_file)
	                tf.io.gfile.remove(log_file)
	        path = os.path.dirname(log_file)
	        # A bare file name has no directory part, and os.makedirs('') raises.
	        if path:
	            os.makedirs(path, exist_ok=True)
	        file_handler = logging.FileHandler(log_file)
	        file_handler.setLevel(log_file_level)
	        file_handler.setFormatter(log_format)
	        logger.addHandler(file_handler)

	    return logger


	def OverWriteCjjPy(root='.'):
	    """Overwrite every ``cjjpy.py`` found under ``root`` with ``root/cjjpy.py``.

	    Each file found is printed with a running index. The golden copy itself
	    is listed but not copied onto itself. A failed copy is reported on stderr
	    and does not stop the walk.

	    Args:
	        root: Directory that holds the golden ``cjjpy.py`` and is searched
	            recursively for other copies.
	    """
	    import shutil
	    import sys

	    golden_cjjpy = os.path.join(root, 'cjjpy.py')
	    golden_abs = os.path.abspath(golden_cjjpy)
	    cnt = 0
	    for dirpath, _dirnames, filenames in os.walk(root):
	        if 'cjjpy.py' not in filenames:
	            continue
	        cjjpy = '%s/%s' % (dirpath, 'cjjpy.py')
	        cnt += 1
	        print('[%d]: %s' % (cnt, cjjpy))
	        if os.path.abspath(cjjpy) == golden_abs:
	            # Copying the golden file onto itself is pointless.
	            continue
	        try:
	            # Copy in-process rather than through a shell-built `cp` command,
	            # which broke on paths containing spaces or shell metacharacters.
	            shutil.copyfile(golden_cjjpy, cjjpy)
	        except OSError as err:
	            print('cannot overwrite %s: %s' % (cjjpy, err), file=sys.stderr)


	def ChangeFileFormat(filename, new_fmt):
	    """Replace the final extension of ``filename`` with ``new_fmt``.

	    Args:
	        filename: File name or path as a string.
	        new_fmt: Replacement extension, including its leading dot
	            (for example '.json').

	    Returns:
	        ``filename`` with everything from its last '.' onwards replaced by
	        ``new_fmt``; ``filename`` unchanged when it contains no '.'.
	    """
	    assert isinstance(filename, str) and isinstance(new_fmt, str)
	    # Split on the last dot only, so an earlier occurrence of the same
	    # extension text (e.g. 'a.txt.txt') is left untouched.
	    stem, dot, _old_ext = filename.rpartition('.')
	    if not dot:
	        return filename
	    return stem + new_fmt


	def CountLines(fname):
	    """Count the lines of a file by reading it in 4 MiB binary chunks.

	    A final line without a trailing newline is counted too, so the result
	    can be one larger than what ``wc -l`` reports.

	    Args:
	        fname: Path of the file to read.

	    Returns:
	        Number of lines; 0 for an empty file.
	    """
	    count = 0
	    # Start from a bytes newline so an empty file is not mistaken for one
	    # unterminated line (a str '\n' never compares equal to b'\n').
	    last_data = b'\n'
	    with open(fname, 'rb') as f:
	        for data in iter(lambda: f.read(0x400000), b''):
	            count += data.count(b'\n')
	            last_data = data
	    if last_data[-1:] != b'\n':
	        count += 1  # Remove this if a wc-like count is needed
	    return count


	def GetDate():
	    """Return today's local date as a four-digit 'MMDD' string."""
	    today = datetime.datetime.now()
	    return today.strftime('%m%d')


	def TimeClock(seconds):
	    """Format a duration in seconds as a fixed-width '<h>h<m>m<s>s' string.

	    Hours and minutes are derived from the integer part of ``seconds``; the
	    seconds field keeps the fractional part, shown with two decimals.
	    """
	    whole = int(seconds)
	    hours = int(whole / 3600)
	    minutes = int((whole - hours * 3600) / 60)
	    remainder = float(seconds) - hours * 3600 - minutes * 60
	    return f'{hours:>2d}h{minutes:>3d}m{remainder:>6.2f}s'


	def StripAll(text):
	    """Strip ``text`` and delete every tab, newline and space inside it."""
	    drop_table = str.maketrans('', '', '\t\n ')
	    trimmed = text.strip()
	    return trimmed.translate(drop_table)


	def GetBracket(text, bracket, en_br=False):
	    """Extract or remove bracketed parts of text shaped like 'aa（bb）cc'.

	    Args:
	        text: Input string.
	        bracket: If true, return the content of the last full-width bracket
	            pair '（…）'. If false, return the text with bracketed parts
	            removed.
	        en_br: Only used when ``bracket`` is false; additionally remove
	            ASCII bracket pairs '(…)'.

	    Returns:
	        The bracket content ('' when there is no full-width pair), or the
	        stripped text without its bracketed parts.
	    """
	    stripped = text.strip()
	    if bracket:
	        matches = re.findall(r'（(.*?)）', stripped)
	        return matches[-1] if matches else ''
	    if en_br:
	        # Remove ASCII "(...)" groups. The previous pattern '$.*?$' could
	        # only match the empty string at the end, so it removed nothing.
	        stripped = re.sub(r'\(.*?\)', '', stripped).strip()
	    return re.sub(r'（.*?）', '', stripped)


	def CharLang(uchar, lang):
	    """Tell whether ``uchar`` belongs to the given language's character range.

	    Args:
	        uchar: The character to test.
	        lang: 'en' for ASCII letters A-Z / a-z, 'cn' or 'zh' for the range
	            U+4E00..U+9FA5; matched case-insensitively.

	    Returns:
	        True if ``uchar`` falls inside the range for ``lang``, else False.
	    """
	    code = lang.lower()
	    assert code in ['en', 'cn', 'zh']
	    if code in ('cn', 'zh'):
	        return '\u4e00' <= uchar <= '\u9fa5'
	    if code == 'en':
	        return 'A' <= uchar <= 'Z' or 'a' <= uchar <= 'z'
	    raise NotImplementedError


	def WordLang(word, lang):
	    """Return True if every non-whitespace character of ``word`` is in ``lang``.

	    Each character is checked with ``CharLang``; whitespace is skipped.
	    """
	    visible_chars = (ch for ch in word.strip() if not ch.isspace())
	    return all(CharLang(ch, lang) for ch in visible_chars)


	def SortDict(_dict, reverse=True):
	    """Return the items of ``_dict`` as a list of (key, value) sorted by value.

	    Args:
	        _dict: A plain ``dict``.
	        reverse: Sort from largest to smallest value when true (default).
	    """
	    assert type(_dict) is dict
	    pairs = list(_dict.items())
	    pairs.sort(key=lambda pair: pair[1], reverse=reverse)
	    return pairs


	def lark(content='test'):
	    """Print ``content`` to standard output."""
	    print(content)


	if __name__ == '__main__':
	    # Command-line entry point. Only --overwrite, --lark and --length_stats
	    # are acted upon below; the remaining options are still accepted so that
	    # existing command lines keep parsing.
	    parser = argparse.ArgumentParser()

	    parser.add_argument('--diff', nargs=2,
	                        help='show difference between two files, shown in downloads/diff.html')
	    parser.add_argument('--de_unicode', action='store_true', default=False,
	                        help='remove unicode characters')
	    parser.add_argument('--link_entity', action='store_true', default=False,
	                        help='')
	    parser.add_argument('--max_comm_len', action='store_true', default=False,
	                        help='')
	    parser.add_argument('--search', nargs=2,
	                        help='search key from file, 2 args: file name & key')
	    parser.add_argument('--email', nargs=2,
	                        help='sending emails, 2 args: subject & content')
	    parser.add_argument('--overwrite', action='store_true', default=None,
	                        help='overwrite all cjjpy under given dir based on dir/cjjpy.py')
	    parser.add_argument('--replace', nargs=3,
	                        help='replace char, 3 args: file name & replaced char & replacer char')
	    parser.add_argument('--lark', nargs=1)
	    parser.add_argument('--get_hdfs', nargs=2,
	                        help='easy copy from hdfs to local fs, 2 args: remote_file/dir & local_dir')
	    parser.add_argument('--put_hdfs', nargs=2,
	                        help='easy put from local fs to hdfs, 2 args: local_file/dir & remote_dir')
	    parser.add_argument('--length_stats', nargs=1,
	                        help='simple token lengths distribution of a line-by-line file')

	    args = parser.parse_args()

	    if args.overwrite:
	        print('* Overwriting cjjpy...')
	        OverWriteCjjPy()

	    if args.lark:
	        # nargs=1 yields a one-element list whenever the option is given, so
	        # indexing cannot fail inside this branch.
	        content = args.lark[0]
	        print(f'* Larking "{content}"...')
	        lark(content)

	    if args.length_stats:
	        stats_file = args.length_stats[0]
	        print(f'* Working on {stats_file} lengths statistics...')
	        LengthStats(stats_file)