Spaces:

jiangjiechen
/

loren-fact-checking

Build error

File size: 7,413 Bytes

20e9c0d

# -*- coding: utf-8 -*-

'''
@Author : Jiangjie Chen
@Time   : 2018/11/15 17:08
@Contact: [email protected]
'''

import re
import datetime
import os
import argparse
import logging
import traceback

try:
    import ujson as json
except:
    import json

HADOOP_BIN = 'PATH=/usr/bin/:$PATH hdfs'
FOR_PUBLIC = True


def LengthStats(filename):
    len_list = []
    thresholds = [0.8, 0.9, 0.95, 0.99, 0.999]
    with open(filename) as f:
        for line in f:
            len_list.append(len(line.strip().split()))
    stats = {
        'Max': max(len_list),
        'Min': min(len_list),
        'Avg': round(sum(len_list) / len(len_list), 4),
    }
    len_list.sort()
    for t in thresholds:
        stats[f"Top-{t}"] = len_list[int(len(len_list) * t)]

    for k in stats:
        print(f"- {k}: {stats[k]}")
    return stats


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def TraceBack(error_msg):
    exc = traceback.format_exc()
    msg = f'[Error]: {error_msg}.\n[Traceback]: {exc}'
    return msg


def Now():
    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def AbsParentDir(file, parent='..', postfix=None):
    ppath = os.path.abspath(file)
    parent_level = parent.count('.')
    while parent_level > 0:
        ppath = os.path.dirname(ppath)
        parent_level -= 1
    if postfix is not None:
        return os.path.join(ppath, postfix)
    else:
        return ppath


def init_logger(log_file=None, log_file_level=logging.NOTSET, from_scratch=False):
    from coloredlogs import ColoredFormatter
    import tensorflow as tf

    fmt = "[%(asctime)s %(levelname)s] %(message)s"
    log_format = ColoredFormatter(fmt=fmt)
    # log_format = logging.Formatter()
    logger = logging.getLogger()
    logger.setLevel(log_file_level)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(log_format)
    logger.handlers = [console_handler]

    if log_file and log_file != '':
        if from_scratch and tf.io.gfile.exists(log_file):
            logger.warning('Removing previous log file: %s' % log_file)
            tf.io.gfile.remove(log_file)
        path = os.path.dirname(log_file)
        os.makedirs(path, exist_ok=True)
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(log_file_level)
        file_handler.setFormatter(log_format)
        logger.addHandler(file_handler)

    return logger


def OverWriteCjjPy(root='.'):
    # import difflib
    # diff = difflib.HtmlDiff()
    cnt = 0
    golden_cjjpy = os.path.join(root, 'cjjpy.py')
    # golden_content = open(golden_cjjpy).readlines()
    for dir, folder, file in os.walk(root):
        for f in file:
            if f == 'cjjpy.py':
                cjjpy = '%s/%s' % (dir, f)
                # content = open(cjjpy).readlines()
                # d = diff.make_file(golden_content, content)
                cnt += 1
                print('[%d]: %s' % (cnt, cjjpy))
                os.system('cp %s %s' % (golden_cjjpy, cjjpy))


def ChangeFileFormat(filename, new_fmt):
    assert type(filename) is str and type(new_fmt) is str
    spt = filename.split('.')
    if len(spt) == 0:
        return filename
    else:
        return filename.replace('.' + spt[-1], new_fmt)


def CountLines(fname):
    with open(fname, 'rb') as f:
        count = 0
        last_data = '\n'
        while True:
            data = f.read(0x400000)
            if not data:
                break
            count += data.count(b'\n')
            last_data = data
        if last_data[-1:] != b'\n':
            count += 1  # Remove this if a wc-like count is needed
    return count


def GetDate():
    return str(datetime.datetime.now())[5:10].replace('-', '')


def TimeClock(seconds):
    sec = int(seconds)
    hour = int(sec / 3600)
    min = int((sec - hour * 3600) / 60)
    ssec = float(seconds) - hour * 3600 - min * 60
    # return '%dh %dm %.2fs' % (hour, min, ssec)
    return '{0:>2d}h{1:>3d}m{2:>6.2f}s'.format(hour, min, ssec)


def StripAll(text):
    return text.strip().replace('\t', '').replace('\n', '').replace(' ', '')


def GetBracket(text, bracket, en_br=False):
    # input should be aa(bb)cc, True for bracket, False for text
    if bracket:
        try:
            return re.findall('\（(.*?)\）', text.strip())[-1]
        except:
            return ''
    else:
        if en_br:
            text = re.sub('\(.*?\)', '', text.strip())
        return re.sub('（.*?）', '', text.strip())


def CharLang(uchar, lang):
    assert lang.lower() in ['en', 'cn', 'zh']
    if lang.lower() in ['cn', 'zh']:
        if uchar >= '\u4e00' and uchar <= '\u9fa5':
            return True
        else:
            return False
    elif lang.lower() == 'en':
        if (uchar <= 'Z' and uchar >= 'A') or (uchar <= 'z' and uchar >= 'a'):
            return True
        else:
            return False
    else:
        raise NotImplementedError


def WordLang(word, lang):
    for i in word.strip():
        if i.isspace(): continue
        if not CharLang(i, lang):
            return False
    return True


def SortDict(_dict, reverse=True):
    assert type(_dict) is dict
    return sorted(_dict.items(), key=lambda d: d[1], reverse=reverse)


def lark(content='test'):
    print(content)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--diff', nargs=2,
                        help='show difference between two files, shown in downloads/diff.html')
    parser.add_argument('--de_unicode', action='store_true', default=False,
                        help='remove unicode characters')
    parser.add_argument('--link_entity', action='store_true', default=False,
                        help='')
    parser.add_argument('--max_comm_len', action='store_true', default=False,
                        help='')
    parser.add_argument('--search', nargs=2,
                        help='search key from file, 2 args: file name & key')
    parser.add_argument('--email', nargs=2,
                        help='sending emails, 2 args: subject & content')
    parser.add_argument('--overwrite', action='store_true', default=None,
                        help='overwrite all cjjpy under given *dir* based on *dir*/cjjpy.py')
    parser.add_argument('--replace', nargs=3,
                        help='replace char, 3 args: file name & replaced char & replacer char')
    parser.add_argument('--lark', nargs=1)
    parser.add_argument('--get_hdfs', nargs=2,
                        help='easy copy from hdfs to local fs, 2 args: remote_file/dir & local_dir')
    parser.add_argument('--put_hdfs', nargs=2,
                        help='easy put from local fs to hdfs, 2 args: local_file/dir & remote_dir')
    parser.add_argument('--length_stats', nargs=1,
                        help='simple token lengths distribution of a line-by-line file')

    args = parser.parse_args()

    if args.overwrite:
        print('* Overwriting cjjpy...')
        OverWriteCjjPy()

    if args.lark:
        try:
            content = args.lark[0]
        except:
            content = 'running complete'
        print(f'* Larking "{content}"...')
        lark(content)

    if args.length_stats:
        file = args.length_stats[0]
        print(f'* Working on {file} lengths statistics...')
        LengthStats(file)