File size: 2,922 Bytes
88435ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import json
import re
from typing import Any, Callable


# dict2json --------------------------------
def dict2json(python_dict: dict[str, Any]) -> str:
    """
    Serialize a Python dict into a pretty-printed JSON string.

    Args:
        python_dict (dict): Dict to serialize.

    Returns:
        str: JSON string with 2-space indentation.
    """
    # ensure_ascii=False keeps non-ASCII text (e.g. Japanese) readable as-is.
    return json.dumps(python_dict, ensure_ascii=False, indent=2)


# optimize token --------------------------------
def optimize_token(text: str, funcs: list[Callable[[str], str]] | None = None) -> str:
    """
    ใƒ†ใ‚ญใ‚นใƒˆใฎใƒˆใƒผใ‚ฏใƒณใ‚’ๆœ€้ฉๅŒ–ใ‚’ใ™ใ‚‹

    Args:
        text (str): ๆœ€้ฉๅŒ–ใ™ใ‚‹ใƒ†ใ‚ญใ‚นใƒˆ

    Returns:
        str: ๆœ€้ฉๅŒ–ใ•ใ‚ŒใŸใƒ†ใ‚ญใ‚นใƒˆ
    """
    funcs = funcs or [minimize_newline, zenkaku_to_hankaku, remove_trailing_spaces]
    for func in funcs:
        text = func(text)
    return text.strip()


def _replace_consecutive(text: str, pattern: str, replacing_text: str) -> str:
    """
    ใƒ†ใ‚ญใ‚นใƒˆๅ†…ใฎ้€ฃ็ถšใ™ใ‚‹ใƒ‘ใ‚ฟใƒผใƒณใซๅฏพใ—ใฆใ€ๆŒ‡ๅฎšใ•ใ‚ŒใŸ็ฝฎๆ›ใƒ†ใ‚ญใ‚นใƒˆใง็ฝฎๆ›ใ™ใ‚‹

    Args:
        text (str): ใƒ†ใ‚ญใ‚นใƒˆ
        pattern (str): ็ฝฎๆ›ใ™ใ‚‹ใƒ‘ใ‚ฟใƒผใƒณ
        replacing_text (str): ็ฝฎๆ›ใƒ†ใ‚ญใ‚นใƒˆ

    Returns:
        str: ็ฝฎๆ›ใ•ใ‚ŒใŸใƒ†ใ‚ญใ‚นใƒˆ
    """
    p = re.compile(pattern)
    matches = [(m.start(), m.end()) for m in p.finditer(text)][::-1]

    text_replaced = list(text)

    for i_start, i_end in matches:
        text_replaced[i_start:i_end] = [replacing_text]
    return "".join(text_replaced)


def minimize_newline(text: str) -> str:
    """
    Limit consecutive newlines in text to at most two (one blank line).

    Args:
        text (str): Input text.

    Returns:
        str: Text with runs of newlines collapsed to at most two.
    """
    # Any run of two or more newlines becomes exactly two.
    return re.sub("\n{2,}", "\n\n", text)


def zenkaku_to_hankaku(text: str) -> str:
    """
    ใƒ†ใ‚ญใ‚นใƒˆๅ†…ใฎๅ…จ่ง’ๆ–‡ๅญ—ใ‚’ๅŠ่ง’ๆ–‡ๅญ—ใซๅค‰ๆ›ใ™ใ‚‹

    Args:
        text (str): ใƒ†ใ‚ญใ‚นใƒˆ

    Returns:
        str: ๅŠ่ง’ๆ–‡ๅญ—ใซๅค‰ๆ›ใ•ใ‚ŒใŸใƒ†ใ‚ญใ‚นใƒˆ
    """
    mapping_dict = {"ใ€€": " ", "๏ผš": ": ", "โ€Ž": " ", "๏ผŽ": "ใ€‚", "๏ผŒ": "ใ€", "๏ฟฅ": "ยฅ"}
    hankaku_text = ""
    for char in text:
        # A-Za-z0-9!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
        if char in mapping_dict:
            hankaku_text += mapping_dict[char]
        elif 65281 <= ord(char) <= 65374:
            hankaku_text += chr(ord(char) - 65248)
        else:
            hankaku_text += char
    return hankaku_text


def remove_trailing_spaces(text: str) -> str:
    """
    Strip trailing whitespace from every line of text.

    Args:
        text (str): Input text.

    Returns:
        str: Text with per-line trailing whitespace removed.
    """
    stripped_lines = (line.rstrip() for line in text.split("\n"))
    return "\n".join(stripped_lines)