# File size: 5,826 Bytes
# 6ef31de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""
- Convert html to markdown with basic data cleaning.
- Deduplication.

Usage:
python3 -m fastchat.data.clean_sharegpt --in-file sharegpt_html.json --out-file sharegpt_clean.json
"""
import argparse
import json
import logging
import re
from typing import Dict, Union

import bs4
import markdownify  # == 0.11.6
import tqdm


# Patterns are compiled once at import time. They are written as raw strings so
# that regex escapes such as \s and \d are not parsed as (invalid) Python
# string escape sequences.

# Text starting with "<div" up to the next ">" on the same line. Closing
# "</div>" tags are not matched.
div_pattern = re.compile(r"<div.*?>")
# Same as above for "<span ...>".
span_pattern = re.compile(r"<span.*?>")
# A fenced block of the form ```<language>Copy code<code>```.
# Group 1 is the text before "Copy code", group 2 is the text after it.
# DOTALL lets both groups span several lines.
code_lang_pattern = re.compile(
    r"```\s*(.*?)(?:Copy code)+(.+?)\s*?```", re.DOTALL
)
# Replacement template for code_lang_pattern. The backslashes before "g" are
# escaped so re.sub sees the group references \g<1> and \g<2>; the "\n" are
# real newline characters.
code_lang_format = "```\\g<1>\n\\g<2>\n```"
# "<number> / <number>" counter text.
regenerate_pattern = re.compile(r"\d+ / \d+")
# "Copy<number> chars / <number> words" widget text.
copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
# A fenced block on one line that ends with "Copy code" and optional whitespace.
copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")


def reformat_code(val: str) -> str:
    """Rewrite scraped code fences in ``val`` as regular markdown fences.

    The scraped text glues the language name, the "Copy code" button label
    and the code itself together inside one fence:

        ```
        $<language>Copy code$<exact_code_here>

        ```

    Every such fence is replaced using ``code_lang_format``, which puts the
    language on the opening fence line and the code on the following lines.
    Text that does not match ``code_lang_pattern`` is returned unchanged.
    """
    return code_lang_pattern.sub(code_lang_format, val)


def html_to_markdown(val: str) -> str:
    """Convert one HTML message to cleaned-up markdown.

    Steps, in order: drop opening <div>/<span> tags, run markdownify, fix the
    code fences, then remove leftover UI text ("N / M" counter at the start,
    "CopyN chars / M words", empty "Copy code" fences) and tidy blank lines.

    Args:
        val: HTML text of a single conversation message.

    Returns:
        The markdown text.

    Raises:
        SystemExit: in debug mode (script run with --debug), right after
          printing the first converted message.
    """
    # Remove all <div>. This is required to make indent work in code blocks.
    val = div_pattern.sub("", val)
    # Remove all <span>. This is required to make underscores work in code blocks.
    val = span_pattern.sub("", val)
    # HTML to markdown
    val = markdownify.markdownify(val).strip()
    # Reformat code
    val = reformat_code(val)

    # Remove noisy "[number] / [number]" at the beginning.
    # match() only succeeds at position 0, which is the only case we strip.
    noise = regenerate_pattern.match(val)
    if noise:
        val = val[noise.end() :]
    # Remove noisy "Copy[number] chars / [number] words"
    val = copy_chars_pattern.sub("", val)
    # Remove empty code block ```\nCopy code\n```
    val = copy_code_pattern.sub("", val)

    # Collapse each run of three newlines into one, then strip
    val = val.replace("\n\n\n", "\n").strip()

    # `args` only exists as a module global when this file is run as a script.
    # Look it up defensively so that importing the module and calling this
    # function does not fail with NameError.
    cli_args = globals().get("args")
    if getattr(cli_args, "debug", False):
        print(val)
        # Same effect as exit(), without relying on the `site` module builtin.
        raise SystemExit

    return val


def should_filter(val: str) -> bool:
    """Return True when ``val`` mentions a blacklisted word, ignoring case."""
    lowered = val.lower()
    return any(word in lowered for word in ("openai", "chatgpt"))


def clean_html_source(content, begin, end, check_tag, check_num):
    """
    Clean the input json content.

    Processes ``content[begin:end]``. A sample is dropped when it has at most
    one message, repeats an id already seen, repeats the (second message,
    number of messages) pair of an earlier sample, contains a blacklisted
    word, or cannot be converted by ``html_to_markdown``. The messages of the
    remaining samples are converted from HTML to markdown.

    Args:
        content: json file loaded in memory. Each sample is indexed with "id"
          and "conversations"; each conversation entry with "value".
        begin: start index of the slice to process (None means from the start).
        end: end index (exclusive) of the slice (None means to the end).
        check_tag: a debug purpose arg. If a conversation contains the tag, log
          it before and after cleaning.
        check_num: number of matched conversations logged.

    Returns:
        A list with the kept samples. Message values are overwritten in place,
        so the sample dicts inside ``content`` are modified as well (a sample
        dropped midway may be left partially converted).
    """
    BARRIER = "\n" + "=" * 20 + "\n"
    cnt_skip = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0
    cnt_filter = 0
    cnt_tag = 0
    # Maps both sample ids and (second message, length) keys to the id of the
    # first sample that carried them.
    visited = {}

    content = content[begin:end]
    new_content = []

    for sample in tqdm.tqdm(content):
        skipped = False
        cid = sample["id"]
        conversations = sample["conversations"]

        if len(conversations) <= 1:
            print(f"id {cid} is too short")
            cnt_too_short += 1
            skipped = True
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
            skipped = True
        else:
            # Built only after the length check, so index 1 is known to exist.
            key = (conversations[1]["value"], len(conversations))
            if key in visited:
                print(f"id {cid} is a value duplication of {visited[key]}")
                cnt_value_duplication += 1
                skipped = True
            else:
                visited[cid] = visited[key] = cid

                for c in conversations:
                    # Keep the raw HTML so the debug log can show a real
                    # before/after pair once c["value"] is overwritten.
                    raw_val = c["value"]

                    if should_filter(raw_val):
                        print(f"id {cid} is filtered out")
                        cnt_filter += 1
                        skipped = True
                        break

                    try:
                        new_val = html_to_markdown(raw_val)
                    except (bs4.builder.ParserRejectedMarkup, AssertionError):
                        # Report the reason, like every other skip branch does.
                        print(f"id {cid} could not be converted to markdown")
                        skipped = True
                        break

                    c["value"] = new_val

                    # Debug: log at most check_num matching messages. The loop
                    # is not interrupted once the limit is reached, so the
                    # remaining messages of this sample are still cleaned.
                    if (
                        check_tag is not None
                        and cnt_tag < check_num
                        and check_tag in new_val
                    ):
                        logging.debug(
                            "%s%s\n%s%s\n%s\n",
                            BARRIER,
                            raw_val,
                            BARRIER,
                            new_val,
                            BARRIER,
                        )
                        cnt_tag += 1

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, cnt_filter: {cnt_filter}"
    )

    return new_content


def main(args):
    """Load the input json, clean it and write the result.

    Args:
        args: dict of options. Reads the keys "in_file", "out_file", "begin",
          "end", "check_tag" and "check_num".
    """
    # Context managers make sure both files are closed (and the output is
    # flushed) even if loading, cleaning or dumping raises.
    # JSON text is read and written as UTF-8 regardless of the locale default.
    with open(args["in_file"], "r", encoding="utf-8") as fin:
        content = json.load(fin)
    content = clean_html_source(
        content, args["begin"], args["end"], args["check_tag"], args["check_num"]
    )
    with open(args["out_file"], "w", encoding="utf-8") as fout:
        json.dump(content, fout, indent=2)


if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    cli = argparse.ArgumentParser()
    # Input / output json files.
    cli.add_argument("--in-file", required=True, type=str)
    cli.add_argument("--out-file", default="sharegpt_clean.json", type=str)
    # Optional slice bounds; both default to None.
    cli.add_argument("--begin", type=int)
    cli.add_argument("--end", type=int)
    # Debugging helpers.
    cli.add_argument("--debug", action="store_true")
    cli.add_argument("--check-tag", type=str)
    cli.add_argument("--check-num", default=1, type=int)
    # The parsed namespace must stay bound to the module-level name `args`:
    # html_to_markdown reads `args.debug` from the module globals.
    args = cli.parse_args()
    main(vars(args))