"""
- Convert html to markdown with basic data cleaning.
- Deduplication.

Usage:
python3 -m fastchat.data.clean_sharegpt --in-file sharegpt_html.json --out-file sharegpt_clean.json
"""
import argparse
import json
import logging
import re

import bs4
import markdownify  # == 0.11.6
import tqdm
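
# Patterns for HTML tags and ChatGPT web-UI artifacts (e.g. "Copy code" labels,
# "2 / 2" regenerate counters) that leak into the exported conversations.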
div_pattern = re.compile("<div.*?>")
span_pattern = re.compile("<span.*?>")
code_lang_pattern = re.compile(
    r"```\s*" + r"(.*?)" + r"(?:Copy code)+" + r"(.+?)" + r"\s*?```", re.DOTALL
)
code_lang_format = r"```\g<1>\n\g<2>\n```"
regenerate_pattern = re.compile(r"\d+ / \d+")
copy_chars_pattern = re.compile(r"Copy\d+ chars / \d+ words")
copy_code_pattern = re.compile(r"```(.*?)Copy code\s*```")


def reformat_code(val: str) -> str:
    # Input code format is:
    # ```
    # $<language>Copy code$<exact_code_here>
    #
    # ```
    # This function converts it into the correct markdown format.
    return re.sub(code_lang_pattern, code_lang_format, val)
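
# A quick illustration of reformat_code on an assumed (not dataset-verified) input:
#   reformat_code("```\npythonCopy codeprint(1)\n```") -> "```python\nprint(1)\n```"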


def html_to_markdown(val: str) -> str:
    # Remove all <div>. This is required to make indentation work in code blocks.
    val = re.sub(div_pattern, "", val)
    # Remove all <span>. This is required to make underscores work in code blocks.
    val = re.sub(span_pattern, "", val)

    # HTML to markdown
    val = markdownify.markdownify(val).strip()

    # Reformat code
    val = reformat_code(val)

    # Remove noisy "[number] / [number]" at the beginning
    noise = re.search(regenerate_pattern, val)
    if noise and noise.start() == 0:
        val = val[noise.end() :]

    # Remove noisy "Copy[number] chars / [number] words"
    val = re.sub(copy_chars_pattern, "", val)

    # Remove empty code blocks like ```\nCopy code\n```
    val = re.sub(copy_code_pattern, "", val)

    # Collapse excessive newlines and strip surrounding whitespace
    val = val.replace("\n\n\n", "\n").strip()

    # `args` is the global argparse namespace defined under __main__.
    if args.debug:
        print(val)
        exit()

    return val
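
# Illustrative example (run as a script without --debug so the global `args` exists):
#   html_to_markdown("<p>Hello <b>world</b></p>") -> "Hello **world**"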


def should_filter(val: str) -> bool:
    black_list = ["openai", "chatgpt"]
    for w in black_list:
        if w in val.lower():
            return True
    return False


def clean_html_source(content, begin, end, check_tag, check_num):
    """
    Clean the input json content.

    Args:
        content: json file loaded in memory.
        begin, end: only keep the samples in content[begin:end].
        check_tag: a debug-purpose arg. If a conversation contains the tag, log
            it before and after cleaning.
        check_num: number of matched conversations logged.
    """
    BARRIER = "\n" + "=" * 20 + "\n"
    cnt_skip = 0
    cnt_too_short = 0
    cnt_id_duplication = 0
    cnt_value_duplication = 0
    cnt_filter = 0
    cnt_tag = 0
    visited = {}

    content = content[begin:end]
    new_content = []
    for sample in tqdm.tqdm(content):
        skipped = False
        cid = sample["id"]

        if len(sample["conversations"]) <= 1:
            print(f"id {cid} is too short")
            cnt_too_short += 1
            skipped = True
        elif cid in visited:
            print(f"id {cid} is an id duplication of {visited[cid]}")
            cnt_id_duplication += 1
            skipped = True
        elif (
            sample["conversations"][1]["value"],
            len(sample["conversations"]),
        ) in visited:
            # Duplicate content: same second-turn value and conversation length.
            key = (sample["conversations"][1]["value"], len(sample["conversations"]))
            print(f"id {cid} is a value duplication of {visited[key]}")
            cnt_value_duplication += 1
            skipped = True
        else:
            # Mark both the id and the (value, length) key as seen.
            key = (sample["conversations"][1]["value"], len(sample["conversations"]))
            visited[cid] = visited[key] = cid

            for c in sample["conversations"]:
                if should_filter(c["value"]):
                    print(f"id {cid} is filtered out")
                    cnt_filter += 1
                    skipped = True
                    break

                try:
                    new_val = html_to_markdown(c["value"])
                except (bs4.builder.ParserRejectedMarkup, AssertionError):
                    # Skip samples whose HTML cannot be converted.
                    skipped = True
                    break

                c["value"] = new_val

                # Debug: log up to check_num conversations that contain check_tag.
                if (
                    check_tag is not None
                    and check_tag in c["value"]
                    and cnt_tag < check_num
                ):
                    logging.debug(
                        BARRIER
                        + c["value"]
                        + "\n"
                        + BARRIER
                        + new_val
                        + "\n"
                        + BARRIER
                        + "\n"
                    )
                    cnt_tag += 1
                    if cnt_tag == check_num:
                        break

        if not skipped:
            new_content.append(sample)
        else:
            cnt_skip += 1

    print(
        f"total: {len(content)}, skip: {cnt_skip}, new: {len(new_content)}, "
        f"cnt_too_short: {cnt_too_short}, cnt_id_duplication: {cnt_id_duplication}, "
        f"cnt_value_duplication: {cnt_value_duplication}, cnt_filter: {cnt_filter}"
    )
    return new_content
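
# main() expects the input JSON to be a list of ShareGPT-style samples. Only the
# fields accessed above ("id", "conversations", and each turn's "value") are
# required. Illustrative shape (assumed, not validated by this script):
# [
#     {
#         "id": "abc123",
#         "conversations": [
#             {"from": "human", "value": "<p>Hello</p>"},
#             {"from": "gpt", "value": "<div>Hi there!</div>"},
#         ],
#     },
# ]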


def main(args):
    content = json.load(open(args["in_file"], "r"))
    content = clean_html_source(
        content, args["begin"], args["end"], args["check_tag"], args["check_num"]
    )
    json.dump(content, open(args["out_file"], "w"), indent=2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-file", type=str, required=True)
    parser.add_argument("--out-file", type=str, default="sharegpt_clean.json")
    parser.add_argument("--begin", type=int)
    parser.add_argument("--end", type=int)
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--check-tag", type=str)
    parser.add_argument("--check-num", type=int, default=1)
    args = parser.parse_args()
    main(vars(args))