Spaces:
Paused
Paused
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from email import policy | |
from email.parser import BytesParser | |
from rag.app.naive import chunk as naive_chunk | |
import re | |
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks | |
from deepdoc.parser import HtmlParser, TxtParser | |
from timeit import default_timer as timer | |
from rag.settings import cron_logger | |
import io | |
def chunk(
    filename,
    binary=None,
    from_page=0,
    to_page=100000,
    lang="Chinese",
    callback=None,
    **kwargs,
):
    """
    Split an e-mail message into tokenized chunks.

    Only eml is supported.

    The header lines and every ``text/plain`` part are parsed with
    ``TxtParser``; every ``text/html`` part is parsed with ``HtmlParser``.
    The resulting sections are merged by ``naive_merge`` and tokenized by
    ``tokenize_chunks``. Every part whose ``Content-Disposition`` starts with
    ``attachment`` is then passed to ``naive_chunk`` and its chunks are
    appended after the body chunks.

    Args:
        filename: Name of the message. Used to build the title fields and,
            when ``binary`` is falsy, opened as a path in binary mode.
        binary: Raw bytes of the message; when falsy the message is read
            from ``filename`` instead.
        from_page: Not used by this function.
        to_page: Not used by this function.
        lang: ``"english"`` (case-insensitive) sets the English flag passed
            to ``tokenize_chunks``; any other value clears it.
        callback: Forwarded to ``naive_chunk`` for each attachment; not
            called directly here.
        **kwargs: ``parser_config`` (a dict with ``chunk_token_num`` and
            ``delimiter``) is read here; all keyword arguments are forwarded
            to ``naive_chunk`` for attachments.

    Returns:
        A list holding the body chunks followed by the attachment chunks.
    """
    eng = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config",
        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
    )
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    main_res = []
    attachment_res = []

    eml_parser = BytesParser(policy=policy.default)
    if binary:
        msg = eml_parser.parse(io.BytesIO(binary))
    else:
        # parse() consumes the whole stream, so the handle can be closed
        # as soon as it returns.
        with open(filename, "rb") as eml_file:
            msg = eml_parser.parse(eml_file)

    text_txt, html_txt = [], []
    # get the email header info
    for header, value in msg.items():
        text_txt.append(f"{header}: {value}")

    def _decode_text(part):
        """Return the decoded text of a non-multipart *part*."""
        raw = part.get_payload(decode=True)
        if raw is None:
            return ""
        # get_content_charset() returns None when the part declares no
        # charset; fall back to UTF-8 rather than calling decode(None).
        charset = part.get_content_charset() or "utf-8"
        try:
            return raw.decode(charset, errors="replace")
        except LookupError:
            # The declared charset is unknown to Python's codec registry.
            return raw.decode("utf-8", errors="replace")

    # get the email main info
    def _add_content(part, content_type):
        """Collect text/plain and text/html bodies, recursing into multiparts."""
        if content_type == "text/plain":
            text_txt.append(_decode_text(part))
        elif content_type == "text/html":
            html_txt.append(_decode_text(part))
        elif "multipart" in content_type:
            if part.is_multipart():
                for sub_part in part.iter_parts():
                    _add_content(sub_part, sub_part.get_content_type())

    _add_content(msg, msg.get_content_type())

    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
        (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
    ]

    st = timer()
    chunks = naive_merge(
        sections,
        int(parser_config.get("chunk_token_num", 128)),
        parser_config.get("delimiter", "\n!?。;!?"),
    )
    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))

    # get the attachment info
    for part in msg.iter_attachments():
        content_disposition = part.get("Content-Disposition")
        if not content_disposition:
            continue
        dispositions = content_disposition.strip().split(";")
        if dispositions[0].strip().lower() != "attachment":
            continue
        # Use a separate name so the message's own filename is not clobbered.
        attachment_name = part.get_filename()
        payload = part.get_payload(decode=True)
        try:
            attachment_res.extend(
                naive_chunk(attachment_name, payload, callback=callback, **kwargs)
            )
        except Exception:
            # Attachments are best-effort: one unreadable attachment must not
            # discard the rest of the message, but the failure is recorded.
            cron_logger.exception(
                "Failed to chunk attachment %r of %s", attachment_name, filename
            )

    return main_res + attachment_res
if __name__ == "__main__":
    # Manual smoke run: chunk the .eml file named by the first CLI argument.
    import sys

    def dummy(prog=None, msg=""):
        """Progress callback that ignores every update."""
        return None

    eml_path = sys.argv[1]
    chunk(eml_path, callback=dummy)