# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import io
import re
from email import policy
from email.parser import BytesParser
from timeit import default_timer as timer

from rag.app.naive import chunk as naive_chunk
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
from deepdoc.parser import HtmlParser, TxtParser
from rag.settings import cron_logger


def _decode_text_part(part):
    """Return the decoded text of a non-multipart message part.

    The part's declared charset is tried first.  When no charset is declared,
    or the declared one is unknown or does not match the bytes, the payload is
    decoded as UTF-8 with undecodable bytes replaced, so a single odd part
    cannot abort parsing of the whole message.

    Returns None when the part carries no decodable payload.
    """
    payload = part.get_payload(decode=True)
    if payload is None:
        return None
    # get_content_charset() returns None when the Content-Type header has no
    # charset parameter; bytes.decode(None) would raise TypeError.
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset)
    except (LookupError, UnicodeDecodeError):
        return payload.decode("utf-8", errors="replace")


def chunk(
    filename,
    binary=None,
    from_page=0,
    to_page=100000,
    lang="Chinese",
    callback=None,
    **kwargs,
):
    """
    Only eml is supported

    Parse an e-mail message, chunk its headers and text/HTML bodies, then
    chunk every attachment with ``naive_chunk``.

    Args:
        filename: Name of the message file.  Used for the document title
            tokens and, when ``binary`` is falsy, opened and read from disk.
        binary: Raw bytes of the message.  When truthy it is parsed instead
            of reading ``filename``.
        from_page: Not used by this function.
        to_page: Not used by this function.
        lang: Compared case-insensitively with "english"; the resulting flag
            is passed to ``tokenize_chunks``.
        callback: Forwarded to ``naive_chunk`` for each attachment.
        **kwargs: ``parser_config`` (a dict with ``chunk_token_num`` and
            ``delimiter``) is read here; all keyword arguments are forwarded
            to ``naive_chunk`` for attachments.

    Returns:
        The tokenized chunks of the message itself followed by the chunks
        produced for its attachments.
    """
    eng = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config",
        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
    )
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    main_res = []
    attachment_res = []

    parser = BytesParser(policy=policy.default)
    if binary:
        msg = parser.parse(io.BytesIO(binary))
    else:
        # parse() consumes the whole stream before returning, so the file can
        # be closed as soon as the message object exists.
        with open(filename, "rb") as eml_file:
            msg = parser.parse(eml_file)

    text_txt, html_txt = [], []

    # get the email header info
    for header, value in msg.items():
        text_txt.append(f"{header}: {value}")

    # get the email main info
    def _add_content(part, content_type):
        # Walk the MIME tree, collecting plain-text and HTML bodies separately.
        if content_type == "text/plain":
            target = text_txt
        elif content_type == "text/html":
            target = html_txt
        else:
            if "multipart" in content_type and part.is_multipart():
                for sub_part in part.iter_parts():
                    _add_content(sub_part, sub_part.get_content_type())
            return
        text = _decode_text_part(part)
        if text is not None:
            target.append(text)

    _add_content(msg, msg.get_content_type())

    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
        (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
    ]

    st = timer()
    chunks = naive_merge(
        sections,
        int(parser_config.get("chunk_token_num", 128)),
        parser_config.get("delimiter", "\n!?。;!?"),
    )

    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))

    # get the attachment info
    for part in msg.iter_attachments():
        # get_content_disposition() returns the lower-cased disposition type
        # without parameters, or None when the header is absent.
        if part.get_content_disposition() != "attachment":
            continue
        # Use a separate name so the ``filename`` argument is not clobbered.
        attachment_name = part.get_filename()
        payload = part.get_payload(decode=True)
        try:
            attachment_res.extend(
                naive_chunk(attachment_name, payload, callback=callback, **kwargs)
            )
        except Exception as exc:
            # Best effort: an attachment that cannot be chunked must not
            # discard the message body, but the failure should be visible.
            cron_logger.warning(
                "Failed to chunk attachment {} of {}: {!r}".format(
                    attachment_name, filename, exc
                )
            )

    return main_res + attachment_res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], callback=dummy)