import re
import os
import sys

from utils import *


def get_paragraphs(text, cutoff=10):
    """Split *text* into paragraphs on runs of newlines.

    Each piece is stripped of surrounding whitespace; only pieces whose
    stripped length is strictly greater than *cutoff* are returned.
    """
    stripped = (piece.strip() for piece in re.split(r'\n+', text))
    return [para for para in stripped if len(para) > cutoff]


def get_para_sentences(text, cutoff=60):
    """Split every paragraph of *text* into sentence-like chunks.

    A paragraph is cut at runs of dots. Pieces are glued back together
    (each followed by a single ".") until the accumulated chunk is longer
    than *cutoff* characters, at which point it is emitted. Any non-empty
    remainder becomes the last chunk of its paragraph.

    Returns a list with one list of chunks per paragraph.

    NOTE: a run of several dots ("...") is re-emitted as a single ".",
    and chunks keep whatever leading whitespace followed the dot.
    """
    para_sents = []
    for para in get_paragraphs(text):
        sents = []
        sent = ""
        pieces = re.split(r'\.+', para)
        last = len(pieces) - 1
        for i, piece in enumerate(pieces):
            sent += piece
            if i < last:
                # Restore the separator that re.split consumed.
                sent += "."
            if len(sent) > cutoff:
                sents.append(sent)
                sent = ""
        if sent:
            sents.append(sent)
        para_sents.append(sents)
    return para_sents


def get_idx_from_marked_chunk(marked_chunk):
    """Return the integer index from a leading ``<C n>`` marker.

    Raises TypeError when *marked_chunk* does not start with such a marker
    (``re.match`` returns None, which cannot be subscripted).
    """
    return int(re.match(r'<C\s*(\d+)>', marked_chunk)[1])


# Import-time sanity check of the marker round trip. It leaves the names
# `random` and `idx` bound at module level, as the original code did.
import random
idx = random.randint(0, 99999)
assert get_idx_from_marked_chunk(f"<C {idx}> ha ha") == idx


def add_chunk_markers(text, lookup_idx = None, para = True):
    """Prefix every chunk of *text* with a running ``<C n>`` marker.

    Args:
        text: the raw text to chunk.
        lookup_idx: debugging aid. When it equals the index of a chunk,
            that marked chunk is printed and the process exits through
            ``sys.exit()``.
        para: if true, chunks are whole paragraphs (``get_paragraphs``);
            otherwise sentence-like chunks (``get_para_sentences``).

    Returns:
        A tuple ``(marked_text, para_chunks)``. ``marked_text`` has one
        marked chunk per line with a blank line after each paragraph, and
        is stripped at both ends. ``para_chunks`` is the chunk structure:
        in sentence mode its inner lists are overwritten in place with the
        marked chunks; in paragraph mode it stays the list of unmarked
        paragraph strings.
    """
    para_chunks = get_paragraphs(text) if para else get_para_sentences(text)

    lines = []
    chunk_idx = 0
    for chunks in para_chunks:
        if isinstance(chunks, str):
            # Paragraph mode: wrap the string so both modes share one loop.
            chunks = [chunks]
        for pos, chunk in enumerate(chunks):
            marked_chunk = f"<C {chunk_idx}>{chunk.strip()}"
            chunks[pos] = marked_chunk
            if lookup_idx == chunk_idx:
                print(marked_chunk)
                sys.exit()
            lines.append(f"{marked_chunk}\n")
            chunk_idx += 1
        lines.append("\n")
    return "".join(lines).strip(), para_chunks


# Character class for one "word" character: ASCII digits and letters plus
# the accented Vietnamese letters. Listed once; the base vowels are already
# covered by a-z.
alphabet = (
    '[0-9a-z'
    'ăâáắấàằầảẳẩãẵẫạặậ'
    'đ'
    'êéếèềẻểẽễẹệ'
    'íìỉĩị'
    'ôơóốớòồờỏổởõỗỡọộợ'
    'ưúứùừủửũữụự'
    'ýỳỷỹỵ'
    ']'
)
word = re.compile(f'{alphabet}+', re.IGNORECASE)


###
def hilite(query, source, hilite_color=YELLOW, source_color=GREY, query_color=None):
    """Highlight in *source* every word of *query* that occurs in it.

    Words are extracted from *query* with the module-level ``word``
    pattern and matched in *source* as whole words, case-insensitively.
    Each match is wrapped as ``hilite_color + match + source_color``.
    When *query_color* is not None, the same words are also wrapped in
    *query*, followed by *query_color*.

    All substitutions happen in a single pass over the untouched text, so
    a keyword can never match inside colour codes inserted for another
    keyword, and query words differing only in case do not wrap the same
    match twice.

    Returns the tuple ``(source, query)``.
    """
    flags = re.IGNORECASE | re.MULTILINE

    # Matching ignores case, so keep one representative per lowercase form.
    unique = {}
    for keyword in word.findall(query):
        unique.setdefault(keyword.lower(), keyword)

    # Only keywords that really occur in the source get highlighted.
    found = [
        escaped
        for escaped in (re.escape(keyword) for keyword in unique.values())
        if re.search(rf"\b{escaped}\b", source, flags)
    ]
    if not found:
        return source, query

    # Longest first so a shorter alternative never pre-empts a longer one.
    found.sort(key=len, reverse=True)
    alternation = "|".join(found)
    re_keywords = re.compile(rf"(\b(?:{alternation})\b)", flags)

    # \g<1> instead of \1: stays unambiguous if a colour starts with a digit.
    source = re_keywords.sub(rf'{hilite_color}\g<1>{source_color}', source)
    if query_color is not None:
        query = re_keywords.sub(rf'{hilite_color}\g<1>{query_color}', query)
    return source, query


def pretty_num(x):
    """Return *x* scaled by 100, rounded to an integer, and divided by 100."""
    return round(x*100)/100


def count_words(x):
    """Return the number of whitespace-separated tokens in the string *x*."""
    assert isinstance(x, str), f"đầu không phải string {x}"
    return len(x.split())


def extract_(text, tag):
    """Extract the content of ``<tag>...</tag>`` from *text*.

    The content is what lies before the first closing tag and after the
    last opening tag within that prefix; when a tag is absent, the
    corresponding end of *text* is used instead.

    For the "summary" tag the stripped content is returned as a string.
    For any other tag the content is split on newlines and commas, a
    leading "-" bullet is removed from each item, and the non-empty
    stripped items are returned as a list.
    """
    raw = text.split(f"</{tag}>")[0].split(f"<{tag}>")[-1]
    if tag == "summary":
        return raw.strip()
    items = (
        re.sub(r'^\s*-\s*', '', piece).strip()
        for piece in re.split(r'[\n,]+', raw)
    )
    return [item for item in items if item]


def extract_xmls(text, tags):
    """Return ``{tag: extract_(text, tag)}`` for *tags*, or None if *text* is None."""
    if text is None:
        return None
    return {tag: extract_(text, tag) for tag in tags}