File size: 3,986 Bytes
cec26ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import re, os, sys
from utils import *

def get_paragraphs(text, cutoff=10):
    """Return the paragraphs of ``text`` that are longer than ``cutoff``.

    ``text`` is split on runs of newline characters. Each piece is stripped
    of surrounding whitespace and kept only when its stripped length is
    strictly greater than ``cutoff``.
    """
    paragraphs = []
    for piece in re.split(r'\n+', text, flags=re.MULTILINE):
        stripped = piece.strip()
        if len(stripped) > cutoff:
            paragraphs.append(stripped)
    return paragraphs

def get_para_sentences(text, cutoff=60):
    """Break every paragraph of ``text`` into sentence-like chunks.

    Paragraphs come from ``get_paragraphs(text)``. Each paragraph is split on
    runs of periods, and the pieces are glued back together, each followed by
    a single ``"."`` except the last, until the accumulated text is longer
    than ``cutoff`` characters. At that point the accumulated text is emitted
    as one chunk. Any non-empty remainder becomes the paragraph's last chunk.

    Returns:
        A list with one entry per paragraph, each entry being the list of
        chunks for that paragraph.
    """
    grouped = []
    for paragraph in get_paragraphs(text):
        pieces = re.split(r'\.+', paragraph)
        final_pos = len(pieces) - 1
        sentences = []
        pending = ""
        for pos, piece in enumerate(pieces):
            # Put back the period that the split consumed (a run of several
            # periods comes back as a single one).
            pending += piece if pos >= final_pos else piece + "."
            if len(pending) > cutoff:
                sentences.append(pending)
                pending = ""
        if pending:
            sentences.append(pending)
        grouped.append(sentences)
    return grouped

def get_idx_from_marked_chunk(marked_chunk):
    """Return the integer index stored in a leading ``<C n>`` marker.

    The marker must sit at the very start of ``marked_chunk``; whitespace
    between ``C`` and the digits is optional.
    """
    marker = re.match(r'<C\s*(\d+)>', marked_chunk)
    digits = marker[1]
    return int(digits)
# Import-time self-check: a randomly chosen index must survive the round trip
# through a "<C n>" marker.
import random
idx = random.randint(0, 99999)
assert idx == get_idx_from_marked_chunk(f"<C {idx}> ha ha")


def add_chunk_markers(text, lookup_idx=None, para=True):
    """Prefix every chunk of ``text`` with a running ``<C n>`` marker.

    Args:
        text: Raw text to chunk.
        lookup_idx: When equal to a chunk's index, that marked chunk is
            printed and the process exits via ``sys.exit()``.
        para: If true, chunks are the paragraphs from ``get_paragraphs``;
            otherwise they are the per-paragraph sentence lists from
            ``get_para_sentences``.

    Returns:
        ``(marked_text, para_chunks)``. ``marked_text`` holds one marked chunk
        per line, with a blank line after each paragraph group, stripped of
        outer whitespace. ``para_chunks`` is the chunk structure produced by
        the splitter.

    NOTE(review): with ``para=False`` the nested sentence lists are
    overwritten in place with the marked chunks, whereas with ``para=True``
    the marker is written into a temporary one-element list, so the returned
    paragraphs stay unmarked. Confirm which of the two callers expect.
    """
    para_chunks = get_paragraphs(text) if para else get_para_sentences(text)

    rendered_groups = []
    counter = 0
    for group in para_chunks:
        # A bare paragraph string is handled as a group of one chunk.
        if isinstance(group, str):
            group = [group]
        rendered = []
        for pos, piece in enumerate(group):
            tagged = f"<C {counter}>{piece.strip()}"
            group[pos] = tagged
            if lookup_idx == counter:
                # Debugging aid: show the requested chunk and stop the program.
                print(tagged)
                sys.exit()
            rendered.append(f"{tagged}\n")
            counter += 1
        rendered_groups.append("".join(rendered) + "\n")
    return "".join(rendered_groups).strip(), para_chunks


alphabet = '[0-9a-zaăâáắấàằầảẳẩãẵẫạặậđeêéếèềẻểẽễẹệiíìỉĩịoôơóốớòồờỏổởõỗỡọộợuưúứùừủửũữụựyýỳỷỹỵaăâáắấàằầảẳẩãẵẫạặậđeêéếèềẻểẽễẹệiíìỉĩịoôơóốớòồờỏổởõỗỡọộợuưúứùừủửũữụựyýỳỷỹỵaăâáắấàằầảẳẩãẵẫạặậđeêéếèềẻểẽễẹệiíìỉĩịoôơóốớòồờỏổởõỗỡọộợuưúứùừủửũữụựyýỳỷỹỵaăâáắấàằầảẳẩãẵẫạặậđeêéếèềẻểẽễẹệiíìỉĩịoôơóốớòồờỏổởõỗỡọộợuưúứùừủửũữụựyýỳỷỹỵaăâáắấàằầảẳẩãẵẫạặậđeêéếèềẻểẽễẹệiíìỉĩịoôơóốớòồờỏổởõỗỡọộợuưúứùừủửũữụựyýỳỷỹỵaăâáắấàằầảẳẩãẵẫạặậđeêéếèềẻểẽễẹệiíìỉĩịoôơóốớòồờỏổởõỗỡọộợuưúứùừủửũữụựyýỳỷỹỵ]'
word = re.compile(f'{alphabet}+', re.IGNORECASE)
###
def hilite(query, source, hilite_color=YELLOW, source_color=GREY, query_color=None):
    """Highlight the words of ``query`` wherever they occur in ``source``.

    Words are pulled from ``query`` with the module-level ``word`` pattern and
    searched for in ``source`` as whole words, ignoring case. Every hit is
    rewritten as ``hilite_color + hit + source_color``. When ``query_color``
    is given, each word that was found in ``source`` is wrapped the same way
    inside ``query``, closing with ``query_color`` instead.

    Args:
        query: Text whose words are looked up.
        source: Text in which the words are highlighted.
        hilite_color: String inserted before each hit. The default ``YELLOW``
            is not defined in this file (presumably it comes from
            ``from utils import *``).
        source_color: String inserted after each hit in ``source``. The
            default ``GREY`` is likewise defined outside this file.
        query_color: String inserted after each hit in ``query``; ``None``
            leaves ``query`` untouched.

    Returns:
        The tuple ``(source, query)``, in that order, after highlighting.
    """
    def wrapper(closing_color):
        # A callable replacement inserts the colour strings verbatim. A
        # replacement *template* would interpret backslashes in them, or read
        # "\1" plus a leading digit of the closing colour as another group.
        return lambda hit: f"{hilite_color}{hit.group(1)}{closing_color}"

    paint_source = wrapper(source_color)
    paint_query = wrapper(query_color)

    # Matching ignores case, so spellings that differ only by case are the
    # same keyword. Processing both would wrap every hit twice, so keep the
    # first spelling of each, in query order.
    keywords = {}
    for found in re.findall(word, query):
        keywords.setdefault(found.lower(), found)

    for keyword in keywords.values():
        pattern = re.compile(
            rf"(\b{re.escape(keyword)}\b)", flags=re.IGNORECASE | re.MULTILINE
        )
        if pattern.search(source):
            source = pattern.sub(paint_source, source)
            if query_color is not None:
                query = pattern.sub(paint_query, query)
    return source, query


def pretty_num(x):
    """Round ``x`` to hundredths by scaling up, rounding, and scaling back."""
    hundredths = round(x * 100)
    return hundredths / 100

def count_words(x):
    """Return the number of whitespace-separated tokens in the string ``x``.

    An ``assert`` rejects any ``x`` that is not a ``str``.
    """
    is_text = isinstance(x, str)
    assert is_text, f"đầu không phải string {x}"
    tokens = x.split()
    return len(tokens)

def extract_(text, tag):
    """Pull the content of the ``<tag>...</tag>`` element out of ``text``.

    The content is whatever lies before the first ``</tag>`` and after the
    last ``<tag>`` within that prefix; a missing tag simply leaves that side
    of the text uncut. For the ``"summary"`` tag the stripped content is
    returned as one string. For any other tag the content is split on
    newlines and commas, a leading ``-`` bullet is removed from each item,
    and the non-empty stripped items are returned as a list.
    """
    head = text.partition(f"</{tag}>")[0]
    raw = head.rpartition(f"<{tag}>")[2]
    if tag == "summary":
        return raw.strip()

    items = []
    for piece in re.split(r'[\n,]+', raw):
        cleaned = re.sub(r'^\s*-\s*', '', piece).strip()
        if cleaned:
            items.append(cleaned)
    return items

def extract_xmls(text, tags):
    """Map each tag in ``tags`` to ``extract_(text, tag)``.

    Returns ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None
    extracted = {}
    for tag in tags:
        extracted[tag] = extract_(text, tag)
    return extracted