File size: 1,530 Bytes
1be431a
350b1a0
caa635d
45d10c4
 
 
 
 
 
 
 
 
 
 
 
 
932cfaa
45d10c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1be431a
 
 
350b1a0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import re
import re
from sentence_transformers import SentenceTransformer, util
import re
from unidecode import unidecode
from transformers import AutoTokenizer
import yaml
import fitz


def remove_accents(input_str):
    """Transliterate *input_str* to plain ASCII, dropping accents/diacritics."""
    return unidecode(input_str)


def remove_special_characters(text):
    """Clean *text* for downstream processing.

    Drops the ``<s>``/``</s>`` sentinel tokens, transliterates accents to
    ASCII, then removes every character outside an allow-list of word
    characters, whitespace, and common punctuation.

    Returns the cleaned string.
    """
    # Remove model sentinel tokens before character filtering.
    text = text.replace("<s>", "").replace("</s>", "")
    text = remove_accents(text)
    # FIX: the original wrote ()-; with an unescaped hyphen, which the regex
    # engine parsed as the character RANGE ')' .. ';' — accidentally also
    # keeping '*', '+', '/', ':' etc. Escape the hyphen so a literal '-' is
    # kept, as the surrounding punctuation list clearly intends.
    pattern = r'[^\w\s\d.,!?\'"()\-;]+'
    text = re.sub(pattern, "", text)
    return text


def remove_special_characters_2(text):
    """Strip every character that is not an ASCII letter, digit, or space."""
    return re.sub(r"[^a-zA-Z0-9 ]+", "", text)


def update_character_count(text):
    """Return a human-readable character count for *text*, e.g. ``"5 characters"``."""
    return "{} characters".format(len(text))


# Load runtime configuration from the working directory, then build the
# tokenizer used by len_validator() below. Both run at import time.
with open("config.yaml", "r") as cfg_stream:
    params = yaml.safe_load(cfg_stream)

text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)


def len_validator(text, min_tokens=200):
    """Validate that *text* tokenizes to at least *min_tokens* tokens.

    Parameters
    ----------
    text : str
        The input text to measure.
    min_tokens : int, optional
        Minimum acceptable token count (default 200; previously hard-coded).

    Returns a warning string when the text is too short, otherwise a
    confirmation string containing the measured token count.
    """
    # tokenize() returns a list of token strings; the original passed
    # return_tensors="pt", which tokenize() ignores — dropped as dead code.
    length = len(text_bc_tokenizer.tokenize(text=text))
    if length < min_tokens:
        return (
            f"Warning! Input length is {length}. Please input a text that is "
            f"greater than {min_tokens} tokens long. Recommended length "
            f"{min_tokens*2} tokens."
        )
    # FIX: original message misspelled "satisified".
    return f"Input length ({length}) is satisfied."


def extract_text_from_pdf(pdf_path):
    """Extract and concatenate the text of every page of the PDF at *pdf_path*.

    Returns the whole document text as a single string.

    The original leaked the document handle; the handle is now closed even
    if extraction raises.
    """
    doc = fitz.open(pdf_path)
    try:
        # "".join builds the result in one pass instead of quadratic +=.
        return "".join(page.get_text() for page in doc)
    finally:
        doc.close()


# Pre-compiled word matcher (runs of \w characters); compiled once at import.
WORD = re.compile(r"\w+")
# Sentence-embedding model loaded at import time — presumably used for
# semantic similarity elsewhere (util is imported above); not visible in
# this chunk. NOTE(review): first call downloads the model from the hub.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")