"""Text-cleaning, input-length validation and PDF text-extraction helpers."""

import re

import fitz
import yaml
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from unidecode import unidecode


def remove_accents(input_str: str) -> str:
    """Return ``input_str`` passed through ``unidecode``.

    Args:
        input_str: Text that may contain accented / non-ASCII characters.

    Returns:
        The transliterated text produced by ``unidecode``.
    """
    return unidecode(input_str)


def remove_special_characters(text: str) -> str:
    """Strip accents and drop characters outside the allowed set.

    Args:
        text: Raw input text.

    Returns:
        ``text`` after ``remove_accents`` and after removing every run of
        characters not matched by the allow-list character class below.
    """
    # NOTE(review): both search strings are empty as written, so these two
    # replace calls leave the text unchanged. The intended tokens may have
    # been lost from the source -- confirm against the upstream file.
    text = text.replace("", "").replace("", "")
    text = remove_accents(text)
    # NOTE(review): inside the class, `)-;` is a *range* from ')' to ';', so
    # '*', '+', '/' and ':' are kept as well. Confirm whether a literal
    # hyphen was intended before changing it; callers may rely on this.
    pattern = r'[^\w\s\d.,!?\'"()-;]+'
    return re.sub(pattern, "", text)


def remove_special_characters_2(text: str) -> str:
    """Keep only ASCII letters, digits and the space character.

    Args:
        text: Raw input text.

    Returns:
        ``text`` with every run of other characters removed (this includes
        newlines, tabs and all punctuation).
    """
    pattern = r"[^a-zA-Z0-9 ]+"
    return re.sub(pattern, "", text)


def update_character_count(text: str) -> str:
    """Return a label of the form ``"<n> characters"`` for ``text``."""
    return f"{len(text)} characters"


# Configuration is read at import time; the relative path is resolved against
# the current working directory. The encoding is explicit so the result does
# not depend on the platform's locale default.
with open("config.yaml", "r", encoding="utf-8") as file:
    params = yaml.safe_load(file)

text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)


def len_validator(text: str) -> str:
    """Report whether ``text`` reaches the minimum token count.

    The count is the number of tokens ``text_bc_tokenizer.tokenize`` returns.

    Args:
        text: Input text to measure.

    Returns:
        A warning message when the text has fewer than 200 tokens, otherwise
        a confirmation message. Both messages embed the measured length.
    """
    min_tokens = 200
    token_count = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
    if token_count < min_tokens:
        return f"Warning! Input length is {token_count}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
    return f"Input length ({token_count}) is satisified."


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        The ``get_text()`` output of all pages joined in page order, or an
        empty string when the document has no pages.
    """
    # The context manager closes the document even if a page fails to parse;
    # previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


# Pre-compiled pattern matching runs of word characters.
WORD = re.compile(r"\w+")

# Sentence-embedding model, instantiated once at import time.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")