Spaces:
Running
Running
File size: 1,530 Bytes
1be431a 350b1a0 caa635d 45d10c4 932cfaa 45d10c4 1be431a 350b1a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import re
import re
from sentence_transformers import SentenceTransformer, util
import re
from unidecode import unidecode
from transformers import AutoTokenizer
import yaml
import fitz
def remove_accents(input_str):
text_no_accents = unidecode(input_str)
return text_no_accents
def remove_special_characters(text):
text = text.replace("<s>", "").replace("</s>", "")
text = remove_accents(text)
pattern = r'[^\w\s\d.,!?\'"()-;]+'
text = re.sub(pattern, "", text)
return text
def remove_special_characters_2(text):
pattern = r"[^a-zA-Z0-9 ]+"
text = re.sub(pattern, "", text)
return text
def update_character_count(text):
return f"{len(text)} characters"
with open("config.yaml", "r") as file:
params = yaml.safe_load(file)
text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)
def len_validator(text):
min_tokens = 200
lengt = len(text_bc_tokenizer.tokenize(text=text, return_tensors="pt"))
if lengt < min_tokens:
return f"Warning! Input length is {lengt}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
else:
return f"Input length ({lengt}) is satisified."
def extract_text_from_pdf(pdf_path):
doc = fitz.open(pdf_path)
text = ""
for page in doc:
text += page.get_text()
return text
WORD = re.compile(r"\w+")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|