from urllib.request import urlopen, Request
from googleapiclient.discovery import build
import requests
import httpx
import re
import math
import os
import asyncio
import threading
from collections import Counter
from datetime import date

import numpy as np
import torch
import nltk
import yaml
import fitz  # PyMuPDF
from bs4 import BeautifulSoup
from unidecode import unidecode
from scipy.special import softmax
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer


def remove_accents(input_str):
    # Fold accented characters to their closest ASCII equivalents.
    return unidecode(input_str)


def remove_special_characters(text):
    # Strip typographic (curly) double quotes. The characters targeted by the
    # original replace() calls were unreadable; curly quotes are an assumption.
    text = text.replace("\u201c", "").replace("\u201d", "")
    text = remove_accents(text)
    # Keep word characters, whitespace, and common punctuation; drop the rest.
    # The hyphen sits at the end of the class so it is literal, not a range.
    pattern = r'[^\w\s.,!?\'"();-]+'
    return re.sub(pattern, "", text)


def remove_special_characters_2(text):
    # Stricter variant: keep only alphanumerics and spaces.
    pattern = r"[^a-zA-Z0-9 ]+"
    return re.sub(pattern, "", text)


def update_character_count(text):
    return f"{len(text)} characters"


# Sentence tokenizer data used elsewhere in the pipeline.
nltk.download("punkt")

with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)


def len_validator(text):
    # Warn when the input is too short for the downstream classifier.
    min_tokens = 200
    length = len(text_bc_tokenizer.tokenize(text))
    if length < min_tokens:
        return (
            f"Warning! Input length is {length}. Please input a text that is "
            f"greater than {min_tokens} tokens long. "
            f"Recommended length {min_tokens * 2} tokens."
        )
    return f"Input length ({length}) is satisfied."


def extract_text_from_pdf(pdf_path):
    # Concatenate the plain text of every page in the PDF.
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


WORD = re.compile(r"\w+")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
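

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the pipeline): exercises the helpers
# above end to end. Assumptions are flagged inline -- "config.yaml" must point
# TEXT_BC_MODEL_PATH at a valid tokenizer for the module to import at all, and
# "sample.pdf" below is a hypothetical path, not a file shipped with the repo.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    raw = "H\u00e9llo, w\u00f6rld! \u2014 testing 123"
    cleaned = remove_special_characters(raw)
    print(cleaned)                               # accents folded, stray symbols gone
    print(remove_special_characters_2(cleaned))  # alphanumerics and spaces only
    print(update_character_count(cleaned))
    print(len_validator(cleaned))                # short input, so a warning string

    # extract_text_from_pdf("sample.pdf") would return the concatenated page
    # text; left commented out because "sample.pdf" is a placeholder path.

    # The module-level SentenceTransformer can score semantic similarity:
    emb = model.encode(
        ["The cat sat on the mat.", "A cat is sitting on a mat."],
        convert_to_tensor=True,
    )
    print(float(util.cos_sim(emb[0], emb[1])))  # close to 1.0 for paraphrases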