import asyncio
import math
import os
import re
import threading
from collections import Counter
from datetime import date
from urllib.request import urlopen, Request

import fitz
import httpx
import nltk
import numpy as np
import requests
import torch
import yaml
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from scipy.special import softmax
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from unidecode import unidecode


def remove_accents(input_str):
    # Transliterate accented/Unicode characters to their closest ASCII equivalents.
    return unidecode(input_str)


def remove_special_characters(text):
    # Strip URLs.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Strip emoji and pictographic symbols.
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F700-\U0001F77F"  # alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub('', text)
    # Strip hashtags.
    text = re.sub(r'#\w+', '', text)
    # Drop remaining special characters, keeping word characters, whitespace, and
    # common punctuation. The hyphen goes last in the class so it is a literal;
    # the original ")-;" silently matched the whole ')' .. ';' character range.
    text = re.sub(r'[^\w\s.,!?\'"();-]', '', text)
    # Normalize spacing around punctuation: no space before, one space after.
    text = re.sub(r'\s+([.,!?;])', r'\1', text)
    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
    # Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
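
# Illustrative (hypothetical) example of the cleaning pipeline above:
#   remove_special_characters("Visit https://example.com now!😀 #ml It's great ,right?")
#   -> "Visit now! It's great, right?"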

def remove_special_characters_2(text):
    # Aggressive variant: keep only ASCII alphanumerics and spaces.
    return re.sub(r"[^a-zA-Z0-9 ]+", "", text)


def update_character_count(text):
    return f"{len(text)} characters"


nltk.download("punkt")


with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

text_bc_model_path = params["TEXT_BC_MODEL_PATH"]

text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)


def len_validator(text):
    # Warn when the input is too short for the classifier to score reliably.
    # Note: AutoTokenizer.tokenize() returns a list of token strings; it does
    # not take a return_tensors argument, so we just measure its length.
    min_tokens = 200
    length = len(text_bc_tokenizer.tokenize(text))
    if length < min_tokens:
        return f"Warning! Input length is {length}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
    else:
        return f"Input length ({length}) is satisfied."
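
# Hypothetical usage (exact token counts depend on the configured tokenizer):
#   len_validator("One short sentence.") -> warning string, since the input
#   is far below the 200-token minimum.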


def extract_text_from_pdf(pdf_path):
    # Concatenate the text of every page using PyMuPDF (imported as fitz),
    # closing the document afterwards to release the file handle.
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text
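
# Hypothetical usage ("paper.pdf" is a placeholder path, not part of the project):
#   raw = extract_text_from_pdf("paper.pdf")
#   cleaned = remove_special_characters(remove_accents(raw))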


# Module-level word-matching regex and MiniLM sentence-embedding model.
WORD = re.compile(r"\w+")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
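

# Minimal smoke test; a sketch, assuming the module imports cleanly with a
# valid config.yaml alongside it. The sample string is illustrative only.
if __name__ == "__main__":
    sample = "Café visit: https://example.com 😀 #demo Hello ,world!"
    print(remove_accents(sample))              # transliterates "Café" -> "Cafe"
    print(remove_special_characters(sample))   # drops URL, emoji, and hashtag
    print(remove_special_characters_2(sample)) # keeps only alphanumerics/spaces
    print(update_character_count(sample))
    print(len_validator(sample))               # warns: far fewer than 200 tokens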