import re

import fitz
import requests
import yaml
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from unidecode import unidecode

with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

# access_token = params['HF_TOKEN']

def remove_accents(input_str):
    # transliterate accented/Unicode characters to their closest ASCII equivalents
    text_no_accents = unidecode(input_str)
    return text_no_accents

def remove_special_characters(text):
    # strip URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F700-\U0001F77F"  # alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # additional symbol ranges
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub('', text)
    # strip hashtags
    text = re.sub(r'#\w+', '', text)
    # drop anything that is not a word character, digit, whitespace, or common punctuation
    text = re.sub(r'[^\w\s\d.,!?\'"()\-;]', '', text)
    # remove spaces before punctuation and ensure a single space after it
    text = re.sub(r'\s+([.,!?;])', r'\1', text)
    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
    # collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def remove_special_characters_2(text):
    # keep only ASCII letters, digits, and spaces
    pattern = r"[^a-zA-Z0-9 ]+"
    text = re.sub(pattern, "", text)
    return text


def update_character_count(text):
    return f"{len(text)} characters"


text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)


def len_validator(text):
    min_tokens = 200
    length = len(text_bc_tokenizer.tokenize(text))
    if length < min_tokens:
        return f"Warning! Input length is {length} tokens. Please input text longer than {min_tokens} tokens. Recommended length: {min_tokens * 2} tokens."
    else:
        return f"Input length ({length} tokens) is sufficient."


def extract_text_from_pdf(pdf_path):
    # concatenate the text of every page in the PDF
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


def format_headings(text):
    # Treat runs of consecutive ALL-CAPS words as headings and place them on
    # their own line; everything else is passed through unchanged.
    words = text.split(" ")
    formatted_lines = []
    heading = ""
    for word in words:
        if word and word.isupper():
            heading += word + " "
        else:
            if heading != "" and len(heading) > 10:
                # If the caps run ends with a lone "A", move that "A" to the
                # next line so it stays with the sentence that follows.
                formatted = (
                    "\n"
                    + heading[: len(heading) - 2]
                    + "\n"
                    + heading[len(heading) - 2 :]
                    if heading.strip().endswith(" A")
                    else "\n" + heading + "\n"
                )
                formatted_lines.append(formatted.strip(" "))
            elif heading != "":
                formatted_lines.append(heading.strip())
            formatted_lines.append(word.strip())
            heading = ""
    if heading != "":
        # flush a heading that ends the text
        formatted_lines.append(heading.strip())
    return " ".join(formatted_lines)


def format_live_site(text):
    # insert a newline between lowercase and uppercase letters
    formatted_text = re.sub(r"([a-z])([A-Z])", r"\1\n\2", text)
    # format the "What's included" items
    formatted_text = re.sub(
        r"([a-z])(\d+\.\d+[MK])", r"\1\n\2 ", formatted_text
    )
    # place headings in all caps on their own line
    formatted_text = format_headings(formatted_text)
    # add a space after ':', ';', ',', '!', '?' if they are followed by a character
    formatted_text = re.sub(r"([:;,!?])(\S)", r"\1 \2", formatted_text)
    return formatted_text


def extract_text_from_html(url):
    def remove_tags(soup):
        # drop style, script, code, and anchor tags
        for data in soup(["style", "script", "code", "a"]):
            data.decompose()
        # return the remaining visible text, whitespace-normalized
        return " ".join(soup.stripped_strings)

    try:
        r = requests.get(url)
        if r.status_code != 200:
            return "Unable to extract URL"
        soup = BeautifulSoup(r.content, "html.parser")
    except Exception:
        return "Unable to extract URL"

    text = remove_tags(soup)
    text = format_live_site(text)
    return text
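

# Hypothetical end-to-end usage sketch, not part of the original module: it
# assumes a local "sample.pdf" and a placeholder URL purely for illustration,
# and chains the helpers above the way a caller might.
if __name__ == "__main__":
    # PDF path: extract, normalize, then validate the token length.
    pdf_text = extract_text_from_pdf("sample.pdf")  # assumed local file
    pdf_text = remove_special_characters(remove_accents(pdf_text))
    print(len_validator(pdf_text))
    print(update_character_count(pdf_text))

    # URL path: scrape the page, clean it, and report its size.
    site_text = extract_text_from_html("https://example.com")  # placeholder URL
    print(update_character_count(site_text))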