import re

import fitz
import requests
import yaml
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from unidecode import unidecode

with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)
# access_token = params['HF_TOKEN']

def remove_accents(input_str):
    # Transliterate accented characters to their closest ASCII equivalents.
    return unidecode(input_str)

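# Hypothetical example of the transliteration above:
#   remove_accents("Café") -> "Cafe"
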
def remove_special_characters(text):
    # Strip URLs.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Strip emoji and other pictographic symbols.
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F700-\U0001F77F"  # alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub('', text)
    # Strip hashtags.
    text = re.sub(r'#\w+', '', text)
    # Keep word characters, whitespace, and common punctuation; the hyphen
    # sits last in the class so it is treated literally, not as a range.
    text = re.sub(r'[^\w\s.,!?\'"();-]', '', text)
    # Remove spaces before punctuation, then ensure one space after it.
    text = re.sub(r'\s+([.,!?;])', r'\1', text)
    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
    # Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

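# Hypothetical example of the cleanup above:
#   remove_special_characters("Hello 😀 world https://x.co #tag !")
#   -> "Hello world!"
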
def remove_special_characters_2(text):
    # Keep only alphanumeric characters and spaces.
    return re.sub(r"[^a-zA-Z0-9 ]+", "", text)

def update_character_count(text):
    # Return the character count as a display string.
    return f"{len(text)} characters"

# Tokenizer used by len_validator to measure input length.
text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)

def len_validator(text):
    min_tokens = 200
    # Token count of the input under the classifier's tokenizer.
    length = len(text_bc_tokenizer.tokenize(text))
    if length < min_tokens:
        return (
            f"Warning! Input length is {length}. Please input a text that is "
            f"greater than {min_tokens} tokens long. Recommended length is "
            f"{min_tokens * 2} tokens."
        )
    return f"Input length ({length}) is satisfied."

def extract_text_from_pdf(pdf_path):
    # Concatenate the plain text of every page in the PDF.
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    doc.close()
    return text

def format_headings(text):
    # Group consecutive ALL-CAPS words into headings and put each heading
    # on its own line; short ALL-CAPS runs (e.g. acronyms) stay inline.
    words = text.split(" ")
    formatted_words = []
    heading = ""
    for word in words:
        if word and word.isupper():
            heading += word + " "
        else:
            if heading != "" and len(heading) > 10:
                # A trailing lone "A" usually starts the next sentence,
                # so keep it with the body text rather than the heading.
                formatted = (
                    "\n" + heading[:-2] + "\n" + heading[-2:]
                    if heading.strip().endswith(" A")
                    else "\n" + heading + "\n"
                )
                formatted_words.append(formatted.strip(" "))
            elif heading != "":
                formatted_words.append(heading.strip())
            formatted_words.append(word.strip())
            heading = ""
    if heading:
        # Flush a heading that ends the text.
        formatted_words.append(heading.strip())
    return " ".join(formatted_words)

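# Hypothetical example of the heading isolation above:
#   format_headings("Our tools PRICING PLANS Choose a plan")
#   -> "Our tools \nPRICING PLANS \n Choose a plan"
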
def format_live_site(text):
    # Insert a newline between a lowercase letter and the uppercase letter
    # that follows it (lines joined together during scraping).
    formatted_text = re.sub(r"([a-z])([A-Z])", r"\1\n\2", text)
    # Format the "What's included" items (counts such as "1.2M" or "3.4K").
    formatted_text = re.sub(r"([a-z])(\d+\.\d+[MK])", r"\1\n\2 ", formatted_text)
    # Place ALL-CAPS headings on their own lines.
    formatted_text = format_headings(formatted_text)
    # Add a space after ':', ';', ',', '!', '?' when followed by a character.
    formatted_text = re.sub(r"([:;,!?])(\S)", r"\1 \2", formatted_text)
    return formatted_text

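# Hypothetical example of the line splitting above:
#   format_live_site("Read moreOur plans") -> "Read more\nOur plans"
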
def extract_text_from_html(url):
    try:
        r = requests.get(url)
        # Treat non-2xx responses as failures as well.
        r.raise_for_status()
    except Exception:
        return "Unable to extract URL"
    soup = BeautifulSoup(r.content, "html.parser")

    def remove_tags(soup):
        # Drop markup that carries no prose content.
        for data in soup(["style", "script", "code", "a"]):
            data.decompose()
        # Join the remaining visible text fragments.
        return " ".join(soup.stripped_strings)

    text = remove_tags(soup)
    text = format_live_site(text)
    return text
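
# A minimal, hypothetical usage sketch. Assumes config.yaml provides
# TEXT_BC_MODEL_PATH and that the URL below stands in for a real page;
# neither the URL nor this block is part of the module's public interface.
if __name__ == "__main__":
    raw = extract_text_from_html("https://example.com")
    if raw != "Unable to extract URL":
        cleaned = remove_special_characters(remove_accents(raw))
        print(update_character_count(cleaned))
        print(len_validator(cleaned))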