import re

import fitz
import requests
import yaml
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from unidecode import unidecode

with open("config.yaml", "r") as file:
    params = yaml.safe_load(file)

# access_token = params['HF_TOKEN']

def remove_accents(input_str):
    # transliterate accented/Unicode characters to their closest ASCII equivalents
    text_no_accents = unidecode(input_str)
    return text_no_accents

def remove_special_characters(text):
    # strip URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F700-\U0001F77F"  # alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251"  # additional symbol ranges
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub('', text)
    # strip hashtags
    text = re.sub(r'#\w+', '', text)
    # drop anything that is not a word character, digit, whitespace, or common punctuation
    text = re.sub(r'[^\w\s\d.,!?\'"()\-;]', '', text)
    # remove spaces before punctuation and ensure a single space after it
    text = re.sub(r'\s+([.,!?;])', r'\1', text)
    text = re.sub(r'([.,!?;])(\S)', r'\1 \2', text)
    # collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def remove_special_characters_2(text):
    # keep only ASCII letters, digits, and spaces
    pattern = r"[^a-zA-Z0-9 ]+"
    text = re.sub(pattern, "", text)
    return text


def update_character_count(text):
    return f"{len(text)} characters"


text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)


def len_validator(text):
    min_tokens = 200
    length = len(text_bc_tokenizer.tokenize(text))
    if length < min_tokens:
        return f"Warning! Input length is {length} tokens. Please input text longer than {min_tokens} tokens. Recommended length: {min_tokens * 2} tokens."
    else:
        return f"Input length ({length} tokens) is sufficient."


def extract_text_from_pdf(pdf_path):
    # concatenate the text of every page in the PDF
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


def format_headings(text):
    # Treat runs of consecutive ALL-CAPS words as headings and place them on
    # their own line; everything else is passed through unchanged.
    words = text.split(" ")
    formatted_lines = []
    heading = ""
    for word in words:
        if word and word.isupper():
            heading += word + " "
        else:
            if heading != "" and len(heading) > 10:
                # If the caps run ends with a lone "A", move that "A" to the
                # next line so it stays with the sentence that follows.
                formatted = (
                    "\n"
                    + heading[: len(heading) - 2]
                    + "\n"
                    + heading[len(heading) - 2 :]
                    if heading.strip().endswith(" A")
                    else "\n" + heading + "\n"
                )
                formatted_lines.append(formatted.strip(" "))
            elif heading != "":
                formatted_lines.append(heading.strip())
            formatted_lines.append(word.strip())
            heading = ""
    if heading != "":
        # flush a heading that ends the text
        formatted_lines.append(heading.strip())
    return " ".join(formatted_lines)


def format_live_site(text):
    # insert a newline between lowercase and uppercase letters
    formatted_text = re.sub(r"([a-z])([A-Z])", r"\1\n\2", text)
    # format the "What's included" items
    formatted_text = re.sub(
        r"([a-z])(\d+\.\d+[MK])", r"\1\n\2 ", formatted_text
    )
    # place headings in all caps on their own line
    formatted_text = format_headings(formatted_text)
    # add a space after ':', ';', ',', '!', '?' if they are followed by a character
    formatted_text = re.sub(r"([:;,!?])(\S)", r"\1 \2", formatted_text)
    return formatted_text


def extract_text_from_html(url):
    def remove_tags(soup):
        # drop style, script, code, and anchor tags
        for data in soup(["style", "script", "code", "a"]):
            data.decompose()
        # return the remaining visible text, whitespace-normalized
        return " ".join(soup.stripped_strings)

    try:
        r = requests.get(url)
        if r.status_code != 200:
            return "Unable to extract URL"
        soup = BeautifulSoup(r.content, "html.parser")
    except Exception:
        return "Unable to extract URL"

    text = remove_tags(soup)
    text = format_live_site(text)
    return text
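

# Hypothetical end-to-end usage sketch, not part of the original module: it
# assumes a local "sample.pdf" and a placeholder URL purely for illustration,
# and chains the helpers above the way a caller might.
if __name__ == "__main__":
    # PDF path: extract, normalize, then validate the token length.
    pdf_text = extract_text_from_pdf("sample.pdf")  # assumed local file
    pdf_text = remove_special_characters(remove_accents(pdf_text))
    print(len_validator(pdf_text))
    print(update_character_count(pdf_text))

    # URL path: scrape the page, clean it, and report its size.
    site_text = extract_text_from_html("https://example.com")  # placeholder URL
    print(update_character_count(site_text))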