Spaces:

polygraf-ai
/

copyright_checker

Running

App Files Files Community

copyright_checker / utils.py

minko186

Update utils.py

4d41695 verified 4 months ago

raw

history blame contribute delete

4.41 kB

	import re
	import re
	from sentence_transformers import SentenceTransformer, util
	import re
	from unidecode import unidecode
	from transformers import AutoTokenizer
	import yaml
	import fitz
	import requests
	from bs4 import BeautifulSoup

	# Load the application settings once at import time. The encoding is given
	# explicitly so the YAML is decoded the same way on every platform instead
	# of depending on the locale's default encoding.
	with open("config.yaml", "r", encoding="utf-8") as file:
	    params = yaml.safe_load(file)

	# access_token = params['HF_TOKEN']

	def remove_accents(input_str):
	    """Return *input_str* after passing it through ``unidecode``."""
	    return unidecode(input_str)

	# Matches either an http(s):// URL or a bare www. address. The alternation
	# bar must be unescaped: an escaped bar only matches a literal "|" character,
	# which meant URLs were never actually removed.
	_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

	# NOTE(review): the final range (U+24C2..U+1F251) spans a large part of the
	# Unicode space, not only pictographs, so any character inside it is removed
	# as well. Kept as-is to preserve existing output; confirm whether intended.
	_EMOJI_PATTERN = re.compile(
	    "["
	    "\U0001F600-\U0001F64F"  # emoticons
	    "\U0001F300-\U0001F5FF"  # symbols & pictographs
	    "\U0001F680-\U0001F6FF"  # transport & map symbols
	    "\U0001F700-\U0001F77F"  # alchemical symbols
	    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
	    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
	    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
	    "\U0001FA00-\U0001FA6F"  # Chess Symbols
	    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
	    "\U00002702-\U000027B0"  # Dingbats
	    "\U000024C2-\U0001F251"
	    "]+"
	)

	_HASHTAG_PATTERN = re.compile(r'#\w+')

	# Everything that is NOT a word character, whitespace or one of the listed
	# punctuation marks. The previous class contained the range ")-;", which
	# (besides the digits) also admitted "*", "+", "/" and ":"; those characters
	# are listed explicitly here so the accepted set is unchanged but readable.
	_DISALLOWED_PATTERN = re.compile(r'[^\w\s.,!?\'"()*+\-/:;]')

	_SPACE_BEFORE_PUNCT = re.compile(r'\s+([.,!?;])')
	_PUNCT_BEFORE_TEXT = re.compile(r'([.,!?;])(\S)')
	_WHITESPACE_RUN = re.compile(r'\s+')


	def remove_special_characters(text):
	    """Strip URLs, emoji, hashtags and stray symbols, then tidy spacing.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string with whitespace collapsed to single spaces,
	        no whitespace before ``. , ! ? ;`` and one space after them when
	        they are directly followed by a non-space character.
	    """
	    text = _URL_PATTERN.sub('', text)
	    text = _EMOJI_PATTERN.sub('', text)
	    text = _HASHTAG_PATTERN.sub('', text)
	    text = _DISALLOWED_PATTERN.sub('', text)
	    # Normalise spacing around sentence punctuation.
	    text = _SPACE_BEFORE_PUNCT.sub(r'\1', text)
	    text = _PUNCT_BEFORE_TEXT.sub(r'\1 \2', text)
	    return _WHITESPACE_RUN.sub(' ', text).strip()

	def remove_special_characters_2(text):
	    """Drop every character that is not an ASCII letter, digit or space."""
	    non_alphanumeric = re.compile(r"[^a-zA-Z0-9 ]+")
	    return non_alphanumeric.sub("", text)


	def update_character_count(text):
	    """Return a label such as ``"12 characters"`` for the length of *text*."""
	    count = len(text)
	    return f"{count} characters"


	# `params` is already loaded from config.yaml near the top of this module,
	# so the file is not opened and parsed a second time here.
	text_bc_model_path = params["TEXT_BC_MODEL_PATH"]
	# Created once at import time; len_validator uses it to count tokens.
	text_bc_tokenizer = AutoTokenizer.from_pretrained(text_bc_model_path)


	def len_validator(text, min_tokens=200):
	    """Report whether *text* is long enough, measured in tokenizer tokens.

	    Tokens are counted with the module-level ``text_bc_tokenizer``.

	    Args:
	        text: The input string to measure.
	        min_tokens: Minimum acceptable number of tokens. The warning message
	            recommends twice this amount. Defaults to 200.

	    Returns:
	        A human-readable message: a warning when the input has fewer than
	        ``min_tokens`` tokens, otherwise a confirmation.
	    """
	    # Only the number of tokens is needed, so no tensor conversion option
	    # (``return_tensors``) is passed to tokenize().
	    token_count = len(text_bc_tokenizer.tokenize(text=text))
	    if token_count < min_tokens:
	        return f"Warning! Input length is {token_count}. Please input a text that is greater than {min_tokens} tokens long. Recommended length {min_tokens*2} tokens."
	    return f"Input length ({token_count}) is satisfied."


	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        pdf_path: Path of the PDF file, passed to ``fitz.open``.

	    Returns:
	        A single string holding the text of all pages.
	    """
	    # The context manager closes the document even if a page fails to
	    # extract; previously the handle was never closed.
	    with fitz.open(pdf_path) as doc:
	        return "".join(page.get_text() for page in doc)


	def _render_heading(heading):
	    """Return the output form of an accumulated run of upper-case words.

	    *heading* is the non-empty run as built by ``format_headings``: the words
	    joined by single spaces with one trailing space.
	    """
	    if len(heading) <= 10:
	        # Short runs are treated as ordinary words, not headings.
	        return heading.strip()
	    if heading.strip().endswith(" A"):
	        # A trailing lone "A" is split off onto the line after the heading.
	        rendered = "\n" + heading[:-2] + "\n" + heading[-2:]
	    else:
	        rendered = "\n" + heading + "\n"
	    return rendered.strip(" ")


	def format_headings(text):
	    """Put runs of upper-case words on their own line.

	    The text is split on single spaces. Consecutive upper-case words are
	    collected into a heading; when the run ends it is emitted surrounded by
	    newlines if it is longer than 10 characters, otherwise emitted inline.

	    Args:
	        text: The text to format.

	    Returns:
	        The reformatted text, with the pieces re-joined by single spaces.
	    """
	    pieces = []
	    heading = ""
	    for word in text.split(" "):
	        if word and word.isupper():
	            heading += word + " "
	            continue
	        if heading:
	            pieces.append(_render_heading(heading))
	            heading = ""
	        pieces.append(word.strip())
	    # Flush a heading that runs to the very end of the text; without this
	    # trailing upper-case words were silently dropped from the output.
	    if heading:
	        pieces.append(_render_heading(heading))
	    return " ".join(pieces)


	def format_live_site(text):
	    """Reformat text scraped from a live page into more readable lines."""
	    # A lowercase letter directly followed by an uppercase one gets a
	    # newline inserted between them.
	    split_on_case = re.sub(r"([a-z])([A-Z])", r"\1\n\2", text)
	    # "What's included" items: a figure such as 1.5M or 2.0K that directly
	    # follows a lowercase letter starts a new line and is followed by a space.
	    split_on_figures = re.sub(
	        r"([a-z])(\d+\.\d+[MK])", r"\1\n\2 ", split_on_case
	    )
	    # All-caps headings go on their own line.
	    with_headings = format_headings(split_on_figures)
	    # Add a space after ':', ';', ',', '!', '?' when a character follows.
	    return re.sub(r"([:;,!?])(\S)", r"\1 \2", with_headings)


	def extract_text_from_html(url, timeout=10):
	    """Download a web page and return its visible text, lightly formatted.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up, passed to
	            ``requests.get``. Defaults to 10.

	    Returns:
	        The page text with ``style``, ``script``, ``code`` and ``a`` elements
	        removed and passed through ``format_live_site``, or the string
	        ``"Unable to extract URL"`` when the page cannot be fetched or parsed
	        or the server does not answer with status 200.
	    """
	    failure_message = "Unable to extract URL"
	    try:
	        response = requests.get(url, timeout=timeout)
	        if response.status_code != 200:
	            # Previously this case fell through with no parsed document and
	            # crashed with UnboundLocalError further down.
	            return failure_message
	        soup = BeautifulSoup(response.content, "html.parser")
	    except Exception:
	        # Deliberately broad: any fetch or parse problem is reported to the
	        # caller as the same best-effort failure message.
	        return failure_message

	    # Drop elements whose content should not appear in the extracted text.
	    for tag in soup(["style", "script", "code", "a"]):
	        tag.decompose()
	    text = " ".join(soup.stripped_strings)
	    return format_live_site(text)