Spaces:

cuongnguyen910
/

topic-clustering-global-dashboard

Build error

App Files Files Community

topic-clustering-global-dashboard / function /utils.py

cuongnguyen910

Upload folder using huggingface_hub

5120311 verified 4 months ago

raw

history blame contribute delete

3.21 kB

	import editdistance
	import requests
	import numpy as np
	import re
	from .clean_text import normalize_text
	# Default endpoint that get_sbert_embedding posts sentences to.
	# The address is hard-coded and sits in the private 10.0.0.0/8 range.
	URL_SBERT = "http://10.9.3.240:6789/sbert/encode_list"
	# NOTE(review): presumably the config-driven way to resolve this URL - confirm before re-enabling:
	# app_config.parse_url_api('api_sbert')

	def get_sbert_embedding(lst_sentence, url = URL_SBERT, timeout=None):
	    """Request embeddings for a list of sentences from the SBERT HTTP service.

	    The sentences are sent as the JSON body ``{"sentences": lst_sentence}``
	    in a POST request, and the decoded JSON response is wrapped in a NumPy
	    array.

	    Args:
	        lst_sentence: Sentences to encode; must be JSON-serialisable.
	        url: Endpoint to post to. Defaults to ``URL_SBERT``.
	        timeout: Optional timeout (seconds, or a ``(connect, read)`` tuple)
	            forwarded to ``requests.post``. ``None`` (the default) waits
	            indefinitely, which is the historical behaviour.

	    Returns:
	        ``numpy.ndarray`` built from the response payload.
	        NOTE(review): presumably one embedding row per input sentence -
	        confirm against the service contract.

	    Raises:
	        requests.HTTPError: If the service answers with a 4xx/5xx status.
	        requests.RequestException: On connection problems or timeouts.
	    """
	    response = requests.post(
	        url,
	        json={"sentences": lst_sentence},
	        timeout=timeout,
	    )
	    # Fail loudly on an error status instead of wrapping an error payload
	    # in an array and letting it surface far from the actual cause.
	    response.raise_for_status()
	    return np.array(response.json())

	def is_number(word):
	    """Tell whether ``word`` consists only of digits once decorations are removed.

	    The word is lower-cased, then every occurrence of '$', '%', 'vnđ', '.'
	    and ',' is deleted (anywhere in the word, in that order). What remains
	    must be a non-empty run of digits.

	    Args:
	        word: A single token.

	    Returns:
	        True if the stripped token passes ``str.isdigit``, else False.
	    """
	    stripped = word.lower()
	    # Order matters: each decoration is removed from the result of the previous pass.
	    for decoration in ('$', '%', 'vnđ', '.', ','):
	        stripped = stripped.replace(decoration, '')
	    return stripped.isdigit()


	def get_number(text):
	    """Return the first numeric-looking token of ``text``.

	    ``text`` is split on single spaces and each piece is checked with
	    ``is_number``; the first piece that qualifies is returned unchanged
	    (currency/percent decorations included).

	    Args:
	        text: Input string.

	    Returns:
	        The first matching token, or '' when no token qualifies.
	    """
	    return next((token for token in text.split(' ') if is_number(token)), '')


	def check_editdistance(ww1, ww2):
	    """Return a case-insensitive similarity score between two strings.

	    The score is ``1 - distance / longest_length`` where ``distance`` is
	    the edit distance reported by ``editdistance.eval`` on the lower-cased
	    inputs. Identical strings (ignoring case) score 1.

	    Args:
	        ww1: First string.
	        ww2: Second string.

	    Returns:
	        0 if either string is empty, otherwise a float similarity score.
	    """
	    # Guard BOTH operands: the previous code tested ww1 twice, so an empty
	    # ww2 was never caught by this check.
	    if len(ww1) == 0 or len(ww2) == 0:
	        return 0

	    left, right = ww1.lower(), ww2.lower()
	    distance = editdistance.eval(left, right)
	    # Normalise by the lengths of the strings that were actually compared;
	    # lower-casing can change a string's length for some characters, and
	    # using the original lengths could push the score outside [0, 1].
	    return 1 - distance / max(len(left), len(right))


	def remove_image_keyword(text_input):
	    """Blank out photo-credit, source and "read more" markers in a text.

	    Each marker below is replaced by a single space, one marker after the
	    other in the listed order, and the result is stripped of leading and
	    trailing whitespace. Matching is exact and case-sensitive, which is
	    why several capitalisation and spacing variants are spelled out.

	    Args:
	        text_input: Text to clean.

	    Returns:
	        The text with all markers replaced and outer whitespace removed.
	    """
	    markers = (
	        # "ảnh" (photo) credits
	        "ảnh:", "ảnh :", "Ảnh:", "Ảnh :",
	        # "ảnh minh họa" (illustrative photo) credits
	        "Ảnh minh họa:", "Ảnh minh họa :", "ảnh minh họa:", "ảnh minh họa :",
	        # "nguồn" (source) credits
	        "Nguồn:", "nguồn:", "Nguồn :", "nguồn :",
	        # English source / image markers
	        "Source:", "Source :", "source:", "source :",
	        "Src:", "Src :", "src:", "src :",
	        "Image:", "Image :", "img:", "img :",
	        "image:", "image :", "Img:", "Img :",
	        # "xem tiếp" / "xem thêm" (read more) teasers
	        "xem tiếp", "xem thêm", "Xem tiếp", "Xem thêm",
	    )
	    cleaned = text_input
	    for marker in markers:
	        if marker in cleaned:
	            cleaned = cleaned.replace(marker, " ")
	    return cleaned.strip()

	def clean_text(text_in, normalize=True):
	    """Strip markup, script remnants, URLs and control whitespace from text.

	    Steps, applied in this order:
	      1. Delete ``<...>`` tags (non-greedy, within a single line).
	      2. Replace everything from the word ``function`` up to the last ``}``
	         on the same line with a space.
	      3. Replace URLs with a space: first URLs directly preceded by the
	         Vietnamese credit word "Nguồn" ("source"), then bare URLs. A URL
	         is only recognised up to one of a fixed set of endings
	         (``.htm``/``.html``, ``.vn``, ``.net``, ``.vgp``, or a second
	         ``//`` for https).
	      4. Replace newline, tab and carriage-return characters with spaces.
	      5. Optionally run the result through ``normalize_text``.

	    Args:
	        text_in: Raw text to clean.
	        normalize: When true (the default), the cleaned text is passed to
	            ``normalize_text`` before being returned.

	    Returns:
	        The cleaned string.
	    """
	    doc = re.sub(r'<.*?>', '', text_in)
	    doc = re.sub(r'(function).*}', ' ', doc)

	    # URL rules, kept in their historical order because each substitution
	    # works on the output of the previous one. ``\.html?`` replaces the old
	    # separate ".htm" / ".html" rules: the ".htm" rule used to run first and
	    # left a stray "l" behind for every ".html" link.
	    url_rules = (
	        r'http://.*?\.html?',
	        r'https://.*?//',
	        r'https://.*?\.html?',
	        r'https://.*?\.vn',
	        r'https://.*?\.net',
	        r'https://.*?\.vgp',
	        r'http://.*?\.vgp',
	    )
	    # First pass removes "Nguồn <url>" as a unit, second pass removes bare
	    # URLs. The prefixed rules previously used ``.?`` (at most one
	    # character) between scheme and ending, so they practically never fired.
	    for prefix in (r'Nguồn\s?', ''):
	        for rule in url_rules:
	            doc = re.sub(prefix + rule, ' ', doc)

	    # Flatten line breaks and tabs so the text ends up on a single line.
	    doc = re.sub(r'[\n\t\r]', ' ', doc)

	    if normalize:
	        doc = normalize_text(doc)
	    return doc

	if __name__ == '__main__':
	    # Manual smoke check: print the similarity score of two sample strings.
	    sample_score = check_editdistance('tttt', 'tt')
	    print(sample_score)