"""Keyword-extraction helpers: an NLTK POS-based salient-token tagger and a
BERT token-classification extractor."""

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def nltk_speech_tag(sentence):
    """Reduce a sentence to its salient (noun/adjective/foreign-word) tokens,
    re-punctuated with commas and periods based on the original sentence."""
    # Data needed by word_tokenize, pos_tag, and stopwords below.
    nltk.download("punkt", quiet=True)
    nltk.download("averaged_perceptron_tagger", quiet=True)
    nltk.download("stopwords", quiet=True)

    tokens = word_tokenize(sentence)

    # Drop stop words and non-alphanumeric tokens (punctuation, symbols).
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [
        word for word in tokens if word.lower() not in stop_words and word.isalnum()
    ]

    tagged_tokens = pos_tag(filtered_tokens)

    # Keep nouns, adjectives, and foreign words. pos_tag emits Penn Treebank
    # tags, so "ADJ" never actually matches; "JJ" is the adjective tag.
    salient = [
        (token, pos)
        for token, pos in tagged_tokens
        if pos in ["NN", "NNP", "NNS", "NNPS", "ADJ", "JJ", "FW"]
    ]
    # Deduplicate while preserving order; list(set(...)) would scramble the
    # ordering that the index arithmetic below relies on. Track each token's
    # POS tag in parallel so the adjective/noun check stays aligned.
    seen = set()
    salient_tokens, salient_pos = [], []
    for token, pos in salient:
        if token not in seen:
            seen.add(token)
            salient_tokens.append(token)
            salient_pos.append(pos)

    comma_period_indices = [i for i, char in enumerate(sentence) if char in [",", "."]]
    salient_tokens_indices = [sentence.index(token) for token in salient_tokens]

out = ""
|
|
for i, index in enumerate(salient_tokens_indices):
|
|
out += salient_tokens[i]
|
|
distance_between_next = (
|
|
salient_tokens_indices[i + 1] - index
|
|
if i + 1 < len(salient_tokens_indices)
|
|
else None
|
|
)
|
|
|
|
puncuated = False
|
|
if not distance_between_next:
|
|
puncuated = True
|
|
else:
|
|
for i in range(index, index + distance_between_next):
|
|
if i in comma_period_indices:
|
|
puncuated = True
|
|
break
|
|
|
|
if not puncuated:
|
|
|
|
if (
|
|
i > 0
|
|
and tagged_tokens[i - 1][1] in ["JJ", "ADJ"]
|
|
and tagged_tokens[i][1] in ["NN", "NNP", "NNS", "NNPS"]
|
|
):
|
|
out += " "
|
|
else:
|
|
out += ", "
|
|
else:
|
|
out += ". "
|
|
|
|
|
|
out += sentence[-1]
|
|
|
|
|
|
return out.strip().strip(",").strip(".").strip()
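
# Usage sketch (assumed input; the exact output depends on NLTK's tagger):
#
#     nltk_speech_tag("The quick brown fox jumps over the lazy dog.")
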
def extract_keywords(text: str) -> str:
    """Return keywords from text as a comma-separated string, using a BERT
    model fine-tuned for keyword extraction."""
    tokenizer = AutoTokenizer.from_pretrained("yanekyuk/bert-keyword-extractor")
    model = AutoModelForTokenClassification.from_pretrained(
        "yanekyuk/bert-keyword-extractor"
    )
    print(f"Extracting keywords from text: {text}")

    # Flatten line breaks and tabs so each split "sentence" is one clean line.
    for char in ["\n", "\t", "\r"]:
        text = text.replace(char, " ")

    sentences = text.split(".")
    result = ""

    for sentence in sentences:
        print(f"Extracting keywords from sentence: {sentence}")
        inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_token_class_ids = logits.argmax(dim=-1)

        # Keep tokens the model assigns to the keyword class (id 1). These
        # are WordPiece tokens, so words split into multiple pieces surface
        # with "##" continuation markers.
        predicted_keywords = []
        for token_id, token in zip(
            predicted_token_class_ids[0],
            tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]),
        ):
            if token_id == 1:
                predicted_keywords.append(token)

        print(f"Extracted keywords: {predicted_keywords}")
        if predicted_keywords:
            result += ", ".join(predicted_keywords) + ", "

    print(f"All Keywords: {result}")
    return result.rstrip(", ")
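

if __name__ == "__main__":
    # Minimal smoke test, a sketch only: the sample text is an assumption,
    # and outputs depend on the downloaded NLTK data and the Hugging Face
    # model "yanekyuk/bert-keyword-extractor".
    sample = "Transformer models extract contextual keywords. NLTK provides classic text tools."
    print(nltk_speech_tag(sample))
    print(extract_keywords(sample))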