import nltk
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, pipeline

nltk.download('punkt_tab')
nltk.download('punkt')


class WBGDocTopic:
    """
    A class to handle document topic suggestion using multiple pre-trained text classification models.

    This class loads a set of text classification models from Hugging Face's model hub and
    provides a method to suggest topics for input documents based on the aggregated classification
    results from all the models.

    Attributes:
    -----------
    classifiers : dict
        A dictionary mapping model names to their corresponding classification pipelines. It holds
        instances of Hugging Face's `pipeline` used for text classification.
    device : str
        The device on which the pipelines run (e.g., "cpu" or "cuda"). If None, the default
        device placement of `transformers` is used.

    Methods:
    --------
    __init__(classifiers: dict = None, device: str = None)
        Initializes the `WBGDocTopic` instance. If no classifiers are provided, it loads a default
        set of classifiers by calling `load_classifiers`.

    load_classifiers()
        Loads a predefined set of document topic classifiers into the `classifiers` dictionary.
        It uses `tqdm` to display progress as the classifiers are loaded.

    suggest_topics(input_docs: str | list[str]) -> list
        Suggests topics for the given document or list of documents. It runs each document
        through all classifiers, averages their scores per topic, and returns one result per
        document containing the mean and standard deviation of the topic scores.

    Parameters:
    -----------
    input_docs : str or list of str
        A single document or a list of documents for which to suggest topics.

    Returns:
    --------
    list
        A list with one entry per document, where each entry is a list of records holding a topic
        label together with the mean and standard deviation of its classification scores.
    """

    def __init__(self, classifiers: dict = None, device: str = None):
        self.classifiers = classifiers or {}
        self.device = device

        if classifiers is None:
            self.load_classifiers()

    def load_classifiers(self):
        # Default ensemble: models fine-tuned on different eval/train fold pairs.
        num_evals = 5
        num_train = 5

        # All checkpoints share the same tokenizer, so load it once and reuse it.
        tokenizer = AutoTokenizer.from_pretrained("avsolatorio/doc-topic-model_eval-04_train-03")

        for i in tqdm(range(num_evals)):
            for j in tqdm(range(num_train)):
                # Skip combinations where the eval and train fold indices coincide.
                if i == j:
                    continue

                model_name = f"avsolatorio/doc-topic-model_eval-{i:02}_train-{j:02}"
                classifier = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None, device=self.device)

                self.classifiers[model_name] = classifier

    def suggest_topics(self, input_docs: str | list[str]):
        if isinstance(input_docs, str):
            input_docs = [input_docs]

        # Collect one DataFrame of label scores per classifier for each document.
        doc_outs = {i: [] for i in range(len(input_docs))}
        topics = []

        for _, classifier in self.classifiers.items():
            for doc_idx, doc in enumerate(classifier(input_docs)):
                doc_outs[doc_idx].append(pd.DataFrame.from_records(doc, index="label"))

        # Aggregate across the ensemble: mean and standard deviation of each topic score.
        for doc_idx, outs in doc_outs.items():
            all_scores = pd.concat(outs, axis=1)
            mean_probs = all_scores.mean(axis=1).sort_values(ascending=False)
            std_probs = all_scores.std(axis=1).loc[mean_probs.index]
            output = pd.DataFrame({"score_mean": mean_probs, "score_std": std_probs})

            output["doc_idx"] = doc_idx
            output.reset_index(inplace=True)

            topics.append(output.to_dict(orient="records"))

        return topics
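

# A minimal usage sketch, not part of the original module: it assumes the default
# `avsolatorio/doc-topic-model_*` checkpoints are downloadable and uses a purely illustrative
# sample document. Each element of the returned list corresponds to one input document and
# holds per-topic records with the keys "label", "score_mean", "score_std", and "doc_idx",
# sorted by "score_mean" in descending order.
if __name__ == "__main__":
    doc_topic = WBGDocTopic(device="cpu")  # device is optional; None lets transformers decide

    sample_doc = (
        "The project aims to expand access to renewable energy in rural areas "
        "and strengthen the capacity of local institutions to manage the grid."
    )

    suggestions = doc_topic.suggest_topics(sample_doc)

    # Print the five highest-scoring topics for the single input document.
    for record in suggestions[0][:5]:
        print(f"{record['label']}: {record['score_mean']:.4f} +/- {record['score_std']:.4f}")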