import nltk
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, pipeline

nltk.download('punkt_tab')
nltk.download('punkt')


class WBGDocTopic:
    """
    A class to handle document topic suggestion using multiple pre-trained text classification models.

    This class loads a set of text classification models from Hugging Face's model hub and
    provides a method to suggest topics for input documents based on the aggregated classification
    results from all the models.

    Attributes:
    -----------
    classifiers : dict
        A dictionary mapping model names to their corresponding classification pipelines. It holds
        instances of Hugging Face's `pipeline` used for text classification.
    device : str
        The device on which the pipelines run (e.g., "cpu" or "cuda"). If None, the default
        device placement of `transformers` is used.

    Methods:
    --------
    __init__(classifiers: dict = None, device: str = None)
        Initializes the `WBGDocTopic` instance. If no classifiers are provided, it loads a default
        set of classifiers by calling `load_classifiers`.

    load_classifiers()
        Loads a predefined set of document topic classifiers into the `classifiers` dictionary.
        It uses `tqdm` to display progress as the classifiers are loaded.

    suggest_topics(input_docs: str | list[str]) -> list
        Suggests topics for the given document or list of documents. It runs each document
        through all classifiers, averages their scores per topic, and returns one result per
        document containing the mean and standard deviation of the topic scores.

    Parameters:
    -----------
    input_docs : str or list of str
        A single document or a list of documents for which to suggest topics.

    Returns:
    --------
    list
        A list with one entry per document, where each entry is a list of records holding a topic
        label together with the mean and standard deviation of its classification scores.
    """

    def __init__(self, classifiers: dict = None, device: str = None):
        self.classifiers = classifiers or {}
        self.device = device

        if classifiers is None:
            self.load_classifiers()

    def load_classifiers(self):
        # Default ensemble: models fine-tuned on different eval/train fold pairs.
        num_evals = 5
        num_train = 5

        # All checkpoints share the same tokenizer, so load it once and reuse it.
        tokenizer = AutoTokenizer.from_pretrained("avsolatorio/doc-topic-model_eval-04_train-03")

        for i in tqdm(range(num_evals)):
            for j in tqdm(range(num_train)):
                # Skip combinations where the eval and train fold indices coincide.
                if i == j:
                    continue

                model_name = f"avsolatorio/doc-topic-model_eval-{i:02}_train-{j:02}"
                classifier = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None, device=self.device)

                self.classifiers[model_name] = classifier

    def suggest_topics(self, input_docs: str | list[str]):
        if isinstance(input_docs, str):
            input_docs = [input_docs]

        # Collect one DataFrame of label scores per classifier for each document.
        doc_outs = {i: [] for i in range(len(input_docs))}
        topics = []

        for _, classifier in self.classifiers.items():
            for doc_idx, doc in enumerate(classifier(input_docs)):
                doc_outs[doc_idx].append(pd.DataFrame.from_records(doc, index="label"))

        # Aggregate across the ensemble: mean and standard deviation of each topic score.
        for doc_idx, outs in doc_outs.items():
            all_scores = pd.concat(outs, axis=1)
            mean_probs = all_scores.mean(axis=1).sort_values(ascending=False)
            std_probs = all_scores.std(axis=1).loc[mean_probs.index]
            output = pd.DataFrame({"score_mean": mean_probs, "score_std": std_probs})

            output["doc_idx"] = doc_idx
            output.reset_index(inplace=True)

            topics.append(output.to_dict(orient="records"))

        return topics
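

# A minimal usage sketch, not part of the original module: it assumes the default
# `avsolatorio/doc-topic-model_*` checkpoints are downloadable and uses a purely illustrative
# sample document. Each element of the returned list corresponds to one input document and
# holds per-topic records with the keys "label", "score_mean", "score_std", and "doc_idx",
# sorted by "score_mean" in descending order.
if __name__ == "__main__":
    doc_topic = WBGDocTopic(device="cpu")  # device is optional; None lets transformers decide

    sample_doc = (
        "The project aims to expand access to renewable energy in rural areas "
        "and strengthen the capacity of local institutions to manage the grid."
    )

    suggestions = doc_topic.suggest_topics(sample_doc)

    # Print the five highest-scoring topics for the single input document.
    for record in suggestions[0][:5]:
        print(f"{record['label']}: {record['score_mean']:.4f} +/- {record['score_std']:.4f}")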