from flair.nn import Classifier
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.tokenization import SegtokTokenizer
from segtok.segmenter import split_single
from collections import Counter
import pandas as pd
import os

# Load the OntoNotes NER model and the default part-of-speech tagger once, at module level.
ner_tagger = SequenceTagger.load("flair/ner-english-ontonotes")
pos_tagger = Classifier.load("pos")


def get_named_entities(text: str, tagger=ner_tagger):
    """Split the text into sentences, run NER, and return the tagged spans as strings."""
    sentences = [Sentence(sent, use_tokenizer=True) for sent in split_single(text)]
    tagger.predict(sentences)
    entities = []
    for sent in sentences:
        for entity in sent.get_spans("ner"):
            entities.append(str(entity))
    return entities


def get_most_frequent_words(dataset: str, k=10):
    """Return the k most common whitespace-separated words in the dataset string."""
    split_str = dataset.split()
    counter = Counter(split_str)
    most_frequent = counter.most_common(k)
    return most_frequent


# POS categories: https://huggingface.co/flair/pos-english
def get_parts_of_sentence(text: str, tagger=pos_tagger):
    """Run part-of-speech tagging on the text and return the annotated Sentence."""
    sentence = Sentence(text)
    tagger.predict(sentence)
    return sentence


# path_stem = os.path.join("datasets")
# file_name = "ch3_colour_data_viz_suggestions_set_2"
# ner_output_path = os.path.join(path_stem, f"{file_name}_ner.csv")
# df = pd.read_csv(ner_output_path)
# df = df.head(3)
# ner_dataset = df["alma_metadata"].to_list()
# ner_dataset = " ".join(ner_dataset)
# tokenizer = SegtokTokenizer()  # concrete flair tokenizer; the abstract Tokenizer base class cannot be instantiated
# tokens = tokenizer.tokenize(ner_dataset)
# print(tokens)
#
# most_common = get_most_frequent_words(ner_dataset)
# print(most_common)
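
# Minimal usage sketch for the three helpers above. The sample string is
# hypothetical and only illustrates the call pattern; the pretrained models
# must have downloaded successfully for these lines to run.
# sample_text = "Barack Obama visited Paris in 2015."
# print(get_named_entities(sample_text))
# print(get_parts_of_sentence(sample_text).to_tagged_string())
# print(get_most_frequent_words(sample_text, k=3))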