# research/lrt/clustering/clusters.py
# Web file-viewer residue kept for provenance:
#   haoqi7's picture | Upload 47 files | 16188ba | raw | history blame | 2.68 kB
from typing import List, Iterable, Union
from pprint import pprint
class KeyphraseCount:
    '''
    A keyphrase together with the number of times it was counted.
    '''

    def __init__(self, keyphrase: str, count: int) -> None:
        super().__init__()
        self.keyphrase = keyphrase
        self.count = count

    def __repr__(self) -> str:
        # Debug-friendly representation, e.g. KeyphraseCount('a/b', 3).
        return f'{type(self).__name__}({self.keyphrase!r}, {self.count!r})'

    @classmethod
    def reduce(cls, kcs: Iterable['KeyphraseCount']) -> 'KeyphraseCount':
        '''
        Merge several KeyphraseCount objects into a single one.

        kcs: Iterable[KeyphraseCount]
        return: a KeyphraseCount whose keyphrase is the input keyphrases
                joined with '/' (in input order) and whose count is the sum
                of the input counts. An empty input yields
                KeyphraseCount('', 0) instead of raising IndexError.
        '''
        # Materialize once so that one-shot iterables (generators) can be
        # traversed for both the keyphrases and the counts.
        items = list(kcs)
        keys = '/'.join(kc.keyphrase for kc in items)
        count = sum(kc.count for kc in items)
        return KeyphraseCount(keys, count)
class SingleCluster:
    '''
    One cluster: a list of member ids plus a tally of keyphrases
    (keyphrase -> number of times it was added).
    '''

    def __init__(self):
        self.__keyphrases__ = {}
        self.__container__ = []

    def add(self, id: int):
        '''
        Append a member id to the cluster (duplicates are not filtered).
        '''
        self.__container__.append(id)

    def __str__(self) -> str:
        # Rendered exactly like the underlying list of member ids.
        return f'{self.__container__}'

    def elements(self) -> List:
        '''
        Return the internal list of member ids (not a copy).
        '''
        return self.__container__

    def get_keyphrases(self):
        '''
        Return one KeyphraseCount per distinct keyphrase, in the order the
        keyphrases were first added.
        '''
        return [
            KeyphraseCount(phrase, total)
            for phrase, total in self.__keyphrases__.items()
        ]

    def add_keyphrase(self, keyphrase: Union[str, Iterable]):
        '''
        Count a keyphrase, or every keyphrase inside a (possibly nested)
        iterable. Values that are neither str nor iterable are ignored.
        '''
        if isinstance(keyphrase, str):
            tally = self.__keyphrases__
            tally[keyphrase] = tally.get(keyphrase, 0) + 1
            return
        if isinstance(keyphrase, Iterable):
            # Recurse so that nested iterables are flattened into the tally.
            for item in keyphrase:
                self.add_keyphrase(item)

    def __len__(self):
        # Size of the cluster is the number of member ids.
        return len(self.__container__)

    def print_keyphrases(self):
        '''
        Pretty-print the keyphrase -> count mapping to stdout.
        '''
        pprint(self.__keyphrases__)
class ClusterList:
    '''
    A fixed-size collection of SingleCluster objects that can be indexed,
    sliced, iterated, measured with len() and sorted by cluster size.
    '''

    def __init__(self, k: int):
        '''
        k: number of (initially empty) clusters to create.
        '''
        self.__clusters__ = [SingleCluster() for _ in range(k)]

    # subscriptable and slice-able
    def __getitem__(self, idx):
        '''
        idx: an integer index or a slice.
        return: the cluster at that index, or a list of clusters for a slice.
        '''
        # Delegate to the underlying list. The previous implementation
        # replaced a missing slice step with 0, which made every slice
        # without an explicit step raise "slice step cannot be zero".
        return self.__clusters__[idx]

    def instantiate(self, labels: Iterable):
        '''
        Fill the clusters from a label sequence: the element at position i
        is added to the cluster whose index is labels[i].
        '''
        for index, label in enumerate(labels):
            self.__clusters__[label].add(index)

    def __str__(self):
        header = f'There are {len(self.__clusters__)} clusters:\n'
        body = ''.join(
            f'cluster {index} contains: {cluster}.\n'
            for index, cluster in enumerate(self.__clusters__)
        )
        return header + body

    # return an iterator that can be used in for loop etc.
    def __iter__(self):
        return iter(self.__clusters__)

    def __len__(self):
        return len(self.__clusters__)

    def sort(self):
        '''
        Sort the clusters in place, largest first (by len of each cluster).
        '''
        self.__clusters__.sort(key=len, reverse=True)