jinymusim
/

poet-validators

Model card Files Files and versions Community

File size: 20,217 Bytes

13cea7c

class StropheParams:
    """Static lookup tables describing strophe parameters (rhyme, verse end, year, meter, characters).

    The short names (RHYME, ENDS, POET_YEARS, YEAR, METER, CHARS) are aliases bound
    to the very same list objects as the long names defined right above them.

    RHYME_SCHEMES, POET_YEARS_BUCKETS and METER_TYPES end with ``None``: the vector
    builders in TextAnalysis (_rhyme_vector, _publish_year_vector, _metre_vector) set
    that last slot when the analysed value is not one of the listed entries.
    VERSE_ENDS also ends with ``None`` - presumably for the same reason; its consumer
    is not part of this file.
    """
    

    # Most Common Rhyme Schemas (Every Rhyme schema with presence over 0.36 %)
    # 'X' stands for a verse without a rhyme partner (see TextManipulation.rhyme_sec)
    RHYME_SCHEMES = ['ABAB', 'XXXX',
                 'XAXA','AABB', 
                 'XXXXXX','ABBA', 
                 'AAXX', 'AABBCC',
                 'ABABCC','ABABXX',
                 'AABCCB','XXAA', 
                 'XAAX', 'AXAX', 
                 'XAXAXX','XXABAB',
                 'ABBACC','AXAA', 
                 'XAABBX','AABCBC',
                 'AABBXX','ABBAXX',
                 'ABABAB','AAXA', 
                 'AXXA','XAXABB',
                 'XXAABB','XXAAXX',
                 'ABABAX','XXABBA',
                 'AAXBBX','XXXAXA',
                 'AAAX','XABABX',
                 'XABBAX','AAXXBB',
                 'AXABBX','ABABBX',
                 'XAAXBB','AAAA',
                 'XAAA','XAABXB',
                 'AXABXB','AXAXBB',
                  None]
    
    # Alias of RHYME_SCHEMES
    RHYME = RHYME_SCHEMES


    # Schemas without any unrhymed ('X') verse
    # NOTE(review): "ABBAAB" is listed here but is not part of RHYME_SCHEMES - confirm this is intended
    NORMAL_SCHEMES = ["ABAB", "ABBA", "AABB", "AABBCC", "ABABCC", "ABBACC", "ABBAAB"]

    # First 200 Most common endings
    VERSE_ENDS = ['ní', 'la', 'je', 'tí', 'ce', 'ti', 'ky', 'ku', 'li', 'jí', 'ně', 'né', 'vá', 'se', 'ny', 'ly', 'na', 'ne', 'nou', 
              'lo', 'ci', 'mi', 'ný', 'sti', 'ka', 'le', 'cí', 'ná', 'ží', 'čí', 'ho', 'dí', 'ší', 'du', 'lí', 'dy', 'nu', 'ří', 
              'ji', 'ru', 'tě', 'ře', 'stí', 'vy', 'ká', 'še', 'dá', 'ni', 'te', 'ví', 'mu', 'tu', 'ta', 'vé', 'val', 'va', 'lý', 
              'tá', 'že', 'ty', 'no', 'vu', 'lá', 'kem', 'chu', 'ků', 'bě', 'vý', 'sy', 'me', 'zí', 'hu', 'vě', 'lu', 'da', 'ry', 
              'rá', 'lé', 'ko', 'ři', 'de', 'hy', 'lem', 'tem', 'kou', 'vou', 'ši', 'há', 'sí', 'ze', 'be', 'ra', 'má', 'to', 'by', 
              'mě', 'su', 'té', 'si', 'ných', 'den', 'či', 'ký', 'ním', 'če', 'tý', 'ma', 'my', 'sem', 'nem', 'dě', 'ha', 'vat', 'ným', 
              'dem', 'dou', 'sta', 'dla', 'svět', 'zem', 'jen', 'dal', 'mí', 'hou', 'zas', 'sen', 'rem', 'nů', 'bu', 'e', 'ba', 'ké', 
              'til', 'jest', 'ství', 'děl', 'květ', 'tů', 'chem', 'lou', 'sám', 'bí', 'tou', 'dé', 'šel', 'nul', 'chá', 'vem', 'sa', 
              'hlas', 'pí', 'čas', 'dil', 'let', 'cích', 'lů', 'žil', 'mů', 'dál', 'cha', 'byl', 'nost', 'ček', 'zy', 'hý', 'nám', 'di', 
              'bou', 'tím', 'ži', 'tek', 'vil', 'jsem', 'sů', 'dech', 'men', 'tla', 'sá', 'zrak', 'chy', 'vám', 'vi', 'dý', 'rád', 'svou', 
              'ném', 've', 'py', 'vo', 'vým', 'nek', 'již', 'víc', 'kal', 'mé', 'dů', 'stá', 'dnes', 'sty', 'ven', None]
    # Alias of VERSE_ENDS
    ENDS = VERSE_ENDS
    # Years to bucket to (a year is assigned to the numerically closest bucket, see TextManipulation._year_bucketor)
    POET_YEARS_BUCKETS = [1800, 1820, 1840, 1860, 1880, 1900, 1920, 1940, 1960, None]
    # Aliases of POET_YEARS_BUCKETS
    POET_YEARS = POET_YEARS_BUCKETS
    YEAR = POET_YEARS_BUCKETS
    # Possible Meter Types (one-character codes)
    METER_TYPES = ["J","T","D","A","X","Y","N","H","P", None]
    # Alias of METER_TYPES
    METER = METER_TYPES
    # Translation of Meter to one char types
    # (one-char codes map to themselves, the spelled-out "hexameter"/"pentameter" map to "H"/"P")
    METER_TRANSLATE = {
        "J":"J",
        "T":"T",
        "D":"D",
        "A":"A",
        "X":"X",
        "Y":"Y",
        "hexameter": "H",
        "pentameter": "P",
        "N":"N"
    }
    
    # Basic Characters to consider in rhyme and syllables (43)
    # The first two entries are the empty string and a single space
    VALID_CHARS = [""," ",'a','á','b','c','č','d','ď','e','é','ě',
               'f','g','h','i','í','j','k','l','m','n','ň',
               'o','ó','p','q','r','ř','s','š','t','ť','u',
               'ú','ů','v','w','x','y','ý','z','ž']
    # Alias of VALID_CHARS
    CHARS = VALID_CHARS
class Tokens:
    """Special tokens of the tokenizer together with their fixed ids."""

    # End of sequence
    EOS = "<|EOS|>"
    EOS_ID = 0
    # Padding
    PAD = "<|PAD|>"
    PAD_ID = 1
    # Unknown token
    UNK = "<|UNK|>"
    UNK_ID = 2
    # Classification token
    CLS = "<|CLS|>"
    CLS_ID = 3
    # The separator is the very same token (and id) as EOS
    SEP = EOS
    SEP_ID = EOS_ID

    # Token string -> token id, in id order
    ALL_TOKENS = {
        EOS: EOS_ID,
        PAD: PAD_ID,
        UNK: UNK_ID,
        CLS: CLS_ID,
    }



import re
import numpy as np

def parse_boolean(value):
    """Interpret a textual flag as a boolean

    Args:
        value (str): Text to interpret, compared case-insensitively

    Returns:
        bool: True for "true", "yes", "y", "1" and "t"; False for every other string
    """
    truthy = ("true", "yes", "y", "1", "t")
    # Anything that is not an accepted "true" spelling (including unknown text) counts as False
    return value.lower() in truthy

class TextManipulation:
    """Static class for string manipulation methods

    Returns:
        _type_: str returned by all methods
    """
        
    @staticmethod
    def _remove_most_nonchar(raw_text, lower_case=True):
        """Remove most non-alpha non-whitespace characters

        Args:
            raw_text (str): Text to manipulate
            lower_case (bool, optional): If resulting text should be lowercase. Defaults to True.

        Returns:
            str: Cleaned up text
        """
        text = re.sub(r'[–\„\“\’\;\:()\]\[\_\*\‘\”\'\-\—\"]+', "", raw_text)
        return text.lower() if lower_case else text
    
    @staticmethod
    def _remove_all_nonchar(raw_text):
        """Remove all possible non-alpha characters

        Args:
            raw_text (str): Text to manipulate

        Returns:
            str: Cleaned up text
        """
        sub = re.sub(r'([^\w\s]+|[0-9]+)', '', raw_text)
        return sub
    
    @staticmethod
    def _year_bucketor(raw_year):
        """Snap a year string to the closest bucket of StropheParams.YEAR

        Args:
            raw_year (str): Year string to bucketize

        Returns:
            str: The closest bucket as string, or "NaN" when the input is "NaN"
                or is not accepted by TextAnalysis._is_year
        """
        if not TextAnalysis._is_year(raw_year) or raw_year == "NaN":
            return "NaN"
        # The trailing None of the bucket list is not a real year, leave it out
        buckets = StropheParams.YEAR[:-1]
        distances = np.abs(np.asarray(buckets) - int(raw_year))
        return str(buckets[np.argmin(distances)])
        
    # Rhyme markers, indexed by the offset of a rhyme id from the reference id (see rhyme_sec).
    # 'X' (no rhyme) is intentionally not part of this list.
    _RHYME_POS = ["A", "B", "C", "D", "E", "F", "G", "H"]
        
    @staticmethod
    def rhyme_sec(rhyme_ref, current_rhyme):
        """Return proper rhyme indicator to given reference
        
        Args:
            rhyme_ref (_type_): reference number of 'A'
            current_rhyme (_type_): current rhyme number that needs inidcation
            
        Returns:
            str: rhyme indicator character
        """
        
        return "X" if current_rhyme == None or current_rhyme== -1 or rhyme_ref == None or current_rhyme < rhyme_ref or current_rhyme >= rhyme_ref + len(TextManipulation._RHYME_POS) else TextManipulation._RHYME_POS[current_rhyme - rhyme_ref]
        
    @staticmethod
    def __post_process_rhyme(rhyme_str: str):
        # First Pass
        marker_count = {marker: rhyme_str.count(marker) for marker in TextManipulation._RHYME_POS}
        for key, val in marker_count.items():
            # Replace all, that ocurr only once with X
            if val == 1:
                rhyme_str = re.sub(key, 'X', rhyme_str)
        # Downscale higher to lower if lower not present
        marker_count = {marker: rhyme_str.count(marker) for marker in TextManipulation._RHYME_POS}
        for key, val in marker_count.items():
            if val > 1 and key != 'X':
                key_index = TextManipulation._RHYME_POS.index(key)
                replacements = {marker: rhyme_str.count(marker) for marker in TextManipulation._RHYME_POS[:key_index]}
                for rep_key, rep_val in replacements.items():
                    if rep_val ==0:
                        rhyme_str = re.sub(key, rep_key, rhyme_str)
                        break
                    
        # Pass to swap letters
        marker_index = {marker: rhyme_str.find(marker) for marker in TextManipulation._RHYME_POS if rhyme_str.find(marker) != -1}
        keys_values = marker_index.items()
        keys = [v[0] for v in keys_values]
        values = [v[1] for v in keys_values]
       
        i = 0
        while i < len(keys):
            j= 0
            while j< len(keys):
                if TextManipulation._RHYME_POS.index(keys[j]) > TextManipulation._RHYME_POS.index(keys[i]) and values[j] < values[i]:
                    # Swap the positions
                    rhyme_str = re.sub(keys[j], 'Z', rhyme_str)
                    rhyme_str = re.sub(keys[i], keys[j], rhyme_str)
                    rhyme_str = re.sub('Z', keys[i], rhyme_str)
                    # Need to update the value
                    temp = values[i]
                    values[i]= values[j]
                    values[j] = temp
                j+=1
            i+=1
        
            
        return rhyme_str
            
        
    @staticmethod
    def _rhyme_string(curr_rhyme_list):
        """Translate rhyme as list of rhyming number to rhyme schema

        Args:
            curr_rhyme_list (list): Current rhyme as list of ints indicating rhyming verses

        Returns:
            str: Rhyme schema
        """
        rhyme_list = curr_rhyme_list.copy()
        reference = None
        # Give None a blank -1 rhyme id
        for i in range(len(rhyme_list)):
            if rhyme_list[i] != None and reference == None:
                reference = rhyme_list[i]
            elif rhyme_list[i] != None and rhyme_list[i] < reference:
                reference = rhyme_list[i]
            elif rhyme_list[i] == None:
                 rhyme_list[i] = -1
        
        # With more robust post processing, this is may not needed
               
        # if there is valid rhyme, normalize 
        if reference != None:
            # sort the rhyme and get index of reference number
            cheat_sheet =  sorted(list(set(rhyme_list[:])))
            ref_index = cheat_sheet.index(reference)
            # normalize the rest around this reference
            for i in range(len(rhyme_list)):
                idx = cheat_sheet.index(rhyme_list[i])
                rhyme_list[i] = reference + (idx - ref_index)
        
                
        rhyme_str = ""
        for num in rhyme_list:
           rhyme_str += TextManipulation.rhyme_sec(reference, num)
        
        return TextManipulation.__post_process_rhyme(rhyme_str)

class TextAnalysis:
    """Static class with methods of analysis of strings

    Returns:
        Union[str, bool, dict, numpy.ndarray]: Analyzed input
    """
    
    # Possible Keys if returned type is dict
    # NOTE(review): _first_line_analysis stores the strophe meter under "STROPHE_METER",
    # a key that is not listed here - confirm whether consumers rely on this list
    POET_PARAM_LIST = ["RHYME", "YEAR", "METER", "LENGTH", "END", "TRUE_LENGTH", "TRUE_END"]
    
    @staticmethod
    def _is_meter(meter:str):
        """Check that the string is one of the known one-character meter types

        Args:
            meter (str): string to analyze

        Returns:
            bool: If string is meter type
        """
        # The trailing None entry of the table is not a meter
        known_meters = StropheParams.METER[:-1]
        return meter in known_meters
    
    @staticmethod
    def _is_year(year:str):
        """Return if string is year or special NaN

        Args:
            year (str): string to analyze

        Returns:
            bool: If string is year or special NaN
        """
        return (year.isdecimal() and int(year) > 1_000 and int(year) < 10_000) or year == "NaN"
    
    @staticmethod
    def _rhyme_like(rhyme:str):
        """Return if string is structured like rhyme schema

        Args:
            rhyme (str): string to analyze

        Returns:
            bool: If string is structured like rhyme schema
        """
        return (rhyme.isupper() and len(rhyme) >= 3 and len(rhyme) <= 6)
    
    @staticmethod
    def _rhyme_vector(rhyme:str) -> np.ndarray:
        """Encode the rhyme schema as one-hot vector over StropheParams.RHYME

        Args:
            rhyme (str): string to construct vector from 

        Returns:
            numpy.ndarray: One-hot encoded rhyme schema vector, unknown schemas set the last slot
        """
        schemes = StropheParams.RHYME
        one_hot = np.zeros(len(schemes))
        # Unknown schema falls back to the last position
        slot = schemes.index(rhyme) if rhyme in schemes else -1
        one_hot[slot] = 1
        return one_hot
    
    
    @staticmethod
    def _publish_year_vector(year_string):
        """Construct one-hot vector of the year bucket closest to the year of publishing

        Args:
            year_string (str): String with publish year

        Returns:
            numpy.ndarray: One-hot vector over StropheParams.YEAR; the last slot is set
                when the string is not a plain decimal number
        """
        publish_vector = np.zeros(len(StropheParams.YEAR))
        # isdecimal, not isdigit: isdigit also accepts characters such as superscript
        # digits, which int() refuses with ValueError
        if not year_string.isdecimal():
            publish_vector[-1] = 1
            return publish_vector
        # Only the closest bucket is marked (no distance weighting, no normalization);
        # the trailing None entry is not a real bucket
        buckets = np.asarray(StropheParams.YEAR[:-1])
        publish_vector[np.argmin(np.abs(buckets - int(year_string)))] = 1
        return publish_vector
    
    @staticmethod
    def _rhyme_or_not(rhyme_str:str) -> np.ndarray:
        """Flag whether the rhyme string is one of the entries of StropheParams.RHYME

        Args:
            rhyme_str (str): string to construct vector from 

        Returns:
            numpy.ndarray: Vector of two items, [1, 0] for a listed schema and [0, 1] otherwise
        """
        flags = np.zeros(2)
        flags[0 if rhyme_str in StropheParams.RHYME else 1] = 1
        return flags
        
    @staticmethod
    def _metre_vector(metre: str) -> np.ndarray:
        """Encode the metre as one-hot vector over StropheParams.METER

        Args:
            metre (str): string to construct vector from 

        Returns:
            numpy.ndarray: One-hot encoded metre vector, unknown metres set the last slot
        """
        meters = StropheParams.METER
        one_hot = np.zeros(len(meters))
        # Unknown metre falls back to the last position
        slot = meters.index(metre) if metre in meters else -1
        one_hot[slot] = 1
        return one_hot
    
    @staticmethod
    def _first_line_analysis(text:str):
        """Analysis of parameter line for RHYME, METER, YEAR

        Args:
            text (str): parameter line string

        Returns:
            dict: Dictionary with analysis result
        """
        line_striped = text.strip()
        if not line_striped:
            return {}
        poet_params = {}
        # Look for each possible parameter
        for param in line_striped.split():
            if TextAnalysis._is_year(param):
                # Year is Bucketized so to fit
                poet_params["YEAR"] = TextManipulation._year_bucketor(param) 
            elif TextAnalysis._rhyme_like(param):
                poet_params["RHYME"] = param
            elif TextAnalysis._is_meter(param):
                poet_params["STROPHE_METER"] = param
        return poet_params
    
    @staticmethod
    def _is_line_length(length:str):
        """Return if string is number of syllables parameter

        Args:
            length (str): string to analyze

        Returns:
            bool: If string is number of syllables parameter
        """
        return length.isdigit() and int(length) > 1 and int(length) < 100
    
    @staticmethod
    def _is_line_end(end:str):
        """Return if string is valid ending syllable/sequence parameter

        Args:
            end (str): string to analyze

        Returns:
            bool: If string is valid ending syllable/sequence parameter
        """
        return end.isalpha() and end.islower() and len(end) <= 5
    
    @staticmethod
    def _continuos_line_analysis(text:str):
        """Analysis of Content lines for METER, LENGTH, END, TRUE_LENGTH, TRUE_END

        Requested parameters are read from the '#' separated groups in front of the
        verse, the TRUE_ values are measured on the verse itself.

        Args:
            text (str): content line to analyze

        Returns:
            dict: Dictionary with analysis result, empty when the line holds nothing
                but separators. "TRUE_END" is left out when the line yields no syllable.
        """
        # Strip line of most separators and look if its empty
        line_striped = TextManipulation._remove_most_nonchar(text, lower_case=False).strip()
        if not line_striped:
            return {}
        line_params = {}
        # Everything before the last '#' are parameter groups; a line without '#'
        # has none and the loop does not run
        for param_group in text.split('#')[:-1]:
            for param in param_group.split():
                if TextAnalysis._is_meter(param):
                    line_params["METER"] = param
                elif TextAnalysis._is_line_length(param):
                    line_params["LENGTH"] = int(param)
                elif TextAnalysis._is_line_end(param):
                    line_params["END"] = param

        # Syllables are counted on the verse text only (part after the last '#')
        line_params["TRUE_LENGTH"] = len(SyllableMaker.syllabify(line_striped.split('#')[-1]))
        line_only_char = TextManipulation._remove_all_nonchar(line_striped).strip()
        if len(line_only_char) > 2:
            # The last two words are used so that a one-letter preposition can be
            # merged with the following word by the syllabifier
            end_syllables = SyllableMaker.syllabify(" ".join(line_only_char.split()[-2:]))
            # No syllable is found when the words contain no letter the syllabifier
            # recognizes; indexing the empty list would raise IndexError
            if end_syllables:
                line_params["TRUE_END"] = end_syllables[-1]

        return line_params
    
    @staticmethod
    def _is_param_line(text:str):
        """Return if line is a Parameter line (Parameters RHYME, METER, YEAR)

        Args:
            text (str): line to analyze

        Returns:
            bool: If line is a Parameter line
        """
        line_striped = text.strip()
        if not line_striped:
            return False
        small_analysis = TextAnalysis._first_line_analysis(line_striped)
        return  "RHYME" in small_analysis.keys() or "YEAR" in small_analysis.keys()

class SyllableMaker:
    """Static class with methods for separating string to list of Syllables

    Returns:
        list: List of syllables
    """
    
        
# NON-Original code!
# Taken from Barbora Štěpánková

    @staticmethod
    def syllabify(text : str) -> list[str]:
        words = re.findall(r"[aábcčdďeéěfghiíjklmnňoópqrřsštťuúůvwxyýzžAÁBCČDĎEÉĚFGHIÍJKLMNŇOÓPQRŘSŠTŤUÚŮVWXYÝZŽäöüÄÜÖ]+", text)
        syllables : list[str] = []

        i = 0
        while i < len(words):
            word = words[i]

            if (word.lower() == "k" or word.lower() == "v" or word.lower() == "s" or word.lower() == "z") and i < len(words) - 1 and len(words[i + 1]) > 1:
                i += 1
                word = word + words[i]

            letter_counter = 0

            # Get syllables: mask the word and split the mask
            for syllable_mask in SyllableMaker.__split_mask(SyllableMaker.__create_word_mask(word)):
                word_syllable = ""
                for character in syllable_mask:
                    word_syllable += word[letter_counter]
                    letter_counter += 1

                syllables.append(word_syllable)

            i += 1

        return syllables


    @staticmethod
    def __create_word_mask(word : str) -> str:
        """Translate the word to a mask that has one character per letter

        The replacements below are applied in the listed order and each of them
        keeps the length of the string. They write 'V' for vocals, 'K' for
        consonants and '0' for a letter that is bound to its neighbour (second
        letter of a double letter, first letter of a diphthong, second consonant
        of a cluster).

        Args:
            word (str): word to mask, it is lowercased first

        Returns:
            str: mask of the word
        """
        word = word.lower()

        vocals = r"[aeiyouáéěíýóůúäöü]"
        consonants = r"[bcčdďfghjklmnňpqrřsštťvwxzž]"

        # Order matters: later patterns match on the 'V' and '0' written by earlier ones
        replacements = [
            # double letters, the second letter becomes '0'
            ('ch', 'c0'),
            ('rr', 'r0'),
            ('ll', 'l0'),
            ('nn', 'n0'),
            ('th', 't0'),

            # au, ou, ai, oi
            (r'[ao]u', '0V'),
            (r'[ao]i','0V'),

            # eu at the beginning of the word
            (r'^eu', '0V'),
    
            # now all vocals
            (vocals, 'V'),

            # r,l that act like vocals in syllables
            (r'([^V])([rl])(0*[^0Vrl]|$)', r'\1V\3'),

            # sp, st, sk, št, Cř, Cl, Cr, Cv
            (r's[pt]', 's0'),
            (r'([^V0lr]0*)[řlrv]', r'\g<1>0'),
            (r'([^V0]0*)sk', r'\1s0'),
            (r'([^V0]0*)št', r'\1š0'),

            # every consonant letter that is still left becomes 'K'
            (consonants, 'K')
        ]

        for (original, replacement) in replacements:
            word = re.sub(original, replacement, word)

        return word


    @staticmethod
    def __split_mask(mask : str) -> list[str]:
        """Cut the mask to syllable pieces

        The replacements below insert '/' on syllable boundaries, they are applied
        in the listed order. One trailing '/' is dropped and the mask is split on
        the remaining ones.

        Args:
            mask (str): mask of 'V', 'K' and '0' characters

        Returns:
            list[str]: mask pieces, one per syllable; an empty mask gives ['']
        """
        replacements = [
            # vocal at the beginning
            (r'(^0*V)(K0*V)', r'\1/\2'),
            (r'(^0*V0*K0*)K', r'\1/K'),

            # dividing the middle of the word
            (r'(K0*V(K0*$)?)', r'\1/'),
            (r'/(K0*)K', r'\1/K'),
            (r'/(0*V)(0*K0*V)', r'/\1/\2'),
            (r'/(0*V0*K0*)K', r'/\1/K'),

            # add the last consonant to the previous syllable
            (r'/(K0*)$', r'\1/')
        ]

        for (original, replacement) in replacements:
            mask = re.sub(original, replacement, mask)

        # A boundary at the very end would yield an empty last piece
        if len(mask) > 0 and mask[-1] == "/":
            mask = mask[0:-1]

        return mask.split("/")