import json
from pathlib import Path

import pandas as pd

import RuleBasedModels
from constants import app_logger
from typing_hints import Category


class TextDataset:
    """Sentences dataset backed by a pandas DataFrame.

    The methods below read a 'sentence' column and, when filtering by
    category, a 'category' column from the wrapped table.
    """

    def __init__(self, table: pd.DataFrame, language: str):
        """Store the sentences table and the language code it belongs to.

        Args:
            table (pd.DataFrame): The dataframe holding the sentences.
            language (str): The language code of the sentences.
        """
        self.table_dataframe = table
        self.language = language

    def __getitem__(self, idx) -> list[str]:
        """Return the 'sentence' value at position ``idx`` wrapped in a list."""
        return [self.table_dataframe['sentence'].iloc[idx]]

    def __len__(self) -> int:
        """Return the number of rows in the wrapped dataframe."""
        return len(self.table_dataframe)

    def get_category_from_df(self, category_value: Category) -> pd.DataFrame:
        """Filter the sentence dataframe by category.

        Args:
            category_value (Category): The category value to keep.

        Returns:
            pd.DataFrame: The rows whose 'category' column equals ``category_value``.
        """
        selector = self.table_dataframe["category"] == category_value
        return self.table_dataframe[selector]

    def get_random_sample_from_df(self, category_value: Category) -> list[str]:
        """Get a random sentence, optionally restricted to one category.

        Args:
            category_value (Category): The category to sample from; 0 disables
                the filter and samples from the whole table.

        Returns:
            list: A one-item list with the selected sentence.

        Raises:
            ValueError: If there is no sentence to sample from.
        """
        app_logger.info(f"language={self.language}, category_value={category_value}.")
        # Draw only once, from the frame that is actually used: sampling the
        # whole table first and discarding the result is wasted work.
        if category_value == 0:
            candidates = self.table_dataframe
        else:
            candidates = self.get_category_from_df(category_value)
        if len(candidates) == 0:
            # pandas would also raise ValueError here, but with a message
            # that says nothing about the language/category involved.
            raise ValueError(
                f"no sentences available for language '{self.language}' "
                f"and category '{category_value}'"
            )
        choice = candidates.sample(n=1)
        sentence = choice["sentence"].iloc[0]
        app_logger.info(f"sentence={sentence} ...")
        return [sentence]


sample_folder = Path(__file__).parent / "databases"
lambda_database = {}
lambda_ipa_converter = {}
available_languages = ['de', 'en']

for lang in available_languages:
    # avoid using ";" or "," as separator because these are present within the dataframe sentences
    df = pd.read_csv(sample_folder / f'data_{lang}.csv', delimiter='|')
    lambda_database[lang] = TextDataset(df, lang)
    lambda_ipa_converter[lang] = RuleBasedModels.get_phonem_converter(lang)

lambda_translate_new_sample = False


def lambda_handler(event: dict, context) -> str:
    """Lambda handler returning a text sample together with its IPA transcript.

    The JSON body found in ``event['body']`` must contain 'language'; it may
    contain 'category' (defaults to 0 when missing) and 'transcript' (when
    missing, a random sentence is picked from the dataset).

    Args:
        event (dict): The event data passed to the Lambda function.
        context: The context in which the Lambda function is called (unused).

    Returns:
        str: The JSON-encoded result with the keys 'real_transcript',
            'ipa_transcript' and 'transcript_translation'.

    Raises:
        Exception: Any error raised while handling the request is logged
            and re-raised unchanged.
    """
    try:
        body = json.loads(event['body'])
        try:
            category = int(body['category'])
        except KeyError:
            # no category requested: sample from the whole dataset
            category = 0
        language = body['language']
        try:
            current_transcript = str(body["transcript"])
        except KeyError:
            # no transcript provided by the caller: pick a random one
            current_transcript = get_random_selection(language, category)
        current_ipa = lambda_ipa_converter[language].convertToPhonem(current_transcript)

        app_logger.info(f"real_transcript='{current_transcript}', ipa_transcript='{current_ipa}'.")
        result = {
            'real_transcript': [current_transcript],
            'ipa_transcript': current_ipa,
            'transcript_translation': ""
        }
        return json.dumps(result)
    except Exception as ex:
        app_logger.error(f"ex: {ex} ...")
        # bare raise re-raises the active exception without touching its traceback
        raise


def get_random_selection(language: str, category_value: Category) -> str:
    """Get a random text sample from the dataset of the given language.

    Args:
        language (str): The language code; must be a key of ``lambda_database``.
        category_value (Category): The category value to filter the dataset
            (0 means no filter).

    Returns:
        str: The selected text sample.
    """
    lambda_df_lang = lambda_database[language]
    current_transcript = lambda_df_lang.get_random_sample_from_df(category_value)
    app_logger.info(f"category_value={category_value}, language={language}, current_transcript={current_transcript}.")
    return current_transcript[0]


def getSentenceCategory(sentence) -> int:
    """Assign a category to a sentence based on its number of words.

    Words are counted by splitting on whitespace; the categories are
    1 (1 to 8 words), 2 (9 to 20 words) and 3 (21 to 100000 words).

    Args:
        sentence (str): The sentence to classify.

    Returns:
        int: The category of the sentence (1, 2 or 3).

    Raises:
        ValueError: If the word count fits no category (no words at all,
            or more than 100000 words).
    """
    number_of_words = len(sentence.split())
    categories_word_limits = [0, 8, 20, 100000]
    # consecutive limits form the (exclusive lower, inclusive upper) bounds of each category
    bounds = zip(categories_word_limits, categories_word_limits[1:])
    for category, (lower, upper) in enumerate(bounds, start=1):
        if lower < number_of_words <= upper:
            return category
    raise ValueError(f"category not assigned for sentence '{sentence}' ...")


def get_enriched_dataframe_csv(
        language: str,
        custom_dataframe_csv_filename_no_ext: str = "data",
        custom_folder: Path = sample_folder
) -> None:
    """Read a csv dataframe, add a 'category' column and write it back in place.

    The file ``<custom_dataframe_csv_filename_no_ext>_<language>.csv`` inside
    ``custom_folder`` is overwritten with the enriched content.

    Args:
        language (str): The language code (e.g. "de" for German).
        custom_dataframe_csv_filename_no_ext (str): The csv dataframe without extension.
        custom_folder (Path): The folder containing the csv dataframe.

    Returns:
        None
    """
    custom_folder = Path(custom_folder).absolute()
    csv_path = custom_folder / f'{custom_dataframe_csv_filename_no_ext}_{language}.csv'
    # Read explicitly as UTF-8: to_csv() below writes UTF-8, so relying on the
    # platform default encoding here could corrupt non-ASCII sentences.
    with open(csv_path, 'r', encoding='utf-8') as handle:
        df2 = pd.read_csv(handle, sep="|")
    df2["category"] = df2["sentence"].apply(getSentenceCategory)
    app_logger.info(f"{language}_category added")
    df2.to_csv(csv_path, index=False, sep="|")
    app_logger.info(f"written {csv_path} ...")


if __name__ == '__main__':
    get_enriched_dataframe_csv("de")
    get_enriched_dataframe_csv("en")