Spaces:
Running
Running
import json | |
from pathlib import Path | |
import pandas as pd | |
import RuleBasedModels | |
from constants import app_logger | |
from typing_hints import Category | |
class TextDataset:
    """Sentences dataset backed by a pandas dataframe.

    The methods read a ``sentence`` column from the wrapped dataframe; the
    category helpers additionally read a ``category`` column.
    """

    def __init__(self, table: pd.DataFrame, language: str):
        """Store the sentence table together with its language code.

        Args:
            table (pd.DataFrame): The dataframe holding the sentences.
            language (str): The language code of the sentences (only used for logging
                and error messages within this class).
        """
        self.table_dataframe = table
        self.language = language

    def __getitem__(self, idx) -> list[str]:
        """Return the sentence at positional index ``idx`` wrapped in a list."""
        return [self.table_dataframe['sentence'].iloc[idx]]

    def __len__(self) -> int:
        """Return the number of rows in the wrapped dataframe."""
        return len(self.table_dataframe)

    def get_category_from_df(self, category_value: Category) -> pd.DataFrame:
        """Filter the sentence dataframe by category, returning the matching rows.

        Args:
            category_value (int): The category value to filter the dataframe.

        Returns:
            pd.DataFrame: The filtered dataframe (empty when no row matches).
        """
        selector = self.table_dataframe["category"] == category_value
        return self.table_dataframe[selector]

    def get_random_sample_from_df(self, category_value: Category) -> list[str]:
        """Get a random sentence, optionally restricted to one category.

        Args:
            category_value (int): The category value to filter the dataframe;
                ``0`` disables the filter and samples from the whole table.

        Returns:
            list: A list with the selected sentence.

        Raises:
            ValueError: If there is no sentence to choose from.
        """
        app_logger.info(f"language={self.language}, category_value={category_value}.")
        # Pick the candidate rows first so that exactly one random draw is made
        # (drawing from the full table and discarding the result is wasted work).
        if category_value != 0:
            candidates = self.get_category_from_df(category_value)
        else:
            candidates = self.table_dataframe
        if len(candidates) == 0:
            # Sampling from an empty frame also raises ValueError, but with a
            # message that does not say which language/category was empty.
            raise ValueError(
                f"no sentences available for language='{self.language}', category_value={category_value}"
            )
        choice = candidates.sample(n=1)
        sentence = choice["sentence"].iloc[0]
        app_logger.info(f"sentence={sentence} ...")
        return [sentence]
# Folder (next to this module) holding one 'data_<language>.csv' file per language.
sample_folder = Path(__file__).parent.joinpath("databases")
available_languages = ['de', 'en']
lambda_translate_new_sample = False

# Per-language lookup tables, filled once at import time.
lambda_database = dict()
lambda_ipa_converter = dict()
for lang in available_languages:
    # avoid using ";" or "," as separator because these are present within the dataframe sentences
    df = pd.read_csv(sample_folder / f'data_{lang}.csv', sep='|')
    lambda_database[lang] = TextDataset(df, lang)
    lambda_ipa_converter[lang] = RuleBasedModels.get_phonem_converter(lang)
def lambda_handler(event: dict, context) -> str:
    """
    Lambda handler returning a text sample together with its IPA transcription.

    The JSON-encoded ``event['body']`` must contain a ``language`` key. It may
    contain ``category`` (defaults to 0) and ``transcript``; when ``transcript``
    is missing, a random sentence is selected for the given language and category.

    Args:
        event (dict): The event data passed to the Lambda function; only ``event['body']`` is read.
        context (dict): The context in which the Lambda function is called (not used here).

    Returns:
        str: The JSON-encoded result with the keys ``real_transcript``,
            ``ipa_transcript`` and ``transcript_translation``.

    Raises:
        Exception: Any error is logged and then re-raised unchanged.
    """
    try:
        body = json.loads(event['body'])

        # Membership tests instead of try/except KeyError: an error raised by the
        # fallbacks is then not reported as "during handling of KeyError".
        category = int(body['category']) if 'category' in body else 0
        language = body['language']
        if 'transcript' in body:
            current_transcript = str(body["transcript"])
        else:
            current_transcript = get_random_selection(language, category)
        current_ipa = lambda_ipa_converter[language].convertToPhonem(current_transcript)

        app_logger.info(f"real_transcript='{current_transcript}', ipa_transcript='{current_ipa}'.")
        result = {
            'real_transcript': [current_transcript],
            'ipa_transcript': current_ipa,
            'transcript_translation': ""
        }

        return json.dumps(result)
    except Exception as ex:
        app_logger.error(f"ex: {ex} ...")
        # Bare raise re-raises the active exception without adding a traceback entry.
        raise
def get_random_selection(language: str, category_value: Category) -> str:
    """
    Pick one random sentence from the dataset loaded for the given language.

    Args:
        language (str): The language code used as key of the loaded datasets.
        category_value (int): The category value forwarded to the dataset sampler.

    Returns:
        str: The selected text sample.
    """
    # The dataset sampler returns a list holding the chosen sentence.
    current_transcript = lambda_database[language].get_random_sample_from_df(category_value)
    app_logger.info(f"category_value={category_value}, language={language}, current_transcript={current_transcript}.")
    first_sentence = current_transcript[0]
    return first_sentence
def getSentenceCategory(sentence) -> int:
    """
    Assign a length category to a sentence from its whitespace-separated word count.

    The categories are 1 (1 to 8 words), 2 (9 to 20 words) and 3 (21 to 100000 words).

    Args:
        sentence (str): The sentence to classify.

    Returns:
        int: The category, between 1 and 3.

    Raises:
        ValueError: If the sentence has no words or more than 100000 words.
    """
    number_of_words = len(sentence.split())
    categories_word_limits = [0, 8, 20, 100000]
    # Pair every limit with its successor: (0, 8], (8, 20], (20, 100000].
    bounds = zip(categories_word_limits, categories_word_limits[1:])
    for category, (lower_limit, upper_limit) in enumerate(bounds, start=1):
        if lower_limit < number_of_words <= upper_limit:
            return category
    raise ValueError(f"category not assigned for sentence '{sentence}' ...")
def get_enriched_dataframe_csv(
        language: str,
        custom_dataframe_csv_filename_no_ext: str = "data",
        custom_folder: Path = sample_folder
) -> None:
    """
    Read a csv dataframe, add a 'category' column and write it back to the same file.

    The file '<custom_dataframe_csv_filename_no_ext>_<language>.csv' within
    ``custom_folder`` is overwritten in place, using "|" as separator.

    Args:
        language (str): The language code (e.g. "de" for German).
        custom_dataframe_csv_filename_no_ext (str): The csv dataframe without extension.
        custom_folder (Path): The folder containing the csv dataframe.

    Returns:
        None
    """
    custom_folder = Path(custom_folder).absolute()
    csv_path = custom_folder / f'{custom_dataframe_csv_filename_no_ext}_{language}.csv'
    # Decode explicitly as UTF-8 (the encoding to_csv writes by default) instead of
    # relying on the platform locale, which can garble non-ASCII sentences.
    with open(csv_path, 'r', encoding="utf-8") as handle:
        df2 = pd.read_csv(handle, sep="|")
    # The input handle is closed at this point, so the same path can be rewritten safely.
    df2["category"] = df2["sentence"].apply(getSentenceCategory)
    app_logger.info(f"{language}_category added")
    df2.to_csv(csv_path, index=False, sep="|")
    app_logger.info(f"written {csv_path} ...")
if __name__ == '__main__':
    # Add the 'category' column to the csv file of each language, German first.
    for language_code in ("de", "en"):
        get_enriched_dataframe_csv(language_code)