Spaces:
Running
Running
File size: 5,615 Bytes
85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 |
|
import json
from pathlib import Path
import pandas as pd
import RuleBasedModels
from constants import app_logger
from typing_hints import Category
class TextDataset:
    """Sentences dataset backed by a pandas dataframe.

    The dataframe must expose a 'sentence' column; the category-based
    helpers additionally read a 'category' column.
    """

    def __init__(self, table: pd.DataFrame, language: str):
        """Store the sentence table and the language code it refers to.

        Args:
            table (pd.DataFrame): The dataframe holding the sentences.
            language (str): The language code of the sentences.
        """
        self.table_dataframe = table
        self.language = language

    def __getitem__(self, idx) -> list[str]:
        """Return the sentence at position ``idx`` wrapped in a list."""
        line = [self.table_dataframe['sentence'].iloc[idx]]
        return line

    def __len__(self) -> int:
        """Return the number of rows in the sentence table."""
        return len(self.table_dataframe)

    def get_category_from_df(self, category_value: Category) -> pd.DataFrame:
        """Filter the sentence dataframe by category, returning the matching rows.

        Args:
            category_value (int): The category value to filter the dataframe.

        Returns:
            pd.DataFrame: The filtered dataframe.
        """
        selector = self.table_dataframe["category"] == category_value
        df_by_category = self.table_dataframe[selector]
        return df_by_category

    def get_random_sample_from_df(self, category_value: Category) -> list[str]:
        """Get a random sentence, optionally restricted to one category.

        Args:
            category_value (int): The category value to filter the dataframe;
                0 disables the filter and samples from the whole table.

        Returns:
            list: A list with the selected sentence.

        Raises:
            ValueError: If there is no sentence to pick from.
        """
        app_logger.info(f"language={self.language}, category_value={category_value}.")
        # Choose the pool first so only one random draw is made
        if category_value != 0:
            candidates = self.get_category_from_df(category_value)
        else:
            candidates = self.table_dataframe
        if candidates.empty:
            # pandas would raise a ValueError here anyway, but with an opaque message
            raise ValueError(
                f"no sentences available for language='{self.language}', category_value={category_value}"
            )
        choice = candidates.sample(n=1)
        sentence = choice["sentence"].iloc[0]
        app_logger.info(f"sentence={sentence} ...")
        return [sentence]
# Folder with the per-language sentence tables (data_<lang>.csv)
sample_folder = Path(__file__).parent / "databases"
available_languages = ['de', 'en']
# Per-language lookups, both keyed by language code
lambda_database = {}
lambda_ipa_converter = {}
lambda_translate_new_sample = False
for lang in available_languages:
    # "|" is the separator because ";" and "," occur within the dataframe sentences
    df = pd.read_csv(sample_folder / f'data_{lang}.csv', sep='|')
    lambda_database[lang] = TextDataset(df, lang)
    lambda_ipa_converter[lang] = RuleBasedModels.get_phonem_converter(lang)
def lambda_handler(event: dict, context) -> str:
    """
    Lambda handler returning a text sample together with its IPA transcription.

    The JSON request body must contain 'language'. 'category' is optional
    (0 when missing). When 'transcript' is missing, a random sentence is
    selected for the given language and category.

    Args:
        event (dict): The event data passed to the Lambda function; its 'body' entry holds the JSON-encoded request.
        context (dict): The context in which the Lambda function is called (not used here).

    Returns:
        str: The JSON-encoded result with the keys 'real_transcript', 'ipa_transcript' and 'transcript_translation'.

    Raises:
        Exception: Any error is logged and then re-raised unchanged.
    """
    try:
        body = json.loads(event['body'])
        try:
            category = int(body['category'])
        except KeyError:
            # no category in the request: 0 means "no category filter"
            category = 0
        language = body['language']
        try:
            current_transcript = str(body["transcript"])
        except KeyError:
            # no transcript in the request: pick a random sentence from the dataset
            current_transcript = get_random_selection(language, category)
        current_ipa = lambda_ipa_converter[language].convertToPhonem(current_transcript)
        app_logger.info(f"real_transcript='{current_transcript}', ipa_transcript='{current_ipa}'.")
        result = {
            'real_transcript': [current_transcript],
            'ipa_transcript': current_ipa,
            'transcript_translation': ""
        }
        return json.dumps(result)
    except Exception as ex:
        app_logger.error(f"ex: {ex} ...")
        # bare raise re-raises the active exception as-is
        raise
def get_random_selection(language: str, category_value: Category) -> str:
    """
    Pick a random sentence for the given language and category.

    Args:
        language (str): The language code used to look up the dataset.
        category_value (int): The category value passed to the dataset sampler.

    Returns:
        str: The selected text sample.
    """
    dataset = lambda_database[language]
    # the dataset returns the sentence wrapped in a one-element list
    picked = dataset.get_random_sample_from_df(category_value)
    app_logger.info(f"category_value={category_value}, language={language}, current_transcript={picked}.")
    return picked[0]
def getSentenceCategory(sentence) -> int:
    """
    Assign a length-based category to a sentence.

    The whitespace-separated words are counted and mapped to a category:
    1 for 1-8 words, 2 for 9-20 words, 3 for 21-100000 words.

    Args:
        sentence (str): The sentence to categorize.

    Returns:
        int: The category (1, 2 or 3).

    Raises:
        ValueError: If the sentence has no words or more than 100000 words.
    """
    number_of_words = len(sentence.split())
    categories_word_limits = (0, 8, 20, 100000)
    # consecutive limits form the (exclusive lower, inclusive upper) bounds of each category
    bounds = zip(categories_word_limits, categories_word_limits[1:])
    for category, (lower, upper) in enumerate(bounds, start=1):
        if lower < number_of_words <= upper:
            return category
    raise ValueError(f"category not assigned for sentence '{sentence}' ...")
def get_enriched_dataframe_csv(
        language: str,
        custom_dataframe_csv_filename_no_ext: str = "data",
        custom_folder: Path = sample_folder
) -> None:
    """
    Read a csv dataframe, add a 'category' column and write it back to the same file.

    The file '<custom_dataframe_csv_filename_no_ext>_<language>.csv' inside
    'custom_folder' is overwritten with the enriched table.

    Args:
        language (str): The language code (e.g. "de" for German).
        custom_dataframe_csv_filename_no_ext (str): The csv dataframe without extension.
        custom_folder (Path): The folder containing the csv dataframe.

    Returns:
        None
    """
    custom_folder = Path(custom_folder).absolute()
    # input and output are the same file: the table is enriched in place
    df_path = custom_folder / f'{custom_dataframe_csv_filename_no_ext}_{language}.csv'
    # Read and write with an explicit UTF-8 encoding so non-ASCII sentences
    # round-trip regardless of the platform's default locale encoding
    df2 = pd.read_csv(df_path, sep="|", encoding="utf-8")
    df2["category"] = df2["sentence"].apply(getSentenceCategory)
    app_logger.info(f"{language}_category added")
    df2.to_csv(df_path, index=False, sep="|", encoding="utf-8")
    app_logger.info(f"written {df_path} ...")
if __name__ == '__main__':
    # Script entry point: add the 'category' column to each language's csv table
    for language_code in ("de", "en"):
        get_enriched_dataframe_csv(language_code)
|