Spaces:
Running
Running
File size: 5,615 Bytes
85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 0700cb3 85b7206 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import json
from pathlib import Path
import pandas as pd
import RuleBasedModels
from constants import app_logger
from typing_hints import Category
class TextDataset:
    """Sentence dataset for a single language, backed by a pandas DataFrame.

    The wrapped table must provide a 'sentence' column; the category based
    helpers additionally read a 'category' column.
    """

    def __init__(self, table: pd.DataFrame, language: str):
        """Store the sentence table together with its language code.

        Args:
            table (pd.DataFrame): The dataframe holding the sentences.
            language (str): The language code; within this class it is only used for logging.
        """
        self.table_dataframe = table
        self.language = language

    def __getitem__(self, idx) -> list[str]:
        """Return the sentence at position ``idx`` wrapped in a one-element list."""
        return [self.table_dataframe['sentence'].iloc[idx]]

    def __len__(self) -> int:
        """Return the number of rows of the sentence table."""
        return len(self.table_dataframe)

    def get_category_from_df(self, category_value: Category) -> pd.DataFrame:
        """Filter the sentence dataframe by category.

        Args:
            category_value (int): The category value to filter the dataframe.

        Returns:
            pd.DataFrame: The rows whose 'category' column equals ``category_value``.
        """
        selector = self.table_dataframe["category"] == category_value
        return self.table_dataframe[selector]

    def get_random_sample_from_df(self, category_value: Category) -> list[str]:
        """Get a random sentence, optionally restricted to one category.

        Args:
            category_value (int): The category value to filter the dataframe;
                0 disables the filter, so the sentence is drawn from the whole table.

        Returns:
            list: A list with the selected sentence.

        Raises:
            ValueError: If there is no sentence to draw from (empty table or
                no row with the requested category).
        """
        app_logger.info(f"language={self.language}, category_value={category_value}.")
        # Pick the candidate rows first and sample exactly once: drawing from the
        # whole table only to discard the result is wasted work.
        if category_value != 0:
            candidates = self.get_category_from_df(category_value)
        else:
            candidates = self.table_dataframe
        if len(candidates) == 0:
            # pandas would raise a ValueError here as well, just with a message
            # that says nothing about the language or the category.
            raise ValueError(
                f"no sentences available for language={self.language}, category_value={category_value}."
            )
        sentence = candidates.sample(n=1)["sentence"].iloc[0]
        app_logger.info(f"sentence={sentence} ...")
        return [sentence]
# Folder with the per-language csv databases, located next to this module.
sample_folder = Path(__file__).parent / "databases"
available_languages = ['de', 'en']
# Module-level flag; it is not read anywhere in the visible part of this module.
lambda_translate_new_sample = False
# Per-language lookup tables, filled once when the module is imported.
lambda_database = {}
lambda_ipa_converter = {}
for lang in available_languages:
    # "|" is the column separator because ";" and "," occur inside the sentences themselves
    df = pd.read_csv(sample_folder / f'data_{lang}.csv', sep='|')
    lambda_database[lang] = TextDataset(df, lang)
    lambda_ipa_converter[lang] = RuleBasedModels.get_phonem_converter(lang)
def lambda_handler(event: dict[str], context) -> str:
    """
    Lambda handler returning a transcript together with its IPA transcription.

    The JSON document in event['body'] must contain 'language'. It may contain
    'category' (defaults to 0 when missing) and 'transcript'; when 'transcript'
    is missing a random sentence is selected for the given language and category.

    Args:
        event (dict): The event data passed to the Lambda function; only 'body' is read.
        context (dict): The context in which the Lambda function is called (unused here).

    Returns:
        str: The JSON-encoded result with the keys 'real_transcript' (a one-element
            list), 'ipa_transcript' and 'transcript_translation' (always an empty string).

    Raises:
        Exception: Any error raised while handling the request is logged and re-raised.
    """
    try:
        payload = json.loads(event['body'])

        try:
            raw_category = payload['category']
        except KeyError:
            category = 0
        else:
            category = int(raw_category)

        language = payload['language']

        try:
            raw_transcript = payload["transcript"]
        except KeyError:
            # no transcript provided by the caller: pick a random one
            current_transcript = get_random_selection(language, category)
        else:
            current_transcript = str(raw_transcript)

        converter = lambda_ipa_converter[language]
        current_ipa = converter.convertToPhonem(current_transcript)
        app_logger.info(f"real_transcript='{current_transcript}', ipa_transcript='{current_ipa}'.")

        return json.dumps({
            'real_transcript': [current_transcript],
            'ipa_transcript': current_ipa,
            'transcript_translation': ""
        })
    except Exception as ex:
        app_logger.error(f"ex: {ex} ...")
        raise ex
def get_random_selection(language: str, category_value: Category) -> str:
    """
    Pick a random sentence from the dataset loaded for the given language.

    Args:
        language (str): The language code, used as key of the module-level database.
        category_value (int): The category value forwarded to the dataset sampler.

    Returns:
        str: The selected text sample.

    Raises:
        KeyError: If no dataset is registered for ``language``.
    """
    # the dataset hands back a one-element list; log it as is, return its only item
    sampled = lambda_database[language].get_random_sample_from_df(category_value)
    app_logger.info(f"category_value={category_value}, language={language}, current_transcript={sampled}.")
    return sampled[0]
def getSentenceCategory(sentence) -> int | None:
number_of_words = len(sentence.split())
categories_word_limits = [0, 8, 20, 100000]
for category in range(len(categories_word_limits)-1):
if categories_word_limits[category] < number_of_words <= categories_word_limits[category + 1]:
return category+1
raise ValueError(f"category not assigned for sentence '{sentence}' ...")
def get_enriched_dataframe_csv(
        language: str,
        custom_dataframe_csv_filename_no_ext: str = "data",
        custom_folder: Path = sample_folder
) -> None:
    """
    Read a csv dataframe, add a 'category' column and write it back to the same file.

    The file '<custom_dataframe_csv_filename_no_ext>_<language>.csv' inside
    ``custom_folder`` is overwritten in place; "|" is the column separator.

    Args:
        language (str): The language code (e.g. "de" for German).
        custom_dataframe_csv_filename_no_ext (str): The csv dataframe without extension.
        custom_folder (Path): The folder containing the csv dataframe.

    Returns:
        None

    Raises:
        ValueError: If a sentence cannot be assigned to a category.
    """
    custom_folder = Path(custom_folder).absolute()
    # input and output are the same file, so build the path only once
    csv_path = custom_folder / f'{custom_dataframe_csv_filename_no_ext}_{language}.csv'
    # Read and write with the same explicit encoding: a handle opened without
    # 'encoding' uses the platform locale, while to_csv writes utf-8, which can
    # corrupt non-ASCII sentences on systems whose locale is not utf-8.
    df2 = pd.read_csv(csv_path, sep="|", encoding="utf-8")
    df2["category"] = df2["sentence"].apply(getSentenceCategory)
    app_logger.info(f"{language}_category added")
    df2.to_csv(csv_path, index=False, sep="|", encoding="utf-8")
    app_logger.info(f"written {csv_path} ...")
if __name__ == '__main__':
    # Script mode: add the 'category' column to the csv database of each language.
    for language_code in ("de", "en"):
        get_enriched_dataframe_csv(language_code)
|