conex / espnet2 /fileio /datadir_writer.py
tobiasc's picture
Initial commit
ad16788
raw
history blame
2.36 kB
from pathlib import Path
from typing import Union
import warnings
from typeguard import check_argument_types
from typeguard import check_return_type
class DatadirWriter:
    """Writer class to create kaldi like data directory.

    Writers form a tree rooted at the path given to the constructor.
    ``writer[name]`` returns the child writer for ``path / name`` (created
    on first access), while ``writer[key] = value`` appends the line
    ``"<key> <value>"`` to the UTF-8 text file at ``path``.  A single
    writer is used either as a directory (it has children) or as a file
    (it has been written to); mixing both raises ``RuntimeError``.

    Examples:
        >>> with DatadirWriter("output") as writer:
        ...     # Child writer for output/sub.txt; the file itself is
        ...     # opened lazily on the first assignment below.
        ...     subwriter = writer["sub.txt"]
        ...     # Write "uttidA some/where/a.wav"
        ...     subwriter["uttidA"] = "some/where/a.wav"
        ...     subwriter["uttidB"] = "some/where/b.wav"

    """

    def __init__(self, p: Union[Path, str]):
        """Create a writer for path ``p`` without touching the filesystem."""
        assert check_argument_types()
        self.path = Path(p)
        # NOTE: the attribute name is misspelled ("chilidren"); it is kept
        # as is because it is reachable from outside the class.
        self.chilidren = {}
        # File handle, opened on the first __setitem__ call.
        self.fd = None
        self.has_children = False
        # Keys written so far; used for duplicate and mismatch warnings.
        self.keys = set()

    def __enter__(self):
        """Return the writer itself so it can be used in a ``with`` block."""
        return self

    def __getitem__(self, key: str) -> "DatadirWriter":
        """Return the child writer for ``self.path / key``.

        The child is created on first access and cached, so repeated
        lookups with the same key return the same object.

        Raises:
            RuntimeError: If this writer has already been written to as a
                file.
        """
        assert check_argument_types()
        if self.fd is not None:
            raise RuntimeError("This writer points out a file")

        if key not in self.chilidren:
            w = DatadirWriter((self.path / key))
            self.chilidren[key] = w
            self.has_children = True

        retval = self.chilidren[key]
        assert check_return_type(retval)
        return retval

    def __setitem__(self, key: str, value: str):
        """Append the line ``"<key> <value>"`` to the file at ``self.path``.

        The parent directory is created and the file is opened (truncating
        any existing content) on the first call.  A repeated key triggers a
        warning but the line is still written.

        Raises:
            RuntimeError: If this writer already has child writers.
        """
        assert check_argument_types()
        if self.has_children:
            raise RuntimeError("This writer points out a directory")
        if key in self.keys:
            warnings.warn(f"Duplicated: {key}")

        if self.fd is None:
            self.path.parent.mkdir(parents=True, exist_ok=True)
            self.fd = self.path.open("w", encoding="utf-8")

        self.keys.add(key)
        self.fd.write(f"{key} {value}\n")

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the writer when leaving the ``with`` block."""
        self.close()

    def close(self):
        """Close all files under this writer, then report mismatching ids.

        Every file in the subtree is closed *before* any mismatch warning is
        emitted.  This way a warning that has been escalated to an exception
        (e.g. ``-W error``) cannot leave later sibling files open and
        unflushed.
        """
        self._close_files()
        self._warn_on_mismatched_ids()

    def _close_files(self):
        """Close the file handles of this writer and all of its descendants."""
        if self.has_children:
            for child in self.chilidren.values():
                child._close_files()
        elif self.fd is not None:
            self.fd.close()

    def _warn_on_mismatched_ids(self):
        """Warn for each pair of adjacent children whose key sets differ."""
        prev_child = None
        for child in self.chilidren.values():
            # Report problems inside the child before comparing it with its
            # predecessor.
            child._warn_on_mismatched_ids()
            if prev_child is not None and prev_child.keys != child.keys:
                warnings.warn(
                    f"Ids are mismatching between "
                    f"{prev_child.path} and {child.path}"
                )
            prev_child = child