|
"""Tabular parser. |
|
|
|
Contains parsers for tabular data files. |
|
|
|
""" |
|
from pathlib import Path |
|
from typing import Any, Dict, List, Union |
|
|
|
from application.parser.file.base_parser import BaseParser |
|
|
|
|
|
class CSVParser(BaseParser):
    """CSV parser built on the standard-library ``csv`` module.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, ``parse_file`` returns a list with one string
            per row instead of a single joined string.
            True by default.

    """

    def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows

    def _init_parser(self) -> Dict:
        """Init parser.

        Returns:
            Dict: always an empty dict.

        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse file.

        Args:
            file (Path): path of the CSV file to read.
            errors (str): decoding error handler forwarded to ``open``
                (for example ``"ignore"``, ``"replace"`` or ``"strict"``).

        Returns:
            Union[str, List[str]]: the rows joined with ``"\\n"`` when
            ``concat_rows`` is True, otherwise a list with one string per
            row. Cells of a row are joined with ``", "``.

        """
        # csv ships with the standard library, so no ImportError guard is needed.
        import csv

        # newline="" lets the csv module handle line endings itself, so that
        # quoted fields containing line breaks are parsed correctly.
        with open(file, "r", newline="", errors=errors) as fp:
            text_list = [", ".join(row) for row in csv.reader(fp)]

        if self._concat_rows:
            return "\n".join(text_list)
        return text_list
|
|
|
|
|
class PandasCSVParser(BaseParser):
    r"""Pandas-based CSV parser.

    Parses CSVs using the separator detection from the Pandas `read_csv` function.
    If special parameters are required, use the `pandas_config` dict.

    Args:
        concat_rows (bool): whether to concatenate all rows into one document.
            If set to False, ``parse_file`` returns a list with one string
            per row instead of a single joined string.
            True by default.

        col_joiner (str): Separator to use for joining cols per row.
            Set to ", " by default.

        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.

        pandas_config (dict): Options for the `pandas.read_csv` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
            for more information.
            Defaults to None, which is treated as an empty dict; this means
            pandas will try to figure out the separators, table head, etc.
            on its own.

    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: Union[dict, None] = None,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        # A None sentinel replaces the former ``{}`` default so that instances
        # created without an explicit config never share one dict object.
        self._pandas_config = {} if pandas_config is None else pandas_config

    def _init_parser(self) -> Dict:
        """Init parser.

        Returns:
            Dict: always an empty dict.

        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse file.

        Args:
            file (Path): path of the CSV file to read.
            errors (str): accepted but not used by this parser; reading
                options are controlled through ``pandas_config`` instead.

        Returns:
            Union[str, List[str]]: the rows joined with ``row_joiner`` when
            ``concat_rows`` is True, otherwise a list with one string per
            row. Cells of a row are converted to ``str`` and joined with
            ``col_joiner``.

        Raises:
            ValueError: if pandas is not installed.

        """
        try:
            import pandas as pd
        except ImportError as err:
            raise ValueError("pandas module is required to read CSV files.") from err

        df = pd.read_csv(file, **self._pandas_config)

        # Stringify every cell of a row and join the cells with the column separator.
        text_list = df.apply(
            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
        ).tolist()

        if self._concat_rows:
            return (self._row_joiner).join(text_list)
        return text_list
|
|