Source code for algebraixlib.io.csv

r"""Import :term:`regular` :term:`clan`\s from and export them to CSV data."""

# $Id: csv.py 22735 2015-08-04 22:59:12Z gfiedler $
# Copyright Algebraix Data Corporation 2015 - $Date: 2015-08-04 17:59:12 -0500 (Tue, 04 Aug 2015) $
#
# This file is part of algebraixlib <http://github.com/AlgebraixData/algebraixlib>.
#
# algebraixlib is free software: you can redistribute it and/or modify it under the terms of version
# 3 of the GNU Lesser General Public License as published by the Free Software Foundation.
#
# algebraixlib is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License along with algebraixlib.
# If not, see <http://www.gnu.org/licenses/>.
# --------------------------------------------------------------------------------------------------
import collections as _collections
import csv as _csv

import algebraixlib.algebras.clans as _clans
import algebraixlib.algebras.multiclans as _multiclans
import algebraixlib.algebras.relations as _relations
# noinspection PyProtectedMember
import algebraixlib.io._util as _util
import algebraixlib.mathobjects as _mo
import algebraixlib.undef as _ud
import algebraixlib.util.miscellaneous as _misc


[docs]def export_csv(absolute_clan: 'PP(A x A)', file_or_path, ordered_lefts=None, sort_key=None):
    r"""Export a regular, absolute clan as CSV file with header row.

    The :term:`left component`\s of the :term:`clan` are interpreted as column names and are
    exported as header row. Every :term:`relation` in the clan becomes a data row in the CSV file.

    :param absolute_clan: A :term:`regular`, :term:`absolute clan`.
    :param file_or_path: Either a file path (in this case the CSV data is written to a file at this
        location) or a file object (in this case the CSV data is written to its ``.write()``
        function).
    :param ordered_lefts: (Optional) A ``Sequence`` of :term:`left`\s that are exported in the
        given order. Default is the sequence that is the lexically sorted :term:`left set` of the
        clan.
    :param sort_key: (Optional) A function that compares two row-:term:`relation`\s and provides an
        order (for use with :func:`sorted`). The output is not sorted if ``sort_key`` is missing.
    :return: ``True`` if the CSV export succeeded, `Undef()` if not.
    """
    if not _clans.is_absolute_member(absolute_clan) \
            and not _multiclans.is_absolute_member(absolute_clan):
        return _ud.make_or_raise_undef()
    if ordered_lefts is None and not _clans.is_regular(absolute_clan):
        return _ud.make_or_raise_undef()

    if ordered_lefts is None:
        # Since this clan is regular, get first relation to acquire left set.
        rel = next(iter(absolute_clan))
        # left_set is sorted to guarantee consistent iterations
        ordered_lefts = sorted([left.value for left in rel.get_left_set()])

    # Generate dictionaries that associates left components with their right components for each
    # relation.
    clan_as_list_of_dicts = _convert_clan_to_list_of_dicts(
        ordered_lefts, (absolute_clan if sort_key is None else sorted(absolute_clan, key=sort_key)))
    # Write the dictionaries.
    _csv_dict_writer(file_or_path, ordered_lefts, clan_as_list_of_dicts)
    return True


def _csv_dict_writer(file_or_path, ordered_columns: _collections.Sequence,
                     data: _collections.Sequence):
    """Write a CSV file using `csv.DictWriter`.

    :param file_or_path: Either a file path (in this case the CSV data is written to a file at this
        location) or a file object (in this case the CSV data is written to its ``.write()``
        function).
    :param ordered_columns: A `Sequence` of column names (atoms). The columns are
        written in the given order.
    :param data: A `Sequence` of rows, where each row is a dictionary, mapping a column name to its
        value in the given row. Both the column name and the value are atoms.
    """
    def write_data(out_file):
        writer = _csv.DictWriter(
            f=out_file, fieldnames=ordered_columns, dialect='excel')
        writer.writeheader()
        for row in data:
            writer.writerow(row)

    _misc.write_to_file_or_path(file_or_path, write_data)


def _convert_clan_to_list_of_dicts(ordered_lefts: 'P( A )', absolute_clan: 'PP(A x A)') -> list:
    """Convert a regular, absolute clan into a list of dictionaries.

    :param ordered_lefts: The left components of ``absolute_clan`` that are converted.
    :param absolute_clan: A regular, absolute clan that is converted into a list of dictionaries.
    :return: A list of dictionaries. Every dictionary represents a single relation in
        ``absolute_clan``. The lefts of the relation become the keys, the rights become the values
        of the dictionary.
    """
    for rel in absolute_clan:
        left_to_right_dict = {}
        for left in ordered_lefts:
            # Get the right component associated with left and add it to our row.
            right = _relations.get_right(rel, left)
            if right is not _ud.Undef():
                left_to_right_dict[left] = right.value
        # Add right component dictionary to result
        yield left_to_right_dict


[docs]def import_csv(csv_file_or_filepath, types=None, skip_rows=0, index_column=None,
               has_dup_rows=False) -> 'PP( A x M )':
    r"""Import the file ``csv_file_or_filepath`` as CSV data and return a clan.

    :param csv_file_or_filepath: The file path or file object (for example ``StringIO`` buffer) to
        import.
    :param types: (Optional) A dictionary of type conversions. The keys are the column names; the
        values are functors (or types) that receive the string from the CSV cell and return the
        value to be imported. Example: ``{'foo': int, 'bar': float}``. By default all values are
        interpreted as `string`\s.
    :param skip_rows: (Optional) A number of lines to skip (default 0). Some CSV files have a
        preamble that can be skipped with this option.
    :param index_column: (Optional) A name for an index column. (No index column is created if this
        argument is not specified.) The index starts with 0. (This option is not compatible with the
        ``has_dup_rows`` option.)
    :param has_dup_rows: (Optional) If ``True``, allow duplicate rows and return a multiclan
        instead of a clan. By default, the value is ``False`` and a clan is returned. (This option
        is not compatible with the option ``index_column``.)
    :return: A :term:`clan` (if ``has_dup_rows is ``False`` or not provided) or a :term:`multiclan`
        (if ``has_dup_rows`` is ``True``).
    """
    if types is None:
        types = {}

    def _filter_row(row):
        """Remove missing and blank elements from the CSV row."""
        for key, val in row.items():
            if val is None or val == '':
                continue
            yield key, val

    _util.get_left_cached.left_cache = {}
    import_csv.regular = True  # Set to false if any row is missing one or more values

    assert ((index_column is not None) & (has_dup_rows is False)) or (index_column is None)

    def _import_csv(csv_file):
        for _ in range(0, skip_rows):
            next(csv_file)
        reader = _csv.DictReader(csv_file)

        _index = 0
        for row in reader:
            filtered_row = {key: val for key, val in _filter_row(row)}
            if import_csv.regular and len(row) != len(filtered_row):
                import_csv.regular = False
            for key, val in types.items():
                if key in filtered_row:
                    filtered_row[key] = val(filtered_row[key])
            if index_column is not None:
                filtered_row[index_column] = _index
                _index += 1
            yield _mo.Set(
                (_mo.Couplet(left=_util.get_left_cached(left), right=_mo.Atom(right),
                direct_load=True) for left, right in filtered_row.items()), direct_load=True)\
                .cache_relation(_mo.CacheStatus.IS).cache_functional(_mo.CacheStatus.IS)

    if hasattr(csv_file_or_filepath, "readlines"):  # Support StringIO.
        if has_dup_rows:
            return _mo.Multiset(_import_csv(csv_file_or_filepath), direct_load=True)
        else:
            return _mo.Set(_import_csv(csv_file_or_filepath), direct_load=True)\
                .cache_clan(_mo.CacheStatus.IS).cache_functional(_mo.CacheStatus.IS)\
                .cache_regular(_mo.CacheStatus.from_bool(import_csv.regular))
    else:
        with open(csv_file_or_filepath, encoding='utf-8', errors='ignore') as file:
            if has_dup_rows:
                return _mo.Multiset(_import_csv(file), direct_load=True)
            else:
                return _mo.Set(_import_csv(file), direct_load=True)\
                    .cache_clan(_mo.CacheStatus.IS).cache_functional(_mo.CacheStatus.IS)\
                    .cache_regular(_mo.CacheStatus.from_bool(import_csv.regular))