Source code for algebraixlib.import_export.rdf

"""Import and export facilities for RDF data."""

# Copyright Algebraix Data Corporation 2015 - 2017
#
# This file is part of algebraixlib <http://github.com/AlgebraixData/algebraixlib>.
#
# algebraixlib is free software: you can redistribute it and/or modify it under the terms of version
# 3 of the GNU Lesser General Public License as published by the Free Software Foundation.
#
# algebraixlib is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License along with algebraixlib.
# If not, see <http://www.gnu.org/licenses/>.
# --------------------------------------------------------------------------------------------------
import abc as _abc
import rdflib as _rdflib

import algebraixlib.algebras.clans as _clans
import algebraixlib.algebras.relations as _relations
import algebraixlib.mathobjects as _mo
import algebraixlib.undef as _ud
import algebraixlib.util.miscellaneous as _misc
import algebraixlib.util.rdf as _rdf


def import_graph(graph_file_or_filepath, rdf_format: str=None) -> 'PP(A x A)':
    r"""Return an absolute clan that represents the RDF graph ``graph_file_or_filepath``.

    -   The :term:`graph` is represented by a :term:`regular`, :term:`absolute clan`.
    -   The :term:`left component`\s are 's' (subject), 'p' (predicate) and 'o' (object) (see
        :term:`triple`).
    -   Blank nodes are converted to IRIs (skolemization).
    -   URI references are represented as ``rdflib.URIRef`` instances (embedded in an
        :class:`~.Atom`).
    -   Literals are converted to intuitive native Python types (embedded in an :class:`~.Atom`).

    :param graph_file_or_filepath: A string that is a path (relative or absolute) to the file to
        be imported, or a file object (like `StringIO`).
    :param rdf_format: The format of the RDF graph that is being imported. See `Plugin parsers`_
        for a list of supported formats and their corresponding strings. If none is given, the
        file name's extension is used to guess a format. If that fails, Turtle is used as default.
    :return: An :term:`absolute clan` that represents the RDF graph that has been imported.

    .. _Plugin parsers: http://rdflib.readthedocs.org/en/latest/plugin_parsers.html
    """
    if rdf_format is None:
        try:
            rdf_format = _rdflib.util.guess_format(graph_file_or_filepath) or 'turtle'
        except AttributeError:
            rdf_format = 'turtle'
    graph = _rdflib.Graph()
    graph.parse(source=graph_file_or_filepath, format=rdf_format)
    # .skolemize() returns a new graph with the blank nodes converted to IRIs; it does not modify
    # the graph in place, so its result must be captured.
    graph = graph.skolemize()
    return _convert_graph_to_mathobjects(graph)
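

# A minimal usage sketch (not part of the library API): imports a tiny Turtle document from an
# in-memory file object. The helper name ``_example_import_usage`` and the sample data are
# assumptions made for illustration only.
def _example_import_usage():
    import io
    turtle_doc = io.StringIO(
        '@prefix ex: <http://example.org/> .\n'
        'ex:alice ex:knows ex:bob .\n')
    # A StringIO has no file name to guess the format from, so the format is given explicitly.
    triples = import_graph(turtle_doc, rdf_format='turtle')
    # The result is an absolute clan; each relation has the lefts 's', 'p' and 'o'.
    for triple in triples:
        print(triple)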


class _ExportTable(_abc.ABC):
    """A base class for data export writers for tabular data (absolute clans)."""

    def __init__(self, clan: 'PP(A x A)', ordered_lefts=None, leftset: 'P( A )'=None):
        """Constructor. Call in the derived class's constructor.

        :param clan: The tabular data to be exported.
        :param ordered_lefts: An optional (ordered) list of left atoms that determines the columns
            to be exported from ``clan`` and their order.
        :param leftset: An optional (unordered) set of left atoms that determines the columns to
            be exported from ``clan``. If neither ``ordered_lefts`` nor ``leftset`` is given, all
            left components from ``clan`` are exported.
        """
        # Is set in .export()
        self._file = None

        if not _clans.is_member(clan):
            raise TypeError("'clan' must be a clan")
        self._clan = clan

        if ordered_lefts is not None:
            if leftset is not None:
                raise AssertionError("Only one of 'ordered_lefts' and 'leftset' may be given")
            if len(ordered_lefts) == 0:
                raise AssertionError("'ordered_lefts' must contain at least one left component")
            for left in ordered_lefts:
                if not left.is_atom:
                    raise TypeError("'ordered_lefts' must consist of Atom instances")
            self._ordered_lefts = ordered_lefts
        elif leftset is not None:
            self._ordered_lefts = []
            for left in leftset:
                if not left.is_atom:
                    raise TypeError("'leftset' must consist of Atom instances")
                self._ordered_lefts.append(left)
            if len(self._ordered_lefts) == 0:
                raise AssertionError("'leftset' must contain at least one left component")
        else:
            lefts = _clans.get_lefts(clan, _checked=False)
            self._ordered_lefts = [left for left in lefts]

    def export(self, file_or_path):
        """Export the data of this instance to ``file_or_path``.

        :param file_or_path: Either a file path (in this case the data is written to a file at
            this location) or a file object (in this case the data is written to its ``.write()``
            function).
        """
        def write_data(out_file):
            self._file = out_file
            self._doc_start()
            self._write_header()
            self._write_body()
            self._doc_end()

        _misc.write_to_file_or_path(file_or_path, write_data)

    # ----------------------------------------------------------------------------------------------

    @_abc.abstractmethod
    def _doc_start(self):
        """Document start sequence."""
        pass

    @_abc.abstractmethod
    def _doc_end(self):
        """Document end sequence."""
        pass

    @_abc.abstractmethod
    def _header_start(self):
        """Header start sequence."""
        pass

    @_abc.abstractmethod
    def _header_between_lefts(self):
        """Sequence between the lefts in the header (that's not in start or end)."""
        pass

    @_abc.abstractmethod
    def _header_end(self):
        """Header end sequence."""
        pass

    @_abc.abstractmethod
    def _body_start(self):
        """Body (data) start sequence."""
        pass

    @_abc.abstractmethod
    def _body_end(self):
        """Body (data) end sequence."""
        pass

    @_abc.abstractmethod
    def _row_start(self):
        """Row (relation) start sequence."""
        pass

    @_abc.abstractmethod
    def _row_between_relations(self):
        """Sequence between the rows/relations (that's not in start or end)."""
        pass

    @_abc.abstractmethod
    def _row_end(self):
        """Row (relation) end sequence."""
        pass

    @_abc.abstractmethod
    def _write_item(self, left, right, prefix_separator):
        """Write an item. Prefix a separator (if this is not handled by
        ``_item_between_couplets()``) if ``prefix_separator`` is ``True``."""
        pass

    @_abc.abstractmethod
    def _item_between_couplets(self):
        """Sequence between the items/couplets (that's not in ``_write_item()``)."""
        pass

    @_abc.abstractmethod
    def _write_atom(self, atom: _mo.Atom):
        """Write a data item. Handle escaping, quoting etc. -- everything to convert an atom into
        an output string."""
        pass

    # ----------------------------------------------------------------------------------------------

    def _write_header(self):
        self._header_start()
        itr = iter(self._ordered_lefts)
        self._write_atom(next(itr))
        for left in itr:
            self._header_between_lefts()
            self._write_atom(left)
        self._header_end()

    def _write_body(self):
        self._body_start()
        relation_itr = iter(self._clan)
        self._write_row(next(relation_itr))
        for relation in relation_itr:
            self._row_between_relations()
            self._write_row(relation)
        self._body_end()

    def _write_row(self, relation: 'P(A x A)'):
        self._row_start()
        lefts_itr = iter(self._ordered_lefts)
        left = next(lefts_itr)
        self._write_item_wrapper(relation, left, False)
        for left in lefts_itr:
            self._item_between_couplets()
            self._write_item_wrapper(relation, left, True)
        self._row_end()

    def _write_item_wrapper(self, relation: 'P(A x A)', left: '( A )', prefix_separator: bool):
        right = _relations.get_right(relation, left)
        self._write_item(left, right, prefix_separator)
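

# A minimal sketch (not part of algebraixlib): a hypothetical ``_ExportTable`` subclass that
# writes tab-separated values. It illustrates which hooks a concrete writer has to provide and
# how they are combined by ``export()``; the name ``_ExampleTsv`` and the naive (unescaped) value
# formatting are assumptions made for illustration only.
class _ExampleTsv(_ExportTable):
    def _doc_start(self): pass
    def _doc_end(self): pass
    def _header_start(self): pass
    def _header_between_lefts(self): self._file.write('\t')
    def _header_end(self): self._file.write('\n')
    def _body_start(self): pass
    def _body_end(self): pass
    def _row_start(self): pass
    def _row_between_relations(self): pass
    def _row_end(self): self._file.write('\n')
    def _item_between_couplets(self): self._file.write('\t')

    def _write_item(self, left, right, prefix_separator):
        # Missing values (Undef) become empty fields; the tab separator is written by
        # _item_between_couplets(), so prefix_separator is not needed here.
        if right is not _ud.Undef():
            self._write_atom(right)

    def _write_atom(self, atom: _mo.Atom):
        # Naive formatting: no escaping of embedded tabs or newlines.
        self._file.write(str(atom.value))

# Such a writer could then be used as, for example,
# ``_ExampleTsv(clan=my_table, ordered_lefts=my_columns).export('out.tsv')``.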


def export_table(file_or_path, lefts: 'P( A )', table: 'PP(A x A)', out_format: str='csv'):
    r"""Export the :term:`absolute clan` ``table`` to ``file_or_path``, serialized in a supported
    RDF table format.

    **Limitations**:

    -   Leading '?' and '$' are not stripped from lefts (variable names).
    -   Non-printable characters are backslash-escaped.
    -   ``table`` must be an :term:`absolute` :term:`graph`.

    :param file_or_path: Either a file path (in this case the data is written to a file at this
        location) or a file object (in this case the data is written to its ``.write()``
        function).
    :param lefts: The set of the :term:`left component`\s in ``table`` that are exported.
    :param table: An :term:`absolute clan` that contains the data to be exported.
    :param out_format: A supported RDF table format. Supported are ``'csv'``
        (`SPARQL 1.1 Query Results CSV and TSV Formats`_) and ``'json'``
        (`SPARQL 1.1 Query Results JSON Format`_).

    .. _SPARQL 1.1 Query Results CSV and TSV Formats:
        http://www.w3.org/TR/2013/REC-sparql11-results-csv-tsv-20130321/
    .. _SPARQL 1.1 Query Results JSON Format:
        http://www.w3.org/TR/2013/REC-sparql11-results-json-20130321/
    """
    # noinspection PyPep8
    class Csv(_ExportTable):
        """Specialize ``_ExportTable`` for RDF CSV."""
        def __init__(self, clan, ordered_lefts):
            super().__init__(clan=clan, ordered_lefts=ordered_lefts)

        def _doc_start(self): pass
        def _doc_end(self): pass
        def _header_start(self): pass
        def _header_between_lefts(self): self._file.write(', ')
        def _header_end(self): self._file.write('\n')
        def _body_start(self): pass
        def _body_end(self): pass
        def _row_start(self): pass
        def _row_between_relations(self): pass
        def _row_end(self): self._file.write('\n')
        def _item_between_couplets(self): self._file.write(', ')

        def _write_item(self, left, right, prefix_separator):
            if right is not _ud.Undef():
                self._write_atom(right)

        def _write_atom(self, atom: _mo.Atom):
            data = str(atom.value).replace('"', '""')
            self._file.write('"' + data + '"')

    # noinspection PyPep8
    class Json(_ExportTable):
        """Specialize ``_ExportTable`` for RDF JSON."""
        def __init__(self, clan, ordered_lefts):
            super().__init__(clan=clan, ordered_lefts=ordered_lefts)

        def _doc_start(self): self._file.write('{')
        def _doc_end(self): self._file.write('}')
        def _header_start(self): self._file.write('"head":{"vars":[')
        def _header_between_lefts(self): self._file.write(', ')
        def _header_end(self): self._file.write(']},\n')
        def _body_start(self): self._file.write('"results":{"bindings":[\n')
        def _body_end(self): self._file.write(']}')
        def _row_start(self): self._file.write('{')
        def _row_between_relations(self): self._file.write(',')
        def _row_end(self): self._file.write('}\n')
        def _item_between_couplets(self): pass

        def _write_item(self, left, right, prefix_separator):
            if right is not _ud.Undef():
                if prefix_separator:
                    self._file.write(', ')
                self._write_atom(left)
                self._file.write(': {"type":')
                if isinstance(right.value, _rdflib.URIRef):
                    self._file.write('"uri"')
                elif isinstance(right.value, _rdflib.BNode):
                    self._file.write('"bnode"')
                else:
                    self._file.write('"literal"')
                lit = _rdflib.Literal(right.value)
                if isinstance(lit.datatype, str):
                    self._file.write(', "datatype":"')
                    self._file.write(lit.datatype)
                    self._file.write('"')
                self._file.write(', "value":')
                self._write_atom(right)
                self._file.write('}\n')

        def _write_atom(self, atom: _mo.Atom):
            self._file.write('"' + str(atom.value) + '"')

    assert _clans.is_absolute_member(table)

    sorted_lefts = sorted(lefts)
    if out_format == 'csv':
        writer = Csv(clan=table, ordered_lefts=sorted_lefts)
    elif out_format == 'json':
        writer = Json(clan=table, ordered_lefts=sorted_lefts)
    else:
        raise AssertionError("'out_format' must be 'csv' or 'json'")
    writer.export(file_or_path)
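

# A minimal usage sketch (not part of the library API): exports a small absolute clan as CSV into
# an in-memory buffer. The helper name ``_example_export_usage`` and the sample data are
# assumptions made for illustration only.
def _example_export_usage():
    import io
    people = _mo.Set(
        _mo.Set(_mo.Couplet('name', 'alice'), _mo.Couplet('age', 34)),
        _mo.Set(_mo.Couplet('name', 'bob'), _mo.Couplet('age', 25)))
    out = io.StringIO()
    # Export only the 'name' and 'age' columns (the column order is determined by sorting).
    export_table(out, lefts=_mo.Set(_mo.Atom('name'), _mo.Atom('age')), table=people,
                 out_format='csv')
    print(out.getvalue())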


def _convert_graph_to_mathobjects(rdflib_graph: _rdflib.Graph) -> 'PP(A x A)':
    """Return a :term:`graph` from the data in the ``rdflib.Graph`` ``rdflib_graph``."""
    return _mo.Set(_make_triple_from_graph(triple_tuple) for triple_tuple in rdflib_graph)


def _make_triple_from_graph(triple_tuple) -> 'P(A x A)':
    """Return a :term:`triple` created from the rdflib triple tuple ``triple_tuple``."""
    return _rdf.make_triple(
        subject=_convert_identifier_to_mathobject(triple_tuple[0]),
        predicate=_convert_identifier_to_mathobject(triple_tuple[1]),
        object_=_convert_identifier_to_mathobject(triple_tuple[2])
    )


def _convert_identifier_to_mathobject(term) -> '( A )':
    """Return an :class:`~.Atom` with the rdflib identifier ``term`` converted into the atom's
    value.

    :param term: Must be an ``rdflib.URIRef``, ``rdflib.BNode`` or an ``rdflib.Literal``.
    :return: An :class:`~.Atom` with a value derived from ``term``:

        -   ``URIRef`` and ``BNode`` instances are inserted into the atom unchanged (they keep
            their rdflib type).
        -   ``Literal`` instances are inserted as the intuitive native Python type (using
            ``Literal.toPython`` -- see the documentation for ``Literal``).
    """
    if isinstance(term, (_rdflib.URIRef, _rdflib.BNode)):
        return _mo.Atom(term)
    elif isinstance(term, _rdflib.Literal):
        return _mo.Atom(term.toPython())
    else:
        raise TypeError("'term' must be 'URIRef', 'BNode' or 'Literal'")
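

# A minimal sketch (not part of the library API) of the term conversion: URI references keep
# their rdflib type inside the atom, while literals become native Python values. The helper name
# ``_example_term_conversion`` is an assumption made for illustration only.
def _example_term_conversion():
    uri_atom = _convert_identifier_to_mathobject(_rdflib.URIRef('http://example.org/alice'))
    assert isinstance(uri_atom.value, _rdflib.URIRef)  # The URIRef is stored unchanged.

    int_literal = _rdflib.Literal('42', datatype=_rdflib.XSD.integer)
    literal_atom = _convert_identifier_to_mathobject(int_literal)
    assert literal_atom.value == 42  # Literal.toPython() yields a native int.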