Source code for algebraixlib.import_export.rdf

"""Import and export facilities for RDF data."""

# Copyright Algebraix Data Corporation 2015 - 2017
#
# This file is part of algebraixlib <http://github.com/AlgebraixData/algebraixlib>.
#
# algebraixlib is free software: you can redistribute it and/or modify it under the terms of version
# 3 of the GNU Lesser General Public License as published by the Free Software Foundation.
#
# algebraixlib is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License along with algebraixlib.
# If not, see <http://www.gnu.org/licenses/>.
# --------------------------------------------------------------------------------------------------
import abc as _abc
import rdflib as _rdflib

import algebraixlib.algebras.clans as _clans
import algebraixlib.algebras.relations as _relations
import algebraixlib.mathobjects as _mo
import algebraixlib.undef as _ud
import algebraixlib.util.miscellaneous as _misc
import algebraixlib.util.rdf as _rdf


def import_graph(graph_file_or_filepath, rdf_format: str=None) -> 'PP(A x A)':
    r"""Return an absolute clan that represents the RDF graph ``graph_file_or_filepath``.

    -   The :term:`graph` is represented by a :term:`regular`, :term:`absolute clan`.
    -   The :term:`left component`\s are 's' (subject), 'p' (predicate) and 'o' (object) (see
        :term:`triple`).
    -   Blank nodes are converted to IRIs (skolemization).
    -   URI references are represented as ``rdflib.URIRef`` instances (embedded in an
        :class:`~.Atom`).
    -   Literals are converted to intuitive native Python types (embedded in an :class:`~.Atom`).

    :param graph_file_or_filepath: A string that is a path (relative or absolute) to the file to
        be imported, or a file object (like `StringIO`).
    :param rdf_format: The format of the RDF graph that is being imported. See `Plugin parsers`_
        for a list of supported formats and their corresponding strings. If none is given, the
        file name's extension is used to guess a format. If that fails, Turtle is used as default.
    :return: An :term:`absolute clan` that represents the RDF graph that has been imported.

    .. _Plugin parsers: http://rdflib.readthedocs.org/en/latest/plugin_parsers.html
    """
    if rdf_format is None:
        try:
            rdf_format = _rdflib.util.guess_format(graph_file_or_filepath) or 'turtle'
        except AttributeError:
            rdf_format = 'turtle'
    graph = _rdflib.Graph()
    graph.parse(source=graph_file_or_filepath, format=rdf_format)
    # .skolemize() returns a new graph with the blank nodes converted to IRIs; it does not modify
    # the graph in place, so its result must be captured.
    graph = graph.skolemize()
    return _convert_graph_to_mathobjects(graph)
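

# A minimal usage sketch (not part of the library API): imports a tiny Turtle document from an
# in-memory file object. The helper name ``_example_import_usage`` and the sample data are
# assumptions made for illustration only.
def _example_import_usage():
    import io
    turtle_doc = io.StringIO(
        '@prefix ex: <http://example.org/> .\n'
        'ex:alice ex:knows ex:bob .\n')
    # A StringIO has no file name to guess the format from, so the format is given explicitly.
    triples = import_graph(turtle_doc, rdf_format='turtle')
    # The result is an absolute clan; each relation has the lefts 's', 'p' and 'o'.
    for triple in triples:
        print(triple)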


class _ExportTable(_abc.ABC):
    """A base class for data export writers for tabular data (absolute clans)."""

    def __init__(self, clan: 'PP(A x A)', ordered_lefts=None, leftset: 'P( A )'=None):
        """Constructor. Call in the derived class's constructor.

        :param clan: The tabular data to be exported.
        :param ordered_lefts: An optional (ordered) list of left atoms that determines the columns
            to be exported from ``clan`` and their order.
        :param leftset: An optional (unordered) set of left atoms that determines the columns to
            be exported from ``clan``. If neither ``ordered_lefts`` nor ``leftset`` is given, all
            left components from ``clan`` are exported.
        """
        # Is set in .export()
        self._file = None

        if not _clans.is_member(clan):
            raise TypeError("'clan' must be a clan")
        self._clan = clan

        if ordered_lefts is not None:
            if leftset is not None:
                raise AssertionError("Only one of 'ordered_lefts' and 'leftset' may be given")
            if len(ordered_lefts) == 0:
                raise AssertionError("'ordered_lefts' must contain at least one left component")
            for left in ordered_lefts:
                if not left.is_atom:
                    raise TypeError("'ordered_lefts' must consist of Atom instances")
            self._ordered_lefts = ordered_lefts
        elif leftset is not None:
            self._ordered_lefts = []
            for left in leftset:
                if not left.is_atom:
                    raise TypeError("'leftset' must consist of Atom instances")
                self._ordered_lefts.append(left)
            if len(self._ordered_lefts) == 0:
                raise AssertionError("'leftset' must contain at least one left component")
        else:
            lefts = _clans.get_lefts(clan, _checked=False)
            self._ordered_lefts = [left for left in lefts]

    def export(self, file_or_path):
        """Export the data of this instance to ``file_or_path``.

        :param file_or_path: Either a file path (in this case the data is written to a file at
            this location) or a file object (in this case the data is written to its ``.write()``
            function).
        """
        def write_data(out_file):
            self._file = out_file
            self._doc_start()
            self._write_header()
            self._write_body()
            self._doc_end()

        _misc.write_to_file_or_path(file_or_path, write_data)

    # ----------------------------------------------------------------------------------------------

    @_abc.abstractmethod
    def _doc_start(self):
        """Document start sequence."""
        pass

    @_abc.abstractmethod
    def _doc_end(self):
        """Document end sequence."""
        pass

    @_abc.abstractmethod
    def _header_start(self):
        """Header start sequence."""
        pass

    @_abc.abstractmethod
    def _header_between_lefts(self):
        """Sequence between the lefts in the header (that's not in start or end)."""
        pass

    @_abc.abstractmethod
    def _header_end(self):
        """Header end sequence."""
        pass

    @_abc.abstractmethod
    def _body_start(self):
        """Body (data) start sequence."""
        pass

    @_abc.abstractmethod
    def _body_end(self):
        """Body (data) end sequence."""
        pass

    @_abc.abstractmethod
    def _row_start(self):
        """Row (relation) start sequence."""
        pass

    @_abc.abstractmethod
    def _row_between_relations(self):
        """Sequence between the rows/relations (that's not in start or end)."""
        pass

    @_abc.abstractmethod
    def _row_end(self):
        """Row (relation) end sequence."""
        pass

    @_abc.abstractmethod
    def _write_item(self, left, right, prefix_separator):
        """Write an item. Prefix a separator (if this is not handled by
        ``_item_between_couplets()``) if ``prefix_separator`` is ``True``."""
        pass

    @_abc.abstractmethod
    def _item_between_couplets(self):
        """Sequence between the items/couplets (that's not in ``_write_item()``)."""
        pass

    @_abc.abstractmethod
    def _write_atom(self, atom: _mo.Atom):
        """Write a data item. Handle escaping, quoting etc. -- everything to convert an atom into
        an output string."""
        pass

    # ----------------------------------------------------------------------------------------------

    def _write_header(self):
        self._header_start()
        itr = iter(self._ordered_lefts)
        self._write_atom(next(itr))
        for left in itr:
            self._header_between_lefts()
            self._write_atom(left)
        self._header_end()

    def _write_body(self):
        self._body_start()
        relation_itr = iter(self._clan)
        self._write_row(next(relation_itr))
        for relation in relation_itr:
            self._row_between_relations()
            self._write_row(relation)
        self._body_end()

    def _write_row(self, relation: 'P(A x A)'):
        self._row_start()
        lefts_itr = iter(self._ordered_lefts)
        left = next(lefts_itr)
        self._write_item_wrapper(relation, left, False)
        for left in lefts_itr:
            self._item_between_couplets()
            self._write_item_wrapper(relation, left, True)
        self._row_end()

    def _write_item_wrapper(self, relation: 'P(A x A)', left: '( A )', prefix_separator: bool):
        right = _relations.get_right(relation, left)
        self._write_item(left, right, prefix_separator)
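

# A minimal sketch (not part of algebraixlib): a hypothetical ``_ExportTable`` subclass that
# writes tab-separated values. It illustrates which hooks a concrete writer has to provide and
# how they are combined by ``export()``; the name ``_ExampleTsv`` and the naive (unescaped) value
# formatting are assumptions made for illustration only.
class _ExampleTsv(_ExportTable):
    def _doc_start(self): pass
    def _doc_end(self): pass
    def _header_start(self): pass
    def _header_between_lefts(self): self._file.write('\t')
    def _header_end(self): self._file.write('\n')
    def _body_start(self): pass
    def _body_end(self): pass
    def _row_start(self): pass
    def _row_between_relations(self): pass
    def _row_end(self): self._file.write('\n')
    def _item_between_couplets(self): self._file.write('\t')

    def _write_item(self, left, right, prefix_separator):
        # Missing values (Undef) become empty fields; the tab separator is written by
        # _item_between_couplets(), so prefix_separator is not needed here.
        if right is not _ud.Undef():
            self._write_atom(right)

    def _write_atom(self, atom: _mo.Atom):
        # Naive formatting: no escaping of embedded tabs or newlines.
        self._file.write(str(atom.value))

# Such a writer could then be used as, for example,
# ``_ExampleTsv(clan=my_table, ordered_lefts=my_columns).export('out.tsv')``.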


def export_table(file_or_path, lefts: 'P( A )', table: 'PP(A x A)', out_format: str='csv'):
    r"""Export the :term:`absolute clan` ``table`` to ``file_or_path``, serialized in a supported
    RDF table format.

    **Limitations**:

    -   Leading '?' and '$' are not stripped from lefts (variable names).
    -   Non-printable characters are backslash-escaped.
    -   ``table`` must be an :term:`absolute` :term:`graph`.

    :param file_or_path: Either a file path (in this case the data is written to a file at this
        location) or a file object (in this case the data is written to its ``.write()``
        function).
    :param lefts: The set of the :term:`left component`\s in ``table`` that are exported.
    :param table: An :term:`absolute clan` that contains the data to be exported.
    :param out_format: A supported RDF table format. Supported are ``'csv'``
        (`SPARQL 1.1 Query Results CSV and TSV Formats`_) and ``'json'``
        (`SPARQL 1.1 Query Results JSON Format`_).

    .. _SPARQL 1.1 Query Results CSV and TSV Formats:
        http://www.w3.org/TR/2013/REC-sparql11-results-csv-tsv-20130321/
    .. _SPARQL 1.1 Query Results JSON Format:
        http://www.w3.org/TR/2013/REC-sparql11-results-json-20130321/
    """
    # noinspection PyPep8
    class Csv(_ExportTable):
        """Specialize ``_ExportTable`` for RDF CSV."""
        def __init__(self, clan, ordered_lefts):
            super().__init__(clan=clan, ordered_lefts=ordered_lefts)

        def _doc_start(self): pass
        def _doc_end(self): pass
        def _header_start(self): pass
        def _header_between_lefts(self): self._file.write(', ')
        def _header_end(self): self._file.write('\n')
        def _body_start(self): pass
        def _body_end(self): pass
        def _row_start(self): pass
        def _row_between_relations(self): pass
        def _row_end(self): self._file.write('\n')
        def _item_between_couplets(self): self._file.write(', ')

        def _write_item(self, left, right, prefix_separator):
            if right is not _ud.Undef():
                self._write_atom(right)

        def _write_atom(self, atom: _mo.Atom):
            data = str(atom.value).replace('"', '""')
            self._file.write('"' + data + '"')

    # noinspection PyPep8
    class Json(_ExportTable):
        """Specialize ``_ExportTable`` for RDF JSON."""
        def __init__(self, clan, ordered_lefts):
            super().__init__(clan=clan, ordered_lefts=ordered_lefts)

        def _doc_start(self): self._file.write('{')
        def _doc_end(self): self._file.write('}')
        def _header_start(self): self._file.write('"head":{"vars":[')
        def _header_between_lefts(self): self._file.write(', ')
        def _header_end(self): self._file.write(']},\n')
        def _body_start(self): self._file.write('"results":{"bindings":[\n')
        def _body_end(self): self._file.write(']}')
        def _row_start(self): self._file.write('{')
        def _row_between_relations(self): self._file.write(',')
        def _row_end(self): self._file.write('}\n')
        def _item_between_couplets(self): pass

        def _write_item(self, left, right, prefix_separator):
            if right is not _ud.Undef():
                if prefix_separator:
                    self._file.write(', ')
                self._write_atom(left)
                self._file.write(': {"type":')
                if isinstance(right.value, _rdflib.URIRef):
                    self._file.write('"uri"')
                elif isinstance(right.value, _rdflib.BNode):
                    self._file.write('"bnode"')
                else:
                    self._file.write('"literal"')
                lit = _rdflib.Literal(right.value)
                if isinstance(lit.datatype, str):
                    self._file.write(', "datatype":"')
                    self._file.write(lit.datatype)
                    self._file.write('"')
                self._file.write(', "value":')
                self._write_atom(right)
                self._file.write('}\n')

        def _write_atom(self, atom: _mo.Atom):
            self._file.write('"' + str(atom.value) + '"')

    assert _clans.is_absolute_member(table)

    sorted_lefts = sorted(lefts)
    if out_format == 'csv':
        writer = Csv(clan=table, ordered_lefts=sorted_lefts)
    elif out_format == 'json':
        writer = Json(clan=table, ordered_lefts=sorted_lefts)
    else:
        raise AssertionError("'out_format' must be 'csv' or 'json'")
    writer.export(file_or_path)
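

# A minimal usage sketch (not part of the library API): exports a small absolute clan as CSV into
# an in-memory buffer. The helper name ``_example_export_usage`` and the sample data are
# assumptions made for illustration only.
def _example_export_usage():
    import io
    people = _mo.Set(
        _mo.Set(_mo.Couplet('name', 'alice'), _mo.Couplet('age', 34)),
        _mo.Set(_mo.Couplet('name', 'bob'), _mo.Couplet('age', 25)))
    out = io.StringIO()
    # Export only the 'name' and 'age' columns (the column order is determined by sorting).
    export_table(out, lefts=_mo.Set(_mo.Atom('name'), _mo.Atom('age')), table=people,
                 out_format='csv')
    print(out.getvalue())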


def _convert_graph_to_mathobjects(rdflib_graph: _rdflib.Graph) -> 'PP(A x A)':
    """Return a :term:`graph` from the data in the ``rdflib.Graph`` ``rdflib_graph``."""
    return _mo.Set(_make_triple_from_graph(triple_tuple) for triple_tuple in rdflib_graph)


def _make_triple_from_graph(triple_tuple) -> 'P(A x A)':
    """Return a :term:`triple` created from the rdflib triple tuple ``triple_tuple``."""
    return _rdf.make_triple(
        subject=_convert_identifier_to_mathobject(triple_tuple[0]),
        predicate=_convert_identifier_to_mathobject(triple_tuple[1]),
        object_=_convert_identifier_to_mathobject(triple_tuple[2])
    )


def _convert_identifier_to_mathobject(term) -> '( A )':
    """Return an :class:`~.Atom` with the rdflib identifier ``term`` converted into the atom's
    value.

    :param term: Must be an ``rdflib.URIRef``, ``rdflib.BNode`` or an ``rdflib.Literal``.
    :return: An :class:`~.Atom` with a value derived from ``term``:

        -   ``URIRef`` and ``BNode`` instances are inserted into the atom unchanged (they keep
            their rdflib type).
        -   ``Literal`` instances are inserted as the intuitive native Python type (using
            ``Literal.toPython`` -- see the documentation for ``Literal``).
    """
    if isinstance(term, (_rdflib.URIRef, _rdflib.BNode)):
        return _mo.Atom(term)
    elif isinstance(term, _rdflib.Literal):
        return _mo.Atom(term.toPython())
    else:
        raise TypeError("'term' must be 'URIRef', 'BNode' or 'Literal'")
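

# A minimal sketch (not part of the library API) of the term conversion: URI references keep
# their rdflib type inside the atom, while literals become native Python values. The helper name
# ``_example_term_conversion`` is an assumption made for illustration only.
def _example_term_conversion():
    uri_atom = _convert_identifier_to_mathobject(_rdflib.URIRef('http://example.org/alice'))
    assert isinstance(uri_atom.value, _rdflib.URIRef)  # The URIRef is stored unchanged.

    int_literal = _rdflib.Literal('42', datatype=_rdflib.XSD.integer)
    literal_atom = _convert_identifier_to_mathobject(int_literal)
    assert literal_atom.value == 42  # Literal.toPython() yields a native int.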