bench_executor.ontop

Ontop is a Virtual Knowledge Graph system. It exposes the content of arbitrary relational databases as knowledge graphs. These graphs are virtual, which means that data remains in the data sources instead of being moved to another database.

Website: https://ontop-vkg.org
Repository: https://github.com/ontop/ontop

  1#!/usr/bin/env python3
  2
  3"""
  4Ontop is a Virtual Knowledge Graph system. It exposes the content of
  5arbitrary relational databases as knowledge graphs. These graphs are virtual,
  6which means that data remains in the data sources instead of being moved
  7to another database.
  8
  9**Website**: https://ontop-vkg.org<br>
 10**Repository**: https://github.com/ontop/ontop
 11"""
 12
 13import os
 14import psutil
 15import configparser
 16from rdflib import Graph, Namespace, RDF, URIRef
 17from timeout_decorator import timeout, TimeoutError  # type: ignore
 18from typing import Dict, Optional
 19from bench_executor.container import Container
 20from bench_executor.logger import Logger
 21
 22VERSION = '5.0.0'
 23TIMEOUT = 6 * 3600  # 6 hours
 24R2RML = Namespace('http://www.w3.org/ns/r2rml#')
 25
 26
 27class Ontop(Container):
 28    """Ontop container super class for OntopMaterialize and OntopVirtualize."""
 29    def __init__(self, name: str, data_path: str, logger: Logger, mode: str):
 30        """Creates an instance of the Ontop class.
 31
 32        Parameters
 33        ----------
 34        name : str
 35            Pretty name of the container.
 36        data_path: str
 37            Path to the data directory of the case.
 38        logger : Logger
 39            Logger to use for log messages.
 40        mode : str
 41            Ontop mode: `materialize` or `endpoint`
 42        """
 43        self._mode = mode
 44        self._headers: Dict[str, Dict[str, str]] = {}
 45        self._logger = logger
 46        self._data_path = data_path
 47
 48        if self._mode == 'endpoint':
 49            subdir = 'ontopvirtualize'
 50        elif self._mode == 'materialize':
 51            subdir = 'ontopmaterialize'
 52        else:
 53            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
 54        os.makedirs(os.path.join(self._data_path, subdir), exist_ok=True)
 55
 56        # Set Java heap to 1/2 of available memory instead of the default 1/4
 57        max_heap = int(psutil.virtual_memory().total * (1/2))
 58
 59        # Configure logging
 60        log_level = 'info'
 61        if self._logger.verbose:
 62            log_level = 'debug'
 63        self._logger.info(f'Initialized Ontop logger at "{log_level}" level')
 64
 65        environment = {'ONTOP_JAVA_ARGS': f'-Xmx{max_heap} -Xms{max_heap}',
 66                       'ONTOP_LOG_LEVEL': log_level}
 67        super().__init__(f'blindreviewing/ontop:v{VERSION}', name,
 68                         self._logger,
 69                         ports={'8888': '8888'},
 70                         environment=environment,
 71                         volumes=[f'{self._data_path}/'
 72                                  f'{self.root_mount_directory}:/data',
 73                                  f'{self._data_path}/shared:/data/shared'])
 74
 75    @property
 76    def root_mount_directory(self) -> str:
 77        """Subdirectory in the root directory of the case for Ontop.
 78
 79        Returns
 80        -------
 81        subdirectory : str
 82            Subdirectory of the root directory for Ontop.
 83
 84        """
 85        if self._mode == 'endpoint':
 86            return 'ontopvirtualize'
 87        elif self._mode == 'materialize':
 88            return 'ontopmaterialize'
 89        else:
 90            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
 91
 92    @property
 93    def endpoint(self) -> str:
 94        """SPARQL endpoint URL for Ontop.
 95
 96        Returns
 97        -------
 98        url : str
 99            SPARQL endpoint URL.
100        """
101        return 'http://localhost:8888/sparql'
102
103    @property
104    def headers(self) -> dict:
105        """HTTP headers of SPARQL queries for serialization formats.
106
107        Only supported serialization formats are included in the dictionary.
108        Currently, the following formats are supported:
109        - N-Triples
110        - N-Quads
111        - Turtle
112        - CSV
113        - RDF/JSON
114        - RDF/XML
115        - JSON-LD
116
117        Returns
118        -------
119        headers : dict
120            Dictionary of headers to use for each serialization format.
121        """
122        return self._headers
123
124    def _execute(self, arguments: list) -> bool:
125        """Execute Ontop with given arguments.
126
127        Parameters
128        ----------
129        arguments : list
130            Arguments to supply to Ontop.
131
132        Returns
133        -------
134        success : bool
135            Whether the execution succeeded or not.
136        """
137
138        cmd = f'/ontop/ontop {self._mode} {" ".join(arguments)}'
139        self._logger.info(f'Executing Ontop with command: {cmd}')
140        if self._mode == 'endpoint':
141            log_line = 'OntopEndpointApplication - Started ' + \
142                       'OntopEndpointApplication'
143            success = self.run_and_wait_for_log(log_line, cmd)
144        elif self._mode == 'materialize':
145            success = self.run_and_wait_for_exit(cmd)
146        else:
147            self._logger.error(f'Unknown Ontop mode "{self._mode}"')
148            success = False
149
150        return success
151
152    def _execute_mapping(self,
153                         config_file: str,
154                         arguments: list,
155                         mapping_file: str,
156                         output_file: Optional[str],
157                         rdb_username: str,
158                         rdb_password: str,
159                         rdb_host: str,
160                         rdb_port: int,
161                         rdb_name: str,
162                         rdb_type: str) -> bool:
163        """Execute a mapping file with Ontop.
164
165        Only relational databases are supported by
166        Ontop, thus the relational database parameters are mandantory.
167
168        Parameters
169        ----------
170        config_file : str
171            Name of the generated config file for Ontop.
172        arguments : list
173            List of arguments to pass to Ontop.
174        mapping_file : str
175            Name of the mapping file to use.
176        output_file : Optional[str]
177            Name of the output file to use. Only applicable for
178            materialization.
179        rdb_username : str
180            Username for the database.
181        rdb_password : str
182            Password for the database.
183        rdb_host : str
184            Hostname for the database.
185        rdb_port : int
186            Port for the database.
187        rdb_name : str
188            Database name for the database.
189        rdb_type : str
190            Database type.
191
192        Returns
193        -------
194        success : bool
195            Whether the execution was successfull or not.
196        """
197        # Generate INI configuration file since no CLI is available
198        config = configparser.ConfigParser()
199        config['root'] = {}
200        if rdb_type == 'MySQL':
201            dsn = f'jdbc:mysql://{rdb_host}:{rdb_port}/{rdb_name}'
202            config['root']['jdbc.url'] = dsn
203            config['root']['jdbc.driver'] = 'com.mysql.cj.jdbc.Driver'
204        elif rdb_type == 'PostgreSQL':
205            dsn = f'jdbc:postgresql://{rdb_host}:{rdb_port}/{rdb_name}'
206            config['root']['jdbc.url'] = dsn
207            config['root']['jdbc.driver'] = 'org.postgresql.Driver'
208        else:
209            msg = f'Unknown RDB type: "{rdb_type}"'
210            self._logger.error(msg)
211            raise ValueError(msg)
212        config['root']['jdbc.user'] = rdb_username
213        config['root']['jdbc.password'] = rdb_password
214
215        path = os.path.join(self._data_path, self.root_mount_directory)
216        os.makedirs(path, exist_ok=True)
217        with open(os.path.join(path, 'config.properties'), 'w') as f:
218            config.write(f, space_around_delimiters=False)
219
220        # .properties files are like .ini files but without a [HEADER]
221        # Use a [root] header and remove it after writing
222        with open(os.path.join(path, 'config.properties'), 'r') as f:
223            data = f.read()
224
225        with open(os.path.join(path, 'config.properties'), 'w') as f:
226            f.write(data.replace('[root]\n', ''))
227
228        # Compatibility with Ontop requiring rr:class
229        # Replace any rdf:type construction with rr:class
230        # Without this, a strange error is raised: 'The definition of the
231        # predicate is not always a ground term triple(s,p,o)'
232        g = Graph()
233        g.bind('r2rml', R2RML)
234        g.bind('rdf', RDF)
235        g.parse(os.path.join(self._data_path, 'shared',
236                             os.path.basename(mapping_file)))
237
238        for triples_map_iri, p, o in g.triples((None, RDF.type,
239                                                R2RML.TriplesMap)):
240            subject_map_iri = g.value(triples_map_iri, R2RML.subjectMap)
241
242            iter_pom = g.triples((triples_map_iri,
243                                  R2RML.predicateObjectMap,
244                                  None))
245            for s, p, predicate_object_map_iri in iter_pom:
246                predicate_map_iri = g.value(predicate_object_map_iri,
247                                            R2RML.predicateMap)
248                object_map_iri = g.value(predicate_object_map_iri,
249                                         R2RML.objectMap)
250
251                if predicate_map_iri is None or object_map_iri is None:
252                    continue
253
254                # Check if PredicateObjectMap is pointing to a PredicateMap
255                # specifying rdf:type. Skip this PredicateObjectMap if not
256                if g.value(predicate_map_iri, R2RML.constant) != RDF.type:
257                    continue
258
259                # Retrieve the ObjectMap rr:constant value and add it as
260                # rr:class to the Subject Map is present
261                rdf_type_value = g.value(object_map_iri, R2RML.constant)
262                if rdf_type_value is not None:
263                    iri = URIRef(rdf_type_value.toPython())
264                    g.add((subject_map_iri, R2RML['class'], iri))
265                else:
266                    msg = 'Cannot extract rr:class value, rdf:type value ' + \
267                          'is not a constant value!'
268                    self._logger.error(msg)
269                    return False
270
271                # Remove all triples associated with the rdf:type PredicateMap
272                for s, p, o in g.triples((predicate_map_iri, None, None)):
273                    g.remove((s, p, o))
274
275                # Remove all triples associated with the rdf:type ObjectMap
276                for s, p, o in g.triples((object_map_iri, None, None)):
277                    g.remove((s, p, o))
278
279                # Remove all triples associated with the
280                # rdf:type PredicateObjectMap
281                for s, p, o in g.triples((object_map_iri, None, None)):
282                    g.remove((s, p, o))
283
284                # Remove PredicateObjectMap from Triples Map
285                g.remove((triples_map_iri, R2RML.predicateObjectMap,
286                          predicate_object_map_iri))
287
288            destination = os.path.join(self._data_path,
289                                       self.root_mount_directory,
290                                       'mapping_converted.r2rml.ttl')
291            g.serialize(destination=destination, format='turtle')
292
293        arguments.append('-m')
294        arguments.append('/data/mapping_converted.r2rml.ttl')
295        if output_file is not None:
296            arguments.append('-o')
297            arguments.append(os.path.join('/data/shared/', output_file))
298        arguments.append('-p')
299        arguments.append('/data/config.properties')
300
301        return self._execute(arguments)
302
303
class OntopVirtualize(Ontop):
    """Ontop container which serves a mapping as a SPARQL endpoint."""

    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Set up the Ontop container in `endpoint` mode.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        # Paths are stored in absolute form
        self._data_path = os.path.abspath(data_path)
        self._config_path = os.path.abspath(config_path)
        self._logger = Logger(__name__, directory, verbose)
        super().__init__('Ontop-Virtualize', self._data_path, self._logger,
                         'endpoint')

    def execute_mapping(self,
                        mapping_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Start an Ontop SPARQL endpoint with a mapping.

        Ontop only works on relational databases, so all relational
        database parameters are mandatory. The following serialization
        formats are accepted:
        - N-Triples (Ontop v5+)
        - N-Quads (Ontop v5+)
        - Turtle
        - RDF/JSON
        - RDF/XML
        - JSON-LD
        - CSV

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        serialization : str
            Serialization format to use.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.

        Raises
        ------
        ValueError
            If the serialization format is not supported.
        """
        properties = f'{self._data_path}/{self.root_mount_directory}' \
                     '/config.properties'
        endpoint_arguments = ['--cors-allowed-origins=*', '--port=8888']

        # Register the Accept header of each supported serialization format
        self._headers.update({
            'ntriples': {'Accept': 'application/n-triples'},
            'nquads': {'Accept': 'application/n-quads'},
            'turtle': {'Accept': 'text/turtle'},
            'rdfjson': {'Accept': 'application/rdf+json'},
            'rdfxml': {'Accept': 'application/rdf+xml'},
            'jsonld': {'Accept': 'application/ld+json'},
            'csv': {'Accept': 'text/csv'},
        })

        if serialization not in self._headers:
            msg = f'Unsupported serialization format "{serialization}" ' \
                  'for Ontop'
            self._logger.error(msg)
            raise ValueError(msg)

        # An endpoint produces no output file, hence None
        return super()._execute_mapping(properties, endpoint_arguments,
                                        mapping_file, None, rdb_username,
                                        rdb_password, rdb_host, rdb_port,
                                        rdb_name, rdb_type)
391
392
class OntopMaterialize(Ontop):
    """Ontop container which materializes a R2RML mapping into a file."""

    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Set up the Ontop container in `materialize` mode.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        # Paths are stored in absolute form
        self._data_path = os.path.abspath(data_path)
        self._config_path = os.path.abspath(config_path)
        self._logger = Logger(__name__, directory, verbose)

        mount_directory = os.path.join(self._data_path, 'ontopmaterialize')
        os.makedirs(mount_directory, exist_ok=True)

        super().__init__('Ontop-Materialize', self._data_path, self._logger,
                         'materialize')

    @timeout(TIMEOUT)
    def _execute_mapping_with_timeout(self, mapping_file: str,
                                      output_file: str,
                                      serialization: str,
                                      rdb_username: str,
                                      rdb_password: str,
                                      rdb_host: str,
                                      rdb_port: int,
                                      rdb_name: str,
                                      rdb_type: str) -> bool:
        """Execute a mapping, guarded by the module-wide timeout.

        The parameters are the same as for `execute_mapping`.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.
        """
        properties = f'{self._data_path}/{self.root_mount_directory}' \
                     '/config.properties'
        materialize_arguments = ['-f', serialization]
        # No SPARQL endpoint is involved: no HTTP headers to offer
        self._headers = {}
        return super()._execute_mapping(properties, materialize_arguments,
                                        mapping_file, output_file,
                                        rdb_username, rdb_password,
                                        rdb_host, rdb_port, rdb_name, rdb_type)

    def execute_mapping(self,
                        mapping_file: str,
                        output_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Execute a R2RML mapping with Ontop

        N-Quads and N-Triples are currently supported as serialization
        for Ontop materialize. Ontop only works on relational databases,
        so all relational database parameters are mandatory.

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        output_file : str
            Name of the output file to store the triples in. This is not used
            for OntopVirtualize.
        serialization : str
            Serialization format to use.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the execution was successful or not. A timeout counts
            as a failure.
        """
        try:
            success = self._execute_mapping_with_timeout(mapping_file,
                                                         output_file,
                                                         serialization,
                                                         rdb_username,
                                                         rdb_password,
                                                         rdb_host,
                                                         rdb_port,
                                                         rdb_name,
                                                         rdb_type)
        except TimeoutError:
            # A timeout is reported as a warning and a failed execution
            msg = f'Timeout ({TIMEOUT}s) reached for Ontop Materialize'
            self._logger.warning(msg)
            return False

        return success
class Ontop(bench_executor.container.Container):
 28class Ontop(Container):
 29    """Ontop container super class for OntopMaterialize and OntopVirtualize."""
 30    def __init__(self, name: str, data_path: str, logger: Logger, mode: str):
 31        """Creates an instance of the Ontop class.
 32
 33        Parameters
 34        ----------
 35        name : str
 36            Pretty name of the container.
 37        data_path: str
 38            Path to the data directory of the case.
 39        logger : Logger
 40            Logger to use for log messages.
 41        mode : str
 42            Ontop mode: `materialize` or `endpoint`
 43        """
 44        self._mode = mode
 45        self._headers: Dict[str, Dict[str, str]] = {}
 46        self._logger = logger
 47        self._data_path = data_path
 48
 49        if self._mode == 'endpoint':
 50            subdir = 'ontopvirtualize'
 51        elif self._mode == 'materialize':
 52            subdir = 'ontopmaterialize'
 53        else:
 54            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
 55        os.makedirs(os.path.join(self._data_path, subdir), exist_ok=True)
 56
 57        # Set Java heap to 1/2 of available memory instead of the default 1/4
 58        max_heap = int(psutil.virtual_memory().total * (1/2))
 59
 60        # Configure logging
 61        log_level = 'info'
 62        if self._logger.verbose:
 63            log_level = 'debug'
 64        self._logger.info(f'Initialized Ontop logger at "{log_level}" level')
 65
 66        environment = {'ONTOP_JAVA_ARGS': f'-Xmx{max_heap} -Xms{max_heap}',
 67                       'ONTOP_LOG_LEVEL': log_level}
 68        super().__init__(f'blindreviewing/ontop:v{VERSION}', name,
 69                         self._logger,
 70                         ports={'8888': '8888'},
 71                         environment=environment,
 72                         volumes=[f'{self._data_path}/'
 73                                  f'{self.root_mount_directory}:/data',
 74                                  f'{self._data_path}/shared:/data/shared'])
 75
 76    @property
 77    def root_mount_directory(self) -> str:
 78        """Subdirectory in the root directory of the case for Ontop.
 79
 80        Returns
 81        -------
 82        subdirectory : str
 83            Subdirectory of the root directory for Ontop.
 84
 85        """
 86        if self._mode == 'endpoint':
 87            return 'ontopvirtualize'
 88        elif self._mode == 'materialize':
 89            return 'ontopmaterialize'
 90        else:
 91            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
 92
 93    @property
 94    def endpoint(self) -> str:
 95        """SPARQL endpoint URL for Ontop.
 96
 97        Returns
 98        -------
 99        url : str
100            SPARQL endpoint URL.
101        """
102        return 'http://localhost:8888/sparql'
103
104    @property
105    def headers(self) -> dict:
106        """HTTP headers of SPARQL queries for serialization formats.
107
108        Only supported serialization formats are included in the dictionary.
109        Currently, the following formats are supported:
110        - N-Triples
111        - N-Quads
112        - Turtle
113        - CSV
114        - RDF/JSON
115        - RDF/XML
116        - JSON-LD
117
118        Returns
119        -------
120        headers : dict
121            Dictionary of headers to use for each serialization format.
122        """
123        return self._headers
124
125    def _execute(self, arguments: list) -> bool:
126        """Execute Ontop with given arguments.
127
128        Parameters
129        ----------
130        arguments : list
131            Arguments to supply to Ontop.
132
133        Returns
134        -------
135        success : bool
136            Whether the execution succeeded or not.
137        """
138
139        cmd = f'/ontop/ontop {self._mode} {" ".join(arguments)}'
140        self._logger.info(f'Executing Ontop with command: {cmd}')
141        if self._mode == 'endpoint':
142            log_line = 'OntopEndpointApplication - Started ' + \
143                       'OntopEndpointApplication'
144            success = self.run_and_wait_for_log(log_line, cmd)
145        elif self._mode == 'materialize':
146            success = self.run_and_wait_for_exit(cmd)
147        else:
148            self._logger.error(f'Unknown Ontop mode "{self._mode}"')
149            success = False
150
151        return success
152
153    def _execute_mapping(self,
154                         config_file: str,
155                         arguments: list,
156                         mapping_file: str,
157                         output_file: Optional[str],
158                         rdb_username: str,
159                         rdb_password: str,
160                         rdb_host: str,
161                         rdb_port: int,
162                         rdb_name: str,
163                         rdb_type: str) -> bool:
164        """Execute a mapping file with Ontop.
165
166        Only relational databases are supported by
167        Ontop, thus the relational database parameters are mandantory.
168
169        Parameters
170        ----------
171        config_file : str
172            Name of the generated config file for Ontop.
173        arguments : list
174            List of arguments to pass to Ontop.
175        mapping_file : str
176            Name of the mapping file to use.
177        output_file : Optional[str]
178            Name of the output file to use. Only applicable for
179            materialization.
180        rdb_username : str
181            Username for the database.
182        rdb_password : str
183            Password for the database.
184        rdb_host : str
185            Hostname for the database.
186        rdb_port : int
187            Port for the database.
188        rdb_name : str
189            Database name for the database.
190        rdb_type : str
191            Database type.
192
193        Returns
194        -------
195        success : bool
196            Whether the execution was successfull or not.
197        """
198        # Generate INI configuration file since no CLI is available
199        config = configparser.ConfigParser()
200        config['root'] = {}
201        if rdb_type == 'MySQL':
202            dsn = f'jdbc:mysql://{rdb_host}:{rdb_port}/{rdb_name}'
203            config['root']['jdbc.url'] = dsn
204            config['root']['jdbc.driver'] = 'com.mysql.cj.jdbc.Driver'
205        elif rdb_type == 'PostgreSQL':
206            dsn = f'jdbc:postgresql://{rdb_host}:{rdb_port}/{rdb_name}'
207            config['root']['jdbc.url'] = dsn
208            config['root']['jdbc.driver'] = 'org.postgresql.Driver'
209        else:
210            msg = f'Unknown RDB type: "{rdb_type}"'
211            self._logger.error(msg)
212            raise ValueError(msg)
213        config['root']['jdbc.user'] = rdb_username
214        config['root']['jdbc.password'] = rdb_password
215
216        path = os.path.join(self._data_path, self.root_mount_directory)
217        os.makedirs(path, exist_ok=True)
218        with open(os.path.join(path, 'config.properties'), 'w') as f:
219            config.write(f, space_around_delimiters=False)
220
221        # .properties files are like .ini files but without a [HEADER]
222        # Use a [root] header and remove it after writing
223        with open(os.path.join(path, 'config.properties'), 'r') as f:
224            data = f.read()
225
226        with open(os.path.join(path, 'config.properties'), 'w') as f:
227            f.write(data.replace('[root]\n', ''))
228
229        # Compatibility with Ontop requiring rr:class
230        # Replace any rdf:type construction with rr:class
231        # Without this, a strange error is raised: 'The definition of the
232        # predicate is not always a ground term triple(s,p,o)'
233        g = Graph()
234        g.bind('r2rml', R2RML)
235        g.bind('rdf', RDF)
236        g.parse(os.path.join(self._data_path, 'shared',
237                             os.path.basename(mapping_file)))
238
239        for triples_map_iri, p, o in g.triples((None, RDF.type,
240                                                R2RML.TriplesMap)):
241            subject_map_iri = g.value(triples_map_iri, R2RML.subjectMap)
242
243            iter_pom = g.triples((triples_map_iri,
244                                  R2RML.predicateObjectMap,
245                                  None))
246            for s, p, predicate_object_map_iri in iter_pom:
247                predicate_map_iri = g.value(predicate_object_map_iri,
248                                            R2RML.predicateMap)
249                object_map_iri = g.value(predicate_object_map_iri,
250                                         R2RML.objectMap)
251
252                if predicate_map_iri is None or object_map_iri is None:
253                    continue
254
255                # Check if PredicateObjectMap is pointing to a PredicateMap
256                # specifying rdf:type. Skip this PredicateObjectMap if not
257                if g.value(predicate_map_iri, R2RML.constant) != RDF.type:
258                    continue
259
260                # Retrieve the ObjectMap rr:constant value and add it as
261                # rr:class to the Subject Map is present
262                rdf_type_value = g.value(object_map_iri, R2RML.constant)
263                if rdf_type_value is not None:
264                    iri = URIRef(rdf_type_value.toPython())
265                    g.add((subject_map_iri, R2RML['class'], iri))
266                else:
267                    msg = 'Cannot extract rr:class value, rdf:type value ' + \
268                          'is not a constant value!'
269                    self._logger.error(msg)
270                    return False
271
272                # Remove all triples associated with the rdf:type PredicateMap
273                for s, p, o in g.triples((predicate_map_iri, None, None)):
274                    g.remove((s, p, o))
275
276                # Remove all triples associated with the rdf:type ObjectMap
277                for s, p, o in g.triples((object_map_iri, None, None)):
278                    g.remove((s, p, o))
279
280                # Remove all triples associated with the
281                # rdf:type PredicateObjectMap
282                for s, p, o in g.triples((object_map_iri, None, None)):
283                    g.remove((s, p, o))
284
285                # Remove PredicateObjectMap from Triples Map
286                g.remove((triples_map_iri, R2RML.predicateObjectMap,
287                          predicate_object_map_iri))
288
289            destination = os.path.join(self._data_path,
290                                       self.root_mount_directory,
291                                       'mapping_converted.r2rml.ttl')
292            g.serialize(destination=destination, format='turtle')
293
294        arguments.append('-m')
295        arguments.append('/data/mapping_converted.r2rml.ttl')
296        if output_file is not None:
297            arguments.append('-o')
298            arguments.append(os.path.join('/data/shared/', output_file))
299        arguments.append('-p')
300        arguments.append('/data/config.properties')
301
302        return self._execute(arguments)

Ontop container super class for OntopMaterialize and OntopVirtualize.

Ontop( name: str, data_path: str, logger: bench_executor.logger.Logger, mode: str)
30    def __init__(self, name: str, data_path: str, logger: Logger, mode: str):
31        """Creates an instance of the Ontop class.
32
33        Parameters
34        ----------
35        name : str
36            Pretty name of the container.
37        data_path: str
38            Path to the data directory of the case.
39        logger : Logger
40            Logger to use for log messages.
41        mode : str
42            Ontop mode: `materialize` or `endpoint`
43        """
44        self._mode = mode
45        self._headers: Dict[str, Dict[str, str]] = {}
46        self._logger = logger
47        self._data_path = data_path
48
49        if self._mode == 'endpoint':
50            subdir = 'ontopvirtualize'
51        elif self._mode == 'materialize':
52            subdir = 'ontopmaterialize'
53        else:
54            raise ValueError(f'Unknown Ontop mode: "{self._mode}"')
55        os.makedirs(os.path.join(self._data_path, subdir), exist_ok=True)
56
57        # Set Java heap to 1/2 of available memory instead of the default 1/4
58        max_heap = int(psutil.virtual_memory().total * (1/2))
59
60        # Configure logging
61        log_level = 'info'
62        if self._logger.verbose:
63            log_level = 'debug'
64        self._logger.info(f'Initialized Ontop logger at "{log_level}" level')
65
66        environment = {'ONTOP_JAVA_ARGS': f'-Xmx{max_heap} -Xms{max_heap}',
67                       'ONTOP_LOG_LEVEL': log_level}
68        super().__init__(f'blindreviewing/ontop:v{VERSION}', name,
69                         self._logger,
70                         ports={'8888': '8888'},
71                         environment=environment,
72                         volumes=[f'{self._data_path}/'
73                                  f'{self.root_mount_directory}:/data',
74                                  f'{self._data_path}/shared:/data/shared'])

Creates an instance of the Ontop class.

Parameters
  • name (str): Pretty name of the container.
  • data_path (str): Path to the data directory of the case.
  • logger (Logger): Logger to use for log messages.
  • mode (str): Ontop mode: materialize or endpoint
root_mount_directory: str

Subdirectory in the root directory of the case for Ontop.

Returns
  • subdirectory (str): Subdirectory of the root directory for Ontop.
endpoint: str

SPARQL endpoint URL for Ontop.

Returns
  • url (str): SPARQL endpoint URL.
headers: dict

HTTP headers of SPARQL queries for serialization formats.

Only supported serialization formats are included in the dictionary. Currently, the following formats are supported:

  • N-Triples
  • N-Quads
  • Turtle
  • CSV
  • RDF/JSON
  • RDF/XML
  • JSON-LD
Returns
  • headers (dict): Dictionary of headers to use for each serialization format.
class OntopVirtualize(Ontop):
class OntopVirtualize(Ontop):
    """OntopVirtualize container for setting up an Ontop SPARQL endpoint."""
    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Set up the Ontop container in `endpoint` mode.

        Both paths are converted to absolute paths before they are stored.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        self._config_path = os.path.abspath(config_path)
        self._data_path = os.path.abspath(data_path)
        self._logger = Logger(__name__, directory, verbose)
        super().__init__(name='Ontop-Virtualize',
                         data_path=self._data_path,
                         logger=self._logger,
                         mode='endpoint')

    def execute_mapping(self,
                        mapping_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Start an Ontop SPARQL endpoint with a mapping.

        Only relational databases are supported by Ontop, thus the
        relational database parameters are mandatory.
        Ontop SPARQL endpoint supports the following serialization formats:
        - N-Triples (Ontop v5+)
        - N-Quads (Ontop v5+)
        - Turtle
        - RDF/JSON
        - RDF/XML
        - JSON-LD
        - CSV

        The HTTP `Accept` header of each supported format is registered in
        the headers dictionary of this instance before the requested
        serialization is validated.

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        serialization : str
            Serialization format to use.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.

        Raises
        ------
        ValueError
            If `serialization` is not one of the supported formats.
        """
        config_file = (f'{self._data_path}/{self.root_mount_directory}'
                       '/config.properties')
        # The endpoint listens on port 8888, the port published by the
        # Ontop base class when it creates the container.
        arguments = ['--cors-allowed-origins=*', '--port=8888']
        self._headers.update({
            'ntriples': {'Accept': 'application/n-triples'},
            'nquads': {'Accept': 'application/n-quads'},
            'turtle': {'Accept': 'text/turtle'},
            'rdfjson': {'Accept': 'application/rdf+json'},
            'rdfxml': {'Accept': 'application/rdf+xml'},
            'jsonld': {'Accept': 'application/ld+json'},
            'csv': {'Accept': 'text/csv'},
        })
        if serialization not in self._headers:
            msg = ('Unsupported serialization format '
                   f'"{serialization}" for Ontop')
            self._logger.error(msg)
            raise ValueError(msg)
        # `None` fills the position in which OntopMaterialize passes its
        # output file.
        return super()._execute_mapping(config_file, arguments,
                                        mapping_file, None, rdb_username,
                                        rdb_password, rdb_host, rdb_port,
                                        rdb_name, rdb_type)

OntopVirtualize container for setting up an Ontop SPARQL endpoint.

OntopVirtualize(data_path: str, config_path: str, directory: str, verbose: bool)
307    def __init__(self, data_path: str, config_path: str, directory: str,
308                 verbose: bool):
309        """Creates an instance of the OntopVirtualize class.
310
311        Parameters
312        ----------
313        data_path : str
314            Path to the data directory of the case.
315        config_path : str
316            Path to the config directory of the case.
317        directory : str
318            Path to the directory to store logs.
319        verbose : bool
320            Enable verbose logs.
321        """
322        self._data_path = os.path.abspath(data_path)
323        self._config_path = os.path.abspath(config_path)
324        self._logger = Logger(__name__, directory, verbose)
325        super().__init__('Ontop-Virtualize', self._data_path, self._logger,
326                         'endpoint')

Creates an instance of the OntopVirtualize class.

Parameters
  • data_path (str): Path to the data directory of the case.
  • config_path (str): Path to the config directory of the case.
  • directory (str): Path to the directory to store logs.
  • verbose (bool): Enable verbose logs.
def execute_mapping( self, mapping_file: str, serialization: str, rdb_username: str, rdb_password: str, rdb_host: str, rdb_port: int, rdb_name: str, rdb_type: str) -> bool:
328    def execute_mapping(self,
329                        mapping_file: str,
330                        serialization: str,
331                        rdb_username: str,
332                        rdb_password: str,
333                        rdb_host: str,
334                        rdb_port: int,
335                        rdb_name: str,
336                        rdb_type: str) -> bool:
337        """Start an Ontop SPARQL endpoint with a mapping.
338
339        Only relational databases are supported by
340        Ontop, thus the relational database parameters are mandantory.
341        Ontop SPARQL endpoint supports the following serialization formats:
342        - N-Triples (Ontop v5+)
343        - N-Quads (Ontop v5+)
344        - Turtle
345        - RDF/JSON
346        - JSON-LD
347        - CSV
348
349        Parameters
350        ----------
351        mapping_file : str
352            Path to the mapping file to execute.
353        serialization : str
354            Serialization format to use.
355        rdb_username : str
356            Username for the database.
357        rdb_password : str
358            Password for the database.
359        rdb_host : str
360            Hostname for the database.
361        rdb_port : int
362            Port for the database.
363        rdb_name : str
364            Database name for the database.
365        rdb_type : str
366            Database type.
367
368        Returns
369        -------
370        success : bool
371            Whether the execution was successfull or not.
372        """
373        config_file = f'{self._data_path}/{self.root_mount_directory}' + \
374                      '/config.properties'
375        arguments = ['--cors-allowed-origins=*', '--port=8888']
376        self._headers['ntriples'] = {'Accept': 'application/n-triples'}
377        self._headers['nquads'] = {'Accept': 'application/n-quads'}
378        self._headers['turtle'] = {'Accept': 'text/turtle'}
379        self._headers['rdfjson'] = {'Accept': 'application/rdf+json'}
380        self._headers['rdfxml'] = {'Accept': 'application/rdf+xml'}
381        self._headers['jsonld'] = {'Accept': 'application/ld+json'}
382        self._headers['csv'] = {'Accept': 'text/csv'}
383        if serialization not in self._headers.keys():
384            msg = 'Unsupported serialization format ' + \
385                  f'"{serialization}" for Ontop'
386            self._logger.error(msg)
387            raise ValueError(msg)
388        return super()._execute_mapping(config_file, arguments,
389                                        mapping_file, None, rdb_username,
390                                        rdb_password, rdb_host, rdb_port,
391                                        rdb_name, rdb_type)

Start an Ontop SPARQL endpoint with a mapping.

Only relational databases are supported by Ontop, thus the relational database parameters are mandatory. Ontop SPARQL endpoint supports the following serialization formats:

  • N-Triples (Ontop v5+)
  • N-Quads (Ontop v5+)
  • Turtle
  • RDF/JSON
  • RDF/XML
  • JSON-LD
  • CSV
Parameters
  • mapping_file (str): Path to the mapping file to execute.
  • serialization (str): Serialization format to use.
  • rdb_username (str): Username for the database.
  • rdb_password (str): Password for the database.
  • rdb_host (str): Hostname for the database.
  • rdb_port (int): Port for the database.
  • rdb_name (str): Database name for the database.
  • rdb_type (str): Database type.
Returns
  • success (bool): Whether the execution was successful or not.
class OntopMaterialize(Ontop):
class OntopMaterialize(Ontop):
    """OntopMaterialize container to execute an R2RML mapping."""
    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Set up the Ontop container in `materialize` mode.

        Both paths are converted to absolute paths before they are stored
        and the `ontopmaterialize` subdirectory of the data directory is
        created when it does not exist yet.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        self._config_path = os.path.abspath(config_path)
        self._data_path = os.path.abspath(data_path)
        self._logger = Logger(__name__, directory, verbose)
        materialize_dir = os.path.join(self._data_path, 'ontopmaterialize')
        os.makedirs(materialize_dir, exist_ok=True)
        super().__init__(name='Ontop-Materialize',
                         data_path=self._data_path,
                         logger=self._logger,
                         mode='materialize')

    @timeout(TIMEOUT)
    def _execute_mapping_with_timeout(self, mapping_file: str,
                                      output_file: str,
                                      serialization: str,
                                      rdb_username: str,
                                      rdb_password: str,
                                      rdb_host: str,
                                      rdb_port: int,
                                      rdb_name: str,
                                      rdb_type: str) -> bool:
        """Execute a mapping, guarded by the `TIMEOUT` decorator.

        The parameters are the same as those of `execute_mapping`.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.
        """
        properties_path = (f'{self._data_path}/{self.root_mount_directory}'
                           '/config.properties')
        cli_arguments = ['-f', serialization]
        # Materialization registers no HTTP headers.
        self._headers = {}
        return super()._execute_mapping(properties_path, cli_arguments,
                                        mapping_file, output_file,
                                        rdb_username, rdb_password,
                                        rdb_host, rdb_port, rdb_name,
                                        rdb_type)

    def execute_mapping(self,
                        mapping_file: str,
                        output_file: str,
                        serialization: str,
                        rdb_username: str,
                        rdb_password: str,
                        rdb_host: str,
                        rdb_port: int,
                        rdb_name: str,
                        rdb_type: str) -> bool:
        """Execute an R2RML mapping with Ontop.

        N-Quads and N-Triples are currently supported as serialization
        for Ontop materialize. Only relational databases are supported by
        Ontop, thus the relational database parameters are mandatory.
        When the timeout is reached, a warning is logged and the execution
        is reported as failed.

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        output_file : str
            Name of the output file to store the triples in. This is not used
            for OntopVirtualize.
        serialization : str
            Serialization format to use.
        rdb_username : str
            Username for the database.
        rdb_password : str
            Password for the database.
        rdb_host : str
            Hostname for the database.
        rdb_port : int
            Port for the database.
        rdb_name : str
            Database name for the database.
        rdb_type : str
            Database type.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.
        """
        try:
            success = self._execute_mapping_with_timeout(
                mapping_file, output_file, serialization, rdb_username,
                rdb_password, rdb_host, rdb_port, rdb_name, rdb_type)
        except TimeoutError:
            # This is the TimeoutError imported from timeout_decorator.
            self._logger.warning(
                f'Timeout ({TIMEOUT}s) reached for Ontop Materialize')
            return False

        return success

OntopMaterialize container to execute an R2RML mapping.

OntopMaterialize(data_path: str, config_path: str, directory: str, verbose: bool)
396    def __init__(self, data_path: str, config_path: str, directory: str,
397                 verbose: bool):
398        """Creates an instance of the OntopMaterialize class.
399
400        Parameters
401        ----------
402        data_path : str
403            Path to the data directory of the case.
404        config_path : str
405            Path to the config directory of the case.
406        directory : str
407            Path to the directory to store logs.
408        verbose : bool
409            Enable verbose logs.
410        """
411        self._data_path = os.path.abspath(data_path)
412        self._config_path = os.path.abspath(config_path)
413        self._logger = Logger(__name__, directory, verbose)
414        os.makedirs(os.path.join(self._data_path, 'ontopmaterialize'),
415                    exist_ok=True)
416        super().__init__('Ontop-Materialize', self._data_path, self._logger,
417                         'materialize')

Creates an instance of the OntopMaterialize class.

Parameters
  • data_path (str): Path to the data directory of the case.
  • config_path (str): Path to the config directory of the case.
  • directory (str): Path to the directory to store logs.
  • verbose (bool): Enable verbose logs.
def execute_mapping( self, mapping_file: str, output_file: str, serialization: str, rdb_username: str, rdb_password: str, rdb_host: str, rdb_port: int, rdb_name: str, rdb_type: str) -> bool:
445    def execute_mapping(self,
446                        mapping_file: str,
447                        output_file: str,
448                        serialization: str,
449                        rdb_username: str,
450                        rdb_password: str,
451                        rdb_host: str,
452                        rdb_port: int,
453                        rdb_name: str,
454                        rdb_type: str) -> bool:
455        """Execute a R2RML mapping with Ontop
456
457        N-Quads and N-Triples are currently supported as serialization
458        for Ontop materialize. Only relational databases are supported by
459        Ontop, thus the relational database parameters are mandantory.
460
461        Parameters
462        ----------
463        mapping_file : str
464            Path to the mapping file to execute.
465        output_file : str
466            Name of the output file to store the triples in. This is not used
467            for OntopVirtualize.
468        serialization : str
469            Serialization format to use.
470        rdb_username : str
471            Username for the database.
472        rdb_password : str
473            Password for the database.
474        rdb_host : str
475            Hostname for the database.
476        rdb_port : int
477            Port for the database.
478        rdb_name : str
479            Database name for the database.
480        rdb_type : str
481            Database type.
482
483        Returns
484        -------
485        success : bool
486            Whether the execution was successfull or not.
487        """
488        try:
489            return self._execute_mapping_with_timeout(mapping_file,
490                                                      output_file,
491                                                      serialization,
492                                                      rdb_username,
493                                                      rdb_password,
494                                                      rdb_host,
495                                                      rdb_port,
496                                                      rdb_name,
497                                                      rdb_type)
498        except TimeoutError:
499            msg = f'Timeout ({TIMEOUT}s) reached for Ontop Materialize'
500            self._logger.warning(msg)
501
502        return False

Execute an R2RML mapping with Ontop.

N-Quads and N-Triples are currently supported as serialization for Ontop materialize. Only relational databases are supported by Ontop, thus the relational database parameters are mandatory.

Parameters
  • mapping_file (str): Path to the mapping file to execute.
  • output_file (str): Name of the output file to store the triples in. This is not used for OntopVirtualize.
  • serialization (str): Serialization format to use.
  • rdb_username (str): Username for the database.
  • rdb_password (str): Password for the database.
  • rdb_host (str): Hostname for the database.
  • rdb_port (int): Port for the database.
  • rdb_name (str): Database name for the database.
  • rdb_type (str): Database type.
Returns
  • success (bool): Whether the execution was successful or not.