bench_executor.morphkgc
Morph-KGC is an engine that constructs RDF and RDF-star knowledge graphs from heterogeneous data sources with the R2RML, RML and RML-star mapping languages.
Website: https://morph-kgc.readthedocs.io/
Repository: https://github.com/oeg-upm/morph-kgc
#!/usr/bin/env python3

"""
Morph-KGC is an engine that constructs RDF and RDF-star knowledge graphs
from heterogeneous data sources with the R2RML, RML and RML-star mapping
languages.

**Website**: https://morph-kgc.readthedocs.io/<br>
**Repository**: https://github.com/oeg-upm/morph-kgc
"""

import os
import configparser
from timeout_decorator import timeout, TimeoutError  # type: ignore
from typing import Optional
from bench_executor.container import Container
from bench_executor.logger import Logger

VERSION = '2.2.0'
TIMEOUT = 6 * 3600  # 6 hours


class MorphKGC(Container):
    """Morph-KGC container for executing R2RML, RML, and RML-star mappings."""

    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Creates an instance of the MorphKGC class.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        self._data_path = os.path.abspath(data_path)
        self._config_path = os.path.abspath(config_path)
        self._logger = Logger(__name__, directory, verbose)
        # Clear the umask so the 'morphkgc' directory is created with the
        # default (wide open) mode; presumably needed so the container can
        # write into the mounted directory -- TODO confirm.
        os.umask(0)
        os.makedirs(os.path.join(self._data_path, 'morphkgc'), exist_ok=True)
        super().__init__(f'blindreviewing/morph-kgc:v{VERSION}', 'Morph-KGC',
                         self._logger,
                         volumes=[f'{self._data_path}/morphkgc:/data',
                                  f'{self._data_path}/shared:/data/shared'])

    @property
    def root_mount_directory(self) -> str:
        """Subdirectory in the root directory of the case for Morph-KGC.

        Returns
        -------
        subdirectory : str
            Subdirectory of the root directory for Morph-KGC.

        """
        return __name__.lower()

    @timeout(TIMEOUT)
    def _execute_with_timeout(self, arguments) -> bool:
        """Execute a mapping with a provided timeout.

        The command line is fixed: Morph-KGC is always started with the
        generated `/data/config_morphkgc.ini`, `arguments` is not used.

        Parameters
        ----------
        arguments : list
            Accepted for interface compatibility, currently ignored.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.
        """
        cmd = 'python3 -m morph_kgc /data/config_morphkgc.ini'
        success = self.run_and_wait_for_exit(cmd)

        return success

    def execute(self, arguments: list) -> bool:
        """Execute Morph-KGC with given arguments.

        Parameters
        ----------
        arguments : list
            Arguments to supply to Morph-KGC.

        Returns
        -------
        success : bool
            Whether the execution succeeded or not. A timeout is logged as
            a warning and reported as a failure.
        """
        try:
            return self._execute_with_timeout(arguments)
        except TimeoutError:
            msg = f'Timeout ({TIMEOUT}s) reached for Morph-KGC'
            self._logger.warning(msg)

        return False

    def execute_mapping(self,
                        mapping_file: str,
                        output_file: str,
                        serialization: str,
                        rdb_username: Optional[str] = None,
                        rdb_password: Optional[str] = None,
                        rdb_host: Optional[str] = None,
                        rdb_port: Optional[int] = None,
                        rdb_name: Optional[str] = None,
                        rdb_type: Optional[str] = None,
                        multiple_files: bool = False) -> bool:
        """Execute a mapping file with Morph-KGC.

        Morph-KGC can transform SQL relational databases (MySQL, PostgreSQL,
        Oracle, Microsoft SQL Server, MariaDB, SQLite), tabular (CSV, TSV,
        Excel, Parquet, Feather, ORC, Stata, SAS, SPSS, ODS) and
        hierarchical files (JSON, XML).

        Morph-KGC currently only supports N-Quads and N-Triples in RDF and
        RDF-Star. RDF and RDF-Star output is done automatically, you can use
        the same serialization format (`ntriples`, `nquads`) for both RDF
        and RDF-Star output.

        Morph-KGC can generate all triples in a single file or spread them
        across multiple files.

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        output_file : str
            Name of the output file to store the triples in.
        serialization : str
            Serialization format to use.
        rdb_username : Optional[str]
            Username for the database, required when a database is used as
            source.
        rdb_password : Optional[str]
            Password for the database, required when a database is used as
            source.
        rdb_host : Optional[str]
            Hostname for the database, required when a database is used as
            source.
        rdb_port : Optional[int]
            Port for the database, required when a database is used as
            source.
        rdb_name : Optional[str]
            Database name for the database, required when a database is used
            as source.
        rdb_type : Optional[str]
            Database type, required when a database is used as source.
        multiple_files : bool
            If the generated triples must be stored in multiple files,
            default a single file.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.

        Raises
        ------
        NotImplementedError
            If `serialization` is neither `nquads` nor `ntriples`.
        ValueError
            If all database parameters are supplied and `rdb_type` is
            neither `MySQL` nor `PostgreSQL`.
        """

        if serialization == 'nquads':
            serialization = 'N-QUADS'
        elif serialization == 'ntriples':
            serialization = 'N-TRIPLES'
        else:
            raise NotImplementedError('Unsupported serialization: '
                                      f'"{serialization}"')

        # Generate INI configuration file since no CLI is available.
        # Interpolation is disabled: the values are generated here and must
        # be written verbatim, the default BasicInterpolation rejects any
        # value with a bare '%' (e.g. in a password) with a ValueError.
        config = configparser.ConfigParser(interpolation=None)
        config['CONFIGURATION'] = {
            'output_format': serialization
        }
        config['DataSource0'] = {
            'mappings': f'/data/shared/{mapping_file}'
        }

        # Morph-KGC can keep the mapping partition results separate, provide
        # this option, default OFF
        if multiple_files:
            config['CONFIGURATION']['output_dir'] = '/data/shared/'
        else:
            config['CONFIGURATION']['output_file'] = \
                f'/data/shared/{output_file}'

        # The database URL is only configured when every database parameter
        # is supplied, otherwise the database parameters are ignored.
        if rdb_username is not None and rdb_password is not None \
                and rdb_host is not None and rdb_port is not None \
                and rdb_name is not None and rdb_type is not None:
            if rdb_type == 'MySQL':
                protocol = 'mysql+pymysql'
            elif rdb_type == 'PostgreSQL':
                protocol = 'postgresql+psycopg2'
            else:
                raise ValueError(f'Unknown RDB type: "{rdb_type}"')
            rdb_dsn = f'{protocol}://{rdb_username}:{rdb_password}' + \
                f'@{rdb_host}:{rdb_port}/{rdb_name}'
            config['DataSource0']['db_url'] = rdb_dsn

        os.umask(0)
        os.makedirs(os.path.join(self._data_path, 'morphkgc'), exist_ok=True)
        path = os.path.join(self._data_path, 'morphkgc', 'config_morphkgc.ini')
        with open(path, 'w') as f:
            config.write(f)

        return self.execute([])
class MorphKGC(Container):
    """Morph-KGC container for executing R2RML, RML, and RML-star mappings."""

    def __init__(self, data_path: str, config_path: str, directory: str,
                 verbose: bool):
        """Creates an instance of the MorphKGC class.

        Parameters
        ----------
        data_path : str
            Path to the data directory of the case.
        config_path : str
            Path to the config directory of the case.
        directory : str
            Path to the directory to store logs.
        verbose : bool
            Enable verbose logs.
        """
        self._data_path = os.path.abspath(data_path)
        self._config_path = os.path.abspath(config_path)
        self._logger = Logger(__name__, directory, verbose)
        # Clear the umask so the 'morphkgc' directory is created with the
        # default (wide open) mode; presumably needed so the container can
        # write into the mounted directory -- TODO confirm.
        os.umask(0)
        os.makedirs(os.path.join(self._data_path, 'morphkgc'), exist_ok=True)
        super().__init__(f'blindreviewing/morph-kgc:v{VERSION}', 'Morph-KGC',
                         self._logger,
                         volumes=[f'{self._data_path}/morphkgc:/data',
                                  f'{self._data_path}/shared:/data/shared'])

    @property
    def root_mount_directory(self) -> str:
        """Subdirectory in the root directory of the case for Morph-KGC.

        Returns
        -------
        subdirectory : str
            Subdirectory of the root directory for Morph-KGC.

        """
        return __name__.lower()

    @timeout(TIMEOUT)
    def _execute_with_timeout(self, arguments) -> bool:
        """Execute a mapping with a provided timeout.

        The command line is fixed: Morph-KGC is always started with the
        generated `/data/config_morphkgc.ini`, `arguments` is not used.

        Parameters
        ----------
        arguments : list
            Accepted for interface compatibility, currently ignored.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.
        """
        cmd = 'python3 -m morph_kgc /data/config_morphkgc.ini'
        success = self.run_and_wait_for_exit(cmd)

        return success

    def execute(self, arguments: list) -> bool:
        """Execute Morph-KGC with given arguments.

        Parameters
        ----------
        arguments : list
            Arguments to supply to Morph-KGC.

        Returns
        -------
        success : bool
            Whether the execution succeeded or not. A timeout is logged as
            a warning and reported as a failure.
        """
        try:
            return self._execute_with_timeout(arguments)
        except TimeoutError:
            msg = f'Timeout ({TIMEOUT}s) reached for Morph-KGC'
            self._logger.warning(msg)

        return False

    def execute_mapping(self,
                        mapping_file: str,
                        output_file: str,
                        serialization: str,
                        rdb_username: Optional[str] = None,
                        rdb_password: Optional[str] = None,
                        rdb_host: Optional[str] = None,
                        rdb_port: Optional[int] = None,
                        rdb_name: Optional[str] = None,
                        rdb_type: Optional[str] = None,
                        multiple_files: bool = False) -> bool:
        """Execute a mapping file with Morph-KGC.

        Morph-KGC can transform SQL relational databases (MySQL, PostgreSQL,
        Oracle, Microsoft SQL Server, MariaDB, SQLite), tabular (CSV, TSV,
        Excel, Parquet, Feather, ORC, Stata, SAS, SPSS, ODS) and
        hierarchical files (JSON, XML).

        Morph-KGC currently only supports N-Quads and N-Triples in RDF and
        RDF-Star. RDF and RDF-Star output is done automatically, you can use
        the same serialization format (`ntriples`, `nquads`) for both RDF
        and RDF-Star output.

        Morph-KGC can generate all triples in a single file or spread them
        across multiple files.

        Parameters
        ----------
        mapping_file : str
            Path to the mapping file to execute.
        output_file : str
            Name of the output file to store the triples in.
        serialization : str
            Serialization format to use.
        rdb_username : Optional[str]
            Username for the database, required when a database is used as
            source.
        rdb_password : Optional[str]
            Password for the database, required when a database is used as
            source.
        rdb_host : Optional[str]
            Hostname for the database, required when a database is used as
            source.
        rdb_port : Optional[int]
            Port for the database, required when a database is used as
            source.
        rdb_name : Optional[str]
            Database name for the database, required when a database is used
            as source.
        rdb_type : Optional[str]
            Database type, required when a database is used as source.
        multiple_files : bool
            If the generated triples must be stored in multiple files,
            default a single file.

        Returns
        -------
        success : bool
            Whether the execution was successful or not.

        Raises
        ------
        NotImplementedError
            If `serialization` is neither `nquads` nor `ntriples`.
        ValueError
            If all database parameters are supplied and `rdb_type` is
            neither `MySQL` nor `PostgreSQL`.
        """

        if serialization == 'nquads':
            serialization = 'N-QUADS'
        elif serialization == 'ntriples':
            serialization = 'N-TRIPLES'
        else:
            raise NotImplementedError('Unsupported serialization: '
                                      f'"{serialization}"')

        # Generate INI configuration file since no CLI is available.
        # Interpolation is disabled: the values are generated here and must
        # be written verbatim, the default BasicInterpolation rejects any
        # value with a bare '%' (e.g. in a password) with a ValueError.
        config = configparser.ConfigParser(interpolation=None)
        config['CONFIGURATION'] = {
            'output_format': serialization
        }
        config['DataSource0'] = {
            'mappings': f'/data/shared/{mapping_file}'
        }

        # Morph-KGC can keep the mapping partition results separate, provide
        # this option, default OFF
        if multiple_files:
            config['CONFIGURATION']['output_dir'] = '/data/shared/'
        else:
            config['CONFIGURATION']['output_file'] = \
                f'/data/shared/{output_file}'

        # The database URL is only configured when every database parameter
        # is supplied, otherwise the database parameters are ignored.
        if rdb_username is not None and rdb_password is not None \
                and rdb_host is not None and rdb_port is not None \
                and rdb_name is not None and rdb_type is not None:
            if rdb_type == 'MySQL':
                protocol = 'mysql+pymysql'
            elif rdb_type == 'PostgreSQL':
                protocol = 'postgresql+psycopg2'
            else:
                raise ValueError(f'Unknown RDB type: "{rdb_type}"')
            rdb_dsn = f'{protocol}://{rdb_username}:{rdb_password}' + \
                f'@{rdb_host}:{rdb_port}/{rdb_name}'
            config['DataSource0']['db_url'] = rdb_dsn

        os.umask(0)
        os.makedirs(os.path.join(self._data_path, 'morphkgc'), exist_ok=True)
        path = os.path.join(self._data_path, 'morphkgc', 'config_morphkgc.ini')
        with open(path, 'w') as f:
            config.write(f)

        return self.execute([])
Morph-KGC container for executing R2RML, RML, and RML-star mappings.
def __init__(self, data_path: str, config_path: str, directory: str,
             verbose: bool):
    """Set up the Morph-KGC container for a single case.

    Parameters
    ----------
    data_path : str
        Path to the data directory of the case.
    config_path : str
        Path to the config directory of the case.
    directory : str
        Path to the directory to store logs.
    verbose : bool
        Enable verbose logs.
    """
    self._data_path = os.path.abspath(data_path)
    self._config_path = os.path.abspath(config_path)
    self._logger = Logger(__name__, directory, verbose)

    # Clear the umask before creating the tool directory; presumably so
    # the container can write into the mounted directory -- TODO confirm.
    os.umask(0)
    tool_directory = os.path.join(self._data_path, 'morphkgc')
    os.makedirs(tool_directory, exist_ok=True)

    # The tool directory is mounted as /data, the shared directory of the
    # case as /data/shared inside the container.
    image = f'blindreviewing/morph-kgc:v{VERSION}'
    mounts = [
        f'{self._data_path}/morphkgc:/data',
        f'{self._data_path}/shared:/data/shared',
    ]
    super().__init__(image, 'Morph-KGC', self._logger, volumes=mounts)
Creates an instance of the MorphKGC class.
Parameters
- data_path (str): Path to the data directory of the case.
- config_path (str): Path to the config directory of the case.
- directory (str): Path to the directory to store logs.
- verbose (bool): Enable verbose logs.
Subdirectory in the root directory of the case for Morph-KGC.
Returns
- subdirectory (str): Subdirectory of the root directory for Morph-KGC.
def execute(self, arguments: list) -> bool:
    """Run Morph-KGC and report whether it finished successfully.

    Parameters
    ----------
    arguments : list
        Arguments to supply to Morph-KGC.

    Returns
    -------
    success : bool
        Result of the execution; False when the timeout was reached.
    """
    succeeded = False
    try:
        succeeded = self._execute_with_timeout(arguments)
    except TimeoutError:
        # A timeout is not fatal: log it and report the run as failed.
        self._logger.warning(f'Timeout ({TIMEOUT}s) reached for Morph-KGC')

    return succeeded
Execute Morph-KGC with given arguments.
Parameters
- arguments (list): Arguments to supply to Morph-KGC.
Returns
- success (bool): Whether the execution succeeded or not.
def execute_mapping(self,
                    mapping_file: str,
                    output_file: str,
                    serialization: str,
                    rdb_username: Optional[str] = None,
                    rdb_password: Optional[str] = None,
                    rdb_host: Optional[str] = None,
                    rdb_port: Optional[int] = None,
                    rdb_name: Optional[str] = None,
                    rdb_type: Optional[str] = None,
                    multiple_files: bool = False) -> bool:
    """Execute a mapping file with Morph-KGC.

    Morph-KGC can transform SQL relational databases (MySQL, PostgreSQL,
    Oracle, Microsoft SQL Server, MariaDB, SQLite), tabular (CSV, TSV,
    Excel, Parquet, Feather, ORC, Stata, SAS, SPSS, ODS) and
    hierarchical files (JSON, XML).

    Morph-KGC currently only supports N-Quads and N-Triples in RDF and
    RDF-Star. RDF and RDF-Star output is done automatically, you can use
    the same serialization format (`ntriples`, `nquads`) for both RDF
    and RDF-Star output.

    Morph-KGC can generate all triples in a single file or spread them
    across multiple files.

    Parameters
    ----------
    mapping_file : str
        Path to the mapping file to execute.
    output_file : str
        Name of the output file to store the triples in.
    serialization : str
        Serialization format to use.
    rdb_username : Optional[str]
        Username for the database, required when a database is used as
        source.
    rdb_password : Optional[str]
        Password for the database, required when a database is used as
        source.
    rdb_host : Optional[str]
        Hostname for the database, required when a database is used as
        source.
    rdb_port : Optional[int]
        Port for the database, required when a database is used as source.
    rdb_name : Optional[str]
        Database name for the database, required when a database is used as
        source.
    rdb_type : Optional[str]
        Database type, required when a database is used as source.
    multiple_files : bool
        If the generated triples must be stored in multiple files, default
        a single file.

    Returns
    -------
    success : bool
        Whether the execution was successful or not.

    Raises
    ------
    NotImplementedError
        If `serialization` is neither `nquads` nor `ntriples`.
    ValueError
        If all database parameters are supplied and `rdb_type` is neither
        `MySQL` nor `PostgreSQL`.
    """

    if serialization == 'nquads':
        serialization = 'N-QUADS'
    elif serialization == 'ntriples':
        serialization = 'N-TRIPLES'
    else:
        raise NotImplementedError('Unsupported serialization: '
                                  f'"{serialization}"')

    # Generate INI configuration file since no CLI is available.
    # Interpolation is disabled: the values are generated here and must be
    # written verbatim, the default BasicInterpolation rejects any value
    # with a bare '%' (e.g. in a password) with a ValueError.
    config = configparser.ConfigParser(interpolation=None)
    config['CONFIGURATION'] = {
        'output_format': serialization
    }
    config['DataSource0'] = {
        'mappings': f'/data/shared/{mapping_file}'
    }

    # Morph-KGC can keep the mapping partition results separate, provide
    # this option, default OFF
    if multiple_files:
        config['CONFIGURATION']['output_dir'] = '/data/shared/'
    else:
        config['CONFIGURATION']['output_file'] = \
            f'/data/shared/{output_file}'

    # The database URL is only configured when every database parameter is
    # supplied, otherwise the database parameters are ignored.
    if rdb_username is not None and rdb_password is not None \
            and rdb_host is not None and rdb_port is not None \
            and rdb_name is not None and rdb_type is not None:
        if rdb_type == 'MySQL':
            protocol = 'mysql+pymysql'
        elif rdb_type == 'PostgreSQL':
            protocol = 'postgresql+psycopg2'
        else:
            raise ValueError(f'Unknown RDB type: "{rdb_type}"')
        rdb_dsn = f'{protocol}://{rdb_username}:{rdb_password}' + \
            f'@{rdb_host}:{rdb_port}/{rdb_name}'
        config['DataSource0']['db_url'] = rdb_dsn

    os.umask(0)
    os.makedirs(os.path.join(self._data_path, 'morphkgc'), exist_ok=True)
    path = os.path.join(self._data_path, 'morphkgc', 'config_morphkgc.ini')
    with open(path, 'w') as f:
        config.write(f)

    return self.execute([])
Execute a mapping file with Morph-KGC.
Morph-KGC can transform SQL relational databases (MySQL, PostgreSQL, Oracle, Microsoft SQL Server, MariaDB, SQLite), tabular (CSV, TSV, Excel, Parquet, Feather, ORC, Stata, SAS, SPSS, ODS) and hierarchical files (JSON, XML).
Morph-KGC currently only supports N-Quads and N-Triples in RDF and RDF-Star. RDF and RDF-Star output is done automatically; you can use the same serialization format (`ntriples`, `nquads`) for both RDF and RDF-Star output.
Morph-KGC can generate all triples in a single file or spread them across multiple files.
Parameters
- mapping_file (str): Path to the mapping file to execute.
- output_file (str): Name of the output file to store the triples in.
- serialization (str): Serialization format to use.
- rdb_username (Optional[str]): Username for the database, required when a database is used as source.
- rdb_password (Optional[str]): Password for the database, required when a database is used as source.
- rdb_host (Optional[str]): Hostname for the database, required when a database is used as source.
- rdb_port (Optional[int]): Port for the database, required when a database is used as source.
- rdb_name (Optional[str]): Database name for the database, required when a database is used as source.
- rdb_type (Optional[str]): Database type, required when a database is used as source.
- multiple_files (bool): Whether the generated triples must be stored in multiple files; defaults to a single file.
Returns
- success (bool): Whether the execution was successful or not.