Skip to content

Commit 7fb97d2

Browse files
michal-mmm and ankatiyar
authored and committed
fix(datasets): add metadata parameter to datasets (kedro-org#708)
Signed-off-by: michal-mmm <madej.michal@outlook.com>
Co-authored-by: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com>
Signed-off-by: Merel Theisen <merel.theisen@quantumblack.com>
1 parent a9ebbc5 commit 7fb97d2

File tree

6 files changed

+19
-1
lines changed

6 files changed

+19
-1
lines changed

kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py

+4
Original file line number | Diff line number | Diff line change
@@ -230,6 +230,7 @@ def __init__( # noqa: PLR0913
230230
schema: dict[str, Any] | None = None,
231231
partition_columns: list[str] | None = None,
232232
owner_group: str | None = None,
233+
metadata: dict[str, Any] | None = None,
233234
) -> None:
234235
"""Creates a new instance of ``ManagedTableDataset``.
235236
@@ -259,6 +260,8 @@ def __init__( # noqa: PLR0913
259260
owner_group: if table access control is enabled in your workspace,
260261
specifying owner_group will transfer ownership of the table and database to
261262
this owner. All databases should have the same owner_group. Defaults to None.
263+
metadata: Any arbitrary metadata.
264+
This is ignored by Kedro, but may be consumed by users or external plugins.
262265
Raises:
263266
DatasetError: Invalid configuration supplied (through ManagedTable validation)
264267
"""
@@ -276,6 +279,7 @@ def __init__( # noqa: PLR0913
276279
)
277280

278281
self._version = version
282+
self.metadata = metadata
279283

280284
super().__init__(
281285
filepath=None, # type: ignore[arg-type]

kedro-datasets/kedro_datasets/huggingface/hugging_face_dataset.py

+2
Original file line number | Diff line number | Diff line change
@@ -41,9 +41,11 @@ def __init__(
4141
*,
4242
dataset_name: str,
4343
dataset_kwargs: dict[str, Any] | None = None,
44+
metadata: dict[str, Any] | None = None,
4445
):
4546
self.dataset_name = dataset_name
4647
self._dataset_kwargs = dataset_kwargs or {}
48+
self.metadata = metadata
4749

4850
def _load(self):
4951
return load_dataset(self.dataset_name, **self._dataset_kwargs)

kedro-datasets/kedro_datasets/huggingface/transformer_pipeline_dataset.py

+2
Original file line number | Diff line number | Diff line change
@@ -43,12 +43,14 @@ def __init__(
4343
task: str | None = None,
4444
model_name: str | None = None,
4545
pipeline_kwargs: dict[str, t.Any] | None = None,
46+
metadata: dict[str, t.Any] | None = None,
4647
):
4748
if task is None and model_name is None:
4849
raise ValueError("At least 'task' or 'model_name' are needed")
4950
self._task = task if task else None
5051
self._model_name = model_name
5152
self._pipeline_kwargs = pipeline_kwargs or {}
53+
self.metadata = metadata
5254

5355
if self._pipeline_kwargs and (
5456
"task" in self._pipeline_kwargs or "model" in self._pipeline_kwargs

kedro-datasets/kedro_datasets/ibis/table_dataset.py

+4
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,7 @@ def __init__( # noqa: PLR0913
8080
connection: dict[str, Any] | None = None,
8181
load_args: dict[str, Any] | None = None,
8282
save_args: dict[str, Any] | None = None,
83+
metadata: dict[str, Any] | None = None,
8384
) -> None:
8485
"""Creates a new ``TableDataset`` pointing to a table (or file).
8586
@@ -117,6 +118,8 @@ def __init__( # noqa: PLR0913
117118
objects are materialized as views. To save a table using
118119
a different materialization strategy, supply a value for
119120
`materialized` in `save_args`.
121+
metadata: Any arbitrary metadata. This is ignored by Kedro,
122+
but may be consumed by users or external plugins.
120123
"""
121124
if filepath is None and table_name is None:
122125
raise DatasetError(
@@ -127,6 +130,7 @@ def __init__( # noqa: PLR0913
127130
self._file_format = file_format
128131
self._table_name = table_name
129132
self._connection_config = connection
133+
self.metadata = metadata
130134

131135
# Set load and save arguments, overwriting defaults if provided.
132136
self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)

kedro-datasets/kedro_datasets/polars/eager_polars_dataset.py

+2
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@ def __init__( # noqa: PLR0913
6868
version: Version | None = None,
6969
credentials: dict[str, Any] | None = None,
7070
fs_args: dict[str, Any] | None = None,
71+
metadata: dict[str, Any] | None = None,
7172
):
7273
"""Creates a new instance of ``EagerPolarsDataset`` pointing to a concrete data file
7374
on a specific filesystem. The appropriate polars load/save methods are dynamically
@@ -124,6 +125,7 @@ def __init__( # noqa: PLR0913
124125

125126
self._protocol = protocol
126127
self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)
128+
self.metadata = metadata
127129

128130
super().__init__(
129131
filepath=PurePosixPath(path),

kedro-datasets/kedro_datasets/spark/spark_streaming_dataset.py

+5 -1
Original file line number | Diff line number | Diff line change
@@ -42,13 +42,14 @@ class SparkStreamingDataset(AbstractDataset):
4242
DEFAULT_LOAD_ARGS = {} # type: dict[str, Any]
4343
DEFAULT_SAVE_ARGS = {} # type: dict[str, Any]
4444

45-
def __init__(
45+
def __init__( # noqa: PLR0913
4646
self,
4747
*,
4848
filepath: str = "",
4949
file_format: str = "",
5050
save_args: dict[str, Any] | None = None,
5151
load_args: dict[str, Any] | None = None,
52+
metadata: dict[str, Any] | None = None,
5253
) -> None:
5354
"""Creates a new instance of SparkStreamingDataset.
5455
@@ -73,10 +74,13 @@ def __init__(
7374
respectively. You can find a list of options for each selected format in
7475
Spark DataFrame write documentation, see
7576
https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html
77+
metadata: Any arbitrary metadata.
78+
This is ignored by Kedro, but may be consumed by users or external plugins.
7679
"""
7780
self._file_format = file_format
7881
self._save_args = save_args
7982
self._load_args = load_args
83+
self.metadata = metadata
8084

8185
fs_prefix, filepath = _split_filepath(filepath)
8286

0 commit comments

Comments
 (0)