From 3c1c9c782b314d24229fbe444ed3492e83b9ff06 Mon Sep 17 00:00:00 2001 From: Bradley Lowekamp Date: Wed, 15 Nov 2023 15:59:11 -0500 Subject: [PATCH 1/3] Add option to rechunk to load array into memory This reduces the computation time at the cost of more memory. --- pytools/HedwigZarrImage.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pytools/HedwigZarrImage.py b/pytools/HedwigZarrImage.py index 44ac0ab..2b2596a 100644 --- a/pytools/HedwigZarrImage.py +++ b/pytools/HedwigZarrImage.py @@ -67,7 +67,7 @@ def shape(self) -> Tuple[int]: """ return self._ome_ngff_multiscale_get_array(0).shape - def rechunk(self, chunk_size: int, compressor=None) -> None: + def rechunk(self, chunk_size: int, compressor=None, *, in_memory=False) -> None: """ Change the chunk size of each ZARR array inplace in the pyramid. @@ -78,6 +78,9 @@ def rechunk(self, chunk_size: int, compressor=None) -> None: :param chunk_size: The size as an integer to resize the chunk sizes. :param compressor: The output arrays will be written with the provided compressor, if None then the compressor of the input arrays will be used. + :param in_memory: If true the entire arrays will be loaded into memory uncompressed, before writing to the + rechunked size, otherwise the arrays will be written directly to the rechunked size. The former is faster but + requires enough memory to hold the arrays. 
""" logger.info(f'Processing group: "{self.zarr_group.name}"...') @@ -100,11 +103,16 @@ def rechunk(self, chunk_size: int, compressor=None) -> None: logger.info("Chunks already requested size") continue - if compressor is None: - compressor = arr.compressor + temp_arr = arr + if in_memory: + # optionally load the entire array uncompressed into memory + memory_group = zarr.group(store=zarr.MemoryStore(), overwrite=True) + zarr.copy(temp_arr, memory_group, name="temp", compressor=None) + temp_arr = memory_group["temp"] + # copy array to a temp zarr array on file zarr.copy( - arr, + temp_arr, self.zarr_group, name=arr_name + ".temp", chunks=chunks, From 70f7f4bcfdf30b3298ea588635e55677722b7456 Mon Sep 17 00:00:00 2001 From: Bradley Lowekamp Date: Wed, 15 Nov 2023 16:27:10 -0500 Subject: [PATCH 2/3] Add in-memory flag to zarr_rechunk command line --- pytools/zarr_rechunk.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pytools/zarr_rechunk.py b/pytools/zarr_rechunk.py index be97cbd..0a6809f 100644 --- a/pytools/zarr_rechunk.py +++ b/pytools/zarr_rechunk.py @@ -25,15 +25,18 @@ default=False, help="Use the preferred compressor when recompressing.", ) +@click.option( + "--in-memory", is_flag=True, show_default=True, default=False, help="Use in-memory zarr store when recompressing." 
+) @click.version_option(__version__) -def main(input_zarr, log_level, chunk_size, recompress): +def main(input_zarr, log_level, chunk_size, recompress, in_memory): logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.getLevelName(log_level)) compressor = Blosc(cname="zstd", clevel=5, shuffle=Blosc.SHUFFLE) z = HedwigZarrImages(input_zarr, read_only=False) for k in z.get_series_keys(): - z[k].rechunk(chunk_size, compressor=compressor if recompress else None) + z[k].rechunk(chunk_size, compressor=compressor if recompress else None, in_memory=in_memory) if __name__ == "__main__": From 5e59e5811479f5c93ed6c5abea1be1343aa31776 Mon Sep 17 00:00:00 2001 From: Bradley Lowekamp Date: Thu, 16 Nov 2023 07:43:34 -0500 Subject: [PATCH 3/3] Adding logging info when rechunking into memory. --- pytools/HedwigZarrImage.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytools/HedwigZarrImage.py b/pytools/HedwigZarrImage.py index 2b2596a..6d1843a 100644 --- a/pytools/HedwigZarrImage.py +++ b/pytools/HedwigZarrImage.py @@ -105,11 +105,14 @@ def rechunk(self, chunk_size: int, compressor=None, *, in_memory=False) -> None: temp_arr = arr if in_memory: + logger.info(f'Loading array: "{arr.name}" into memory...') # optionally load the entire array uncompressed into memory memory_group = zarr.group(store=zarr.MemoryStore(), overwrite=True) zarr.copy(temp_arr, memory_group, name="temp", compressor=None) temp_arr = memory_group["temp"] + logger.info(f'Rechunking array: "{arr.name} to disk"...') + # copy array to a temp zarr array on file zarr.copy( temp_arr,