From 0e345286158ba568ee242e4842218fcb9a029697 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 3 Dec 2024 16:11:45 -0700 Subject: [PATCH 1/2] Cache the result of `DaskManager.normalize_chunks` This is only used with the backends codepath, where the inputs are guaranteed to be tuples. By contrast, `dask.array.normalize_chunks` accepts dicts as inputs and so is harder to cache transparently. --- xarray/namedarray/daskmanager.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py index 6485ba375f5..26bcc3e43e3 100644 --- a/xarray/namedarray/daskmanager.py +++ b/xarray/namedarray/daskmanager.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import Callable, Iterable, Sequence +from functools import lru_cache from typing import TYPE_CHECKING, Any import numpy as np @@ -19,12 +20,20 @@ try: from dask.array import Array as DaskArray + except ImportError: DaskArray = np.ndarray[Any, Any] dask_available = module_available("dask") +if dask_available: + from dask.array.core import normalize_chunks + + normalize_chunks = lru_cache(normalize_chunks) +else: + normalize_chunks = None + class DaskManager(ChunkManagerEntrypoint["DaskArray"]): array_cls: type[DaskArray] @@ -52,8 +61,6 @@ def normalize_chunks( previous_chunks: _NormalizedChunks | None = None, ) -> Any: """Called by open_dataset""" - from dask.array.core import normalize_chunks - return normalize_chunks( chunks, shape=shape, From de010ea2e60dc6e06b7edc1c7f2d3d93035c6c7f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 3 Dec 2024 16:13:08 -0700 Subject: [PATCH 2/2] Try using uuids xref #8902 xref #1525 --- xarray/core/dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d4a23ac275a..ee706f6e23b 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -6,6 +6,7 @@ import itertools import math 
import sys +import uuid import warnings from collections import defaultdict from collections.abc import ( @@ -257,7 +258,6 @@ def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint): chunk_shape = chunkmanager.normalize_chunks( chunk_shape, shape=shape, dtype=var.dtype, previous_chunks=preferred_chunk_shape ) - # Warn where requested chunks break preferred chunks, provided that the variable # contains data. if var.size: @@ -344,7 +344,8 @@ def _maybe_chunk( # by providing chunks as an input to tokenize. # subtle bugs result otherwise. see GH3350 # we use str() for speed, and use the name for the final array name on the next line - token2 = tokenize(token if token else var._data, str(chunks)) + mixin = uuid.uuid4() + token2 = tokenize(token if token else var._data, mixin) name2 = f"{name_prefix}{name}-{token2}" from_array_kwargs = utils.consolidate_dask_from_array_kwargs(