diff --git a/python/arcticdb/util/_versions.py b/python/arcticdb/util/_versions.py new file mode 100644 index 00000000000..fcfcd09fbfa --- /dev/null +++ b/python/arcticdb/util/_versions.py @@ -0,0 +1,14 @@ +""" +Copyright 2023 Man Group Operations Limited + +Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. + +As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. +""" +import pandas as pd +from packaging import version + +PANDAS_VERSION = version.parse(pd.__version__) +CHECK_FREQ_VERSION = version.Version("1.1") +IS_PANDAS_ZERO = PANDAS_VERSION < version.Version("1.0") +IS_PANDAS_TWO = PANDAS_VERSION >= version.Version("2.0") diff --git a/python/arcticdb/util/test.py b/python/arcticdb/util/test.py index c6c816641e1..f49adb59a8d 100644 --- a/python/arcticdb/util/test.py +++ b/python/arcticdb/util/test.py @@ -18,10 +18,10 @@ from six import PY3 from copy import deepcopy from functools import wraps -from packaging import version from arcticdb.config import Defaults from arcticdb.log import configure, logger_by_name +from arcticdb.util._versions import PANDAS_VERSION, CHECK_FREQ_VERSION from arcticdb.version_store import NativeVersionStore from arcticdb.version_store._custom_normalizers import CustomNormalizer from arcticc.pb2.descriptors_pb2 import NormalizationMetadata @@ -32,11 +32,6 @@ from arcticdb_ext import set_config_int, get_config_int, unset_config_int -PANDAS_VERSION = version.parse(pd.__version__) -CHECK_FREQ_VERSION = version.Version("1.1") -IS_PANDAS_ZERO = PANDAS_VERSION < version.Version("1.0") - - def maybe_not_check_freq(f): """Ignore frequency when pandas is newer as starts to check frequency which it did not previously do.""" diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py index 986e9e44b6b..4ac981a5500 100644 --- 
a/python/arcticdb/version_store/_normalization.py +++ b/python/arcticdb/version_store/_normalization.py @@ -25,6 +25,7 @@ from collections import Counter from arcticdb.exceptions import ArcticNativeException, ArcticNativeNotYetImplemented from arcticdb.supported_types import DateRangeInput, time_types as supported_time_types +from arcticdb.util._versions import IS_PANDAS_TWO from arcticdb.version_store.read_result import ReadResult from arcticdb_ext.version_store import SortedValue as _SortedValue from pandas.core.internals import make_block @@ -179,6 +180,14 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co return arr.codes obj_tokens = (object, "object", "O") + if np.issubdtype(arr.dtype, np.datetime64): + # ArcticDB only operates at nanosecond resolution (i.e. `datetime64[ns]`) type because so did Pandas < 2. + # In Pandas >= 2.0, other resolutions are supported (namely `ms`, `s`, and `us`). + # See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution # noqa: E501 + # We want to maintain consistent behaviour, so we convert any other resolution + # to `datetime64[ns]`. + arr = arr.astype(DTN64_DTYPE, copy=False) + if arr.dtype.hasobject is False and not ( dynamic_strings and arr.dtype == "float" and coerce_column_type in obj_tokens ): @@ -188,12 +197,19 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co if len(arr) == 0: if coerce_column_type is None: - raise ArcticNativeNotYetImplemented( - "coercing column type is required when empty column of object type, Column type={} for column={}" - .format(arr.dtype, arr_name) - ) - else: - return arr.astype(coerce_column_type) + if IS_PANDAS_TWO: + # Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0, empty series' dtype became object. 
+ # See: https://github.com/pandas-dev/pandas/issues/17261 + # We want to maintain consistent behaviour, so we treat empty series as containing floats. + # val_type = ValueType::FLOAT; + coerce_column_type = float + return arr.astype(coerce_column_type) + else: + raise ArcticNativeNotYetImplemented( + "coercing column type is required when empty column of object type, Column type={} for column={}" + .format(arr.dtype, arr_name) + ) + return arr.astype(coerce_column_type) # Coerce column allows us to force a column to the given type, which means we can skip expensive iterations in # Python with the caveat that if the user gave an invalid type it's going to blow up in the core. @@ -277,6 +293,7 @@ def _from_tz_timestamp(ts, tz): def _normalize_single_index(index, index_names, index_norm, dynamic_strings=None, string_max_len=None): # index: pd.Index or np.ndarray -> np.ndarray index_tz = None + if isinstance(index, RangeIndex): # skip index since we can reconstruct it, so no need to actually store it if index.name: @@ -507,6 +524,20 @@ def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len): df.reset_index(fields, inplace=True) index = df.index else: + n_rows = len(index) + n_categorical_columns = len(df.select_dtypes(include="category").columns) + if IS_PANDAS_TWO and isinstance(index, RangeIndex) and n_rows == 0 and n_categorical_columns == 0: + # In Pandas 1.0, an Index is used by default for any empty dataframe or series, except if + # there are categorical columns in which case a RangeIndex is used. + # + # In Pandas 2.0, RangeIndex is used by default for _any_ empty dataframe or series. + # See: https://github.com/pandas-dev/pandas/issues/49572 + # Yet internally, ArcticDB uses a DatetimeIndex for empty dataframes and series without categorical + # columns. + # + # The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with Pandas 1.0. 
+ index = DatetimeIndex([]) + index_norm = pd_norm.index index_norm.is_not_range_index = not isinstance(index, RangeIndex) @@ -563,6 +594,13 @@ def denormalize(self, item, norm_meta): else: s.name = None + if s.empty and IS_PANDAS_TWO: + # Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0, empty series' dtype became object. + # See: https://github.com/pandas-dev/pandas/issues/17261 + # We want to maintain consistent behaviour, so we return empty series as containing objects + # when the Pandas version is >= 2.0 + s = s.astype("object") + return s @@ -670,7 +708,23 @@ def denormalize(self, item, norm_meta): columns, denormed_columns, data = _denormalize_columns(item, norm_meta, idx_type, n_indexes) if not self._skip_df_consolidation: + columns_dtype = {} if data is None else {name: np_array.dtype for name, np_array in data.items()} df = DataFrame(data, index=index, columns=columns) + + # Setting the columns' dtype manually, since pandas might just convert the dtype of some + # (empty) columns to another one and since the `dtype` keyword for `pd.DataFrame` constructor + # does not accept a mapping such as `columns_dtype`. 
+ # For instance the following code has been tried but returns a pandas.DataFrame full of NaNs: + # + # columns_mapping = {} if data is None else { + # name: pd.Series(np_array, index=index, dtype=np_array.dtype) + # for name, np_array in data.items() + # } + # df = DataFrame(index=index, columns=columns_mapping, copy=False) + # + for column_name, dtype in columns_dtype.items(): + df[column_name] = df[column_name].astype(dtype, copy=False) + else: if index is not None: df = self.df_without_consolidation(columns, index, item, n_indexes, data) diff --git a/python/arcticdb/version_store/library.py b/python/arcticdb/version_store/library.py index c9e8471dd04..c26ace94981 100644 --- a/python/arcticdb/version_store/library.py +++ b/python/arcticdb/version_store/library.py @@ -7,11 +7,13 @@ """ import datetime +import pytz from enum import Enum, auto from typing import Optional, Any, Tuple, Dict, AnyStr, Union, List, Iterable, NamedTuple from numpy import datetime64 from arcticdb.supported_types import Timestamp +from arcticdb.util._versions import IS_PANDAS_TWO from arcticdb.version_store.processing import QueryBuilder from arcticdb.version_store._store import NativeVersionStore, VersionedItem, VersionQueryInput @@ -1503,6 +1505,11 @@ def get_description(self, symbol: str, as_of: Optional[AsOf] = None) -> SymbolDe """ info = self._nvs.get_info(symbol, as_of) last_update_time = pd.to_datetime(info["last_update"], utc=True) + if IS_PANDAS_TWO: + # Pandas 2.0.0 now uses `datetime.timezone.utc` instead of `pytz.UTC`. + # See: https://github.com/pandas-dev/pandas/issues/34916 + # We enforce the use of `pytz.UTC` for consistency. 
+ last_update_time = last_update_time.replace(tzinfo=pytz.UTC) columns = tuple(NameWithDType(n, t) for n, t in zip(info["col_names"]["columns"], info["dtype"])) index = NameWithDType(info["col_names"]["index"], info["col_names"]["index_dtype"]) date_range = tuple( diff --git a/python/tests/integration/arcticdb/test_arctic_batch.py b/python/tests/integration/arcticdb/test_arctic_batch.py index 8b3d34c6c43..e0e110fa14d 100644 --- a/python/tests/integration/arcticdb/test_arctic_batch.py +++ b/python/tests/integration/arcticdb/test_arctic_batch.py @@ -30,8 +30,9 @@ random_strings_of_length, random_floats, ) -import random +from arcticdb.util._versions import IS_PANDAS_TWO +import random if AZURE_SUPPORT: from azure.storage.blob import BlobServiceClient @@ -1094,7 +1095,18 @@ def test_read_description_batch_high_amount(arctic_library): assert results_list[idx].date_range == date_range_comp_with_utc if version > 0: assert results_list[idx].last_update_time > results_list[idx - 1].last_update_time - assert results_list[idx].last_update_time.tz == pytz.UTC + + result_last_update_time = results_list[idx].last_update_time + tz = result_last_update_time.tz + + if IS_PANDAS_TWO: + # Pandas 2.0.0 now uses `datetime.timezone.utc` instead of `pytz.UTC`. + # See: https://github.com/pandas-dev/pandas/issues/34916 + # TODO: is there a better way to handle this edge case? 
+ assert tz == timezone.utc + else: + assert isinstance(tz, pytz.BaseTzInfo) + assert tz == pytz.UTC def test_read_description_batch_empty_nat(arctic_library): diff --git a/python/tests/integration/arcticdb/version_store/test_basic_version_store.py b/python/tests/integration/arcticdb/version_store/test_basic_version_store.py index c5a83e394da..eea6f25a1c9 100644 --- a/python/tests/integration/arcticdb/version_store/test_basic_version_store.py +++ b/python/tests/integration/arcticdb/version_store/test_basic_version_store.py @@ -2046,8 +2046,12 @@ def test_dynamic_schema_column_hash_update(lmdb_version_store_column_buckets): lib.update("symbol", df2) vit = lib.read("symbol") + # In Pandas < 2.0, updating a `DataFrame` uniquely storing integers with + # another `DataFrame` that is uniquely storing integers changes all the dtypes + # to float64. df.update(df2) - assert_frame_equal(vit.data.astype("float"), df) + df = df.astype("int64", copy=False) + assert_frame_equal(vit.data, df) def test_dynamic_schema_column_hash_append(lmdb_version_store_column_buckets): diff --git a/python/tests/integration/arcticdb/version_store/test_categorical.py b/python/tests/integration/arcticdb/version_store/test_categorical.py index f7a4b4f5e88..972d3d494a1 100644 --- a/python/tests/integration/arcticdb/version_store/test_categorical.py +++ b/python/tests/integration/arcticdb/version_store/test_categorical.py @@ -6,11 +6,14 @@ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. 
""" import datetime +import sys + import numpy as np import pandas as pd import pytest from arcticdb.exceptions import ArcticNativeNotYetImplemented +from arcticdb.util._versions import IS_PANDAS_TWO from arcticdb.util.test import assert_frame_equal @@ -81,19 +84,40 @@ def test_categorical_empty(lmdb_version_store, sym): lib = lmdb_version_store lib.write(sym, df) read_df = lib.read(sym).data + # In Pandas 1.0, an Index is used by default for any empty dataframe or series, + # except if there are categorical columns in which case a RangeIndex is used. + # + # In Pandas 2.0, RangeIndex is used by default for _any_ empty dataframe or series. + # See: https://github.com/pandas-dev/pandas/issues/49572 + assert isinstance(df.index, pd.RangeIndex) + assert isinstance(read_df.index, pd.RangeIndex) assert_frame_equal(df, read_df) def test_categorical_with_integers(lmdb_version_store, sym): c = pd.Categorical(np.arange(6)) - df = pd.DataFrame({"int": np.arange(6), "cat": c}) + df = pd.DataFrame({"int": np.arange(6), "cat_int": c}) lib = lmdb_version_store lib.write(sym, df) read_df = lib.read(sym).data # Not pickled assert lib.get_info(sym)["type"] == "pandasdf" # should be category - assert read_df.cat.dtype == "category" + assert read_df.cat_int.dtype == "category" + if IS_PANDAS_TWO and sys.platform.startswith("win32"): + # Pandas 2.0.0 changed the underlying creation from numpy integral arrays: + # "Instantiating using a numpy numeric array now follows the dtype of the numpy array. + # Previously, all indexes created from numpy numeric arrays were forced to 64-bit. + # Now, for example, Index(np.array([1, 2, 3])) will be int32 on 32-bit systems, + # where it previously would have been int64 even on 32-bit systems. + # Instantiating Index using a list of numbers will still return 64bit dtypes, + # e.g. Index([1, 2, 3]) will have a int64 dtype, which is the same as previously." 
+ # See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#index-can-now-hold-numpy-numeric-dtypes + # We have no control over the underlying integral array storing code for categorical columns + # so we replace the categorical column with its codes to perform the comparison with identical dtypes. + df.cat_int = df.cat_int.cat.codes.astype(np.int32) + read_df.cat_int = read_df.cat_int.cat.codes.astype(np.int32) + assert_frame_equal(df, read_df) @@ -109,6 +133,20 @@ def test_categorical_with_integers_and_strings(lmdb_version_store, sym): # should be category assert read_df.cat_int.dtype == "category" assert read_df.cat_str.dtype == "category" + if IS_PANDAS_TWO and sys.platform.startswith("win32"): + # Pandas 2.0.0 changed the underlying creation from numpy integral arrays: + # "Instantiating using a numpy numeric array now follows the dtype of the numpy array. + # Previously, all indexes created from numpy numeric arrays were forced to 64-bit. + # Now, for example, Index(np.array([1, 2, 3])) will be int32 on 32-bit systems, + # where it previously would have been int64 even on 32-bit systems. + # Instantiating Index using a list of numbers will still return 64bit dtypes, + # e.g. Index([1, 2, 3]) will have a int64 dtype, which is the same as previously." + # See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#index-can-now-hold-numpy-numeric-dtypes + # We have no control over the underlying integral array storing code for categorical columns + # so we replace the categorical column with its codes to perform the comparison with identical dtypes. 
+ df.cat_int = df.cat_int.cat.codes.astype(np.int32) + read_df.cat_int = read_df.cat_int.cat.codes.astype(np.int32) + assert_frame_equal(df, read_df) diff --git a/python/tests/stress/arcticdb/version_store/test_stress_multicolumn.py b/python/tests/stress/arcticdb/version_store/test_stress_multicolumn.py index 4465014c847..f08c9c5d3de 100644 --- a/python/tests/stress/arcticdb/version_store/test_stress_multicolumn.py +++ b/python/tests/stress/arcticdb/version_store/test_stress_multicolumn.py @@ -13,6 +13,7 @@ import sys from arcticdb.util.test import assert_frame_equal +from arcticdb.util._versions import IS_PANDAS_TWO from arcticdb_ext.tools import AZURE_SUPPORT @@ -81,4 +82,9 @@ def test_stress_multicolumn(lib_type, request): output_df = lib.read(name).data print("reading from arctic native: {}".format(pd.Timestamp("now") - now)) + if IS_PANDAS_TWO and test_data.empty: + # In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created. + # The index has to be converted to a DatetimeIndex by ArcticDB to perform updates. 
+ test_data.index = test_data.index.astype("datetime64[ns]") + assert_frame_equal(test_data, output_df) diff --git a/python/tests/unit/arcticdb/version_store/test_empty_writes.py b/python/tests/unit/arcticdb/version_store/test_empty_writes.py index 070d8de8a92..b061f4d060b 100644 --- a/python/tests/unit/arcticdb/version_store/test_empty_writes.py +++ b/python/tests/unit/arcticdb/version_store/test_empty_writes.py @@ -10,6 +10,7 @@ from arcticdb.version_store._common import TimeFrame from arcticdb.util.test import assert_frame_equal, assert_series_equal +from arcticdb.util._versions import IS_PANDAS_TWO def test_write_no_rows(lmdb_version_store, sym): @@ -134,6 +135,12 @@ def test_empty_series(lmdb_version_store_dynamic_schema, sym): ser = pd.Series([]) lmdb_version_store_dynamic_schema.write(sym, ser) assert not lmdb_version_store_dynamic_schema.is_symbol_pickled(sym) + if IS_PANDAS_TWO: + # In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created. + # The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with + # Pandas 1.0. + ser.index = ser.index.astype("datetime64[ns]") + assert_series_equal(lmdb_version_store_dynamic_schema.read(sym).data, ser) @@ -141,5 +148,32 @@ def test_fallback_to_pickle(lmdb_version_store, sym): column_names = ["a", "b", "c"] df = pd.DataFrame(columns=column_names) lmdb_version_store.write(sym, df) - assert lmdb_version_store.is_symbol_pickled(sym) - assert_frame_equal(df, lmdb_version_store.read(sym).data) + + # In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created. + # The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with Pandas 1.0. + assert isinstance(df.index, pd.RangeIndex if IS_PANDAS_TWO else pd.Index) + + if IS_PANDAS_TWO: + # In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created. + # The index has to be converted to a DatetimeIndex by ArcticDB to perform updates. 
+ df.index = df.index.astype("datetime64[ns]") + + # Before Pandas 2.0, empty Series' dtype was "float64" and empty DataFrames' Columns' dtype was "object". + # As of Pandas 2.0, empty Series' dtype is "object" and empty DataFrames' Columns' dtype remains "object". + # See: https://github.com/pandas-dev/pandas/issues/17261 + # When normalizing in Pandas 2.0, we convert empty Series' dtype to "float64" to be consistent + # with the behavior of ArcticDB with Pandas 1.0. + # The same logic is used to normalize empty DataFrames' columns. + + # Hence: + if IS_PANDAS_TWO: + # In Pandas 2.0, empty Dataframes can now be stored without being pickled. + assert not lmdb_version_store.is_symbol_pickled(sym) + # and ArcticDB returns empty DataFrames with float64 for all columns if they have not been specified. + df = df.astype("float64", copy=False) + assert_frame_equal(df, lmdb_version_store.read(sym).data) + else: + # In Pandas 1.0, empty Dataframes are pickled. + assert lmdb_version_store.is_symbol_pickled(sym) + # and ArcticDB simply deserializes empty DataFrames. 
+ assert_frame_equal(df, lmdb_version_store.read(sym).data) diff --git a/python/tests/unit/arcticdb/version_store/test_filtering.py b/python/tests/unit/arcticdb/version_store/test_filtering.py index 9975ffe1291..4a61de8254e 100644 --- a/python/tests/unit/arcticdb/version_store/test_filtering.py +++ b/python/tests/unit/arcticdb/version_store/test_filtering.py @@ -27,7 +27,8 @@ from arcticdb_ext.storage import NoDataFoundException from arcticdb.version_store.processing import QueryBuilder from arcticdb_ext.exceptions import InternalException, UserInputException -from arcticdb.util.test import assert_frame_equal, PANDAS_VERSION +from arcticdb.util.test import assert_frame_equal +from arcticdb.util._versions import PANDAS_VERSION from arcticdb.util.hypothesis import ( use_of_function_scoped_fixtures_in_hypothesis_checked, integral_type_strategies, diff --git a/python/tests/unit/arcticdb/version_store/test_filtering_dynamic.py b/python/tests/unit/arcticdb/version_store/test_filtering_dynamic.py index 6df54ba4467..fe9d5868b69 100644 --- a/python/tests/unit/arcticdb/version_store/test_filtering_dynamic.py +++ b/python/tests/unit/arcticdb/version_store/test_filtering_dynamic.py @@ -5,12 +5,16 @@ As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. 
""" +import sys + from hypothesis import assume, given, settings from hypothesis.extra.pandas import column, data_frames, range_indexes import hypothesis.strategies as st import numpy as np import pandas as pd +from arcticdb.util._versions import IS_PANDAS_TWO + try: from pandas.errors import UndefinedVariableError except ImportError: @@ -312,7 +316,20 @@ def test_filter_column_not_present_dynamic(lmdb_version_store_dynamic_schema): symbol = "test_filter_column_not_present_static" lmdb_version_store_dynamic_schema.write(symbol, df) vit = lmdb_version_store_dynamic_schema.read(symbol, query_builder=q) - expected = pd.DataFrame({"a": pd.Series(dtype="int64")}, index=pd.Index([], dtype="int64")) + + if IS_PANDAS_TWO and sys.platform.startswith("win32"): + # Pandas 2.0.0 changed the behavior of Index creation from numpy arrays: + # "Previously, all indexes created from numpy numeric arrays were forced to 64-bit. + # Now, for example, Index(np.array([1, 2, 3])) will be int32 on 32-bit systems, + # where it previously would have been int64 even on 32-bit systems. + # Instantiating Index using a list of numbers will still return 64bit dtypes, + # e.g. Index([1, 2, 3]) will have a int64 dtype, which is the same as previously." 
+ # See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#index-can-now-hold-numpy-numeric-dtypes + index_dtype = "int32" + else: + index_dtype = "int64" + + expected = pd.DataFrame({"a": pd.Series(dtype="int64")}, index=pd.Index([], dtype=index_dtype)) assert_frame_equal(vit.data, expected) diff --git a/python/tests/unit/arcticdb/version_store/test_normalization.py b/python/tests/unit/arcticdb/version_store/test_normalization.py index 16c974814f7..ddf8ede630b 100644 --- a/python/tests/unit/arcticdb/version_store/test_normalization.py +++ b/python/tests/unit/arcticdb/version_store/test_normalization.py @@ -42,11 +42,10 @@ TestCustomNormalizer, assert_frame_equal, assert_series_equal, - IS_PANDAS_ZERO, ) +from arcticdb.util._versions import IS_PANDAS_ZERO, IS_PANDAS_TWO from arcticdb.exceptions import ArcticNativeException - params = { "simple_dict": {"a": "1", "b": 2, "c": 3.0, "d": True}, "pd_ts": {"a": pd.Timestamp("2018-01-12 09:15"), "b": pd.Timestamp("2017-01-31", tz="America/New_York")}, @@ -171,7 +170,19 @@ def test_empty_df_with_multiindex_with_tz(tz): assert sliced_denorm_df.index.names == orig_df.index.names for index_level_num in [0, 1]: - assert sliced_denorm_df.index.levels[index_level_num].tz.zone == orig_df.index.levels[index_level_num].tz.zone + sliced_denorm_df_index_level_num = sliced_denorm_df.index.levels[index_level_num] + orig_df_index_level_num = orig_df.index.levels[index_level_num] + if IS_PANDAS_TWO and tz is pytz.UTC: + # Pandas 2.0.0 now uses `datetime.timezone.utc` instead of `pytz.UTC`. + # See: https://github.com/pandas-dev/pandas/issues/34916 + # TODO: is there a better way to handle this edge case? 
+ assert sliced_denorm_df_index_level_num.tz == datetime.timezone.utc + assert isinstance(orig_df_index_level_num.tz, pytz.BaseTzInfo) + assert sliced_denorm_df_index_level_num.dtype == orig_df_index_level_num.dtype == "datetime64[ns, UTC]" + else: + assert isinstance(sliced_denorm_df_index_level_num.tz, pytz.BaseTzInfo) + assert isinstance(orig_df_index_level_num.tz, pytz.BaseTzInfo) + assert sliced_denorm_df_index_level_num.tz.zone == orig_df_index_level_num.tz.zone def test_timestamp_without_tz(): diff --git a/python/tests/unit/arcticdb/version_store/test_parallel.py b/python/tests/unit/arcticdb/version_store/test_parallel.py index 54ca1e3025f..3b3ea29bbd8 100644 --- a/python/tests/unit/arcticdb/version_store/test_parallel.py +++ b/python/tests/unit/arcticdb/version_store/test_parallel.py @@ -19,6 +19,8 @@ random_dates, ) +from arcticdb.util._versions import IS_PANDAS_TWO + def test_parallel_write(lmdb_version_store): sym = "parallel" @@ -182,7 +184,6 @@ def test_datetimes_to_nats(lmdb_version_store): index = pd.Index([dt + datetime.timedelta(seconds=s) for s in range(num_rows_per_day)]) vals = {c: random_dates(num_rows_per_day) for c in cols} new_df = pd.DataFrame(data=vals, index=index) - dataframes.append(new_df) df = pd.concat((df, new_df)) dt = dt + datetime.timedelta(days=1) @@ -196,4 +197,12 @@ def test_datetimes_to_nats(lmdb_version_store): df.sort_index(axis=1, inplace=True) result = vit.data result.sort_index(axis=1, inplace=True) - assert_frame_equal(vit.data, df) + + if IS_PANDAS_TWO: + # In Pandas < 2.0, `datetime64[ns]` was _always_ used. `datetime64[ns]` is also used by ArcticDB. + # In Pandas >= 2.0, the `datetime64` can be used with other resolutions (namely 's', 'ms', and 'us'). + # See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution # noqa + # Hence, we convert to the largest resolution (which is guaranteed to be the ones of the original `df`). 
+ result = result.astype(df.dtypes) + + assert_frame_equal(result, df) diff --git a/python/tests/util/mark.py b/python/tests/util/mark.py index cd5c098ec50..5b21e21c3d5 100644 --- a/python/tests/util/mark.py +++ b/python/tests/util/mark.py @@ -13,7 +13,7 @@ from numpy import datetime64 from packaging import version -from arcticdb.util.test import PANDAS_VERSION +from arcticdb.util._versions import PANDAS_VERSION def _no_op_decorator(fun):