Skip to content

Commit 69ba1bd

Browse files
itholic authored and dongjoon-hyun committed
[SPARK-45164][PS] Remove deprecated Index APIs
### What changes were proposed in this pull request?

This PR proposes to remove deprecated `Index` APIs from Pandas API on Spark.

### Why are the changes needed?

To follow the behavior of the latest Pandas.

See pandas-dev/pandas#37877 for `Index.asi8`.
See pandas-dev/pandas#42113 for `Index.is_type_compatible`.

### Does this PR introduce _any_ user-facing change?

`Index.asi8` and `Index.is_type_compatible` are removed. `Index.astype` and `Index.isin` can be used instead, respectively.

### How was this patch tested?

The existing CI should pass.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #42926 from itholic/SPARK-45164.

Authored-by: Haejoon Lee <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 1b7dbf7 commit 69ba1bd

File tree

4 files changed

+2
-210
lines changed

4 files changed

+2
-210
lines changed

python/docs/source/migration_guide/pyspark_upgrade.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ Upgrading from PySpark 3.5 to 4.0
4242
* In Spark 4.0, ``squeeze`` parameter from ``ps.read_csv`` and ``ps.read_excel`` has been removed from pandas API on Spark.
4343
* In Spark 4.0, ``null_counts`` parameter from ``DataFrame.info`` has been removed from pandas API on Spark, use ``show_counts`` instead.
4444
* In Spark 4.0, the result of ``MultiIndex.append`` does not keep the index names from pandas API on Spark.
45+
* In Spark 4.0, ``Index.asi8`` has been removed from pandas API on Spark, use ``Index.astype`` instead.
46+
* In Spark 4.0, ``Index.is_type_compatible`` has been removed from pandas API on Spark, use ``Index.isin`` instead.
4547
* In Spark 4.0, ``col_space`` parameter from ``DataFrame.to_latex`` and ``Series.to_latex`` has been removed from pandas API on Spark.
4648

4749

python/pyspark/pandas/indexes/base.py

Lines changed: 0 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -624,44 +624,6 @@ def values(self) -> np.ndarray:
624624
warnings.warn("We recommend using `{}.to_numpy()` instead.".format(type(self).__name__))
625625
return self.to_numpy()
626626

627-
@property
628-
def asi8(self) -> np.ndarray:
629-
"""
630-
Integer representation of the values.
631-
632-
.. warning:: We recommend using `Index.to_numpy()` instead.
633-
634-
.. note:: This method should only be used if the resulting NumPy ndarray is expected
635-
to be small, as all the data is loaded into the driver's memory.
636-
637-
.. deprecated:: 3.4.0
638-
639-
Returns
640-
-------
641-
numpy.ndarray
642-
An ndarray with int64 dtype.
643-
644-
Examples
645-
--------
646-
>>> ps.Index([1, 2, 3]).asi8
647-
array([1, 2, 3])
648-
649-
Returns None for non-int64 dtype
650-
651-
>>> ps.Index(['a', 'b', 'c']).asi8 is None
652-
True
653-
"""
654-
warnings.warn(
655-
"Index.asi8 is deprecated and will be removed in 4.0.0. " "Use Index.astype instead.",
656-
FutureWarning,
657-
)
658-
if isinstance(self.spark.data_type, IntegralType):
659-
return self.to_numpy()
660-
elif isinstance(self.spark.data_type, (TimestampType, TimestampNTZType)):
661-
return np.array(list(map(lambda x: x.astype(np.int64), self.to_numpy())))
662-
else:
663-
return None
664-
665627
@property
666628
def has_duplicates(self) -> bool:
667629
"""
@@ -1118,31 +1080,6 @@ def is_object(self) -> bool:
11181080
"""
11191081
return is_object_dtype(self.dtype)
11201082

1121-
def is_type_compatible(self, kind: str) -> bool:
1122-
"""
1123-
Whether the index type is compatible with the provided type.
1124-
1125-
.. deprecated:: 3.4.0
1126-
1127-
Examples
1128-
--------
1129-
>>> psidx = ps.Index([1, 2, 3])
1130-
>>> psidx.is_type_compatible('integer')
1131-
True
1132-
1133-
>>> psidx = ps.Index([1.0, 2.0, 3.0])
1134-
>>> psidx.is_type_compatible('integer')
1135-
False
1136-
>>> psidx.is_type_compatible('floating')
1137-
True
1138-
"""
1139-
warnings.warn(
1140-
"Index.is_type_compatible is deprecated and will be removed in 4.0.0. "
1141-
"Use Index.isin instead.",
1142-
FutureWarning,
1143-
)
1144-
return kind == self.inferred_type
1145-
11461083
def dropna(self, how: str = "any") -> "Index":
11471084
"""
11481085
Return Index or MultiIndex without NA/NaN values

python/pyspark/pandas/indexes/multi.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1267,14 +1267,6 @@ def inferred_type(self) -> str:
12671267
# Always returns "mixed" for MultiIndex
12681268
return "mixed"
12691269

1270-
@property
1271-
def asi8(self) -> None:
1272-
"""
1273-
Integer representation of the values.
1274-
"""
1275-
# Always returns None for MultiIndex
1276-
return None
1277-
12781270
def factorize(
12791271
self, sort: bool = True, na_sentinel: Optional[int] = -1
12801272
) -> Tuple["MultiIndex", pd.Index]:

python/pyspark/pandas/tests/indexes/test_base.py

Lines changed: 0 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -1581,145 +1581,6 @@ def test_multiindex_from_frame(self):
15811581
psdf = ps.from_pandas(pdf)
15821582
self.assert_eq(ps.MultiIndex.from_frame(psdf), pd.MultiIndex.from_frame(pdf))
15831583

1584-
def test_is_type_compatible(self):
1585-
data_types = ["integer", "floating", "string", "boolean"]
1586-
# Integer
1587-
pidx = pd.Index([1, 2, 3])
1588-
psidx = ps.from_pandas(pidx)
1589-
# is_type_compatible is removed from pandas 2.0.0.
1590-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1591-
expected_results = [True, False, False, False]
1592-
for data_type, expected_result in zip(data_types, expected_results):
1593-
self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
1594-
else:
1595-
for data_type in data_types:
1596-
self.assert_eq(
1597-
pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
1598-
)
1599-
1600-
# Floating
1601-
pidx = pd.Index([1.0, 2.0, 3.0])
1602-
psidx = ps.from_pandas(pidx)
1603-
# is_type_compatible is removed from pandas 2.0.0.
1604-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1605-
expected_results = [False, True, False, False]
1606-
for data_type, expected_result in zip(data_types, expected_results):
1607-
self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
1608-
else:
1609-
for data_type in data_types:
1610-
self.assert_eq(
1611-
pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
1612-
)
1613-
1614-
# String
1615-
pidx = pd.Index(["a", "b", "c"])
1616-
psidx = ps.from_pandas(pidx)
1617-
# is_type_compatible is removed from pandas 2.0.0.
1618-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1619-
expected_results = [False, False, True, False]
1620-
for data_type, expected_result in zip(data_types, expected_results):
1621-
self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
1622-
else:
1623-
for data_type in data_types:
1624-
self.assert_eq(
1625-
pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
1626-
)
1627-
1628-
# Boolean
1629-
pidx = pd.Index([True, False, True, False])
1630-
psidx = ps.from_pandas(pidx)
1631-
# is_type_compatible is removed from pandas 2.0.0.
1632-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1633-
expected_results = [False, False, False, True]
1634-
for data_type, expected_result in zip(data_types, expected_results):
1635-
self.assert_eq(psidx.is_type_compatible(data_type), expected_result)
1636-
else:
1637-
for data_type in data_types:
1638-
self.assert_eq(
1639-
pidx.is_type_compatible(data_type), psidx.is_type_compatible(data_type)
1640-
)
1641-
1642-
# MultiIndex
1643-
pmidx = pd.MultiIndex.from_tuples([("a", "x")])
1644-
psmidx = ps.from_pandas(pmidx)
1645-
# is_type_compatible is removed from pandas 2.0.0.
1646-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1647-
expected_results = [False, False, False, False]
1648-
for data_type, expected_result in zip(data_types, expected_results):
1649-
self.assert_eq(psmidx.is_type_compatible(data_type), expected_result)
1650-
else:
1651-
for data_type in data_types:
1652-
self.assert_eq(
1653-
pmidx.is_type_compatible(data_type), psmidx.is_type_compatible(data_type)
1654-
)
1655-
1656-
def test_asi8(self):
1657-
# Integer
1658-
pidx = pd.Index([1, 2, 3])
1659-
psidx = ps.from_pandas(pidx)
1660-
# asi8 is removed from pandas 2.0.0.
1661-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1662-
self.assert_eq(np.array(pidx), psidx.asi8)
1663-
self.assert_eq(np.array(pidx.astype("int")), psidx.astype("int").asi8)
1664-
self.assert_eq(np.array(pidx.astype("int16")), psidx.astype("int16").asi8)
1665-
self.assert_eq(np.array(pidx.astype("int8")), psidx.astype("int8").asi8)
1666-
else:
1667-
self.assert_eq(pidx.asi8, psidx.asi8)
1668-
self.assert_eq(pidx.astype("int").asi8, psidx.astype("int").asi8)
1669-
self.assert_eq(pidx.astype("int16").asi8, psidx.astype("int16").asi8)
1670-
self.assert_eq(pidx.astype("int8").asi8, psidx.astype("int8").asi8)
1671-
1672-
# Integer with missing value
1673-
pidx = pd.Index([1, 2, None, 4, 5])
1674-
psidx = ps.from_pandas(pidx)
1675-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1676-
self.assert_eq(None, psidx.asi8)
1677-
else:
1678-
self.assert_eq(pidx.asi8, psidx.asi8)
1679-
1680-
# Datetime
1681-
pidx = pd.date_range(end="1/1/2018", periods=3)
1682-
psidx = ps.from_pandas(pidx)
1683-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1684-
self.assert_eq(
1685-
np.array([1514592000000000000, 1514678400000000000, 1514764800000000000]),
1686-
psidx.asi8,
1687-
)
1688-
else:
1689-
self.assert_eq(pidx.asi8, psidx.asi8)
1690-
1691-
# Floating
1692-
pidx = pd.Index([1.0, 2.0, 3.0])
1693-
psidx = ps.from_pandas(pidx)
1694-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1695-
self.assert_eq(None, psidx.asi8)
1696-
else:
1697-
self.assert_eq(pidx.asi8, psidx.asi8)
1698-
1699-
# String
1700-
pidx = pd.Index(["a", "b", "c"])
1701-
psidx = ps.from_pandas(pidx)
1702-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1703-
self.assert_eq(None, psidx.asi8)
1704-
else:
1705-
self.assert_eq(pidx.asi8, psidx.asi8)
1706-
1707-
# Boolean
1708-
pidx = pd.Index([True, False, True, False])
1709-
psidx = ps.from_pandas(pidx)
1710-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1711-
self.assert_eq(None, psidx.asi8)
1712-
else:
1713-
self.assert_eq(pidx.asi8, psidx.asi8)
1714-
1715-
# MultiIndex
1716-
pmidx = pd.MultiIndex.from_tuples([(1, 2)])
1717-
psmidx = ps.from_pandas(pmidx)
1718-
if LooseVersion(pd.__version__) >= LooseVersion("2.0.0"):
1719-
self.assert_eq(None, psmidx.asi8)
1720-
else:
1721-
self.assert_eq(pmidx.asi8, psmidx.asi8)
1722-
17231584
def test_index_is_unique(self):
17241585
indexes = [("a", "b", "c"), ("a", "a", "c"), (1, 3, 3), (1, 2, 3)]
17251586
names = [None, "ks", "ks", None]

0 commit comments

Comments
 (0)