Review notes for the force_suffixes patch. Text ahead of the first
"diff --git" header is not part of any hunk.

Changed relative to the submitted patch:
  * merge.py, _items_overlap_with_suffix: the forced rename set is built with
    Index.union(..., sort=False), so labels that cannot be ordered do not
    trigger the RuntimeWarning emitted under the default sort=None.
  * merge.py, _reindex_and_concat: one argument per line (the single-line
    call was longer than 88 columns); later hunk offsets adjusted.
  * test_merge.py: consistent DataFrame constructor and double quotes, two
    blank lines between tests, long lines wrapped, the copied
    "issue: 24782" comment replaced, and the hunk that only added a third
    blank line before test_merge_join_categorical_multiindex removed.
  * "index" header lines dropped for the two files whose hunks changed,
    because the recorded post-image blob ids no longer match.

Open points to confirm before merging (context not visible in this patch):
  * force_suffixes is inserted before copy/indicator/validate in the public
    merge signatures; callers passing those positionally would shift.
    Consider appending it last or making it keyword-only.
  * Only one call site inside merge() is touched; check that both the
    _MergeOperation path and the how="cross" path forward force_suffixes.
  * The new keyword presumably needs parameter documentation and a whatsnew
    entry; none is part of the hunks shown here.

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b2c1e38f61f4c..d3e6db040aba6 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -11046,6 +11046,7 @@ def merge(
         right_index: bool = False,
         sort: bool = False,
         suffixes: Suffixes = ("_x", "_y"),
+        force_suffixes: bool = False,
         copy: bool | lib.NoDefault = lib.no_default,
         indicator: str | bool = False,
         validate: MergeValidate | None = None,
@@ -11065,6 +11066,7 @@ def merge(
             right_index=right_index,
             sort=sort,
             suffixes=suffixes,
+            force_suffixes=force_suffixes,
             indicator=indicator,
             validate=validate,
         )
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -153,6 +153,7 @@ def merge(
     right_index: bool = False,
     sort: bool = False,
     suffixes: Suffixes = ("_x", "_y"),
+    force_suffixes: bool = False,
     copy: bool | lib.NoDefault = lib.no_default,
     indicator: str | bool = False,
     validate: str | None = None,
@@ -395,6 +396,7 @@ def merge(
             right_index=right_index,
             sort=sort,
             suffixes=suffixes,
+            force_suffixes=force_suffixes,
             indicator=indicator,
             validate=validate,
         )
@@ -411,6 +413,7 @@ def _cross_merge(
     right_index: bool = False,
     sort: bool = False,
     suffixes: Suffixes = ("_x", "_y"),
+    force_suffixes: bool = False,
     indicator: str | bool = False,
     validate: str | None = None,
 ) -> DataFrame:
@@ -447,6 +450,7 @@ def _cross_merge(
         right_index=right_index,
         sort=sort,
         suffixes=suffixes,
+        force_suffixes=force_suffixes,
         indicator=indicator,
         validate=validate,
     )
@@ -966,6 +970,7 @@ def __init__(
         right_index: bool = False,
         sort: bool = True,
         suffixes: Suffixes = ("_x", "_y"),
+        force_suffixes: bool = False,
         indicator: str | bool = False,
         validate: str | None = None,
     ) -> None:
@@ -978,6 +983,8 @@ def __init__(
         self.on = com.maybe_make_list(on)
 
         self.suffixes = suffixes
+        self.force_suffixes = force_suffixes
+
         self.sort = sort or how == "outer"
 
         self.left_index = left_index
@@ -1088,7 +1095,10 @@ def _reindex_and_concat(
         right = self.right[:]
 
         llabels, rlabels = _items_overlap_with_suffix(
-            self.left._info_axis, self.right._info_axis, self.suffixes
+            self.left._info_axis,
+            self.right._info_axis,
+            self.suffixes,
+            self.force_suffixes,
         )
 
         if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
@@ -3007,7 +3017,7 @@ def _validate_operand(obj: DataFrame | Series) -> DataFrame:
 
 
 def _items_overlap_with_suffix(
-    left: Index, right: Index, suffixes: Suffixes
+    left: Index, right: Index, suffixes: Suffixes, force_suffixes: bool = False
 ) -> tuple[Index, Index]:
     """
     Suffixes type validation.
@@ -3023,7 +3033,14 @@ def _items_overlap_with_suffix(
             "Provide 'suffixes' as a tuple instead."
         )
 
-    to_rename = left.intersection(right)
+    if force_suffixes:
+        # Rename every label from either side, not only the overlapping ones.
+        # sort=False skips an unneeded sort and the RuntimeWarning that
+        # Index.union emits for labels that cannot be ordered.
+        to_rename = left.union(right, sort=False)
+    else:
+        to_rename = left.intersection(right)
+
     if len(to_rename) == 0:
         return left, right
 
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -2369,6 +2369,86 @@ def test_merge_suffix(col1, col2, kwargs, expected_cols):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("force_suffixes", [False, True])
+def test_merge_suffix_with_force_simple(force_suffixes):
+    df1 = DataFrame({"ID": [1, 2, 3], "Value": ["A", "B", "C"]})
+    df2 = DataFrame({"ID": [2, 3, 4], "Value": ["D", "E", "F"]})
+
+    if force_suffixes:
+        expected = DataFrame(
+            [[2, 2, "B", "D"], [3, 3, "C", "E"]],
+            columns=["ID_left", "Value_left", "ID_right", "Value_right"],
+        )
+    else:
+        expected = DataFrame(
+            [[2, "B", "D"], [3, "C", "E"]],
+            columns=["ID", "Value_left", "Value_right"],
+        )
+
+    result = merge(
+        df1, df2, on="ID", suffixes=("_left", "_right"), force_suffixes=force_suffixes
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("force_suffixes", [False, True])
+def test_merge_suffix_with_force_multi_column(force_suffixes):
+    a = DataFrame(
+        {"A": [1, 2, 3, 98], "B": [4, 5, 6, 99], "ALPHABET": ["A", "B", "C", "Z"]}
+    )
+    b = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "alphabet": ["a", "b", "c"]})
+
+    if force_suffixes:
+        columns = ["A_x", "B_x", "ALPHABET_x", "a_y", "b_y", "alphabet_y"]
+    else:
+        columns = ["A", "B", "ALPHABET", "a", "b", "alphabet"]
+    expected = DataFrame(
+        [[1, 4, "A", 1, 4, "a"], [2, 5, "B", 2, 5, "b"], [3, 6, "C", 3, 6, "c"]],
+        columns=columns,
+    )
+
+    result = merge(
+        a, b, left_on=["A", "B"], right_on=["a", "b"], force_suffixes=force_suffixes
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "col1, col2, kwargs, expected_cols",
+    [
+        (0, 0, {"suffixes": ("", "_dup")}, ["0", "0_dup"]),
+        (0, 0, {"suffixes": (None, "_dup")}, [0, "0_dup"]),
+        (0, 0, {"suffixes": ("_x", "_y")}, ["0_x", "0_y"]),
+        (0, 0, {"suffixes": ["_x", "_y"]}, ["0_x", "0_y"]),
+        ("a", 0, {"suffixes": (None, "_y")}, ["a", "0_y"]),
+        (0.0, 0.0, {"suffixes": ("_x", None)}, ["0.0_x", 0.0]),
+        ("b", "b", {"suffixes": (None, "_y")}, ["b", "b_y"]),
+        ("a", "a", {"suffixes": ("_x", None)}, ["a_x", "a"]),
+        ("a", "b", {"suffixes": ("_x", None)}, ["a_x", "b"]),
+        ("a", "a", {"suffixes": (None, "_x")}, ["a", "a_x"]),
+        (0, 0, {"suffixes": ("_a", None)}, ["0_a", 0]),
+        ("a", "a", {}, ["a_x", "a_y"]),
+        (0, 0, {}, ["0_x", "0_y"]),
+    ],
+)
+def test_merge_suffix_with_force(col1, col2, kwargs, expected_cols):
+    # force_suffixes=True with None suffixes and non-string column labels
+    a = DataFrame({col1: [1, 2, 3]})
+    b = DataFrame({col2: [4, 5, 6]})
+
+    expected = DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols)
+
+    result = a.merge(
+        b, left_index=True, right_index=True, force_suffixes=True, **kwargs
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = merge(
+        a, b, left_index=True, right_index=True, force_suffixes=True, **kwargs
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "how,expected",
     [