From e2492bf1beee7d5168cfb7fe74664fc01f6b0620 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Mon, 27 May 2019 13:16:05 +0200 Subject: [PATCH 01/53] Initial version of SPSS reader --- pandas/__init__.py | 2 +- pandas/io/api.py | 1 + pandas/io/read_spss.py | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 pandas/io/read_spss.py diff --git a/pandas/__init__.py b/pandas/__init__.py index a2fa14be83998..b95c312f12eed 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -105,7 +105,7 @@ # misc read_clipboard, read_parquet, read_feather, read_gbq, - read_html, read_json, read_stata, read_sas) + read_html, read_json, read_stata, read_sas, read_spss) from pandas.util._tester import test import pandas.testing diff --git a/pandas/io/api.py b/pandas/io/api.py index 8c8d7cf73b37a..e5e635716dbb8 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -18,3 +18,4 @@ from pandas.io.sas import read_sas from pandas.io.sql import read_sql, read_sql_query, read_sql_table from pandas.io.stata import read_stata +from pandas.io.spss import read_spss diff --git a/pandas/io/read_spss.py b/pandas/io/read_spss.py new file mode 100644 index 0000000000000..5cbf91c47a0c2 --- /dev/null +++ b/pandas/io/read_spss.py @@ -0,0 +1,19 @@ +def read_spss(path): + """ + Load an parquet object from the file path, returning a DataFrame. + + .. versionadded 0.24.3 + + Parameters + ---------- + path : string + File path + + Returns + ------- + DataFrame + """ + + from pyreadstat import read_sav + df, _ = read_sav(path) + return df From b3581b2f7ba7eb1baa771394fdaaa284292b62f0 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Mon, 27 May 2019 13:25:56 +0200 Subject: [PATCH 02/53] Rename file --- pandas/io/{read_spss.py => spss.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/io/{read_spss.py => spss.py} (100%) diff --git a/pandas/io/read_spss.py b/pandas/io/spss.py similarity index 100% rename from pandas/io/read_spss.py rename to pandas/io/spss.py From 6db09411b9fb8557ce38f5e3f9555f1fc24e80eb Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Mon, 27 May 2019 13:38:56 +0200 Subject: [PATCH 03/53] Add usecols and categorical optional parameters --- pandas/io/spss.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 5cbf91c47a0c2..f386b965b00b4 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -1,6 +1,6 @@ -def read_spss(path): +def read_spss(path, usecols=None, categorical=True): """ - Load an parquet object from the file path, returning a DataFrame. + Load an SPSS file from the file path, returning a DataFrame. .. versionadded 0.24.3 @@ -8,6 +8,10 @@ def read_spss(path): ---------- path : string File path + usecols : str or list-like or None + Return a subset of the columns. If None, return all columns. + categorical : bool + Convert categorical columns into pd.Categorical. Returns ------- @@ -15,5 +19,9 @@ def read_spss(path): """ from pyreadstat import read_sav - df, _ = read_sav(path) + if usecols is not None: + if isinstance(usecols, str): + usecols = [usecols] + df, _ = read_sav(path, usecols=usecols, + apply_value_formats=use_categorical) return df From 27a2768ae274d602030401a4ee60a69624f6fcd3 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Mon, 27 May 2019 13:39:08 +0200 Subject: [PATCH 04/53] Fix typo --- pandas/io/spss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index f386b965b00b4..0255835030d61 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -23,5 +23,5 @@ def read_spss(path, usecols=None, categorical=True): if isinstance(usecols, str): usecols = [usecols] df, _ = read_sav(path, usecols=usecols, - apply_value_formats=use_categorical) + apply_value_formats=categorical) return df From 554fd3f25faae3c46bd9b9f2aa317773e4950b5a Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 28 May 2019 10:46:41 +0200 Subject: [PATCH 05/53] Add tests --- pandas/tests/io/data/labelled-num-na.sav | Bin 0 -> 535 bytes pandas/tests/io/data/labelled-num.sav | Bin 0 -> 507 bytes pandas/tests/io/data/labelled-str.sav | Bin 0 -> 525 bytes pandas/tests/io/data/umlauts.sav | Bin 0 -> 567 bytes pandas/tests/io/test_spss.py | 65 +++++++++++++++++++++++ 5 files changed, 65 insertions(+) create mode 100755 pandas/tests/io/data/labelled-num-na.sav create mode 100755 pandas/tests/io/data/labelled-num.sav create mode 100755 pandas/tests/io/data/labelled-str.sav create mode 100755 pandas/tests/io/data/umlauts.sav create mode 100644 pandas/tests/io/test_spss.py diff --git a/pandas/tests/io/data/labelled-num-na.sav b/pandas/tests/io/data/labelled-num-na.sav new file mode 100755 index 0000000000000000000000000000000000000000..fbe6ee77672406ba5e28289d88621faf68c72ee3 GIT binary patch literal 535 zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K zz`(!=#XyPy#D{=L2Lm$&x6~vBLsLT&D`OKYBO^4uBy)iV!_0@<$-&CN0K{RAK?Xoz z#0OO4pO;gqke`>TP?nfenyR1xagUM%Gmrt&2LT`KxkEBCixq$fq!tKRfD$ls*@0}3 zdXN|g5Q_mZ3|s&@0;Ue+o|k_a7(nV_YC-z`gTZ?U_5XeSKPVrhjvJ~Lqz(i?=7Y=w z$?-zjfw86p26; W0mF_VG{jBU;yKhUskuNAAOHZej8AF+ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/labelled-num.sav b/pandas/tests/io/data/labelled-num.sav new file mode 100755 index 0000000000000000000000000000000000000000..bfab052089d7e62d2e9747c51434cb3a42156278 GIT binary patch literal 507 zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K zz`(!=#Xt(o2GI-*kq!oC3T~-M3WlbJCRWDAR>o#%dP(L2O$FJ90ytP17=Sp;F~|T2 zjF^FvAooDP2Yc?2jLc#MAj;261=+^}lz_RH9moc$2Z?b2u^14;zy+X1Fm)h1U;bra z0I7$m1?l?_2Ja!%|M&I(pnQ-zZm3$2IuHPv4>AuV#|vc(0%@4LZD9T$&(7B2Y!Z@L>oIanrSU4s}Z^Pyhq~H&;mv literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/labelled-str.sav b/pandas/tests/io/data/labelled-str.sav new file mode 100755 index 0000000000000000000000000000000000000000..b96a9c00fcec10b33cf35b2d3d87bef396fcd3ad GIT binary patch literal 525 zcmbVIT}uK%6dlV-j3CMfA1^4V55|6&#DcJCt}L_>&PMcDID}vt8Tja5^k0Ne{sOam z+PTnkXE@xsbI-kJW~$Mx7uG8cin|Hvd#y>Q*J-TNxTmSzYQBs=Dbe&eo{naVIeD!M z2a5!IN~xSB2ZcPtQ|S7n%{#em*AF}=xb&szzmW%vpSY+TyE6yv0&F zx95qW#OC<~8Bv~fa_=MFqYq~VW|=8iv7zYTz1}JXy=c+5`^6>;yUp_3=FlBmEp(WJ z`2cDsOq?NR_wT%#>BxMbc*=zM?}M=iP(Nd$`J9<`1=Vmko0xjdsTCWLl&s`{<3k!X RufA{##+Dxe$fw9>{Qxb9P$&QZ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/umlauts.sav b/pandas/tests/io/data/umlauts.sav new file mode 100755 index 0000000000000000000000000000000000000000..e99cf1267bebebd16bdfe881579e6e319aa10986 GIT binary patch literal 567 zcmY#!^D%PJP}WrNbn;aQ4hRlb2o7-!@eB^}bPiVV4OR%x%uC5HFIF%z(lY=-1vJ3K zz`(!=#XyRI1w?>Bq=S*Mf?H~mf}yFQk(G(Dm5CXeUXr;$b3yij0LV@-;9zB70OBym zAVZLOB0#~AjLc#MAWFj2vPtHTZYgOH(iV8Q2SHUQqzHA3;+QaUGV?_ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py new file mode 100644 index 0000000000000..c7a95cf281f23 --- /dev/null +++ b/pandas/tests/io/test_spss.py @@ -0,0 +1,65 @@ +import numpy as np +import pandas as pd +from pandas.io.spss import read_spss +from pandas.util import testing as tm + + +data = ["data/labelled-num.sav", + "data/labelled-num-na.sav", + "data/labelled-str.sav", + "data/umlauts.sav"] + + +def test_spss_labelled_num(): + fname = "data/labelled-num.sav" + + df = read_spss(fname, categorical=True) + expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) + expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) + tm.assert_frame_equal(df, expected) + + df = read_spss(fname, categorical=False) + expected = pd.DataFrame({"VAR00002": 1.0}, index=[0]) + tm.assert_frame_equal(df, expected) + + +def test_spss_labelled_num_na(): + fname = "data/labelled-num-na.sav" + + df = read_spss(fname, categorical=True) + expected = pd.DataFrame({"VAR00002": ["This is one", None]}) + expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) + tm.assert_frame_equal(df, expected) + + df = read_spss(fname, categorical=False) + expected = pd.DataFrame({"VAR00002": [1.0, np.nan]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_labelled_str(): + fname = "data/labelled-str.sav" + + df = read_spss(fname, categorical=True) + expected = pd.DataFrame({"gender": ["Male", "Female"]}) + expected["gender"] = pd.Categorical(expected["gender"]) + tm.assert_frame_equal(df, expected) + + df = read_spss(fname, categorical=False) + expected = pd.DataFrame({"gender": ["M", "F"]}) + tm.assert_frame_equal(df, expected) + + +def test_spss_umlauts(): + fname = "data/umlauts.sav" + + df = read_spss(fname, categorical=True) + expected = pd.DataFrame({"var1": ["the ä umlaut", + "the ü umlaut", + "the ä umlaut", + "the ö umlaut"]}) + expected["var1"] = pd.Categorical(expected["var1"]) + tm.assert_frame_equal(df, expected) + + df = read_spss(fname, categorical=False) + expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}) + tm.assert_frame_equal(df, expected) From d8b2cb8251c5cc03a88f5fb0305b89736978e2cb Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 28 May 2019 10:55:51 +0200 Subject: [PATCH 06/53] Skip tests if pyreadstat is not available --- pandas/tests/io/test_spss.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index c7a95cf281f23..aaab3fb4f4c8b 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -1,15 +1,19 @@ +import pytest import numpy as np import pandas as pd from pandas.io.spss import read_spss from pandas.util import testing as tm -data = ["data/labelled-num.sav", - "data/labelled-num-na.sav", - "data/labelled-str.sav", - "data/umlauts.sav"] +try: + import pyreadstat +except ImportError: + _HAVE_PYREADSTAT = False +else: + _HAVE_PYREADSTAT = True +@pytest.mark.skipif(not _HAVE_PYREADSTAT) def test_spss_labelled_num(): fname = "data/labelled-num.sav" @@ -23,6 +27,7 @@ def test_spss_labelled_num(): tm.assert_frame_equal(df, expected) +@pytest.mark.skipif(not _HAVE_PYREADSTAT) def test_spss_labelled_num_na(): fname = "data/labelled-num-na.sav" @@ -36,6 +41,7 @@ def test_spss_labelled_num_na(): tm.assert_frame_equal(df, expected) +@pytest.mark.skipif(not _HAVE_PYREADSTAT) def test_spss_labelled_str(): fname = "data/labelled-str.sav" @@ -49,6 +55,7 @@ def test_spss_labelled_str(): tm.assert_frame_equal(df, expected) +@pytest.mark.skipif(not _HAVE_PYREADSTAT) def test_spss_umlauts(): fname = "data/umlauts.sav" From 7640448ed1d7f75c99aa212623fc04e825a8579e Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 28 May 2019 10:59:59 +0200 Subject: [PATCH 07/53] Add pyreadstat to Travis (just 37 for now to see if it works) --- ci/deps/travis-37.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 722a35111ab01..7694f30a2abb0 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -21,3 +21,4 @@ dependencies: - pip - pip: - moto + - pyreadstat From 8fc9ee5cf50c2f78c222c19e0f69070a3d0e6a63 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 28 May 2019 11:20:02 +0200 Subject: [PATCH 08/53] Ignore flake8 F401 --- pandas/tests/io/test_spss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index aaab3fb4f4c8b..ef9d42a4dbd1a 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -6,7 +6,7 @@ try: - import pyreadstat + import pyreadstat # noqa except ImportError: _HAVE_PYREADSTAT = False else: From 01fd5ece8dd1f2e2305bd79373d03c4518f60c23 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 28 May 2019 11:35:16 +0200 Subject: [PATCH 09/53] Update whatsnew --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 78d3d8fcb3d01..66279347d17ed 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -99,6 +99,7 @@ Other Enhancements - Error message for missing required imports now includes the original import error's text (:issue:`23868`) - :class:`DatetimeIndex` and :class:`TimedeltaIndex` now have a ``mean`` method (:issue:`24757`) - :meth:`DataFrame.describe` now formats integer percentiles without decimal point (:issue:`26660`) +- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`) .. _whatsnew_0250.api_breaking: From 57bc84c88b8ac1886fd78ff381c7eb2734457e14 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 28 May 2019 11:35:59 +0200 Subject: [PATCH 10/53] Change versionadded to 0.25.0 --- pandas/io/spss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 0255835030d61..2ff612c2872e3 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -2,7 +2,7 @@ def read_spss(path, usecols=None, categorical=True): """ Load an SPSS file from the file path, returning a DataFrame. - .. versionadded 0.24.3 + .. versionadded 0.25.0 Parameters ---------- From b515ecc0f67b922179467be3b5f1bfd6565137fd Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 28 May 2019 11:55:32 +0200 Subject: [PATCH 11/53] Specify reason for skipif --- pandas/tests/io/test_spss.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index ef9d42a4dbd1a..4b6ad55f86df3 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -13,7 +13,7 @@ _HAVE_PYREADSTAT = True -@pytest.mark.skipif(not _HAVE_PYREADSTAT) +@pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_num(): fname = "data/labelled-num.sav" @@ -27,7 +27,7 @@ def test_spss_labelled_num(): tm.assert_frame_equal(df, expected) -@pytest.mark.skipif(not _HAVE_PYREADSTAT) +@pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_num_na(): fname = "data/labelled-num-na.sav" @@ -41,7 +41,7 @@ def test_spss_labelled_num_na(): tm.assert_frame_equal(df, expected) -@pytest.mark.skipif(not _HAVE_PYREADSTAT) +@pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_str(): fname = "data/labelled-str.sav" @@ -55,7 +55,7 @@ def test_spss_labelled_str(): tm.assert_frame_equal(df, expected) -@pytest.mark.skipif(not _HAVE_PYREADSTAT) +@pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_umlauts(): fname = "data/umlauts.sav" From ef9f7d0cc0c90e60e2e76e958655e754f6695e23 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 28 May 2019 11:57:05 +0200 Subject: [PATCH 12/53] Fix API --- pandas/tests/api/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index aa42484bf9513..b57c7a0cf0625 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -81,7 +81,7 @@ class TestPDApi(Base): 'read_gbq', 'read_hdf', 'read_html', 'read_json', 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql', 'read_sql_query', 'read_sql_table', 'read_stata', - 'read_table', 'read_feather', 'read_parquet'] + 'read_table', 'read_feather', 'read_parquet', 'read_spss'] # top-level to_* funcs funcs_to = ['to_datetime', 'to_msgpack', From 951a0c2191fe46b37ea8bff1eb12c14c9ee08396 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 28 May 2019 12:45:53 +0200 Subject: [PATCH 13/53] Fix path to test files --- pandas/tests/io/test_spss.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 4b6ad55f86df3..e60f7321fda78 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -15,7 +15,7 @@ @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_num(): - fname = "data/labelled-num.sav" + fname = "io/data/labelled-num.sav" df = read_spss(fname, categorical=True) expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) @@ -29,7 +29,7 @@ def test_spss_labelled_num(): @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_num_na(): - fname = "data/labelled-num-na.sav" + fname = "io/data/labelled-num-na.sav" df = read_spss(fname, categorical=True) expected = pd.DataFrame({"VAR00002": ["This is one", None]}) @@ -43,7 +43,7 @@ def test_spss_labelled_num_na(): @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_str(): - fname = "data/labelled-str.sav" + fname = "io/data/labelled-str.sav" df = read_spss(fname, categorical=True) expected = pd.DataFrame({"gender": ["Male", "Female"]}) @@ -57,7 +57,7 @@ def test_spss_labelled_str(): @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_umlauts(): - fname = "data/umlauts.sav" + fname = "io/data/umlauts.sav" df = read_spss(fname, categorical=True) expected = pd.DataFrame({"var1": ["the ä umlaut", From 977fff01bf30de046c1626ced383a36c6c51c5e1 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 28 May 2019 12:49:17 +0200 Subject: [PATCH 14/53] Sort imports --- pandas/io/api.py | 2 +- pandas/tests/io/test_spss.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/api.py b/pandas/io/api.py index e5e635716dbb8..725e82604ca7f 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -16,6 +16,6 @@ from pandas.io.pickle import read_pickle, to_pickle from pandas.io.pytables import HDFStore, read_hdf from pandas.io.sas import read_sas +from pandas.io.spss import read_spss from pandas.io.sql import read_sql, read_sql_query, read_sql_table from pandas.io.stata import read_stata -from pandas.io.spss import read_spss diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index e60f7321fda78..f77917be6731e 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -1,10 +1,10 @@ -import pytest import numpy as np +import pytest + import pandas as pd from pandas.io.spss import read_spss from pandas.util import testing as tm - try: import pyreadstat # noqa except ImportError: From a69c2bc9fdf4305b8cd7ad5b30db0f0a94aada35 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Wed, 29 May 2019 09:14:49 +0200 Subject: [PATCH 15/53] Use datapath fixture --- pandas/tests/io/test_spss.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index f77917be6731e..e5d37eac7ebda 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -14,9 +14,8 @@ @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") -def test_spss_labelled_num(): - fname = "io/data/labelled-num.sav" - +def test_spss_labelled_num(datapath): + fname = datapath("io", "data", "labelled-num.sav") df = read_spss(fname, categorical=True) expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) @@ -28,8 +27,8 @@ def test_spss_labelled_num(): @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") -def test_spss_labelled_num_na(): - fname = "io/data/labelled-num-na.sav" +def test_spss_labelled_num_na(datapath): + fname = datapath("io", "data", "labelled-num-na.sav") df = read_spss(fname, categorical=True) expected = pd.DataFrame({"VAR00002": ["This is one", None]}) @@ -42,8 +41,8 @@ def test_spss_labelled_num_na(): @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") -def test_spss_labelled_str(): - fname = "io/data/labelled-str.sav" +def test_spss_labelled_str(datapath): + fname = datapath("io", "data", "labelled-str.sav") df = read_spss(fname, categorical=True) expected = pd.DataFrame({"gender": ["Male", "Female"]}) @@ -56,8 +55,8 @@ def test_spss_labelled_str(): @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") -def test_spss_umlauts(): - fname = "io/data/umlauts.sav" +def test_spss_umlauts(datapath): + fname = datapath("io", "data", "umlauts.sav") df = read_spss(fname, categorical=True) expected = pd.DataFrame({"var1": ["the ä umlaut", From 40c9875d89419c6e6117b3af50cc424483343652 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Wed, 29 May 2019 09:16:44 +0200 Subject: [PATCH 16/53] Acknowledge Haven project --- pandas/tests/io/test_spss.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index e5d37eac7ebda..0a4551786205c 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -15,7 +15,9 @@ @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_num(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-num.sav") + df = read_spss(fname, categorical=True) expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) @@ -28,6 +30,7 @@ def test_spss_labelled_num(datapath): @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_num_na(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-num-na.sav") df = read_spss(fname, categorical=True) @@ -42,6 +45,7 @@ def test_spss_labelled_num_na(datapath): @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_str(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-str.sav") df = read_spss(fname, categorical=True) @@ -56,6 +60,7 @@ def test_spss_labelled_str(datapath): @pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_umlauts(datapath): + # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "umlauts.sav") df = read_spss(fname, categorical=True) From c3a4291fd26895b0edf0fa5b26d3567cc7ecb019 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Wed, 29 May 2019 11:17:28 +0200 Subject: [PATCH 17/53] Fix imports order --- pandas/tests/io/test_spss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 0a4551786205c..527160100dbfa 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -2,9 +2,10 @@ import pytest import pandas as pd -from pandas.io.spss import read_spss from pandas.util import testing as tm +from pandas.io.spss import read_spss + try: import pyreadstat # noqa except ImportError: From c59f1e8ea9490348d77078b158b74dd6d4bce9ad Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Wed, 29 May 2019 17:36:45 +0200 Subject: [PATCH 18/53] Use importorskip --- pandas/tests/io/test_spss.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 527160100dbfa..71a8d1075c3bd 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -6,15 +6,10 @@ from pandas.io.spss import read_spss -try: - import pyreadstat # noqa -except ImportError: - _HAVE_PYREADSTAT = False -else: - _HAVE_PYREADSTAT = True + +pyreadstat = pytest.importorskip("pyreadstat") -@pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_num(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-num.sav") @@ -29,7 +24,6 @@ def test_spss_labelled_num(datapath): tm.assert_frame_equal(df, expected) -@pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_num_na(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-num-na.sav") @@ -44,7 +38,6 @@ def test_spss_labelled_num_na(datapath): tm.assert_frame_equal(df, expected) -@pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_labelled_str(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-str.sav") @@ -59,7 +52,6 @@ def test_spss_labelled_str(datapath): tm.assert_frame_equal(df, expected) -@pytest.mark.skipif(not _HAVE_PYREADSTAT, reason="pyreadstat not installed") def test_spss_umlauts(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "umlauts.sav") From a3a95bf856f6b6df05c47afbc805ac5867898056 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Wed, 29 May 2019 17:40:54 +0200 Subject: [PATCH 19/53] Add pyreadstat dependency to macOS and Windows CI --- ci/deps/azure-macos-35.yaml | 1 + ci/deps/azure-windows-37.yaml | 3 +++ 2 files changed, 4 insertions(+) diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 8ed48b46b5b5a..50feac28ded44 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -28,3 +28,4 @@ dependencies: - pytest-xdist - pytest-mock - hypothesis>=3.58.0 + - pyreadstat diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 04e4f74f85e4d..8464284368aeb 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -30,3 +30,6 @@ dependencies: - pytest-mock - moto - hypothesis>=3.58.0 + - isort + - pip: + - pyreadstat From 1983464e2fa3175c6633b36266378e612dcdfb01 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Wed, 29 May 2019 17:51:15 +0200 Subject: [PATCH 20/53] Add typing --- pandas/io/spss.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 2ff612c2872e3..797c0cd28f737 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -1,4 +1,10 @@ -def read_spss(path, usecols=None, categorical=True): +from typing import Union, Sequence +from pandas.core.api import DataFrame + + +def read_spss(path: str, + usecols: Union[str, Sequence[str], None]=None, + categorical: bool=True) -> DataFrame: """ Load an SPSS file from the file path, returning a DataFrame. From 5829c95530f5ea609cb0755dc5374c45ddee5070 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Wed, 29 May 2019 17:53:07 +0200 Subject: [PATCH 21/53] Add missing whitespace --- pandas/io/spss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 797c0cd28f737..84c1dfd171f7a 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -3,8 +3,8 @@ def read_spss(path: str, - usecols: Union[str, Sequence[str], None]=None, - categorical: bool=True) -> DataFrame: + usecols: Union[str, Sequence[str], None] = None, + categorical: bool = True) -> DataFrame: """ Load an SPSS file from the file path, returning a DataFrame. From 17e8786339aa9656ae82bf133ee43ecd871da6f6 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Wed, 29 May 2019 18:00:41 +0200 Subject: [PATCH 22/53] Add Haven license files --- LICENSES/HAVEN_LICENSE | 2 ++ LICENSES/HAVEN_MIT | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 LICENSES/HAVEN_LICENSE create mode 100644 LICENSES/HAVEN_MIT diff --git a/LICENSES/HAVEN_LICENSE b/LICENSES/HAVEN_LICENSE new file mode 100644 index 0000000000000..2f444cb44d505 --- /dev/null +++ b/LICENSES/HAVEN_LICENSE @@ -0,0 +1,2 @@ +YEAR: 2013-2016 +COPYRIGHT HOLDER: Hadley Wickham; RStudio; and Evan Miller diff --git a/LICENSES/HAVEN_MIT b/LICENSES/HAVEN_MIT new file mode 100644 index 0000000000000..c0c840fe6ce00 --- /dev/null +++ b/LICENSES/HAVEN_MIT @@ -0,0 +1,32 @@ +Based on http://opensource.org/licenses/MIT + +This is a template. Complete and ship as file LICENSE the following 2 +lines (only) + +YEAR: +COPYRIGHT HOLDER: + +and specify as + +License: MIT + file LICENSE + +Copyright (c) , + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. From fe2e2fc7e7f7871105eef9352f613ec19ac82f75 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 30 May 2019 10:04:58 +0200 Subject: [PATCH 23/53] Remove trailing whitespace --- LICENSES/HAVEN_MIT | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSES/HAVEN_MIT b/LICENSES/HAVEN_MIT index c0c840fe6ce00..b03d0e640627a 100644 --- a/LICENSES/HAVEN_MIT +++ b/LICENSES/HAVEN_MIT @@ -4,7 +4,7 @@ This is a template. Complete and ship as file LICENSE the following 2 lines (only) YEAR: -COPYRIGHT HOLDER: +COPYRIGHT HOLDER: and specify as From 1510b88f5ad375600d9bd87bb15f5a40b1294e10 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 30 May 2019 10:07:21 +0200 Subject: [PATCH 24/53] Fix import format and add pathlib.Path --- pandas/io/spss.py | 8 +++++--- pandas/tests/io/test_spss.py | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 84c1dfd171f7a..2c9ad52f67611 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -1,8 +1,10 @@ -from typing import Union, Sequence +from pathlib import Path +from typing import Sequence, Union + from pandas.core.api import DataFrame -def read_spss(path: str, +def read_spss(path: Union[str, Path], usecols: Union[str, Sequence[str], None] = None, categorical: bool = True) -> DataFrame: """ @@ -12,7 +14,7 @@ def read_spss(path: str, Parameters ---------- - path : string + path : string or Path File path usecols : str or list-like or None Return a subset of the columns. If None, return all columns. diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 71a8d1075c3bd..7b28535ab2020 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -6,7 +6,6 @@ from pandas.io.spss import read_spss - pyreadstat = pytest.importorskip("pyreadstat") From a6e5ad670ac639aa24294003843b61b1ca3adc75 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 30 May 2019 20:41:46 +0200 Subject: [PATCH 25/53] Use Optional to properly type an optional argument with default value of None --- pandas/io/spss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 2c9ad52f67611..c872ad5a53144 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -1,11 +1,11 @@ from pathlib import Path -from typing import Sequence, Union +from typing import Optional, Sequence, Union from pandas.core.api import DataFrame def read_spss(path: Union[str, Path], - usecols: Union[str, Sequence[str], None] = None, + usecols: Optional[Union[str, Sequence[str]]] = None, categorical: bool = True) -> DataFrame: """ Load an SPSS file from the file path, returning a DataFrame. From 78171364b73e484a95dbdc524a4c395b12f0a124 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Fri, 31 May 2019 09:43:10 +0200 Subject: [PATCH 26/53] Use conda-forge instead of PyPI --- ci/deps/azure-macos-35.yaml | 2 +- ci/deps/azure-windows-37.yaml | 3 +-- ci/deps/travis-37.yaml | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 50feac28ded44..77c105e24c7bf 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -14,6 +14,7 @@ dependencies: - numpy=1.13.3 - openpyxl - pyarrow + - pyreadstat - pytables - python=3.5.* - python-dateutil==2.6.1 @@ -28,4 +29,3 @@ dependencies: - pytest-xdist - pytest-mock - hypothesis>=3.58.0 - - pyreadstat diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 8464284368aeb..3a001c6473dc2 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -31,5 +31,4 @@ dependencies: - moto - hypothesis>=3.58.0 - isort - - pip: - - pyreadstat + - pyreadstat diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 7694f30a2abb0..253ca9fd4ffdc 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -19,6 +19,6 @@ dependencies: - hypothesis>=3.58.0 - s3fs - pip + - pyreadstat - pip: - - moto - - pyreadstat + - moto From f0702820fee78950537fa0565c46ccb5a704dedf Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Fri, 31 May 2019 09:53:54 +0200 Subject: [PATCH 27/53] Better ImportError message --- pandas/io/spss.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index c872ad5a53144..2b3c30f8c5a1d 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -25,8 +25,10 @@ def read_spss(path: Union[str, Path], ------- DataFrame """ - - from pyreadstat import read_sav + try: + from pyreadstat import read_sav + except ImportError: + raise ImportError("pyreadstat is required to read SPSS .sav files.") if usecols is not None: if isinstance(usecols, str): usecols = [usecols] From 8a52e4126f22aaf5d4dcd0495bb690c2ef32a6f1 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Fri, 31 May 2019 09:54:11 +0200 Subject: [PATCH 28/53] Use pd.read_spss --- pandas/tests/io/test_spss.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 7b28535ab2020..43a800e4ac758 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -4,8 +4,6 @@ import pandas as pd from pandas.util import testing as tm -from pandas.io.spss import read_spss - pyreadstat = pytest.importorskip("pyreadstat") @@ -13,12 +11,12 @@ def test_spss_labelled_num(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-num.sav") - df = read_spss(fname, categorical=True) + df = pd.read_spss(fname, categorical=True) expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) tm.assert_frame_equal(df, expected) - df = read_spss(fname, categorical=False) + df = pd.read_spss(fname, categorical=False) expected = pd.DataFrame({"VAR00002": 1.0}, index=[0]) tm.assert_frame_equal(df, expected) @@ -27,12 +25,12 @@ def test_spss_labelled_num_na(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-num-na.sav") - df = read_spss(fname, categorical=True) + df = pd.read_spss(fname, categorical=True) expected = pd.DataFrame({"VAR00002": ["This is one", None]}) expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) tm.assert_frame_equal(df, expected) - df = read_spss(fname, categorical=False) + df = pd.read_spss(fname, categorical=False) expected = pd.DataFrame({"VAR00002": [1.0, np.nan]}) tm.assert_frame_equal(df, expected) @@ -41,12 +39,12 @@ def test_spss_labelled_str(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-str.sav") - df = read_spss(fname, categorical=True) + df = pd.read_spss(fname, categorical=True) expected = pd.DataFrame({"gender": ["Male", "Female"]}) expected["gender"] = pd.Categorical(expected["gender"]) tm.assert_frame_equal(df, expected) - df = read_spss(fname, categorical=False) + df = pd.read_spss(fname, categorical=False) expected = pd.DataFrame({"gender": ["M", "F"]}) tm.assert_frame_equal(df, expected) @@ -55,7 +53,7 @@ def test_spss_umlauts(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "umlauts.sav") - df = read_spss(fname, categorical=True) + df = pd.read_spss(fname, categorical=True) expected = pd.DataFrame({"var1": ["the ä umlaut", "the ü umlaut", "the ä umlaut", @@ -63,6 +61,6 @@ def test_spss_umlauts(datapath): expected["var1"] = pd.Categorical(expected["var1"]) tm.assert_frame_equal(df, expected) - df = read_spss(fname, categorical=False) + df = pd.read_spss(fname, categorical=False) expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}) tm.assert_frame_equal(df, expected) From aa85f94e7358c6e4bc388e20873eab040cda80d5 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Fri, 31 May 2019 10:49:11 +0200 Subject: [PATCH 29/53] Add conda-forge on macOS task --- ci/deps/azure-macos-35.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 77c105e24c7bf..e0faa9814ae77 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -1,6 +1,7 @@ name: pandas-dev channels: - defaults + - conda-forge dependencies: - beautifulsoup4 - bottleneck From 0707fbf6bfcb24341a553b603199e18d78f8e9c8 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Mon, 3 Jun 2019 08:25:02 +0200 Subject: [PATCH 30/53] Use pyreadstat from pip for Python 3.5 (not available on conda-forge) --- ci/deps/azure-macos-35.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index e0faa9814ae77..b96a22befc45b 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -15,7 +15,6 @@ dependencies: - numpy=1.13.3 - openpyxl - pyarrow - - pyreadstat - pytables - python=3.5.* - python-dateutil==2.6.1 @@ -25,6 +24,7 @@ dependencies: - xlsxwriter - xlwt - pip: + - pyreadstat # universal - pytest==4.5.0 - pytest-xdist From cf6403f3370840445e212044cb87e4b3c18a77e4 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 6 Jun 2019 19:41:35 +0200 Subject: [PATCH 31/53] usecols only accepts list-like or None --- pandas/io/spss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 2b3c30f8c5a1d..c74db05837921 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -5,7 +5,7 @@ def read_spss(path: Union[str, Path], - usecols: Optional[Union[str, Sequence[str]]] = None, + usecols: Optional[Sequence[str]] = None, categorical: bool = True) -> DataFrame: """ Load an SPSS file from the file path, returning a DataFrame. @@ -16,7 +16,7 @@ def read_spss(path: Union[str, Path], ---------- path : string or Path File path - usecols : str or list-like or None + usecols : list-like or None Return a subset of the columns. If None, return all columns. categorical : bool Convert categorical columns into pd.Categorical. From f6a27478f7d26742418a73947ddc80483e8ec834 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 6 Jun 2019 19:42:40 +0200 Subject: [PATCH 32/53] Remove condition (str is not allowed anymore) --- pandas/io/spss.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index c74db05837921..be6e8c60bb1b6 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -29,9 +29,6 @@ def read_spss(path: Union[str, Path], from pyreadstat import read_sav except ImportError: raise ImportError("pyreadstat is required to read SPSS .sav files.") - if usecols is not None: - if isinstance(usecols, str): - usecols = [usecols] df, _ = read_sav(path, usecols=usecols, apply_value_formats=categorical) return df From af9dda9e5daf8e654b4a94761951ab66855c9093 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Sat, 8 Jun 2019 20:21:16 +0200 Subject: [PATCH 33/53] Rename to convert_categoricals --- pandas/io/spss.py | 6 +++--- pandas/tests/io/test_spss.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index be6e8c60bb1b6..39d434e2ae5f2 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -6,7 +6,7 @@ def read_spss(path: Union[str, Path], usecols: Optional[Sequence[str]] = None, - categorical: bool = True) -> DataFrame: + convert_categoricals: bool = True) -> DataFrame: """ Load an SPSS file from the file path, returning a DataFrame. @@ -18,7 +18,7 @@ def read_spss(path: Union[str, Path], File path usecols : list-like or None Return a subset of the columns. If None, return all columns. - categorical : bool + convert_categoricals : bool Convert categorical columns into pd.Categorical. Returns @@ -30,5 +30,5 @@ def read_spss(path: Union[str, Path], except ImportError: raise ImportError("pyreadstat is required to read SPSS .sav files.") df, _ = read_sav(path, usecols=usecols, - apply_value_formats=categorical) + apply_value_formats=convert_categoricals) return df diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 43a800e4ac758..f7f0f5bfcedc6 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -64,3 +64,15 @@ def test_spss_umlauts(datapath): df = pd.read_spss(fname, categorical=False) expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}) tm.assert_frame_equal(df, expected) + + +def test_spss_usecols(datapath): + # usecols must be list-like + fname = datapath("io", "data", "labelled-num.sav") + + usecols = "VAR00002" + msg = ("Argument 'usecols' has incorrect type " + "(expected list, got {})".format(type(usecols).__name__)) + + with pytest.raises(TypeError, match=msg): + df = pd.read_spss(fname, usecols=usecols) From d6d408e2361c0dacea924c538ba9b1de16d88d50 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Sat, 8 Jun 2019 20:21:29 +0200 Subject: [PATCH 34/53] Explicitly convert to list --- pandas/io/spss.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 39d434e2ae5f2..bce9c21f62e91 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -29,6 +29,9 @@ def read_spss(path: Union[str, Path], from pyreadstat import read_sav except ImportError: raise ImportError("pyreadstat is required to read SPSS .sav files.") + + usecols = list(usecols) # explicitly convert to list + df, _ = read_sav(path, usecols=usecols, apply_value_formats=convert_categoricals) return df From 1497a03d4315843e86675fbc70c79f9569cb263d Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Sat, 8 Jun 2019 20:23:14 +0200 Subject: [PATCH 35/53] Add pyreadstat to environment.yml --- environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 7db2ec72ccb3b..0912a1224530b 100644 --- a/environment.yml +++ b/environment.yml @@ -79,3 +79,5 @@ dependencies: - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - pip: + - pyreadstat # pandas.read_spss From 2d7a256fffa2bb5fc7e132d7e6d08832b37a5bba Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Sat, 8 Jun 2019 22:27:07 +0200 Subject: [PATCH 36/53] Use is_list_like --- pandas/io/spss.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index bce9c21f62e91..8d5cdc62a31f1 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -2,6 +2,7 @@ from typing import Optional, Sequence, Union from pandas.core.api import DataFrame +from pandas.api.types import is_list_like def read_spss(path: Union[str, Path], @@ -30,7 +31,11 @@ def read_spss(path: Union[str, Path], except ImportError: raise ImportError("pyreadstat is required to read SPSS .sav files.") - usecols = list(usecols) # explicitly convert to list + if usecols is not None: + if not is_list_like(usecols): + raise TypeError("usecols must be list-like.") + else: + usecols = list(usecols) # pyreadstat requires a list df, _ = read_sav(path, usecols=usecols, apply_value_formats=convert_categoricals) From bd56eee7ae08c5c080cb18b295ef461114b28cdf Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Mon, 10 Jun 2019 11:03:56 +0200 Subject: [PATCH 37/53] Fix tests --- pandas/tests/io/test_spss.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index f7f0f5bfcedc6..3c7a60f777e15 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -11,12 +11,12 @@ def test_spss_labelled_num(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-num.sav") - df = pd.read_spss(fname, categorical=True) + df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) tm.assert_frame_equal(df, expected) - df = pd.read_spss(fname, categorical=False) + df = pd.read_spss(fname, convert_categoricals=False) expected = pd.DataFrame({"VAR00002": 1.0}, index=[0]) tm.assert_frame_equal(df, expected) @@ -25,12 +25,12 @@ def test_spss_labelled_num_na(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-num-na.sav") - df = pd.read_spss(fname, categorical=True) + df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"VAR00002": ["This is one", None]}) expected["VAR00002"] = pd.Categorical(expected["VAR00002"]) tm.assert_frame_equal(df, expected) - df = pd.read_spss(fname, categorical=False) + df = pd.read_spss(fname, convert_categoricals=False) expected = pd.DataFrame({"VAR00002": [1.0, np.nan]}) tm.assert_frame_equal(df, expected) @@ -39,12 +39,12 @@ def test_spss_labelled_str(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "labelled-str.sav") - df = pd.read_spss(fname, categorical=True) + df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"gender": ["Male", "Female"]}) expected["gender"] = pd.Categorical(expected["gender"]) tm.assert_frame_equal(df, expected) - df = pd.read_spss(fname, categorical=False) + df = pd.read_spss(fname, convert_categoricals=False) expected = pd.DataFrame({"gender": ["M", "F"]}) tm.assert_frame_equal(df, expected) @@ -53,7 +53,7 @@ def test_spss_umlauts(datapath): # test file from the Haven project (https://haven.tidyverse.org/) fname = datapath("io", "data", "umlauts.sav") - df = pd.read_spss(fname, categorical=True) + df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"var1": ["the ä umlaut", "the ü umlaut", "the ä umlaut", @@ -61,7 +61,7 @@ def test_spss_umlauts(datapath): expected["var1"] = pd.Categorical(expected["var1"]) tm.assert_frame_equal(df, expected) - df = pd.read_spss(fname, categorical=False) + df = pd.read_spss(fname, convert_categoricals=False) expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}) tm.assert_frame_equal(df, expected) @@ -70,9 +70,5 @@ def test_spss_usecols(datapath): # usecols must be list-like fname = datapath("io", "data", "labelled-num.sav") - usecols = "VAR00002" - msg = ("Argument 'usecols' has incorrect type " - "(expected list, got {})".format(type(usecols).__name__)) - - with pytest.raises(TypeError, match=msg): - df = pd.read_spss(fname, usecols=usecols) + with pytest.raises(TypeError, match="usecols must be list-like."): + df = pd.read_spss(fname, usecols="VAR00002") From f68b516dcc09978fff03198413662b61ccae2b8f Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Mon, 10 Jun 2019 14:11:43 +0200 Subject: [PATCH 38/53] Fix df is assigned but never used --- pandas/tests/io/test_spss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 3c7a60f777e15..b9f58f9bf6cf6 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -71,4 +71,4 @@ def test_spss_usecols(datapath): fname = datapath("io", "data", "labelled-num.sav") with pytest.raises(TypeError, match="usecols must be list-like."): - df = pd.read_spss(fname, usecols="VAR00002") + pd.read_spss(fname, usecols="VAR00002") From ee14f2986708a797970fff145fc38e73a980b907 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Mon, 10 Jun 2019 14:24:44 +0200 Subject: [PATCH 39/53] Sort imports --- pandas/io/spss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 8d5cdc62a31f1..d3172d5521311 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -1,8 +1,8 @@ from pathlib import Path from typing import Optional, Sequence, Union -from pandas.core.api import DataFrame from pandas.api.types import is_list_like +from pandas.core.api import DataFrame def read_spss(path: Union[str, Path], From 15d7c71e8a4f6b6d3d78840376cf26a974a4bf06 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Mon, 10 Jun 2019 16:48:46 +0200 Subject: [PATCH 40/53] Update requirements-dev.txt --- requirements-dev.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index b40aa86e946b6..169af7da5e037 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -52,4 +52,5 @@ sqlalchemy xarray xlrd xlsxwriter -xlwt \ No newline at end of file +xlwt +pyreadstat \ No newline at end of file From a05de6c13ebaf59d5f42dbe2c953d16a4e4bc92b Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 11 Jun 2019 07:48:53 +0200 Subject: [PATCH 41/53] Remove isort --- ci/deps/azure-windows-37.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 3a001c6473dc2..5bdc29e0eec80 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -30,5 +30,4 @@ dependencies: - pytest-mock - moto - hypothesis>=3.58.0 - - isort - pyreadstat From 748fe618bcae54a579f6c46e75ad9b1a612c5988 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 11 Jun 2019 07:49:35 +0200 Subject: [PATCH 42/53] Improve docstring --- pandas/io/spss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index d3172d5521311..a1669d0adc0e9 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -17,9 +17,9 @@ def read_spss(path: Union[str, Path], ---------- path : string or Path File path - usecols : list-like or None + usecols : list-like, optional Return a subset of the columns. If None, return all columns. - convert_categoricals : bool + convert_categoricals : bool, default is True Convert categorical columns into pd.Categorical. Returns From ced48667c57b24cd185e59cdb592a6fa54ecf53e Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 11 Jun 2019 08:11:29 +0200 Subject: [PATCH 43/53] Revert indent --- ci/deps/travis-37.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 253ca9fd4ffdc..c9a8c274fb144 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -21,4 +21,4 @@ dependencies: - pip - pyreadstat - pip: - - moto + - moto From a18e0f5b5f9b6eeefaed11835d153ea8c050bc82 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Tue, 11 Jun 2019 08:12:30 +0200 Subject: [PATCH 44/53] Indent should be 2 spaces --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 0912a1224530b..de9bd67dd9f06 100644 --- a/environment.yml +++ b/environment.yml @@ -80,4 +80,4 @@ dependencies: - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - pip: - - pyreadstat # pandas.read_spss + - pyreadstat # pandas.read_spss From 913989d0a9a1743a5045fa592b2f751af77435f7 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 13 Jun 2019 08:14:13 +0200 Subject: [PATCH 45/53] Add minimum version for pyreadstat --- doc/source/install.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/install.rst b/doc/source/install.rst index db31d75e3013e..cc9a923536ff3 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -285,6 +285,7 @@ pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy pyarrow 0.9.0 Parquet and feather reading / writing pymysql MySQL engine for sqlalchemy +pyreadstat 0.2.5 SPSS files (.sav) reading qtpy Clipboard I/O s3fs 0.0.8 Amazon S3 access xarray 0.8.2 pandas-like API for N-dimensional data From 0abcde8efed8ef797fc71d385e08bb3e7ca4d781 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 13 Jun 2019 08:17:18 +0200 Subject: [PATCH 46/53] Add minimum version --- requirements-dev.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 169af7da5e037..13e8d6b540649 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -45,6 +45,7 @@ lxml openpyxl pyarrow>=0.9.0 pyqt +pyreadstat tables>=3.4.2 python-snappy s3fs @@ -53,4 +54,4 @@ xarray xlrd xlsxwriter xlwt -pyreadstat \ No newline at end of file +pyreadstat From 040af2bdadd21352bc506f99d7ef2dac1b0cbac4 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 13 Jun 2019 08:32:23 +0200 Subject: [PATCH 47/53] Use import_optional_dependency --- pandas/io/spss.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index a1669d0adc0e9..e462519bea097 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -2,6 +2,7 @@ from typing import Optional, Sequence, Union from pandas.api.types import is_list_like +from pandas.compat._optional import import_optional_dependency from pandas.core.api import DataFrame @@ -26,10 +27,7 @@ def read_spss(path: Union[str, Path], ------- DataFrame """ - try: - from pyreadstat import read_sav - except ImportError: - raise ImportError("pyreadstat is required to read SPSS .sav files.") + pyreadstat = import_optional_dependency("pyreadstat") if usecols is not None: if not is_list_like(usecols): @@ -37,6 +35,6 @@ def read_spss(path: Union[str, Path], else: usecols = list(usecols) # pyreadstat requires a list - df, _ = read_sav(path, usecols=usecols, - apply_value_formats=convert_categoricals) + df, _ = pyreadstat.read_sav(path, usecols=usecols, + apply_value_formats=convert_categoricals) return df From 53f56922d827a9771227edf1fe916fb17da699b4 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 13 Jun 2019 08:32:57 +0200 Subject: [PATCH 48/53] Remove minimum version for now --- doc/source/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index cc9a923536ff3..1c1f0c1d4cf8e 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -285,7 +285,7 @@ pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy pyarrow 0.9.0 Parquet and feather reading / writing pymysql MySQL engine for sqlalchemy -pyreadstat 0.2.5 SPSS files (.sav) reading +pyreadstat SPSS files (.sav) reading qtpy Clipboard I/O s3fs 0.0.8 Amazon S3 access xarray 0.8.2 pandas-like API for N-dimensional data From ceef885316953ef76c1d6ae11892a8018a0e4ac5 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Thu, 13 Jun 2019 10:24:20 +0200 Subject: [PATCH 49/53] Correct import order --- pandas/io/spss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index e462519bea097..b1b92fc2b8439 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -1,8 +1,9 @@ from pathlib import Path from typing import Optional, Sequence, Union -from pandas.api.types import is_list_like from pandas.compat._optional import import_optional_dependency + +from pandas.api.types import is_list_like from pandas.core.api import DataFrame From b232b617beaa51bee296d3b6c8d2edb570be163e Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Fri, 14 Jun 2019 15:14:12 +0200 Subject: [PATCH 50/53] Remove duplicate --- requirements-dev.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 13e8d6b540649..ab3ed4b96f9aa 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -54,4 +54,3 @@ xarray xlrd xlsxwriter xlwt -pyreadstat From b8b7fff3b206d225b79f3da9d21cb990a540c1bb Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Fri, 14 Jun 2019 15:19:11 +0200 Subject: [PATCH 51/53] Don't need conda-forge here --- ci/deps/azure-macos-35.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index b96a22befc45b..24c753e16d98d 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -1,7 +1,6 @@ name: pandas-dev channels: - defaults - - conda-forge dependencies: - beautifulsoup4 - bottleneck From 90702f3fad445632038d332a0c73d6f118d54a39 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Fri, 14 Jun 2019 15:22:45 +0200 Subject: [PATCH 52/53] Remove blank line --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index ab3ed4b96f9aa..3d6a1520cec0a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -53,4 +53,4 @@ sqlalchemy xarray xlrd xlsxwriter -xlwt +xlwt \ No newline at end of file From e55b8c420e9109a6b1b23a69b827944a5a9bc542 Mon Sep 17 00:00:00 2001 From: Clemens Brunner Date: Sat, 15 Jun 2019 09:05:12 +0200 Subject: [PATCH 53/53] Fix order --- requirements-dev.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 3d6a1520cec0a..169af7da5e037 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -45,7 +45,6 @@ lxml openpyxl pyarrow>=0.9.0 pyqt -pyreadstat tables>=3.4.2 python-snappy s3fs @@ -53,4 +52,5 @@ sqlalchemy xarray xlrd xlsxwriter -xlwt \ No newline at end of file +xlwt +pyreadstat \ No newline at end of file