Skip to content

Commit 910e29f

Browse files
committed
Merge remote-tracking branch 'upstream/main' into enh/convert_dtypes/pyarrow
2 parents b03ecd8 + 0189674 commit 910e29f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+754
-673
lines changed

.github/workflows/code-checks.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ jobs:
3939
with:
4040
extra_args: --verbose --all-files
4141

42-
docstring_typing_pylint:
43-
name: Docstring validation, typing, and pylint
42+
docstring_typing_manual_hooks:
43+
name: Docstring validation, typing, and other manual pre-commit hooks
4444
runs-on: ubuntu-22.04
4545
defaults:
4646
run:

.pre-commit-config.yaml

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,17 @@
11
minimum_pre_commit_version: 2.15.0
22
exclude: ^LICENSES/|\.(html|csv|svg)$
3-
# reserve "manual" for mypy and pyright
4-
default_stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg, post-checkout, post-commit, post-merge, post-rewrite]
3+
# reserve "manual" for relatively slow hooks which we still want to run in CI
4+
default_stages: [
5+
commit,
6+
merge-commit,
7+
push,
8+
prepare-commit-msg,
9+
commit-msg,
10+
post-checkout,
11+
post-commit,
12+
post-merge,
13+
post-rewrite
14+
]
515
ci:
616
autofix_prs: false
717
repos:
@@ -34,9 +44,11 @@ repos:
3444
- id: debug-statements
3545
- id: end-of-file-fixer
3646
exclude: \.txt$
37-
stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg, post-checkout, post-commit, post-merge, post-rewrite]
47+
stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg,
48+
post-checkout, post-commit, post-merge, post-rewrite]
3849
- id: trailing-whitespace
39-
stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg, post-checkout, post-commit, post-merge, post-rewrite]
50+
stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg,
51+
post-checkout, post-commit, post-merge, post-rewrite]
4052
- repo: https://github.com/cpplint/cpplint
4153
rev: 1.6.1
4254
hooks:
@@ -46,7 +58,13 @@ repos:
4658
# this particular codebase (e.g. src/headers, src/klib). However,
4759
# we can lint all header files since they aren't "generated" like C files are.
4860
exclude: ^pandas/_libs/src/(klib|headers)/
49-
args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir']
61+
args: [
62+
--quiet,
63+
'--extensions=c,h',
64+
'--headers=h',
65+
--recursive,
66+
'--filter=-readability/casting,-runtime/int,-build/include_subdir'
67+
]
5068
- repo: https://github.com/PyCQA/flake8
5169
rev: 6.0.0
5270
hooks:
@@ -107,6 +125,7 @@ repos:
107125
hooks:
108126
- id: yesqa
109127
additional_dependencies: *flake8_dependencies
128+
stages: [manual]
110129
- repo: local
111130
hooks:
112131
# NOTE: we make `black` a local hook because if it's installed from
@@ -214,7 +233,6 @@ repos:
214233
exclude: ^pandas/tests/extension/base/base\.py
215234
- id: pip-to-conda
216235
name: Generate pip dependency from conda
217-
description: This hook checks if the conda environment.yml and requirements-dev.txt are equal
218236
language: python
219237
entry: python scripts/generate_pip_deps_from_conda.py
220238
files: ^(environment.yml|requirements-dev.txt)$
@@ -311,6 +329,7 @@ repos:
311329
files: ^pandas
312330
exclude: ^(pandas/tests|pandas/_version.py|pandas/io/clipboard)
313331
language: python
332+
stages: [manual]
314333
additional_dependencies:
315334
- autotyping==22.9.0
316335
- libcst==0.4.7

ci/deps/actions-310.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ dependencies:
3333
- gcsfs
3434
- jinja2
3535
- lxml
36-
- matplotlib
36+
- matplotlib>=3.6.1
3737
- numba
3838
- numexpr
3939
- openpyxl

ci/deps/actions-38.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ dependencies:
3333
- gcsfs
3434
- jinja2
3535
- lxml
36-
- matplotlib
36+
- matplotlib>=3.6.1
3737
- numba
3838
- numexpr
3939
- openpyxl

ci/deps/actions-39.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ dependencies:
3333
- gcsfs
3434
- jinja2
3535
- lxml
36-
- matplotlib
36+
- matplotlib>=3.6.1
3737
- numba
3838
- numexpr
3939
- openpyxl

doc/source/user_guide/basics.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2312,6 +2312,7 @@ useful if you are reading in data which is mostly of the desired dtype (e.g. num
23122312
non-conforming elements intermixed that you want to represent as missing:
23132313

23142314
.. ipython:: python
2315+
:okwarning:
23152316
23162317
import datetime
23172318
@@ -2328,6 +2329,7 @@ The ``errors`` parameter has a third option of ``errors='ignore'``, which will s
23282329
encounters any errors with the conversion to a desired data type:
23292330

23302331
.. ipython:: python
2332+
:okwarning:
23312333
23322334
import datetime
23332335

doc/source/user_guide/io.rst

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -968,17 +968,7 @@ To parse the mixed-timezone values as a datetime column, pass a partially-applie
968968
Inferring datetime format
969969
+++++++++++++++++++++++++
970970

971-
If you have ``parse_dates`` enabled for some or all of your columns, and your
972-
datetime strings are all formatted the same way, you may get a large speed
973-
up by setting ``infer_datetime_format=True``. If set, pandas will attempt
974-
to guess the format of your datetime strings, and then use a faster means
975-
of parsing the strings. 5-10x parsing speeds have been observed. pandas
976-
will fallback to the usual parsing if either the format cannot be guessed
977-
or the format that was guessed cannot properly parse the entire column
978-
of strings. So in general, ``infer_datetime_format`` should not have any
979-
negative consequences if enabled.
980-
981-
Here are some examples of datetime strings that can be guessed (All
971+
Here are some examples of datetime strings that can be guessed (all
982972
representing December 30th, 2011 at 00:00:00):
983973

984974
* "20111230"
@@ -988,21 +978,36 @@ representing December 30th, 2011 at 00:00:00):
988978
* "30/Dec/2011 00:00:00"
989979
* "30/December/2011 00:00:00"
990980

991-
Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With
981+
Note that format inference is sensitive to ``dayfirst``. With
992982
``dayfirst=True``, it will guess "01/12/2011" to be December 1st. With
993983
``dayfirst=False`` (default) it will guess "01/12/2011" to be January 12th.
994984

985+
If you try to parse a column of date strings, pandas will attempt to guess the format
986+
from the first non-NaN element, and will then parse the rest of the column with that
987+
format. If pandas fails to guess the format (for example if your first string is
988+
``'01 December US/Pacific 2000'``), then a warning will be raised and each
989+
row will be parsed individually by ``dateutil.parser.parse``. The safest
990+
way to parse dates is to explicitly set ``format=``.
991+
995992
.. ipython:: python
996993
997-
# Try to infer the format for the index column
998994
df = pd.read_csv(
999995
"foo.csv",
1000996
index_col=0,
1001997
parse_dates=True,
1002-
infer_datetime_format=True,
1003998
)
1004999
df
10051000
1001+
In the case that you have mixed datetime formats within the same column, you'll need to
1002+
first read it in as an object dtype and then apply :func:`to_datetime` to each element.
1003+
1004+
.. ipython:: python
1005+
1006+
data = io.StringIO("date\n12 Jan 2000\n2000-01-13\n")
1007+
df = pd.read_csv(data)
1008+
df['date'] = df['date'].apply(pd.to_datetime)
1009+
df
1010+
10061011
.. ipython:: python
10071012
:suppress:
10081013

doc/source/user_guide/timeseries.rst

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ time.
132132

133133
.. ipython:: python
134134
135+
import datetime
136+
135137
pd.Timestamp(datetime.datetime(2012, 5, 1))
136138
pd.Timestamp("2012-05-01")
137139
pd.Timestamp(2012, 5, 1)
@@ -196,26 +198,25 @@ is converted to a ``DatetimeIndex``:
196198

197199
.. ipython:: python
198200
199-
pd.to_datetime(pd.Series(["Jul 31, 2009", "2010-01-10", None]))
201+
pd.to_datetime(pd.Series(["Jul 31, 2009", "Jan 10, 2010", None]))
200202
201-
pd.to_datetime(["2005/11/23", "2010.12.31"])
203+
pd.to_datetime(["2005/11/23", "2010/12/31"])
202204
203205
If you use dates which start with the day first (i.e. European style),
204206
you can pass the ``dayfirst`` flag:
205207

206208
.. ipython:: python
207-
:okwarning:
209+
:okwarning:
208210
209211
pd.to_datetime(["04-01-2012 10:00"], dayfirst=True)
210212
211-
pd.to_datetime(["14-01-2012", "01-14-2012"], dayfirst=True)
213+
pd.to_datetime(["04-14-2012 10:00"], dayfirst=True)
212214
213215
.. warning::
214216

215217
You see in the above example that ``dayfirst`` isn't strict. If a date
216218
can't be parsed with the day being first it will be parsed as if
217-
``dayfirst`` were False, and in the case of parsing delimited date strings
218-
(e.g. ``31-12-2012``) then a warning will also be raised.
219+
``dayfirst`` were ``False`` and a warning will also be raised.
219220

220221
If you pass a single string to ``to_datetime``, it returns a single ``Timestamp``.
221222
``Timestamp`` can also accept string input, but it doesn't accept string parsing

doc/source/whatsnew/v2.0.0.rst

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,38 @@ Optional libraries below the lowest tested version may still work, but are not c
417417

418418
See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
419419

420+
Datetimes are now parsed with a consistent format
421+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
422+
423+
In the past, :func:`to_datetime` guessed the format for each element independently. This was appropriate for some cases where elements had mixed date formats - however, it would regularly cause problems when users expected a consistent format but the function would switch formats between elements. As of version 2.0.0, parsing will use a consistent format, determined by the first non-NA value (unless the user specifies a format, in which case that is used).
424+
425+
*Old behavior*:
426+
427+
.. code-block:: ipython
428+
429+
In [1]: ser = pd.Series(['13-01-2000', '12-01-2000'])
430+
In [2]: pd.to_datetime(ser)
431+
Out[2]:
432+
0 2000-01-13
433+
1 2000-12-01
434+
dtype: datetime64[ns]
435+
436+
*New behavior*:
437+
438+
.. ipython:: python
439+
:okwarning:
440+
441+
ser = pd.Series(['13-01-2000', '12-01-2000'])
442+
pd.to_datetime(ser)
443+
444+
Note that this affects :func:`read_csv` as well.
445+
446+
If you still need to parse dates with inconsistent formats, you'll need to apply :func:`to_datetime`
447+
to each element individually, e.g. ::
448+
449+
ser = pd.Series(['13-01-2000', '12 January 2000'])
450+
ser.apply(pd.to_datetime)
451+
420452
.. _whatsnew_200.api_breaking.other:
421453

422454
Other API changes
@@ -459,7 +491,7 @@ Other API changes
459491

460492
Deprecations
461493
~~~~~~~~~~~~
462-
-
494+
- Deprecated argument ``infer_datetime_format`` in :func:`to_datetime` and :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
463495

464496
.. ---------------------------------------------------------------------------
465497
@@ -834,6 +866,7 @@ I/O
834866
- Improved error message in :func:`read_excel` by including the offending sheet name when an exception is raised while reading a file (:issue:`48706`)
835867
- Bug when pickling a subset of PyArrow-backed data that would serialize the entire data instead of the subset (:issue:`42600`)
836868
- Bug in :func:`read_csv` for a single-line csv with fewer columns than ``names`` raised :class:`.errors.ParserError` with ``engine="c"`` (:issue:`47566`)
869+
- Bug in displaying ``string`` dtypes not showing storage option (:issue:`50099`)
837870
- Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`)
838871
- Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
839872
- Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)

pandas/_libs/tslib.pyx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,10 @@ def array_with_unit_to_datetime(
300300
iresult = values.astype("i8", copy=False)
301301
# fill missing values by comparing to NPY_NAT
302302
mask = iresult == NPY_NAT
303+
# Trying to convert NaN to integer results in undefined
304+
# behaviour, so handle it explicitly (see GH #48705)
305+
if values.dtype.kind == "f":
306+
mask |= values != values
303307
iresult[mask] = 0
304308
fvalues = iresult.astype("f8") * mult
305309
need_to_iterate = False

pandas/_libs/tslibs/parsing.pyx

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,6 +1032,7 @@ def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None:
10321032
# rebuild string, capturing any inferred padding
10331033
dt_str = "".join(tokens)
10341034
if parsed_datetime.strftime(guessed_format) == dt_str:
1035+
_maybe_warn_about_dayfirst(guessed_format, dayfirst)
10351036
return guessed_format
10361037
else:
10371038
return None
@@ -1051,6 +1052,28 @@ cdef str _fill_token(token: str, padding: int):
10511052
token_filled = f"{seconds}.{nanoseconds}"
10521053
return token_filled
10531054

1055+
cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst):
1056+
"""Warn if guessed datetime format doesn't respect dayfirst argument."""
1057+
cdef:
1058+
int day_index = format.find("%d")
1059+
int month_index = format.find("%m")
1060+
1061+
if (day_index != -1) and (month_index != -1):
1062+
if (day_index > month_index) and dayfirst:
1063+
warnings.warn(
1064+
f"Parsing dates in {format} format when dayfirst=True was specified. "
1065+
"Pass `dayfirst=False` or specify a format to silence this warning.",
1066+
UserWarning,
1067+
stacklevel=find_stack_level(),
1068+
)
1069+
if (day_index < month_index) and not dayfirst:
1070+
warnings.warn(
1071+
f"Parsing dates in {format} format when dayfirst=False was specified. "
1072+
"Pass `dayfirst=True` or specify a format to silence this warning.",
1073+
UserWarning,
1074+
stacklevel=find_stack_level(),
1075+
)
1076+
10541077

10551078
@cython.wraparound(False)
10561079
@cython.boundscheck(False)

0 commit comments

Comments
 (0)