Description
Code Sample, a copy-pastable example if possible
import pandas as pd
from dateutil import parser as date_parser
def perform_operations_that_change_dtype(df):
    """Convert every datetime column of ``df`` to UTC, assigning via ``.loc``.

    Columns whose dtype kind is ``'M'`` are treated as datetime columns.
    A column without a time zone is first localized to GMT; every datetime
    column is then converted to UTC. All write-backs use ``df.loc[:, name]``,
    and ``df`` is modified in place (nothing is returned).

    The accompanying report states that, for a single-row DataFrame, this
    ``.loc``-based write-back changes the datetime columns to dtype object.
    """
    for name in df.columns:
        # Skip anything that is not a datetime column.
        if df[name].dtype.kind != 'M':
            continue

        if df[name].dt.tz is None:
            # Naive timestamps: attach GMT so the column becomes tz-aware.
            localized = df[name].dt.tz_localize('GMT')
            df.loc[:, name] = localized

        # Re-read the column from the frame before converting, so the
        # conversion operates on whatever the previous assignment stored.
        converted = df[name].dt.tz_convert('UTC')
        df.loc[:, name] = converted
def perform_operations_that_do_not_change_dtype(df):
    """Convert every datetime column of ``df`` to UTC, assigning via ``[]``.

    Columns whose dtype kind is ``'M'`` are treated as datetime columns.
    A column without a time zone is first localized to GMT; every datetime
    column is then converted to UTC. All write-backs use ``df[name] = ...``,
    and ``df`` is modified in place (nothing is returned).
    """
    for name in df.columns:
        # Skip anything that is not a datetime column.
        if df[name].dtype.kind != 'M':
            continue

        if df[name].dt.tz is None:
            # Naive timestamps: attach GMT so the column becomes tz-aware.
            localized = df[name].dt.tz_localize('GMT')
            df[name] = localized

        # Re-read the column from the frame before converting, so the
        # conversion operates on whatever the previous assignment stored.
        converted = df[name].dt.tz_convert('UTC')
        df[name] = converted
# Reproduction script: builds small DataFrames and checks the dtype of the
# 'reference_date' column before and after each conversion helper runs.
#
# Case 1: single-row DataFrame, converted with the helper that assigns
# through .loc (the 'broken' case).
broken_df = pd.DataFrame(
[{'reference_date': '2019-03-14T10:00:00Z', 'value': '0.0'}])
# Parse the date strings with dateutil; the assert below checks that the
# column then has dtype datetime64[ns, tzutc()].
broken_df['reference_date'] = broken_df['reference_date'].apply(
lambda datestr: date_parser.parse(datestr))
assert(broken_df['reference_date'].dtype == 'datetime64[ns, tzutc()]')
# After the .loc-based helper runs, reference_date is asserted to have
# dtype object instead of a datetime dtype.
perform_operations_that_change_dtype(broken_df)
assert(broken_df['reference_date'].dtype == 'object')
# Case 2: two-row DataFrame, converted with the same .loc-based helper
# (the 'working' case).
working_df = pd.DataFrame(
[{'reference_date': '2019-03-14T10:00:00Z', 'value': '0.0'},
{'reference_date': '2019-03-14T10:00:00Z', 'value': '0.0'}])
working_df['reference_date'] = working_df['reference_date'].apply(
lambda datestr: date_parser.parse(datestr))
assert(working_df['reference_date'].dtype == 'datetime64[ns, tzutc()]')
# This time the datetime type is preserved;
# only the tz part of the dtype changes (tzutc() -> UTC).
perform_operations_that_change_dtype(working_df)
assert(working_df['reference_date'].dtype == 'datetime64[ns, UTC]')
# Case 3: the same single-row DataFrame as in case 1, but converted with
# the helper that assigns through [] instead of .loc.
broken_df = pd.DataFrame(
[{'reference_date': '2019-03-14T10:00:00Z', 'value': '0.0'}])
broken_df['reference_date'] = broken_df['reference_date'].apply(
lambda datestr: date_parser.parse(datestr))
assert(broken_df['reference_date'].dtype == 'datetime64[ns, tzutc()]')
# With []-based assignment the column is asserted to keep a datetime dtype
# (datetime64[ns, UTC]) even though the frame has only one row.
perform_operations_that_do_not_change_dtype(broken_df)
assert(broken_df['reference_date'].dtype == 'datetime64[ns, UTC]')
Problem description
Hello all!
I am facing a particularly weird issue, which I have pinpointed to be caused by the `_setitem_with_indexer` method.
From the code above, you can see that when `loc` is used inside `perform_operations_that_change_dtype` to retrieve a column and change its timezone, the resulting series is of dtype `object` instead of `datetime`. This behavior is not present when using `[]`, as in `perform_operations_that_do_not_change_dtype`, and it only occurs for DataFrames with a single row. I know that the two approaches are different, but if both `loc` and `[]` return the same series, why is the resulting assignment different? From what I can see, something happens inside the `_setitem_with_indexer` method that changes the type.
One thing that might be the issue is the `_try_coerce_args` method inside `ObjectBlock` (link to line). Why is the type of `other` changed to object when it is a `DatetimeArray`?
The only similar issue I managed to find is this StackOverflow question that has 0 answers.
Thank you for looking at this issue and for creating and maintaining this amazing library! I hope my findings are enough to guide someone in helping me resolve this; I've been hammering at it the whole day.
Expected Output
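The dtype of the datetime column should remain a datetime dtype (`datetime64[ns, UTC]`) after the `.loc` assignment, rather than being changed to `object`.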
Output of pd.show_versions()
INSTALLED VERSIONS
commit: None
python: 3.7.2.final.0
python-bits: 64
OS: Windows
OS-release: 10
machine: AMD64
processor: Intel64 Family 6 Model 79 Stepping 1, GenuineIntel
byteorder: little
LC_ALL: None
LANG: None
LOCALE: None.None
pandas: 0.24.2
pytest: 4.3.0
pip: 19.0.3
setuptools: 40.8.0
Cython: None
numpy: 1.16.0
scipy: None
pyarrow: None
xarray: None
IPython: 7.2.0
sphinx: None
patsy: None
dateutil: 2.8.0
pytz: 2018.9
blosc: None
bottleneck: None
tables: None
numexpr: None
feather: None
matplotlib: 3.0.3
openpyxl: None
xlrd: 1.2.0
xlwt: None
xlsxwriter: None
lxml.etree: None
bs4: None
html5lib: None
sqlalchemy: None
pymysql: None
psycopg2: None
jinja2: 2.10
s3fs: None
fastparquet: None
pandas_gbq: None
pandas_datareader: None
gcsfs: None