diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 106530f84223a..d15c924d2dad2 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -45,6 +45,7 @@ jobs: (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) runs-on: ubuntu-24.04 env: + IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} outputs: sdist_file: ${{ steps.save-path.outputs.sdist_name }} @@ -117,6 +118,7 @@ jobs: python: ["cp313t", "3.13"] env: + IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} steps: - name: Checkout pandas @@ -164,7 +166,6 @@ jobs: uses: pypa/cibuildwheel@v3.2.1 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} - output-dir: ./dist env: CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} CIBW_BUILD_FRONTEND: ${{ matrix.cibw_build_frontend || 'pip' }} @@ -195,19 +196,26 @@ jobs: - name: Validate wheel RECORD shell: bash -el {0} - run: for whl in $(ls ./dist/*.whl); do wheel unpack $whl -d /tmp; done + run: for whl in $(ls wheelhouse); do wheel unpack wheelhouse/$whl -d /tmp; done - uses: actions/upload-artifact@v6 with: name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - path: ./dist/*.whl + path: ./wheelhouse/*.whl - name: Upload wheels & sdist - if: ${{ success() && env.IS_SCHEDULE_DISPATCH == 'true' }} - uses: scientific-python/upload-nightly-action@0.6.2 - with: - artifacts_path: ./dist - anaconda_nightly_upload_token: ${{secrets.PANDAS_NIGHTLY_UPLOAD_TOKEN}} + if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }} + shell: bash -el {0} + env: + PANDAS_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.PANDAS_NIGHTLY_UPLOAD_TOKEN }} + # trigger an upload to + # https://anaconda.org/scientific-python-nightly-wheels/pandas + # for cron jobs or "Run workflow" (restricted to main branch). + # The tokens were originally generated at anaconda.org + run: | + source ci/upload_wheels.sh + set_upload_vars + upload_wheels publish: if: > diff --git a/.gitignore b/.gitignore index 5177a195f99f9..a4a21293ab1ee 100644 --- a/.gitignore +++ b/.gitignore @@ -67,6 +67,7 @@ pandas/py.typed .ropeproject # wheel files *.whl +**/wheelhouse/* pip-wheel-metadata # coverage .coverage diff --git a/README.md b/README.md index c6e0a4b319930..2a91238ff48e2 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,7 @@ details, see the commit logs at https://github.com/pandas-dev/pandas. ## Dependencies - [NumPy - Adds support for large, multi-dimensional arrays, matrices and high-level mathematical functions to operate on these arrays](https://www.numpy.org) - [python-dateutil - Provides powerful extensions to the standard datetime module](https://dateutil.readthedocs.io/en/stable/index.html) -- [tzdata - Provides an IANA time zone database](https://tzdata.readthedocs.io/en/latest/) +- [tzdata - Provides an IANA time zone database](https://tzdata.readthedocs.io/en/latest/) (Only required on Windows/Emscripten) See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for minimum supported versions of required, recommended and optional dependencies. 
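The README and pyproject.toml changes in this diff make `tzdata` platform-conditional because CPython's `zoneinfo` consults the system IANA database on `zoneinfo.TZPATH` first (e.g. `/usr/share/zoneinfo` on Linux and macOS) and only falls back to the first-party `tzdata` package; Windows and Emscripten/Pyodide ship no system copy, so only they need the wheel. A minimal sketch of that lookup, illustrative only and not part of this diff:

```python
# Sketch: why tzdata is only required on Windows/Emscripten.
# zoneinfo searches the system database on zoneinfo.TZPATH first,
# then falls back to the tzdata package from PyPI.
import zoneinfo

try:
    print(zoneinfo.ZoneInfo("Europe/Brussels"))
except zoneinfo.ZoneInfoNotFoundError:
    # Neither a system database nor the tzdata package is available,
    # e.g. a bare Windows environment without `pip install tzdata`.
    print("IANA time zone data missing; install the tzdata package")
```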
diff --git a/ci/upload_wheels.sh b/ci/upload_wheels.sh new file mode 100644 index 0000000000000..92959e8716a80 --- /dev/null +++ b/ci/upload_wheels.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Modified from numpy's https://github.com/numpy/numpy/blob/main/tools/wheels/upload_wheels.sh + +set_upload_vars() { + echo "IS_SCHEDULE_DISPATCH is $IS_SCHEDULE_DISPATCH" + if [[ "$IS_SCHEDULE_DISPATCH" == "true" ]]; then + echo scheduled or dispatched event + export ANACONDA_ORG="scientific-python-nightly-wheels" + export TOKEN="$PANDAS_NIGHTLY_UPLOAD_TOKEN" + export ANACONDA_UPLOAD="true" + else + echo non-dispatch event + export ANACONDA_UPLOAD="false" + fi +} +upload_wheels() { + echo "${PWD}" + if [[ ${ANACONDA_UPLOAD} == true ]]; then + if [ -z "${TOKEN}" ]; then + echo no token set, not uploading + else + # sdists are located under dist folder when built through setup.py + if compgen -G "./dist/*.gz"; then + echo "Found sdist" + anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./dist/*.gz + echo "Uploaded sdist" + fi + if compgen -G "./wheelhouse/*.whl"; then + echo "Found wheel" + anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./wheelhouse/*.whl + echo "Uploaded wheel" + fi + echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" + fi + fi +} diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index bb3290e18e26b..33804a57c2bf9 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -150,9 +150,11 @@ Package Minimum support ================================================================ ========================== `NumPy `__ 1.26.0 `python-dateutil `__ 2.8.2 -`tzdata `__ 2023.3 +`tzdata `__ \* ================================================================ ========================== +\* ``tzdata`` is only required on Windows and Pyodide (Emscripten). + Generally, the minimum supported version is ~2 years old from the release date of a major or minor pandas version. .. _install.optional_dependencies: diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7de1ffbacbde6..7d65ca781d81e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -686,8 +686,6 @@ The following required dependencies were updated: +=================+======================+ | numpy | 1.26.0 | +-----------------+----------------------+ -| tzdata | 2023.3 | -+-----------------+----------------------+ For `optional libraries `_ the general recommendation is to use the latest version. The following table lists the lowest version per library that is currently being tested throughout the development of pandas.
@@ -1229,6 +1227,7 @@ Indexing - Bug in :meth:`DataFrame.loc.__getitem__` and :meth:`DataFrame.iloc.__getitem__` with a :class:`CategoricalDtype` column with integer categories raising when trying to index a row containing a ``NaN`` entry (:issue:`58954`) - Bug in :meth:`Index.__getitem__` incorrectly raising with a 0-dim ``np.ndarray`` key (:issue:`55601`) - Bug in :meth:`Index.get_indexer` not casting missing values correctly for new string datatype (:issue:`55833`) +- Bug in :meth:`Index.intersection`, :meth:`Index.union`, :meth:`MultiIndex.intersection`, and :meth:`MultiIndex.union` returning a reference to the original Index instead of a new instance when operating on identical indexes, which could cause metadata corruption when modifying the result (:issue:`63169`) - Bug in adding new rows with :meth:`DataFrame.loc.__setitem__` or :class:`Series.loc.__setitem__` which failed to retain dtype on the object's index in some cases (:issue:`41626`) - Bug in indexing on a :class:`DatetimeIndex` with a ``timestamp[pyarrow]`` dtype or on a :class:`TimedeltaIndex` with a ``duration[pyarrow]`` dtype (:issue:`62277`) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index b02ddbcaf18e9..8218c5196e0c0 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -59,9 +59,6 @@ cdef bint is_utc_zoneinfo(tzinfo tz): utc_zoneinfo = zoneinfo.ZoneInfo("UTC") except zoneinfo.ZoneInfoNotFoundError: return False - # Warn if tzdata is too old, even if there is a system tzdata to alert - # users about the mismatch between local/system tzdata - import_optional_dependency("tzdata", errors="warn", min_version="2022.7") return tz is utc_zoneinfo diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 11928e79ffc62..ff01d4ac835ba 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -208,6 +208,10 @@ def _hash_pandas_object( values, encoding=encoding, hash_key=hash_key, categorize=categorize ) + def _cast_pointwise_result(self, values: ArrayLike) -> ArrayLike: + values = np.asarray(values, dtype=object) + return lib.maybe_convert_objects(values, convert_non_numeric=True) + # Signature of "argmin" incompatible with supertype "ExtensionArray" def argmin(self, axis: AxisInt = 0, skipna: bool = True): # type: ignore[override] # override base class by adding axis keyword diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index edf1d7ddcaa76..d2d67dd644303 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -442,7 +442,8 @@ def _cast_pointwise_result(self, values) -> ArrayLike: # e.g. 
test_by_column_values_with_same_starting_value with nested # values, one entry of which is an ArrowStringArray # or test_agg_lambda_complex128_dtype_conversion for complex values - return super()._cast_pointwise_result(values) + values = np.asarray(values, dtype=object) + return lib.maybe_convert_objects(values, convert_non_numeric=True) if pa.types.is_null(arr.type): if lib.infer_dtype(values) == "decimal": @@ -498,7 +499,8 @@ def _cast_pointwise_result(self, values) -> ArrayLike: if self.dtype.na_value is np.nan: # ArrowEA has different semantics, so we return a numpy-based # result instead - return super()._cast_pointwise_result(values) + values = np.asarray(values, dtype=object) + return lib.maybe_convert_objects(values, convert_non_numeric=True) return ArrowExtensionArray(arr) return self._from_pyarrow_array(arr) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 225cc888d50db..d31e3daa55d8c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -19,6 +19,7 @@ cast, overload, ) +import warnings import numpy as np @@ -33,6 +34,7 @@ cache_readonly, set_module, ) +from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_bool_kwarg, validate_insert_loc, @@ -86,6 +88,7 @@ AstypeArg, AxisInt, Dtype, + DtypeObj, FillnaOptions, InterpolateOptions, NumpySorter, @@ -383,13 +386,67 @@ def _from_factorized(cls, values, original): """ raise AbstractMethodError(cls) + @classmethod + def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self: + """ + Strict analogue to _from_sequence, allowing only sequences of scalars + that should be specifically inferred to the given dtype. + + Parameters + ---------- + scalars : sequence + dtype : ExtensionDtype + + Raises + ------ + TypeError or ValueError + + Notes + ----- + This is called in a try/except block when casting the result of a + pointwise operation in ExtensionArray._cast_pointwise_result. + """ + try: + return cls._from_sequence(scalars, dtype=dtype, copy=False) + except (ValueError, TypeError): + raise + except Exception: + warnings.warn( + "_from_scalars should only raise ValueError or TypeError. " + "Consider overriding _from_scalars where appropriate.", + stacklevel=find_stack_level(), + ) + raise + def _cast_pointwise_result(self, values) -> ArrayLike: """ + Construct an ExtensionArray after a pointwise operation. + Cast the result of a pointwise operation (e.g. Series.map) to an - array, preserve dtype_backend if possible. + array. This is not required to return an ExtensionArray of the same + type as self or of the same dtype. It can also return another + ExtensionArray of the same "family" if you implement multiple + ExtensionArrays/Dtypes that are interoperable (e.g. if you have a float + array with units, this method can return an int array with units). + + If converting to your own ExtensionArray is not possible, this method + falls back to returning an array with the default type inference. + If you only need to cast to `self.dtype`, it is recommended to override + `_from_scalars` instead of this method.
+ + Parameters + ---------- + values : sequence + + Returns + ------- + ExtensionArray or ndarray """ - values = np.asarray(values, dtype=object) - return lib.maybe_convert_objects(values, convert_non_numeric=True) + try: + return type(self)._from_scalars(values, dtype=self.dtype) + except (ValueError, TypeError): + values = np.asarray(values, dtype=object) + return lib.maybe_convert_objects(values, convert_non_numeric=True) # ------------------------------------------------------------------------ # Must be a Sequence diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c15a196dc6727..86140229b724e 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -622,7 +622,8 @@ def _from_factorized(cls, values, original) -> Self: return cls(values, dtype=original.dtype) def _cast_pointwise_result(self, values): - result = super()._cast_pointwise_result(values) + values = np.asarray(values, dtype=object) + result = lib.maybe_convert_objects(values, convert_non_numeric=True) if result.dtype.kind == self.dtype.kind: try: # e.g. test_groupby_agg_extension diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8bad206eea028..3506a8e3a5279 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -210,6 +210,26 @@ def _from_sequence( result = scalars._data result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) pa_arr = pa.array(result, mask=na_values, type=pa.large_string()) + elif isinstance(scalars, ArrowExtensionArray): + pa_type = scalars._pa_array.type + # Use PyArrow's native cast for integer, string, and boolean types. + # Float has different representation in PyArrow: 1.0 -> "1" instead + # of "1.0", and uses different scientific notation (1e+10 vs 1e10). + # Boolean needs capitalize (true -> True, false -> False). + if ( + pa.types.is_integer(pa_type) + or pa.types.is_large_string(pa_type) + or pa.types.is_string(pa_type) + or pa.types.is_boolean(pa_type) + ): + pa_arr = pc.cast(scalars._pa_array, pa.large_string()) + if pa.types.is_boolean(pa_type): + pa_arr = pc.utf8_capitalize(pa_arr) + else: + # Fall back for types where PyArrow's string representation + # differs from Python's str() + result = lib.ensure_string_array(scalars, copy=copy) + pa_arr = pa.array(result, type=pa.large_string(), from_pandas=True) elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): pa_arr = pc.cast(scalars, pa.large_string()) else: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9d6af3c7b9917..a7535835f4b84 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5582,51 +5582,72 @@ def _reindex_multi(self, axes: dict[str, Index], fill_value) -> DataFrame: fill_value=fill_value, ) - @Appender( + def set_axis( + self, + labels, + *, + axis: Axis = 0, + copy: bool | lib.NoDefault = lib.no_default, + ) -> DataFrame: """ + Assign desired index to given axis. + + Indexes for column or row labels can be changed by assigning + a list-like or Index. + + Parameters + ---------- + labels : list-like, Index + The values for the new index. + + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to update. The value 0 identifies the rows. For `Series` + this parameter is unused and defaults to 0. + + copy : bool, default False + This keyword is now ignored; changing its value will have no + impact on the method. + + .. deprecated:: 3.0.0 + + This keyword is ignored and will be removed in pandas 4.0. 
Since + pandas 3.0, this method always returns a new object using a lazy + copy mechanism that defers copies until necessary + (Copy-on-Write). See the `user guide on Copy-on-Write + `__ + for more details. + + Returns + ------- + DataFrame + An object of type DataFrame. + + See Also + -------- + DataFrame.rename_axis : Alter the name of the index or columns. + Examples -------- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) Change the row labels. - >>> df.set_axis(['a', 'b', 'c'], axis='index') - A B + >>> df.set_axis(["a", "b", "c"], axis="index") + A B a 1 4 b 2 5 c 3 6 Change the column labels. - >>> df.set_axis(['I', 'II'], axis='columns') - I II + >>> df.set_axis(["I", "II"], axis="columns") + I II 0 1 4 1 2 5 2 3 6 """ - ) - @Substitution( - klass=_shared_doc_kwargs["klass"], - axes_single_arg=_shared_doc_kwargs["axes_single_arg"], - extended_summary_sub=" column or", - axis_description_sub=", and 1 identifies the columns", - see_also_sub=" or columns", - ) - @Appender(NDFrame.set_axis.__doc__) - def set_axis( - self, - labels, - *, - axis: Axis = 0, - copy: bool | lib.NoDefault = lib.no_default, - ) -> DataFrame: return super().set_axis(labels, axis=axis, copy=copy) - @doc( - NDFrame.reindex, - klass=_shared_doc_kwargs["klass"], - optional_reindex=_shared_doc_kwargs["optional_reindex"], - ) def reindex( self, labels=None, @@ -5641,6 +5662,227 @@ def reindex( limit: int | None = None, tolerance=None, ) -> DataFrame: + """ + Conform DataFrame to new index with optional filling logic. + + Places NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + ``copy=False``. + + Parameters + ---------- + + labels : array-like, optional + New labels / index to conform the axis specified by 'axis' to. + index : array-like, optional + New labels for the index. Preferably an Index object to avoid + duplicating data. + columns : array-like, optional + New labels for the columns. Preferably an Index object to avoid + duplicating data. + axis : int or str, optional + Axis to target. Can be either the axis name ('index', 'columns') + or number (0, 1). + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. + + * None (default): don't fill gaps + * pad / ffill: Propagate last valid observation forward to next + valid. + * backfill / bfill: Use next valid observation to fill gap. + * nearest: Use nearest valid observations to fill gap. + + copy : bool, default False + This keyword is now ignored; changing its value will have no + impact on the method. + + .. deprecated:: 3.0.0 + + This keyword is ignored and will be removed in pandas 4.0. Since + pandas 3.0, this method always returns a new object using a lazy + copy mechanism that defers copies until necessary + (Copy-on-Write). See the `user guide on Copy-on-Write + `__ + for more details. + + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level. + fill_value : scalar, default np.nan + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value. + limit : int, default None + Maximum number of consecutive elements to forward or backward fill. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. 
The values of the index at the matching locations must + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. + + Returns + ------- + DataFrame + DataFrame with changed index. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex_like : Change to same indices as other DataFrame. + + Examples + -------- + ``DataFrame.reindex`` supports two calling conventions + + * ``(index=index_labels, columns=column_labels, ...)`` + * ``(labels, axis={'index', 'columns'}, ...)`` + + We *highly* recommend using keyword arguments to clarify your + intent. + + Create a DataFrame with some fictional data. + + >>> index = ["Firefox", "Chrome", "Safari", "IE10", "Konqueror"] + >>> columns = ["http_status", "response_time"] + >>> df = pd.DataFrame( + ... [[200, 0.04], [200, 0.02], [404, 0.07], [404, 0.08], [301, 1.0]], + ... columns=columns, + ... index=index, + ... ) + >>> df + http_status response_time + Firefox 200 0.04 + Chrome 200 0.02 + Safari 404 0.07 + IE10 404 0.08 + Konqueror 301 1.00 + + Create a new index and reindex the DataFrame. By default + values in the new index that do not have corresponding + records in the DataFrame are assigned ``NaN``. + + >>> new_index = ["Safari", "Iceweasel", "Comodo Dragon", "IE10", "Chrome"] + >>> df.reindex(new_index) + http_status response_time + Safari 404.0 0.07 + Iceweasel NaN NaN + Comodo Dragon NaN NaN + IE10 404.0 0.08 + Chrome 200.0 0.02 + + We can fill in the missing values by passing a value to + the keyword ``fill_value``. Because the index is not monotonically + increasing or decreasing, we cannot use arguments to the keyword + ``method`` to fill the ``NaN`` values. + + >>> df.reindex(new_index, fill_value=0) + http_status response_time + Safari 404 0.07 + Iceweasel 0 0.00 + Comodo Dragon 0 0.00 + IE10 404 0.08 + Chrome 200 0.02 + + >>> df.reindex(new_index, fill_value="missing") + http_status response_time + Safari 404 0.07 + Iceweasel missing missing + Comodo Dragon missing missing + IE10 404 0.08 + Chrome 200 0.02 + + We can also reindex the columns. + + >>> df.reindex(columns=["http_status", "user_agent"]) + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + Or we can use "axis-style" keyword arguments + + >>> df.reindex(["http_status", "user_agent"], axis="columns") + http_status user_agent + Firefox 200 NaN + Chrome 200 NaN + Safari 404 NaN + IE10 404 NaN + Konqueror 301 NaN + + To further illustrate the filling functionality in + ``reindex``, we will create a DataFrame with a + monotonically increasing index (for example, a sequence + of dates). + + >>> date_index = pd.date_range("1/1/2010", periods=6, freq="D") + >>> df2 = pd.DataFrame( + ... {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index + ... ) + >>> df2 + prices + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + + Suppose we decide to expand the DataFrame to cover a wider + date range.
+ + >>> date_index2 = pd.date_range("12/29/2009", periods=10, freq="D") + >>> df2.reindex(date_index2) + prices + 2009-12-29 NaN + 2009-12-30 NaN + 2009-12-31 NaN + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + The index entries that did not have a value in the original data frame + (for example, '2009-12-29') are by default filled with ``NaN``. + If desired, we can fill in the missing values using one of several + options. + + For example, to back-propagate the last valid value to fill the ``NaN`` + values, pass ``bfill`` as an argument to the ``method`` keyword. + + >>> df2.reindex(date_index2, method="bfill") + prices + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 + 2010-01-03 NaN + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 + 2010-01-07 NaN + + Please note that the ``NaN`` value present in the original DataFrame + (at index value 2010-01-03) will not be filled by any of the + value propagation schemes. This is because filling while reindexing + does not look at DataFrame values, but only compares the original and + desired indexes. If you do want to fill in the ``NaN`` values present + in the original DataFrame, use the ``fillna()`` method. + + See the :ref:`user guide ` for more. + """ return super().reindex( labels=labels, index=index, @@ -6129,7 +6371,6 @@ def _replace_columnwise( return res if inplace else res.__finalize__(self) - @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift( self, periods: int | Sequence[int] = 1, @@ -6138,6 +6379,120 @@ def shift( fill_value: Hashable = lib.no_default, suffix: str | None = None, ) -> DataFrame: + """ + Shift index by desired number of periods with an optional time `freq`. + + When `freq` is not passed, shift the index without realigning the data. + If `freq` is passed (in this case, the index must be date or datetime, + or it will raise a `NotImplementedError`), the index will be + increased using the periods and the `freq`. `freq` can be inferred + when specified as "infer" as long as either freq or inferred_freq + attribute is set in the index. + + Parameters + ---------- + periods : int or Sequence + Number of periods to shift. Can be positive or negative. + If an iterable of ints, the data will be shifted once by each int. + This is equivalent to shifting by one value at a time and + concatenating all resulting frames. The resulting columns will have + the shift suffixed to their column names. For multiple periods, + axis must not be 1. + freq : DateOffset, tseries.offsets, timedelta, or str, optional + Offset to use from the tseries module or time rule (e.g. 'EOM'). + If `freq` is specified then the index values are shifted but the + data is not realigned. That is, use `freq` if you would like to + extend the index when shifting and preserve the original data. + If `freq` is specified as "infer" then it will be inferred from + the freq or inferred_freq attributes of the index. If neither of + those attributes exist, a ValueError is thrown. + axis : {0 or 'index', 1 or 'columns', None}, default None + Shift direction. For `Series` this parameter is unused and defaults to 0. + fill_value : object, optional + The scalar value to use for newly introduced missing values. + The default depends on the dtype of `self`. + For Boolean and numeric NumPy data types, ``np.nan`` is used. + For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
+ For extension dtypes, ``self.dtype.na_value`` is used. + suffix : str, optional + If str and periods is an iterable, this is added after the column + name and before the shift value for each shifted column name. + For `Series` this parameter is unused and defaults to `None`. + + Returns + ------- + DataFrame + Copy of input object, shifted. + + See Also + -------- + Index.shift : Shift values of Index. + DatetimeIndex.shift : Shift values of DatetimeIndex. + PeriodIndex.shift : Shift values of PeriodIndex. + + Examples + -------- + >>> df = pd.DataFrame( + ... [[10, 13, 17], [20, 23, 27], [15, 18, 22], [30, 33, 37], [45, 48, 52]], + ... columns=["Col1", "Col2", "Col3"], + ... index=pd.date_range("2020-01-01", "2020-01-05"), + ... ) + >>> df + Col1 Col2 Col3 + 2020-01-01 10 13 17 + 2020-01-02 20 23 27 + 2020-01-03 15 18 22 + 2020-01-04 30 33 37 + 2020-01-05 45 48 52 + + >>> df.shift(periods=3) + Col1 Col2 Col3 + 2020-01-01 NaN NaN NaN + 2020-01-02 NaN NaN NaN + 2020-01-03 NaN NaN NaN + 2020-01-04 10.0 13.0 17.0 + 2020-01-05 20.0 23.0 27.0 + + >>> df.shift(periods=1, axis="columns") + Col1 Col2 Col3 + 2020-01-01 NaN 10 13 + 2020-01-02 NaN 20 23 + 2020-01-03 NaN 15 18 + 2020-01-04 NaN 30 33 + 2020-01-05 NaN 45 48 + + >>> df.shift(periods=3, fill_value=0) + Col1 Col2 Col3 + 2020-01-01 0 0 0 + 2020-01-02 0 0 0 + 2020-01-03 0 0 0 + 2020-01-04 10 13 17 + 2020-01-05 20 23 27 + + >>> df.shift(periods=3, freq="D") + Col1 Col2 Col3 + 2020-01-04 10 13 17 + 2020-01-05 20 23 27 + 2020-01-06 15 18 22 + 2020-01-07 30 33 37 + 2020-01-08 45 48 52 + + >>> df.shift(periods=3, freq="infer") + Col1 Col2 Col3 + 2020-01-04 10 13 17 + 2020-01-05 20 23 27 + 2020-01-06 15 18 22 + 2020-01-07 30 33 37 + 2020-01-08 45 48 52 + + >>> df["Col1"].shift(periods=[0, 1, 2]) + Col1_0 Col1_1 Col1_2 + 2020-01-01 10 NaN NaN + 2020-01-02 20 10.0 NaN + 2020-01-03 15 20.0 10.0 + 2020-01-04 30 15.0 20.0 + 2020-01-05 45 30.0 15.0 + """ if freq is not None and fill_value is not lib.no_default: # GH#53832 raise ValueError( @@ -8743,12 +9098,316 @@ def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None): new_data = self._dispatch_frame_op(other, op, axis=axis) return self._construct_result(new_data, other=other) - @Appender(ops.make_flex_doc("eq", "dataframe")) def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame: + """ + Get Equal to of dataframe and other, element-wise (binary operator `eq`). + + Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison + operators. + + Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis + (rows or columns) and level for comparison. + + Parameters + ---------- + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : {0 or 'index', 1 or 'columns'}, default 'columns' + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). + level : int or label + Broadcast across a level, matching Index values on the passed + MultiIndex level. + + Returns + ------- + DataFrame of bool + Result of the comparison. + + See Also + -------- + DataFrame.eq : Compare DataFrames for equality elementwise. + DataFrame.ne : Compare DataFrames for inequality elementwise. + DataFrame.le : Compare DataFrames for less than inequality + or equality elementwise. + DataFrame.lt : Compare DataFrames for strictly less than + inequality elementwise. + DataFrame.ge : Compare DataFrames for greater than inequality + or equality elementwise.
+ DataFrame.gt : Compare DataFrames for strictly greater than + inequality elementwise. + + Notes + ----- + Mismatched indices will be unioned together. + `NaN` values are considered different (i.e. `NaN` != `NaN`). + + Examples + -------- + >>> df = pd.DataFrame( + ... {"cost": [250, 150, 100], "revenue": [100, 250, 300]}, + ... index=["A", "B", "C"], + ... ) + >>> df + cost revenue + A 250 100 + B 150 250 + C 100 300 + + Comparison with a scalar, using either the operator or method: + + >>> df == 100 + cost revenue + A False True + B False False + C True False + + >>> df.eq(100) + cost revenue + A False True + B False False + C True False + + When `other` is a :class:`Series`, the columns of a DataFrame are aligned + with the index of `other` and broadcast: + + >>> df != pd.Series([100, 250], index=["cost", "revenue"]) + cost revenue + A True True + B True False + C False True + + Use the method to control the broadcast axis: + + >>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis="index") + cost revenue + A True False + B True True + C True True + D True True + + When comparing to an arbitrary sequence, the number of columns must + match the number of elements in `other`: + + >>> df == [250, 100] + cost revenue + A True True + B False False + C False False + + Use the method to control the axis: + + >>> df.eq([250, 250, 100], axis="index") + cost revenue + A True False + B False True + C True False + + Compare to a DataFrame of different shape. + + >>> other = pd.DataFrame( + ... {"revenue": [300, 250, 100, 150]}, index=["A", "B", "C", "D"] + ... ) + >>> other + revenue + A 300 + B 250 + C 100 + D 150 + + >>> df.gt(other) + cost revenue + A False False + B False False + C False True + D False False + + Compare to a MultiIndex by level. + + >>> df_multindex = pd.DataFrame( + ... { + ... "cost": [250, 150, 100, 150, 300, 220], + ... "revenue": [100, 250, 300, 200, 175, 225], + ... }, + ... index=[ + ... ["Q1", "Q1", "Q1", "Q2", "Q2", "Q2"], + ... ["A", "B", "C", "A", "B", "C"], + ... ], + ... ) + >>> df_multindex + cost revenue + Q1 A 250 100 + B 150 250 + C 100 300 + Q2 A 150 200 + B 300 175 + C 220 225 + + >>> df.le(df_multindex, level=1) + cost revenue + Q1 A True True + B True True + C True True + Q2 A False True + B True False + C True False + """ return self._flex_cmp_method(other, operator.eq, axis=axis, level=level) - @Appender(ops.make_flex_doc("ne", "dataframe")) def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame: + """ + Get Not equal to of dataframe and other, element-wise (binary operator `ne`). + + Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison + operators. + + Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis + (rows or columns) and level for comparison. + + Parameters + ---------- + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : {0 or 'index', 1 or 'columns'}, default 'columns' + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). + level : int or label + Broadcast across a level, matching Index values on the passed + MultiIndex level. + + Returns + ------- + DataFrame of bool + Result of the comparison. + + See Also + -------- + DataFrame.eq : Compare DataFrames for equality elementwise. + DataFrame.ne : Compare DataFrames for inequality elementwise. + DataFrame.le : Compare DataFrames for less than inequality + or equality elementwise.
+ DataFrame.lt : Compare DataFrames for strictly less than + inequality elementwise. + DataFrame.ge : Compare DataFrames for greater than inequality + or equality elementwise. + DataFrame.gt : Compare DataFrames for strictly greater than + inequality elementwise. + + Notes + ----- + Mismatched indices will be unioned together. + `NaN` values are considered different (i.e. `NaN` != `NaN`). + + Examples + -------- + >>> df = pd.DataFrame( + ... {"cost": [250, 150, 100], "revenue": [100, 250, 300]}, + ... index=["A", "B", "C"], + ... ) + >>> df + cost revenue + A 250 100 + B 150 250 + C 100 300 + + Comparison with a scalar, using either the operator or method: + + >>> df == 100 + cost revenue + A False True + B False False + C True False + + >>> df.eq(100) + cost revenue + A False True + B False False + C True False + + When `other` is a :class:`Series`, the columns of a DataFrame are aligned + with the index of `other` and broadcast: + + >>> df != pd.Series([100, 250], index=["cost", "revenue"]) + cost revenue + A True True + B True False + C False True + + Use the method to control the broadcast axis: + + >>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis="index") + cost revenue + A True False + B True True + C True True + D True True + + When comparing to an arbitrary sequence, the number of columns must + match the number of elements in `other`: + + >>> df == [250, 100] + cost revenue + A True True + B False False + C False False + + Use the method to control the axis: + + >>> df.eq([250, 250, 100], axis="index") + cost revenue + A True False + B False True + C True False + + Compare to a DataFrame of different shape. + + >>> other = pd.DataFrame( + ... {"revenue": [300, 250, 100, 150]}, index=["A", "B", "C", "D"] + ... ) + >>> other + revenue + A 300 + B 250 + C 100 + D 150 + + >>> df.gt(other) + cost revenue + A False False + B False False + C False True + D False False + + Compare to a MultiIndex by level. + + >>> df_multindex = pd.DataFrame( + ... { + ... "cost": [250, 150, 100, 150, 300, 220], + ... "revenue": [100, 250, 300, 200, 175, 225], + ... }, + ... index=[ + ... ["Q1", "Q1", "Q1", "Q2", "Q2", "Q2"], + ... ["A", "B", "C", "A", "B", "C"], + ... ], + ... ) + >>> df_multindex + cost revenue + Q1 A 250 100 + B 150 250 + C 100 300 + Q2 A 150 200 + B 300 175 + C 220 225 + + >>> df.le(df_multindex, level=1) + cost revenue + Q1 A True True + B True True + C True True + Q2 A False True + B True False + C True False + """ return self._flex_cmp_method(other, operator.ne, axis=axis, level=level) @Appender(ops.make_flex_doc("le", "dataframe")) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dc37f3bfa6a4c..7b17a519d169e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2984,13 +2984,12 @@ def __bool__(self) -> NoReturn: def _get_reconciled_name_object(self, other): """ If the result of a set operation will be self, - return self, unless the name changes, in which - case make a shallow copy of self. + return a shallow copy of self.
""" name = get_op_result_name(self, other) if self.name is not name: return self.rename(name) - return self + return self.copy(deep=False) @final def _validate_sort_keyword(self, sort) -> None: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 90f710b0de4de..3a0a1d8deacb3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -4099,13 +4099,12 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: def _get_reconciled_name_object(self, other) -> MultiIndex: """ If the result of a set operation will be self, - return self, unless the names change, in which - case make a shallow copy of self. + return a shallow copy of self. """ names = self._maybe_match_names(other) if self.names != names: return self.rename(names) - return self + return self.copy(deep=False) def _maybe_match_names(self, other): """ diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 0075a7ed59795..c436391739ab2 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import HAS_PYARROW - from pandas import ( DataFrame, Series, @@ -218,10 +216,12 @@ def test_convert_dtypes(using_infer_string): df_orig = df.copy() df2 = df.convert_dtypes() - if HAS_PYARROW: - assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) - else: + if using_infer_string: + # String column is already Arrow-backed, so memory is shared assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + # String column converts from object to Arrow, no memory sharing + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d")) assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c")) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 7d055e2143112..23f3fa046dab4 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -111,15 +111,15 @@ def _from_sequence_of_strings(cls, strings, *, dtype: ExtensionDtype, copy=False def _from_factorized(cls, values, original): return cls(values) - def _cast_pointwise_result(self, values): - result = super()._cast_pointwise_result(values) - try: - # If this were ever made a non-test EA, special-casing could - # be avoided by handling Decimal in maybe_convert_objects - res = type(self)._from_sequence(result, dtype=self.dtype) - except (ValueError, TypeError): - return result - return res + # test to ensure that the base class _cast_pointwise_result works as expected + # def _cast_pointwise_result(self, values): + # try: + # # If this were ever made a non-test EA, special-casing could + # # be avoided by handling Decimal in maybe_convert_objects + # res = type(self)._from_sequence(values, dtype=self.dtype) + # except (ValueError, TypeError): + # return values + # return res _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 1878fac1b8111..5eb1a9ff286fa 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -94,11 +94,14 @@ def _from_factorized(cls, values, original): return cls([UserDict(x) for x in values if x != ()]) def _cast_pointwise_result(self, values): - result = super()._cast_pointwise_result(values) try: 
- return type(self)._from_sequence(result, dtype=self.dtype) + return type(self)._from_sequence(values, dtype=self.dtype) except (ValueError, TypeError): - return result + # TODO replace with public function + from pandas._libs import lib + + values = np.asarray(values, dtype=object) + return lib.maybe_convert_objects(values, convert_non_numeric=True) def __getitem__(self, item): if isinstance(item, tuple): diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 6922803c325b3..a3d31b0571a26 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -727,7 +727,7 @@ def test_intersection(self, index, sort): # Corner cases inter = first.intersection(first, sort=sort) - assert inter is first + assert inter is not first @pytest.mark.parametrize( "index2_name,keeps_name", @@ -812,16 +812,16 @@ def test_union_identity(self, index, sort): first = index[5:20] union = first.union(first, sort=sort) - # i.e. identity is not preserved when sort is True - assert (union is first) is (not sort) + # GH#63169 - identity is not preserved to prevent shared mutable state + assert union is not first # This should no longer be the same object, since [] is not consistent, # both objects will be recast to dtype('O') union = first.union(Index([], dtype=first.dtype), sort=sort) - assert (union is first) is (not sort) + assert union is not first union = Index([], dtype=first.dtype).union(first, sort=sort) - assert (union is first) is (not sort) + assert union is not first @pytest.mark.parametrize("index", ["string"], indirect=True) @pytest.mark.parametrize("second_name,expected", [(None, None), ("name", "name")]) @@ -984,3 +984,83 @@ def test_union_pyarrow_timestamp(self): res = left.union(right) expected = Index(["2020-01-01", "2020-01-02"], dtype=left.dtype) tm.assert_index_equal(res, expected) + + +def test_intersection_mutation_safety(): + # GH#63169 + index1 = Index([0, 1], name="original") + index2 = Index([0, 1], name="original") + + result = index1.intersection(index2) + + assert result is not index1 + assert result is not index2 + + tm.assert_index_equal(result, index1) + assert result.name == "original" + + index1.name = "changed" + + assert result.name == "original" + assert index1.name == "changed" + + +def test_union_mutation_safety(): + # GH#63169 + index1 = Index([0, 1], name="original") + index2 = Index([0, 1], name="original") + + result = index1.union(index2) + + assert result is not index1 + assert result is not index2 + + tm.assert_index_equal(result, index1) + assert result.name == "original" + + index1.name = "changed" + + assert result.name == "original" + assert index1.name == "changed" + + +def test_union_mutation_safety_other(): + # GH#63169 + index1 = Index([0, 1], name="original") + index2 = Index([0, 1], name="original") + + result = index1.union(index2) + + assert result is not index2 + + tm.assert_index_equal(result, index2) + assert result.name == "original" + + index2.name = "changed" + + assert result.name == "original" + assert index2.name == "changed" + + +def test_multiindex_intersection_mutation_safety(): + # GH#63169 + mi1 = MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["x", "y"]) + mi2 = MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["x", "y"]) + + result = mi1.intersection(mi2) + assert result is not mi1 + + mi1.names = ["changed1", "changed2"] + assert result.names == ["x", "y"] + + +def test_multiindex_union_mutation_safety(): + # GH#63169 + mi1 = MultiIndex.from_tuples([("a", 1), ("b", 2)], 
names=["x", "y"]) + mi2 = MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["x", "y"]) + + result = mi1.union(mi2) + assert result is not mi1 + + mi1.names = ["changed1", "changed2"] + assert result.names == ["x", "y"] diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index fbbe0700c2380..424961a4b652b 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -114,7 +114,7 @@ def test_intersection_bug_1708(self): def test_intersection_equal(self, sort): # GH 24471 Test intersection outcome given the sort keyword - # for equal indices intersection should return the original index + # GH#63169 intersection returns a copy to prevent shared mutable state first = timedelta_range("1 day", periods=4, freq="h") second = timedelta_range("1 day", periods=4, freq="h") intersect = first.intersection(second, sort=sort) @@ -124,7 +124,8 @@ def test_intersection_equal(self, sort): # Corner cases inter = first.intersection(first, sort=sort) - assert inter is first + assert inter is not first + tm.assert_index_equal(inter, first) @pytest.mark.parametrize("period_1, period_2", [(0, 4), (4, 0)]) def test_intersection_zero_length(self, period_1, period_2, sort): diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index d3bce7b4bbf65..c48986c597356 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -3,6 +3,9 @@ timedelta, timezone, ) +import subprocess +import sys +import textwrap import dateutil.tz import pytest @@ -16,6 +19,25 @@ from pandas import Timestamp +@pytest.mark.single_cpu +def test_no_timezone_data(): + # https://github.com/pandas-dev/pandas/pull/63335 + # Test error message when timezone data is not available. + msg = "'No time zone found with key Europe/Brussels'" + code = textwrap.dedent( + f"""\ + import sys, zoneinfo, pandas as pd + sys.modules['tzdata'] = None + zoneinfo.reset_tzpath(['/path/to/nowhere']) + try: + pd.to_datetime('2012-01-01').tz_localize('Europe/Brussels') + except zoneinfo.ZoneInfoNotFoundError as err: + assert str(err) == "{msg}" + """ + ) + subprocess.check_call([sys.executable, "-c", code]) + + def test_is_utc(utc_fixture): tz = timezones.maybe_get_tz(utc_fixture) assert timezones.is_utc(tz) diff --git a/pyproject.toml b/pyproject.toml index 694f555f0d620..3242738cafc2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,9 @@ requires-python = '>=3.11' dependencies = [ "numpy>=1.26.0", "python-dateutil>=2.8.2", - "tzdata>=2023.3" + "tzdata; sys_platform == 'win32'", + # Emscripten is the platform system for Pyodide. + "tzdata; sys_platform == 'emscripten'", ] classifiers = [ 'Development Status :: 5 - Production/Stable',