Skip to content

Commit 476e987

Browse files
fangchenli and claude authored
PERF: avoid NumPy fallback when casting Arrow int/string to string (#63401)
Co-authored-by: Claude Opus 4.5 <[email protected]>
1 parent 39ac242 commit 476e987

File tree

2 files changed

+25
-5
lines changed

2 files changed

+25
-5
lines changed

‎pandas/core/arrays/string_arrow.py‎

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -210,6 +210,26 @@ def _from_sequence(
210210
result=scalars._data
211211
result=lib.ensure_string_array(result, copy=copy, convert_na_value=False)
212212
pa_arr=pa.array(result, mask=na_values, type=pa.large_string())
213+
elifisinstance(scalars, ArrowExtensionArray):
214+
pa_type=scalars._pa_array.type
215+
# Use PyArrow's native cast for integer, string, and boolean types.
216+
# Float has different representation in PyArrow: 1.0 -> "1" instead
217+
# of "1.0", and uses different scientific notation (1e+10 vs 1e10).
218+
# Boolean needs capitalize (true -> True, false -> False).
219+
if (
220+
pa.types.is_integer(pa_type)
221+
orpa.types.is_large_string(pa_type)
222+
orpa.types.is_string(pa_type)
223+
orpa.types.is_boolean(pa_type)
224+
):
225+
pa_arr=pc.cast(scalars._pa_array, pa.large_string())
226+
ifpa.types.is_boolean(pa_type):
227+
pa_arr=pc.utf8_capitalize(pa_arr)
228+
else:
229+
# Fall back for types where PyArrow's string representation
230+
# differs from Python's str()
231+
result=lib.ensure_string_array(scalars, copy=copy)
232+
pa_arr=pa.array(result, type=pa.large_string(), from_pandas=True)
213233
elifisinstance(scalars, (pa.Array, pa.ChunkedArray)):
214234
pa_arr=pc.cast(scalars, pa.large_string())
215235
else:

‎pandas/tests/copy_view/test_astype.py‎

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,6 @@
33
importnumpyasnp
44
importpytest
55

6-
frompandas.compatimportHAS_PYARROW
7-
86
frompandasimport (
97
DataFrame,
108
Series,
@@ -218,10 +216,12 @@ def test_convert_dtypes(using_infer_string):
218216
df_orig=df.copy()
219217
df2=df.convert_dtypes()
220218

221-
ifHAS_PYARROW:
222-
assertnottm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
223-
else:
219+
ifusing_infer_string:
220+
# String column is already Arrow-backed, so memory is shared
224221
asserttm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
222+
else:
223+
# String column converts from object to Arrow, no memory sharing
224+
assertnottm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
225225
asserttm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
226226
asserttm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
227227
asserttm.shares_memory(get_array(df2, "c"), get_array(df, "c"))

0 commit comments

Comments
(0)