Skip to content

Unify PyUnicode_Count and unicode_count #97982

@sobolevn

Description

@sobolevn

Feature or enhancement

Right now PyUnicode_Count from Objects/unicodeobject.c:

/* Count the occurrences of substr inside the [start, end) slice of str.
 *
 * Both arguments are validated with ensure_unicode().  start and end are
 * passed through ADJUST_INDICES against the length of str before use.
 *
 * Returns the number of matches, or -1 on error (an argument was rejected
 * by ensure_unicode(), or unicode_askind() could not build the converted
 * copy of substr).
 */
Py_ssize_t
PyUnicode_Count(PyObject *str,
                PyObject *substr,
                Py_ssize_t start,
                Py_ssize_t end)
{
    Py_ssize_t result;
    int kind1, kind2;
    const void *buf1 = NULL, *buf2 = NULL;
    Py_ssize_t len1, len2;

    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
        return -1;

    kind1 = PyUnicode_KIND(str);
    kind2 = PyUnicode_KIND(substr);
    /* substr uses a wider kind than str: report zero matches right away. */
    if (kind1 < kind2)
        return 0;

    len1 = PyUnicode_GET_LENGTH(str);
    len2 = PyUnicode_GET_LENGTH(substr);
    ADJUST_INDICES(start, end, len1);
    /* The slice is shorter than substr, so nothing can match. */
    if (end - start < len2)
        return 0;

    buf1 = PyUnicode_DATA(str);
    buf2 = PyUnicode_DATA(substr);
    if (kind2 != kind1) {
        /* Convert substr to str's kind so both buffers share one character
           width.  The copy is owned here and released with PyMem_Free()
           before returning. */
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (!buf2)
            goto onError;
    }

    /* Dispatch on the character width of str. */
    switch (kind1) {
    case PyUnicode_1BYTE_KIND:
        /* Use the ASCII-specialized routine when both strings are ASCII. */
        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
            result = asciilib_count(
                ((const Py_UCS1 *)buf1) + start, end - start,
                buf2, len2, PY_SSIZE_T_MAX
                );
        else
            result = ucs1lib_count(
                ((const Py_UCS1 *)buf1) + start, end - start,
                buf2, len2, PY_SSIZE_T_MAX
                );
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_count(
            ((const Py_UCS2 *)buf1) + start, end - start,
            buf2, len2, PY_SSIZE_T_MAX
            );
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_count(
            ((const Py_UCS4 *)buf1) + start, end - start,
            buf2, len2, PY_SSIZE_T_MAX
            );
        break;
    default:
        Py_UNREACHABLE();
    }

    /* buf2 is a private copy exactly when the kinds differed. */
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
    if (kind2 != kind1)
        PyMem_Free((void *)buf2);

    return result;

  onError:
    /* Reached only when unicode_askind() returned NULL, so buf2 is NULL here
       and the PyMem_Free() call is made on a null pointer. */
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
    if (kind2 != kind1)
        PyMem_Free((void *)buf2);
    return -1;
}
and unicode_count from Objects/unicodeobject.c:
/* Count the occurrences of a substring inside self.
 *
 * args is parsed by parse_args_finds_unicode() into the substring plus an
 * optional start/end pair; start defaults to 0 and end to PY_SSIZE_T_MAX,
 * and both are passed through ADJUST_INDICES against the length of self.
 *
 * Returns a new integer object holding the count, or NULL on error
 * (argument parsing failed, or unicode_askind() could not build the
 * converted copy of the substring).
 */
static PyObject *
unicode_count(PyObject *self, PyObject *args)
{
    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
    Py_ssize_t start = 0;
    Py_ssize_t end = PY_SSIZE_T_MAX;
    PyObject *result;
    int kind1, kind2;
    const void *buf1, *buf2;
    Py_ssize_t len1, len2, iresult;

    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
        return NULL;

    kind1 = PyUnicode_KIND(self);
    kind2 = PyUnicode_KIND(substring);
    /* substring uses a wider kind than self: report zero matches right away. */
    if (kind1 < kind2)
        return PyLong_FromLong(0);

    len1 = PyUnicode_GET_LENGTH(self);
    len2 = PyUnicode_GET_LENGTH(substring);
    ADJUST_INDICES(start, end, len1);
    /* The slice is shorter than substring, so nothing can match. */
    if (end - start < len2)
        return PyLong_FromLong(0);

    buf1 = PyUnicode_DATA(self);
    buf2 = PyUnicode_DATA(substring);
    if (kind2 != kind1) {
        /* Convert substring to self's kind so both buffers share one
           character width.  The copy is owned here and released with
           PyMem_Free() before returning. */
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (!buf2)
            return NULL;
    }

    /* Dispatch on the character width of self.  The 1-byte case always uses
       ucs1lib_count; there is no separate ASCII branch here. */
    switch (kind1) {
    case PyUnicode_1BYTE_KIND:
        iresult = ucs1lib_count(
            ((const Py_UCS1 *)buf1) + start, end - start,
            buf2, len2, PY_SSIZE_T_MAX
            );
        break;
    case PyUnicode_2BYTE_KIND:
        iresult = ucs2lib_count(
            ((const Py_UCS2 *)buf1) + start, end - start,
            buf2, len2, PY_SSIZE_T_MAX
            );
        break;
    case PyUnicode_4BYTE_KIND:
        iresult = ucs4lib_count(
            ((const Py_UCS4 *)buf1) + start, end - start,
            buf2, len2, PY_SSIZE_T_MAX
            );
        break;
    default:
        Py_UNREACHABLE();
    }

    /* Box the count before cleanup; the converted buffer is freed whatever
       PyLong_FromSsize_t() returned. */
    result = PyLong_FromSsize_t(iresult);

    /* buf2 is a private copy exactly when the kinds differed. */
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
    if (kind2 != kind1)
        PyMem_Free((void *)buf2);

    return result;
}
share a lot of code.

They can be unified, because they do the same thing.

Pitch

Citing @encukou:

Apparently unicode_count missed an optimization in 2011, otherwise they're equivalent (except arg parsing & converting the return value). Merging them could add the optimization to unicode_count.
If you want to work on that, note that there's also anylib_count that duplicates the main switch.

Previous discussion

Link: #96929

PR in the works.

Metadata

Metadata

Assignees

Labels

interpreter-core (Objects, Python, Grammar, and Parser dirs) · type-feature (A feature request or enhancement)

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions