Skip to content

Conversation

@sobolevn
Copy link
Member

@sobolevnsobolevn commented Sep 19, 2022

It is heavily inspired by

deftest_count(self):
self.checkequal(3, 'aaa', 'count', 'a')
self.checkequal(0, 'aaa', 'count', 'b')
self.checkequal(3, 'aaa', 'count', 'a')
self.checkequal(0, 'aaa', 'count', 'b')
self.checkequal(3, 'aaa', 'count', 'a')
self.checkequal(0, 'aaa', 'count', 'b')
self.checkequal(0, 'aaa', 'count', 'b')
self.checkequal(2, 'aaa', 'count', 'a', 1)
self.checkequal(0, 'aaa', 'count', 'a', 10)
self.checkequal(1, 'aaa', 'count', 'a', -1)
self.checkequal(3, 'aaa', 'count', 'a', -10)
self.checkequal(1, 'aaa', 'count', 'a', 0, 1)
self.checkequal(3, 'aaa', 'count', 'a', 0, 10)
self.checkequal(2, 'aaa', 'count', 'a', 0, -1)
self.checkequal(0, 'aaa', 'count', 'a', 0, -10)
self.checkequal(3, 'aaa', 'count', '', 1)
self.checkequal(1, 'aaa', 'count', '', 3)
self.checkequal(0, 'aaa', 'count', '', 10)
self.checkequal(2, 'aaa', 'count', '', -1)
self.checkequal(4, 'aaa', 'count', '', -10)
self.checkequal(1, '', 'count', '')
self.checkequal(0, '', 'count', '', 1, 1)
self.checkequal(0, '', 'count', '', sys.maxsize, 0)
self.checkequal(0, '', 'count', 'xx')
self.checkequal(0, '', 'count', 'xx', 1, 1)
self.checkequal(0, '', 'count', 'xx', sys.maxsize, 0)
self.checkraises(TypeError, 'hello', 'count')
ifself.contains_bytes:
self.checkequal(0, 'hello', 'count', 42)
else:
self.checkraises(TypeError, 'hello', 'count', 42)
# For a variety of combinations,
# verify that str.count() matches an equivalent function
# replacing all occurrences and then differencing the string lengths
charset= ['', 'a', 'b']
digits=7
base=len(charset)
teststrings=set()
foriinrange(base**digits):
entry= []
forjinrange(digits):
i, m=divmod(i, base)
entry.append(charset[m])
teststrings.add(''.join(entry))
teststrings= [self.fixtype(ts) fortsinteststrings]
foriinteststrings:
n=len(i)
forjinteststrings:
r1=i.count(j)
ifj:
r2, rem=divmod(n-len(i.replace(j, self.fixtype(''))),
len(j))
else:
r2, rem=len(i)+1, 0
ifremorr1!=r2:
self.assertEqual(rem, 0, '%s != 0 for %s'% (rem, i))
self.assertEqual(r1, r2, '%s != %s for %s'% (r1, r2, i))

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count? They look pretty similar:

Py_ssize_t
PyUnicode_Count(PyObject*str,
PyObject*substr,
Py_ssize_tstart,
Py_ssize_tend)
{
Py_ssize_tresult;
intkind1, kind2;
constvoid*buf1=NULL, *buf2=NULL;
Py_ssize_tlen1, len2;
if (ensure_unicode(str) <0||ensure_unicode(substr) <0)
return-1;
kind1=PyUnicode_KIND(str);
kind2=PyUnicode_KIND(substr);
if (kind1<kind2)
return0;
len1=PyUnicode_GET_LENGTH(str);
len2=PyUnicode_GET_LENGTH(substr);
ADJUST_INDICES(start, end, len1);
if (end-start<len2)
return0;
buf1=PyUnicode_DATA(str);
buf2=PyUnicode_DATA(substr);
if (kind2!=kind1){
buf2=unicode_askind(kind2, buf2, len2, kind1);
if (!buf2)
goto onError;
}
switch (kind1){
casePyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(str) &&PyUnicode_IS_ASCII(substr))
result=asciilib_count(
((constPy_UCS1*)buf1) +start, end-start,
buf2, len2, PY_SSIZE_T_MAX
);
else
result=ucs1lib_count(
((constPy_UCS1*)buf1) +start, end-start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
casePyUnicode_2BYTE_KIND:
result=ucs2lib_count(
((constPy_UCS2*)buf1) +start, end-start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
casePyUnicode_4BYTE_KIND:
result=ucs4lib_count(
((constPy_UCS4*)buf1) +start, end-start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
default:
Py_UNREACHABLE();
}
assert((kind2!=kind1) == (buf2!=PyUnicode_DATA(substr)));
if (kind2!=kind1)
PyMem_Free((void*)buf2);
returnresult;
onError:
assert((kind2!=kind1) == (buf2!=PyUnicode_DATA(substr)));
if (kind2!=kind1)
PyMem_Free((void*)buf2);
return-1;
}

And

staticPyObject*
unicode_count(PyObject*self, PyObject*args)
{
PyObject*substring=NULL; /* initialize to fix a compiler warning */
Py_ssize_tstart=0;
Py_ssize_tend=PY_SSIZE_T_MAX;
PyObject*result;
intkind1, kind2;
constvoid*buf1, *buf2;
Py_ssize_tlen1, len2, iresult;
if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
returnNULL;
kind1=PyUnicode_KIND(self);
kind2=PyUnicode_KIND(substring);
if (kind1<kind2)
returnPyLong_FromLong(0);
len1=PyUnicode_GET_LENGTH(self);
len2=PyUnicode_GET_LENGTH(substring);
ADJUST_INDICES(start, end, len1);
if (end-start<len2)
returnPyLong_FromLong(0);
buf1=PyUnicode_DATA(self);
buf2=PyUnicode_DATA(substring);
if (kind2!=kind1){
buf2=unicode_askind(kind2, buf2, len2, kind1);
if (!buf2)
returnNULL;
}
switch (kind1){
casePyUnicode_1BYTE_KIND:
iresult=ucs1lib_count(
((constPy_UCS1*)buf1) +start, end-start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
casePyUnicode_2BYTE_KIND:
iresult=ucs2lib_count(
((constPy_UCS2*)buf1) +start, end-start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
casePyUnicode_4BYTE_KIND:
iresult=ucs4lib_count(
((constPy_UCS4*)buf1) +start, end-start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
default:
Py_UNREACHABLE();
}
result=PyLong_FromSsize_t(iresult);
assert((kind2==kind1) == (buf2==PyUnicode_DATA(substring)));
if (kind2!=kind1)
PyMem_Free((void*)buf2);
returnresult;
}

@sobolevnsobolevn added tests Tests in the Lib/test dir skip news labels Sep 19, 2022
@mdboom
Copy link
Contributor

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count?

It looks like these both date to the same commit d57fd91 from 2000-03-10. They were pretty different then, but are almost the same now. I see some benefit in making unicode_count call PyUnicode_Count to make sure they remain consistent, but I could also see someone seeing this as "churn for churn's sake".

Note there is also anylib_count which is a subset of unicode_count and PyUnicode_Count.

There are a few other instances of this kind of thing I've come across looking at coverage -- it would be good to get a core developer's take on whether merging internal and external functions where they are clearly wrappable like this would be welcome.

@encukou
Copy link
Member

Apparently unicode_count missed an optimization in 2011, otherwise they're equivalent (except arg parsing & converting the return value). Merging them could add the optimization to unicode_count.
If you want to work on that, note that there's also anylib_count that duplicates the main switch.

@sobolevn
Copy link
MemberAuthor

Thanks! Yes, I would like to do that! I will open a new issue for it.

carljm added a commit to carljm/cpython that referenced this pull request Oct 6, 2022
* main: pythonGH-88050: fix race in closing subprocess pipe in asyncio (python#97951) pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962) pythongh-95986: Fix the example using match keyword (python#95989) pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944) pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929) pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610) pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703) pythonGH-88968: Add notes about socket ownership transfers (python#97936) pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528)
carljm added a commit to carljm/cpython that referenced this pull request Oct 8, 2022
* main: (53 commits) pythongh-94808: Coverage: Test that maximum indentation level is handled (python#95926) pythonGH-88050: fix race in closing subprocess pipe in asyncio (python#97951) pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962) pythongh-95986: Fix the example using match keyword (python#95989) pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944) pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929) pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610) pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703) pythonGH-88968: Add notes about socket ownership transfers (python#97936) pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528) pythongh-65961: Raise `DeprecationWarning` when `__package__` differs from `__spec__.parent` (python#97879) docs(typing): add "see PEP 675" to LiteralString (python#97926) pythongh-97850: Remove all known instances of module_repr() (python#97876) I changed my surname early this year (python#96671) pythongh-93738: Documentation C syntax (:c:type:<C type> -> :c:expr:<C type>) (python#97768) pythongh-91539: improve performance of get_proxies_environment (python#91566) build(deps): bump actions/stale from 5 to 6 (python#97701) pythonGH-95172 Make the same version `versionadded` oneline (python#95172) pythongh-88050: Fix asyncio subprocess to kill process cleanly when process is blocked (python#32073) pythongh-93738: Documentation C syntax (Function glob patterns -> literal markup) (python#97774) ...
mpage pushed a commit to mpage/cpython that referenced this pull request Oct 11, 2022
Sign up for freeto join this conversation on GitHub. Already have an account? Sign in to comment

Labels

skip newstestsTests in the Lib/test dir

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants

@sobolevn@mdboom@encukou@bedevere-bot