gh-94808: Cover `PyUnicode_Count` in CAPI#96929

sobolevn · 2022-09-19T09:22:04Z

It is heavily inspired by

Lines 99 to 161 in cbdeda8

deftest_count(self): 
self.checkequal(3, 'aaa', 'count', 'a') 
self.checkequal(0, 'aaa', 'count', 'b') 
self.checkequal(3, 'aaa', 'count', 'a') 
self.checkequal(0, 'aaa', 'count', 'b') 
self.checkequal(3, 'aaa', 'count', 'a') 
self.checkequal(0, 'aaa', 'count', 'b') 
self.checkequal(0, 'aaa', 'count', 'b') 
self.checkequal(2, 'aaa', 'count', 'a', 1) 
self.checkequal(0, 'aaa', 'count', 'a', 10) 
self.checkequal(1, 'aaa', 'count', 'a', -1) 
self.checkequal(3, 'aaa', 'count', 'a', -10) 
self.checkequal(1, 'aaa', 'count', 'a', 0, 1) 
self.checkequal(3, 'aaa', 'count', 'a', 0, 10) 
self.checkequal(2, 'aaa', 'count', 'a', 0, -1) 
self.checkequal(0, 'aaa', 'count', 'a', 0, -10) 
self.checkequal(3, 'aaa', 'count', '', 1) 
self.checkequal(1, 'aaa', 'count', '', 3) 
self.checkequal(0, 'aaa', 'count', '', 10) 
self.checkequal(2, 'aaa', 'count', '', -1) 
self.checkequal(4, 'aaa', 'count', '', -10) 
self.checkequal(1, '', 'count', '') 
self.checkequal(0, '', 'count', '', 1, 1) 
self.checkequal(0, '', 'count', '', sys.maxsize, 0) 
self.checkequal(0, '', 'count', 'xx') 
self.checkequal(0, '', 'count', 'xx', 1, 1) 
self.checkequal(0, '', 'count', 'xx', sys.maxsize, 0) 
self.checkraises(TypeError, 'hello', 'count') 
ifself.contains_bytes: 
self.checkequal(0, 'hello', 'count', 42) 
else: 
self.checkraises(TypeError, 'hello', 'count', 42) 
# For a variety of combinations,
# verify that str.count() matches an equivalent function
# replacing all occurrences and then differencing the string lengths
charset= ['', 'a', 'b'] 
digits=7
base=len(charset) 
teststrings=set() 
foriinrange(base**digits): 
entry= [] 
forjinrange(digits): 
i, m=divmod(i, base) 
entry.append(charset[m]) 
teststrings.add(''.join(entry)) 
teststrings= [self.fixtype(ts) fortsinteststrings] 
foriinteststrings: 
n=len(i) 
forjinteststrings: 
r1=i.count(j) 
ifj: 
r2, rem=divmod(n-len(i.replace(j, self.fixtype(''))), 
len(j)) 
else: 
r2, rem=len(i)+1, 0
ifremorr1!=r2: 
self.assertEqual(rem, 0, '%s != 0 for %s'% (rem, i)) 
self.assertEqual(r1, r2, '%s != %s for %s'% (r1, r2, i)) 

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count? They look pretty similar:

cpython/Objects/unicodeobject.c

Lines 8968 to 9040 in cbdeda8

Py_ssize_t
PyUnicode_Count(PyObject*str, 
PyObject*substr, 
Py_ssize_tstart, 
Py_ssize_tend) 
{
Py_ssize_tresult; 
intkind1, kind2; 
constvoid*buf1=NULL, *buf2=NULL; 
Py_ssize_tlen1, len2; 
if (ensure_unicode(str) <0||ensure_unicode(substr) <0) 
return-1; 
kind1=PyUnicode_KIND(str); 
kind2=PyUnicode_KIND(substr); 
if (kind1<kind2) 
return0; 
len1=PyUnicode_GET_LENGTH(str); 
len2=PyUnicode_GET_LENGTH(substr); 
ADJUST_INDICES(start, end, len1); 
if (end-start<len2) 
return0; 
buf1=PyUnicode_DATA(str); 
buf2=PyUnicode_DATA(substr); 
if (kind2!=kind1){
buf2=unicode_askind(kind2, buf2, len2, kind1); 
if (!buf2) 
 goto onError; 
 } 
switch (kind1){
casePyUnicode_1BYTE_KIND: 
if (PyUnicode_IS_ASCII(str) &&PyUnicode_IS_ASCII(substr)) 
result=asciilib_count( 
 ((constPy_UCS1*)buf1) +start, end-start, 
buf2, len2, PY_SSIZE_T_MAX
 ); 
else
result=ucs1lib_count( 
 ((constPy_UCS1*)buf1) +start, end-start, 
buf2, len2, PY_SSIZE_T_MAX
 ); 
break; 
casePyUnicode_2BYTE_KIND: 
result=ucs2lib_count( 
 ((constPy_UCS2*)buf1) +start, end-start, 
buf2, len2, PY_SSIZE_T_MAX
 ); 
break; 
casePyUnicode_4BYTE_KIND: 
result=ucs4lib_count( 
 ((constPy_UCS4*)buf1) +start, end-start, 
buf2, len2, PY_SSIZE_T_MAX
 ); 
break; 
default: 
Py_UNREACHABLE(); 
 } 
assert((kind2!=kind1) == (buf2!=PyUnicode_DATA(substr))); 
if (kind2!=kind1) 
PyMem_Free((void*)buf2); 
returnresult; 
onError: 
assert((kind2!=kind1) == (buf2!=PyUnicode_DATA(substr))); 
if (kind2!=kind1) 
PyMem_Free((void*)buf2); 
return-1; 
 } 

And

cpython/Objects/unicodeobject.c

Lines 10854 to 10916 in cbdeda8

staticPyObject*
unicode_count(PyObject*self, PyObject*args) 
{
PyObject*substring=NULL; /* initialize to fix a compiler warning */
Py_ssize_tstart=0; 
Py_ssize_tend=PY_SSIZE_T_MAX; 
PyObject*result; 
intkind1, kind2; 
constvoid*buf1, *buf2; 
Py_ssize_tlen1, len2, iresult; 
if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) 
returnNULL; 
kind1=PyUnicode_KIND(self); 
kind2=PyUnicode_KIND(substring); 
if (kind1<kind2) 
returnPyLong_FromLong(0); 
len1=PyUnicode_GET_LENGTH(self); 
len2=PyUnicode_GET_LENGTH(substring); 
ADJUST_INDICES(start, end, len1); 
if (end-start<len2) 
returnPyLong_FromLong(0); 
buf1=PyUnicode_DATA(self); 
buf2=PyUnicode_DATA(substring); 
if (kind2!=kind1){
buf2=unicode_askind(kind2, buf2, len2, kind1); 
if (!buf2) 
returnNULL; 
 } 
switch (kind1){
casePyUnicode_1BYTE_KIND: 
iresult=ucs1lib_count( 
 ((constPy_UCS1*)buf1) +start, end-start, 
buf2, len2, PY_SSIZE_T_MAX
 ); 
break; 
casePyUnicode_2BYTE_KIND: 
iresult=ucs2lib_count( 
 ((constPy_UCS2*)buf1) +start, end-start, 
buf2, len2, PY_SSIZE_T_MAX
 ); 
break; 
casePyUnicode_4BYTE_KIND: 
iresult=ucs4lib_count( 
 ((constPy_UCS4*)buf1) +start, end-start, 
buf2, len2, PY_SSIZE_T_MAX
 ); 
break; 
default: 
Py_UNREACHABLE(); 
 } 
result=PyLong_FromSsize_t(iresult); 
assert((kind2==kind1) == (buf2==PyUnicode_DATA(substring))); 
if (kind2!=kind1) 
PyMem_Free((void*)buf2); 
returnresult; 
 } 

Issue: Metabug: Improving C-level coverage #94808

mdboom · 2022-09-22T19:45:02Z

Question: what is the historical context on why PyUnicode_Count is not reused in unicode_count?

It looks like these both date to the same commit d57fd91 from 2000-03-10. They were pretty different then, but are almost the same now. I see some benefit in making unicode_count call PyUnicode_Count to make sure they remain consistent, but I could also see someone seeing this as "churn for churn's sake".

Note there is also anylib_count which is a subset of unicode_count and PyUnicode_Count.

There are a few other instances of this kind of thing I've come across looking at coverage -- it would be good to get a core developer's take on whether merging internal and external functions where they are clearly wrappable like this would be welcome.

encukou · 2022-10-06T15:24:07Z

Apparently unicode_count missed an optimization in 2011, otherwise they're equivalent (except arg parsing & converting the return value). Merging them could add the optimization to unicode_count.
If you want to work on that, note that there's also anylib_count that duplicates the main switch.

sobolevn · 2022-10-06T17:04:14Z

Thanks! Yes, I would like to do that! I will open a new issue for it.

* main: pythonGH-88050: fix race in closing subprocess pipe in asyncio (python#97951) pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962) pythongh-95986: Fix the example using match keyword (python#95989) pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944) pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929) pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610) pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703) pythonGH-88968: Add notes about socket ownership transfers (python#97936) pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528)

* main: (53 commits) pythongh-94808: Coverage: Test that maximum indentation level is handled (python#95926) pythonGH-88050: fix race in closing subprocess pipe in asyncio (python#97951) pythongh-93738: Disallow pre-v3 syntax in the C domain (python#97962) pythongh-95986: Fix the example using match keyword (python#95989) pythongh-97897: Prevent os.mkfifo and os.mknod segfaults with macOS 13 SDK (pythonGH-97944) pythongh-94808: Cover `PyUnicode_Count` in CAPI (python#96929) pythongh-94808: Cover `PyObject_PyBytes` case with custom `__bytes__` method (python#96610) pythongh-95691: Doc BufferedWriter and BufferedReader (python#95703) pythonGH-88968: Add notes about socket ownership transfers (python#97936) pythongh-96865: [Enum] fix Flag to use CONFORM boundary (pythonGH-97528) pythongh-65961: Raise `DeprecationWarning` when `__package__` differs from `__spec__.parent` (python#97879) docs(typing): add "see PEP 675" to LiteralString (python#97926) pythongh-97850: Remove all known instances of module_repr() (python#97876) I changed my surname early this year (python#96671) pythongh-93738: Documentation C syntax (:c:type:<C type> -> :c:expr:<C type>) (python#97768) pythongh-91539: improve performance of get_proxies_environment (python#91566) build(deps): bump actions/stale from 5 to 6 (python#97701) pythonGH-95172 Make the same version `versionadded` oneline (python#95172) pythongh-88050: Fix asyncio subprocess to kill process cleanly when process is blocked (python#32073) pythongh-93738: Documentation C syntax (Function glob patterns -> literal markup) (python#97774) ...

pythongh-94808: CoverPyUnicode_Countin CAPI
22aee9a

bedevere-bot added the awaiting review label Sep 19, 2022

sobolevn added tests Tests in the Lib/test dir skip news labels Sep 19, 2022

sobolevn requested a review from encukou September 19, 2022 09:49

encukou approved these changes Oct 6, 2022
View reviewed changes

bedevere-bot added awaiting merge and removed awaiting review labels Oct 6, 2022

encukou merged commit e63d7da into python:mainOct 6, 2022

bedevere-bot removed the awaiting merge label Oct 6, 2022

sobolevn mentioned this pull request Oct 6, 2022
Unify PyUncode_Count and unicode_count#97982
Closed

mpage pushed a commit to mpage/cpython that referenced this pull request Oct 11, 2022
pythongh-94808: CoverPyUnicode_Countin CAPI (python#96929)
d458682

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

gh-94808: Cover `PyUnicode_Count` in CAPI#96929

gh-94808: Cover `PyUnicode_Count` in CAPI #96929

Uh oh!

sobolevn commented Sep 19, 2022•
edited by bedevere-bot
Loading

Uh oh!

mdboom commented Sep 22, 2022

Uh oh!

encukou commented Oct 6, 2022

Uh oh!

sobolevn commented Oct 6, 2022

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

	deftest_count(self):
	self.checkequal(3, 'aaa', 'count', 'a')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(3, 'aaa', 'count', 'a')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(3, 'aaa', 'count', 'a')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(0, 'aaa', 'count', 'b')
	self.checkequal(2, 'aaa', 'count', 'a', 1)
	self.checkequal(0, 'aaa', 'count', 'a', 10)
	self.checkequal(1, 'aaa', 'count', 'a', -1)
	self.checkequal(3, 'aaa', 'count', 'a', -10)
	self.checkequal(1, 'aaa', 'count', 'a', 0, 1)
	self.checkequal(3, 'aaa', 'count', 'a', 0, 10)
	self.checkequal(2, 'aaa', 'count', 'a', 0, -1)
	self.checkequal(0, 'aaa', 'count', 'a', 0, -10)
	self.checkequal(3, 'aaa', 'count', '', 1)
	self.checkequal(1, 'aaa', 'count', '', 3)
	self.checkequal(0, 'aaa', 'count', '', 10)
	self.checkequal(2, 'aaa', 'count', '', -1)
	self.checkequal(4, 'aaa', 'count', '', -10)

	self.checkequal(1, '', 'count', '')
	self.checkequal(0, '', 'count', '', 1, 1)
	self.checkequal(0, '', 'count', '', sys.maxsize, 0)

	self.checkequal(0, '', 'count', 'xx')
	self.checkequal(0, '', 'count', 'xx', 1, 1)
	self.checkequal(0, '', 'count', 'xx', sys.maxsize, 0)

	self.checkraises(TypeError, 'hello', 'count')

	ifself.contains_bytes:
	self.checkequal(0, 'hello', 'count', 42)
	else:
	self.checkraises(TypeError, 'hello', 'count', 42)

	# For a variety of combinations,
	# verify that str.count() matches an equivalent function
	# replacing all occurrences and then differencing the string lengths
	charset= ['', 'a', 'b']
	digits=7
	base=len(charset)
	teststrings=set()
	foriinrange(base**digits):
	entry= []
	forjinrange(digits):
	i, m=divmod(i, base)
	entry.append(charset[m])
	teststrings.add(''.join(entry))
	teststrings= [self.fixtype(ts) fortsinteststrings]
	foriinteststrings:
	n=len(i)
	forjinteststrings:
	r1=i.count(j)
	ifj:
	r2, rem=divmod(n-len(i.replace(j, self.fixtype(''))),
	len(j))
	else:
	r2, rem=len(i)+1, 0
	ifremorr1!=r2:
	self.assertEqual(rem, 0, '%s != 0 for %s'% (rem, i))
	self.assertEqual(r1, r2, '%s != %s for %s'% (r1, r2, i))

	Py_ssize_t
	PyUnicode_Count(PyObject*str,
	PyObject*substr,
	Py_ssize_tstart,
	Py_ssize_tend)
	{
	Py_ssize_tresult;
	intkind1, kind2;
	constvoidbuf1=NULL, buf2=NULL;
	Py_ssize_tlen1, len2;

	if (ensure_unicode(str) <0\|\|ensure_unicode(substr) <0)
	return-1;

	kind1=PyUnicode_KIND(str);
	kind2=PyUnicode_KIND(substr);
	if (kind1<kind2)
	return0;

	len1=PyUnicode_GET_LENGTH(str);
	len2=PyUnicode_GET_LENGTH(substr);
	ADJUST_INDICES(start, end, len1);
	if (end-start<len2)
	return0;

	buf1=PyUnicode_DATA(str);
	buf2=PyUnicode_DATA(substr);
	if (kind2!=kind1){
	buf2=unicode_askind(kind2, buf2, len2, kind1);
	if (!buf2)
	goto onError;
	}

	switch (kind1){
	casePyUnicode_1BYTE_KIND:
	if (PyUnicode_IS_ASCII(str) &&PyUnicode_IS_ASCII(substr))
	result=asciilib_count(
	((constPy_UCS1*)buf1) +start, end-start,
	buf2, len2, PY_SSIZE_T_MAX
	);
	else
	result=ucs1lib_count(
	((constPy_UCS1*)buf1) +start, end-start,
	buf2, len2, PY_SSIZE_T_MAX
	);
	break;
	casePyUnicode_2BYTE_KIND:
	result=ucs2lib_count(
	((constPy_UCS2*)buf1) +start, end-start,
	buf2, len2, PY_SSIZE_T_MAX
	);
	break;
	casePyUnicode_4BYTE_KIND:
	result=ucs4lib_count(
	((constPy_UCS4*)buf1) +start, end-start,
	buf2, len2, PY_SSIZE_T_MAX
	);
	break;
	default:
	Py_UNREACHABLE();
	}

	assert((kind2!=kind1) == (buf2!=PyUnicode_DATA(substr)));
	if (kind2!=kind1)
	PyMem_Free((void*)buf2);

	returnresult;
	onError:
	assert((kind2!=kind1) == (buf2!=PyUnicode_DATA(substr)));
	if (kind2!=kind1)
	PyMem_Free((void*)buf2);
	return-1;
	}

	staticPyObject*
	unicode_count(PyObjectself, PyObjectargs)
	{
	PyObjectsubstring=NULL; / initialize to fix a compiler warning */
	Py_ssize_tstart=0;
	Py_ssize_tend=PY_SSIZE_T_MAX;
	PyObject*result;
	intkind1, kind2;
	constvoidbuf1, buf2;
	Py_ssize_tlen1, len2, iresult;

	if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
	returnNULL;

	kind1=PyUnicode_KIND(self);
	kind2=PyUnicode_KIND(substring);
	if (kind1<kind2)
	returnPyLong_FromLong(0);

	len1=PyUnicode_GET_LENGTH(self);
	len2=PyUnicode_GET_LENGTH(substring);
	ADJUST_INDICES(start, end, len1);
	if (end-start<len2)
	returnPyLong_FromLong(0);

	buf1=PyUnicode_DATA(self);
	buf2=PyUnicode_DATA(substring);
	if (kind2!=kind1){
	buf2=unicode_askind(kind2, buf2, len2, kind1);
	if (!buf2)
	returnNULL;
	}
	switch (kind1){
	casePyUnicode_1BYTE_KIND:
	iresult=ucs1lib_count(
	((constPy_UCS1*)buf1) +start, end-start,
	buf2, len2, PY_SSIZE_T_MAX
	);
	break;
	casePyUnicode_2BYTE_KIND:
	iresult=ucs2lib_count(
	((constPy_UCS2*)buf1) +start, end-start,
	buf2, len2, PY_SSIZE_T_MAX
	);
	break;
	casePyUnicode_4BYTE_KIND:
	iresult=ucs4lib_count(
	((constPy_UCS4*)buf1) +start, end-start,
	buf2, len2, PY_SSIZE_T_MAX
	);
	break;
	default:
	Py_UNREACHABLE();
	}

	result=PyLong_FromSsize_t(iresult);

	assert((kind2==kind1) == (buf2==PyUnicode_DATA(substring)));
	if (kind2!=kind1)
	PyMem_Free((void*)buf2);

	returnresult;
	}

Uh oh!

gh-94808: Cover PyUnicode_Count in CAPI#96929

gh-94808: Cover PyUnicode_Count in CAPI #96929

Uh oh!

Conversation

sobolevn commented Sep 19, 2022• edited by bedevere-bot Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

mdboom commented Sep 22, 2022

Uh oh!

encukou commented Oct 6, 2022

Uh oh!

sobolevn commented Oct 6, 2022

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

gh-94808: Cover `PyUnicode_Count` in CAPI#96929

gh-94808: Cover `PyUnicode_Count` in CAPI #96929

sobolevn commented Sep 19, 2022•
edited by bedevere-bot
Loading