- Notifications
You must be signed in to change notification settings - Fork 45
[perf] Do not make compiler life harder #164
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Uh oh!
There was an error while loading. Please reload this page.
Conversation
ChALkeR commented Oct 29, 2025 • edited
Loading Uh oh!
There was an error while loading. Please reload this page.
edited
Uh oh!
There was an error while loading. Please reload this page.
ChALkeR commented Oct 30, 2025
will retest on linux |
lpinca commented Oct 31, 2025
I'm surprised to see such a big difference. I've run similar benchmarks on Intel Mac, and while I didn't see a 10x improvement, the difference is still huge (about 5x faster). |
lpinca commented Oct 31, 2025
Can you please run clang-format with style Google? |
lpinca commented Oct 31, 2025
On a Linux VM the difference is not so big but still 2x faster in some cases. |
ChALkeR commented Oct 31, 2025 • edited
Loading Uh oh!
There was an error while loading. Please reload this page.
edited
Uh oh!
There was an error while loading. Please reload this page.
This might be affected by both clang++/g++ and arm64/x86-64. The most common case on servers is likely g++ and x86-64 on Linux. Will do that (if no one would beat me to it) |
ChALkeR commented Oct 31, 2025 • edited
Loading Uh oh!
There was an error while loading. Please reload this page.
edited
Uh oh!
There was an error while loading. Please reload this page.
side note: on Mac, an optimized (not the current) JS impl for |
lpinca commented Oct 31, 2025 • edited
Loading Uh oh!
There was an error while loading. Please reload this page.
edited
Uh oh!
There was an error while loading. Please reload this page.
This is the VM env mentioned above, but it is virtualized.
The JS implementation is used for very small buffers where the cost of calling the native bindings isn't worth the effort, so it does not really matter. |
lpinca commented Oct 31, 2025
While reading the changes I also noticed that it is time to use |
ChALkeR commented Oct 31, 2025 • edited
Loading Uh oh!
There was an error while loading. Please reload this page.
edited
Uh oh!
There was an error while loading. Please reload this page.
@lpinca I was considering replacing native with js as new js was 1.5x faster, this is why it mattered 🙃
That is broken in Node.js unfortunately for now. But yes, it makes sense to support that here. And as for perf, I'll get to my x86-64 / Linux machine soon to test this locally. |
ChALkeR commented Nov 1, 2025
Here are two versions which could be explored in Godbolt:

Before:

#include <cstddef>
#include <cstdint>

struct Args0 {
  uint8_t *source;
  uint8_t *mask;
  uint8_t *destination;
  uint32_t offset;
  uint32_t length;
};

struct Args1 {
  uint8_t *source;
  size_t length;
  uint8_t *mask;
};

void *Mask(Args0 args0) {
  uint8_t *source = args0.source;
  uint8_t *mask = args0.mask;
  uint8_t *destination = args0.destination;
  uint32_t offset = args0.offset;
  uint32_t length = args0.length;

  destination += offset;
  uint32_t index = 0;

  //
  // Alignment preamble.
  //
  while (index < length && ((size_t)source % 8)) {
    *destination++ = *source++ ^ mask[index % 4];
    index++;
  }

  length -= index;
  if (!length) return NULL;

  //
  // Realign mask and convert to 64 bit.
  //
  uint8_t maskAlignedArray[8];

  for (uint8_t i = 0; i < 8; i++, index++) {
    maskAlignedArray[i] = mask[index % 4];
  }

  //
  // Apply 64 bit mask in 8 byte chunks.
  //
  uint32_t loop = length / 8;
  uint64_t *pMask8 = (uint64_t *)maskAlignedArray;

  while (loop--) {
    uint64_t *pFrom8 = (uint64_t *)source;
    uint64_t *pTo8 = (uint64_t *)destination;
    *pTo8 = *pFrom8 ^ *pMask8;
    source += 8;
    destination += 8;
  }

  //
  // Apply mask to remaining data.
  //
  uint8_t *pmaskAlignedArray = maskAlignedArray;
  length %= 8;

  while (length--) {
    *destination++ = *source++ ^ *pmaskAlignedArray++;
  }

  return NULL;
}

void *Unmask(Args1 args1) {
  uint8_t *source = args1.source;
  size_t length = args1.length;
  uint8_t *mask = args1.mask;

  uint32_t index = 0;

  //
  // Alignment preamble.
  //
  while (index < length && ((size_t)source % 8)) {
    *source++ ^= mask[index % 4];
    index++;
  }

  length -= index;
  if (!length) return NULL;

  //
  // Realign mask and convert to 64 bit.
  //
  uint8_t maskAlignedArray[8];

  for (uint8_t i = 0; i < 8; i++, index++) {
    maskAlignedArray[i] = mask[index % 4];
  }

  //
  // Apply 64 bit mask in 8 byte chunks.
  //
  uint32_t loop = length / 8;
  uint64_t *pMask8 = (uint64_t *)maskAlignedArray;

  while (loop--) {
    uint64_t *pSource8 = (uint64_t *)source;
    *pSource8 ^= *pMask8;
    source += 8;
  }

  //
  // Apply mask to remaining data.
  //
  uint8_t *pmaskAlignedArray = maskAlignedArray;
  length %= 8;

  while (length--) {
    *source++ ^= *pmaskAlignedArray++;
  }

  return NULL;
}

After:

#include <cstddef>
#include <cstdint>

struct Args0 {
  uint8_t *source;
  uint8_t *mask;
  uint8_t *destination;
  uint32_t offset;
  uint32_t length;
};

struct Args1 {
  uint8_t *source;
  size_t length;
  uint8_t *mask;
};

void *Mask(Args0 args0) {
  uint8_t *source = args0.source;
  uint8_t *mask = args0.mask;
  uint8_t *destination = args0.destination;
  uint32_t offset = args0.offset;
  uint32_t length = args0.length;

  destination += offset;
  uint32_t index = 0;

  //
  // Alignment preamble.
  //
  while (index < length && ((size_t)source % 8)) {
    *destination++ = *source++ ^ mask[index % 4];
    index++;
  }

  length -= index;
  if (!length) return NULL;

  //
  // Realign mask and convert to 64 bit.
  //
  uint8_t maskAlignedArray[8];

  for (uint8_t i = 0; i < 8; i++, index++) {
    maskAlignedArray[i] = mask[index % 4];
  }

  //
  // Apply 64 bit mask in 8 byte chunks.
  //
  uint32_t loop = length / 8;
  uint64_t mask8 = ((uint64_t *)maskAlignedArray)[0];
  uint64_t *pFrom8 = (uint64_t *)source;
  uint64_t *pTo8 = (uint64_t *)destination;

  for (uint32_t i = 0; i < loop; i++) pTo8[i] = pFrom8[i] ^ mask8;

  source += 8 * loop;
  destination += 8 * loop;

  //
  // Apply mask to remaining data.
  //
  length %= 8;

  for (uint32_t i = 0; i < length; i++) {
    destination[i] = source[i] ^ maskAlignedArray[i];
  }

  return NULL;
}

void *Unmask(Args1 args1) {
  uint8_t *source = args1.source;
  uint8_t *mask = args1.mask;
  size_t length = args1.length;

  uint32_t index = 0;

  //
  // Alignment preamble.
  //
  while (index < length && ((size_t)source % 8)) {
    *source++ ^= mask[index % 4];
    index++;
  }

  length -= index;
  if (!length) return NULL;

  //
  // Realign mask and convert to 64 bit.
  //
  uint8_t maskAlignedArray[8];

  for (uint8_t i = 0; i < 8; i++, index++) {
    maskAlignedArray[i] = mask[index % 4];
  }

  //
  // Apply 64 bit mask in 8 byte chunks.
  //
  uint32_t loop = length / 8;
  uint64_t mask8 = ((uint64_t *)maskAlignedArray)[0];
  uint64_t *pSource8 = (uint64_t *)source;

  for (uint32_t i = 0; i < loop; i++) pSource8[i] ^= mask8;

  source += 8 * loop;

  //
  // Apply mask to remaining data.
  //
  length %= 8;

  for (uint32_t i = 0; i < length; i++) {
    source[i] ^= maskAlignedArray[i];
  }

  return NULL;
}
 |
ChALkeR commented Nov 1, 2025 • edited
Loading Uh oh!
There was an error while loading. Please reload this page.
edited
Uh oh!
There was an error while loading. Please reload this page.
g++ on x86_64 with mov QWORD PTR [rsp-16],rcxtestr8d,r8dje .L71cmpr8d,1je .L77movedi,r8dmovqxmm1,rcxmovrdx,raxshredipunpcklqdqxmm1,xmm1salrdi,4addrdi,rax.L73:movdquxmm0, XMMWORD PTR [rdx]addrdx,16pxorxmm0,xmm1movups XMMWORD PTR [rdx-16],xmm0cmprdx,rdijne .L73testr8b,1je .L74 movabs rdx,34359738352andrdx,rsiaddrdx,rax.L72:xor QWORD PTR [rdx],rcx.L74:g++ on x86_64 with movr8, QWORD PTR [rsp-24]cmpesi,1je .L68movecx,esimovqxmm1,r8movrdx,raxshrecxpunpcklqdqxmm1,xmm1salrcx,4addrcx,rax.L64:movdquxmm0, XMMWORD PTR [rdx]addrdx,16pxorxmm0,xmm1movups XMMWORD PTR [rdx-16],xmm0cmprdx,rcxjne .L64test sil,1je .L62movedx,esiandedx,-2.L63:xor QWORD PTR [rax+rdx*8],r8.L62:The main loop is identical. |
ChALkeR commented Nov 1, 2025
clang++ on arm8-a with -O3 Before: b.hs .LBB1_26 tbz w9, #0, .LBB1_26mov x9, x8 b .LBB1_29.LBB1_26: ldr d0,[sp, #8]and x13, x11, #0xfffffffcadd x9, x8, x13,lsl #3sub w11, w11, w13add x8, x8, #16 dup v0.2d, v0.d[0]mov x14, x13.LBB1_27: ldp q1, q2,[x8, #-16] subs x14, x14, #4 eor v1.16b, v1.16b, v0.16b eor v2.16b, v2.16b, v0.16b stp q1, q2,[x8, #-16]add x8, x8, #32 b.ne .LBB1_27cmp x12, x13 b.eq .LBB1_30.LBB1_29: ldr x8,[sp, #8] ldr x12,[x9] subs w11, w11, #1 eor x8, x12, x8str x8,[x9], #8 b.ne .LBB1_29.LBB1_30:After: b.hs .LBB1_22mov x16, xzr b .LBB1_25.LBB1_22: lsr x16, x13, #3 dup v0.2d, x14add x17, x8, #16and x16, x16, #0xfffffffcmov x18, x16.LBB1_23: ldp q1, q2,[x17, #-16] subs x18, x18, #4 eor v1.16b, v1.16b, v0.16b eor v2.16b, v2.16b, v0.16b stp q1, q2,[x17, #-16]add x17, x17, #32 b.ne .LBB1_23cmp x15, x16 b.eq .LBB1_27.LBB1_25:add x17, x8, x16,lsl #3sub x15, x16, x15.LBB1_26: ldr x16,[x17] adds x15, x15, #1 eor x16, x16, x14str x16,[x17], #8 b.lo .LBB1_26.LBB1_27: |
This comment was marked as outdated.
This comment was marked as outdated.
lpinca commented Nov 14, 2025
Benchmark results on native Windows: |
lpinca commented Nov 26, 2025 • edited
Loading Uh oh!
There was an error while loading. Please reload this page.
edited
Uh oh!
There was an error while loading. Please reload this page.
@ChALkeR apart from #164 (comment) and #164 (comment), is this ready? |
lpinca commented Dec 10, 2025 • edited
Loading Uh oh!
There was an error while loading. Please reload this page.
edited
Uh oh!
There was an error while loading. Please reload this page.
@ChALkeR ping. |
Force-pushed 95371b9 to 557a7af (Compare). Merged 321fbe4 into websockets:master. Uh oh!
There was an error while loading. Please reload this page.
ChALkeR commented Dec 18, 2025
@lpinca sorry for missing this, thanks for fixing and merging! |
Loops with bodies that do not depend on previous iterations are easier for the compiler to optimize.
Tested on M3 (please recheck on other platforms)
Before:
After:
Has to be retested on something else.