ExactSizeIterator seems to generate worse assembly if mutated before collected into Vec #110734

Open
ZhennanWu opened this issue Apr 23, 2023 · 5 comments
Labels: A-codegen, C-bug, C-optimization, I-heavy

ZhennanWu commented Apr 23, 2023

Godbolt link https://godbolt.org/z/cMdx6v1G9

pub fn exact_size() {
    let it = std::hint::black_box(vec![1, 2, 3].into_iter());
    let v = it.collect::<Vec<_>>();
    std::hint::black_box(v);
}

pub fn var_size() {
    // The filter is a no-op, but it erases the exact-size information.
    let it = std::hint::black_box(vec![1, 2, 3].into_iter()).filter(|_| true);
    let v = it.collect::<Vec<_>>();
    std::hint::black_box(v);
}
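
For reference, one way to inspect the same assembly locally (assuming the snippet is saved as example.rs and a recent stable toolchain; the Godbolt link above also builds with optimizations on):

rustc -O --crate-type=lib --emit=asm example.rs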

I expected to see this happen: exact_size should generate better assembly than var_size.

Instead, this happened:

  1. var_size generates much shorter assembly (~90 lines) than exact_size (>200 lines), with zero calls to allocate the Vec.
  2. It seems var_size allocates the Vec on the stack, an optimization that does not happen for exact_size (excuse me if I misinterpreted the assembly).

The key to triggering this deoptimization seems to be mutating the iterator before collecting. This example is a reduction of real-world code where the first few elements are processed differently and the rest of the elements are collected into a Vec and then consumed locally.

Godbolt link for a more realistic example: https://godbolt.org/z/sccdTcvh6

#[inline(never)]
pub fn example<I: Iterator>(mut it: I) {
    // ... read and process a few elements
    let Some(_first_elem) = it.next() else { return };
    // let mut it = it.filter(|_| true); // <------ This causes different optimization if ExactSizeIterator
    let v = it.collect::<Vec<_>>();
    for _elem in v {
        // Different processing on the rest of the elements.
    }
}

pub fn call_example() {
    example(vec![1, 2, 3].into_iter())
}

Edit: included a more realistic example.

ZhennanWu added the C-bug label on Apr 23, 2023
ZhennanWu changed the title from "ExactSizeIterator seems to generate worse assembly when collected into Vec" to "ExactSizeIterator seems to generate worse assembly if mutated before collected into Vec" on Apr 23, 2023
lukas-code (Member) commented

The offending non-optimization is here:

impl<T> SpecFromIter<T, IntoIter<T>> for Vec<T> {
    fn from_iter(iterator: IntoIter<T>) -> Self {
        // A common case is passing a vector into a function which immediately
        // re-collects into a vector. We can short circuit this if the IntoIter
        // has not been advanced at all.
        // When it has been advanced We can also reuse the memory and move the data to the front.
        // But we only do so when the resulting Vec wouldn't have more unused capacity
        // than creating it through the generic FromIterator implementation would. That limitation
        // is not strictly necessary as Vec's allocation behavior is intentionally unspecified.
        // But it is a conservative choice.
        let has_advanced = iterator.buf.as_ptr() as *const _ != iterator.ptr;
        if !has_advanced || iterator.len() >= iterator.cap / 2 {
            unsafe {
                let it = ManuallyDrop::new(iterator);
                if has_advanced {
                    ptr::copy(it.ptr, it.buf.as_ptr(), it.len());
                }
                return Vec::from_raw_parts(it.buf.as_ptr(), it.len(), it.cap);
            }
        }
        let mut vec = Vec::new();
        // must delegate to spec_extend() since extend() itself delegates
        // to spec_from for empty Vecs
        vec.spec_extend(iterator);
        vec
    }
}

If I just remove the if-condition and unconditionally reuse the IntoIter's buffer, the generated assembly for exact_size and var_size looks similar:

big asm
rust_test::exact_size:

    push r15
    push r14
    push rbx
    sub rsp, 64

    mov edi, 12
    mov esi, 4
    call qword ptr [rip + __rust_alloc@GOTPCREL]

    test rax, rax
    je .LBB0_6

    movabs rcx, 8589934593
    mov qword ptr [rax], rcx
    mov dword ptr [rax + 8], 3

    mov rcx, rax
    add rcx, 12

    mov qword ptr [rsp + 8], 3
    mov qword ptr [rsp + 16], rax
    mov qword ptr [rsp + 24], rcx
    mov qword ptr [rsp + 32], rax
    lea rax, [rsp + 8]

    #APP
    #NO_APP

    mov r15, qword ptr [rsp + 8]

    mov rsi, qword ptr [rsp + 16]

    mov rbx, qword ptr [rsp + 24]

    mov r14, qword ptr [rsp + 32]

    sub rbx, rsi

    cmp rsi, r14

    je .LBB0_3

    mov rdi, r14
    mov rdx, rbx
    call qword ptr [rip + memmove@GOTPCREL]

.LBB0_3:
    shr rbx, 2

    mov qword ptr [rsp + 40], r15
    mov qword ptr [rsp + 48], r14
    mov qword ptr [rsp + 56], rbx
    lea rax, [rsp + 40]
    #APP
    #NO_APP

    mov rsi, qword ptr [rsp + 40]

    test rsi, rsi
    je .LBB0_5

    mov rdi, qword ptr [rsp + 48]

    shl rsi, 2

    mov edx, 4
    call qword ptr [rip + __rust_dealloc@GOTPCREL]

.LBB0_5:
    add rsp, 64
    pop rbx
    pop r14
    pop r15
    ret
.LBB0_6:

    mov edi, 12
    mov esi, 4
    call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
    ud2
rust_test::var_size:

    push rbx
    sub rsp, 32

    mov edi, 12
    mov esi, 4
    call qword ptr [rip + __rust_alloc@GOTPCREL]

    test rax, rax
    je .LBB1_12

    movabs rcx, 8589934593
    mov qword ptr [rax], rcx
    mov dword ptr [rax + 8], 3

    mov rcx, rax
    add rcx, 12

    mov qword ptr [rsp], 3
    mov qword ptr [rsp + 8], rax
    mov qword ptr [rsp + 16], rcx
    mov qword ptr [rsp + 24], rax
    mov rax, rsp

    #APP
    #NO_APP

    mov rcx, qword ptr [rsp]

    mov r8, qword ptr [rsp + 8]

    mov rsi, qword ptr [rsp + 16]

    mov rdx, qword ptr [rsp + 24]

    mov rdi, rdx

    cmp r8, rsi
    je .LBB1_9

    mov r10, rsi
    sub r10, r8
    add r10, -4
    cmp r10, 28
    jb .LBB1_3

    mov rdi, rdx

    sub rdi, r8
    cmp rdi, 32
    jb .LBB1_3

    shr r10, 2
    inc r10
    mov r11, r10
    and r11, -8
    lea rdi, [rdx + 4*r11]
    lea r9, [r8 + 4*r11]
    xor ebx, ebx

.LBB1_6:
    movups xmm0, xmmword ptr [r8 + 4*rbx]
    movups xmm1, xmmword ptr [r8 + 4*rbx + 16]

    movups xmmword ptr [rdx + 4*rbx], xmm0
    movups xmmword ptr [rdx + 4*rbx + 16], xmm1
    add rbx, 8
    cmp r11, rbx
    jne .LBB1_6

    cmp r10, r11
    jne .LBB1_8
    jmp .LBB1_9

.LBB1_3:
    mov rdi, rdx
    mov r9, r8

.LBB1_8:
    mov r8d, dword ptr [r9]

    add r9, 4

    mov dword ptr [rdi], r8d

    add rdi, 4

    cmp r9, rsi
    jne .LBB1_8

.LBB1_9:
    sub rdi, rdx

    shr rdi, 2

    mov qword ptr [rsp], rcx
    mov qword ptr [rsp + 8], rdx
    mov qword ptr [rsp + 16], rdi
    #APP
    #NO_APP

    mov rsi, qword ptr [rsp]

    test rsi, rsi
    je .LBB1_11

    mov rdi, qword ptr [rsp + 8]

    shl rsi, 2

    mov edx, 4

    call qword ptr [rip + __rust_dealloc@GOTPCREL]

.LBB1_11:
    add rsp, 32
    pop rbx
    ret
.LBB1_12:

    mov edi, 12
    mov esi, 4
    call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
    ud2

the8472 (Member) commented Apr 23, 2023

We have many different code paths for the impl FromIterator for Vec (these are implementation details):

/// Specialization trait used for Vec::from_iter
///
/// ## The delegation graph:
///
/// ```text
/// +-------------+
/// |FromIterator |
/// +-+-----------+
///   |
///   v
/// +-+-------------------------------+  +---------------------+
/// |SpecFromIter                  +---->+SpecFromIterNested   |
/// |where I:                      |  |  |where I:             |
/// |  Iterator (default)----------+  |  |  Iterator (default) |
/// |  vec::IntoIter               |  |  |  TrustedLen         |
/// |  SourceIterMarker---fallback-+  |  +---------------------+
/// +---------------------------------+
/// ```

So differences in assembly output are to be expected.

In particular, when collecting from a vec::IntoIter into a Vec, it may or may not reuse the allocation of the source, depending on how much excess capacity that would cause (edit: the previous commenter already linked to the relevant part).
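
For illustration, here is a minimal check of the reuse path (this asserts an implementation detail of the current standard library, not a guaranteed behavior, since Vec's allocation behavior is intentionally unspecified):

fn main() {
    let v = vec![1, 2, 3];
    let ptr_before = v.as_ptr();
    // The IntoIter has not been advanced, so collect() takes the
    // short-circuit path quoted above and reuses the allocation.
    let v2: Vec<i32> = v.into_iter().collect();
    assert_eq!(ptr_before, v2.as_ptr());
}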

Have you encountered a measurable performance problem due to those differences?

ZhennanWu (Author) commented

Thanks for pointing the way. It helps tremendously.

Have you encountered a measurable performance problem due to those differences?

Not yet. I discovered this while prototyping some iterable abstractions. I think I can now write an unsafe workaround in my library code if it turns out to be really necessary. You can backlog or close this issue.
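
One hypothetical safe alternative to such a workaround (illustrative only, not code from this issue): operate on the source Vec directly and shift the unprocessed tail to the front with drain, which keeps the original allocation:

pub fn example(mut v: Vec<i32>) {
    // Read and process the first element in place.
    let Some(&_first_elem) = v.first() else { return };
    // Remove the processed prefix; the remaining elements move to the front
    // of the same buffer, so no new allocation is needed.
    v.drain(..1);
    for _elem in v {
        // Different processing on the rest of the elements.
    }
}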

ZhennanWu (Author) commented Apr 25, 2023

I've gone through the specialization logic and the generated assembly. It turns out to be a minor problem. Here is a summary:

Cause

  1. The fundamental cause of this problem is the difference in how aggressively the two specializations reuse the source vector: the in-place iteration specialization reuses it unconditionally when the element type is compatible, while the IntoIter specialization reuses it only when the iterator has not been advanced, or when the remaining elements would fill at least half of the capacity.
  2. The extra calls and instructions in exact_size come from the fallback (non-reusing) path of the IntoIter specialization.

Impact

This turns out to be a very artificial issue and has minimal performance impact.

  1. The iterator being collected must be the IntoIter produced directly by the source Vec, without any transformations. This makes the situation very hard to encounter in real life; it is almost purely artificial.
  2. Even if we have advanced the iterator, we are still likely to pass the capacity check, so most of the time the fallback path is not taken and the bloated parts of the assembly are never reached (see the demonstration after this list):
    if !has_advanced || iterator.len() >= iterator.cap / 2 {
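
A small demonstration of point 2 (again asserting unspecified allocation behavior of the current implementation, so this is illustrative only):

fn main() {
    let v = vec![1, 2, 3, 4];
    let ptr_before = v.as_ptr();
    let mut it = v.into_iter();
    it.next(); // advanced: 3 of the 4-element capacity remain
    // 3 >= 4 / 2, so the capacity check passes; the remaining elements are
    // moved to the front and the original buffer is reused.
    let rest: Vec<i32> = it.collect();
    assert_eq!(ptr_before, rest.as_ptr());
}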

Optional Fix

We can equalize how aggressively the two specializations reuse the source vector. Both share the same space-efficiency concern of producing sparsely populated Vecs, yet the in-place iteration specialization opted for the more aggressive reuse, whether or not it knows the final length (the TrustedRandomAccessNoCoerce specialization knows it; the others do not).

The suggested fix is to unconditionally reuse the source Vec everywhere. (The non-TrustedRandomAccessNoCoerce specialization can only make an unconditional decision either way, and unconditionally allocating seems worse.)
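
A minimal sketch of that change, based on the snippet lukas-code quoted earlier in this thread (this is standard-library-internal code, so it is not compilable as-is and only illustrates the intent):

impl<T> SpecFromIter<T, IntoIter<T>> for Vec<T> {
    fn from_iter(iterator: IntoIter<T>) -> Self {
        // Unconditionally reuse the source buffer: move any remaining
        // elements to the front, then rebuild the Vec from the original
        // allocation, accepting the possible excess capacity.
        unsafe {
            let it = ManuallyDrop::new(iterator);
            if it.buf.as_ptr() as *const _ != it.ptr {
                ptr::copy(it.ptr, it.buf.as_ptr(), it.len());
            }
            Vec::from_raw_parts(it.buf.as_ptr(), it.len(), it.cap)
        }
    }
}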

Conclusion

We can either close this issue due to its minimal impact or take the optional fix. @lukas-code @the8472

Jules-Bertholet (Contributor) commented

@rustbot label +A-codegen +I-heavy

rustbot added the A-codegen and I-heavy labels on Apr 26, 2023
workingjubilee added the C-optimization label on Oct 8, 2023