a couple trait method call optimizations #9332

eugals · 2013-09-19T17:55:20Z

It is intended to optimize/beautify the code generated in a few trivial trait operations.
Let's take the following code as an example:

trait Stuff {
    fn bar(&self);
}

fn callBar(s: &Stuff) {
    s.bar();
}

struct Foo;

impl Stuff for Foo {
    fn bar(&self) {
    }
}

pub fn main() {
    let o = Foo;
    callBar(&o as &Stuff);
}

At present it is translated into something like:

define void @_ZN7callBar_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }*, { %tydesc*, i8* }*) #4 {
"function top level":
  %__trait_callee = alloca { %tydesc*, i8* }
  %__auto_borrow_obj = alloca { %tydesc*, i8* }
  %2 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 0
  %3 = load %tydesc** %2
  %4 = getelementptr inbounds { %tydesc*, i8* }* %__auto_borrow_obj, i32 0, i32 0
  store %tydesc* %3, %tydesc** %4
  %5 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 1
  %6 = load i8** %5
  %7 = getelementptr inbounds { %tydesc*, i8* }* %__auto_borrow_obj, i32 0, i32 1
  store i8* %6, i8** %7
  %8 = bitcast { %tydesc*, i8* }* %__auto_borrow_obj to i8*
  %9 = bitcast { %tydesc*, i8* }* %__trait_callee to i8*
  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %9, i8* %8, i32 8, i32 4, i1 false)
  %10 = getelementptr inbounds { %tydesc*, i8* }* %__trait_callee, i32 0, i32 1
  %11 = load i8** %10
  %12 = bitcast i8* %11 to { i32, %tydesc*, i8*, i8*, i8 }*
  %13 = getelementptr inbounds { %tydesc*, i8* }* %__trait_callee, i32 0, i32 0
  %14 = bitcast %tydesc** %13 to [1 x i8*]**
  %15 = load [1 x i8*]** %14
  %16 = getelementptr inbounds [1 x i8*]* %15, i32 0, i32 1
  %17 = load i8** %16
  %18 = bitcast i8* %17 to void ({ i32, %tydesc*, i8*, i8*, i8 }*)*
  call void %18({ i32, %tydesc*, i8*, i8*, i8 }* %12)
  ret void
}

...

define void @_ZN4main_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }*) #4 {
"function top level":
  %o = alloca %struct.Foo
  %1 = alloca { %tydesc*, i8* }
  %__auto_borrow_obj = alloca { %tydesc*, i8* }
  %2 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 1
  %3 = bitcast i8** %2 to %struct.Foo**
  store %struct.Foo* %o, %struct.Foo** %3
  %4 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 0
  %5 = bitcast %tydesc** %4 to { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }**
  store { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }* @vtable1081, { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }** %5
  %6 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 0
  %7 = load %tydesc** %6
  %8 = getelementptr inbounds { %tydesc*, i8* }* %__auto_borrow_obj, i32 0, i32 0
  store %tydesc* %7, %tydesc** %8
  %9 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 1
  %10 = load i8** %9
  %11 = getelementptr inbounds { %tydesc*, i8* }* %__auto_borrow_obj, i32 0, i32 1
  store i8* %10, i8** %11
  call void @_ZN7callBar_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }* undef, { %tydesc*, i8* }* %__auto_borrow_obj)
  ret void
}

If you apply my patch, it would become way shorter and cleaner:

define void @_ZN7callBar_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }*, { %tydesc*, i8* }*) #4 {
"function top level":
  %2 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 1
  %3 = load i8** %2
  %4 = bitcast i8* %3 to { i32, %tydesc*, i8*, i8*, i8 }*
  %5 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 0
  %6 = bitcast %tydesc** %5 to [1 x i8*]**
  %7 = load [1 x i8*]** %6
  %8 = getelementptr inbounds [1 x i8*]* %7, i32 0, i32 1
  %9 = load i8** %8
  %10 = bitcast i8* %9 to void ({ i32, %tydesc*, i8*, i8*, i8 }*)*
  call void %10({ i32, %tydesc*, i8*, i8*, i8 }* %4)
  ret void
}

...

define void @_ZN4main_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }*) #4 {
"function top level":
  %o = alloca %struct.Foo
  %1 = alloca { %tydesc*, i8* }
  %2 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 1
  %3 = bitcast i8** %2 to %struct.Foo**
  store %struct.Foo* %o, %struct.Foo** %3
  %4 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 0
  %5 = bitcast %tydesc** %4 to { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }**
  store { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }* @vtable1081, { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }** %5
  call void @_ZN7callBar_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }* undef, { %tydesc*, i8* }* %1)
  ret void
}

Although this change doesn't increase the compilation speed much (I mentioned only about 1-2% boost on "rustc -O -Z time-passes syntax.rs"), but I still think it's a good thing to do as it greatly simplifies/clarifies LL generated in some cases which would definitely help in the future code generation investigations.

I don't provide any new test cases in this patch as it is merely an optimization.

Sorry guys, I somehow messed my previous PR and I don't see any better way to fix as to recreate it here.

…e trivial cases where simple copying can be applied

…rrowed ref

eugals · 2013-09-19T18:05:04Z

Sorry for the mess with my last PR. It seems I didn't understand that GitHub fork magic well enough when I did my last "git merge upstream/master" :(

This one contains the same set of commits plus I've added a line of code (#[fixed_stack_segment];) to run-pass/core-run-destroy.rs.

alexcrichton · 2013-09-19T18:05:07Z

Looks like this is still carrying over some stray commits, could you squash/rebase to have just your changes?

eugals · 2013-09-19T18:33:21Z

@alexcrichton done

It is intended to optimize/beautify the code generated in a few trivial trait operations. Let's take the following code as an example: ``` trait Stuff { fn bar(&self); } fn callBar(s: &Stuff) { s.bar(); } struct Foo; impl Stuff for Foo { fn bar(&self) { } } pub fn main() { let o = Foo; callBar(&o as &Stuff); } ``` At present it is translated into something like: ``` define void @_ZN7callBar_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }*, { %tydesc*, i8* }*) #4 { "function top level": %__trait_callee = alloca { %tydesc*, i8* } %__auto_borrow_obj = alloca { %tydesc*, i8* } %2 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 0 %3 = load %tydesc** %2 %4 = getelementptr inbounds { %tydesc*, i8* }* %__auto_borrow_obj, i32 0, i32 0 store %tydesc* %3, %tydesc** %4 %5 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 1 %6 = load i8** %5 %7 = getelementptr inbounds { %tydesc*, i8* }* %__auto_borrow_obj, i32 0, i32 1 store i8* %6, i8** %7 %8 = bitcast { %tydesc*, i8* }* %__auto_borrow_obj to i8* %9 = bitcast { %tydesc*, i8* }* %__trait_callee to i8* call void @llvm.memcpy.p0i8.p0i8.i32(i8* %9, i8* %8, i32 8, i32 4, i1 false) %10 = getelementptr inbounds { %tydesc*, i8* }* %__trait_callee, i32 0, i32 1 %11 = load i8** %10 %12 = bitcast i8* %11 to { i32, %tydesc*, i8*, i8*, i8 }* %13 = getelementptr inbounds { %tydesc*, i8* }* %__trait_callee, i32 0, i32 0 %14 = bitcast %tydesc** %13 to [1 x i8*]** %15 = load [1 x i8*]** %14 %16 = getelementptr inbounds [1 x i8*]* %15, i32 0, i32 1 %17 = load i8** %16 %18 = bitcast i8* %17 to void ({ i32, %tydesc*, i8*, i8*, i8 }*)* call void %18({ i32, %tydesc*, i8*, i8*, i8 }* %12) ret void } ... define void @_ZN4main_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }*) #4 { "function top level": %o = alloca %struct.Foo %1 = alloca { %tydesc*, i8* } %__auto_borrow_obj = alloca { %tydesc*, i8* } %2 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 1 %3 = bitcast i8** %2 to %struct.Foo** store %struct.Foo* %o, %struct.Foo** %3 %4 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 0 %5 = bitcast %tydesc** %4 to { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }** store { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }* @vtable1081, { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }** %5 %6 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 0 %7 = load %tydesc** %6 %8 = getelementptr inbounds { %tydesc*, i8* }* %__auto_borrow_obj, i32 0, i32 0 store %tydesc* %7, %tydesc** %8 %9 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 1 %10 = load i8** %9 %11 = getelementptr inbounds { %tydesc*, i8* }* %__auto_borrow_obj, i32 0, i32 1 store i8* %10, i8** %11 call void @_ZN7callBar_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }* undef, { %tydesc*, i8* }* %__auto_borrow_obj) ret void } ``` If you apply my patch, it would become way shorter and cleaner: ``` define void @_ZN7callBar_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }*, { %tydesc*, i8* }*) #4 { "function top level": %2 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 1 %3 = load i8** %2 %4 = bitcast i8* %3 to { i32, %tydesc*, i8*, i8*, i8 }* %5 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 0 %6 = bitcast %tydesc** %5 to [1 x i8*]** %7 = load [1 x i8*]** %6 %8 = getelementptr inbounds [1 x i8*]* %7, i32 0, i32 1 %9 = load i8** %8 %10 = bitcast i8* %9 to void ({ i32, %tydesc*, i8*, i8*, i8 }*)* call void %10({ i32, %tydesc*, i8*, i8*, i8 }* %4) ret void } ... define void @_ZN4main_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }*) #4 { "function top level": %o = alloca %struct.Foo %1 = alloca { %tydesc*, i8* } %2 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 1 %3 = bitcast i8** %2 to %struct.Foo** store %struct.Foo* %o, %struct.Foo** %3 %4 = getelementptr inbounds { %tydesc*, i8* }* %1, i32 0, i32 0 %5 = bitcast %tydesc** %4 to { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }** store { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }* @vtable1081, { %tydesc*, void ({ i32, %tydesc*, i8*, i8*, i8 }*)* }** %5 call void @_ZN7callBar_UUID.0E({ i32, %tydesc*, i8*, i8*, i8 }* undef, { %tydesc*, i8* }* %1) ret void } ``` Although this change doesn't increase the compilation speed much (I mentioned only about 1-2% boost on "rustc -O -Z time-passes syntax.rs"), but I still think it's a good thing to do as it greatly simplifies/clarifies LL generated in some cases which would definitely help in the future code generation investigations. I don't provide any new test cases in this patch as it is merely an optimization. Sorry guys, I somehow messed my previous PR and I don't see any better way to fix as to recreate it here.

U-NOV2010\eugals added 4 commits September 19, 2013 18:34

optimized trans_to_datum::auto_borrow_obj code generation in case som…

2927ab1

…e trivial cases where simple copying can be applied

minor Type::opaque_trait code cleanup

dfa3f5f

will not copy trait_callee on stack if it's source expr is a plain bo…

0c3b6ad

…rrowed ref

pacified test/run-pass/core-run-destroy on Win7x64

fadc6cc

bors closed this Sep 20, 2013

bors merged commit fadc6cc into rust-lang:master Sep 20, 2013

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

a couple trait method call optimizations #9332

a couple trait method call optimizations #9332

eugals commented Sep 19, 2013

eugals commented Sep 19, 2013

alexcrichton commented Sep 19, 2013

eugals commented Sep 19, 2013

a couple trait method call optimizations #9332

a couple trait method call optimizations #9332

Conversation

eugals commented Sep 19, 2013

eugals commented Sep 19, 2013

alexcrichton commented Sep 19, 2013

eugals commented Sep 19, 2013