for-loop - 逆反復、for ループと while ループのどちらが速いですか?

Question

私はmemmoveRust で標準関数を実装しようとしていますが、下向きの反復ではどの方法がより高速であるか疑問に思っていました (ここでsrc< dest):

for i in (0..n).rev() {
    //Do copying
}

また

let mut i = n;
while i != 0 {
    i -= 1;
    // Do copying
}

rev()ループ内のforバージョンでは大幅に遅くなりますか?

score 7 · Accepted Answer

TL;DR:forループを使用します。

どちらも同じように高速である必要があります。forループに含まれる抽象化の層を非常に簡単に取り除くコンパイラの機能を確認できます。

#[inline(never)]
fn blackhole() {}

#[inline(never)]
fn with_for(n: usize) {
    for i in (0..n).rev() { blackhole(); }
}

#[inline(never)]
fn with_while(n: usize) {
    let mut i = n;
    while i > 0 {
        blackhole();
        i -= 1;
    }
}

これにより、次の LLVM IR が生成されます。

; Function Attrs: noinline nounwind readnone uwtable
define internal void @_ZN8with_for20h645c385965fcce1fhaaE(i64) unnamed_addr #0 {
entry-block:
  ret void
}

; Function Attrs: noinline nounwind readnone uwtable
define internal void @_ZN10with_while20hc09c3331764a9434yaaE(i64) unnamed_addr #0 {
entry-block:
  ret void
}

LLVM に精通していなくても、両方の関数が同じ IR にコンパイルされたことは明らかです (したがって、明らかに同じアセンブリに)。

それらのパフォーマンスは同じであるため、より明示的なループを優先し、反復が不規則な場合にループforを予約する必要があります。while

編集：スターブルーの不適格の懸念に対処する.

#[link(name = "snappy")]
extern {
    fn blackhole(i: libc::c_int) -> libc::c_int;
}

#[inline(never)]
fn with_for(n: i32) {
    for i in (0..n).rev() { unsafe { blackhole(i as libc::c_int); } }
}

#[inline(never)]
fn with_while(n: i32) {
    let mut i = n;
    while i > 0 {
        unsafe { blackhole(i as libc::c_int); }
        i -= 1;
    }
}

コンパイルすると次のようになります。

; Function Attrs: noinline nounwind uwtable
define internal void @_ZN8with_for20h7cf06f33e247fa35maaE(i32) unnamed_addr #1 {
entry-block:
  %1 = icmp sgt i32 %0, 0
  br i1 %1, label %match_case.preheader, label %clean_ast_95_

match_case.preheader:                             ; preds = %entry-block
  br label %match_case

match_case:                                       ; preds = %match_case.preheader, %match_case
  %.in = phi i32 [ %2, %match_case ], [ %0, %match_case.preheader ]
  %2 = add i32 %.in, -1
  %3 = tail call i32 @blackhole(i32 %2)
  %4 = icmp sgt i32 %2, 0
  br i1 %4, label %match_case, label %clean_ast_95_.loopexit

clean_ast_95_.loopexit:                           ; preds = %match_case
  br label %clean_ast_95_

clean_ast_95_:                                    ; preds = %clean_ast_95_.loopexit, %entry-block
  ret void
}

; Function Attrs: noinline nounwind uwtable
define internal void @_ZN10with_while20hee8edd624cfe9293IaaE(i32) unnamed_addr #1 {
entry-block:
  %1 = icmp sgt i32 %0, 0
  br i1 %1, label %while_body.preheader, label %while_exit

while_body.preheader:                             ; preds = %entry-block
  br label %while_body

while_exit.loopexit:                              ; preds = %while_body
  br label %while_exit

while_exit:                                       ; preds = %while_exit.loopexit, %entry-block
  ret void

while_body:                                       ; preds = %while_body.preheader, %while_body
  %i.05 = phi i32 [ %3, %while_body ], [ %0, %while_body.preheader ]
  %2 = tail call i32 @blackhole(i32 %i.05)
  %3 = add nsw i32 %i.05, -1
  %4 = icmp sgt i32 %i.05, 1
  br i1 %4, label %while_body, label %while_exit.loopexit
}

コアループは次のとおりです。

; -- for loop
match_case:                                       ; preds = %match_case.preheader, %match_case
  %.in = phi i32 [ %2, %match_case ], [ %0, %match_case.preheader ]
  %2 = add i32 %.in, -1
  %3 = tail call i32 @blackhole(i32 %2)
  %4 = icmp sgt i32 %2, 0
  br i1 %4, label %match_case, label %clean_ast_95_.loopexit

; -- while loop
while_body:                                       ; preds = %while_body.preheader, %while_body
  %i.05 = phi i32 [ %3, %while_body ], [ %0, %while_body.preheader ]
  %2 = tail call i32 @blackhole(i32 %i.05)
  %3 = add nsw i32 %i.05, -1
  %4 = icmp sgt i32 %i.05, 1
  br i1 %4, label %while_body, label %while_exit.loopexit

唯一の違いは次のとおりです。

for デクリメントは呼び出す前にblackhole、while は呼び出し後にデクリメントします
for は 0 と比較し、while は 1 と比較します

それ以外の場合は、同じコアループです。

score 1 · Accepted Answer

smallNの場合、実際には問題になりません。

Rust はイテレータを怠ります。実際に要素を要求する0..nまで、評価は行われません。最初に最後の要素を要求します。私の知る限り、Rust カウンターの反復子は賢く、最初の要素を生成してth を取得する必要はありません。この特定のケースでは、メソッドはおそらくさらに高速です。rev()N-1Nrev

一般的なケースでは、イテレータのアクセスパラダイムとアクセス時間の種類によって異なります。最後にアクセスするのに一定の時間がかかることを確認してください。違いはありません。

すべてのベンチマークの質問と同様に、状況によって異なります。N自分の価値を自分でテストしてください！

時期尚早の最適化も悪いので、あなたNが小さく、あなたのループがあまり頻繁に行われなくても...心配しないでください。

for-loop - 逆反復、for ループと while ループのどちらが速いですか?

3 に答える 3

Related

Reference