algorithm - 1 つのオブザベーションに対して相手候補となるオブザベーションが複数存在しうる場合に、オブザベーションのペアを削除する方法

Question

現在のプロジェクトで、さまざまな条件に基づくマッチング処理を実装しなければならないことが何度かありました。まず、問題をもう少し詳しく説明します。

次のようなテーブル test があります:
key Value
1 10
1 -10
1 10
1 20
1 -10
1 10
2 10
2 -10

ここで、グループ (key の値で定義) 内で合計が 0 になるペアを取り除く、というルールを適用する必要があります。

期待される結果は次のようになります:
key Value
1 10
1 20

ソート順は関係ありません。

次のコードは解決策の一例です。my_id が 2 と 7 のオブザベーションを削除し、さらに amount が 10 である 3 つのオブザベーションのうち 2 つを削除します。

/* Sample input: one observation per line with its id, group alias and signed amount. */
data test;
  input my_id alias $ amount;
  datalines;
1 aaa 10
2 aaa -10
3 aaa 8000
4 aaa -16000
5 aaa 700
6 aaa 10
7 aaa -10
8 aaa 10
;
run;

/* Enumerate every candidate match as an ordered (a_id, b_id) pair: two rows
   that share an alias and whose amounts are exact negatives of each other.
   The condition is symmetric, so a cancelling pair is listed in both
   directions. NOTE(review): a row with amount 0 also pairs with itself. */
proc sql noprint;
  create table zwischen_erg as
    select lhs.my_id as a_id,
           rhs.my_id as b_id
    from test as lhs
         inner join test as rhs
           on lhs.alias = rhs.alias
              and lhs.amount = -rhs.amount;
quit;

/* Step 1 of selecting the ids to eliminate: number the partners of each a_id.
   After sorting by a_id then b_id, tmp_id1 is the 1-based position of b_id
   among all partners of a_id. */
proc sort data=zwischen_erg;
  by a_id b_id;
run;

data zwischen_erg1;
  set zwischen_erg;
  by a_id;

  retain tmp_id1;
  if first.a_id then tmp_id1 = 1;   /* first partner of this a_id */
  else tmp_id1 = tmp_id1 + 1;       /* next partner in b_id order */
run;


/* Step 2: the mirror image of the a_id numbering. After sorting by b_id then
   a_id, tmp_id2 is the 1-based position of a_id among all partners of b_id. */
proc sort data=zwischen_erg;
  by b_id a_id;
run;

data zwischen_erg2;
  set zwischen_erg;
  by b_id;

  retain tmp_id2;
  if first.b_id then tmp_id2 = 1;   /* first partner of this b_id */
  else tmp_id2 = tmp_id2 + 1;       /* next partner in a_id order */
run;

/* Collect the ids of the rows to delete.
   Both inputs hold the same (a_id, b_id) pairs; erg1 adds the position of
   b_id among a_id's partners (tmp_id1) and erg2 the position of a_id among
   b_id's partners (tmp_id2). A pair is chosen when the two positions agree,
   i.e. the k-th row carrying an amount is matched with the k-th row carrying
   the opposite amount. Because zwischen_erg lists every pair in both
   directions, both members of a chosen pair end up in delete_ids.
   Columns are qualified with the table aliases declared in the FROM clause;
   the un-aliased table name is not a valid qualifier once an alias is given.
   An inner join is used because the WHERE clause already discards rows
   without a match in erg2. */
proc sql;
  create table delete_ids as
  select erg1.a_id as my_id
  from zwischen_erg1 as erg1 inner join
       zwischen_erg2 as erg2 on
                   (erg1.a_id = erg2.a_id and
                    erg1.b_id = erg2.b_id)
  where erg1.tmp_id1 = erg2.tmp_id2
;
quit;

/* Anti-join: keep only the rows of test whose my_id does not occur in delete_ids. */
proc sql noprint;
  create table erg as
    select src.*
    from test as src
         left join delete_ids as del
           on src.my_id = del.my_id
    where del.my_id = .;   /* no match in delete_ids -> the row survives */
quit;

このアルゴリズムは正しく動作しているようです。少なくとも、誤った結果になる入力データは今のところ見つかっていません。しかし、なぜうまく動くのかを説明できる人は誰もいませんでした。

それで、いくつか質問があります。

このアルゴリズムは、あらゆる入力データの組み合わせに対して正しくペアを削除しますか?
正しく動作するのであれば、このアルゴリズムは具体的にどのような仕組みで動いているのでしょうか? 特に tmp_id1 = tmp_id2 の部分がわかりません。
対応するペアを削除する、より良いアルゴリズムはありますか?

よろしくお願いします。Happy coding!
マイケル

score 1 · Accepted Answer

3 番目の質問への回答です。次のアプローチのほうが私には単純に思えますし、結合を使わないので、おそらくパフォーマンスも良いはずです。

/* Per key and absolute value, net the signs of all occurrences:
   balance > 0 means that many more positive than negative rows,
   balance < 0 that many more negative than positive rows. */
proc sql;
    create view V_INTERMEDIATE_VIEW as
        select
            key,
            abs(Value)       as Value_abs,
            sum(sign(Value)) as balance
        from INPUT_DATA
        group by key, Value_abs;
quit;

*The balance variable here means how many times more often did we see the positive than the negative of this value. I.e., how many of either the positive or the negative were we not able to eliminate;

/* Re-expand the netted view: write each value that could not be cancelled
   once per unit of |balance|. */
data OUTPUT_DATA (keep=key Value);
    set V_INTERMEDIATE_VIEW;

    /* restore the sign of whichever side was in excess */
    Value = sign(balance) * Value_abs;

    n_copies = abs(balance);
    do copy_no = 1 to n_copies;
        output;
    end;
run;

proc sql を使わず、純粋な SAS (データステップ) だけで書きたい場合は、次のようにできます。考え方自体は同じであることに注意してください。

/* DATA step view that adds the absolute value of each row as a grouping variable. */
data V_INTERMEDIATE_VIEW / view=V_INTERMEDIATE_VIEW;
    set INPUT_DATA;
    value_abs = abs(value);
run;

/* Materialise the view ordered by key and absolute value, so that a value and
   its negative fall into the same BY group. The sort key does not include the
   signed value, so the order of positives and negatives inside a group is not
   determined here. */
proc sort data=V_INTERMEDIATE_VIEW out=INTERMEDIATE_DATA;
    by key value_abs;
run;

/* For each (key, value_abs) group, accumulate the net sign count and, on the
   group's last row, write the surplus value once per unit of |balance|. */
data OUTPUT_DATA (keep=key value);
    set INTERMEDIATE_DATA;
    by key value_abs;

    /* start a fresh running balance for every (key, value_abs) group */
    if first.value_abs then balance = 0;
    balance + sign(value);   /* sum statement: retained across rows */

    /* nothing is written until the whole group has been read */
    if last.value_abs;

    value = sign(balance) * value_abs;   /* sign of the side in excess */
    do copy_no = 1 to abs(balance);
        output;
    end;
run;

注: 有用なパフォーマンスの提案をしてくれた Joe に感謝します。

score 0 · Accepted Answer

I don't see any bugs after a quick read. But "zwischen_erg" could have a lot of unnecessary many-to-many matches which would be inefficient.

This seems to work (but is not guaranteed), and might be more efficient. It is also shorter, so perhaps it is easier to see what's going on.

/* Sample input: one observation per line with its id, group alias and signed amount. */
data test;
    input my_id alias $ amount;
    datalines;
1 aaa 10
2 aaa -10
3 aaa 8000
4 aaa -16000
5 aaa 700
6 aaa 10
7 aaa -10
8 aaa 10
;
run;

/* Order the rows so that equal amounts within an alias are adjacent, then
   number them: occurrence is the 1-based running count of a given amount
   inside its alias. */
proc sort data=test;
    by alias amount;
run;

data zwischen_erg;
    set test;
    by alias amount;

    retain occurrence;
    if first.amount then occurrence = 1;    /* first row with this amount */
    else occurrence = occurrence + 1;       /* another row with the same amount */
run;

/* Keep the rows that have no cancelling partner.
   The k-th occurrence of an amount is paired with the k-th occurrence of the
   opposite amount; a row survives when no such partner exists (anti-join via
   LEFT JOIN ... IS MISSING).
   The join also requires the same alias: occurrence is counted per alias, and
   pairs may only cancel inside one group, so rows from different aliases must
   never be matched with each other. */
proc sql;
    create table zwischen as
    select
        a.my_id,
        a.alias,
        a.amount
    from zwischen_erg as a
    left join zwischen_erg as b
    on a.alias = b.alias
       and a.amount = (-1)*b.amount
       and a.occurrence = b.occurrence
    where b.my_id is missing;
quit;

algorithm - 1 つのオブザベーションに対して相手候補となるオブザベーションが複数存在しうる場合に、オブザベーションのペアを削除する方法

回答 2 件

Related

Reference