r - Rcpp と openMP を使用した切り捨て正規分布からの高速サンプリング

Question

アップデート：

Dirkの提案を実装しようとしました。コメント? 私は今、JSM で忙しいのですが、ギャラリー用の Rmd を編む前にフィードバックをもらいたいと思っています。何の価値もなかったので、Armadillo から通常の Rcpp に戻しました。R:: を使用したスカラーバージョンは非常に優れています。mean/sd が目的の出力長のベクトルとしてではなく、スカラーとして入力されている場合は、描画数のパラメーター n を入力する必要があります。

切り捨てられた正規分布からサンプルを抽出する必要がある MCMC アプリケーションはたくさんあります。TN の既存の実装を基に構築し、それに並列計算を追加しました。

問題：

さらなる速度向上の可能性を感じている人はいますか? ~~ベンチマークの最後のケースでは、rtruncnorm の方が速い場合があります。~~Rcpp の実装は常に既存のパッケージよりも高速ですが、さらに改善することはできますか?
共有できない複雑なモデル内で実行したところ、R セッションがクラッシュしました。ただし、体系的に再現することはできないため、コードの別の部分であった可能性があります。誰かが TN を使用している場合は、テストして私に知らせてください。更新: 更新されたコードで問題は発生していませんが、お知らせください。

まとめ方: 私の知る限り、最速の実装は CRAN ではありませんが、ソースコードはOSU statからダウンロードできます。私のベンチマークでは、競合するmsmとtruncormの実装は低速でした。秘訣は、提案分布を効率的に調整することです。指数関数は、切り捨てられた法線の末尾に対してうまく機能します。そこで、Chris のコードを「Rcpp」して、openMP のスパイスを加えました。ここでは動的スケジュールが最適です。境界に応じてサンプリングにかかる時間が増減する可能性があるからです。私が厄介だと思ったことの 1 つは、double を操作したいときに、多くの統計分布が NumericVector 型に基づいていることです。私はそれを回避する方法をコーディングしました。

Rcpp コードは次のとおりです。

#include <Rcpp.h>
#include <omp.h>


// norm_rs(a, b)
// generates a sample from a N(0,1) RV restricted to be in the interval
// (a,b) via rejection sampling.
// ======================================================================

// [[Rcpp::export]]

double norm_rs(double a, double b)
{
   double  x;
   x = Rf_rnorm(0.0, 1.0);
   while( (x < a) || (x > b) ) x = norm_rand();
   return x;
}

// half_norm_rs(a, b)
// generates a sample from a N(0,1) RV restricted to the interval
// (a,b) (with a > 0) using half normal rejection sampling.
// ======================================================================

// [[Rcpp::export]]

double half_norm_rs(double a, double b)
{
   double   x;
   x = fabs(norm_rand());
   while( (x<a) || (x>b) ) x = fabs(norm_rand());
   return x;
}

// unif_rs(a, b)
// generates a sample from a N(0,1) RV restricted to the interval
// (a,b) using uniform rejection sampling. 
// ======================================================================

// [[Rcpp::export]]

double unif_rs(double a, double b)
{
   double xstar, logphixstar, x, logu;

   // Find the argmax (b is always >= 0)
   // This works because we want to sample from N(0,1)
   if(a <= 0.0) xstar = 0.0;
   else xstar = a;
   logphixstar = R::dnorm(xstar, 0.0, 1.0, 1.0);

   x = R::runif(a, b);
   logu = log(R::runif(0.0, 1.0));
   while( logu > (R::dnorm(x, 0.0, 1.0,1.0) - logphixstar))
   {
      x = R::runif(a, b);
      logu = log(R::runif(0.0, 1.0));
   }
   return x;
}

// exp_rs(a, b)
// generates a sample from a N(0,1) RV restricted to the interval
// (a,b) using exponential rejection sampling.
// ======================================================================

// [[Rcpp::export]]

double exp_rs(double a, double b)
{
  double  z, u, rate;

//  Rprintf("in exp_rs");
  rate = 1/a;
//1/a

   // Generate a proposal on (0, b-a)
   z = R::rexp(rate);
   while(z > (b-a)) z = R::rexp(rate);
   u = R::runif(0.0, 1.0);

   while( log(u) > (-0.5*z*z))
   {
      z = R::rexp(rate);
      while(z > (b-a)) z = R::rexp(rate);
      u = R::runif(0.0,1.0);
   }
   return(z+a);
}




// rnorm_trunc( mu, sigma, lower, upper)
//
// generates one random normal RVs with mean 'mu' and standard
// deviation 'sigma', truncated to the interval (lower,upper), where
// lower can be -Inf and upper can be Inf.
//======================================================================

// [[Rcpp::export]]
double rnorm_trunc (double mu, double sigma, double lower, double upper)
{
int change;
 double a, b;
 double logt1 = log(0.150), logt2 = log(2.18), t3 = 0.725;
 double z, tmp, lograt;

 change = 0;
 a = (lower - mu)/sigma;
 b = (upper - mu)/sigma;

 // First scenario
 if( (a == R_NegInf) || (b == R_PosInf))
   {
     if(a == R_NegInf)
       {
     change = 1;
     a = -b;
     b = R_PosInf;
       }

     // The two possibilities for this scenario
     if(a <= 0.45) z = norm_rs(a, b);
     else z = exp_rs(a, b);
     if(change) z = -z;
   }
 // Second scenario
 else if((a * b) <= 0.0)
   {
     // The two possibilities for this scenario
     if((R::dnorm(a, 0.0, 1.0,1.0) <= logt1) || (R::dnorm(b, 0.0, 1.0, 1.0) <= logt1))
       {
     z = norm_rs(a, b);
       }
     else z = unif_rs(a,b);
   }
 // Third scenario
 else
   {
     if(b < 0)
       {
     tmp = b; b = -a; a = -tmp; change = 1;
       }

     lograt = R::dnorm(a, 0.0, 1.0, 1.0) - R::dnorm(b, 0.0, 1.0, 1.0);
     if(lograt <= logt2) z = unif_rs(a,b);
     else if((lograt > logt1) && (a < t3)) z = half_norm_rs(a,b);
     else z = exp_rs(a,b);
     if(change) z = -z;
   }
   double output;
   output = sigma*z + mu;
 return (output);
}


// rtnm( mu, sigma, lower, upper, cores)
//
// generates one random normal RVs with mean 'mu' and standard
// deviation 'sigma', truncated to the interval (lower,upper), where
// lower can be -Inf and upper can be Inf.
// mu, sigma, lower, upper are vectors, and vectorized calls of this function
// speed up computation
// cores is an intege, representing the number of cores to be used in parallel
//======================================================================


// [[Rcpp::export]]

Rcpp::NumericVector rtnm(Rcpp::NumericVector mus, Rcpp::NumericVector sigmas, Rcpp::NumericVector lower, Rcpp::NumericVector upper, int cores){
  omp_set_num_threads(cores);
  int nobs = mus.size();
  Rcpp::NumericVector out(nobs);
  double logt1 = log(0.150), logt2 = log(2.18), t3 = 0.725;
    double a,b, z, tmp, lograt;

     int  change;

  #pragma omp parallel for schedule(dynamic)   
  for(int i=0;i<nobs;i++) {  

     a = (lower(i) - mus(i))/sigmas(i);
     b = (upper(i) - mus(i))/sigmas(i);
     change=0;
     // First scenario
     if( (a == R_NegInf) || (b == R_PosInf))
       {
         if(a == R_NegInf)
           {
              change = 1;
              a = -b;
              b = R_PosInf;
           }

         // The two possibilities for this scenario
         if(a <= 0.45) z = norm_rs(a, b);
         else z = exp_rs(a, b);
         if(change) z = -z;
       }
     // Second scenario
     else if((a * b) <= 0.0)
       {
         // The two possibilities for this scenario
         if((R::dnorm(a, 0.0, 1.0,1.0) <= logt1) || (R::dnorm(b, 0.0, 1.0, 1.0) <= logt1))
           {
                z = norm_rs(a, b);
           }
         else z = unif_rs(a,b);
       }

     // Third scenario
     else
       {
         if(b < 0)
           {
                tmp = b; b = -a; a = -tmp; change = 1;
           }

         lograt = R::dnorm(a, 0.0, 1.0, 1.0) - R::dnorm(b, 0.0, 1.0, 1.0);
         if(lograt <= logt2) z = unif_rs(a,b);
         else if((lograt > logt1) && (a < t3)) z = half_norm_rs(a,b);
         else z = exp_rs(a,b);
         if(change) z = -z;
       }
    out(i)=sigmas(i)*z + mus(i);          
  }

return(out);
}

そして、ここにベンチマークがあります：

libs=c("truncnorm","msm","inline","Rcpp","RcppArmadillo","rbenchmark")
if( sum(!(libs %in% .packages(all.available = TRUE)))>0){ install.packages(libs[!(libs %in% .packages(all.available = TRUE))])}
for(i in 1:length(libs)) {library(libs[i],character.only = TRUE,quietly=TRUE)}


#needed for openMP parallel
Sys.setenv("PKG_CXXFLAGS"="-fopenmp")
Sys.setenv("PKG_LIBS"="-fopenmp")

#no of cores for openMP version
cores = 4

#surce code from same dir
Rcpp::sourceCpp('truncnorm.cpp')


#sample size
nn=1000000


bb= 100
aa=-100
benchmark( rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),cores), rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),1),rtnorm(nn,rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn)),rtruncnorm(nn, a=aa, b=100, mean = 0, sd = 1) , order="relative", replications=3    )[,1:4]

aa=0 
benchmark( rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),cores), rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),1),rtnorm(nn,rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn)),rtruncnorm(nn, a=aa, b=100, mean = 0, sd = 1) , order="relative", replications=3    )[,1:4]

aa=2
benchmark( rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),cores), rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),1),rtnorm(nn,rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn)),rtruncnorm(nn, a=aa, b=100, mean = 0, sd = 1) , order="relative", replications=3    )[,1:4]

aa=50
benchmark( rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),cores), rtnm(rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn),1),rtnorm(nn,rep(0,nn),rep(1,nn),rep(aa,nn),rep(100,nn)),rtruncnorm(nn, a=aa, b=100, mean = 0, sd = 1) , order="relative", replications=3    )[,1:4]

速度は上限/下限の境界に依存するため、いくつかのベンチマークの実行が必要です。さまざまなケースで、アルゴリズムのさまざまな部分が作動します。

score 3 · Accepted Answer

本当に素早いコメント:

含める場合は含めるRcppArmadillo.h必要はありませんRcpp.h。実際、含めるべきではありません。
rep(oneDraw, n)n 回の呼び出しを行います。n回の描画を返す、一度呼び出される関数を作成します-n-1回の関数呼び出しのオーバーヘッドを節約できるため、高速になります
多くの統計分布に関するあなたのコメントは型に基づいています.doubles をNumericVector使用したい場合、いくつかの誤解が明らかになる可能性がNumericVectorあります.内部 R 型の便利なプロキシクラスはコピーではありません. 使い方は自由、std::vector<double>お好きな形でどうぞ。
切り捨てられた法線についてはほとんど知らないので、アルゴリズムの詳細についてコメントすることはできません。
解決したら、Rcpp Galleryへの投稿を検討してください。

r - Rcpp と openMP を使用した切り捨て正規分布からの高速サンプリング

1 に答える 1

Related

Reference