fortran - この巨大なネストされたループのパフォーマンスを改善するにはどうすればよいですか? (Fortran 90)

Question

ここにコードセグメント全体を掲載しますが、唯一の問題は、最後のネストされたループです。すべての読み込み行列のサイズは 180x180 で、ループは耐えられないほど遅いです。行列 "AnaInt" を取得するためのインデックスごとの乗算は、インデックスが 3 回出現するため単純な行列積ではないため、計算を単純化する簡単な方法はわかりません。何かご意見は？ありがとう！

program AC
 implicit none
  integer, parameter :: dp = selected_real_kind(15, 307)
  integer :: n, ndim, k, j, i, o, l, m, steps
  real(dp) :: emax, omega, pi, EFermi, auev
  complex(dp) :: Grs,Gas, ACCond, tinyc, cunit, czero, cone

  complex(dp), allocatable :: GammaL(:,:)     
  complex(dp), allocatable :: GammaL_EB(:,:)  
  complex(dp), allocatable :: GammaR(:,:)     
  complex(dp), allocatable :: R(:,:)  
  complex(dp), allocatable :: Yc(:,:)         
  complex(dp), allocatable :: Yd(:,:)         
  complex(dp), allocatable :: AnaInt(:,:)     
  complex(dp), allocatable :: H(:,:)         
  complex(dp), allocatable :: HamEff(:,:)     
  complex(dp), allocatable :: EigVec(:,:)    
  complex(dp), allocatable :: InvEigVec(:,:)  
  complex(dp), allocatable :: EigVal(:)       
  complex(dp), allocatable :: ctemp(:,:)      
  complex(dp), allocatable :: ctemp2(:,:)      
  complex(dp), allocatable :: S(:,:)          
  complex(dp), allocatable :: SelfL(:,:)     
  complex(dp), allocatable :: SelfR(:,:)     
  complex(dp), allocatable :: SHalf(:,:)      
  complex(dp), allocatable :: InvSHalf(:,:)   
  complex(dp), allocatable :: HEB(:,:)
  complex(dp), allocatable :: Integrand(:,:)


!Lapack arrays and variables
  integer :: info, lwork
  complex(dp), allocatable :: work(:)       
  real(dp), allocatable :: rwork(:)    
  integer,allocatable :: ipiv(:)

!########################################################################

!Constants
    auev = 27.211385
    pi = 3.14159265359
    cunit = (0,1)
    czero = (0,0)
    cone = (1,0)
    tinyc = (0.0, 0.000000000001)


!System and calculation parameters
    open(unit=123, file="ForAC.dat", action='read', form='formatted')
    read(123,*) ndim, EFermi
    lwork = ndim*ndim

    emax = 5.0/auev
    steps = 1000 


    allocate(HEB(ndim,ndim))
    allocate(H(ndim,ndim))
    allocate(Yc(ndim,ndim))
    allocate(Yd(ndim,ndim))
    allocate(S(ndim,ndim))
    allocate(SelfL(ndim,ndim))
    allocate(SelfR(ndim,ndim))
    allocate(HamEff(ndim,ndim))
    allocate(GammaR(ndim,ndim))
    allocate(GammaL(ndim,ndim))
    allocate(AnaInt(ndim,ndim))
    allocate(EigVec(ndim,ndim))
    allocate(EigVal(ndim))
    allocate(InvEigVec(ndim,ndim))
    allocate(R(ndim,ndim))
    allocate(GammaL_EB(ndim,ndim))
    allocate(Integrand(ndim,ndim))

!################################################



    read(123,*) H, S, SelfL, SelfR
    close(unit=123)

    HamEff(:,:)=(H(:,:) + SelfL(:,:) + SelfR(:,:))   



    allocate(SHalf(ndim, ndim))
    allocate(InvSHalf(ndim,ndim))
    SHalf(:,:) = (cmplx(real(S(:,:),dp),0.0_dp,dp))

    call zpotrf('l', ndim, SHalf, ndim, info)         
    InvSHalf(:,:) = SHalf(:,:)
    call ztrtri('l', 'n', ndim, InvSHalf, ndim, info) 

    call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim) 
    call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, HamEff, ndim) 
    call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim) 
    call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaL, ndim) 
    call ztrmm('l', 'l', 'n', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)
    call ztrmm('r', 'l', 't', 'n', ndim, ndim, cone, InvSHalf, ndim, GammaR, ndim)

    deallocate(SHalf)
    deallocate(InvSHalf)




!In the PDF: B = EigVec, B^(-1) = InvEigVec, Hk = EigVal

    allocate(ctemp(ndim,ndim))
    ctemp(:,:) = HamEff(:,:)
    allocate(work(lwork),rwork(2*ndim))
    call zgeev('N', 'V', ndim, ctemp, ndim, EigVal, InvEigVec, ndim, EigVec, ndim, work, lwork, rwork, info)
    if(info/=0)write(*,*) "Warning: zgeev info=", info
    deallocate(work,rwork)
    deallocate(ctemp) 

    InvEigVec(:,:)=EigVec(:,:)
    lwork = 3*ndim
    allocate(ipiv(ndim))
    allocate(work(lwork))
    call zgetrf(ndim,ndim,InvEigVec,ndim,ipiv,info)
    if(info/=0)write(*,*) "Warning: zgetrf info=", info   ! LU decomposition
    call zgetri(ndim,InvEigVec,ndim,ipiv,work,lwork,info)
    if(info/=0)write(*,*) "Warning: zgetri info=", info ! Inversion by LU decomposition (Building of InvEigVec)
    deallocate(work)
    deallocate(ipiv)


 R(:,:) = 0.0_dp
 do j=1,ndim
 do m=1,ndim
 do k=1,ndim
 do l=1,ndim
 R(j,m) = R(j,m) + InvEigVec(j,k) * GammaR(k,l) * conjg(InvEigVec(m,l))
 end do
 end do
 end do
 end do





!!!THIS IS THE LOOP IN QUESTION. MATRIX DIMENSION 180x180, STEPS=1000

 open(unit=125,file="ACCond.dat")

     !Looping over omega
     do o=1,steps
         omega=real(o,dp)*emax/real(steps,dp) 
         AnaInt(:,:) = 0.0_dp
         do i=1,ndim
             do n=1,ndim
                 do j=1,ndim
                      do m=1,ndim
                           Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
                           Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
                           Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))

                           AnaInt(i,n)= AnaInt(i,n) + EigVec(i,j) * R(j,m) * Integrand(j,m) * conjg(EigVec(n,m))
                      end do
                 end do
             end do
        end do 

         Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)
         Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)

          ACCond = czero
          do k=1,ndim
              ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
          end do
          write(125,*) omega, real(ACCond,dp), aimag(ACCond)
      end do



!#############################################

    deallocate(Integrand)
    deallocate(HEB)
    deallocate(Yc)
    deallocate(Yd)
    deallocate(HamEff)
    deallocate(GammaR)
    deallocate(GammaL)
    deallocate(AnaInt)
    deallocate(EigVec)
    deallocate(EigVal)
    deallocate(InvEigVec)
    deallocate(H)
    deallocate(S)
    deallocate(SelfL)
    deallocate(SelfR)
    deallocate(R)
    deallocate(GammaL_EB)
end program AC

したがって、提案による最初の適応は次のとおりです。

HermEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermEigVec(i,j) = conjg(EigVec(j,i))
end do
end do

HermInvEigVec(:,:) = 0.0_dp
do i=1, ndim
do j=1, ndim
HermInvEigVec(i,j) = conjg(InvEigVec(j,i))
end do
end do


R(:,:) = 0.0_dp

R = matmul(InvEigVec,matmul(GammaR,HermInvEigVec))


open(unit=125,file="ACCond.dat")

    !Looping over omega
     do o=1,steps
         omega=real(o,dp)*emax/real(steps,dp)

         AnaInt(:,:) = 0.0_dp
             do j=1,ndim
             do m=1,ndim
                 Grs = log((EFermi-(EigVal(j)+tinyc)+omega)/(EFermi-(EigVal(j)+tinyc)))
                 Gas = log((EFermi-conjg(EigVal(m)+tinyc))/(EFermi-omega-conjg(EigVal(m)+tinyc)))
                 Integrand(j,m) = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))
                 T(j,m) = R(j,m) * Integrand(j,m)
             end do
             end do
         AnaInt = matmul(EigVec,matmul(T,HermEigVec))


         Yc = 1/(2.0*pi*omega) * matmul(AnaInt,GammaL)                      
         Yd(:,:) = - 1/(2.0*pi) * cunit * AnaInt(:,:)

         ACCond = czero
         do k=1,ndim
             ACCond=ACCond+Yc(k,k) + 1/(2.0) * Yd(k,k)
         end do
       write(125,*) omega, real(ACCond,dp), aimag(ACCond)
     end do

score 2 · Accepted Answer

コードにいくつかの問題があります。強調したループの前のループから始めましょう (理解するのは簡単ですが、次の大きなループには多かれ少なかれ同じ問題があります)。

したがって、i、j、k、l のループがあります。

キャッシュへのアクセスを改善するために、ループの順序を変更することを検討できます。最も内側のループは l にあり、列インデックスとしてのみ表示されます。Fortranの列優先の配列では、パフォーマンスの低下が予想されます。j の内部ループの方がおそらく良いでしょう。

さらに悪いことに、ループ全体が 3 つの行列 (InvEigVec * GammaR * InvEigVec^H) の積による行列更新ですが、O(ndim^4) で実行します。各行列積は O(n^3) です ( Strassen アルゴリズムを使用して、最適化された ZGEMMを呼び出す場合はそれ以下になる可能性があります)。したがって、行列積を格納することにより、2 つの積は O(n^4) ではなく O(n^3) になります。つまり、マトリックス製品を実行してから、マトリックス製品の更新を実行できます。

さて、あなたの大きなループ: i、n、j、m を数回ステップします。

私がよく読んだら、あなたは書く

Integrand = (Grs-Gas)/(EigVal(j)-tinyc-omega-conjg(EigVal(m)-tinyc))

右側の変数はすべてスカラーですが、被積分関数は ndim*ndim 行列です。1 つの値を複数の場所にコピーするには多くの作業が必要です。しかし、その後、単にスカラーを使用できる被積分関数をループします。それともバグで、左側に Integrand(j, m) などを配置する必要がありますか?

次に、4 つの内部ループは前のコメントのように、配列積 EigVec * (R .* Integrand) * EigVec^H を使用した AnaInt の更新であり、.* 配列の (項ごとの) スカラー積 (または単に EigVec * R * Integrand が単なるスカラーの場合は EigVec^H)。

繰り返しになりますが、これを ZGEMM で記述して、複雑さを大幅に軽減することをお勧めします。

score 1 · Accepted Answer

OPENMP を使用したループの並列化を検討しましたか? 実装はとても簡単です。もし興味があれば、私はあなたにいくつかのヒントを与えることができるかもしれません.

ここを見てみてください: openMP DO チュートリアル

fortran - この巨大なネストされたループのパフォーマンスを改善するにはどうすればよいですか? (Fortran 90)

2 に答える 2

Related

Reference