2つの4x4行列を乗算する関数について、最適化されたCまたはアセンブリ言語による実装を探しています。対象プラットフォームは、ARMv6またはARMv7ベースのiPhoneまたはiPodです。
現在は、ごく標準的な方法に少しだけループ展開を施したものを使っています。
/* Flat index of element (y, x) in a 4x4 matrix stored as 16 consecutive
 * floats: y is the fast-moving index, x advances in steps of four.
 * Both arguments are parenthesized so that expression arguments such as
 * O(i, a | b) expand with the intended precedence. */
#define O(y, x) ((y) + ((x) << 2))

/*
 * Multiplies two 4x4 float matrices. For every pair (i, j):
 *
 *   dest[O(i, j)] = src1[O(i, 0)] * src2[O(0, j)]
 *                 + src1[O(i, 1)] * src2[O(1, j)]
 *                 + src1[O(i, 2)] * src2[O(2, j)]
 *                 + src1[O(i, 3)] * src2[O(3, j)]
 *
 * src1, src2: input matrices, 16 floats each, addressed through O().
 * dest:       output matrix, 16 floats. It must not overlap src1 or src2:
 *             each element is stored as soon as it is computed, while the
 *             inputs are still being read for the remaining elements.
 *
 * The four products of each element are summed left to right.
 */
static inline void Matrix4x4MultiplyBy4x4(float *src1, float *src2, float *dest)
{
    int i, j;

    for (i = 0; i < 4; ++i) {
        for (j = 0; j < 4; ++j) {
            /* The inner dot product stays unrolled by hand. */
            dest[O(i, j)] = (src1[O(i, 0)] * src2[O(0, j)])
                          + (src1[O(i, 1)] * src2[O(1, j)])
                          + (src1[O(i, 2)] * src2[O(2, j)])
                          + (src1[O(i, 3)] * src2[O(3, j)]);
        }
    }
}
StrassenのアルゴリズムやCoppersmith–Winogradのアルゴリズムを使用することでメリットはありますか?