123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265 |
- NS_CC_MATH_BEGIN
/**
 * Static-only holder for 4x4 float matrix and small vector kernels that are
 * implemented with AArch64 Advanced SIMD (NEON) inline assembly.
 *
 * Matrices are passed as pointers to 16 contiguous floats; every routine
 * writes its result through the trailing `dst` pointer.
 */
class MathUtilNeon64
{
public:
    /** Element-wise dst[i] = m[i] + scalar over 16 floats. */
    static inline void addMatrix(const float* m, float scalar, float* dst);

    /** Element-wise dst[i] = m1[i] + m2[i] over 16 floats. */
    static inline void addMatrix(const float* m1, const float* m2, float* dst);

    /** Element-wise dst[i] = m1[i] - m2[i] over 16 floats. */
    static inline void subtractMatrix(const float* m1, const float* m2, float* dst);

    /** Element-wise dst[i] = m[i] * scalar over 16 floats. */
    static inline void multiplyMatrix(const float* m, float scalar, float* dst);

    /** 4x4 matrix product: dst[4c + r] = sum over k of m1[4k + r] * m2[4c + k]. */
    static inline void multiplyMatrix(const float* m1, const float* m2, float* dst);

    /** Element-wise dst[i] = -m[i] over 16 floats. */
    static inline void negateMatrix(const float* m, float* dst);

    /** 4x4 transpose: dst[4i + j] = m[4j + i]. */
    static inline void transposeMatrix(const float* m, float* dst);

    /** Transforms (x, y, z, w) by m; only the first three result components are stored to dst. */
    static inline void transformVec4(const float* m, float x, float y, float z, float w, float* dst);

    /** Transforms the 4-float vector v by m and stores all four result components to dst. */
    static inline void transformVec4(const float* m, const float* v, float* dst);

    /** Cross product of two 3-float vectors, stored as three floats at dst. */
    static inline void crossVec3(const float* v1, const float* v2, float* dst);
};
- inline void MathUtilNeon64::addMatrix(const float* m, float scalar, float* dst)
- {
- asm volatile(
- "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t"
- "ld1r {v4.4s}, [%2] \n\t"
- "fadd v8.4s, v0.4s, v4.4s \n\t"
- "fadd v9.4s, v1.4s, v4.4s \n\t"
- "fadd v10.4s, v2.4s, v4.4s \n\t"
- "fadd v11.4s, v3.4s, v4.4s \n\t"
-
- "st4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n\t"
- :
- : "r"(dst), "r"(m), "r"(&scalar)
- : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "memory"
- );
- }
- inline void MathUtilNeon64::addMatrix(const float* m1, const float* m2, float* dst)
- {
- asm volatile(
- "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t"
- "ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%2] \n\t"
- "fadd v12.4s, v0.4s, v8.4s \n\t"
- "fadd v13.4s, v1.4s, v9.4s \n\t"
- "fadd v14.4s, v2.4s, v10.4s \n\t"
- "fadd v15.4s, v3.4s, v11.4s \n\t"
- "st4 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t"
- :
- : "r"(dst), "r"(m1), "r"(m2)
- : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
- );
- }
- inline void MathUtilNeon64::subtractMatrix(const float* m1, const float* m2, float* dst)
- {
- asm volatile(
- "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t"
- "ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%2] \n\t"
- "fsub v12.4s, v0.4s, v8.4s \n\t"
- "fsub v13.4s, v1.4s, v9.4s \n\t"
- "fsub v14.4s, v2.4s, v10.4s \n\t"
- "fsub v15.4s, v3.4s, v11.4s \n\t"
- "st4 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t"
- :
- : "r"(dst), "r"(m1), "r"(m2)
- : "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "memory"
- );
- }
- inline void MathUtilNeon64::multiplyMatrix(const float* m, float scalar, float* dst)
- {
- asm volatile(
- "ld1 {v0.s}[0], [%2] \n\t"
- "ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%1] \n\t"
- "fmul v8.4s, v4.4s, v0.s[0] \n\t"
- "fmul v9.4s, v5.4s, v0.s[0] \n\t"
- "fmul v10.4s, v6.4s, v0.s[0] \n\t"
- "fmul v11.4s, v7.4s, v0.s[0] \n\t"
- "st4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n\t"
- :
- : "r"(dst), "r"(m), "r"(&scalar)
- : "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory"
- );
- }
- inline void MathUtilNeon64::multiplyMatrix(const float* m1, const float* m2, float* dst)
- {
- asm volatile(
- "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%1] \n\t"
- "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2] \n\t"
- "fmul v12.4s, v8.4s, v0.s[0] \n\t"
- "fmul v13.4s, v8.4s, v0.s[1] \n\t"
- "fmul v14.4s, v8.4s, v0.s[2] \n\t"
- "fmul v15.4s, v8.4s, v0.s[3] \n\t"
- "fmla v12.4s, v9.4s, v1.s[0] \n\t"
- "fmla v13.4s, v9.4s, v1.s[1] \n\t"
- "fmla v14.4s, v9.4s, v1.s[2] \n\t"
- "fmla v15.4s, v9.4s, v1.s[3] \n\t"
- "fmla v12.4s, v10.4s, v2.s[0] \n\t"
- "fmla v13.4s, v10.4s, v2.s[1] \n\t"
- "fmla v14.4s, v10.4s, v2.s[2] \n\t"
- "fmla v15.4s, v10.4s, v2.s[3] \n\t"
- "fmla v12.4s, v11.4s, v3.s[0] \n\t"
- "fmla v13.4s, v11.4s, v3.s[1] \n\t"
- "fmla v14.4s, v11.4s, v3.s[2] \n\t"
- "fmla v15.4s, v11.4s, v3.s[3] \n\t"
- "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n\t"
- :
- : "r"(dst), "r"(m1), "r"(m2)
- : "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
- );
- }
- inline void MathUtilNeon64::negateMatrix(const float* m, float* dst)
- {
- asm volatile(
- "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t"
- "fneg v4.4s, v0.4s \n\t"
- "fneg v5.4s, v1.4s \n\t"
- "fneg v6.4s, v2.4s \n\t"
- "fneg v7.4s, v3.4s \n\t"
- "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n\t"
- :
- : "r"(dst), "r"(m)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory"
- );
- }
- inline void MathUtilNeon64::transposeMatrix(const float* m, float* dst)
- {
- asm volatile(
- "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n\t"
-
- "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n\t"
- :
- : "r"(dst), "r"(m)
- : "v0", "v1", "v2", "v3", "memory"
- );
- }
- inline void MathUtilNeon64::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
- {
- asm volatile(
- "ld1 {v0.s}[0], [%1] \n\t"
- "ld1 {v0.s}[1], [%2] \n\t"
- "ld1 {v0.s}[2], [%3] \n\t"
- "ld1 {v0.s}[3], [%4] \n\t"
- "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%5] \n\t"
-
- "fmul v13.4s, v9.4s, v0.s[0] \n\t"
- "fmla v13.4s, v10.4s, v0.s[1] \n\t"
- "fmla v13.4s, v11.4s, v0.s[2] \n\t"
- "fmla v13.4s, v12.4s, v0.s[3] \n\t"
-
- "st1 {v13.2s}, [%0], 8 \n\t"
- "st1 {v13.s}[2], [%0] \n\t"
- :
- : "r"(dst), "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
- : "v0", "v9", "v10","v11", "v12", "v13", "memory"
- );
- }
- inline void MathUtilNeon64::transformVec4(const float* m, const float* v, float* dst)
- {
- asm volatile
- (
- "ld1 {v0.4s}, [%1] \n\t"
- "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%2] \n\t"
- "fmul v13.4s, v9.4s, v0.s[0] \n\t"
- "fmla v13.4s, v10.4s, v0.s[1] \n\t"
- "fmla v13.4s, v11.4s, v0.s[2] \n\t"
- "fmla v13.4s, v12.4s, v0.s[3] \n\t"
- "st1 {v13.4s}, [%0] \n\t"
- :
- : "r"(dst), "r"(v), "r"(m)
- : "v0", "v9", "v10","v11", "v12", "v13", "memory"
- );
- }
- inline void MathUtilNeon64::crossVec3(const float* v1, const float* v2, float* dst)
- {
- asm volatile(
- "ld1 {v0.2s}, [%2] \n\t"
- "ld1 {v0.s}[2], [%1] \n\t"
- "mov v0.s[3], v0.s[0] \n\t"
- "ld1 {v1.4s}, [%3] \n\t"
- "mov v1.s[3], v1.s[0] \n\t"
- "fmul v2.4s, v0.4s, v1.4s \n\t"
- "mov v0.s[0], v0.s[1] \n\t"
- "mov v0.s[1], v0.s[2] \n\t"
- "mov v0.s[2], v0.s[3] \n\t"
- "mov v1.s[3], v1.s[2] \n\t"
- "fmul v0.4s, v0.4s, v1.4s \n\t"
- "mov v0.s[3], v0.s[1] \n\t"
- "mov v0.s[1], v0.s[2] \n\t"
- "mov v0.s[2], v0.s[0] \n\t"
- "fsub v2.4s, v0.4s, v2.4s \n\t"
- "mov v2.s[0], v2.s[1] \n\t"
- "mov v2.s[1], v2.s[2] \n\t"
- "mov v2.s[2], v2.s[3] \n\t"
- "st1 {v2.2s}, [%0], 8 \n\t"
- "st1 {v2.s}[2], [%0] \n\t"
- :
- : "r"(dst), "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1))
- : "v0", "v1", "v2", "memory"
- );
- }
- NS_CC_MATH_END
|