123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271 |
- NS_CC_MATH_BEGIN
/**
 * ARM NEON implementations of small fixed-size float math kernels.
 *
 * Every "matrix" argument is a pointer to 16 consecutive floats; the routines
 * read or write exactly that many elements unless noted otherwise.
 */
class MathUtilNeon
{
public:
    /** dst[i] = m[i] + scalar, for i in [0, 16). */
    static inline void addMatrix(const float* m, float scalar, float* dst);

    /** dst[i] = m1[i] + m2[i], for i in [0, 16). */
    static inline void addMatrix(const float* m1, const float* m2, float* dst);

    /** dst[i] = m1[i] - m2[i], for i in [0, 16). */
    static inline void subtractMatrix(const float* m1, const float* m2, float* dst);

    /** dst[i] = m[i] * scalar, for i in [0, 16). */
    static inline void multiplyMatrix(const float* m, float scalar, float* dst);

    /**
     * 4x4 matrix product. With each run of four consecutive floats taken as
     * one column, dst = m1 * m2.
     */
    static inline void multiplyMatrix(const float* m1, const float* m2, float* dst);

    /** dst[i] = -m[i], for i in [0, 16). */
    static inline void negateMatrix(const float* m, float* dst);

    /** dst[4 * c + r] = m[4 * r + c], for r, c in [0, 4). */
    static inline void transposeMatrix(const float* m, float* dst);

    /**
     * Computes m[0..3]*x + m[4..7]*y + m[8..11]*z + m[12..15]*w and stores
     * only the first three components of the result to dst.
     */
    static inline void transformVec4(const float* m, float x, float y, float z, float w, float* dst);

    /**
     * Computes m[0..3]*v[0] + m[4..7]*v[1] + m[8..11]*v[2] + m[12..15]*v[3]
     * and stores all four components of the result to dst.
     */
    static inline void transformVec4(const float* m, const float* v, float* dst);

    /** dst[0..2] = cross product of the 3-float vectors v1 and v2. */
    static inline void crossVec3(const float* v1, const float* v2, float* dst);
};
- inline void MathUtilNeon::addMatrix(const float* m, float scalar, float* dst)
- {
- asm volatile(
- "vld1.32 {q0, q1}, [%1]! \n\t"
- "vld1.32 {q2, q3}, [%1] \n\t"
- "vld1.32 {d8[0]}, [%2] \n\t"
- "vmov.f32 s17, s16 \n\t"
- "vmov.f32 s18, s16 \n\t"
- "vmov.f32 s19, s16 \n\t"
-
- "vadd.f32 q8, q0, q4 \n\t"
- "vadd.f32 q9, q1, q4 \n\t"
- "vadd.f32 q10, q2, q4 \n\t"
- "vadd.f32 q11, q3, q4 \n\t"
-
- "vst1.32 {q8, q9}, [%0]! \n\t"
- "vst1.32 {q10, q11}, [%0] \n\t"
- :
- : "r"(dst), "r"(m), "r"(&scalar)
- : "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "memory"
- );
- }
- inline void MathUtilNeon::addMatrix(const float* m1, const float* m2, float* dst)
- {
- asm volatile(
- "vld1.32 {q0, q1}, [%1]! \n\t"
- "vld1.32 {q2, q3}, [%1] \n\t"
- "vld1.32 {q8, q9}, [%2]! \n\t"
- "vld1.32 {q10, q11}, [%2] \n\t"
-
- "vadd.f32 q12, q0, q8 \n\t"
- "vadd.f32 q13, q1, q9 \n\t"
- "vadd.f32 q14, q2, q10 \n\t"
- "vadd.f32 q15, q3, q11 \n\t"
-
- "vst1.32 {q12, q13}, [%0]! \n\t"
- "vst1.32 {q14, q15}, [%0] \n\t"
- :
- : "r"(dst), "r"(m1), "r"(m2)
- : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory"
- );
- }
- inline void MathUtilNeon::subtractMatrix(const float* m1, const float* m2, float* dst)
- {
- asm volatile(
- "vld1.32 {q0, q1}, [%1]! \n\t"
- "vld1.32 {q2, q3}, [%1] \n\t"
- "vld1.32 {q8, q9}, [%2]! \n\t"
- "vld1.32 {q10, q11}, [%2] \n\t"
-
- "vsub.f32 q12, q0, q8 \n\t"
- "vsub.f32 q13, q1, q9 \n\t"
- "vsub.f32 q14, q2, q10 \n\t"
- "vsub.f32 q15, q3, q11 \n\t"
-
- "vst1.32 {q12, q13}, [%0]! \n\t"
- "vst1.32 {q14, q15}, [%0] \n\t"
- :
- : "r"(dst), "r"(m1), "r"(m2)
- : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory"
- );
- }
- inline void MathUtilNeon::multiplyMatrix(const float* m, float scalar, float* dst)
- {
- asm volatile(
- "vld1.32 {d0[0]}, [%2] \n\t"
- "vld1.32 {q4-q5}, [%1]! \n\t"
- "vld1.32 {q6-q7}, [%1] \n\t"
-
- "vmul.f32 q8, q4, d0[0] \n\t"
- "vmul.f32 q9, q5, d0[0] \n\t"
- "vmul.f32 q10, q6, d0[0] \n\t"
- "vmul.f32 q11, q7, d0[0] \n\t"
-
- "vst1.32 {q8-q9}, [%0]! \n\t"
- "vst1.32 {q10-q11}, [%0] \n\t"
- :
- : "r"(dst), "r"(m), "r"(&scalar)
- : "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "memory"
- );
- }
- inline void MathUtilNeon::multiplyMatrix(const float* m1, const float* m2, float* dst)
- {
- asm volatile(
- "vld1.32 {d16 - d19}, [%1]! \n\t"
- "vld1.32 {d20 - d23}, [%1] \n\t"
- "vld1.32 {d0 - d3}, [%2]! \n\t"
- "vld1.32 {d4 - d7}, [%2] \n\t"
-
- "vmul.f32 q12, q8, d0[0] \n\t"
- "vmul.f32 q13, q8, d2[0] \n\t"
- "vmul.f32 q14, q8, d4[0] \n\t"
- "vmul.f32 q15, q8, d6[0] \n\t"
-
- "vmla.f32 q12, q9, d0[1] \n\t"
- "vmla.f32 q13, q9, d2[1] \n\t"
- "vmla.f32 q14, q9, d4[1] \n\t"
- "vmla.f32 q15, q9, d6[1] \n\t"
-
- "vmla.f32 q12, q10, d1[0] \n\t"
- "vmla.f32 q13, q10, d3[0] \n\t"
- "vmla.f32 q14, q10, d5[0] \n\t"
- "vmla.f32 q15, q10, d7[0] \n\t"
-
- "vmla.f32 q12, q11, d1[1] \n\t"
- "vmla.f32 q13, q11, d3[1] \n\t"
- "vmla.f32 q14, q11, d5[1] \n\t"
- "vmla.f32 q15, q11, d7[1] \n\t"
-
- "vst1.32 {d24 - d27}, [%0]! \n\t"
- "vst1.32 {d28 - d31}, [%0] \n\t"
-
- :
- : "r"(dst), "r"(m1), "r"(m2)
- : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
- }
- inline void MathUtilNeon::negateMatrix(const float* m, float* dst)
- {
- asm volatile(
- "vld1.32 {q0-q1}, [%1]! \n\t"
- "vld1.32 {q2-q3}, [%1] \n\t"
-
- "vneg.f32 q4, q0 \n\t"
- "vneg.f32 q5, q1 \n\t"
- "vneg.f32 q6, q2 \n\t"
- "vneg.f32 q7, q3 \n\t"
-
- "vst1.32 {q4-q5}, [%0]! \n\t"
- "vst1.32 {q6-q7}, [%0] \n\t"
- :
- : "r"(dst), "r"(m)
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory"
- );
- }
- inline void MathUtilNeon::transposeMatrix(const float* m, float* dst)
- {
- asm volatile(
- "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%1]! \n\t"
- "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1]! \n\t"
- "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%1]! \n\t"
- "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%1] \n\t"
-
- "vst1.32 {q0-q1}, [%0]! \n\t"
- "vst1.32 {q2-q3}, [%0] \n\t"
- :
- : "r"(dst), "r"(m)
- : "q0", "q1", "q2", "q3", "memory"
- );
- }
- inline void MathUtilNeon::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
- {
- asm volatile(
- "vld1.32 {d0[0]}, [%1] \n\t"
- "vld1.32 {d0[1]}, [%2] \n\t"
- "vld1.32 {d1[0]}, [%3] \n\t"
- "vld1.32 {d1[1]}, [%4] \n\t"
- "vld1.32 {d18 - d21}, [%5]! \n\t"
- "vld1.32 {d22 - d25}, [%5] \n\t"
-
- "vmul.f32 q13, q9, d0[0] \n\t"
- "vmla.f32 q13, q10, d0[1] \n\t"
- "vmla.f32 q13, q11, d1[0] \n\t"
- "vmla.f32 q13, q12, d1[1] \n\t"
-
- "vst1.32 {d26}, [%0]! \n\t"
- "vst1.32 {d27[0]}, [%0] \n\t"
- :
- : "r"(dst), "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
- : "q0", "q9", "q10","q11", "q12", "q13", "memory"
- );
- }
- inline void MathUtilNeon::transformVec4(const float* m, const float* v, float* dst)
- {
- asm volatile
- (
- "vld1.32 {d0, d1}, [%1] \n\t"
- "vld1.32 {d18 - d21}, [%2]! \n\t"
- "vld1.32 {d22 - d25}, [%2] \n\t"
-
- "vmul.f32 q13, q9, d0[0] \n\t"
- "vmla.f32 q13, q10, d0[1] \n\t"
- "vmla.f32 q13, q11, d1[0] \n\t"
- "vmla.f32 q13, q12, d1[1] \n\t"
-
- "vst1.32 {d26, d27}, [%0] \n\t"
- :
- : "r"(dst), "r"(v), "r"(m)
- : "q0", "q9", "q10","q11", "q12", "q13", "memory"
- );
- }
- inline void MathUtilNeon::crossVec3(const float* v1, const float* v2, float* dst)
- {
- asm volatile(
- "vld1.32 {d1[1]}, [%1] \n\t"
- "vld1.32 {d0}, [%2] \n\t"
- "vmov.f32 s2, s1 \n\t"
-
- "vld1.32 {d2[1]}, [%3] \n\t"
- "vld1.32 {d3}, [%4] \n\t"
- "vmov.f32 s4, s7 \n\t"
-
- "vmul.f32 d4, d0, d2 \n\t"
- "vmls.f32 d4, d1, d3 \n\t"
-
- "vmul.f32 d5, d3, d1[1] \n\t"
- "vmls.f32 d5, d0, d2[1] \n\t"
-
- "vst1.32 {d4}, [%0]! \n\t"
- "vst1.32 {d5[0]}, [%0] \n\t"
- :
- : "r"(dst), "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1))
- : "q0", "q1", "q2", "memory"
- );
- }
- NS_CC_MATH_END
|