MathUtilNeon.inl 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. /**
  2. Copyright 2013 BlackBerry Inc.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. Original file from GamePlay3D: http://gameplay3d.org
  13. This file was modified to fit the cocos2d-x project
  14. */
  15. NS_CC_MATH_BEGIN
  /**
   * Collection of static helpers implemented with ARM NEON inline assembly.
   *
   * Every matrix argument is a pointer to 16 consecutive floats; the element at
   * offset k is referred to as m[k] below.
   */
  class MathUtilNeon
  {
  public:
      /** dst[k] = m[k] + scalar for k in [0, 16). */
      static inline void addMatrix(const float* m, float scalar, float* dst);

      /** dst[k] = m1[k] + m2[k] for k in [0, 16). */
      static inline void addMatrix(const float* m1, const float* m2, float* dst);

      /** dst[k] = m1[k] - m2[k] for k in [0, 16). */
      static inline void subtractMatrix(const float* m1, const float* m2, float* dst);

      /** dst[k] = m[k] * scalar for k in [0, 16). */
      static inline void multiplyMatrix(const float* m, float scalar, float* dst);

      /** dst[4*j + i] = sum over k of m1[4*k + i] * m2[4*j + k], for i, j in [0, 4). */
      static inline void multiplyMatrix(const float* m1, const float* m2, float* dst);

      /** dst[k] = -m[k] for k in [0, 16). */
      static inline void negateMatrix(const float* m, float* dst);

      /** dst[4*j + i] = m[4*i + j] for i, j in [0, 4). */
      static inline void transposeMatrix(const float* m, float* dst);

      /**
       * Computes m[0..3]*x + m[4..7]*y + m[8..11]*z + m[12..15]*w and writes only
       * the first THREE components of the result to dst.
       */
      static inline void transformVec4(const float* m, float x, float y, float z, float w, float* dst);

      /** dst[i] = sum over k of m[4*k + i] * v[k], for i in [0, 4); writes four floats. */
      static inline void transformVec4(const float* m, const float* v, float* dst);

      /** dst = v1 x v2 (3-component cross product); reads and writes three floats each. */
      static inline void crossVec3(const float* v1, const float* v2, float* dst);
  };
  30. inline void MathUtilNeon::addMatrix(const float* m, float scalar, float* dst)
  31. {
  32. asm volatile(
  33. "vld1.32 {q0, q1}, [%1]! \n\t" // M[m0-m7]
  34. "vld1.32 {q2, q3}, [%1] \n\t" // M[m8-m15]
  35. "vld1.32 {d8[0]}, [%2] \n\t" // s
  36. "vmov.f32 s17, s16 \n\t" // s
  37. "vmov.f32 s18, s16 \n\t" // s
  38. "vmov.f32 s19, s16 \n\t" // s
  39. "vadd.f32 q8, q0, q4 \n\t" // DST->M[m0-m3] = M[m0-m3] + s
  40. "vadd.f32 q9, q1, q4 \n\t" // DST->M[m4-m7] = M[m4-m7] + s
  41. "vadd.f32 q10, q2, q4 \n\t" // DST->M[m8-m11] = M[m8-m11] + s
  42. "vadd.f32 q11, q3, q4 \n\t" // DST->M[m12-m15] = M[m12-m15] + s
  43. "vst1.32 {q8, q9}, [%0]! \n\t" // DST->M[m0-m7]
  44. "vst1.32 {q10, q11}, [%0] \n\t" // DST->M[m8-m15]
  45. :
  46. : "r"(dst), "r"(m), "r"(&scalar)
  47. : "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "memory"
  48. );
  49. }
  50. inline void MathUtilNeon::addMatrix(const float* m1, const float* m2, float* dst)
  51. {
  52. asm volatile(
  53. "vld1.32 {q0, q1}, [%1]! \n\t" // M1[m0-m7]
  54. "vld1.32 {q2, q3}, [%1] \n\t" // M1[m8-m15]
  55. "vld1.32 {q8, q9}, [%2]! \n\t" // M2[m0-m7]
  56. "vld1.32 {q10, q11}, [%2] \n\t" // M2[m8-m15]
  57. "vadd.f32 q12, q0, q8 \n\t" // DST->M[m0-m3] = M1[m0-m3] + M2[m0-m3]
  58. "vadd.f32 q13, q1, q9 \n\t" // DST->M[m4-m7] = M1[m4-m7] + M2[m4-m7]
  59. "vadd.f32 q14, q2, q10 \n\t" // DST->M[m8-m11] = M1[m8-m11] + M2[m8-m11]
  60. "vadd.f32 q15, q3, q11 \n\t" // DST->M[m12-m15] = M1[m12-m15] + M2[m12-m15]
  61. "vst1.32 {q12, q13}, [%0]! \n\t" // DST->M[m0-m7]
  62. "vst1.32 {q14, q15}, [%0] \n\t" // DST->M[m8-m15]
  63. :
  64. : "r"(dst), "r"(m1), "r"(m2)
  65. : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory"
  66. );
  67. }
  68. inline void MathUtilNeon::subtractMatrix(const float* m1, const float* m2, float* dst)
  69. {
  70. asm volatile(
  71. "vld1.32 {q0, q1}, [%1]! \n\t" // M1[m0-m7]
  72. "vld1.32 {q2, q3}, [%1] \n\t" // M1[m8-m15]
  73. "vld1.32 {q8, q9}, [%2]! \n\t" // M2[m0-m7]
  74. "vld1.32 {q10, q11}, [%2] \n\t" // M2[m8-m15]
  75. "vsub.f32 q12, q0, q8 \n\t" // DST->M[m0-m3] = M1[m0-m3] - M2[m0-m3]
  76. "vsub.f32 q13, q1, q9 \n\t" // DST->M[m4-m7] = M1[m4-m7] - M2[m4-m7]
  77. "vsub.f32 q14, q2, q10 \n\t" // DST->M[m8-m11] = M1[m8-m11] - M2[m8-m11]
  78. "vsub.f32 q15, q3, q11 \n\t" // DST->M[m12-m15] = M1[m12-m15] - M2[m12-m15]
  79. "vst1.32 {q12, q13}, [%0]! \n\t" // DST->M[m0-m7]
  80. "vst1.32 {q14, q15}, [%0] \n\t" // DST->M[m8-m15]
  81. :
  82. : "r"(dst), "r"(m1), "r"(m2)
  83. : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "memory"
  84. );
  85. }
  86. inline void MathUtilNeon::multiplyMatrix(const float* m, float scalar, float* dst)
  87. {
  88. asm volatile(
  89. "vld1.32 {d0[0]}, [%2] \n\t" // M[m0-m7]
  90. "vld1.32 {q4-q5}, [%1]! \n\t" // M[m8-m15]
  91. "vld1.32 {q6-q7}, [%1] \n\t" // s
  92. "vmul.f32 q8, q4, d0[0] \n\t" // DST->M[m0-m3] = M[m0-m3] * s
  93. "vmul.f32 q9, q5, d0[0] \n\t" // DST->M[m4-m7] = M[m4-m7] * s
  94. "vmul.f32 q10, q6, d0[0] \n\t" // DST->M[m8-m11] = M[m8-m11] * s
  95. "vmul.f32 q11, q7, d0[0] \n\t" // DST->M[m12-m15] = M[m12-m15] * s
  96. "vst1.32 {q8-q9}, [%0]! \n\t" // DST->M[m0-m7]
  97. "vst1.32 {q10-q11}, [%0] \n\t" // DST->M[m8-m15]
  98. :
  99. : "r"(dst), "r"(m), "r"(&scalar)
  100. : "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "memory"
  101. );
  102. }
  103. inline void MathUtilNeon::multiplyMatrix(const float* m1, const float* m2, float* dst)
  104. {
  105. asm volatile(
  106. "vld1.32 {d16 - d19}, [%1]! \n\t" // M1[m0-m7]
  107. "vld1.32 {d20 - d23}, [%1] \n\t" // M1[m8-m15]
  108. "vld1.32 {d0 - d3}, [%2]! \n\t" // M2[m0-m7]
  109. "vld1.32 {d4 - d7}, [%2] \n\t" // M2[m8-m15]
  110. "vmul.f32 q12, q8, d0[0] \n\t" // DST->M[m0-m3] = M1[m0-m3] * M2[m0]
  111. "vmul.f32 q13, q8, d2[0] \n\t" // DST->M[m4-m7] = M1[m4-m7] * M2[m4]
  112. "vmul.f32 q14, q8, d4[0] \n\t" // DST->M[m8-m11] = M1[m8-m11] * M2[m8]
  113. "vmul.f32 q15, q8, d6[0] \n\t" // DST->M[m12-m15] = M1[m12-m15] * M2[m12]
  114. "vmla.f32 q12, q9, d0[1] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m1]
  115. "vmla.f32 q13, q9, d2[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m5]
  116. "vmla.f32 q14, q9, d4[1] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m9]
  117. "vmla.f32 q15, q9, d6[1] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m13]
  118. "vmla.f32 q12, q10, d1[0] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m2]
  119. "vmla.f32 q13, q10, d3[0] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m6]
  120. "vmla.f32 q14, q10, d5[0] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m10]
  121. "vmla.f32 q15, q10, d7[0] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m14]
  122. "vmla.f32 q12, q11, d1[1] \n\t" // DST->M[m0-m3] += M1[m0-m3] * M2[m3]
  123. "vmla.f32 q13, q11, d3[1] \n\t" // DST->M[m4-m7] += M1[m4-m7] * M2[m7]
  124. "vmla.f32 q14, q11, d5[1] \n\t" // DST->M[m8-m11] += M1[m8-m11] * M2[m11]
  125. "vmla.f32 q15, q11, d7[1] \n\t" // DST->M[m12-m15] += M1[m12-m15] * M2[m15]
  126. "vst1.32 {d24 - d27}, [%0]! \n\t" // DST->M[m0-m7]
  127. "vst1.32 {d28 - d31}, [%0] \n\t" // DST->M[m8-m15]
  128. : // output
  129. : "r"(dst), "r"(m1), "r"(m2) // input - note *value* of pointer doesn't change.
  130. : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  131. );
  132. }
  133. inline void MathUtilNeon::negateMatrix(const float* m, float* dst)
  134. {
  135. asm volatile(
  136. "vld1.32 {q0-q1}, [%1]! \n\t" // load m0-m7
  137. "vld1.32 {q2-q3}, [%1] \n\t" // load m8-m15
  138. "vneg.f32 q4, q0 \n\t" // negate m0-m3
  139. "vneg.f32 q5, q1 \n\t" // negate m4-m7
  140. "vneg.f32 q6, q2 \n\t" // negate m8-m15
  141. "vneg.f32 q7, q3 \n\t" // negate m8-m15
  142. "vst1.32 {q4-q5}, [%0]! \n\t" // store m0-m7
  143. "vst1.32 {q6-q7}, [%0] \n\t" // store m8-m15
  144. :
  145. : "r"(dst), "r"(m)
  146. : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "memory"
  147. );
  148. }
  149. inline void MathUtilNeon::transposeMatrix(const float* m, float* dst)
  150. {
  151. asm volatile(
  152. "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%1]! \n\t" // DST->M[m0, m4, m8, m12] = M[m0-m3]
  153. "vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1]! \n\t" // DST->M[m1, m5, m9, m12] = M[m4-m7]
  154. "vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%1]! \n\t" // DST->M[m2, m6, m10, m12] = M[m8-m11]
  155. "vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%1] \n\t" // DST->M[m3, m7, m11, m12] = M[m12-m15]
  156. "vst1.32 {q0-q1}, [%0]! \n\t" // DST->M[m0-m7]
  157. "vst1.32 {q2-q3}, [%0] \n\t" // DST->M[m8-m15]
  158. :
  159. : "r"(dst), "r"(m)
  160. : "q0", "q1", "q2", "q3", "memory"
  161. );
  162. }
  163. inline void MathUtilNeon::transformVec4(const float* m, float x, float y, float z, float w, float* dst)
  164. {
  165. asm volatile(
  166. "vld1.32 {d0[0]}, [%1] \n\t" // V[x]
  167. "vld1.32 {d0[1]}, [%2] \n\t" // V[y]
  168. "vld1.32 {d1[0]}, [%3] \n\t" // V[z]
  169. "vld1.32 {d1[1]}, [%4] \n\t" // V[w]
  170. "vld1.32 {d18 - d21}, [%5]! \n\t" // M[m0-m7]
  171. "vld1.32 {d22 - d25}, [%5] \n\t" // M[m8-m15]
  172. "vmul.f32 q13, q9, d0[0] \n\t" // DST->V = M[m0-m3] * V[x]
  173. "vmla.f32 q13, q10, d0[1] \n\t" // DST->V += M[m4-m7] * V[y]
  174. "vmla.f32 q13, q11, d1[0] \n\t" // DST->V += M[m8-m11] * V[z]
  175. "vmla.f32 q13, q12, d1[1] \n\t" // DST->V += M[m12-m15] * V[w]
  176. "vst1.32 {d26}, [%0]! \n\t" // DST->V[x, y]
  177. "vst1.32 {d27[0]}, [%0] \n\t" // DST->V[z]
  178. :
  179. : "r"(dst), "r"(&x), "r"(&y), "r"(&z), "r"(&w), "r"(m)
  180. : "q0", "q9", "q10","q11", "q12", "q13", "memory"
  181. );
  182. }
  183. inline void MathUtilNeon::transformVec4(const float* m, const float* v, float* dst)
  184. {
  185. asm volatile
  186. (
  187. "vld1.32 {d0, d1}, [%1] \n\t" // V[x, y, z, w]
  188. "vld1.32 {d18 - d21}, [%2]! \n\t" // M[m0-m7]
  189. "vld1.32 {d22 - d25}, [%2] \n\t" // M[m8-m15]
  190. "vmul.f32 q13, q9, d0[0] \n\t" // DST->V = M[m0-m3] * V[x]
  191. "vmla.f32 q13, q10, d0[1] \n\t" // DST->V = M[m4-m7] * V[y]
  192. "vmla.f32 q13, q11, d1[0] \n\t" // DST->V = M[m8-m11] * V[z]
  193. "vmla.f32 q13, q12, d1[1] \n\t" // DST->V = M[m12-m15] * V[w]
  194. "vst1.32 {d26, d27}, [%0] \n\t" // DST->V
  195. :
  196. : "r"(dst), "r"(v), "r"(m)
  197. : "q0", "q9", "q10","q11", "q12", "q13", "memory"
  198. );
  199. }
  200. inline void MathUtilNeon::crossVec3(const float* v1, const float* v2, float* dst)
  201. {
  202. asm volatile(
  203. "vld1.32 {d1[1]}, [%1] \n\t" //
  204. "vld1.32 {d0}, [%2] \n\t" //
  205. "vmov.f32 s2, s1 \n\t" // q0 = (v1y, v1z, v1z, v1x)
  206. "vld1.32 {d2[1]}, [%3] \n\t" //
  207. "vld1.32 {d3}, [%4] \n\t" //
  208. "vmov.f32 s4, s7 \n\t" // q1 = (v2z, v2x, v2y, v2z)
  209. "vmul.f32 d4, d0, d2 \n\t" // x = v1y * v2z, y = v1z * v2x
  210. "vmls.f32 d4, d1, d3 \n\t" // x -= v1z * v2y, y-= v1x - v2z
  211. "vmul.f32 d5, d3, d1[1] \n\t" // z = v1x * v2y
  212. "vmls.f32 d5, d0, d2[1] \n\t" // z-= v1y * vx
  213. "vst1.32 {d4}, [%0]! \n\t" // V[x, y]
  214. "vst1.32 {d5[0]}, [%0] \n\t" // V[z]
  215. :
  216. : "r"(dst), "r"(v1), "r"((v1+1)), "r"(v2), "r"((v2+1))
  217. : "q0", "q1", "q2", "memory"
  218. );
  219. }
  220. NS_CC_MATH_END