mdctARM.s

  1. @ Tremolo library
  2. @-----------------------------------------------------------------------
  3. @ Copyright (C) 2002-2009, Xiph.org Foundation
  4. @ Copyright (C) 2010, Robin Watts for Pinknoise Productions Ltd
  5. @ All rights reserved.
  6. @ Redistribution and use in source and binary forms, with or without
  7. @ modification, are permitted provided that the following conditions
  8. @ are met:
  9. @ * Redistributions of source code must retain the above copyright
  10. @ notice, this list of conditions and the following disclaimer.
  11. @ * Redistributions in binary form must reproduce the above
  12. @ copyright notice, this list of conditions and the following disclaimer
  13. @ in the documentation and/or other materials provided with the
  14. @ distribution.
  15. @ * Neither the names of the Xiph.org Foundation nor Pinknoise
  16. @ Productions Ltd nor the names of its contributors may be used to
  17. @ endorse or promote products derived from this software without
  18. @ specific prior written permission.
  19. @
  20. @ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. @ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. @ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. @ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. @ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. @ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26. @ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27. @ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28. @ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29. @ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. @ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. @ ----------------------------------------------------------------------
  32. .text
  33. @ full accuracy version
  34. .global mdct_backwardARM
  35. .global mdct_shift_right
  36. .global mdct_unroll_prelap
  37. .global mdct_unroll_part2
  38. .global mdct_unroll_part3
  39. .global mdct_unroll_postlap
  40. .extern sincos_lookup0
  41. .extern sincos_lookup1
  42. .hidden sincos_lookup0
  43. .hidden sincos_lookup1
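@ The routines below are the fixed-point inverse MDCT (mdct_backwardARM) and
@ the lap/unroll helpers (mdct_unroll_*, mdct_shift_right) exported above.
@ sincos_lookup0/sincos_lookup1 are the sin/cos twiddle tables defined
@ elsewhere in Tremolo, reached through the PC-relative offsets stored at
@ .Lsincos_lookup at the end of this file.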
  44. mdct_unroll_prelap:
  45. @ r0 = out
  46. @ r1 = post
  47. @ r2 = r
  48. @ r3 = step
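@ Reads words backwards from r down towards post, emitting each as a
@ saturated 16-bit sample ((*--r)>>9) into out with a stride of 'step'
@ halfwords (r3 is doubled below to convert that stride to bytes).
@ The recurring four-instruction pattern
@     MOV   r14,rX,ASR #15
@     TEQ   r14,r14,ASR #31
@     EORNE rX, r4, r14,ASR #31
@     STRH  rX, [r0], r3
@ is the CLIP_TO_15 idiom used throughout this file: r14 = v>>15 is 0 or -1
@ exactly when v fits in a signed halfword, so TEQ sets Z and the EOR is
@ skipped; otherwise v is replaced by r4 ^ (v>>31) with r4 = ~0x8000, which
@ leaves +32767 or -32768 in the stored halfword.  Roughly, in C:
@     if ((v >> 15) != (v >> 31)) v = (v >> 31) ^ 0x7fff;
@     *out = (short)v;  out += step;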
  49. STMFD r13!,{r4-r7,r14}
  50. MVN r4, #0x8000
  51. MOV r3, r3, LSL #1
  52. SUB r1, r2, r1 @ r1 = r - post
  53. SUBS r1, r1, #16 @ r1 = r - post - 16
  54. BLT unroll_over
  55. unroll_loop:
  56. LDMDB r2!,{r5,r6,r7,r12}
  57. MOV r5, r5, ASR #9 @ r5 = (*--r)>>9
  58. MOV r6, r6, ASR #9 @ r6 = (*--r)>>9
  59. MOV r7, r7, ASR #9 @ r7 = (*--r)>>9
  60. MOV r12,r12,ASR #9 @ r12= (*--r)>>9
  61. MOV r14,r12,ASR #15
  62. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  63. EORNE r12,r4, r14,ASR #31
  64. STRH r12,[r0], r3
  65. MOV r14,r7, ASR #15
  66. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  67. EORNE r7, r4, r14,ASR #31
  68. STRH r7, [r0], r3
  69. MOV r14,r6, ASR #15
  70. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  71. EORNE r6, r4, r14,ASR #31
  72. STRH r6, [r0], r3
  73. MOV r14,r5, ASR #15
  74. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  75. EORNE r5, r4, r14,ASR #31
  76. STRH r5, [r0], r3
  77. SUBS r1, r1, #16
  78. BGE unroll_loop
  79. unroll_over:
  80. ADDS r1, r1, #16
  81. BLE unroll_end
  82. unroll_loop2:
  83. LDR r5,[r2,#-4]!
  84. @ stall
  85. @ stall (Xscale)
  86. MOV r5, r5, ASR #9 @ r5 = (*--r)>>9
  87. MOV r14,r5, ASR #15
  88. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  89. EORNE r5, r4, r14,ASR #31
  90. STRH r5, [r0], r3
  91. SUBS r1, r1, #4
  92. BGT unroll_loop2
  93. unroll_end:
  94. LDMFD r13!,{r4-r7,PC}
  95. mdct_unroll_postlap:
  96. @ r0 = out
  97. @ r1 = post
  98. @ r2 = l
  99. @ r3 = step
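@ Mirror of mdct_unroll_prelap: every other word of l is read forwards
@ (l += 2 per sample), negated, then >>9, clipped to 16 bits and stored with
@ the same 'step' stride.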
  100. STMFD r13!,{r4-r7,r14}
  101. MVN r4, #0x8000
  102. MOV r3, r3, LSL #1
  103. SUB r1, r1, r2 @ r1 = post - l
  104. MOV r1, r1, ASR #1 @ r1 = (post - l)>>1
  105. SUBS r1, r1, #16 @ r1 = ((post - l)>>1) - 16
  106. BLT unroll_over3
  107. unroll_loop3:
  108. LDR r12,[r2],#8
  109. LDR r7, [r2],#8
  110. LDR r6, [r2],#8
  111. LDR r5, [r2],#8
  112. RSB r12,r12,#0
  113. RSB r5, r5, #0
  114. RSB r6, r6, #0
  115. RSB r7, r7, #0
  116. MOV r12, r12,ASR #9 @ r12= (-*l)>>9
  117. MOV r5, r5, ASR #9 @ r5 = (-*l)>>9
  118. MOV r6, r6, ASR #9 @ r6 = (-*l)>>9
  119. MOV r7, r7, ASR #9 @ r7 = (-*l)>>9
  120. MOV r14,r12,ASR #15
  121. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  122. EORNE r12,r4, r14,ASR #31
  123. STRH r12,[r0], r3
  124. MOV r14,r7, ASR #15
  125. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  126. EORNE r7, r4, r14,ASR #31
  127. STRH r7, [r0], r3
  128. MOV r14,r6, ASR #15
  129. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  130. EORNE r6, r4, r14,ASR #31
  131. STRH r6, [r0], r3
  132. MOV r14,r5, ASR #15
  133. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  134. EORNE r5, r4, r14,ASR #31
  135. STRH r5, [r0], r3
  136. SUBS r1, r1, #16
  137. BGE unroll_loop3
  138. unroll_over3:
  139. ADDS r1, r1, #16
  140. BLE unroll_over4
  141. unroll_loop4:
  142. LDR r5,[r2], #8
  143. @ stall
  144. @ stall (Xscale)
  145. RSB r5, r5, #0
  146. MOV r5, r5, ASR #9 @ r5 = (-*l)>>9
  147. MOV r14,r5, ASR #15
  148. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  149. EORNE r5, r4, r14,ASR #31
  150. STRH r5, [r0], r3
  151. SUBS r1, r1, #4
  152. BGT unroll_loop4
  153. unroll_over4:
  154. LDMFD r13!,{r4-r7,PC}
  155. mdct_unroll_part2:
  156. @ r0 = out
  157. @ r1 = post
  158. @ r2 = l
  159. @ r3 = r
  160. @ <> = step
  161. @ <> = wL
  162. @ <> = wR
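@ Overlap-add of l (walking down by pairs) against r (walking down) through
@ the window halves wL (walking up) and wR (walking down).  Writing hi32(a,b)
@ for the top 32 bits of the 64-bit product a*b, each iteration of
@ unroll_loop5 is roughly:
@     l -= 2;
@     v = hi32(*l, *wL++) + hi32(*--r, *--wR);
@     *out = CLIP_TO_15(v >> 8);  out += step;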
  163. MOV r12,r13
  164. STMFD r13!,{r4,r6-r11,r14}
  165. LDMFD r12,{r8,r9,r10} @ r8 = step
  166. @ r9 = wL
  167. @ r10= wR
  168. MVN r4, #0x8000
  169. MOV r8, r8, LSL #1
  170. SUBS r1, r3, r1 @ r1 = (r - post)
  171. BLE unroll_over5
  172. unroll_loop5:
  173. LDR r12,[r2, #-8]! @ r12= *l (but l -= 2 first)
  174. LDR r11,[r9],#4 @ r11= *wL++
  175. LDR r7, [r3, #-4]! @ r7 = *--r
  176. LDR r6, [r10,#-4]! @ r6 = *--wR
  177. @ Can save a cycle here, at the cost of 1bit errors in rounding
  178. SMULL r14,r11,r12,r11 @ (r14,r11) = *l * *wL++
  179. SMULL r14,r6, r7, r6 @ (r14,r6) = *--r * *--wR
  180. ADD r6, r6, r11
  181. MOV r6, r6, ASR #8
  182. MOV r14,r6, ASR #15
  183. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  184. EORNE r6, r4, r14,ASR #31
  185. STRH r6, [r0], r8
  186. SUBS r1, r1, #4
  187. BGT unroll_loop5
  188. unroll_over5:
  189. LDMFD r13!,{r4,r6-r11,PC}
  190. mdct_unroll_part3:
  191. @ r0 = out
  192. @ r1 = post
  193. @ r2 = l
  194. @ r3 = r
  195. @ <> = step
  196. @ <> = wL
  197. @ <> = wR
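@ As mdct_unroll_part2, but l and r both walk forwards (l += 2, r++ per
@ sample) and the products are differenced instead of summed; roughly:
@     v = hi32(*r++, *--wR) - hi32(*l, *wL++);  l += 2;
@     *out = CLIP_TO_15(v >> 8);  out += step;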
  198. MOV r12,r13
  199. STMFD r13!,{r4,r6-r11,r14}
  200. LDMFD r12,{r8,r9,r10} @ r8 = step
  201. @ r9 = wL
  202. @ r10= wR
  203. MVN r4, #0x8000
  204. MOV r8, r8, LSL #1
  205. SUBS r1, r1, r3 @ r1 = (post - r)
  206. BLE unroll_over6
  207. unroll_loop6:
  208. LDR r12,[r2],#8 @ r12= *l (l += 2 after)
  209. LDR r11,[r9],#4 @ r11= *wL++
  210. LDR r7, [r3],#4 @ r7 = *r++
  211. LDR r6, [r10,#-4]! @ r6 = *--wR
  212. @ Can save a cycle here, at the cost of 1bit errors in rounding
  213. SMULL r14,r11,r12,r11 @ (r14,r11) = *l * *wL++
  214. SMULL r14,r6, r7, r6 @ (r14,r6) = *r++ * *--wR
  215. SUB r6, r6, r11
  216. MOV r6, r6, ASR #8
  217. MOV r14,r6, ASR #15
  218. TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
  219. EORNE r6, r4, r14,ASR #31
  220. STRH r6, [r0], r8
  221. SUBS r1, r1, #4
  222. BGT unroll_loop6
  223. unroll_over6:
  224. LDMFD r13!,{r4,r6-r11,PC}
  225. mdct_shift_right:
  226. @ r0 = n
  227. @ r1 = in
  228. @ r2 = right
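@ Copies the n/4 odd-indexed words of in[] contiguously into right[];
@ roughly:
@     for (i = 0; i < n >> 2; i++) right[i] = in[(i << 1) + 1];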
  229. STMFD r13!,{r4-r11,r14}
  230. MOV r0, r0, LSR #2 @ n >>= 2
  231. ADD r1, r1, #4
  232. SUBS r0, r0, #8
  233. BLT sr_less_than_8
  234. sr_loop:
  235. LDR r3, [r1], #8
  236. LDR r4, [r1], #8
  237. LDR r5, [r1], #8
  238. LDR r6, [r1], #8
  239. LDR r7, [r1], #8
  240. LDR r8, [r1], #8
  241. LDR r12,[r1], #8
  242. LDR r14,[r1], #8
  243. SUBS r0, r0, #8
  244. STMIA r2!,{r3,r4,r5,r6,r7,r8,r12,r14}
  245. BGE sr_loop
  246. sr_less_than_8:
  247. ADDS r0, r0, #8
  248. BEQ sr_end
  249. sr_loop2:
  250. LDR r3, [r1], #8
  251. SUBS r0, r0, #1
  252. STR r3, [r2], #4
  253. BGT sr_loop2
  254. sr_end:
  255. LDMFD r13!,{r4-r11,PC}
  256. mdct_backwardARM:
  257. @ r0 = n
  258. @ r1 = in
  259. STMFD r13!,{r4-r11,r14}
  260. MOV r2,#1<<4 @ r2 = 1<<shift
  261. MOV r3,#13-4 @ r3 = 13-shift
  262. find_shift_loop:
  263. TST r0,r2 @ if (n & (1<<shift)) == 0
  264. MOV r2,r2,LSL #1
  265. SUBEQ r3,r3,#1 @ shift--
  266. BEQ find_shift_loop
  267. MOV r2,#2
  268. MOV r2,r2,LSL r3 @ r2 = step = 2<<shift
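@ For the power-of-two n used here this leaves r3 = shift such that
@ (n << shift) == 1<<13, and r2 = step = 2<<shift, the stride used below when
@ walking the sincos lookup table (T += step).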
  269. @ presymmetry
  270. @ r0 = n (a multiple of 4)
  271. @ r1 = in
  272. @ r2 = step
  273. @ r3 = shift
  274. ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1)
  275. ADD r14,r1, r0 @ r14= in+(n>>2)
  276. SUB r4, r4, #3*4 @ r4 = aX = in+n2-3
  277. ADRL r7, .Lsincos_lookup
  278. LDR r5, [r7] @ r5 = T=sincos_lookup0
  279. ADD r5, r7
  280. presymmetry_loop1:
  281. LDR r7, [r4,#8] @ r7 = s2 = aX[2]
  282. LDR r11,[r5,#4] @ r11= T[1]
  283. LDR r6, [r4] @ r6 = s0 = aX[0]
  284. LDR r10,[r5],r2,LSL #2 @ r10= T[0] T += step
  285. @ XPROD31(s0, s2, T[0], T[1], &aX[0], &aX[2])
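@ For reference, XPROD31(a, b, t, v, &x, &y) computes x = (a*t + b*v)>>31 and
@ y = (b*t - a*v)>>31; each result here is the top word of the 64-bit
@ multiply-accumulate, with the final LSL #1 supplying the >>31.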
  286. SMULL r8, r9, r7, r11 @ (r8, r9) = s2*T[1]
  287. @ stall
  288. @ stall ?
  289. SMLAL r8, r9, r6, r10 @ (r8, r9) += s0*T[0]
  290. RSB r6, r6, #0
  291. @ stall ?
  292. SMULL r8, r12,r7, r10 @ (r8, r12) = s2*T[0]
  293. MOV r9, r9, LSL #1
  294. @ stall ?
  295. SMLAL r8, r12,r6, r11 @ (r8, r12) -= s0*T[1]
  296. STR r9, [r4],#-16 @ aX[0] = r9
  297. CMP r4,r14
  298. MOV r12,r12,LSL #1
  299. STR r12,[r4,#8+16] @ aX[2] = r12
  300. BGE presymmetry_loop1 @ while (aX >= in+n4)
  301. presymmetry_loop2:
  302. LDR r6,[r4] @ r6 = s0 = aX[0]
  303. LDR r10,[r5,#4] @ r10= T[1]
  304. LDR r7,[r4,#8] @ r7 = s2 = aX[2]
  305. LDR r11,[r5],-r2,LSL #2 @ r11= T[0] T -= step
  306. @ XPROD31(s0, s2, T[1], T[0], &aX[0], &aX[2])
  307. SMULL r8, r9, r6, r10 @ (r8, r9) = s0*T[1]
  308. @ stall
  309. @ stall ?
  310. SMLAL r8, r9, r7, r11 @ (r8, r9) += s2*T[0]
  311. RSB r6, r6, #0
  312. @ stall ?
  313. SMULL r8, r12,r7, r10 @ (r8, r12) = s2*T[1]
  314. MOV r9, r9, LSL #1
  315. @ stall ?
  316. SMLAL r8, r12,r6, r11 @ (r8, r12) -= s0*T[0]
  317. STR r9, [r4],#-16 @ aX[0] = r9
  318. CMP r4,r1
  319. MOV r12,r12,LSL #1
  320. STR r12,[r4,#8+16] @ aX[2] = r12
  321. BGE presymmetry_loop2 @ while (aX >= in)
  322. @ r0 = n
  323. @ r1 = in
  324. @ r2 = step
  325. @ r3 = shift
  326. STMFD r13!,{r3}
  327. ADRL r4, .Lsincos_lookup
  328. LDR r5, [r4] @ r5 = T=sincos_lookup0
  329. ADD r5, r4
  330. ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1)
  331. SUB r4, r4, #4*4 @ r4 = aX = in+(n>>1)-4
  332. LDR r11,[r5,#4] @ r11= T[1]
  333. LDR r10,[r5],r2, LSL #2 @ r10= T[0] T += step
  334. presymmetry_loop3:
  335. LDR r8,[r1],#16 @ r8 = ro0 = bX[0]
  336. LDR r9,[r1,#8-16] @ r9 = ro2 = bX[2]
  337. LDR r6,[r4] @ r6 = ri0 = aX[0]
  338. @ XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] )
  339. @ aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31
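@ For reference, XNPROD31(a, b, t, v, &x, &y) computes x = (a*t - b*v)>>31
@ and y = (b*t + a*v)>>31, matching the expansion in the line above.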
  340. SMULL r14,r12,r8, r11 @ (r14,r12) = ro0*T[1]
  341. RSB r8,r8,#0 @ r8 = -ro0
  342. @ Stall ?
  343. SMLAL r14,r12,r9, r10 @ (r14,r12) += ro2*T[0]
  344. LDR r7,[r4,#8] @ r7 = ri2 = aX[2]
  345. @ Stall ?
  346. SMULL r14,r3, r9, r11 @ (r14,r3) = ro2*T[1]
  347. MOV r12,r12,LSL #1
  348. LDR r11,[r5,#4] @ r11= T[1]
  349. SMLAL r14,r3, r8, r10 @ (r14,r3) -= ro0*T[0]
  350. LDR r10,[r5],r2, LSL #2 @ r10= T[0] T += step
  351. STR r12,[r4,#8]
  352. MOV r3, r3, LSL #1
  353. STR r3, [r4],#-16
  354. @ XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] )
  355. @ bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31
  356. SMULL r14,r12,r6, r10 @ (r14,r12) = ri0*T[0]
  357. RSB r6,r6,#0 @ r6 = -ri0
  358. @ stall ?
  359. SMLAL r14,r12,r7, r11 @ (r14,r12) += ri2*T[1]
  360. @ stall ?
  361. @ stall ?
  362. SMULL r14,r3, r7, r10 @ (r14,r3) = ri2*T[0]
  363. MOV r12,r12,LSL #1
  364. @ stall ?
  365. SMLAL r14,r3, r6, r11 @ (r14,r3) -= ri0*T[1]
  366. CMP r4,r1
  367. STR r12,[r1,#8-16]
  368. MOV r3, r3, LSL #1
  369. STR r3, [r1,#-16]
  370. BGE presymmetry_loop3
  371. SUB r1,r1,r0 @ r1 = in -= n>>2 (i.e. restore in)
  372. LDR r3,[r13]
  373. STR r2,[r13,#-4]!
  374. @ mdct_butterflies
  375. @ r0 = n = (points * 2)
  376. @ r1 = in = x
  377. @ r2 = i
  378. @ r3 = shift
  379. STMFD r13!,{r0-r1}
  380. ADRL r4, .Lsincos_lookup
  381. LDR r5, [r4]
  382. ADD r5, r4
  383. RSBS r4,r3,#6 @ r4 = stages = 7-shift then --stages
  384. BLE no_generics
  385. MOV r14,#4 @ r14= 4 (i=0)
  386. MOV r6, r14,LSL r3 @ r6 = (4<<i)<<shift
  387. mdct_butterflies_loop1:
  388. MOV r0, r0, LSR #1 @ r0 = points>>i = POINTS
  389. MOV r2, r14,LSR #2 @ r2 = (1<<i)-j (j=0)
  390. STMFD r13!,{r4,r14}
  391. mdct_butterflies_loop2:
  392. @ mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift))
  393. @ mdct_butterfly_generic(r1, r0, r6)
  394. @ r0 = points
  395. @ r1 = x
  396. @ preserve r2 (external loop counter)
  397. @ preserve r3
  398. @ preserve r4 (external loop counter)
  399. @ r5 = T = sincos_lookup0
  400. @ r6 = step
  401. @ preserve r14
  402. STR r2,[r13,#-4]! @ stack r2
  403. ADD r1,r1,r0,LSL #1 @ r1 = x2+4 = x + (POINTS>>1)
  404. ADD r7,r1,r0,LSL #1 @ r7 = x1+4 = x + POINTS
  405. ADD r12,r5,#1024*4 @ r12= sincos_lookup0+1024
  406. mdct_bufferfly_generic_loop1:
  407. LDMDB r7!,{r2,r3,r8,r11} @ r2 = x1[0]
  408. @ r3 = x1[1]
  409. @ r8 = x1[2]
  410. @ r11= x1[3] x1 -= 4
  411. LDMDB r1!,{r4,r9,r10,r14} @ r4 = x2[0]
  412. @ r9 = x2[1]
  413. @ r10= x2[2]
  414. @ r14= x2[3] x2 -= 4
  415. SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1]
  416. ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0])
  417. SUB r11,r11,r8 @ r11= s1 = x1[3] - x1[2]
  418. ADD r8, r11,r8, LSL #1 @ r8 = x1[3] + x1[2] (-> x1[2])
  419. SUB r9, r9, r4 @ r9 = s2 = x2[1] - x2[0]
  420. ADD r4, r9, r4, LSL #1 @ r4 = x2[1] + x2[0] (-> x1[1])
  421. SUB r14,r14,r10 @ r14= s3 = x2[3] - x2[2]
  422. ADD r10,r14,r10,LSL #1 @ r10= x2[3] + x2[2] (-> x1[3])
  423. STMIA r7,{r3,r4,r8,r10}
  424. @ r0 = points
  425. @ r1 = x2
  426. @ r2 = s0
  427. @ r3 free
  428. @ r4 free
  429. @ r5 = T
  430. @ r6 = step
  431. @ r7 = x1
  432. @ r8 free
  433. @ r9 = s2
  434. @ r10 free
  435. @ r11= s1
  436. @ r12= limit
  437. @ r14= s3
  438. LDR r8, [r5,#4] @ r8 = T[1]
  439. LDR r10,[r5],r6,LSL #2 @ r10= T[0] T += step
  440. @ XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2])
  441. @ x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31
  442. @ stall Xscale
  443. SMULL r4, r3, r2, r8 @ (r4, r3) = s0*T[1]
  444. SMLAL r4, r3, r11,r10 @ (r4, r3) += s1*T[0]
  445. RSB r11,r11,#0
  446. SMULL r11,r4, r8, r11 @ (r11,r4) = -s1*T[1]
  447. SMLAL r11,r4, r2, r10 @ (r11,r4) += s0*T[0]
  448. MOV r2, r3, LSL #1 @ r2 = r3<<1 = Value for x2[0]
  449. @ XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3])
  450. @ x2[1] = (s2*T[0] + s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31
  451. SMULL r11,r3, r9, r10 @ (r11,r3) = s2*T[0]
  452. MOV r4, r4, LSL #1 @ r4 = r4<<1 = Value for x2[2]
  453. SMLAL r11,r3, r14,r8 @ (r11,r3) += s3*T[1]
  454. RSB r9, r9, #0
  455. SMULL r10,r11,r14,r10 @ (r10,r11) = s3*T[0]
  456. MOV r3, r3, LSL #1 @ r3 = r3<<1 = Value for x2[1]
  457. SMLAL r10,r11,r9,r8 @ (r10,r11) -= s2*T[1]
  458. CMP r5, r12
  459. MOV r11,r11,LSL #1 @ r11= r11<<1 = Value for x2[3]
  460. STMIA r1,{r2,r3,r4,r11}
  461. BLT mdct_bufferfly_generic_loop1
  462. SUB r12,r12,#1024*4
  463. mdct_bufferfly_generic_loop2:
  464. LDMDB r7!,{r2,r3,r9,r10} @ r2 = x1[0]
  465. @ r3 = x1[1]
  466. @ r9 = x1[2]
  467. @ r10= x1[3] x1 -= 4
  468. LDMDB r1!,{r4,r8,r11,r14} @ r4 = x2[0]
  469. @ r8 = x2[1]
  470. @ r11= x2[2]
  471. @ r14= x2[3] x2 -= 4
  472. SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1]
  473. ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0])
  474. SUB r9, r9,r10 @ r9 = s1 = x1[2] - x1[3]
  475. ADD r10,r9,r10, LSL #1 @ r10= x1[2] + x1[3] (-> x1[2])
  476. SUB r4, r4, r8 @ r4 = s2 = x2[0] - x2[1]
  477. ADD r8, r4, r8, LSL #1 @ r8 = x2[0] + x2[1] (-> x1[1])
  478. SUB r14,r14,r11 @ r14= s3 = x2[3] - x2[2]
  479. ADD r11,r14,r11,LSL #1 @ r11= x2[3] + x2[2] (-> x1[3])
  480. STMIA r7,{r3,r8,r10,r11}
  481. @ r0 = points
  482. @ r1 = x2
  483. @ r2 = s0
  484. @ r3 free
  485. @ r4 = s2
  486. @ r5 = T
  487. @ r6 = step
  488. @ r7 = x1
  489. @ r8 free
  490. @ r9 = s1
  491. @ r10 free
  492. @ r11 free
  493. @ r12= limit
  494. @ r14= s3
  495. LDR r8, [r5,#4] @ r8 = T[1]
  496. LDR r10,[r5],-r6,LSL #2 @ r10= T[0] T -= step
  497. @ XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2])
  498. @ x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31
  499. @ stall Xscale
  500. SMULL r3, r11,r2, r8 @ (r3, r11) = s0*T[1]
  501. SMLAL r3, r11,r9, r10 @ (r3, r11) += s1*T[0]
  502. RSB r9, r9, #0
  503. SMULL r3, r2, r10,r2 @ (r3, r2) = s0*T[0]
  504. SMLAL r3, r2, r9, r8 @ (r3, r2) += -s1*T[1]
  505. MOV r9, r11,LSL #1 @ r9 = r11<<1 = Value for x2[2]
  506. @ XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3])
  507. @ x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + s3*T[1])>>31
  508. SMULL r3, r11,r4, r10 @ (r3,r11) = s2*T[0]
  509. MOV r2, r2, LSL #1 @ r2 = r2<<1 = Value for x2[0]
  510. SMLAL r3, r11,r14,r8 @ (r3,r11) += s3*T[1]
  511. RSB r4, r4, #0
  512. SMULL r10,r3,r14,r10 @ (r10,r3) = s3*T[0]
  513. MOV r11,r11,LSL #1 @ r11= r11<<1 = Value for x2[3]
  514. SMLAL r10,r3, r4, r8 @ (r10,r3) -= s2*T[1]
  515. CMP r5, r12
  516. MOV r3, r3, LSL #1 @ r3 = r3<<1 = Value for x2[1]
  517. STMIA r1,{r2,r3,r9,r11}
  518. BGT mdct_bufferfly_generic_loop2
  519. LDR r2,[r13],#4 @ unstack r2
  520. ADD r1, r1, r0, LSL #2 @ r1 = x+POINTS*j
  521. @ stall Xscale
  522. SUBS r2, r2, #1 @ r2-- (j++)
  523. BGT mdct_butterflies_loop2
  524. LDMFD r13!,{r4,r14}
  525. LDR r1,[r13,#4]
  526. SUBS r4, r4, #1 @ stages--
  527. MOV r14,r14,LSL #1 @ r14= 4<<i (i++)
  528. MOV r6, r6, LSL #1 @ r6 = step <<= 1 (i++)
  529. BGE mdct_butterflies_loop1
  530. LDMFD r13,{r0-r1}
  531. no_generics:
  532. @ mdct_butterflies part2 (loop around mdct_butterfly_32)
  533. @ r0 = points
  534. @ r1 = in
  535. @ r2 = step
  536. @ r3 = shift
  537. mdct_bufferflies_loop3:
  538. @ mdct_butterfly_32
  539. @ block1
  540. ADD r4, r1, #16*4 @ r4 = &in[16]
  541. LDMIA r4,{r5,r6,r9,r10} @ r5 = x[16]
  542. @ r6 = x[17]
  543. @ r9 = x[18]
  544. @ r10= x[19]
  545. LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0]
  546. @ r8 = x[1]
  547. @ r11= x[2]
  548. @ r12= x[3]
  549. SUB r5, r5, r6 @ r5 = s0 = x[16] - x[17]
  550. ADD r6, r5, r6, LSL #1 @ r6 = x[16] + x[17] -> x[16]
  551. SUB r9, r9, r10 @ r9 = s1 = x[18] - x[19]
  552. ADD r10,r9, r10,LSL #1 @ r10= x[18] + x[19] -> x[18]
  553. SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0]
  554. ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[17]
  555. SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2]
  556. ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[19]
  557. STMIA r4!,{r6,r7,r10,r11}
  558. LDR r6,cPI1_8
  559. LDR r7,cPI3_8
  560. @ XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] )
  561. @ x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8
  562. @ stall Xscale
  563. SMULL r14,r11,r5, r6 @ (r14,r11) = s0*cPI1_8
  564. SMLAL r14,r11,r9, r7 @ (r14,r11) += s1*cPI3_8
  565. RSB r9, r9, #0
  566. SMULL r14,r5, r7, r5 @ (r14,r5) = s0*cPI3_8
  567. SMLAL r14,r5, r9, r6 @ (r14,r5) -= s1*cPI1_8
  568. MOV r11,r11,LSL #1
  569. MOV r5, r5, LSL #1
  570. @ XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] )
  571. @ x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8
  572. SMULL r14,r9, r8, r6 @ (r14,r9) = s2*cPI1_8
  573. SMLAL r14,r9, r12,r7 @ (r14,r9) += s3*cPI3_8
  574. RSB r8,r8,#0
  575. SMULL r14,r12,r6, r12 @ (r14,r12) = s3*cPI1_8
  576. SMLAL r14,r12,r8, r7 @ (r14,r12) -= s2*cPI3_8
  577. MOV r9, r9, LSL #1
  578. MOV r12,r12,LSL #1
  579. STMIA r1!,{r5,r9,r11,r12}
  580. @ block2
  581. LDMIA r4,{r5,r6,r9,r10} @ r5 = x[20]
  582. @ r6 = x[21]
  583. @ r9 = x[22]
  584. @ r10= x[23]
  585. LDMIA r1,{r7,r8,r11,r12} @ r7 = x[4]
  586. @ r8 = x[5]
  587. @ r11= x[6]
  588. @ r12= x[7]
  589. SUB r5, r5, r6 @ r5 = s0 = x[20] - x[21]
  590. ADD r6, r5, r6, LSL #1 @ r6 = x[20] + x[21] -> x[20]
  591. SUB r9, r9, r10 @ r9 = s1 = x[22] - x[23]
  592. ADD r10,r9, r10,LSL #1 @ r10= x[22] + x[23] -> x[22]
  593. SUB r8, r8, r7 @ r8 = s2 = x[ 5] - x[ 4]
  594. ADD r7, r8, r7, LSL #1 @ r7 = x[ 5] + x[ 4] -> x[21]
  595. SUB r12,r12,r11 @ r12= s3 = x[ 7] - x[ 6]
  596. ADD r11,r12,r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[23]
  597. LDR r14,cPI2_8
  598. STMIA r4!,{r6,r7,r10,r11}
  599. SUB r5, r5, r9 @ r5 = s0 - s1
  600. ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1
  601. SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8
  602. SUB r12,r12,r8 @ r12= s3 - s2
  603. ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2
  604. SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8
  605. MOV r5, r5, LSL #1
  606. SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8
  607. MOV r8, r8, LSL #1
  608. SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8
  609. MOV r9, r9, LSL #1
  610. MOV r12,r12,LSL #1
  611. STMIA r1!,{r5,r8,r9,r12}
  612. @ block3
  613. LDMIA r4,{r5,r6,r9,r10} @ r5 = x[24]
  614. @ r6 = x[25]
  615. @ r9 = x[26]
  616. @ r10= x[27]
  617. LDMIA r1,{r7,r8,r11,r12} @ r7 = x[8]
  618. @ r8 = x[9]
  619. @ r11= x[10]
  620. @ r12= x[11]
  621. SUB r5, r5, r6 @ r5 = s0 = x[24] - x[25]
  622. ADD r6, r5, r6, LSL #1 @ r6 = x[24] + x[25] -> x[24]
  623. SUB r9, r9, r10 @ r9 = s1 = x[26] - x[27]
  624. ADD r10,r9, r10,LSL #1 @ r10= x[26] + x[27] -> x[26]
  625. SUB r8, r8, r7 @ r8 = s2 = x[ 9] - x[ 8]
  626. ADD r7, r8, r7, LSL #1 @ r7 = x[ 9] + x[ 8] -> x[25]
  627. SUB r12,r12,r11 @ r12= s3 = x[11] - x[10]
  628. ADD r11,r12,r11, LSL #1 @ r11= x[11] + x[10] -> x[27]
  629. STMIA r4!,{r6,r7,r10,r11}
  630. LDR r6,cPI3_8
  631. LDR r7,cPI1_8
  632. @ XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] )
  633. @ x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8
  634. @ stall Xscale
  635. SMULL r14,r11,r5, r6 @ (r14,r11) = s0*cPI3_8
  636. SMLAL r14,r11,r9, r7 @ (r14,r11) += s1*cPI1_8
  637. RSB r9, r9, #0
  638. SMULL r14,r5, r7, r5 @ (r14,r5) = s0*cPI1_8
  639. SMLAL r14,r5, r9, r6 @ (r14,r5) -= s1*cPI3_8
  640. MOV r11,r11,LSL #1
  641. MOV r5, r5, LSL #1
  642. @ XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] )
  643. @ x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8
  644. SMULL r14,r9, r8, r6 @ (r14,r9) = s2*cPI3_8
  645. SMLAL r14,r9, r12,r7 @ (r14,r9) += s3*cPI1_8
  646. RSB r8,r8,#0
  647. SMULL r14,r12,r6, r12 @ (r14,r12) = s3*cPI3_8
  648. SMLAL r14,r12,r8, r7 @ (r14,r12) -= s2*cPI1_8
  649. MOV r9, r9, LSL #1
  650. MOV r12,r12,LSL #1
  651. STMIA r1!,{r5,r9,r11,r12}
  652. @ block4
  653. LDMIA r4,{r5,r6,r10,r11} @ r5 = x[28]
  654. @ r6 = x[29]
  655. @ r10= x[30]
  656. @ r11= x[31]
  657. LDMIA r1,{r8,r9,r12,r14} @ r8 = x[12]
  658. @ r9 = x[13]
  659. @ r12= x[14]
  660. @ r14= x[15]
  661. SUB r5, r5, r6 @ r5 = s0 = x[28] - x[29]
  662. ADD r6, r5, r6, LSL #1 @ r6 = x[28] + x[29] -> x[28]
  663. SUB r7, r14,r12 @ r7 = s3 = x[15] - x[14]
  664. ADD r12,r7, r12, LSL #1 @ r12= x[15] + x[14] -> x[31]
  665. SUB r10,r10,r11 @ r10= s1 = x[30] - x[31]
  666. ADD r11,r10,r11,LSL #1 @ r11= x[30] + x[31] -> x[30]
  667. SUB r14, r8, r9 @ r14= s2 = x[12] - x[13]
  668. ADD r9, r14, r9, LSL #1 @ r9 = x[12] + x[13] -> x[29]
  669. STMIA r4!,{r6,r9,r11,r12}
  670. STMIA r1!,{r5,r7,r10,r14}
  671. @ mdct_butterfly16 (1st version)
  672. @ block 1
  673. SUB r1,r1,#16*4
  674. ADD r4,r1,#8*4
  675. LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8]
  676. @ r6 = x[ 9]
  677. @ r9 = x[10]
  678. @ r10= x[11]
  679. LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0]
  680. @ r8 = x[1]
  681. @ r11= x[2]
  682. @ r12= x[3]
  683. SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9]
  684. ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8]
  685. SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11]
  686. ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10]
  687. SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0]
  688. ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9]
  689. SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2]
  690. ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11]
  691. LDR r14,cPI2_8
  692. STMIA r4!,{r6,r7,r10,r11}
  693. SUB r5, r5, r9 @ r5 = s0 - s1
  694. ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1
  695. SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8
  696. SUB r12,r12,r8 @ r12= s3 - s2
  697. ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2
  698. SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8
  699. MOV r5, r5, LSL #1
  700. SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8
  701. MOV r8, r8, LSL #1
  702. SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8
  703. MOV r9, r9, LSL #1
  704. MOV r12,r12,LSL #1
  705. STMIA r1!,{r5,r8,r9,r12}
  706. @ block4
  707. LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12]
  708. @ r6 = x[13]
  709. @ r9 = x[14]
  710. @ r10= x[15]
  711. LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4]
  712. @ r8 = x[ 5]
  713. @ r11= x[ 6]
  714. @ r12= x[ 7]
  715. SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5]
  716. ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13]
  717. SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6]
  718. ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15]
  719. SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13]
  720. ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12]
  721. SUB r12,r9, r10 @ r12= s3 = x[14] - x[15]
  722. ADD r10,r12,r10,LSL #1 @ r10= x[14] + x[15] -> x[14]
  723. STMIA r4!,{r6,r8,r10,r11}
  724. STMIA r1!,{r5,r7,r12,r14}
  725. @ mdct_butterfly_8
  726. LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14}
  727. @ r6 = x[0]
  728. @ r7 = x[1]
  729. @ r8 = x[2]
  730. @ r9 = x[3]
  731. @ r10= x[4]
  732. @ r11= x[5]
  733. @ r12= x[6]
  734. @ r14= x[7]
  735. ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
  736. SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
  737. ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
  738. SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
  739. ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
  740. SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
  741. ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
  742. SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]
  743. ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
  744. SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
  745. SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
  746. ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
  747. SUB r10,r10,r6 @ r10= x[4] = s4 - s0
  748. SUB r11,r12,r8 @ r11= x[5] = s6 - s2
  749. ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
  750. ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
  751. STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14}
  752. @ mdct_butterfly_8
  753. LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14}
  754. @ r6 = x[0]
  755. @ r7 = x[1]
  756. @ r8 = x[2]
  757. @ r9 = x[3]
  758. @ r10= x[4]
  759. @ r11= x[5]
  760. @ r12= x[6]
  761. @ r14= x[7]
  762. ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
  763. SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
  764. ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
  765. SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
  766. ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
  767. SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
  768. ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
  769. SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]
  770. ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
  771. SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
  772. SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
  773. ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
  774. SUB r10,r10,r6 @ r10= x[4] = s4 - s0
  775. SUB r11,r12,r8 @ r11= x[5] = s6 - s2
  776. ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
  777. ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
  778. STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14}
  779. @ block 2
  780. ADD r1,r1,#16*4-8*4
  781. ADD r4,r1,#8*4
  782. LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8]
  783. @ r6 = x[ 9]
  784. @ r9 = x[10]
  785. @ r10= x[11]
  786. LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0]
  787. @ r8 = x[1]
  788. @ r11= x[2]
  789. @ r12= x[3]
  790. SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9]
  791. ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8]
  792. SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11]
  793. ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10]
  794. SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0]
  795. ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9]
  796. SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2]
  797. ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11]
  798. LDR r14,cPI2_8
  799. STMIA r4!,{r6,r7,r10,r11}
  800. SUB r5, r5, r9 @ r5 = s0 - s1
  801. ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1
  802. SMULL r6, r5, r14,r5 @ (r6,r5) = (s0-s1)*cPI2_8
  803. SUB r12,r12,r8 @ r12= s3 - s2
  804. ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2
  805. SMULL r6, r8, r14,r8 @ (r6,r8) = (s3+s2)*cPI2_8
  806. MOV r5, r5, LSL #1
  807. SMULL r6, r9, r14,r9 @ (r6,r9) = (s0+s1)*cPI2_8
  808. MOV r8, r8, LSL #1
  809. SMULL r6, r12,r14,r12 @ (r6,r12) = (s3-s2)*cPI2_8
  810. MOV r9, r9, LSL #1
  811. MOV r12,r12,LSL #1
  812. STMIA r1!,{r5,r8,r9,r12}
  813. @ block4
  814. LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12]
  815. @ r6 = x[13]
  816. @ r9 = x[14]
  817. @ r10= x[15]
  818. LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4]
  819. @ r8 = x[ 5]
  820. @ r11= x[ 6]
  821. @ r12= x[ 7]
  822. SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13]
  823. ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12]
  824. SUB r9, r9, r10 @ r9 = s3 = x[14] - x[15]
  825. ADD r10,r9, r10,LSL #1 @ r10= x[14] + x[15] -> x[14]
  826. SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5]
  827. ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13]
  828. SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6]
  829. ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15]
  830. STMIA r4!,{r6,r8,r10,r11}
  831. STMIA r1!,{r5,r7,r9,r14}
  832. @ mdct_butterfly_8
  833. LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14}
  834. @ r6 = x[0]
  835. @ r7 = x[1]
  836. @ r8 = x[2]
  837. @ r9 = x[3]
  838. @ r10= x[4]
  839. @ r11= x[5]
  840. @ r12= x[6]
  841. @ r14= x[7]
  842. ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
  843. SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
  844. ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
  845. SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
  846. ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
  847. SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
  848. ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
  849. SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]
  850. ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
  851. SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
  852. SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
  853. ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
  854. SUB r10,r10,r6 @ r10= x[4] = s4 - s0
  855. SUB r11,r12,r8 @ r11= x[5] = s6 - s2
  856. ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
  857. ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
  858. STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14}
  859. @ mdct_butterfly_8
  860. LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14}
  861. @ r6 = x[0]
  862. @ r7 = x[1]
  863. @ r8 = x[2]
  864. @ r9 = x[3]
  865. @ r10= x[4]
  866. @ r11= x[5]
  867. @ r12= x[6]
  868. @ r14= x[7]
  869. ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
  870. SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
  871. ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
  872. SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
  873. ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
  874. SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
  875. ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
  876. SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]
  877. ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
  878. SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
  879. SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
  880. ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
  881. SUB r10,r10,r6 @ r10= x[4] = s4 - s0
  882. SUB r11,r12,r8 @ r11= x[5] = s6 - s2
  883. ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
  884. ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
  885. STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14}
  886. ADD r1,r1,#8*4
  887. SUBS r0,r0,#64
  888. BGT mdct_bufferflies_loop3
  889. LDMFD r13,{r0-r3}
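@ Note: no writeback on this LDMFD - points/in/step/shift stay on the stack
@ and are finally popped by the LDMFD r13!,{r0-r3} after the step-7 code
@ below.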
  890. mdct_bitreverseARM:
  891. @ r0 = points = n
  892. @ r1 = in
  893. @ r2 = step
  894. @ r3 = shift
  895. MOV r4, #0 @ r4 = bit = 0
  896. ADD r5, r1, r0, LSL #1 @ r5 = w = x + (n>>1)
  897. ADR r6, bitrev
  898. SUB r5, r5, #8
  899. brev_lp:
  900. LDRB r7, [r6, r4, LSR #6]
  901. AND r8, r4, #0x3f
  902. LDRB r8, [r6, r8]
  903. ADD r4, r4, #1 @ bit++
  904. @ stall XScale
  905. ORR r7, r7, r8, LSL #6 @ r7 = bitrev[bit]
  906. MOV r7, r7, LSR r3
  907. ADD r9, r1, r7, LSL #2 @ r9 = xx = x + (b>>shift)
  908. CMP r5, r9 @ if (w > xx)
  909. LDR r10,[r5],#-8 @ r10 = w[0] w -= 2
  910. LDRGT r11,[r5,#12] @ r11 = w[1]
  911. LDRGT r12,[r9] @ r12 = xx[0]
  912. LDRGT r14,[r9,#4] @ r14 = xx[1]
  913. STRGT r10,[r9] @ xx[0]= w[0]
  914. STRGT r11,[r9,#4] @ xx[1]= w[1]
  915. STRGT r12,[r5,#8] @ w[0] = xx[0]
  916. STRGT r14,[r5,#12] @ w[1] = xx[1]
  917. CMP r5,r1
  918. BGT brev_lp
  919. @ mdct_step7
  920. @ r0 = points
  921. @ r1 = in
  922. @ r2 = step
  923. @ r3 = shift
  924. CMP r2, #4 @ r5 = T = (step>=4) ?
  925. ADR r7, .Lsincos_lookup @ sincos_lookup0 +
  926. ADDLT r7, #4 @ sincos_lookup1
  927. LDR r5, [r7]
  928. ADD r5, r7
  929. ADD r7, r1, r0, LSL #1 @ r7 = w1 = x + (n>>1)
  930. ADDGE r5, r5, r2, LSL #1 @ (step>>1)
  931. ADD r8, r5, #1024*4 @ r8 = Ttop
  932. step7_loop1:
  933. LDR r6, [r1] @ r6 = w0[0]
  934. LDR r9, [r1,#4] @ r9 = w0[1]
  935. LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2
  936. LDR r11,[r7,#4] @ r11= w1[1]
  937. LDR r14,[r5,#4] @ r14= T[1]
  938. LDR r12,[r5],r2,LSL #2 @ r12= T[0] T += step
  939. ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0]
  940. SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0]
  941. SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1]
  942. ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1]
  943. @ Can save 1 cycle by using SMULL SMLAL - at the cost of being
  944. @ 1 off.
  945. SMULL r0, r3, r6, r14 @ (r0,r3) = s0*T[1]
  946. SMULL r0, r4, r11,r12 @ (r0,r4) += s1*T[0] = s2
  947. ADD r3, r3, r4
  948. SMULL r0, r14,r11,r14 @ (r0,r14) = s1*T[1]
  949. SMULL r0, r12,r6, r12 @ (r0,r12) += s0*T[0] = s3
  950. SUB r14,r14,r12
  951. @ r9 = s0b<<1
  952. @ r10= s1b<<1
  953. ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2
  954. SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2
  955. SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b
  956. ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b
  957. STR r9, [r1],#4
  958. STR r10,[r1],#4 @ w0 += 2
  959. STR r3, [r7]
  960. STR r12,[r7,#4]
  961. CMP r5,r8
  962. BLT step7_loop1
  963. step7_loop2:
  964. LDR r6, [r1] @ r6 = w0[0]
  965. LDR r9, [r1,#4] @ r9 = w0[1]
  966. LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2
  967. LDR r11,[r7,#4] @ r11= w1[1]
  968. LDR r14,[r5,-r2,LSL #2]! @ r14= T[0] T -= step
  969. LDR r12,[r5,#4] @ r12= T[1]
  970. ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0]
  971. SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0]
  972. SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1]
  973. ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1]
  974. @ Can save 1 cycle by using SMULL SMLAL - at the cost of being
  975. @ 1 off.
  976. SMULL r0, r3, r6, r14 @ (r0,r3) = s0*T[0]
  977. SMULL r0, r4, r11,r12 @ (r0,r4) += s1*T[1] = s2
  978. ADD r3, r3, r4
  979. SMULL r0, r14,r11,r14 @ (r0,r14) = s1*T[0]
  980. SMULL r0, r12,r6, r12 @ (r0,r12) += s0*T[1] = s3
  981. SUB r14,r14,r12
  982. @ r9 = s0b<<1
  983. @ r10= s1b<<1
  984. ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2
  985. SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2
  986. SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b
  987. ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b
  988. STR r9, [r1],#4
  989. STR r10,[r1],#4 @ w0 += 2
  990. STR r3, [r7]
  991. STR r12,[r7,#4]
  992. CMP r1,r7
  993. BLT step7_loop2
  994. LDMFD r13!,{r0-r3}
  995. @ r0 = points
  996. @ r1 = in
  997. @ r2 = step
  998. @ r3 = shift
  999. MOV r2, r2, ASR #2 @ r2 = step >>= 2
  1000. CMP r2, #0
  1001. CMPNE r2, #1
  1002. BEQ mdct_end
  1003. @ step > 1 (default case)
  1004. CMP r2, #4 @ r5 = T = (step>=4) ?
  1005. ADR r7, .Lsincos_lookup @ sincos_lookup0 +
  1006. ADDLT r7, #4 @ sincos_lookup1
  1007. LDR r5, [r7]
  1008. ADD r5, r7
  1009. ADD r7, r1, r0, LSL #1 @ r7 = iX = x + (n>>1)
  1010. ADDGE r5, r5, r2, LSL #1 @ (step>>1)
  1011. mdct_step8_default:
  1012. LDR r6, [r1],#4 @ r6 = s0 = x[0]
  1013. LDR r8, [r1],#4 @ r8 = -s1 = x[1]
  1014. LDR r12,[r5,#4] @ r12= T[1]
  1015. LDR r14,[r5],r2,LSL #2 @ r14= T[0] T += step
  1016. RSB r8, r8, #0 @ r8 = s1
  1017. @ XPROD31(s0, s1, T[0], T[1], x, x+1)
  1018. @ x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1]
  1019. SMULL r9, r10, r8, r12 @ (r9,r10) = s1 * T[1]
  1020. CMP r1, r7
  1021. SMLAL r9, r10, r6, r14 @ (r9,r10) += s0 * T[0]
  1022. RSB r6, r6, #0 @ r6 = -s0
  1023. SMULL r9, r11, r8, r14 @ (r9,r11) = s1 * T[0]
  1024. MOV r10,r10,LSL #1
  1025. SMLAL r9, r11, r6, r12 @ (r9,r11) -= s0 * T[1]
  1026. STR r10,[r1,#-8]
  1027. MOV r11,r11,LSL #1
  1028. STR r11,[r1,#-4]
  1029. BLT mdct_step8_default
  1030. mdct_end:
  1031. MOV r0, r2
  1032. LDMFD r13!,{r4-r11,PC}
  1033. cPI1_8:
  1034. .word 0x7641af3d
  1035. cPI2_8:
  1036. .word 0x5a82799a
  1037. cPI3_8:
  1038. .word 0x30fbc54d
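@ Q31 constants: cPI1_8 = cos(1*pi/8)*2^31, cPI2_8 = cos(2*pi/8)*2^31,
@ cPI3_8 = cos(3*pi/8)*2^31.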
  1039. bitrev:
  1040. .byte 0
  1041. .byte 32
  1042. .byte 16
  1043. .byte 48
  1044. .byte 8
  1045. .byte 40
  1046. .byte 24
  1047. .byte 56
  1048. .byte 4
  1049. .byte 36
  1050. .byte 20
  1051. .byte 52
  1052. .byte 12
  1053. .byte 44
  1054. .byte 28
  1055. .byte 60
  1056. .byte 2
  1057. .byte 34
  1058. .byte 18
  1059. .byte 50
  1060. .byte 10
  1061. .byte 42
  1062. .byte 26
  1063. .byte 58
  1064. .byte 6
  1065. .byte 38
  1066. .byte 22
  1067. .byte 54
  1068. .byte 14
  1069. .byte 46
  1070. .byte 30
  1071. .byte 62
  1072. .byte 1
  1073. .byte 33
  1074. .byte 17
  1075. .byte 49
  1076. .byte 9
  1077. .byte 41
  1078. .byte 25
  1079. .byte 57
  1080. .byte 5
  1081. .byte 37
  1082. .byte 21
  1083. .byte 53
  1084. .byte 13
  1085. .byte 45
  1086. .byte 29
  1087. .byte 61
  1088. .byte 3
  1089. .byte 35
  1090. .byte 19
  1091. .byte 51
  1092. .byte 11
  1093. .byte 43
  1094. .byte 27
  1095. .byte 59
  1096. .byte 7
  1097. .byte 39
  1098. .byte 23
  1099. .byte 55
  1100. .byte 15
  1101. .byte 47
  1102. .byte 31
  1103. .byte 63
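@ bitrev[i] above is i with its low 6 bits reversed; brev_lp combines two
@ lookups (bitrev[bit>>6] | bitrev[bit&0x3f]<<6) to bit-reverse a 12-bit
@ index before scaling it down by 'shift'.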
  1104. .Lsincos_lookup:
  1105. .word sincos_lookup0-.Lsincos_lookup
  1106. .word sincos_lookup1-(.Lsincos_lookup+4)
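@ Stored as offsets relative to .Lsincos_lookup (and .Lsincos_lookup+4) so
@ the table addresses can be formed position-independently, as done by the
@ ADR/LDR/ADD sequences above.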
  1107. @ END