@ Tremolo library
@-----------------------------------------------------------------------
@ Copyright (C) 2002-2009, Xiph.org Foundation
@ Copyright (C) 2010, Robin Watts for Pinknoise Productions Ltd
@ All rights reserved.
@
@ Redistribution and use in source and binary forms, with or without
@ modification, are permitted provided that the following conditions
@ are met:
@
@  * Redistributions of source code must retain the above copyright
@    notice, this list of conditions and the following disclaimer.
@  * Redistributions in binary form must reproduce the above
@    copyright notice, this list of conditions and the following disclaimer
@    in the documentation and/or other materials provided with the
@    distribution.
@  * Neither the names of the Xiph.org Foundation nor Pinknoise
@    Productions Ltd nor the names of its contributors may be used to
@    endorse or promote products derived from this software without
@    specific prior written permission.
@
@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
@ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
@ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
@ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@ ----------------------------------------------------------------------

        .text

@ low accuracy version

        .global mdct_backwardARM
        .global mdct_shift_right
        .global mdct_unroll_prelap
        .global mdct_unroll_part2
        .global mdct_unroll_part3
        .global mdct_unroll_postlap

        .extern sincos_lookup0
        .extern sincos_lookup1

mdct_unroll_prelap:
        @ r0 = out
        @ r1 = post
        @ r2 = r
        @ r3 = step
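        @ The saturation idiom used throughout this file: shift a 32-bit
        @ sample down, then clamp it to signed 16 bits. Per sample this is
        @ roughly the following C (a sketch; the variable names are
        @ illustrative, not from the original build):
        @
        @   int v = (*--r) >> 9;
        @   if ((v >> 15) != 0 && (v >> 15) != -1)   /* TEQ/EORNE pair  */
        @       v = (v < 0) ? -32768 : 32767;        /* r4 = ~0x8000    */
        @   *(short *)out = (short)v;  out += step;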
        STMFD r13!,{r4-r7,r14}
        MVN r4, #0x8000
        MOV r3, r3, LSL #1
        SUB r1, r2, r1 @ r1 = r - post
        SUBS r1, r1, #16 @ r1 = r - post - 16
        BLT unroll_over
unroll_loop:
        LDMDB r2!,{r5,r6,r7,r12}
        MOV r5, r5, ASR #9 @ r5 = (*--r)>>9
        MOV r6, r6, ASR #9 @ r6 = (*--r)>>9
        MOV r7, r7, ASR #9 @ r7 = (*--r)>>9
        MOV r12,r12,ASR #9 @ r12= (*--r)>>9
        MOV r14,r12,ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r12,r4, r14,ASR #31
        STRH r12,[r0], r3
        MOV r14,r7, ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r7, r4, r14,ASR #31
        STRH r7, [r0], r3
        MOV r14,r6, ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r6, r4, r14,ASR #31
        STRH r6, [r0], r3
        MOV r14,r5, ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r5, r4, r14,ASR #31
        STRH r5, [r0], r3
        SUBS r1, r1, #16
        BGE unroll_loop
unroll_over:
        ADDS r1, r1, #16
        BLE unroll_end
unroll_loop2:
        LDR r5,[r2,#-4]!
        @ stall
        @ stall (Xscale)
        MOV r5, r5, ASR #9 @ r5 = (*--r)>>9
        MOV r14,r5, ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r5, r4, r14,ASR #31
        STRH r5, [r0], r3
        SUBS r1, r1, #4
        BGT unroll_loop2
unroll_end:
        LDMFD r13!,{r4-r7,PC}

mdct_unroll_postlap:
        @ r0 = out
        @ r1 = post
        @ r2 = l
        @ r3 = step
        STMFD r13!,{r4-r7,r14}
        MVN r4, #0x8000
        MOV r3, r3, LSL #1
        SUB r1, r1, r2 @ r1 = post - l
        MOV r1, r1, ASR #1 @ r1 = (post - l)>>1
        SUBS r1, r1, #16 @ r1 = ((post - l)>>1) - 16
        BLT unroll_over3
unroll_loop3:
        LDR r12,[r2],#8
        LDR r7, [r2],#8
        LDR r6, [r2],#8
        LDR r5, [r2],#8
        RSB r12,r12,#0
        RSB r5, r5, #0
        RSB r6, r6, #0
        RSB r7, r7, #0
        MOV r12, r12,ASR #9 @ r12= (-*l)>>9
        MOV r5, r5, ASR #9 @ r5 = (-*l)>>9
        MOV r6, r6, ASR #9 @ r6 = (-*l)>>9
        MOV r7, r7, ASR #9 @ r7 = (-*l)>>9
        MOV r14,r12,ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r12,r4, r14,ASR #31
        STRH r12,[r0], r3
        MOV r14,r7, ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r7, r4, r14,ASR #31
        STRH r7, [r0], r3
        MOV r14,r6, ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r6, r4, r14,ASR #31
        STRH r6, [r0], r3
        MOV r14,r5, ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r5, r4, r14,ASR #31
        STRH r5, [r0], r3
        SUBS r1, r1, #16
        BGE unroll_loop3
unroll_over3:
        ADDS r1, r1, #16
        BLE unroll_over4
unroll_loop4:
        LDR r5,[r2], #8
        @ stall
        @ stall (Xscale)
        RSB r5, r5, #0
        MOV r5, r5, ASR #9 @ r5 = (-*l)>>9
        MOV r14,r5, ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r5, r4, r14,ASR #31
        STRH r5, [r0], r3
        SUBS r1, r1, #4
        BGT unroll_loop4
unroll_over4:
        LDMFD r13!,{r4-r7,PC}

mdct_unroll_part2:
        @ r0 = out
        @ r1 = post
        @ r2 = l
        @ r3 = r
        @ <> = step
        @ <> = wL
        @ <> = wR
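        @ Per sample this overlap-add is, roughly (a sketch; wL/wR are the
        @ byte-wide window halves, and CLIP16 stands in for the saturation
        @ idiom described above):
        @
        @   l -= 2;  r--;
        @   int v = (((*l >> 8) * *wL++) + ((*r >> 8) * *--wR)) >> 9;
        @   *(short *)out = (short)CLIP16(v);  out += step;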
        MOV r12,r13
        STMFD r13!,{r4,r6-r11,r14}
        LDMFD r12,{r8,r9,r10} @ r8 = step
        @ r9 = wL
        @ r10= wR
        MVN r4, #0x8000
        MOV r8, r8, LSL #1
        SUBS r1, r3, r1 @ r1 = (r - post)
        BLE unroll_over5
unroll_loop5:
        LDR r12,[r2, #-8]! @ r12= *l (but l -= 2 first)
        LDR r7, [r3, #-4]! @ r7 = *--r
        LDRB r6, [r10,#-1]! @ r6 = *--wR
        LDRB r11,[r9],#1 @ r11= *wL++
        MOV r12, r12, ASR #8
        @ Can save a cycle here, at the cost of 1-bit errors in rounding
        MUL r11,r12,r11 @ r11 = *l * *wL++
        MOV r7, r7, ASR #8
        MLA r6, r7, r6, r11 @ r6 = *--r * *--wR + *l * *wL
        MOV r6, r6, ASR #9
        MOV r14,r6, ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r6, r4, r14,ASR #31
        STRH r6, [r0], r8
        SUBS r1, r1, #4
        BGT unroll_loop5
unroll_over5:
        LDMFD r13!,{r4,r6-r11,PC}

mdct_unroll_part3:
        @ r0 = out
        @ r1 = post
        @ r2 = l
        @ r3 = r
        @ <> = step
        @ <> = wL
        @ <> = wR
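        @ Same shape as part2 (again a sketch): the windowed l term is
        @ subtracted instead of added, and l/r walk forwards:
        @
        @   int v = (((*r++ >> 8) * *--wR) - ((*l >> 8) * *wL++)) >> 9;
        @   l += 2;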
        MOV r12,r13
        STMFD r13!,{r4,r6-r11,r14}
        LDMFD r12,{r8,r9,r10} @ r8 = step
        @ r9 = wL
        @ r10= wR
        MVN r4, #0x8000
        MOV r8, r8, LSL #1
        SUBS r1, r1, r3 @ r1 = (post - r)
        BLE unroll_over6
unroll_loop6:
        LDR r12,[r2],#8 @ r12= *l (and l += 2 after)
        LDR r7, [r3],#4 @ r7 = *r++
        LDRB r11,[r9],#1 @ r11= *wL++
        LDRB r6, [r10,#-1]! @ r6 = *--wR
        @ Can save a cycle here, at the cost of 1-bit errors in rounding
        MOV r12,r12,ASR #8
        MUL r11,r12,r11 @ r11 = *l * *wL++
        MOV r7, r7, ASR #8
        MUL r6, r7, r6 @ r6 = *r++ * *--wR
        SUB r6, r6, r11
        MOV r6, r6, ASR #9
        MOV r14,r6, ASR #15
        TEQ r14,r14,ASR #31 @ if r14==0 || r14==-1 then in range
        EORNE r6, r4, r14,ASR #31
        STRH r6, [r0], r8
        SUBS r1, r1, #4
        BGT unroll_loop6
unroll_over6:
        LDMFD r13!,{r4,r6-r11,PC}

mdct_shift_right:
        @ r0 = n
        @ r1 = in
        @ r2 = right
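        @ Copies every other 32-bit word, starting at in[1], into right[];
        @ in C terms (a sketch):
        @
        @   for (i = 0; i < (n >> 2); i++)
        @       right[i] = in[2*i + 1];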
        STMFD r13!,{r4-r11,r14}
        MOV r0, r0, LSR #2 @ n >>= 2
        ADD r1, r1, #4
        SUBS r0, r0, #8
        BLT sr_less_than_8
sr_loop:
        LDR r3, [r1], #8
        LDR r4, [r1], #8
        LDR r5, [r1], #8
        LDR r6, [r1], #8
        LDR r7, [r1], #8
        LDR r8, [r1], #8
        LDR r12,[r1], #8
        LDR r14,[r1], #8
        SUBS r0, r0, #8
        STMIA r2!,{r3,r4,r5,r6,r7,r8,r12,r14}
        BGE sr_loop
sr_less_than_8:
        ADDS r0, r0, #8
        BEQ sr_end
sr_loop2:
        LDR r3, [r1], #8
        SUBS r0, r0, #1
        STR r3, [r2], #4
        BGT sr_loop2
sr_end:
        LDMFD r13!,{r4-r11,PC}

mdct_backwardARM:
        @ r0 = n
        @ r1 = in
        STMFD r13!,{r4-r11,r14}
        MOV r2, #1<<4 @ r2 = 1<<shift
        MOV r3, #13-4 @ r3 = 13-shift
find_shift_loop:
        TST r0, r2 @ if (n & (1<<shift)) == 0
        MOV r2, r2, LSL #1
        SUBEQ r3, r3, #1 @ shift--
        BEQ find_shift_loop
        MOV r2, #2
        MOV r2, r2, LSL r3 @ r2 = step = 2<<shift
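        @ Worked example: for n = 2048 (bit 11 set) the loop above leaves
        @ shift = 13-11 = 2 and step = 2<<2 = 8; for n = 256, shift = 5
        @ and step = 64.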
        @ presymmetry
        @ r0 = n (a multiple of 4)
        @ r1 = in
        @ r2 = step
        @ r3 = shift
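        @ The XPROD31/XNPROD31 annotations that follow refer to Tremolo's
        @ complex-multiply macros; as the expanded comments below spell
        @ out, up to scaling they compute:
        @
        @   XPROD31 (a, b, t, v, x, y):  *x = a*t + b*v;  *y = b*t - a*v
        @   XNPROD31(a, b, t, v, x, y):  *x = a*t - b*v;  *y = b*t + a*v
        @
        @ In this low-accuracy build the operands are pre-shifted right by
        @ 8 and T[] holds byte-wide magnitudes, so a 32-bit MUL/MLA takes
        @ the place of the full-precision version's 64-bit SMULL products.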
        ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1)
        ADD r14,r1, r0 @ r14= in+(n>>2)
        SUB r4, r4, #3*4 @ r4 = aX = in+(n>>1)-3
        LDR r5, =sincos_lookup0 @ r5 = T=sincos_lookup0
presymmetry_loop1:
        LDR r7, [r4,#8] @ r7 = s2 = aX[2]
        LDRB r11,[r5,#1] @ r11= T[1]
        LDR r6, [r4],#-16 @ r6 = s0 = aX[0]
        LDRB r10,[r5],r2 @ r10= T[0] T += step
        MOV r6, r6, ASR #8
        MOV r7, r7, ASR #8
        @ XPROD31(s0, s2, T[0], T[1], &aX[0], &aX[2])
        MUL r9, r6, r10 @ r9 = s0*T[0]
        RSB r6, r6, #0
        MLA r9, r7, r11,r9 @ r9 += s2*T[1]
        CMP r4, r14
        MUL r12,r7, r10 @ r12 = s2*T[0]
        STR r9, [r4,#16] @ aX[0] = r9
        MLA r12,r6, r11,r12 @ r12 -= s0*T[1]
        STR r12,[r4,#8+16] @ aX[2] = r12
        BGE presymmetry_loop1 @ while (aX >= in+n4)
presymmetry_loop2:
        LDR r6, [r4],#-16 @ r6 = s0 = aX[0]
        LDRB r10,[r5,#1] @ r10= T[1]
        LDR r7, [r4,#16+8] @ r7 = s2 = aX[2]
        LDRB r11,[r5],-r2 @ r11= T[0] T -= step
        MOV r6, r6, ASR #8
        MOV r7, r7, ASR #8
        @ XPROD31(s0, s2, T[1], T[0], &aX[0], &aX[2])
        MUL r9, r6, r10 @ r9 = s0*T[1]
        RSB r6, r6, #0
        MLA r9, r7, r11,r9 @ r9 += s2*T[0]
        CMP r4, r1
        MUL r12,r7, r10 @ r12 = s2*T[1]
        STR r9, [r4,#16] @ aX[0] = r9
        MLA r12,r6, r11,r12 @ r12 -= s0*T[0]
        STR r12,[r4,#8+16] @ aX[2] = r12
        BGE presymmetry_loop2 @ while (aX >= in)

        @ r0 = n
        @ r1 = in
        @ r2 = step
        @ r3 = shift
        STMFD r13!,{r3}
        LDR r5, =sincos_lookup0 @ r5 = T=sincos_lookup0
        ADD r4, r1, r0, LSL #1 @ r4 = aX = in+(n>>1)
        SUB r4, r4, #4*4 @ r4 = aX = in+(n>>1)-4
        LDRB r11,[r5,#1] @ r11= T[1]
        LDRB r10,[r5],r2 @ r10= T[0] T += step
presymmetry_loop3:
        LDR r8, [r1],#16 @ r8 = ro0 = bX[0]
        LDR r9, [r1,#8-16] @ r9 = ro2 = bX[2]
        LDR r6, [r4],#-16 @ r6 = ri0 = aX[0]
        LDR r7, [r4,#8+16] @ r7 = ri2 = aX[2]
        MOV r8, r8, ASR #8
        MOV r9, r9, ASR #8
        MOV r6, r6, ASR #8
        @ XNPROD31( ro2, ro0, T[1], T[0], &aX[0], &aX[2] )
        @ aX[0] = (ro2*T[1] - ro0*T[0])>>31 aX[2] = (ro0*T[1] + ro2*T[0])>>31
        MUL r12,r8, r11 @ r12 = ro0*T[1]
        MOV r7, r7, ASR #8
        MLA r12,r9, r10,r12 @ r12 += ro2*T[0]
        RSB r8, r8, #0 @ r8 = -ro0
        MUL r3, r9, r11 @ r3 = ro2*T[1]
        LDRB r11,[r5,#1] @ r11= T[1]
        MLA r3, r8, r10,r3 @ r3 -= ro0*T[0]
        LDRB r10,[r5],r2 @ r10= T[0] T += step
        STR r12,[r4,#16+8]
        STR r3, [r4,#16]
        @ XNPROD31( ri2, ri0, T[0], T[1], &bX[0], &bX[2] )
        @ bX[0] = (ri2*T[0] - ri0*T[1])>>31 bX[2] = (ri0*T[0] + ri2*T[1])>>31
        MUL r12,r6, r10 @ r12 = ri0*T[0]
        RSB r6, r6, #0 @ r6 = -ri0
        MLA r12,r7, r11,r12 @ r12 += ri2*T[1]
        CMP r4, r1
        MUL r3, r7, r10 @ r3 = ri2*T[0]
        STR r12,[r1,#8-16]
        MLA r3, r6, r11,r3 @ r3 -= ri0*T[1]
        STR r3, [r1,#-16]
        BGE presymmetry_loop3
        SUB r1,r1,r0 @ r1 = in -= n>>2 (i.e. restore in)
        LDR r3,[r13]
        STR r2,[r13,#-4]!

        @ mdct_butterflies
        @ r0 = n = (points * 2)
        @ r1 = in = x
        @ r2 = i
        @ r3 = shift
        STMFD r13!,{r0-r1}
        RSBS r4,r3,#6 @ r4 = stages = 7-shift then --stages
        LDR r5,=sincos_lookup0
        BLE no_generics
        MOV r14,#4 @ r14= 4 (i=0)
        MOV r6, r14,LSL r3 @ r6 = (4<<i)<<shift
mdct_butterflies_loop1:
        MOV r0, r0, LSR #1 @ r0 = points>>i = POINTS
        MOV r2, r14,LSR #2 @ r2 = (1<<i)-j (j=0)
        STMFD r13!,{r4,r14}
mdct_butterflies_loop2:
        @ mdct_butterfly_generic(x+POINTS*j, POINTS, 4<<(i+shift))
        @ mdct_butterfly_generic(r1, r0, r6)
        @ r0 = points
        @ r1 = x
        @ preserve r2 (external loop counter)
        @ preserve r3
        @ preserve r4 (external loop counter)
        @ r5 = T = sincos_lookup0
        @ r6 = step
        @ preserve r14
        STR r2,[r13,#-4]! @ stack r2
        ADD r1,r1,r0,LSL #1 @ r1 = x2+4 = x + (POINTS>>1)
        ADD r7,r1,r0,LSL #1 @ r7 = x1+4 = x + POINTS
        ADD r12,r5,#1024 @ r12= sincos_lookup0+1024
mdct_bufferfly_generic_loop1:
        LDMDB r7!,{r2,r3,r8,r11} @ r2 = x1[0]
        @ r3 = x1[1]
        @ r8 = x1[2]
        @ r11= x1[3] x1 -= 4
        LDMDB r1!,{r4,r9,r10,r14} @ r4 = x2[0]
        @ r9 = x2[1]
        @ r10= x2[2]
        @ r14= x2[3] x2 -= 4
        SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1]
        ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0])
        SUB r11,r11,r8 @ r11= s1 = x1[3] - x1[2]
        ADD r8, r11,r8, LSL #1 @ r8 = x1[3] + x1[2] (-> x1[2])
        SUB r9, r9, r4 @ r9 = s2 = x2[1] - x2[0]
        ADD r4, r9, r4, LSL #1 @ r4 = x2[1] + x2[0] (-> x1[1])
        SUB r14,r14,r10 @ r14= s3 = x2[3] - x2[2]
        ADD r10,r14,r10,LSL #1 @ r10= x2[3] + x2[2] (-> x1[3])
        STMIA r7,{r3,r4,r8,r10}
        @ r0 = points
        @ r1 = x2
        @ r2 = s0
        @ r3 free
        @ r4 free
        @ r5 = T
        @ r6 = step
        @ r7 = x1
        @ r8 free
        @ r9 = s2
        @ r10 free
        @ r11= s1
        @ r12= limit
        @ r14= s3
        LDRB r8, [r5,#1] @ r8 = T[1]
        LDRB r10,[r5],r6 @ r10= T[0] T += step
        MOV r2, r2, ASR #8
        MOV r11,r11,ASR #8
        MOV r9, r9, ASR #8
        MOV r14,r14,ASR #8
        @ XPROD31(s1, s0, T[0], T[1], &x2[0], &x2[2])
        @ x2[0] = (s1*T[0] + s0*T[1])>>31 x2[2] = (s0*T[0] - s1*T[1])>>31
        @ stall Xscale
        MUL r3, r2, r8 @ r3 = s0*T[1]
        MLA r3, r11,r10,r3 @ r3 += s1*T[0]
        RSB r11,r11,#0
        MUL r4, r8, r11 @ r4 = -s1*T[1]
        MLA r4, r2, r10,r4 @ r4 += s0*T[0] = Value for x2[2]
        MOV r2, r3 @ r2 = r3 = Value for x2[0]
        @ XPROD31(s2, s3, T[0], T[1], &x2[1], &x2[3])
        @ x2[1] = (s2*T[0] + s3*T[1])>>31 x2[3] = (s3*T[0] - s2*T[1])>>31
        MUL r3, r9, r10 @ r3 = s2*T[0]
        MLA r3, r14,r8, r3 @ r3 += s3*T[1] = Value for x2[1]
        RSB r9, r9, #0
        MUL r11,r14,r10 @ r11 = s3*T[0]
        MLA r11,r9, r8, r11 @ r11 -= s2*T[1] = Value for x2[3]
        CMP r5, r12
        STMIA r1,{r2,r3,r4,r11}
        BLT mdct_bufferfly_generic_loop1
        SUB r12,r12,#1024
mdct_bufferfly_generic_loop2:
        LDMDB r7!,{r2,r3,r9,r10} @ r2 = x1[0]
        @ r3 = x1[1]
        @ r9 = x1[2]
        @ r10= x1[3] x1 -= 4
        LDMDB r1!,{r4,r8,r11,r14} @ r4 = x2[0]
        @ r8 = x2[1]
        @ r11= x2[2]
        @ r14= x2[3] x2 -= 4
        SUB r2, r2, r3 @ r2 = s0 = x1[0] - x1[1]
        ADD r3, r2, r3, LSL #1 @ r3 = x1[0] + x1[1] (-> x1[0])
        SUB r9, r9,r10 @ r9 = s1 = x1[2] - x1[3]
        ADD r10,r9,r10, LSL #1 @ r10= x1[2] + x1[3] (-> x1[2])
        SUB r4, r4, r8 @ r4 = s2 = x2[0] - x2[1]
        ADD r8, r4, r8, LSL #1 @ r8 = x2[0] + x2[1] (-> x1[1])
        SUB r14,r14,r11 @ r14= s3 = x2[3] - x2[2]
        ADD r11,r14,r11,LSL #1 @ r11= x2[3] + x2[2] (-> x1[3])
        STMIA r7,{r3,r8,r10,r11}
        @ r0 = points
        @ r1 = x2
        @ r2 = s0
        @ r3 free
        @ r4 = s2
        @ r5 = T
        @ r6 = step
        @ r7 = x1
        @ r8 free
        @ r9 = s1
        @ r10 free
        @ r11 free
        @ r12= limit
        @ r14= s3
        LDRB r8, [r5,#1] @ r8 = T[1]
        LDRB r10,[r5],-r6 @ r10= T[0] T -= step
        MOV r2, r2, ASR #8
        MOV r9, r9, ASR #8
        MOV r4, r4, ASR #8
        MOV r14,r14,ASR #8
        @ XNPROD31(s0, s1, T[0], T[1], &x2[0], &x2[2])
        @ x2[0] = (s0*T[0] - s1*T[1])>>31 x2[2] = (s1*T[0] + s0*T[1])>>31
        @ stall Xscale
        MUL r11,r2, r8 @ r11 = s0*T[1]
        MLA r11,r9, r10,r11 @ r11 += s1*T[0]
        RSB r9, r9, #0
        MUL r2, r10,r2 @ r2 = s0*T[0]
        MLA r2, r9, r8, r2 @ r2 += -s1*T[1] = Value for x2[0]
        MOV r9, r11 @ r9 = r11 = Value for x2[2]
        @ XNPROD31(s3, s2, T[0], T[1], &x2[1], &x2[3])
        @ x2[1] = (s3*T[0] - s2*T[1])>>31 x2[3] = (s2*T[0] + s3*T[1])>>31
        MUL r11,r4, r10 @ r11 = s2*T[0]
        MLA r11,r14,r8, r11 @ r11 += s3*T[1] = Value for x2[3]
        RSB r4, r4, #0
        MUL r3, r14,r10 @ r3 = s3*T[0]
        MLA r3, r4, r8, r3 @ r3 -= s2*T[1] = Value for x2[1]
        CMP r5, r12
        STMIA r1,{r2,r3,r9,r11}
        BGT mdct_bufferfly_generic_loop2
        LDR r2,[r13],#4 @ unstack r2
        ADD r1, r1, r0, LSL #2 @ r1 = x+POINTS*j
        @ stall Xscale
        SUBS r2, r2, #1 @ r2-- (j++)
        BGT mdct_butterflies_loop2
        LDMFD r13!,{r4,r14}
        LDR r1,[r13,#4]
        SUBS r4, r4, #1 @ stages--
        MOV r14,r14,LSL #1 @ r14= 4<<i (i++)
        MOV r6, r6, LSL #1 @ r6 = step <<= 1 (i++)
        BGE mdct_butterflies_loop1
        LDMFD r13,{r0-r1}

no_generics:
        @ mdct_butterflies part2 (loop around mdct_bufferfly_32)
        @ r0 = points
        @ r1 = in
        @ r2 = step
        @ r3 = shift
mdct_bufferflies_loop3:
        @ mdct_bufferfly_32
        @ block1
        ADD r4, r1, #16*4 @ r4 = &in[16]
        LDMIA r4,{r5,r6,r9,r10} @ r5 = x[16]
        @ r6 = x[17]
        @ r9 = x[18]
        @ r10= x[19]
        LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0]
        @ r8 = x[1]
        @ r11= x[2]
        @ r12= x[3]
        SUB r5, r5, r6 @ r5 = s0 = x[16] - x[17]
        ADD r6, r5, r6, LSL #1 @ r6 = x[16] + x[17] -> x[16]
        SUB r9, r9, r10 @ r9 = s1 = x[18] - x[19]
        ADD r10,r9, r10,LSL #1 @ r10= x[18] + x[19] -> x[18]
        SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0]
        ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[17]
        SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2]
        ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[19]
        STMIA r4!,{r6,r7,r10,r11}
        MOV r6,#0xed @ r6 = cPI1_8
        MOV r7,#0x62 @ r7 = cPI3_8
        MOV r5, r5, ASR #8
        MOV r9, r9, ASR #8
        MOV r8, r8, ASR #8
        MOV r12,r12,ASR #8
        @ XNPROD31( s0, s1, cPI3_8, cPI1_8, &x[ 0], &x[ 2] )
        @ x[0] = s0*cPI3_8 - s1*cPI1_8 x[2] = s1*cPI3_8 + s0*cPI1_8
        @ stall Xscale
        MUL r11,r5, r6 @ r11 = s0*cPI1_8
        MLA r11,r9, r7, r11 @ r11 += s1*cPI3_8
        RSB r9, r9, #0
        MUL r5, r7, r5 @ r5 = s0*cPI3_8
        MLA r5, r9, r6, r5 @ r5 -= s1*cPI1_8
        @ XPROD31 ( s2, s3, cPI1_8, cPI3_8, &x[ 1], &x[ 3] )
        @ x[1] = s2*cPI1_8 + s3*cPI3_8 x[3] = s3*cPI1_8 - s2*cPI3_8
        MUL r9, r8, r6 @ r9 = s2*cPI1_8
        MLA r9, r12,r7, r9 @ r9 += s3*cPI3_8
        RSB r8,r8,#0
        MUL r12,r6, r12 @ r12 = s3*cPI1_8
        MLA r12,r8, r7, r12 @ r12 -= s2*cPI3_8
        STMIA r1!,{r5,r9,r11,r12}
        @ block2
        LDMIA r4,{r5,r6,r9,r10} @ r5 = x[20]
        @ r6 = x[21]
        @ r9 = x[22]
        @ r10= x[23]
        LDMIA r1,{r7,r8,r11,r12} @ r7 = x[4]
        @ r8 = x[5]
        @ r11= x[6]
        @ r12= x[7]
        SUB r5, r5, r6 @ r5 = s0 = x[20] - x[21]
        ADD r6, r5, r6, LSL #1 @ r6 = x[20] + x[21] -> x[20]
        SUB r9, r9, r10 @ r9 = s1 = x[22] - x[23]
        ADD r10,r9, r10,LSL #1 @ r10= x[22] + x[23] -> x[22]
        SUB r8, r8, r7 @ r8 = s2 = x[ 5] - x[ 4]
        ADD r7, r8, r7, LSL #1 @ r7 = x[ 5] + x[ 4] -> x[21]
        SUB r12,r12,r11 @ r12= s3 = x[ 7] - x[ 6]
        ADD r11,r12,r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[23]
        MOV r14,#0xb5 @ cPI2_8
        STMIA r4!,{r6,r7,r10,r11}
        SUB r5, r5, r9 @ r5 = s0 - s1
        ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1
        MOV r5, r5, ASR #8
        MUL r5, r14,r5 @ r5 = (s0-s1)*cPI2_8
        SUB r12,r12,r8 @ r12= s3 - s2
        ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2
        MOV r8, r8, ASR #8
        MUL r8, r14,r8 @ r8 = (s3+s2)*cPI2_8
        MOV r9, r9, ASR #8
        MUL r9, r14,r9 @ r9 = (s0+s1)*cPI2_8
        MOV r12,r12,ASR #8
        MUL r12,r14,r12 @ r12 = (s3-s2)*cPI2_8
        STMIA r1!,{r5,r8,r9,r12}
        @ block3
        LDMIA r4,{r5,r6,r9,r10} @ r5 = x[24]
        @ r6 = x[25]
        @ r9 = x[26]
        @ r10= x[27]
        LDMIA r1,{r7,r8,r11,r12} @ r7 = x[8]
        @ r8 = x[9]
        @ r11= x[10]
        @ r12= x[11]
        SUB r5, r5, r6 @ r5 = s0 = x[24] - x[25]
        ADD r6, r5, r6, LSL #1 @ r6 = x[24] + x[25] -> x[24]
        SUB r9, r9, r10 @ r9 = s1 = x[26] - x[27]
        ADD r10,r9, r10,LSL #1 @ r10= x[26] + x[27] -> x[26]
        SUB r8, r8, r7 @ r8 = s2 = x[ 9] - x[ 8]
        ADD r7, r8, r7, LSL #1 @ r7 = x[ 9] + x[ 8] -> x[25]
        SUB r12,r12,r11 @ r12= s3 = x[11] - x[10]
        ADD r11,r12,r11, LSL #1 @ r11= x[11] + x[10] -> x[27]
        STMIA r4!,{r6,r7,r10,r11}
        MOV r6,#0x62 @ r6 = cPI3_8
        MOV r7,#0xED @ r7 = cPI1_8
        @ XNPROD31( s0, s1, cPI1_8, cPI3_8, &x[ 8], &x[10] )
        @ x[8] = s0*cPI1_8 - s1*cPI3_8 x[10] = s1*cPI1_8 + s0*cPI3_8
        @ stall Xscale
        MOV r5, r5, ASR #8
        MUL r11,r5, r6 @ r11 = s0*cPI3_8
        MOV r9, r9, ASR #8
        MLA r11,r9, r7, r11 @ r11 += s1*cPI1_8
        RSB r9, r9, #0
        MUL r5, r7, r5 @ r5 = s0*cPI1_8
        MLA r5, r9, r6, r5 @ r5 -= s1*cPI3_8
        @ XPROD31 ( s2, s3, cPI3_8, cPI1_8, &x[ 9], &x[11] )
        @ x[9] = s2*cPI3_8 + s3*cPI1_8 x[11] = s3*cPI3_8 - s2*cPI1_8
        MOV r8, r8, ASR #8
        MUL r9, r8, r6 @ r9 = s2*cPI3_8
        MOV r12,r12,ASR #8
        MLA r9, r12,r7, r9 @ r9 += s3*cPI1_8
        RSB r8,r8,#0
        MUL r12,r6, r12 @ r12 = s3*cPI3_8
        MLA r12,r8, r7, r12 @ r12 -= s2*cPI1_8
        STMIA r1!,{r5,r9,r11,r12}
        @ block4
        LDMIA r4,{r5,r6,r10,r11} @ r5 = x[28]
        @ r6 = x[29]
        @ r10= x[30]
        @ r11= x[31]
        LDMIA r1,{r8,r9,r12,r14} @ r8 = x[12]
        @ r9 = x[13]
        @ r12= x[14]
        @ r14= x[15]
        SUB r5, r5, r6 @ r5 = s0 = x[28] - x[29]
        ADD r6, r5, r6, LSL #1 @ r6 = x[28] + x[29] -> x[28]
        SUB r7, r14,r12 @ r7 = s3 = x[15] - x[14]
        ADD r12,r7, r12, LSL #1 @ r12= x[15] + x[14] -> x[31]
        SUB r10,r10,r11 @ r10= s1 = x[30] - x[31]
        ADD r11,r10,r11,LSL #1 @ r11= x[30] + x[31] -> x[30]
        SUB r14, r8, r9 @ r14= s2 = x[12] - x[13]
        ADD r9, r14, r9, LSL #1 @ r9 = x[12] + x[13] -> x[29]
        STMIA r4!,{r6,r9,r11,r12}
        STMIA r1!,{r5,r7,r10,r14}
        @ mdct_butterfly16 (1st version)
        @ block 1
        SUB r1,r1,#16*4
        ADD r4,r1,#8*4
        LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8]
        @ r6 = x[ 9]
        @ r9 = x[10]
        @ r10= x[11]
        LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0]
        @ r8 = x[1]
        @ r11= x[2]
        @ r12= x[3]
        SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9]
        ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8]
        SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11]
        ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10]
        SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0]
        ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9]
        SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2]
        ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11]
        MOV r14,#0xB5 @ r14= cPI2_8
        STMIA r4!,{r6,r7,r10,r11}
        SUB r5, r5, r9 @ r5 = s0 - s1
        ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1
        MOV r5, r5, ASR #8
        MUL r5, r14,r5 @ r5 = (s0-s1)*cPI2_8
        SUB r12,r12,r8 @ r12= s3 - s2
        ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2
        MOV r8, r8, ASR #8
        MUL r8, r14,r8 @ r8 = (s3+s2)*cPI2_8
        MOV r9, r9, ASR #8
        MUL r9, r14,r9 @ r9 = (s0+s1)*cPI2_8
        MOV r12,r12,ASR #8
        MUL r12,r14,r12 @ r12 = (s3-s2)*cPI2_8
        STMIA r1!,{r5,r8,r9,r12}
        @ block2
        LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12]
        @ r6 = x[13]
        @ r9 = x[14]
        @ r10= x[15]
        LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4]
        @ r8 = x[ 5]
        @ r11= x[ 6]
        @ r12= x[ 7]
        SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5]
        ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13]
        SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6]
        ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15]
        SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13]
        ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12]
        SUB r12,r9, r10 @ r12= s3 = x[14] - x[15]
        ADD r10,r12,r10,LSL #1 @ r10= x[14] + x[15] -> x[14]
        STMIA r4!,{r6,r8,r10,r11}
        STMIA r1!,{r5,r7,r12,r14}
        @ mdct_butterfly_8
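        @ As the inline comments below spell out, each 8-point butterfly
        @ computes, in C form (a reference sketch):
        @
        @   s0=x[0]+x[1]; s1=x[0]-x[1]; s2=x[2]+x[3]; s3=x[2]-x[3];
        @   s4=x[4]+x[5]; s5=x[4]-x[5]; s6=x[6]+x[7]; s7=x[6]-x[7];
        @   x[0]=s5+s3; x[1]=s7-s1; x[2]=s5-s3; x[3]=s7+s1;
        @   x[4]=s4-s0; x[5]=s6-s2; x[6]=s4+s0; x[7]=s6+s2;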
        LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14}
        @ r6 = x[0]
        @ r7 = x[1]
        @ r8 = x[2]
        @ r9 = x[3]
        @ r10= x[4]
        @ r11= x[5]
        @ r12= x[6]
        @ r14= x[7]
        ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
        SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
        ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
        SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
        ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
        SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
        ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
        SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]
        ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
        SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
        SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
        ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
        SUB r10,r10,r6 @ r10= x[4] = s4 - s0
        SUB r11,r12,r8 @ r11= x[5] = s6 - s2
        ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
        ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
        STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14}
        @ mdct_butterfly_8
        LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14}
        @ r6 = x[0]
        @ r7 = x[1]
        @ r8 = x[2]
        @ r9 = x[3]
        @ r10= x[4]
        @ r11= x[5]
        @ r12= x[6]
        @ r14= x[7]
        ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
        SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
        ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
        SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
        ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
        SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
        ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
        SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]
        ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
        SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
        SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
        ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
        SUB r10,r10,r6 @ r10= x[4] = s4 - s0
        SUB r11,r12,r8 @ r11= x[5] = s6 - s2
        ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
        ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
        STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14}
        @ mdct_butterfly16 (2nd version)
        @ block 1
        ADD r1,r1,#16*4-8*4
        ADD r4,r1,#8*4
        LDMIA r4,{r5,r6,r9,r10} @ r5 = x[ 8]
        @ r6 = x[ 9]
        @ r9 = x[10]
        @ r10= x[11]
        LDMIA r1,{r7,r8,r11,r12} @ r7 = x[0]
        @ r8 = x[1]
        @ r11= x[2]
        @ r12= x[3]
        SUB r5, r5, r6 @ r5 = s0 = x[ 8] - x[ 9]
        ADD r6, r5, r6, LSL #1 @ r6 = x[ 8] + x[ 9] -> x[ 8]
        SUB r9, r9, r10 @ r9 = s1 = x[10] - x[11]
        ADD r10,r9, r10,LSL #1 @ r10= x[10] + x[11] -> x[10]
        SUB r8, r8, r7 @ r8 = s2 = x[ 1] - x[ 0]
        ADD r7, r8, r7, LSL #1 @ r7 = x[ 1] + x[ 0] -> x[ 9]
        SUB r12,r12,r11 @ r12= s3 = x[ 3] - x[ 2]
        ADD r11,r12,r11, LSL #1 @ r11= x[ 3] + x[ 2] -> x[11]
        MOV r14,#0xb5 @ r14= cPI2_8
        STMIA r4!,{r6,r7,r10,r11}
        SUB r5, r5, r9 @ r5 = s0 - s1
        ADD r9, r5, r9, LSL #1 @ r9 = s0 + s1
        MOV r5, r5, ASR #8
        MUL r5, r14,r5 @ r5 = (s0-s1)*cPI2_8
        SUB r12,r12,r8 @ r12= s3 - s2
        ADD r8, r12,r8, LSL #1 @ r8 = s3 + s2
        MOV r8, r8, ASR #8
        MUL r8, r14,r8 @ r8 = (s3+s2)*cPI2_8
        MOV r9, r9, ASR #8
        MUL r9, r14,r9 @ r9 = (s0+s1)*cPI2_8
        MOV r12,r12,ASR #8
        MUL r12,r14,r12 @ r12 = (s3-s2)*cPI2_8
        STMIA r1!,{r5,r8,r9,r12}
        @ block2
        LDMIA r4,{r5,r6,r9,r10} @ r5 = x[12]
        @ r6 = x[13]
        @ r9 = x[14]
        @ r10= x[15]
        LDMIA r1,{r7,r8,r11,r12} @ r7 = x[ 4]
        @ r8 = x[ 5]
        @ r11= x[ 6]
        @ r12= x[ 7]
        SUB r5, r5, r6 @ r5 = s2 = x[12] - x[13]
        ADD r6, r5, r6, LSL #1 @ r6 = x[12] + x[13] -> x[12]
        SUB r9, r9, r10 @ r9 = s3 = x[14] - x[15]
        ADD r10,r9, r10,LSL #1 @ r10= x[14] + x[15] -> x[14]
        SUB r14,r7, r8 @ r14= s0 = x[ 4] - x[ 5]
        ADD r8, r14,r8, LSL #1 @ r8 = x[ 4] + x[ 5] -> x[13]
        SUB r7, r12,r11 @ r7 = s1 = x[ 7] - x[ 6]
        ADD r11,r7, r11, LSL #1 @ r11= x[ 7] + x[ 6] -> x[15]
        STMIA r4!,{r6,r8,r10,r11}
        STMIA r1!,{r5,r7,r9,r14}
        @ mdct_butterfly_8
        LDMDB r1,{r6,r7,r8,r9,r10,r11,r12,r14}
        @ r6 = x[0]
        @ r7 = x[1]
        @ r8 = x[2]
        @ r9 = x[3]
        @ r10= x[4]
        @ r11= x[5]
        @ r12= x[6]
        @ r14= x[7]
        ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
        SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
        ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
        SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
        ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
        SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
        ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
        SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]
        ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
        SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
        SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
        ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
        SUB r10,r10,r6 @ r10= x[4] = s4 - s0
        SUB r11,r12,r8 @ r11= x[5] = s6 - s2
        ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
        ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
        STMDB r1,{r2,r3,r4,r5,r10,r11,r12,r14}
        @ mdct_butterfly_8
        LDMIA r1,{r6,r7,r8,r9,r10,r11,r12,r14}
        @ r6 = x[0]
        @ r7 = x[1]
        @ r8 = x[2]
        @ r9 = x[3]
        @ r10= x[4]
        @ r11= x[5]
        @ r12= x[6]
        @ r14= x[7]
        ADD r6, r6, r7 @ r6 = s0 = x[0] + x[1]
        SUB r7, r6, r7, LSL #1 @ r7 = s1 = x[0] - x[1]
        ADD r8, r8, r9 @ r8 = s2 = x[2] + x[3]
        SUB r9, r8, r9, LSL #1 @ r9 = s3 = x[2] - x[3]
        ADD r10,r10,r11 @ r10= s4 = x[4] + x[5]
        SUB r11,r10,r11,LSL #1 @ r11= s5 = x[4] - x[5]
        ADD r12,r12,r14 @ r12= s6 = x[6] + x[7]
        SUB r14,r12,r14,LSL #1 @ r14= s7 = x[6] - x[7]
        ADD r2, r11,r9 @ r2 = x[0] = s5 + s3
        SUB r4, r2, r9, LSL #1 @ r4 = x[2] = s5 - s3
        SUB r3, r14,r7 @ r3 = x[1] = s7 - s1
        ADD r5, r3, r7, LSL #1 @ r5 = x[3] = s7 + s1
        SUB r10,r10,r6 @ r10= x[4] = s4 - s0
        SUB r11,r12,r8 @ r11= x[5] = s6 - s2
        ADD r12,r10,r6, LSL #1 @ r12= x[6] = s4 + s0
        ADD r14,r11,r8, LSL #1 @ r14= x[7] = s6 + s2
        STMIA r1,{r2,r3,r4,r5,r10,r11,r12,r14}
        ADD r1,r1,#8*4
        SUBS r0,r0,#64
        BGT mdct_bufferflies_loop3
        LDMFD r13,{r0-r3}

mdct_bitreverseARM:
        @ r0 = points
        @ r1 = in
        @ r2 = step
        @ r3 = shift
        MOV r4, #0 @ r4 = bit = 0
        ADD r5, r1, r0, LSL #1 @ r5 = w = x + (n>>1)
        ADR r6, bitrev
        SUB r3, r3, #2 @ r3 = shift -= 2
        SUB r5, r5, #8
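        @ bitrev[] below reverses a 6-bit value; splicing two lookups
        @ together (the low half shifted up by 6) reverses the 12-bit
        @ counter:
        @
        @   b  = bitrev[bit >> 6] | (bitrev[bit & 0x3f] << 6);
        @   xx = x + (b >> shift);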
brev_lp:
        LDRB r7, [r6, r4, LSR #6]
        AND r8, r4, #0x3f
        LDRB r8, [r6, r8]
        ADD r4, r4, #1 @ bit++
        @ stall XScale
        ORR r7, r7, r8, LSL #6 @ r7 = bitrev[bit]
        ADD r9, r1, r7, LSR r3 @ r9 = xx = x + (b>>shift)
        CMP r5, r9 @ if (w > xx)
        LDR r10,[r5],#-8 @ r10 = w[0] w -= 2
        LDRGT r11,[r5,#12] @ r11 = w[1]
        LDRGT r12,[r9] @ r12 = xx[0]
        LDRGT r14,[r9,#4] @ r14 = xx[1]
        STRGT r10,[r9] @ xx[0]= w[0]
        STRGT r11,[r9,#4] @ xx[1]= w[1]
        STRGT r12,[r5,#8] @ w[0] = xx[0]
        STRGT r14,[r5,#12] @ w[1] = xx[1]
        CMP r5,r1
        BGT brev_lp

        @ mdct_step7
        @ r0 = points
        @ r1 = in
        @ r2 = step
        @ r3 = shift-2
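        @ Sketch of the recombination each step-7 iteration performs
        @ (scaling and exact shifts elided; assembled from the inline
        @ comments below; in the second loop T[0]/T[1] swap roles as T
        @ walks back down the table):
        @
        @   s0 = w0[0] + w1[0];  s1b = w0[0] - w1[0];
        @   s1 = w1[1] - w0[1];  s0b = w1[1] + w0[1];
        @   s2 = s0*T[1] + s1*T[0];  s3 = s1*T[1] - s0*T[0];
        @   w0[0] = s0b/2 + s2;  w0[1] = s3 + s1b/2;  w0 += 2;
        @   w1[0] = s0b/2 - s2;  w1[1] = s3 - s1b/2;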
        CMP r2, #4 @ r5 = T = (step>=4) ?
        LDRGE r5, =sincos_lookup0 @ sincos_lookup0 +
        LDRLT r5, =sincos_lookup1 @ sincos_lookup1
        ADD r7, r1, r0, LSL #1 @ r7 = w1 = x + (n>>1)
        ADDGE r5, r5, r2, LSR #1 @ (step>>1)
        ADD r8, r5, #1024 @ r8 = Ttop
step7_loop1:
        LDR r6, [r1] @ r6 = w0[0]
        LDR r9, [r1,#4] @ r9 = w0[1]
        LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2
        LDR r11,[r7,#4] @ r11= w1[1]
        LDRB r14,[r5,#1] @ r14= T[1]
        LDRB r12,[r5],r2 @ r12= T[0] T += step
        ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0]
        SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0]
        SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1]
        ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1]
        MOV r6, r6, ASR #9
        MUL r3, r6, r14 @ r3 = s0*T[1]
        MOV r11,r11,ASR #9
        MUL r4, r11,r12 @ r4 = s1*T[0]
        ADD r3, r3, r4 @ r3 = s2 = s0*T[1] + s1*T[0]
        MUL r14,r11,r14 @ r14 = s1*T[1]
        MUL r12,r6, r12 @ r12 = s0*T[0]
        SUB r14,r14,r12 @ r14 = s3 = s1*T[1] - s0*T[0]
        @ r9 = s0b<<1
        @ r10= s1b<<1
        ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2
        SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2
        SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b
        ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b
        STR r9, [r1],#4
        STR r10,[r1],#4 @ w0 += 2
        STR r3, [r7]
        STR r12,[r7,#4]
        CMP r5,r8
        BLT step7_loop1
step7_loop2:
        LDR r6, [r1] @ r6 = w0[0]
        LDR r9, [r1,#4] @ r9 = w0[1]
        LDR r10,[r7,#-8]! @ r10= w1[0] w1 -= 2
        LDR r11,[r7,#4] @ r11= w1[1]
        LDRB r14,[r5,-r2]! @ r14= T[0] T -= step
        LDRB r12,[r5,#1] @ r12= T[1]
        ADD r6, r6, r10 @ r6 = s0 = w0[0] + w1[0]
        SUB r10,r6, r10,LSL #1 @ r10= s1b= w0[0] - w1[0]
        SUB r11,r11,r9 @ r11= s1 = w1[1] - w0[1]
        ADD r9, r11,r9, LSL #1 @ r9 = s0b= w1[1] + w0[1]
        MOV r6, r6, ASR #9
        MUL r3, r6, r14 @ r3 = s0*T[0]
        MOV r11,r11,ASR #9
        MUL r4, r11,r12 @ r4 = s1*T[1]
        ADD r3, r3, r4 @ r3 = s2 = s0*T[0] + s1*T[1]
        MUL r14,r11,r14 @ r14 = s1*T[0]
        MUL r12,r6, r12 @ r12 = s0*T[1]
        SUB r14,r14,r12 @ r14 = s3 = s1*T[0] - s0*T[1]
        @ r9 = s0b<<1
        @ r10= s1b<<1
        ADD r9, r3, r9, ASR #1 @ r9 = s0b + s2
        SUB r3, r9, r3, LSL #1 @ r3 = s0b - s2
        SUB r12,r14,r10,ASR #1 @ r12= s3 - s1b
        ADD r10,r14,r10,ASR #1 @ r10= s3 + s1b
        STR r9, [r1],#4
        STR r10,[r1],#4 @ w0 += 2
        STR r3, [r7]
        STR r12,[r7,#4]
        CMP r1,r7
        BLT step7_loop2
        LDMFD r13!,{r0-r3}

        @ r0 = points
        @ r1 = in
        @ r2 = step
        @ r3 = shift
        MOV r2, r2, ASR #2 @ r2 = step >>= 2
        CMP r2, #0
        CMPNE r2, #1
        BEQ mdct_end
        @ step > 1 (default case)
        CMP r2, #4 @ r5 = T = (step>=4) ?
        LDRGE r5, =sincos_lookup0 @ sincos_lookup0 +
        LDRLT r5, =sincos_lookup1 @ sincos_lookup1
        ADD r7, r1, r0, LSL #1 @ r7 = iX = x + (n>>1)
        ADDGE r5, r5, r2, LSR #1 @ (step>>1)
mdct_step8_default:
        LDR r6, [r1],#4 @ r6 = s0 = x[0]
        LDR r8, [r1],#4 @ r8 = -s1 = x[1]
        LDRB r12,[r5,#1] @ r12= T[1]
        LDRB r14,[r5],r2 @ r14= T[0] T += step
        RSB r8, r8, #0 @ r8 = s1
        @ XPROD31(s0, s1, T[0], T[1], x, x+1)
        @ x[0] = s0 * T[0] + s1 * T[1] x[1] = s1 * T[0] - s0 * T[1]
        MOV r6, r6, ASR #8
        MOV r8, r8, ASR #8
        MUL r10,r8, r12 @ r10 = s1 * T[1]
        CMP r1, r7
        MLA r10,r6, r14,r10 @ r10 += s0 * T[0]
        RSB r6, r6, #0 @ r6 = -s0
        MUL r11,r8, r14 @ r11 = s1 * T[0]
        MLA r11,r6, r12,r11 @ r11 -= s0 * T[1]
        STR r10,[r1,#-8]
        STR r11,[r1,#-4]
        BLT mdct_step8_default
mdct_end:
        MOV r0, r2
        LDMFD r13!,{r4-r11,PC}

bitrev:
        .byte 0, 32, 16, 48, 8, 40, 24, 56
        .byte 4, 36, 20, 52, 12, 44, 28, 60
        .byte 2, 34, 18, 50, 10, 42, 26, 58
        .byte 6, 38, 22, 54, 14, 46, 30, 62
        .byte 1, 33, 17, 49, 9, 41, 25, 57
        .byte 5, 37, 21, 53, 13, 45, 29, 61
        .byte 3, 35, 19, 51, 11, 43, 27, 59
        .byte 7, 39, 23, 55, 15, 47, 31, 63

        @ END