fe_amd64.s 5.7 KB

  1. // Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT.
  2. //go:build amd64 && gc && !purego
  3. // +build amd64,gc,!purego
  4. #include "textflag.h"
  5. // func feMul(out *Element, a *Element, b *Element)
     //
     // feMul multiplies the element at a by the element at b and stores the
     // result at out. Each element is accessed as five 64-bit limbs at byte
     // offsets 0, 8, 16, 24 and 32. The reduction steps keep 51 bits per limb
     // and fold the carry out of the top limb into limb 0 multiplied by 19,
     // i.e. 2^255 is treated as 19 (arithmetic modulo 2^255 − 19).
     //
     // Register roles:
     //   CX = a, BX = b
     //   AX = MULQ multiplicand and low half of the product, DX = high half
     //   r0 = SI:DI, r1 = R8:R9, r2 = R10:R11, r3 = R12:R13, r4 = R14:R15
     //   (high:low halves of each 128-bit column sum)
     //
     // Notes:
     //   - The frame size is 0 and the argument area is 24 bytes ($0-24).
     //   - The body is straight-line code with no branches.
     //   - Every load from a and b precedes the first store to out.
     //   - After the last carry additions no further mask is applied, so an
     //     output limb can end up slightly above 51 bits.
     //   - NOTE(review): 19×limb is formed with a 64-bit IMUL3Q, so input limbs
     //     are presumably bounded so that this cannot overflow; nothing here
     //     checks it. Confirm against the callers.
  6. TEXT ·feMul(SB), NOSPLIT, $0-24
  7. MOVQ a+8(FP), CX
  8. MOVQ b+16(FP), BX
     // Column r0 (weight 2^0). MULQ multiplies AX by its operand and leaves
     // the unsigned 128-bit product in DX:AX; the first product of a column
     // initialises the register pair, later ones are added with ADDQ (low
     // words) followed by ADCQ (high words plus the carry from the ADDQ).
  9. // r0 = a0×b0
  10. MOVQ (CX), AX
  11. MULQ (BX)
  12. MOVQ AX, DI
  13. MOVQ DX, SI
  14. // r0 += 19×a1×b4 (limb indices sum to 5, so the product has weight 2^255 and wraps with factor 19)
  15. MOVQ 8(CX), AX
  16. IMUL3Q $0x13, AX, AX
  17. MULQ 32(BX)
  18. ADDQ AX, DI
  19. ADCQ DX, SI
  20. // r0 += 19×a2×b3
  21. MOVQ 16(CX), AX
  22. IMUL3Q $0x13, AX, AX
  23. MULQ 24(BX)
  24. ADDQ AX, DI
  25. ADCQ DX, SI
  26. // r0 += 19×a3×b2
  27. MOVQ 24(CX), AX
  28. IMUL3Q $0x13, AX, AX
  29. MULQ 16(BX)
  30. ADDQ AX, DI
  31. ADCQ DX, SI
  32. // r0 += 19×a4×b1
  33. MOVQ 32(CX), AX
  34. IMUL3Q $0x13, AX, AX
  35. MULQ 8(BX)
  36. ADDQ AX, DI
  37. ADCQ DX, SI
      // Column r1 (weight 2^51), accumulated in R8:R9.
  38. // r1 = a0×b1
  39. MOVQ (CX), AX
  40. MULQ 8(BX)
  41. MOVQ AX, R9
  42. MOVQ DX, R8
  43. // r1 += a1×b0
  44. MOVQ 8(CX), AX
  45. MULQ (BX)
  46. ADDQ AX, R9
  47. ADCQ DX, R8
  48. // r1 += 19×a2×b4
  49. MOVQ 16(CX), AX
  50. IMUL3Q $0x13, AX, AX
  51. MULQ 32(BX)
  52. ADDQ AX, R9
  53. ADCQ DX, R8
  54. // r1 += 19×a3×b3
  55. MOVQ 24(CX), AX
  56. IMUL3Q $0x13, AX, AX
  57. MULQ 24(BX)
  58. ADDQ AX, R9
  59. ADCQ DX, R8
  60. // r1 += 19×a4×b2
  61. MOVQ 32(CX), AX
  62. IMUL3Q $0x13, AX, AX
  63. MULQ 16(BX)
  64. ADDQ AX, R9
  65. ADCQ DX, R8
      // Column r2 (weight 2^102), accumulated in R10:R11.
  66. // r2 = a0×b2
  67. MOVQ (CX), AX
  68. MULQ 16(BX)
  69. MOVQ AX, R11
  70. MOVQ DX, R10
  71. // r2 += a1×b1
  72. MOVQ 8(CX), AX
  73. MULQ 8(BX)
  74. ADDQ AX, R11
  75. ADCQ DX, R10
  76. // r2 += a2×b0
  77. MOVQ 16(CX), AX
  78. MULQ (BX)
  79. ADDQ AX, R11
  80. ADCQ DX, R10
  81. // r2 += 19×a3×b4
  82. MOVQ 24(CX), AX
  83. IMUL3Q $0x13, AX, AX
  84. MULQ 32(BX)
  85. ADDQ AX, R11
  86. ADCQ DX, R10
  87. // r2 += 19×a4×b3
  88. MOVQ 32(CX), AX
  89. IMUL3Q $0x13, AX, AX
  90. MULQ 24(BX)
  91. ADDQ AX, R11
  92. ADCQ DX, R10
      // Column r3 (weight 2^153), accumulated in R12:R13.
  93. // r3 = a0×b3
  94. MOVQ (CX), AX
  95. MULQ 24(BX)
  96. MOVQ AX, R13
  97. MOVQ DX, R12
  98. // r3 += a1×b2
  99. MOVQ 8(CX), AX
  100. MULQ 16(BX)
  101. ADDQ AX, R13
  102. ADCQ DX, R12
  103. // r3 += a2×b1
  104. MOVQ 16(CX), AX
  105. MULQ 8(BX)
  106. ADDQ AX, R13
  107. ADCQ DX, R12
  108. // r3 += a3×b0
  109. MOVQ 24(CX), AX
  110. MULQ (BX)
  111. ADDQ AX, R13
  112. ADCQ DX, R12
  113. // r3 += 19×a4×b4
  114. MOVQ 32(CX), AX
  115. IMUL3Q $0x13, AX, AX
  116. MULQ 32(BX)
  117. ADDQ AX, R13
  118. ADCQ DX, R12
       // Column r4 (weight 2^204), accumulated in R14:R15. No term wraps here,
       // so none of the products carries the factor 19.
  119. // r4 = a0×b4
  120. MOVQ (CX), AX
  121. MULQ 32(BX)
  122. MOVQ AX, R15
  123. MOVQ DX, R14
  124. // r4 += a1×b3
  125. MOVQ 8(CX), AX
  126. MULQ 24(BX)
  127. ADDQ AX, R15
  128. ADCQ DX, R14
  129. // r4 += a2×b2
  130. MOVQ 16(CX), AX
  131. MULQ 16(BX)
  132. ADDQ AX, R15
  133. ADCQ DX, R14
  134. // r4 += a3×b1
  135. MOVQ 24(CX), AX
  136. MULQ 8(BX)
  137. ADDQ AX, R15
  138. ADCQ DX, R14
  139. // r4 += a4×b0
  140. MOVQ 32(CX), AX
  141. MULQ (BX)
  142. ADDQ AX, R15
  143. ADCQ DX, R14
  144. // First reduction chain: split each 128-bit column sum at bit 51.
       // AX is loaded with the mask 2^51 − 1. Each double-register SHLQ by 13
       // (= 64 − 51) shifts the high register left and fills it from the top
       // bits of the low register (e.g. SI = SI<<13 | DI>>51), so the high
       // register ends up holding bits 51..114 of the column sum, i.e. its
       // carry. The low register is then masked to 51 bits and receives the
       // carry of the previous column; the carry out of r4 is multiplied by
       // 19 and added to r0.
  145. MOVQ $0x0007ffffffffffff, AX
  146. SHLQ $0x0d, DI, SI
  147. SHLQ $0x0d, R9, R8
  148. SHLQ $0x0d, R11, R10
  149. SHLQ $0x0d, R13, R12
  150. SHLQ $0x0d, R15, R14
  151. ANDQ AX, DI
  152. IMUL3Q $0x13, R14, R14
  153. ADDQ R14, DI
  154. ANDQ AX, R9
  155. ADDQ SI, R9
  156. ANDQ AX, R11
  157. ADDQ R8, R11
  158. ANDQ AX, R13
  159. ADDQ R10, R13
  160. ANDQ AX, R15
  161. ADDQ R12, R15
  162. // Second reduction chain (carryPropagate)
       // Each limb is now a single 64-bit word. All five carries
       // c_i = r_i >> 51 are taken first (into SI, R8, R10, R12, R14), then
       // every limb is masked to 51 bits and c_(i−1) is added; r0 receives
       // 19×c4. AX still holds the 51-bit mask.
  163. MOVQ DI, SI
  164. SHRQ $0x33, SI
  165. MOVQ R9, R8
  166. SHRQ $0x33, R8
  167. MOVQ R11, R10
  168. SHRQ $0x33, R10
  169. MOVQ R13, R12
  170. SHRQ $0x33, R12
  171. MOVQ R15, R14
  172. SHRQ $0x33, R14
  173. ANDQ AX, DI
  174. IMUL3Q $0x13, R14, R14
  175. ADDQ R14, DI
  176. ANDQ AX, R9
  177. ADDQ SI, R9
  178. ANDQ AX, R11
  179. ADDQ R8, R11
  180. ANDQ AX, R13
  181. ADDQ R10, R13
  182. ANDQ AX, R15
  183. ADDQ R12, R15
  184. // Store output (AX is reused for the out pointer; the mask is no longer needed)
  185. MOVQ out+0(FP), AX
  186. MOVQ DI, (AX)
  187. MOVQ R9, 8(AX)
  188. MOVQ R11, 16(AX)
  189. MOVQ R13, 24(AX)
  190. MOVQ R15, 32(AX)
  191. RET
  192. // func feSquare(out *Element, a *Element)
       //
       // feSquare multiplies the element at a by itself and stores the result
       // at out. The element is accessed as five 64-bit limbs l0..l4 at byte
       // offsets 0, 8, 16, 24 and 32. The reduction steps keep 51 bits per
       // limb and fold the carry out of the top limb into limb 0 multiplied
       // by 19, i.e. 2^255 is treated as 19 (arithmetic modulo 2^255 − 19).
       //
       // Coefficients: a cross term li×lj with i ≠ j occurs twice in the
       // square, so it is computed once and doubled. A term whose limb
       // indices sum to 5 or more wraps past 2^255 and picks up the factor
       // 19. Hence 2 for plain cross terms, 19 for wrapped squares and
       // 38 (0x26) for wrapped cross terms.
       //
       // Register roles:
       //   CX = a
       //   AX = MULQ multiplicand and low half of the product, DX = high half
       //   r0 = BX:SI, r1 = DI:R8, r2 = R9:R10, r3 = R11:R12, r4 = R13:R14
       //   (high:low halves of each 128-bit column sum)
       //
       // Notes:
       //   - The frame size is 0 and the argument area is 16 bytes ($0-16).
       //   - The body is straight-line code with no branches.
       //   - Every load from a precedes the first store to out.
       //   - After the last carry additions no further mask is applied, so an
       //     output limb can end up slightly above 51 bits.
       //   - NOTE(review): the 2×, 19× and 38× scalings are done in 64 bits
       //     (SHLQ / IMUL3Q), so input limbs are presumably bounded so that
       //     they cannot overflow; nothing here checks it. Confirm against
       //     the callers.
  193. TEXT ·feSquare(SB), NOSPLIT, $0-16
  194. MOVQ a+8(FP), CX
       // Column r0 (weight 2^0). MULQ multiplies AX by its operand and leaves
       // the unsigned 128-bit product in DX:AX; the first product of a column
       // initialises the register pair, later ones are added with ADDQ (low
       // words) followed by ADCQ (high words plus the carry from the ADDQ).
  195. // r0 = l0×l0
  196. MOVQ (CX), AX
  197. MULQ (CX)
  198. MOVQ AX, SI
  199. MOVQ DX, BX
  200. // r0 += 38×l1×l4 (wrapped cross term: 2 × 19)
  201. MOVQ 8(CX), AX
  202. IMUL3Q $0x26, AX, AX
  203. MULQ 32(CX)
  204. ADDQ AX, SI
  205. ADCQ DX, BX
  206. // r0 += 38×l2×l3
  207. MOVQ 16(CX), AX
  208. IMUL3Q $0x26, AX, AX
  209. MULQ 24(CX)
  210. ADDQ AX, SI
  211. ADCQ DX, BX
       // Column r1 (weight 2^51), accumulated in DI:R8.
  212. // r1 = 2×l0×l1 (l0 is doubled with a left shift by one before the multiply)
  213. MOVQ (CX), AX
  214. SHLQ $0x01, AX
  215. MULQ 8(CX)
  216. MOVQ AX, R8
  217. MOVQ DX, DI
  218. // r1 += 38×l2×l4
  219. MOVQ 16(CX), AX
  220. IMUL3Q $0x26, AX, AX
  221. MULQ 32(CX)
  222. ADDQ AX, R8
  223. ADCQ DX, DI
  224. // r1 += 19×l3×l3 (wrapped square: factor 19 only)
  225. MOVQ 24(CX), AX
  226. IMUL3Q $0x13, AX, AX
  227. MULQ 24(CX)
  228. ADDQ AX, R8
  229. ADCQ DX, DI
       // Column r2 (weight 2^102), accumulated in R9:R10.
  230. // r2 = 2×l0×l2
  231. MOVQ (CX), AX
  232. SHLQ $0x01, AX
  233. MULQ 16(CX)
  234. MOVQ AX, R10
  235. MOVQ DX, R9
  236. // r2 += l1×l1
  237. MOVQ 8(CX), AX
  238. MULQ 8(CX)
  239. ADDQ AX, R10
  240. ADCQ DX, R9
  241. // r2 += 38×l3×l4
  242. MOVQ 24(CX), AX
  243. IMUL3Q $0x26, AX, AX
  244. MULQ 32(CX)
  245. ADDQ AX, R10
  246. ADCQ DX, R9
       // Column r3 (weight 2^153), accumulated in R11:R12.
  247. // r3 = 2×l0×l3
  248. MOVQ (CX), AX
  249. SHLQ $0x01, AX
  250. MULQ 24(CX)
  251. MOVQ AX, R12
  252. MOVQ DX, R11
  253. // r3 += 2×l1×l2 (doubling done with IMUL3Q $2 here; AX gets the same value as with a shift by one)
  254. MOVQ 8(CX), AX
  255. IMUL3Q $0x02, AX, AX
  256. MULQ 16(CX)
  257. ADDQ AX, R12
  258. ADCQ DX, R11
  259. // r3 += 19×l4×l4
  260. MOVQ 32(CX), AX
  261. IMUL3Q $0x13, AX, AX
  262. MULQ 32(CX)
  263. ADDQ AX, R12
  264. ADCQ DX, R11
       // Column r4 (weight 2^204), accumulated in R13:R14. No term wraps here.
  265. // r4 = 2×l0×l4
  266. MOVQ (CX), AX
  267. SHLQ $0x01, AX
  268. MULQ 32(CX)
  269. MOVQ AX, R14
  270. MOVQ DX, R13
  271. // r4 += 2×l1×l3
  272. MOVQ 8(CX), AX
  273. IMUL3Q $0x02, AX, AX
  274. MULQ 24(CX)
  275. ADDQ AX, R14
  276. ADCQ DX, R13
  277. // r4 += l2×l2
  278. MOVQ 16(CX), AX
  279. MULQ 16(CX)
  280. ADDQ AX, R14
  281. ADCQ DX, R13
  282. // First reduction chain: split each 128-bit column sum at bit 51.
       // AX is loaded with the mask 2^51 − 1. Each double-register SHLQ by 13
       // (= 64 − 51) shifts the high register left and fills it from the top
       // bits of the low register (e.g. BX = BX<<13 | SI>>51), so the high
       // register ends up holding bits 51..114 of the column sum, i.e. its
       // carry. The low register is then masked to 51 bits and receives the
       // carry of the previous column; the carry out of r4 is multiplied by
       // 19 and added to r0.
  283. MOVQ $0x0007ffffffffffff, AX
  284. SHLQ $0x0d, SI, BX
  285. SHLQ $0x0d, R8, DI
  286. SHLQ $0x0d, R10, R9
  287. SHLQ $0x0d, R12, R11
  288. SHLQ $0x0d, R14, R13
  289. ANDQ AX, SI
  290. IMUL3Q $0x13, R13, R13
  291. ADDQ R13, SI
  292. ANDQ AX, R8
  293. ADDQ BX, R8
  294. ANDQ AX, R10
  295. ADDQ DI, R10
  296. ANDQ AX, R12
  297. ADDQ R9, R12
  298. ANDQ AX, R14
  299. ADDQ R11, R14
  300. // Second reduction chain (carryPropagate)
       // Each limb is now a single 64-bit word. All five carries
       // c_i = r_i >> 51 are taken first (into BX, DI, R9, R11, R13), then
       // every limb is masked to 51 bits and c_(i−1) is added; r0 receives
       // 19×c4. AX still holds the 51-bit mask.
  301. MOVQ SI, BX
  302. SHRQ $0x33, BX
  303. MOVQ R8, DI
  304. SHRQ $0x33, DI
  305. MOVQ R10, R9
  306. SHRQ $0x33, R9
  307. MOVQ R12, R11
  308. SHRQ $0x33, R11
  309. MOVQ R14, R13
  310. SHRQ $0x33, R13
  311. ANDQ AX, SI
  312. IMUL3Q $0x13, R13, R13
  313. ADDQ R13, SI
  314. ANDQ AX, R8
  315. ADDQ BX, R8
  316. ANDQ AX, R10
  317. ADDQ DI, R10
  318. ANDQ AX, R12
  319. ADDQ R9, R12
  320. ANDQ AX, R14
  321. ADDQ R11, R14
  322. // Store output (AX is reused for the out pointer; the mask is no longer needed)
  323. MOVQ out+0(FP), AX
  324. MOVQ SI, (AX)
  325. MOVQ R8, 8(AX)
  326. MOVQ R10, 16(AX)
  327. MOVQ R12, 24(AX)
  328. MOVQ R14, 32(AX)
  329. RET
  329. RET