seqdec_amd64.s 82 KB


  1. // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
  2. //go:build !appengine && !noasm && gc && !noasm
  3. // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  4. // Requires: CMOV
  //
  // Decodes sequences in a loop. Each iteration writes one 24-byte record
  // through the pointer loaded from 104(ctx): literal length at +0, match
  // length at +8, offset at +16. The loop counter is the quadword at 96(ctx).
  //
  // Return codes stored in ret+24(FP):
  //   0  success
  //   1  non-zero match length together with a zero offset
  //   2  match length above 0x20002
  //   4  the literal budget at 128(ctx) went negative
  //   6  bit reader consumed more than 64 bits with no input left
  // The stub that stores 3 has no label and nothing in this function jumps to it.
  // Bit-reader state and the offset history are written back only on success.
  //
  // Register roles inside the loop:
  //   DX   64-bit bit buffer, loaded from / stored to 24(br)
  //   BX   number of bits already consumed from DX, byte at 32(br)
  //   SI   remaining input length in bytes, 8(br)
  //   R14  input pointer ((br) + 8(br) on entry), parked at (SP) between iterations
  //   DI, R8, R9   literal-length, match-length and offset state entries (72/80/88(ctx))
  //   R10  output record pointer
  //   R11, R12, R13  the three most recent offsets (144/152/160(s))
  //
  // A state entry is unpacked below as: bits 0-7 = bit count for the next-state
  // read, bits 8-15 = bit count of the value's extra bits, bits 16-31 = base
  // added to the next-state bits, bits 32-63 = value baseline.
  // NOTE(review): struct field names behind the numeric offsets are not visible
  // here; confirm them against the Go definitions.
  5. TEXT ·sequenceDecs_decode_amd64(SB), $8-32
  6. MOVQ br+8(FP), CX
  7. MOVQ 24(CX), DX
  8. MOVBQZX 32(CX), BX
  9. MOVQ (CX), AX
  10. MOVQ 8(CX), SI
  11. ADDQ SI, AX // AX = input base + remaining length
  12. MOVQ AX, (SP) // the only stack slot holds the input pointer
  13. MOVQ ctx+16(FP), AX
  14. MOVQ 72(AX), DI
  15. MOVQ 80(AX), R8
  16. MOVQ 88(AX), R9
  17. MOVQ 104(AX), R10
  18. MOVQ s+0(FP), AX
  19. MOVQ 144(AX), R11
  20. MOVQ 152(AX), R12
  21. MOVQ 160(AX), R13
  22. sequenceDecs_decode_amd64_main_loop:
  23. MOVQ (SP), R14
  24. // Fill bitreader to have enough for the offset and match length.
  25. CMPQ SI, $0x08
  26. JL sequenceDecs_decode_amd64_fill_byte_by_byte // fewer than 8 bytes left: slow path
  27. MOVQ BX, AX
  28. SHRQ $0x03, AX // AX = whole bytes consumed (bits / 8)
  29. SUBQ AX, R14 // input is read backwards: step the pointer down
  30. MOVQ (R14), DX // reload a full 8 bytes into the bit buffer
  31. SUBQ AX, SI
  32. ANDQ $0x07, BX // keep only the sub-byte bit position
  33. JMP sequenceDecs_decode_amd64_fill_end
  34. sequenceDecs_decode_amd64_fill_byte_by_byte:
  // Slow path: while input remains and at least 8 bits are consumed,
  // shift one more byte into the low end of the buffer.
  35. CMPQ SI, $0x00
  36. JLE sequenceDecs_decode_amd64_fill_check_overread
  37. CMPQ BX, $0x07
  38. JLE sequenceDecs_decode_amd64_fill_end
  39. SHLQ $0x08, DX
  40. SUBQ $0x01, R14
  41. SUBQ $0x01, SI
  42. SUBQ $0x08, BX
  43. MOVBQZX (R14), AX
  44. ORQ AX, DX
  45. JMP sequenceDecs_decode_amd64_fill_byte_by_byte
  46. sequenceDecs_decode_amd64_fill_check_overread:
  47. CMPQ BX, $0x40
  48. JA error_overread // input exhausted and more than 64 bits consumed
  49. sequenceDecs_decode_amd64_fill_end:
  50. // Update offset
  51. MOVQ R9, AX
  52. MOVQ BX, CX
  53. MOVQ DX, R15
  54. SHLQ CL, R15 // drop the bits that were already consumed
  55. MOVB AH, CL // CL = extra-bit count (bits 8-15 of the entry)
  56. SHRQ $0x20, AX // AX = baseline (bits 32-63 of the entry)
  57. TESTQ CX, CX
  58. JZ sequenceDecs_decode_amd64_of_update_zero // no extra bits: value is the baseline
  59. ADDQ CX, BX // account for the bits being read
  60. CMPQ BX, $0x40
  61. JA sequenceDecs_decode_amd64_of_update_zero // consumed count exceeds 64: add nothing
  62. CMPQ CX, $0x40
  63. JAE sequenceDecs_decode_amd64_of_update_zero
  64. NEGQ CX
  65. SHRQ CL, R15 // hardware masks the count to 6 bits, so this is >> (64 - n)
  66. ADDQ R15, AX
  67. sequenceDecs_decode_amd64_of_update_zero:
  68. MOVQ AX, 16(R10)
  69. // Update match length
  // Same extraction as for the offset, using the match-length entry in R8.
  70. MOVQ R8, AX
  71. MOVQ BX, CX
  72. MOVQ DX, R15
  73. SHLQ CL, R15
  74. MOVB AH, CL
  75. SHRQ $0x20, AX
  76. TESTQ CX, CX
  77. JZ sequenceDecs_decode_amd64_ml_update_zero
  78. ADDQ CX, BX
  79. CMPQ BX, $0x40
  80. JA sequenceDecs_decode_amd64_ml_update_zero
  81. CMPQ CX, $0x40
  82. JAE sequenceDecs_decode_amd64_ml_update_zero
  83. NEGQ CX
  84. SHRQ CL, R15
  85. ADDQ R15, AX
  86. sequenceDecs_decode_amd64_ml_update_zero:
  87. MOVQ AX, 8(R10)
  88. // Fill bitreader to have enough for the remaining
  // Second refill of the iteration; identical in shape to the first one.
  89. CMPQ SI, $0x08
  90. JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
  91. MOVQ BX, AX
  92. SHRQ $0x03, AX
  93. SUBQ AX, R14
  94. MOVQ (R14), DX
  95. SUBQ AX, SI
  96. ANDQ $0x07, BX
  97. JMP sequenceDecs_decode_amd64_fill_2_end
  98. sequenceDecs_decode_amd64_fill_2_byte_by_byte:
  99. CMPQ SI, $0x00
  100. JLE sequenceDecs_decode_amd64_fill_2_check_overread
  101. CMPQ BX, $0x07
  102. JLE sequenceDecs_decode_amd64_fill_2_end
  103. SHLQ $0x08, DX
  104. SUBQ $0x01, R14
  105. SUBQ $0x01, SI
  106. SUBQ $0x08, BX
  107. MOVBQZX (R14), AX
  108. ORQ AX, DX
  109. JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
  110. sequenceDecs_decode_amd64_fill_2_check_overread:
  111. CMPQ BX, $0x40
  112. JA error_overread
  113. sequenceDecs_decode_amd64_fill_2_end:
  114. // Update literal length
  // Same extraction again, using the literal-length entry in DI.
  115. MOVQ DI, AX
  116. MOVQ BX, CX
  117. MOVQ DX, R15
  118. SHLQ CL, R15
  119. MOVB AH, CL
  120. SHRQ $0x20, AX
  121. TESTQ CX, CX
  122. JZ sequenceDecs_decode_amd64_ll_update_zero
  123. ADDQ CX, BX
  124. CMPQ BX, $0x40
  125. JA sequenceDecs_decode_amd64_ll_update_zero
  126. CMPQ CX, $0x40
  127. JAE sequenceDecs_decode_amd64_ll_update_zero
  128. NEGQ CX
  129. SHRQ CL, R15
  130. ADDQ R15, AX
  131. sequenceDecs_decode_amd64_ll_update_zero:
  132. MOVQ AX, (R10)
  133. // Fill bitreader for state updates
  // No refill is performed here: the input pointer is parked, AX is set to
  // the offset entry's extra-bit count for "Adjust offset" below, and the
  // state update is skipped when the counter at 96(ctx) is already zero.
  134. MOVQ R14, (SP)
  135. MOVQ R9, AX
  136. SHRQ $0x08, AX
  137. MOVBQZX AL, AX // AX = bits 8-15 of the offset entry
  138. MOVQ ctx+16(FP), CX
  139. CMPQ 96(CX), $0x00
  140. JZ sequenceDecs_decode_amd64_skip_update
  141. // Update Literal Length State
  142. MOVBQZX DI, R14 // R14 = bit count for the next-state read
  143. SHRL $0x10, DI // DI = bits 16-31 of the entry; the 32-bit shift clears the upper half
  144. LEAQ (BX)(R14*1), CX
  145. MOVQ DX, R15
  146. MOVQ CX, BX // bits consumed += R14
  147. ROLQ CL, R15 // rotate the freshly consumed bits into the low end
  148. MOVL $0x00000001, BP
  149. MOVB R14, CL
  150. SHLL CL, BP
  151. DECL BP // BP = (1 << R14) - 1
  152. ANDQ BP, R15
  153. ADDQ R15, DI // DI = table index of the next state
  154. // Load ctx.llTable
  155. MOVQ ctx+16(FP), CX
  156. MOVQ (CX), CX
  157. MOVQ (CX)(DI*8), DI // table entries are 8 bytes wide
  158. // Update Match Length State
  159. MOVBQZX R8, R14
  160. SHRL $0x10, R8
  161. LEAQ (BX)(R14*1), CX
  162. MOVQ DX, R15
  163. MOVQ CX, BX
  164. ROLQ CL, R15
  165. MOVL $0x00000001, BP
  166. MOVB R14, CL
  167. SHLL CL, BP
  168. DECL BP
  169. ANDQ BP, R15
  170. ADDQ R15, R8
  171. // Load ctx.mlTable
  172. MOVQ ctx+16(FP), CX
  173. MOVQ 24(CX), CX
  174. MOVQ (CX)(R8*8), R8
  175. // Update Offset State
  176. MOVBQZX R9, R14
  177. SHRL $0x10, R9
  178. LEAQ (BX)(R14*1), CX
  179. MOVQ DX, R15
  180. MOVQ CX, BX
  181. ROLQ CL, R15
  182. MOVL $0x00000001, BP
  183. MOVB R14, CL
  184. SHLL CL, BP
  185. DECL BP
  186. ANDQ BP, R15
  187. ADDQ R15, R9
  188. // Load ctx.ofTable
  189. MOVQ ctx+16(FP), CX
  190. MOVQ 48(CX), CX
  191. MOVQ (CX)(R9*8), R9
  192. sequenceDecs_decode_amd64_skip_update:
  193. // Adjust offset
  // AX still holds the offset entry's extra-bit count. With more than one
  // extra bit the decoded value is a new offset and is pushed onto the
  // history (R11 newest). Otherwise the decoded value selects an entry
  // from the history.
  194. MOVQ 16(R10), CX
  195. CMPQ AX, $0x01
  196. JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
  197. MOVQ R12, R13
  198. MOVQ R11, R12
  199. MOVQ CX, R11
  200. JMP sequenceDecs_decode_amd64_after_adjust
  201. sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
  202. CMPQ (R10), $0x00000000
  203. JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
  204. INCQ CX // literal length is zero: the selector is shifted up by one
  205. JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
  206. sequenceDecs_decode_amd64_adjust_offset_maybezero:
  207. TESTQ CX, CX
  208. JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
  209. MOVQ R11, CX // selector 0 with literals: reuse the newest offset, history unchanged
  210. JMP sequenceDecs_decode_amd64_after_adjust
  211. sequenceDecs_decode_amd64_adjust_offset_nonzero:
  // Dispatch on the selector in CX: 0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11 - 1.
  212. CMPQ CX, $0x01
  213. JB sequenceDecs_decode_amd64_adjust_zero
  214. JEQ sequenceDecs_decode_amd64_adjust_one // reuses the flags of the CMPQ above
  215. CMPQ CX, $0x02
  216. JA sequenceDecs_decode_amd64_adjust_three
  217. JMP sequenceDecs_decode_amd64_adjust_two
  218. sequenceDecs_decode_amd64_adjust_zero:
  219. MOVQ R11, AX
  220. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  221. sequenceDecs_decode_amd64_adjust_one:
  222. MOVQ R12, AX
  223. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  224. sequenceDecs_decode_amd64_adjust_two:
  225. MOVQ R13, AX
  226. JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
  227. sequenceDecs_decode_amd64_adjust_three:
  228. LEAQ -1(R11), AX
  229. sequenceDecs_decode_amd64_adjust_test_temp_valid:
  230. TESTQ AX, AX
  231. JNZ sequenceDecs_decode_amd64_adjust_temp_valid
  232. MOVQ $0x00000001, AX // a selected offset of zero is replaced by 1
  233. sequenceDecs_decode_amd64_adjust_temp_valid:
  234. CMPQ CX, $0x01
  235. CMOVQNE R12, R13 // selector 1 leaves the oldest entry alone; otherwise R12 moves down
  236. MOVQ R11, R12
  237. MOVQ AX, R11
  238. MOVQ AX, CX
  239. sequenceDecs_decode_amd64_after_adjust:
  240. MOVQ CX, 16(R10) // store the final offset back into the record
  241. // Check values
  242. MOVQ 8(R10), AX
  243. MOVQ (R10), R14
  244. LEAQ (AX)(R14*1), R15
  245. MOVQ s+0(FP), BP
  246. ADDQ R15, 256(BP) // running total of match length + literal length
  247. MOVQ ctx+16(FP), R15
  248. SUBQ R14, 128(R15) // charge the literals against the budget at 128(ctx)
  249. JS error_not_enough_literals
  250. CMPQ AX, $0x00020002
  251. JA sequenceDecs_decode_amd64_error_match_len_too_big
  252. TESTQ CX, CX
  253. JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
  254. TESTQ AX, AX
  255. JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch // zero offset requires zero match length
  256. sequenceDecs_decode_amd64_match_len_ofs_ok:
  257. ADDQ $0x18, R10 // advance to the next 24-byte record
  258. MOVQ ctx+16(FP), AX
  259. DECQ 96(AX)
  260. JNS sequenceDecs_decode_amd64_main_loop // loop until the counter goes negative
  // Success: write the offset history and bit-reader state back.
  261. MOVQ s+0(FP), AX
  262. MOVQ R11, 144(AX)
  263. MOVQ R12, 152(AX)
  264. MOVQ R13, 160(AX)
  265. MOVQ br+8(FP), AX
  266. MOVQ DX, 24(AX)
  267. MOVB BL, 32(AX)
  268. MOVQ SI, 8(AX)
  269. // Return success
  270. MOVQ $0x00000000, ret+24(FP)
  271. RET
  272. // Return with match length error
  273. sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
  274. MOVQ $0x00000001, ret+24(FP)
  275. RET
  276. // Return with match too long error
  277. sequenceDecs_decode_amd64_error_match_len_too_big:
  278. MOVQ $0x00000002, ret+24(FP)
  279. RET
  280. // Return with match offset too long error
  // Unlabelled and therefore unreachable in this function.
  281. MOVQ $0x00000003, ret+24(FP)
  282. RET
  283. // Return with not enough literals error
  284. error_not_enough_literals:
  285. MOVQ $0x00000004, ret+24(FP)
  286. RET
  287. // Return with overread error
  288. error_overread:
  289. MOVQ $0x00000006, ret+24(FP)
  290. RET
  291. // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  292. // Requires: CMOV
  //
  // Sequence decode loop with a single bit-reader refill per iteration: the
  // offset, match-length and literal-length values are all extracted from the
  // one refill at the top of the loop.
  // NOTE(review): the "56" presumably refers to a bound on the combined extra
  // bits that makes one refill sufficient; confirm against the Go caller.
  //
  // Each iteration writes one 24-byte record through the pointer loaded from
  // 104(ctx): literal length at +0, match length at +8, offset at +16. The
  // loop counter is the quadword at 96(ctx).
  //
  // Return codes stored in ret+24(FP): 0 success, 1 non-zero match length
  // with zero offset, 2 match length above 0x20002, 4 literal budget at
  // 128(ctx) went negative, 6 more than 64 bits consumed with no input left.
  // The stub storing 3 has no label and is never jumped to here. State is
  // written back to br and s only on success.
  //
  // Register roles inside the loop:
  //   DX   64-bit bit buffer (24(br))      BX   bits already consumed (byte at 32(br))
  //   SI   remaining input bytes (8(br))   R14  input pointer, parked at (SP)
  //   DI, R8, R9   literal-length, match-length, offset state entries (72/80/88(ctx))
  //   R10  output record pointer           R11, R12, R13  three most recent offsets
  //
  // State entry layout as unpacked below: bits 0-7 next-state bit count,
  // bits 8-15 extra-bit count, bits 16-31 next-state base, bits 32-63 baseline.
  293. TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
  294. MOVQ br+8(FP), CX
  295. MOVQ 24(CX), DX
  296. MOVBQZX 32(CX), BX
  297. MOVQ (CX), AX
  298. MOVQ 8(CX), SI
  299. ADDQ SI, AX // AX = input base + remaining length
  300. MOVQ AX, (SP)
  301. MOVQ ctx+16(FP), AX
  302. MOVQ 72(AX), DI
  303. MOVQ 80(AX), R8
  304. MOVQ 88(AX), R9
  305. MOVQ 104(AX), R10
  306. MOVQ s+0(FP), AX
  307. MOVQ 144(AX), R11
  308. MOVQ 152(AX), R12
  309. MOVQ 160(AX), R13
  310. sequenceDecs_decode_56_amd64_main_loop:
  311. MOVQ (SP), R14
  312. // Fill bitreader to have enough for the offset and match length.
  313. CMPQ SI, $0x08
  314. JL sequenceDecs_decode_56_amd64_fill_byte_by_byte // fewer than 8 bytes left: slow path
  315. MOVQ BX, AX
  316. SHRQ $0x03, AX // AX = whole bytes consumed (bits / 8)
  317. SUBQ AX, R14 // input is read backwards: step the pointer down
  318. MOVQ (R14), DX // reload a full 8 bytes into the bit buffer
  319. SUBQ AX, SI
  320. ANDQ $0x07, BX // keep only the sub-byte bit position
  321. JMP sequenceDecs_decode_56_amd64_fill_end
  322. sequenceDecs_decode_56_amd64_fill_byte_by_byte:
  // Slow path: while input remains and at least 8 bits are consumed,
  // shift one more byte into the low end of the buffer.
  323. CMPQ SI, $0x00
  324. JLE sequenceDecs_decode_56_amd64_fill_check_overread
  325. CMPQ BX, $0x07
  326. JLE sequenceDecs_decode_56_amd64_fill_end
  327. SHLQ $0x08, DX
  328. SUBQ $0x01, R14
  329. SUBQ $0x01, SI
  330. SUBQ $0x08, BX
  331. MOVBQZX (R14), AX
  332. ORQ AX, DX
  333. JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
  334. sequenceDecs_decode_56_amd64_fill_check_overread:
  335. CMPQ BX, $0x40
  336. JA error_overread // input exhausted and more than 64 bits consumed
  337. sequenceDecs_decode_56_amd64_fill_end:
  338. // Update offset
  339. MOVQ R9, AX
  340. MOVQ BX, CX
  341. MOVQ DX, R15
  342. SHLQ CL, R15 // drop the bits that were already consumed
  343. MOVB AH, CL // CL = extra-bit count (bits 8-15 of the entry)
  344. SHRQ $0x20, AX // AX = baseline (bits 32-63 of the entry)
  345. TESTQ CX, CX
  346. JZ sequenceDecs_decode_56_amd64_of_update_zero // no extra bits: value is the baseline
  347. ADDQ CX, BX // account for the bits being read
  348. CMPQ BX, $0x40
  349. JA sequenceDecs_decode_56_amd64_of_update_zero // consumed count exceeds 64: add nothing
  350. CMPQ CX, $0x40
  351. JAE sequenceDecs_decode_56_amd64_of_update_zero
  352. NEGQ CX
  353. SHRQ CL, R15 // hardware masks the count to 6 bits, so this is >> (64 - n)
  354. ADDQ R15, AX
  355. sequenceDecs_decode_56_amd64_of_update_zero:
  356. MOVQ AX, 16(R10)
  357. // Update match length
  358. MOVQ R8, AX
  359. MOVQ BX, CX
  360. MOVQ DX, R15
  361. SHLQ CL, R15
  362. MOVB AH, CL
  363. SHRQ $0x20, AX
  364. TESTQ CX, CX
  365. JZ sequenceDecs_decode_56_amd64_ml_update_zero
  366. ADDQ CX, BX
  367. CMPQ BX, $0x40
  368. JA sequenceDecs_decode_56_amd64_ml_update_zero
  369. CMPQ CX, $0x40
  370. JAE sequenceDecs_decode_56_amd64_ml_update_zero
  371. NEGQ CX
  372. SHRQ CL, R15
  373. ADDQ R15, AX
  374. sequenceDecs_decode_56_amd64_ml_update_zero:
  375. MOVQ AX, 8(R10)
  376. // Update literal length
  // Continues from the same buffer contents; no refill in between.
  377. MOVQ DI, AX
  378. MOVQ BX, CX
  379. MOVQ DX, R15
  380. SHLQ CL, R15
  381. MOVB AH, CL
  382. SHRQ $0x20, AX
  383. TESTQ CX, CX
  384. JZ sequenceDecs_decode_56_amd64_ll_update_zero
  385. ADDQ CX, BX
  386. CMPQ BX, $0x40
  387. JA sequenceDecs_decode_56_amd64_ll_update_zero
  388. CMPQ CX, $0x40
  389. JAE sequenceDecs_decode_56_amd64_ll_update_zero
  390. NEGQ CX
  391. SHRQ CL, R15
  392. ADDQ R15, AX
  393. sequenceDecs_decode_56_amd64_ll_update_zero:
  394. MOVQ AX, (R10)
  395. // Fill bitreader for state updates
  // No refill is performed here: the input pointer is parked, AX is set to
  // the offset entry's extra-bit count for "Adjust offset" below, and the
  // state update is skipped when the counter at 96(ctx) is already zero.
  396. MOVQ R14, (SP)
  397. MOVQ R9, AX
  398. SHRQ $0x08, AX
  399. MOVBQZX AL, AX // AX = bits 8-15 of the offset entry
  400. MOVQ ctx+16(FP), CX
  401. CMPQ 96(CX), $0x00
  402. JZ sequenceDecs_decode_56_amd64_skip_update
  403. // Update Literal Length State
  404. MOVBQZX DI, R14 // R14 = bit count for the next-state read
  405. SHRL $0x10, DI // DI = bits 16-31 of the entry; the 32-bit shift clears the upper half
  406. LEAQ (BX)(R14*1), CX
  407. MOVQ DX, R15
  408. MOVQ CX, BX // bits consumed += R14
  409. ROLQ CL, R15 // rotate the freshly consumed bits into the low end
  410. MOVL $0x00000001, BP
  411. MOVB R14, CL
  412. SHLL CL, BP
  413. DECL BP // BP = (1 << R14) - 1
  414. ANDQ BP, R15
  415. ADDQ R15, DI // DI = table index of the next state
  416. // Load ctx.llTable
  417. MOVQ ctx+16(FP), CX
  418. MOVQ (CX), CX
  419. MOVQ (CX)(DI*8), DI // table entries are 8 bytes wide
  420. // Update Match Length State
  421. MOVBQZX R8, R14
  422. SHRL $0x10, R8
  423. LEAQ (BX)(R14*1), CX
  424. MOVQ DX, R15
  425. MOVQ CX, BX
  426. ROLQ CL, R15
  427. MOVL $0x00000001, BP
  428. MOVB R14, CL
  429. SHLL CL, BP
  430. DECL BP
  431. ANDQ BP, R15
  432. ADDQ R15, R8
  433. // Load ctx.mlTable
  434. MOVQ ctx+16(FP), CX
  435. MOVQ 24(CX), CX
  436. MOVQ (CX)(R8*8), R8
  437. // Update Offset State
  438. MOVBQZX R9, R14
  439. SHRL $0x10, R9
  440. LEAQ (BX)(R14*1), CX
  441. MOVQ DX, R15
  442. MOVQ CX, BX
  443. ROLQ CL, R15
  444. MOVL $0x00000001, BP
  445. MOVB R14, CL
  446. SHLL CL, BP
  447. DECL BP
  448. ANDQ BP, R15
  449. ADDQ R15, R9
  450. // Load ctx.ofTable
  451. MOVQ ctx+16(FP), CX
  452. MOVQ 48(CX), CX
  453. MOVQ (CX)(R9*8), R9
  454. sequenceDecs_decode_56_amd64_skip_update:
  455. // Adjust offset
  // AX still holds the offset entry's extra-bit count. With more than one
  // extra bit the decoded value is a new offset and is pushed onto the
  // history (R11 newest). Otherwise the decoded value selects an entry
  // from the history.
  456. MOVQ 16(R10), CX
  457. CMPQ AX, $0x01
  458. JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
  459. MOVQ R12, R13
  460. MOVQ R11, R12
  461. MOVQ CX, R11
  462. JMP sequenceDecs_decode_56_amd64_after_adjust
  463. sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
  464. CMPQ (R10), $0x00000000
  465. JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
  466. INCQ CX // literal length is zero: the selector is shifted up by one
  467. JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
  468. sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
  469. TESTQ CX, CX
  470. JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
  471. MOVQ R11, CX // selector 0 with literals: reuse the newest offset, history unchanged
  472. JMP sequenceDecs_decode_56_amd64_after_adjust
  473. sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
  // Dispatch on the selector in CX: 0 -> R11, 1 -> R12, 2 -> R13, above 2 -> R11 - 1.
  474. CMPQ CX, $0x01
  475. JB sequenceDecs_decode_56_amd64_adjust_zero
  476. JEQ sequenceDecs_decode_56_amd64_adjust_one // reuses the flags of the CMPQ above
  477. CMPQ CX, $0x02
  478. JA sequenceDecs_decode_56_amd64_adjust_three
  479. JMP sequenceDecs_decode_56_amd64_adjust_two
  480. sequenceDecs_decode_56_amd64_adjust_zero:
  481. MOVQ R11, AX
  482. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  483. sequenceDecs_decode_56_amd64_adjust_one:
  484. MOVQ R12, AX
  485. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  486. sequenceDecs_decode_56_amd64_adjust_two:
  487. MOVQ R13, AX
  488. JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
  489. sequenceDecs_decode_56_amd64_adjust_three:
  490. LEAQ -1(R11), AX
  491. sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
  492. TESTQ AX, AX
  493. JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
  494. MOVQ $0x00000001, AX // a selected offset of zero is replaced by 1
  495. sequenceDecs_decode_56_amd64_adjust_temp_valid:
  496. CMPQ CX, $0x01
  497. CMOVQNE R12, R13 // selector 1 leaves the oldest entry alone; otherwise R12 moves down
  498. MOVQ R11, R12
  499. MOVQ AX, R11
  500. MOVQ AX, CX
  501. sequenceDecs_decode_56_amd64_after_adjust:
  502. MOVQ CX, 16(R10) // store the final offset back into the record
  503. // Check values
  504. MOVQ 8(R10), AX
  505. MOVQ (R10), R14
  506. LEAQ (AX)(R14*1), R15
  507. MOVQ s+0(FP), BP
  508. ADDQ R15, 256(BP) // running total of match length + literal length
  509. MOVQ ctx+16(FP), R15
  510. SUBQ R14, 128(R15) // charge the literals against the budget at 128(ctx)
  511. JS error_not_enough_literals
  512. CMPQ AX, $0x00020002
  513. JA sequenceDecs_decode_56_amd64_error_match_len_too_big
  514. TESTQ CX, CX
  515. JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
  516. TESTQ AX, AX
  517. JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch // zero offset requires zero match length
  518. sequenceDecs_decode_56_amd64_match_len_ofs_ok:
  519. ADDQ $0x18, R10 // advance to the next 24-byte record
  520. MOVQ ctx+16(FP), AX
  521. DECQ 96(AX)
  522. JNS sequenceDecs_decode_56_amd64_main_loop // loop until the counter goes negative
  // Success: write the offset history and bit-reader state back.
  523. MOVQ s+0(FP), AX
  524. MOVQ R11, 144(AX)
  525. MOVQ R12, 152(AX)
  526. MOVQ R13, 160(AX)
  527. MOVQ br+8(FP), AX
  528. MOVQ DX, 24(AX)
  529. MOVB BL, 32(AX)
  530. MOVQ SI, 8(AX)
  531. // Return success
  532. MOVQ $0x00000000, ret+24(FP)
  533. RET
  534. // Return with match length error
  535. sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
  536. MOVQ $0x00000001, ret+24(FP)
  537. RET
  538. // Return with match too long error
  539. sequenceDecs_decode_56_amd64_error_match_len_too_big:
  540. MOVQ $0x00000002, ret+24(FP)
  541. RET
  542. // Return with match offset too long error
  // Unlabelled and therefore unreachable in this function.
  543. MOVQ $0x00000003, ret+24(FP)
  544. RET
  545. // Return with not enough literals error
  546. error_not_enough_literals:
  547. MOVQ $0x00000004, ret+24(FP)
  548. RET
  549. // Return with overread error
  550. error_overread:
  551. MOVQ $0x00000006, ret+24(FP)
  552. RET
  553. // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  554. // Requires: BMI, BMI2, CMOV
  //
  // Sequence decode loop using BEXTR/BZHI/SHRX for the bit-field work. Each
  // iteration writes one 24-byte record through the pointer loaded from
  // 104(ctx): literal length at +0, match length at +8, offset at +16. The
  // loop counter is the quadword at 96(ctx).
  //
  // Return codes stored in ret+24(FP): 0 success, 1 non-zero match length
  // with zero offset, 2 match length above 0x20002, 4 literal budget at
  // 128(ctx) went negative, 6 more than 64 bits consumed with no input left.
  // The stub storing 3 has no label and is never jumped to here. State is
  // written back to br and s only on success.
  //
  // Register roles inside the loop:
  //   AX   64-bit bit buffer (24(br))      DX   bits already consumed (byte at 32(br))
  //   BX   remaining input bytes (8(br))   R13  input pointer, parked at (SP);
  //                                             reused afterwards as a scratch value
  //   SI, DI, R8   literal-length, match-length, offset state entries (72/80/88(ctx))
  //   R9   output record pointer           R10, R11, R12  three most recent offsets
  //
  // State entry layout as unpacked below: bits 0-7 next-state bit count,
  // bits 8-15 extra-bit count, bits 16-31 next-state base, bits 32-63 baseline.
  555. TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
  556. MOVQ br+8(FP), BX
  557. MOVQ 24(BX), AX
  558. MOVBQZX 32(BX), DX
  559. MOVQ (BX), CX
  560. MOVQ 8(BX), BX
  561. ADDQ BX, CX // CX = input base + remaining length
  562. MOVQ CX, (SP)
  563. MOVQ ctx+16(FP), CX
  564. MOVQ 72(CX), SI
  565. MOVQ 80(CX), DI
  566. MOVQ 88(CX), R8
  567. MOVQ 104(CX), R9
  568. MOVQ s+0(FP), CX
  569. MOVQ 144(CX), R10
  570. MOVQ 152(CX), R11
  571. MOVQ 160(CX), R12
  572. sequenceDecs_decode_bmi2_main_loop:
  573. MOVQ (SP), R13
  574. // Fill bitreader to have enough for the offset and match length.
  575. CMPQ BX, $0x08
  576. JL sequenceDecs_decode_bmi2_fill_byte_by_byte // fewer than 8 bytes left: slow path
  577. MOVQ DX, CX
  578. SHRQ $0x03, CX // CX = whole bytes consumed (bits / 8)
  579. SUBQ CX, R13 // input is read backwards: step the pointer down
  580. MOVQ (R13), AX // reload a full 8 bytes into the bit buffer
  581. SUBQ CX, BX
  582. ANDQ $0x07, DX // keep only the sub-byte bit position
  583. JMP sequenceDecs_decode_bmi2_fill_end
  584. sequenceDecs_decode_bmi2_fill_byte_by_byte:
  // Slow path: while input remains and at least 8 bits are consumed,
  // shift one more byte into the low end of the buffer.
  585. CMPQ BX, $0x00
  586. JLE sequenceDecs_decode_bmi2_fill_check_overread
  587. CMPQ DX, $0x07
  588. JLE sequenceDecs_decode_bmi2_fill_end
  589. SHLQ $0x08, AX
  590. SUBQ $0x01, R13
  591. SUBQ $0x01, BX
  592. SUBQ $0x08, DX
  593. MOVBQZX (R13), CX
  594. ORQ CX, AX
  595. JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
  596. sequenceDecs_decode_bmi2_fill_check_overread:
  597. CMPQ DX, $0x40
  598. JA error_overread // input exhausted and more than 64 bits consumed
  599. sequenceDecs_decode_bmi2_fill_end:
  600. // Update offset
  601. MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
  602. BEXTRQ CX, R8, R14 // R14 = extra-bit count (bits 8-15 of the entry)
  603. MOVQ AX, R15
  604. LEAQ (DX)(R14*1), CX
  605. ROLQ CL, R15 // rotate the bits being read into the low end
  606. BZHIQ R14, R15, R15 // keep only the low R14 bits
  607. MOVQ CX, DX // bits consumed += R14
  608. MOVQ R8, CX
  609. SHRQ $0x20, CX // CX = baseline (bits 32-63 of the entry)
  610. ADDQ R15, CX
  611. MOVQ CX, 16(R9)
  612. // Update match length
  613. MOVQ $0x00000808, CX
  614. BEXTRQ CX, DI, R14
  615. MOVQ AX, R15
  616. LEAQ (DX)(R14*1), CX
  617. ROLQ CL, R15
  618. BZHIQ R14, R15, R15
  619. MOVQ CX, DX
  620. MOVQ DI, CX
  621. SHRQ $0x20, CX
  622. ADDQ R15, CX
  623. MOVQ CX, 8(R9)
  624. // Fill bitreader to have enough for the remaining
  // Second refill of the iteration; identical in shape to the first one.
  625. CMPQ BX, $0x08
  626. JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
  627. MOVQ DX, CX
  628. SHRQ $0x03, CX
  629. SUBQ CX, R13
  630. MOVQ (R13), AX
  631. SUBQ CX, BX
  632. ANDQ $0x07, DX
  633. JMP sequenceDecs_decode_bmi2_fill_2_end
  634. sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
  635. CMPQ BX, $0x00
  636. JLE sequenceDecs_decode_bmi2_fill_2_check_overread
  637. CMPQ DX, $0x07
  638. JLE sequenceDecs_decode_bmi2_fill_2_end
  639. SHLQ $0x08, AX
  640. SUBQ $0x01, R13
  641. SUBQ $0x01, BX
  642. SUBQ $0x08, DX
  643. MOVBQZX (R13), CX
  644. ORQ CX, AX
  645. JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
  646. sequenceDecs_decode_bmi2_fill_2_check_overread:
  647. CMPQ DX, $0x40
  648. JA error_overread
  649. sequenceDecs_decode_bmi2_fill_2_end:
  650. // Update literal length
  651. MOVQ $0x00000808, CX
  652. BEXTRQ CX, SI, R14
  653. MOVQ AX, R15
  654. LEAQ (DX)(R14*1), CX
  655. ROLQ CL, R15
  656. BZHIQ R14, R15, R15
  657. MOVQ CX, DX
  658. MOVQ SI, CX
  659. SHRQ $0x20, CX
  660. ADDQ R15, CX
  661. MOVQ CX, (R9)
  662. // Fill bitreader for state updates
  // No refill is performed here: the input pointer is parked, R13 is reused
  // for the offset entry's extra-bit count (needed by "Adjust offset"), and
  // the state update is skipped when the counter at 96(ctx) is already zero.
  663. MOVQ R13, (SP)
  664. MOVQ $0x00000808, CX
  665. BEXTRQ CX, R8, R13
  666. MOVQ ctx+16(FP), CX
  667. CMPQ 96(CX), $0x00
  668. JZ sequenceDecs_decode_bmi2_skip_update
  // Read the next-state bits of all three states in one go. The low byte
  // of SI+DI+R8 equals the sum of the three per-state bit counts (mod 256).
  669. LEAQ (SI)(DI*1), R14
  670. ADDQ R8, R14
  671. MOVBQZX R14, R14
  672. LEAQ (DX)(R14*1), CX
  673. MOVQ AX, R15
  674. MOVQ CX, DX // bits consumed += total
  675. ROLQ CL, R15
  676. BZHIQ R14, R15, R15 // R15 = the combined next-state bits
  677. // Update Offset State
  678. BZHIQ R8, R15, CX // BZHI takes its index from the low byte of R8
  679. SHRXQ R8, R15, R15 // discard the offset's bits; SHRX uses the low 6 bits of R8
  680. SHRL $0x10, R8 // R8 = bits 16-31 of the entry; the 32-bit shift clears the upper half
  681. ADDQ CX, R8 // R8 = table index of the next state
  682. // Load ctx.ofTable
  683. MOVQ ctx+16(FP), CX
  684. MOVQ 48(CX), CX
  685. MOVQ (CX)(R8*8), R8 // table entries are 8 bytes wide
  686. // Update Match Length State
  687. BZHIQ DI, R15, CX
  688. SHRXQ DI, R15, R15
  689. SHRL $0x10, DI
  690. ADDQ CX, DI
  691. // Load ctx.mlTable
  692. MOVQ ctx+16(FP), CX
  693. MOVQ 24(CX), CX
  694. MOVQ (CX)(DI*8), DI
  695. // Update Literal Length State
  // Last consumer of R15, so no shift is needed afterwards.
  696. BZHIQ SI, R15, CX
  697. SHRL $0x10, SI
  698. ADDQ CX, SI
  699. // Load ctx.llTable
  700. MOVQ ctx+16(FP), CX
  701. MOVQ (CX), CX
  702. MOVQ (CX)(SI*8), SI
  703. sequenceDecs_decode_bmi2_skip_update:
  704. // Adjust offset
  // R13 holds the offset entry's extra-bit count. With more than one extra
  // bit the decoded value is a new offset and is pushed onto the history
  // (R10 newest). Otherwise the decoded value selects an entry from the
  // history.
  705. MOVQ 16(R9), CX
  706. CMPQ R13, $0x01
  707. JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
  708. MOVQ R11, R12
  709. MOVQ R10, R11
  710. MOVQ CX, R10
  711. JMP sequenceDecs_decode_bmi2_after_adjust
  712. sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
  713. CMPQ (R9), $0x00000000
  714. JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
  715. INCQ CX // literal length is zero: the selector is shifted up by one
  716. JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
  717. sequenceDecs_decode_bmi2_adjust_offset_maybezero:
  718. TESTQ CX, CX
  719. JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
  720. MOVQ R10, CX // selector 0 with literals: reuse the newest offset, history unchanged
  721. JMP sequenceDecs_decode_bmi2_after_adjust
  722. sequenceDecs_decode_bmi2_adjust_offset_nonzero:
  // Dispatch on the selector in CX: 0 -> R10, 1 -> R11, 2 -> R12, above 2 -> R10 - 1.
  723. CMPQ CX, $0x01
  724. JB sequenceDecs_decode_bmi2_adjust_zero
  725. JEQ sequenceDecs_decode_bmi2_adjust_one // reuses the flags of the CMPQ above
  726. CMPQ CX, $0x02
  727. JA sequenceDecs_decode_bmi2_adjust_three
  728. JMP sequenceDecs_decode_bmi2_adjust_two
  729. sequenceDecs_decode_bmi2_adjust_zero:
  730. MOVQ R10, R13
  731. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  732. sequenceDecs_decode_bmi2_adjust_one:
  733. MOVQ R11, R13
  734. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  735. sequenceDecs_decode_bmi2_adjust_two:
  736. MOVQ R12, R13
  737. JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
  738. sequenceDecs_decode_bmi2_adjust_three:
  739. LEAQ -1(R10), R13
  740. sequenceDecs_decode_bmi2_adjust_test_temp_valid:
  741. TESTQ R13, R13
  742. JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
  743. MOVQ $0x00000001, R13 // a selected offset of zero is replaced by 1
  744. sequenceDecs_decode_bmi2_adjust_temp_valid:
  745. CMPQ CX, $0x01
  746. CMOVQNE R11, R12 // selector 1 leaves the oldest entry alone; otherwise R11 moves down
  747. MOVQ R10, R11
  748. MOVQ R13, R10
  749. MOVQ R13, CX
  750. sequenceDecs_decode_bmi2_after_adjust:
  751. MOVQ CX, 16(R9) // store the final offset back into the record
  752. // Check values
  753. MOVQ 8(R9), R13
  754. MOVQ (R9), R14
  755. LEAQ (R13)(R14*1), R15
  756. MOVQ s+0(FP), BP
  757. ADDQ R15, 256(BP) // running total of match length + literal length
  758. MOVQ ctx+16(FP), R15
  759. SUBQ R14, 128(R15) // charge the literals against the budget at 128(ctx)
  760. JS error_not_enough_literals
  761. CMPQ R13, $0x00020002
  762. JA sequenceDecs_decode_bmi2_error_match_len_too_big
  763. TESTQ CX, CX
  764. JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
  765. TESTQ R13, R13
  766. JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch // zero offset requires zero match length
  767. sequenceDecs_decode_bmi2_match_len_ofs_ok:
  768. ADDQ $0x18, R9 // advance to the next 24-byte record
  769. MOVQ ctx+16(FP), CX
  770. DECQ 96(CX)
  771. JNS sequenceDecs_decode_bmi2_main_loop // loop until the counter goes negative
  // Success: write the offset history and bit-reader state back.
  772. MOVQ s+0(FP), CX
  773. MOVQ R10, 144(CX)
  774. MOVQ R11, 152(CX)
  775. MOVQ R12, 160(CX)
  776. MOVQ br+8(FP), CX
  777. MOVQ AX, 24(CX)
  778. MOVB DL, 32(CX)
  779. MOVQ BX, 8(CX)
  780. // Return success
  781. MOVQ $0x00000000, ret+24(FP)
  782. RET
  783. // Return with match length error
  784. sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
  785. MOVQ $0x00000001, ret+24(FP)
  786. RET
  787. // Return with match too long error
  788. sequenceDecs_decode_bmi2_error_match_len_too_big:
  789. MOVQ $0x00000002, ret+24(FP)
  790. RET
  791. // Return with match offset too long error
  // Unlabelled and therefore unreachable in this function.
  792. MOVQ $0x00000003, ret+24(FP)
  793. RET
  794. // Return with not enough literals error
  795. error_not_enough_literals:
  796. MOVQ $0x00000004, ret+24(FP)
  797. RET
  798. // Return with overread error
  799. error_overread:
  800. MOVQ $0x00000006, ret+24(FP)
  801. RET
  802. // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
  803. // Requires: BMI, BMI2, CMOV
  //
  // BMI2 sequence decode loop with a single bit-reader refill per iteration:
  // offset, match length and literal length are all extracted from the one
  // refill at the top of the loop.
  // NOTE(review): the "56" presumably refers to a bound on the combined extra
  // bits that makes one refill sufficient; confirm against the Go caller.
  //
  // Each iteration writes one 24-byte record through the pointer loaded from
  // 104(ctx): literal length at +0, match length at +8, offset at +16. The
  // loop counter is the quadword at 96(ctx).
  //
  // Return codes stored in ret+24(FP): 0 success, 1 non-zero match length
  // with zero offset, 2 match length above 0x20002, 4 literal budget at
  // 128(ctx) went negative, 6 more than 64 bits consumed with no input left.
  // The stub storing 3 has no label and is never jumped to here. State is
  // written back to br and s only on success.
  //
  // Register roles inside the loop:
  //   AX   64-bit bit buffer (24(br))      DX   bits already consumed (byte at 32(br))
  //   BX   remaining input bytes (8(br))   R13  input pointer, parked at (SP);
  //                                             reused afterwards as a scratch value
  //   SI, DI, R8   literal-length, match-length, offset state entries (72/80/88(ctx))
  //   R9   output record pointer           R10, R11, R12  three most recent offsets
  //
  // State entry layout as unpacked below: bits 0-7 next-state bit count,
  // bits 8-15 extra-bit count, bits 16-31 next-state base, bits 32-63 baseline.
  804. TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
  805. MOVQ br+8(FP), BX
  806. MOVQ 24(BX), AX
  807. MOVBQZX 32(BX), DX
  808. MOVQ (BX), CX
  809. MOVQ 8(BX), BX
  810. ADDQ BX, CX // CX = input base + remaining length
  811. MOVQ CX, (SP)
  812. MOVQ ctx+16(FP), CX
  813. MOVQ 72(CX), SI
  814. MOVQ 80(CX), DI
  815. MOVQ 88(CX), R8
  816. MOVQ 104(CX), R9
  817. MOVQ s+0(FP), CX
  818. MOVQ 144(CX), R10
  819. MOVQ 152(CX), R11
  820. MOVQ 160(CX), R12
  821. sequenceDecs_decode_56_bmi2_main_loop:
  822. MOVQ (SP), R13
  823. // Fill bitreader to have enough for the offset and match length.
  824. CMPQ BX, $0x08
  825. JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte // fewer than 8 bytes left: slow path
  826. MOVQ DX, CX
  827. SHRQ $0x03, CX // CX = whole bytes consumed (bits / 8)
  828. SUBQ CX, R13 // input is read backwards: step the pointer down
  829. MOVQ (R13), AX // reload a full 8 bytes into the bit buffer
  830. SUBQ CX, BX
  831. ANDQ $0x07, DX // keep only the sub-byte bit position
  832. JMP sequenceDecs_decode_56_bmi2_fill_end
  833. sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
  // Slow path: while input remains and at least 8 bits are consumed,
  // shift one more byte into the low end of the buffer.
  834. CMPQ BX, $0x00
  835. JLE sequenceDecs_decode_56_bmi2_fill_check_overread
  836. CMPQ DX, $0x07
  837. JLE sequenceDecs_decode_56_bmi2_fill_end
  838. SHLQ $0x08, AX
  839. SUBQ $0x01, R13
  840. SUBQ $0x01, BX
  841. SUBQ $0x08, DX
  842. MOVBQZX (R13), CX
  843. ORQ CX, AX
  844. JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
  845. sequenceDecs_decode_56_bmi2_fill_check_overread:
  846. CMPQ DX, $0x40
  847. JA error_overread // input exhausted and more than 64 bits consumed
  848. sequenceDecs_decode_56_bmi2_fill_end:
  849. // Update offset
  850. MOVQ $0x00000808, CX // BEXTR control: start bit 8, length 8
  851. BEXTRQ CX, R8, R14 // R14 = extra-bit count (bits 8-15 of the entry)
  852. MOVQ AX, R15
  853. LEAQ (DX)(R14*1), CX
  854. ROLQ CL, R15 // rotate the bits being read into the low end
  855. BZHIQ R14, R15, R15 // keep only the low R14 bits
  856. MOVQ CX, DX // bits consumed += R14
  857. MOVQ R8, CX
  858. SHRQ $0x20, CX // CX = baseline (bits 32-63 of the entry)
  859. ADDQ R15, CX
  860. MOVQ CX, 16(R9)
  861. // Update match length
  862. MOVQ $0x00000808, CX
  863. BEXTRQ CX, DI, R14
  864. MOVQ AX, R15
  865. LEAQ (DX)(R14*1), CX
  866. ROLQ CL, R15
  867. BZHIQ R14, R15, R15
  868. MOVQ CX, DX
  869. MOVQ DI, CX
  870. SHRQ $0x20, CX
  871. ADDQ R15, CX
  872. MOVQ CX, 8(R9)
  873. // Update literal length
  // Continues from the same buffer contents; no refill in between.
  874. MOVQ $0x00000808, CX
  875. BEXTRQ CX, SI, R14
  876. MOVQ AX, R15
  877. LEAQ (DX)(R14*1), CX
  878. ROLQ CL, R15
  879. BZHIQ R14, R15, R15
  880. MOVQ CX, DX
  881. MOVQ SI, CX
  882. SHRQ $0x20, CX
  883. ADDQ R15, CX
  884. MOVQ CX, (R9)
  885. // Fill bitreader for state updates
  // No refill is performed here: the input pointer is parked, R13 is reused
  // for the offset entry's extra-bit count (needed by "Adjust offset"), and
  // the state update is skipped when the counter at 96(ctx) is already zero.
  886. MOVQ R13, (SP)
  887. MOVQ $0x00000808, CX
  888. BEXTRQ CX, R8, R13
  889. MOVQ ctx+16(FP), CX
  890. CMPQ 96(CX), $0x00
  891. JZ sequenceDecs_decode_56_bmi2_skip_update
  // Read the next-state bits of all three states in one go. The low byte
  // of SI+DI+R8 equals the sum of the three per-state bit counts (mod 256).
  892. LEAQ (SI)(DI*1), R14
  893. ADDQ R8, R14
  894. MOVBQZX R14, R14
  895. LEAQ (DX)(R14*1), CX
  896. MOVQ AX, R15
  897. MOVQ CX, DX // bits consumed += total
  898. ROLQ CL, R15
  899. BZHIQ R14, R15, R15 // R15 = the combined next-state bits
  900. // Update Offset State
  901. BZHIQ R8, R15, CX // BZHI takes its index from the low byte of R8
  902. SHRXQ R8, R15, R15 // discard the offset's bits; SHRX uses the low 6 bits of R8
  903. SHRL $0x10, R8 // R8 = bits 16-31 of the entry; the 32-bit shift clears the upper half
  904. ADDQ CX, R8 // R8 = table index of the next state
  905. // Load ctx.ofTable
  906. MOVQ ctx+16(FP), CX
  907. MOVQ 48(CX), CX
  908. MOVQ (CX)(R8*8), R8 // table entries are 8 bytes wide
  909. // Update Match Length State
  910. BZHIQ DI, R15, CX
  911. SHRXQ DI, R15, R15
  912. SHRL $0x10, DI
  913. ADDQ CX, DI
  914. // Load ctx.mlTable
  915. MOVQ ctx+16(FP), CX
  916. MOVQ 24(CX), CX
  917. MOVQ (CX)(DI*8), DI
  918. // Update Literal Length State
  // Last consumer of R15, so no shift is needed afterwards.
  919. BZHIQ SI, R15, CX
  920. SHRL $0x10, SI
  921. ADDQ CX, SI
  922. // Load ctx.llTable
  923. MOVQ ctx+16(FP), CX
  924. MOVQ (CX), CX
  925. MOVQ (CX)(SI*8), SI
  926. sequenceDecs_decode_56_bmi2_skip_update:
  927. // Adjust offset
  // R13 holds the offset entry's extra-bit count. With more than one extra
  // bit the decoded value is a new offset and is pushed onto the history
  // (R10 newest). Otherwise the decoded value selects an entry from the
  // history.
  928. MOVQ 16(R9), CX
  929. CMPQ R13, $0x01
  930. JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
  931. MOVQ R11, R12
  932. MOVQ R10, R11
  933. MOVQ CX, R10
  934. JMP sequenceDecs_decode_56_bmi2_after_adjust
  935. sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
  936. CMPQ (R9), $0x00000000
  937. JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
  938. INCQ CX // literal length is zero: the selector is shifted up by one
  939. JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  940. sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
  941. TESTQ CX, CX
  942. JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
  943. MOVQ R10, CX // selector 0 with literals: reuse the newest offset, history unchanged
  944. JMP sequenceDecs_decode_56_bmi2_after_adjust
  945. sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
  // Dispatch on the selector in CX: 0 -> R10, 1 -> R11, 2 -> R12, above 2 -> R10 - 1.
  946. CMPQ CX, $0x01
  947. JB sequenceDecs_decode_56_bmi2_adjust_zero
  948. JEQ sequenceDecs_decode_56_bmi2_adjust_one // reuses the flags of the CMPQ above
  949. CMPQ CX, $0x02
  950. JA sequenceDecs_decode_56_bmi2_adjust_three
  951. JMP sequenceDecs_decode_56_bmi2_adjust_two
  952. sequenceDecs_decode_56_bmi2_adjust_zero:
  953. MOVQ R10, R13
  954. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  955. sequenceDecs_decode_56_bmi2_adjust_one:
  956. MOVQ R11, R13
  957. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  958. sequenceDecs_decode_56_bmi2_adjust_two:
  959. MOVQ R12, R13
  960. JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
  961. sequenceDecs_decode_56_bmi2_adjust_three:
  962. LEAQ -1(R10), R13
  963. sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
  964. TESTQ R13, R13
  965. JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
  966. MOVQ $0x00000001, R13 // a selected offset of zero is replaced by 1
  967. sequenceDecs_decode_56_bmi2_adjust_temp_valid:
  968. CMPQ CX, $0x01
  969. CMOVQNE R11, R12 // selector 1 leaves the oldest entry alone; otherwise R11 moves down
  970. MOVQ R10, R11
  971. MOVQ R13, R10
  972. MOVQ R13, CX
  973. sequenceDecs_decode_56_bmi2_after_adjust:
  974. MOVQ CX, 16(R9) // store the final offset back into the record
  975. // Check values
  976. MOVQ 8(R9), R13
  977. MOVQ (R9), R14
  978. LEAQ (R13)(R14*1), R15
  979. MOVQ s+0(FP), BP
  980. ADDQ R15, 256(BP) // running total of match length + literal length
  981. MOVQ ctx+16(FP), R15
  982. SUBQ R14, 128(R15) // charge the literals against the budget at 128(ctx)
  983. JS error_not_enough_literals
  984. CMPQ R13, $0x00020002
  985. JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
  986. TESTQ CX, CX
  987. JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
  988. TESTQ R13, R13
  989. JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch // zero offset requires zero match length
  990. sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
  991. ADDQ $0x18, R9 // advance to the next 24-byte record
  992. MOVQ ctx+16(FP), CX
  993. DECQ 96(CX)
  994. JNS sequenceDecs_decode_56_bmi2_main_loop // loop until the counter goes negative
  // Success: write the offset history and bit-reader state back.
  995. MOVQ s+0(FP), CX
  996. MOVQ R10, 144(CX)
  997. MOVQ R11, 152(CX)
  998. MOVQ R12, 160(CX)
  999. MOVQ br+8(FP), CX
  1000. MOVQ AX, 24(CX)
  1001. MOVB DL, 32(CX)
  1002. MOVQ BX, 8(CX)
  1003. // Return success
  1004. MOVQ $0x00000000, ret+24(FP)
  1005. RET
  1006. // Return with match length error
  1007. sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
  1008. MOVQ $0x00000001, ret+24(FP)
  1009. RET
  1010. // Return with match too long error
  1011. sequenceDecs_decode_56_bmi2_error_match_len_too_big:
  1012. MOVQ $0x00000002, ret+24(FP)
  1013. RET
  1014. // Return with match offset too long error
  // Unlabelled and therefore unreachable in this function.
  1015. MOVQ $0x00000003, ret+24(FP)
  1016. RET
  1017. // Return with not enough literals error
  1018. error_not_enough_literals:
  1019. MOVQ $0x00000004, ret+24(FP)
  1020. RET
  1021. // Return with overread error
  1022. error_overread:
  1023. MOVQ $0x00000006, ret+24(FP)
  1024. RET
  1025. // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
  1026. // Requires: SSE
  //
  // Executes already-decoded sequences: for every 24-byte sequence record it
  // copies the literals and then the match (from the history buffer and/or the
  // bytes already written) to the output. Returns true when all sequences were
  // processed or when 8(ctx) is zero, false when a match offset fails the
  // check_offset test. On both loop exits the sequence index, output position
  // and number of literal bytes consumed are written back to ctx.
  //
  // This variant copies literals and non-overlapping matches in whole 16-byte
  // MOVUPS chunks, so it may read and write up to 15 bytes past the requested
  // length. NOTE(review): the caller presumably guarantees that slack; the
  // _safe variant in this file copies exact lengths instead.
  //
  // Register roles inside the loop:
  //   AX  pointer to the current sequence record
  //   CX  8(ctx), upper bound for the sequence index (presumably len(seqs) - confirm)
  //   DX  sequence index
  //   BX  output write pointer (outBase + outPosition)
  //   SI  literals read pointer
  //   DI  outPosition
  //   R8  120(ctx), limit for the match offset (window size per the check_offset comment)
  //   R9  56(ctx) + 64(ctx), used as the end of the history buffer
  //   R10 64(ctx), history length per the check_offset comment
  //   R11 literal length, later scratch; R12 match offset; R13 match length
  //   R14, R15, BP and X0 are scratch
  1027. TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
  1028. MOVQ ctx+0(FP), R10
  1029. MOVQ 8(R10), CX
  // Nothing to execute when 8(ctx) is zero.
  1030. TESTQ CX, CX
  1031. JZ empty_seqs
  1032. MOVQ (R10), AX
  1033. MOVQ 24(R10), DX
  1034. MOVQ 32(R10), BX
  1035. MOVQ 80(R10), SI
  1036. MOVQ 104(R10), DI
  1037. MOVQ 120(R10), R8
  1038. MOVQ 56(R10), R9
  // R10 stops holding ctx from here on; ctx is reloaded from the frame on exit.
  1039. MOVQ 64(R10), R10
  1040. ADDQ R10, R9
  1041. // seqsBase += 24 * seqIndex
  1042. LEAQ (DX)(DX*2), R11
  1043. SHLQ $0x03, R11
  1044. ADDQ R11, AX
  1045. // outBase += outPosition
  1046. ADDQ DI, BX
  1047. main_loop:
  // Load the record: R11 = literal length (+0), R13 = match length (+8),
  // R12 = match offset (+16).
  1048. MOVQ (AX), R11
  1049. MOVQ 16(AX), R12
  1050. MOVQ 8(AX), R13
  1051. // Copy literals
  1052. TESTQ R11, R11
  1053. JZ check_offset
  1054. XORQ R14, R14
  // Copies 16 bytes per iteration until R14 >= literal length, i.e. the length
  // rounded up to a multiple of 16.
  1055. copy_1:
  1056. MOVUPS (SI)(R14*1), X0
  1057. MOVUPS X0, (BX)(R14*1)
  1058. ADDQ $0x10, R14
  1059. CMPQ R14, R11
  1060. JB copy_1
  // Advance only by the real literal length.
  1061. ADDQ R11, SI
  1062. ADDQ R11, BX
  1063. ADDQ R11, DI
  1064. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  1065. check_offset:
  // Both comparisons are signed (JG).
  1066. LEAQ (DI)(R10*1), R11
  1067. CMPQ R12, R11
  1068. JG error_match_off_too_big
  1069. CMPQ R12, R8
  1070. JG error_match_off_too_big
  1071. // Copy match from history
  // R11 = mo - outPosition. If mo <= outPosition (unsigned) the whole match
  // lies in the current output buffer.
  1072. MOVQ R12, R11
  1073. SUBQ DI, R11
  1074. JLS copy_match
  // R14 = source pointer, R11 bytes before the end of the history buffer.
  1075. MOVQ R9, R14
  1076. SUBQ R11, R14
  // If the match is longer than what history provides, copy those R11 bytes
  // and take the rest from the current buffer.
  1077. CMPQ R13, R11
  1078. JG copy_all_from_history
  // Whole match comes from history: exact-length copy of R13 bytes.
  1079. MOVQ R13, R11
  1080. SUBQ $0x10, R11
  1081. JB copy_4_small
  1082. copy_4_loop:
  1083. MOVUPS (R14), X0
  1084. MOVUPS X0, (BX)
  1085. ADDQ $0x10, R14
  1086. ADDQ $0x10, BX
  1087. SUBQ $0x10, R11
  1088. JAE copy_4_loop
  // R11 is now negative: move both pointers to the exact end and copy the last
  // 16 bytes again so the tail is covered without exceeding the length.
  1089. LEAQ 16(R14)(R11*1), R14
  1090. LEAQ 16(BX)(R11*1), BX
  1091. MOVUPS -16(R14), X0
  1092. MOVUPS X0, -16(BX)
  1093. JMP copy_4_end
  // Lengths below 16: two possibly overlapping loads/stores of a fixed width.
  // NOTE(review): unlike copy_5 there is no 1-or-2 byte case; a length below 3
  // would take the 4through7 path. Presumably ml >= 3 here - confirm.
  1094. copy_4_small:
  1095. CMPQ R13, $0x03
  1096. JE copy_4_move_3
  1097. CMPQ R13, $0x08
  1098. JB copy_4_move_4through7
  1099. JMP copy_4_move_8through16
  1100. copy_4_move_3:
  1101. MOVW (R14), R11
  1102. MOVB 2(R14), R12
  1103. MOVW R11, (BX)
  1104. MOVB R12, 2(BX)
  1105. ADDQ R13, R14
  1106. ADDQ R13, BX
  1107. JMP copy_4_end
  1108. copy_4_move_4through7:
  1109. MOVL (R14), R11
  1110. MOVL -4(R14)(R13*1), R12
  1111. MOVL R11, (BX)
  1112. MOVL R12, -4(BX)(R13*1)
  1113. ADDQ R13, R14
  1114. ADDQ R13, BX
  1115. JMP copy_4_end
  1116. copy_4_move_8through16:
  1117. MOVQ (R14), R11
  1118. MOVQ -8(R14)(R13*1), R12
  1119. MOVQ R11, (BX)
  1120. MOVQ R12, -8(BX)(R13*1)
  1121. ADDQ R13, R14
  1122. ADDQ R13, BX
  1123. copy_4_end:
  // Sequence done: outPosition += ml, advance to the next record.
  1124. ADDQ R13, DI
  1125. ADDQ $0x18, AX
  1126. INCQ DX
  1127. CMPQ DX, CX
  1128. JB main_loop
  1129. JMP loop_finished
  // Exact-length copy of R11 bytes from history (R15 counts, BP is scratch),
  // then fall through to copy_match for the remaining R13 - R11 bytes.
  1130. copy_all_from_history:
  1131. MOVQ R11, R15
  1132. SUBQ $0x10, R15
  1133. JB copy_5_small
  1134. copy_5_loop:
  1135. MOVUPS (R14), X0
  1136. MOVUPS X0, (BX)
  1137. ADDQ $0x10, R14
  1138. ADDQ $0x10, BX
  1139. SUBQ $0x10, R15
  1140. JAE copy_5_loop
  1141. LEAQ 16(R14)(R15*1), R14
  1142. LEAQ 16(BX)(R15*1), BX
  1143. MOVUPS -16(R14), X0
  1144. MOVUPS X0, -16(BX)
  1145. JMP copy_5_end
  1146. copy_5_small:
  // The flags of the compare with 3 serve both JE and JB.
  1147. CMPQ R11, $0x03
  1148. JE copy_5_move_3
  1149. JB copy_5_move_1or2
  1150. CMPQ R11, $0x08
  1151. JB copy_5_move_4through7
  1152. JMP copy_5_move_8through16
  1153. copy_5_move_1or2:
  1154. MOVB (R14), R15
  1155. MOVB -1(R14)(R11*1), BP
  1156. MOVB R15, (BX)
  1157. MOVB BP, -1(BX)(R11*1)
  1158. ADDQ R11, R14
  1159. ADDQ R11, BX
  1160. JMP copy_5_end
  1161. copy_5_move_3:
  1162. MOVW (R14), R15
  1163. MOVB 2(R14), BP
  1164. MOVW R15, (BX)
  1165. MOVB BP, 2(BX)
  1166. ADDQ R11, R14
  1167. ADDQ R11, BX
  1168. JMP copy_5_end
  1169. copy_5_move_4through7:
  1170. MOVL (R14), R15
  1171. MOVL -4(R14)(R11*1), BP
  1172. MOVL R15, (BX)
  1173. MOVL BP, -4(BX)(R11*1)
  1174. ADDQ R11, R14
  1175. ADDQ R11, BX
  1176. JMP copy_5_end
  1177. copy_5_move_8through16:
  1178. MOVQ (R14), R15
  1179. MOVQ -8(R14)(R11*1), BP
  1180. MOVQ R15, (BX)
  1181. MOVQ BP, -8(BX)(R11*1)
  1182. ADDQ R11, R14
  1183. ADDQ R11, BX
  1184. copy_5_end:
  // Account for the history part; R13 = bytes still to copy.
  1185. ADDQ R11, DI
  1186. SUBQ R11, R13
  1187. // Copy match from the current buffer
  1188. copy_match:
  // R11 = source pointer = write pointer - match offset.
  1189. MOVQ BX, R11
  1190. SUBQ R12, R11
  1191. // ml <= mo
  1192. CMPQ R13, R12
  1193. JA copy_overlapping_match
  1194. // Copy non-overlapping match
  // R12 becomes the running destination; BX is advanced to its final value up
  // front. The loop moves 16 bytes per iteration while bytes remain, so it may
  // write past the end of the match.
  1195. ADDQ R13, DI
  1196. MOVQ BX, R12
  1197. ADDQ R13, BX
  1198. copy_2:
  1199. MOVUPS (R11), X0
  1200. MOVUPS X0, (R12)
  1201. ADDQ $0x10, R11
  1202. ADDQ $0x10, R12
  1203. SUBQ $0x10, R13
  1204. JHI copy_2
  1205. JMP handle_loop
  1206. // Copy overlapping match
  1207. copy_overlapping_match:
  1208. ADDQ R13, DI
  // Byte-by-byte so that bytes written earlier in this loop can be read again.
  1209. copy_slow_3:
  1210. MOVB (R11), R12
  1211. MOVB R12, (BX)
  1212. INCQ R11
  1213. INCQ BX
  1214. DECQ R13
  1215. JNZ copy_slow_3
  1216. handle_loop:
  // Advance to the next 24-byte record while seqIndex < CX.
  1217. ADDQ $0x18, AX
  1218. INCQ DX
  1219. CMPQ DX, CX
  1220. JB main_loop
  1221. loop_finished:
  1222. // Return value
  1223. MOVB $0x01, ret+8(FP)
  1224. // Update the context
  // 24(ctx) = sequence index, 104(ctx) = outPosition,
  // 112(ctx) = literals pointer - 80(ctx), i.e. literal bytes consumed.
  1225. MOVQ ctx+0(FP), AX
  1226. MOVQ DX, 24(AX)
  1227. MOVQ DI, 104(AX)
  1228. SUBQ 80(AX), SI
  1229. MOVQ SI, 112(AX)
  1230. RET
  1231. error_match_off_too_big:
  1232. // Return value
  1233. MOVB $0x00, ret+8(FP)
  1234. // Update the context
  // Same write-back as on success; DX still indexes the offending sequence.
  1235. MOVQ ctx+0(FP), AX
  1236. MOVQ DX, 24(AX)
  1237. MOVQ DI, 104(AX)
  1238. SUBQ 80(AX), SI
  1239. MOVQ SI, 112(AX)
  1240. RET
  1241. empty_seqs:
  1242. // Return value
  // ctx is left untouched on this path.
  1243. MOVB $0x01, ret+8(FP)
  1244. RET
  1245. // func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
  1246. // Requires: SSE
  //
  // Same job as sequenceDecs_executeSimple_amd64: for every 24-byte sequence
  // record copy the literals and then the match to the output. It returns true
  // when all sequences were processed or 8(ctx) is zero, and false when a match
  // offset fails the check_offset test. The difference is that literals and
  // non-overlapping matches are copied with exact-length routines (16-byte loop
  // plus an overlapping tail, or a size-specific small copy), so no byte beyond
  // the requested length is read or written by those copies.
  //
  // Register roles inside the loop:
  //   AX  pointer to the current sequence record
  //   CX  8(ctx), upper bound for the sequence index (presumably len(seqs) - confirm)
  //   DX  sequence index
  //   BX  output write pointer (outBase + outPosition)
  //   SI  literals read pointer
  //   DI  outPosition
  //   R8  120(ctx), limit for the match offset (window size per the check_offset comment)
  //   R9  56(ctx) + 64(ctx), used as the end of the history buffer
  //   R10 64(ctx), history length per the check_offset comment
  //   R11 literal length, later scratch; R12 match offset; R13 match length
  //   R14, R15, BP and X0 are scratch
  1247. TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
  1248. MOVQ ctx+0(FP), R10
  1249. MOVQ 8(R10), CX
  // Nothing to execute when 8(ctx) is zero.
  1250. TESTQ CX, CX
  1251. JZ empty_seqs
  1252. MOVQ (R10), AX
  1253. MOVQ 24(R10), DX
  1254. MOVQ 32(R10), BX
  1255. MOVQ 80(R10), SI
  1256. MOVQ 104(R10), DI
  1257. MOVQ 120(R10), R8
  1258. MOVQ 56(R10), R9
  // R10 stops holding ctx from here on; ctx is reloaded from the frame on exit.
  1259. MOVQ 64(R10), R10
  1260. ADDQ R10, R9
  1261. // seqsBase += 24 * seqIndex
  1262. LEAQ (DX)(DX*2), R11
  1263. SHLQ $0x03, R11
  1264. ADDQ R11, AX
  1265. // outBase += outPosition
  1266. ADDQ DI, BX
  1267. main_loop:
  // Load the record: R11 = literal length (+0), R13 = match length (+8),
  // R12 = match offset (+16).
  1268. MOVQ (AX), R11
  1269. MOVQ 16(AX), R12
  1270. MOVQ 8(AX), R13
  1271. // Copy literals
  1272. TESTQ R11, R11
  1273. JZ check_offset
  // Exact-length copy of R11 bytes; R14 = length - 16 selects loop vs. small copy.
  1274. MOVQ R11, R14
  1275. SUBQ $0x10, R14
  1276. JB copy_1_small
  1277. copy_1_loop:
  1278. MOVUPS (SI), X0
  1279. MOVUPS X0, (BX)
  1280. ADDQ $0x10, SI
  1281. ADDQ $0x10, BX
  1282. SUBQ $0x10, R14
  1283. JAE copy_1_loop
  // R14 is now negative: step both pointers to the exact end and re-copy the
  // last 16 bytes so the tail is covered without exceeding the length.
  1284. LEAQ 16(SI)(R14*1), SI
  1285. LEAQ 16(BX)(R14*1), BX
  1286. MOVUPS -16(SI), X0
  1287. MOVUPS X0, -16(BX)
  1288. JMP copy_1_end
  // Lengths 1..15: two possibly overlapping loads/stores of a fixed width.
  1289. copy_1_small:
  // The flags of the compare with 3 serve both JE and JB.
  1290. CMPQ R11, $0x03
  1291. JE copy_1_move_3
  1292. JB copy_1_move_1or2
  1293. CMPQ R11, $0x08
  1294. JB copy_1_move_4through7
  1295. JMP copy_1_move_8through16
  1296. copy_1_move_1or2:
  1297. MOVB (SI), R14
  1298. MOVB -1(SI)(R11*1), R15
  1299. MOVB R14, (BX)
  1300. MOVB R15, -1(BX)(R11*1)
  1301. ADDQ R11, SI
  1302. ADDQ R11, BX
  1303. JMP copy_1_end
  1304. copy_1_move_3:
  1305. MOVW (SI), R14
  1306. MOVB 2(SI), R15
  1307. MOVW R14, (BX)
  1308. MOVB R15, 2(BX)
  1309. ADDQ R11, SI
  1310. ADDQ R11, BX
  1311. JMP copy_1_end
  1312. copy_1_move_4through7:
  1313. MOVL (SI), R14
  1314. MOVL -4(SI)(R11*1), R15
  1315. MOVL R14, (BX)
  1316. MOVL R15, -4(BX)(R11*1)
  1317. ADDQ R11, SI
  1318. ADDQ R11, BX
  1319. JMP copy_1_end
  1320. copy_1_move_8through16:
  1321. MOVQ (SI), R14
  1322. MOVQ -8(SI)(R11*1), R15
  1323. MOVQ R14, (BX)
  1324. MOVQ R15, -8(BX)(R11*1)
  1325. ADDQ R11, SI
  1326. ADDQ R11, BX
  1327. copy_1_end:
  // SI and BX were already advanced by the copy; only outPosition is left.
  1328. ADDQ R11, DI
  1329. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  1330. check_offset:
  // Both comparisons are signed (JG).
  1331. LEAQ (DI)(R10*1), R11
  1332. CMPQ R12, R11
  1333. JG error_match_off_too_big
  1334. CMPQ R12, R8
  1335. JG error_match_off_too_big
  1336. // Copy match from history
  // R11 = mo - outPosition. If mo <= outPosition (unsigned) the whole match
  // lies in the current output buffer.
  1337. MOVQ R12, R11
  1338. SUBQ DI, R11
  1339. JLS copy_match
  // R14 = source pointer, R11 bytes before the end of the history buffer.
  1340. MOVQ R9, R14
  1341. SUBQ R11, R14
  // If the match is longer than what history provides, copy those R11 bytes
  // and take the rest from the current buffer.
  1342. CMPQ R13, R11
  1343. JG copy_all_from_history
  // Whole match comes from history: exact-length copy of R13 bytes.
  1344. MOVQ R13, R11
  1345. SUBQ $0x10, R11
  1346. JB copy_4_small
  1347. copy_4_loop:
  1348. MOVUPS (R14), X0
  1349. MOVUPS X0, (BX)
  1350. ADDQ $0x10, R14
  1351. ADDQ $0x10, BX
  1352. SUBQ $0x10, R11
  1353. JAE copy_4_loop
  1354. LEAQ 16(R14)(R11*1), R14
  1355. LEAQ 16(BX)(R11*1), BX
  1356. MOVUPS -16(R14), X0
  1357. MOVUPS X0, -16(BX)
  1358. JMP copy_4_end
  // NOTE(review): unlike copy_1/copy_2/copy_5 there is no 1-or-2 byte case; a
  // length below 3 would take the 4through7 path. Presumably ml >= 3 here - confirm.
  1359. copy_4_small:
  1360. CMPQ R13, $0x03
  1361. JE copy_4_move_3
  1362. CMPQ R13, $0x08
  1363. JB copy_4_move_4through7
  1364. JMP copy_4_move_8through16
  1365. copy_4_move_3:
  1366. MOVW (R14), R11
  1367. MOVB 2(R14), R12
  1368. MOVW R11, (BX)
  1369. MOVB R12, 2(BX)
  1370. ADDQ R13, R14
  1371. ADDQ R13, BX
  1372. JMP copy_4_end
  1373. copy_4_move_4through7:
  1374. MOVL (R14), R11
  1375. MOVL -4(R14)(R13*1), R12
  1376. MOVL R11, (BX)
  1377. MOVL R12, -4(BX)(R13*1)
  1378. ADDQ R13, R14
  1379. ADDQ R13, BX
  1380. JMP copy_4_end
  1381. copy_4_move_8through16:
  1382. MOVQ (R14), R11
  1383. MOVQ -8(R14)(R13*1), R12
  1384. MOVQ R11, (BX)
  1385. MOVQ R12, -8(BX)(R13*1)
  1386. ADDQ R13, R14
  1387. ADDQ R13, BX
  1388. copy_4_end:
  // Sequence done: outPosition += ml, advance to the next record.
  1389. ADDQ R13, DI
  1390. ADDQ $0x18, AX
  1391. INCQ DX
  1392. CMPQ DX, CX
  1393. JB main_loop
  1394. JMP loop_finished
  // Exact-length copy of R11 bytes from history (R15 counts, BP is scratch),
  // then fall through to copy_match for the remaining R13 - R11 bytes.
  1395. copy_all_from_history:
  1396. MOVQ R11, R15
  1397. SUBQ $0x10, R15
  1398. JB copy_5_small
  1399. copy_5_loop:
  1400. MOVUPS (R14), X0
  1401. MOVUPS X0, (BX)
  1402. ADDQ $0x10, R14
  1403. ADDQ $0x10, BX
  1404. SUBQ $0x10, R15
  1405. JAE copy_5_loop
  1406. LEAQ 16(R14)(R15*1), R14
  1407. LEAQ 16(BX)(R15*1), BX
  1408. MOVUPS -16(R14), X0
  1409. MOVUPS X0, -16(BX)
  1410. JMP copy_5_end
  1411. copy_5_small:
  1412. CMPQ R11, $0x03
  1413. JE copy_5_move_3
  1414. JB copy_5_move_1or2
  1415. CMPQ R11, $0x08
  1416. JB copy_5_move_4through7
  1417. JMP copy_5_move_8through16
  1418. copy_5_move_1or2:
  1419. MOVB (R14), R15
  1420. MOVB -1(R14)(R11*1), BP
  1421. MOVB R15, (BX)
  1422. MOVB BP, -1(BX)(R11*1)
  1423. ADDQ R11, R14
  1424. ADDQ R11, BX
  1425. JMP copy_5_end
  1426. copy_5_move_3:
  1427. MOVW (R14), R15
  1428. MOVB 2(R14), BP
  1429. MOVW R15, (BX)
  1430. MOVB BP, 2(BX)
  1431. ADDQ R11, R14
  1432. ADDQ R11, BX
  1433. JMP copy_5_end
  1434. copy_5_move_4through7:
  1435. MOVL (R14), R15
  1436. MOVL -4(R14)(R11*1), BP
  1437. MOVL R15, (BX)
  1438. MOVL BP, -4(BX)(R11*1)
  1439. ADDQ R11, R14
  1440. ADDQ R11, BX
  1441. JMP copy_5_end
  1442. copy_5_move_8through16:
  1443. MOVQ (R14), R15
  1444. MOVQ -8(R14)(R11*1), BP
  1445. MOVQ R15, (BX)
  1446. MOVQ BP, -8(BX)(R11*1)
  1447. ADDQ R11, R14
  1448. ADDQ R11, BX
  1449. copy_5_end:
  // Account for the history part; R13 = bytes still to copy.
  1450. ADDQ R11, DI
  1451. SUBQ R11, R13
  1452. // Copy match from the current buffer
  1453. copy_match:
  // R11 = source pointer = write pointer - match offset.
  1454. MOVQ BX, R11
  1455. SUBQ R12, R11
  1456. // ml <= mo
  1457. CMPQ R13, R12
  1458. JA copy_overlapping_match
  1459. // Copy non-overlapping match
  // Exact-length copy of R13 bytes; the match offset in R12 is no longer
  // needed, so R12 is reused as the counter.
  1460. ADDQ R13, DI
  1461. MOVQ R13, R12
  1462. SUBQ $0x10, R12
  1463. JB copy_2_small
  1464. copy_2_loop:
  1465. MOVUPS (R11), X0
  1466. MOVUPS X0, (BX)
  1467. ADDQ $0x10, R11
  1468. ADDQ $0x10, BX
  1469. SUBQ $0x10, R12
  1470. JAE copy_2_loop
  1471. LEAQ 16(R11)(R12*1), R11
  1472. LEAQ 16(BX)(R12*1), BX
  1473. MOVUPS -16(R11), X0
  1474. MOVUPS X0, -16(BX)
  1475. JMP copy_2_end
  1476. copy_2_small:
  1477. CMPQ R13, $0x03
  1478. JE copy_2_move_3
  1479. JB copy_2_move_1or2
  1480. CMPQ R13, $0x08
  1481. JB copy_2_move_4through7
  1482. JMP copy_2_move_8through16
  1483. copy_2_move_1or2:
  1484. MOVB (R11), R12
  1485. MOVB -1(R11)(R13*1), R14
  1486. MOVB R12, (BX)
  1487. MOVB R14, -1(BX)(R13*1)
  1488. ADDQ R13, R11
  1489. ADDQ R13, BX
  1490. JMP copy_2_end
  1491. copy_2_move_3:
  1492. MOVW (R11), R12
  1493. MOVB 2(R11), R14
  1494. MOVW R12, (BX)
  1495. MOVB R14, 2(BX)
  1496. ADDQ R13, R11
  1497. ADDQ R13, BX
  1498. JMP copy_2_end
  1499. copy_2_move_4through7:
  1500. MOVL (R11), R12
  1501. MOVL -4(R11)(R13*1), R14
  1502. MOVL R12, (BX)
  1503. MOVL R14, -4(BX)(R13*1)
  1504. ADDQ R13, R11
  1505. ADDQ R13, BX
  1506. JMP copy_2_end
  1507. copy_2_move_8through16:
  1508. MOVQ (R11), R12
  1509. MOVQ -8(R11)(R13*1), R14
  1510. MOVQ R12, (BX)
  1511. MOVQ R14, -8(BX)(R13*1)
  1512. ADDQ R13, R11
  1513. ADDQ R13, BX
  1514. copy_2_end:
  1515. JMP handle_loop
  1516. // Copy overlapping match
  1517. copy_overlapping_match:
  1518. ADDQ R13, DI
  // Byte-by-byte so that bytes written earlier in this loop can be read again.
  1519. copy_slow_3:
  1520. MOVB (R11), R12
  1521. MOVB R12, (BX)
  1522. INCQ R11
  1523. INCQ BX
  1524. DECQ R13
  1525. JNZ copy_slow_3
  1526. handle_loop:
  // Advance to the next 24-byte record while seqIndex < CX.
  1527. ADDQ $0x18, AX
  1528. INCQ DX
  1529. CMPQ DX, CX
  1530. JB main_loop
  1531. loop_finished:
  1532. // Return value
  1533. MOVB $0x01, ret+8(FP)
  1534. // Update the context
  // 24(ctx) = sequence index, 104(ctx) = outPosition,
  // 112(ctx) = literals pointer - 80(ctx), i.e. literal bytes consumed.
  1535. MOVQ ctx+0(FP), AX
  1536. MOVQ DX, 24(AX)
  1537. MOVQ DI, 104(AX)
  1538. SUBQ 80(AX), SI
  1539. MOVQ SI, 112(AX)
  1540. RET
  1541. error_match_off_too_big:
  1542. // Return value
  1543. MOVB $0x00, ret+8(FP)
  1544. // Update the context
  // Same write-back as on success; DX still indexes the offending sequence.
  1545. MOVQ ctx+0(FP), AX
  1546. MOVQ DX, 24(AX)
  1547. MOVQ DI, 104(AX)
  1548. SUBQ 80(AX), SI
  1549. MOVQ SI, 112(AX)
  1550. RET
  1551. empty_seqs:
  1552. // Return value
  // ctx is left untouched on this path.
  1553. MOVB $0x01, ret+8(FP)
  1554. RET
  1555. // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  1556. // Requires: CMOV, SSE
  //
  // Decodes sequences from the bit reader and executes each one right away
  // (literal copy followed by match copy) into the output buffer. The loop
  // repeats until the counter at 96(ctx) becomes negative.
  //
  // Return codes written to ret: 0 success, 1 non-zero match length with zero
  // offset, 2 match length above 0x20002, 3 match offset too big, 4 not enough
  // literals, 5 not enough output space, 6 bit reader overread.
  //
  // Stack slots:
  //   (SP)    input read pointer ((br) + 8(br) at entry; it moves downwards)
  //   8(SP)   match offset of the current sequence
  //   16(SP)  match length of the current sequence
  //   24(SP)  literal length of the current sequence
  //   32(SP)  112(ctx) + 128(ctx), past-the-end pointer of the output buffer
  //   40(SP)  184(ctx), history length per the check_offset comment
  //   48(SP)  176(ctx) + 184(ctx), used as the end of the history buffer
  //   56(SP)  200(ctx), window size per the check_offset comment
  //
  // Registers kept across iterations:
  //   DX  bit buffer; BX  bits already consumed from it
  //   SI  8(br), decremented for every input byte consumed
  //   DI / R8 / R9  literal-length / match-length / offset decoder state words
  //   R10 output write pointer; R11 literals read pointer; R12 output position
  //
  // State word layout as used below: bits 0-7 = number of bits read for the
  // state update, bits 8-15 = number of extra bits for the value,
  // bits 16-31 = base of the next state, bits 32-63 = value baseline.
  1557. TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
  1558. MOVQ br+8(FP), CX
  1559. MOVQ 24(CX), DX
  1560. MOVBQZX 32(CX), BX
  1561. MOVQ (CX), AX
  1562. MOVQ 8(CX), SI
  1563. ADDQ SI, AX
  1564. MOVQ AX, (SP)
  1565. MOVQ ctx+16(FP), AX
  1566. MOVQ 72(AX), DI
  1567. MOVQ 80(AX), R8
  1568. MOVQ 88(AX), R9
  // Zero the mo / ml / ll slots.
  1569. XORQ CX, CX
  1570. MOVQ CX, 8(SP)
  1571. MOVQ CX, 16(SP)
  1572. MOVQ CX, 24(SP)
  1573. MOVQ 112(AX), R10
  1574. MOVQ 128(AX), CX
  1575. MOVQ CX, 32(SP)
  1576. MOVQ 144(AX), R11
  1577. MOVQ 136(AX), R12
  1578. MOVQ 200(AX), CX
  1579. MOVQ CX, 56(SP)
  1580. MOVQ 176(AX), CX
  1581. MOVQ CX, 48(SP)
  1582. MOVQ 184(AX), AX
  1583. MOVQ AX, 40(SP)
  1584. MOVQ 40(SP), AX
  1585. ADDQ AX, 48(SP)
  1586. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  1587. ADDQ R10, 32(SP)
  1588. // outBase += outPosition
  1589. ADDQ R12, R10
  1590. sequenceDecs_decodeSync_amd64_main_loop:
  1591. MOVQ (SP), R13
  1592. // Fill bitreader to have enough for the offset and match length.
  // Fast path (SI >= 8): step the read pointer back by bitsRead/8 bytes, reload
  // 8 bytes and keep bitsRead mod 8.
  1593. CMPQ SI, $0x08
  1594. JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  1595. MOVQ BX, AX
  1596. SHRQ $0x03, AX
  1597. SUBQ AX, R13
  1598. MOVQ (R13), DX
  1599. SUBQ AX, SI
  1600. ANDQ $0x07, BX
  1601. JMP sequenceDecs_decodeSync_amd64_fill_end
  // Slow path: shift in one byte at a time while input remains and more than
  // 7 bits have been consumed.
  1602. sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
  1603. CMPQ SI, $0x00
  1604. JLE sequenceDecs_decodeSync_amd64_fill_check_overread
  1605. CMPQ BX, $0x07
  1606. JLE sequenceDecs_decodeSync_amd64_fill_end
  1607. SHLQ $0x08, DX
  1608. SUBQ $0x01, R13
  1609. SUBQ $0x01, SI
  1610. SUBQ $0x08, BX
  1611. MOVBQZX (R13), AX
  1612. ORQ AX, DX
  1613. JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
  // Input exhausted: more than 64 consumed bits means we read past the data.
  1614. sequenceDecs_decodeSync_amd64_fill_check_overread:
  1615. CMPQ BX, $0x40
  1616. JA error_overread
  1617. sequenceDecs_decodeSync_amd64_fill_end:
  1618. // Update offset
  // AX = baseline (state >> 32), CL = extra-bit count (state bits 8-15).
  // The extra bits are added only when the count is non-zero, below 64, and
  // bitsRead + count does not exceed 64; BX is advanced before that last check.
  1619. MOVQ R9, AX
  1620. MOVQ BX, CX
  1621. MOVQ DX, R14
  1622. SHLQ CL, R14
  1623. MOVB AH, CL
  1624. SHRQ $0x20, AX
  1625. TESTQ CX, CX
  1626. JZ sequenceDecs_decodeSync_amd64_of_update_zero
  1627. ADDQ CX, BX
  1628. CMPQ BX, $0x40
  1629. JA sequenceDecs_decodeSync_amd64_of_update_zero
  1630. CMPQ CX, $0x40
  1631. JAE sequenceDecs_decodeSync_amd64_of_update_zero
  1632. NEGQ CX
  1633. SHRQ CL, R14
  1634. ADDQ R14, AX
  1635. sequenceDecs_decodeSync_amd64_of_update_zero:
  1636. MOVQ AX, 8(SP)
  1637. // Update match length
  1638. MOVQ R8, AX
  1639. MOVQ BX, CX
  1640. MOVQ DX, R14
  1641. SHLQ CL, R14
  1642. MOVB AH, CL
  1643. SHRQ $0x20, AX
  1644. TESTQ CX, CX
  1645. JZ sequenceDecs_decodeSync_amd64_ml_update_zero
  1646. ADDQ CX, BX
  1647. CMPQ BX, $0x40
  1648. JA sequenceDecs_decodeSync_amd64_ml_update_zero
  1649. CMPQ CX, $0x40
  1650. JAE sequenceDecs_decodeSync_amd64_ml_update_zero
  1651. NEGQ CX
  1652. SHRQ CL, R14
  1653. ADDQ R14, AX
  1654. sequenceDecs_decodeSync_amd64_ml_update_zero:
  1655. MOVQ AX, 16(SP)
  1656. // Fill bitreader to have enough for the remaining
  1657. CMPQ SI, $0x08
  1658. JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1659. MOVQ BX, AX
  1660. SHRQ $0x03, AX
  1661. SUBQ AX, R13
  1662. MOVQ (R13), DX
  1663. SUBQ AX, SI
  1664. ANDQ $0x07, BX
  1665. JMP sequenceDecs_decodeSync_amd64_fill_2_end
  1666. sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
  1667. CMPQ SI, $0x00
  1668. JLE sequenceDecs_decodeSync_amd64_fill_2_check_overread
  1669. CMPQ BX, $0x07
  1670. JLE sequenceDecs_decodeSync_amd64_fill_2_end
  1671. SHLQ $0x08, DX
  1672. SUBQ $0x01, R13
  1673. SUBQ $0x01, SI
  1674. SUBQ $0x08, BX
  1675. MOVBQZX (R13), AX
  1676. ORQ AX, DX
  1677. JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
  1678. sequenceDecs_decodeSync_amd64_fill_2_check_overread:
  1679. CMPQ BX, $0x40
  1680. JA error_overread
  1681. sequenceDecs_decodeSync_amd64_fill_2_end:
  1682. // Update literal length
  1683. MOVQ DI, AX
  1684. MOVQ BX, CX
  1685. MOVQ DX, R14
  1686. SHLQ CL, R14
  1687. MOVB AH, CL
  1688. SHRQ $0x20, AX
  1689. TESTQ CX, CX
  1690. JZ sequenceDecs_decodeSync_amd64_ll_update_zero
  1691. ADDQ CX, BX
  1692. CMPQ BX, $0x40
  1693. JA sequenceDecs_decodeSync_amd64_ll_update_zero
  1694. CMPQ CX, $0x40
  1695. JAE sequenceDecs_decodeSync_amd64_ll_update_zero
  1696. NEGQ CX
  1697. SHRQ CL, R14
  1698. ADDQ R14, AX
  1699. sequenceDecs_decodeSync_amd64_ll_update_zero:
  1700. MOVQ AX, 24(SP)
  1701. // Fill bitreader for state updates
  // Save the read pointer. AX = extra-bit count of the offset state (bits 8-15
  // of R9), kept for the "Adjust offset" step because R9 may be replaced below.
  1702. MOVQ R13, (SP)
  1703. MOVQ R9, AX
  1704. SHRQ $0x08, AX
  1705. MOVBQZX AL, AX
  // When the counter at 96(ctx) is zero, the three state updates are skipped.
  1706. MOVQ ctx+16(FP), CX
  1707. CMPQ 96(CX), $0x00
  1708. JZ sequenceDecs_decodeSync_amd64_skip_update
  1709. // Update Literal Length State
  // R13 = bit count (state bits 0-7); the rotate plus mask extracts that many
  // bits from DX, which are added to the next-state base (state bits 16-31).
  1710. MOVBQZX DI, R13
  1711. SHRL $0x10, DI
  1712. LEAQ (BX)(R13*1), CX
  1713. MOVQ DX, R14
  1714. MOVQ CX, BX
  1715. ROLQ CL, R14
  1716. MOVL $0x00000001, R15
  1717. MOVB R13, CL
  1718. SHLL CL, R15
  1719. DECL R15
  1720. ANDQ R15, R14
  1721. ADDQ R14, DI
  1722. // Load ctx.llTable
  // Table entries are 8 bytes; the loaded entry becomes the new state word.
  1723. MOVQ ctx+16(FP), CX
  1724. MOVQ (CX), CX
  1725. MOVQ (CX)(DI*8), DI
  1726. // Update Match Length State
  1727. MOVBQZX R8, R13
  1728. SHRL $0x10, R8
  1729. LEAQ (BX)(R13*1), CX
  1730. MOVQ DX, R14
  1731. MOVQ CX, BX
  1732. ROLQ CL, R14
  1733. MOVL $0x00000001, R15
  1734. MOVB R13, CL
  1735. SHLL CL, R15
  1736. DECL R15
  1737. ANDQ R15, R14
  1738. ADDQ R14, R8
  1739. // Load ctx.mlTable
  1740. MOVQ ctx+16(FP), CX
  1741. MOVQ 24(CX), CX
  1742. MOVQ (CX)(R8*8), R8
  1743. // Update Offset State
  1744. MOVBQZX R9, R13
  1745. SHRL $0x10, R9
  1746. LEAQ (BX)(R13*1), CX
  1747. MOVQ DX, R14
  1748. MOVQ CX, BX
  1749. ROLQ CL, R14
  1750. MOVL $0x00000001, R15
  1751. MOVB R13, CL
  1752. SHLL CL, R15
  1753. DECL R15
  1754. ANDQ R15, R14
  1755. ADDQ R14, R9
  1756. // Load ctx.ofTable
  1757. MOVQ ctx+16(FP), CX
  1758. MOVQ 48(CX), CX
  1759. MOVQ (CX)(R9*8), R9
  1760. sequenceDecs_decodeSync_amd64_skip_update:
  1761. // Adjust offset
  // CX = s, R13 = decoded offset. With more than one extra offset bit the value
  // is a new offset: store it at 144(s) and shift the old 144(s)/152(s) pair up
  // to 152(s)/160(s) with one 16-byte move.
  1762. MOVQ s+0(FP), CX
  1763. MOVQ 8(SP), R13
  1764. CMPQ AX, $0x01
  1765. JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
  1766. MOVUPS 144(CX), X0
  1767. MOVQ R13, 144(CX)
  1768. MOVUPS X0, 152(CX)
  1769. JMP sequenceDecs_decodeSync_amd64_after_adjust
  // Otherwise the value selects one of the three previous offsets stored at
  // 144/152/160(s); a zero literal length shifts the selection by one.
  1770. sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
  1771. CMPQ 24(SP), $0x00000000
  1772. JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
  1773. INCQ R13
  1774. JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  1775. sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
  // Selector 0 with literals present: reuse 144(s) without reordering.
  1776. TESTQ R13, R13
  1777. JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
  1778. MOVQ 144(CX), R13
  1779. JMP sequenceDecs_decodeSync_amd64_after_adjust
  1780. sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
  // R14 = 144(s)[selector], or 144(s) - 1 when the selector is 3; a zero
  // result is replaced by 1.
  1781. MOVQ R13, AX
  1782. XORQ R14, R14
  1783. MOVQ $-1, R15
  1784. CMPQ R13, $0x03
  1785. CMOVQEQ R14, AX
  1786. CMOVQEQ R15, R14
  1787. ADDQ 144(CX)(AX*8), R14
  1788. JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
  1789. MOVQ $0x00000001, R14
  1790. sequenceDecs_decodeSync_amd64_adjust_temp_valid:
  // Rotate the stored offsets; 160(s) is only overwritten when the selector
  // is not 1.
  1791. CMPQ R13, $0x01
  1792. JZ sequenceDecs_decodeSync_amd64_adjust_skip
  1793. MOVQ 152(CX), AX
  1794. MOVQ AX, 160(CX)
  1795. sequenceDecs_decodeSync_amd64_adjust_skip:
  1796. MOVQ 144(CX), AX
  1797. MOVQ AX, 152(CX)
  1798. MOVQ R14, 144(CX)
  1799. MOVQ R14, R13
  1800. sequenceDecs_decodeSync_amd64_after_adjust:
  1801. MOVQ R13, 8(SP)
  1802. // Check values
  // AX = ml, CX = ll. 256(s) accumulates ml + ll (NOTE(review): presumably a
  // running decoded-size total - confirm). 104(ctx) is reduced by ll and must
  // not go negative.
  1803. MOVQ 16(SP), AX
  1804. MOVQ 24(SP), CX
  1805. LEAQ (AX)(CX*1), R14
  1806. MOVQ s+0(FP), R15
  1807. ADDQ R14, 256(R15)
  1808. MOVQ ctx+16(FP), R14
  1809. SUBQ CX, 104(R14)
  1810. JS error_not_enough_literals
  1811. CMPQ AX, $0x00020002
  1812. JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
  // A zero offset is only acceptable together with a zero match length.
  1813. TESTQ R13, R13
  1814. JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
  1815. TESTQ AX, AX
  1816. JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
  1817. sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
  // From here on: AX = ll, CX = mo, R13 = ml.
  1818. MOVQ 24(SP), AX
  1819. MOVQ 8(SP), CX
  1820. MOVQ 16(SP), R13
  1821. // Check if we have enough space in s.out
  1822. LEAQ (AX)(R13*1), R14
  1823. ADDQ R10, R14
  1824. CMPQ R14, 32(SP)
  1825. JA error_not_enough_space
  1826. // Copy literals
  1827. TESTQ AX, AX
  1828. JZ check_offset
  1829. XORQ R14, R14
  // Copies 16 bytes per iteration until R14 >= ll, so up to 15 bytes beyond ll
  // may be read and written.
  1830. copy_1:
  1831. MOVUPS (R11)(R14*1), X0
  1832. MOVUPS X0, (R10)(R14*1)
  1833. ADDQ $0x10, R14
  1834. CMPQ R14, AX
  1835. JB copy_1
  1836. ADDQ AX, R11
  1837. ADDQ AX, R10
  1838. ADDQ AX, R12
  1839. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  1840. check_offset:
  // Both comparisons are signed (JG).
  1841. MOVQ R12, AX
  1842. ADDQ 40(SP), AX
  1843. CMPQ CX, AX
  1844. JG error_match_off_too_big
  1845. CMPQ CX, 56(SP)
  1846. JG error_match_off_too_big
  1847. // Copy match from history
  // AX = mo - outPosition. If mo <= outPosition (unsigned) the whole match
  // lies in the current output buffer.
  1848. MOVQ CX, AX
  1849. SUBQ R12, AX
  1850. JLS copy_match
  // R14 = source pointer, AX bytes before the end of the history buffer.
  1851. MOVQ 48(SP), R14
  1852. SUBQ AX, R14
  1853. CMPQ R13, AX
  1854. JG copy_all_from_history
  // Whole match comes from history: exact-length copy of R13 bytes.
  1855. MOVQ R13, AX
  1856. SUBQ $0x10, AX
  1857. JB copy_4_small
  1858. copy_4_loop:
  1859. MOVUPS (R14), X0
  1860. MOVUPS X0, (R10)
  1861. ADDQ $0x10, R14
  1862. ADDQ $0x10, R10
  1863. SUBQ $0x10, AX
  1864. JAE copy_4_loop
  // AX is now negative: step both pointers to the exact end and re-copy the
  // last 16 bytes so the tail is covered without exceeding the length.
  1865. LEAQ 16(R14)(AX*1), R14
  1866. LEAQ 16(R10)(AX*1), R10
  1867. MOVUPS -16(R14), X0
  1868. MOVUPS X0, -16(R10)
  1869. JMP copy_4_end
  // NOTE(review): no 1-or-2 byte case here, unlike copy_5; a length below 3
  // would take the 4through7 path. Presumably ml >= 3 here - confirm.
  1870. copy_4_small:
  1871. CMPQ R13, $0x03
  1872. JE copy_4_move_3
  1873. CMPQ R13, $0x08
  1874. JB copy_4_move_4through7
  1875. JMP copy_4_move_8through16
  1876. copy_4_move_3:
  1877. MOVW (R14), AX
  1878. MOVB 2(R14), CL
  1879. MOVW AX, (R10)
  1880. MOVB CL, 2(R10)
  1881. ADDQ R13, R14
  1882. ADDQ R13, R10
  1883. JMP copy_4_end
  1884. copy_4_move_4through7:
  1885. MOVL (R14), AX
  1886. MOVL -4(R14)(R13*1), CX
  1887. MOVL AX, (R10)
  1888. MOVL CX, -4(R10)(R13*1)
  1889. ADDQ R13, R14
  1890. ADDQ R13, R10
  1891. JMP copy_4_end
  1892. copy_4_move_8through16:
  1893. MOVQ (R14), AX
  1894. MOVQ -8(R14)(R13*1), CX
  1895. MOVQ AX, (R10)
  1896. MOVQ CX, -8(R10)(R13*1)
  1897. ADDQ R13, R14
  1898. ADDQ R13, R10
  1899. copy_4_end:
  1900. ADDQ R13, R12
  1901. JMP handle_loop
  // NOTE(review): the next jump directly follows an unconditional JMP and has
  // no label, so it is unreachable.
  1902. JMP loop_finished
  // Exact-length copy of AX bytes from history (R15 counts, BP is scratch),
  // then fall through to copy_match for the remaining R13 - AX bytes.
  1903. copy_all_from_history:
  1904. MOVQ AX, R15
  1905. SUBQ $0x10, R15
  1906. JB copy_5_small
  1907. copy_5_loop:
  1908. MOVUPS (R14), X0
  1909. MOVUPS X0, (R10)
  1910. ADDQ $0x10, R14
  1911. ADDQ $0x10, R10
  1912. SUBQ $0x10, R15
  1913. JAE copy_5_loop
  1914. LEAQ 16(R14)(R15*1), R14
  1915. LEAQ 16(R10)(R15*1), R10
  1916. MOVUPS -16(R14), X0
  1917. MOVUPS X0, -16(R10)
  1918. JMP copy_5_end
  1919. copy_5_small:
  1920. CMPQ AX, $0x03
  1921. JE copy_5_move_3
  1922. JB copy_5_move_1or2
  1923. CMPQ AX, $0x08
  1924. JB copy_5_move_4through7
  1925. JMP copy_5_move_8through16
  1926. copy_5_move_1or2:
  1927. MOVB (R14), R15
  1928. MOVB -1(R14)(AX*1), BP
  1929. MOVB R15, (R10)
  1930. MOVB BP, -1(R10)(AX*1)
  1931. ADDQ AX, R14
  1932. ADDQ AX, R10
  1933. JMP copy_5_end
  1934. copy_5_move_3:
  1935. MOVW (R14), R15
  1936. MOVB 2(R14), BP
  1937. MOVW R15, (R10)
  1938. MOVB BP, 2(R10)
  1939. ADDQ AX, R14
  1940. ADDQ AX, R10
  1941. JMP copy_5_end
  1942. copy_5_move_4through7:
  1943. MOVL (R14), R15
  1944. MOVL -4(R14)(AX*1), BP
  1945. MOVL R15, (R10)
  1946. MOVL BP, -4(R10)(AX*1)
  1947. ADDQ AX, R14
  1948. ADDQ AX, R10
  1949. JMP copy_5_end
  1950. copy_5_move_8through16:
  1951. MOVQ (R14), R15
  1952. MOVQ -8(R14)(AX*1), BP
  1953. MOVQ R15, (R10)
  1954. MOVQ BP, -8(R10)(AX*1)
  1955. ADDQ AX, R14
  1956. ADDQ AX, R10
  1957. copy_5_end:
  // Account for the history part; R13 = bytes still to copy.
  1958. ADDQ AX, R12
  1959. SUBQ AX, R13
  1960. // Copy match from the current buffer
  1961. copy_match:
  // AX = source pointer = write pointer - match offset.
  1962. MOVQ R10, AX
  1963. SUBQ CX, AX
  1964. // ml <= mo
  1965. CMPQ R13, CX
  1966. JA copy_overlapping_match
  1967. // Copy non-overlapping match
  // CX becomes the running destination; R10 is advanced to its final value up
  // front. The loop moves 16 bytes per iteration while bytes remain, so it may
  // write past the end of the match.
  1968. ADDQ R13, R12
  1969. MOVQ R10, CX
  1970. ADDQ R13, R10
  1971. copy_2:
  1972. MOVUPS (AX), X0
  1973. MOVUPS X0, (CX)
  1974. ADDQ $0x10, AX
  1975. ADDQ $0x10, CX
  1976. SUBQ $0x10, R13
  1977. JHI copy_2
  1978. JMP handle_loop
  1979. // Copy overlapping match
  1980. copy_overlapping_match:
  1981. ADDQ R13, R12
  // Byte-by-byte so that bytes written earlier in this loop can be read again.
  1982. copy_slow_3:
  1983. MOVB (AX), CL
  1984. MOVB CL, (R10)
  1985. INCQ AX
  1986. INCQ R10
  1987. DECQ R13
  1988. JNZ copy_slow_3
  1989. handle_loop:
  // Decrement the counter at 96(ctx); continue while it is not negative.
  1990. MOVQ ctx+16(FP), AX
  1991. DECQ 96(AX)
  1992. JNS sequenceDecs_decodeSync_amd64_main_loop
  1993. loop_finished:
  // Write the bit reader state back: bit buffer, bits consumed, 8(br).
  1994. MOVQ br+8(FP), AX
  1995. MOVQ DX, 24(AX)
  1996. MOVB BL, 32(AX)
  1997. MOVQ SI, 8(AX)
  1998. // Update the context
  // 136(ctx) = output position; 168(ctx) = literals pointer - 144(ctx), i.e.
  // literal bytes consumed.
  1999. MOVQ ctx+16(FP), AX
  2000. MOVQ R12, 136(AX)
  2001. MOVQ 144(AX), CX
  2002. SUBQ CX, R11
  2003. MOVQ R11, 168(AX)
  2004. // Return success
  2005. MOVQ $0x00000000, ret+24(FP)
  2006. RET
  // None of the error exits below writes the bit reader state back to br.
  2007. // Return with match length error
  2008. sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
  // Reports the match length in 216(ctx).
  2009. MOVQ 16(SP), AX
  2010. MOVQ ctx+16(FP), CX
  2011. MOVQ AX, 216(CX)
  2012. MOVQ $0x00000001, ret+24(FP)
  2013. RET
  2014. // Return with match too long error
  2015. sequenceDecs_decodeSync_amd64_error_match_len_too_big:
  // Reports the match length in 216(ctx).
  2016. MOVQ ctx+16(FP), AX
  2017. MOVQ 16(SP), CX
  2018. MOVQ CX, 216(AX)
  2019. MOVQ $0x00000002, ret+24(FP)
  2020. RET
  2021. // Return with match offset too long error
  2022. error_match_off_too_big:
  // Reports the match offset in 224(ctx) and the output position in 136(ctx).
  2023. MOVQ ctx+16(FP), AX
  2024. MOVQ 8(SP), CX
  2025. MOVQ CX, 224(AX)
  2026. MOVQ R12, 136(AX)
  2027. MOVQ $0x00000003, ret+24(FP)
  2028. RET
  2029. // Return with not enough literals error
  2030. error_not_enough_literals:
  // Reports the literal length in 208(ctx).
  2031. MOVQ ctx+16(FP), AX
  2032. MOVQ 24(SP), CX
  2033. MOVQ CX, 208(AX)
  2034. MOVQ $0x00000004, ret+24(FP)
  2035. RET
  2036. // Return with overread error
  2037. error_overread:
  2038. MOVQ $0x00000006, ret+24(FP)
  2039. RET
  2040. // Return with not enough output space error
  2041. error_not_enough_space:
  // Reports literal length (208), match length (216) and output position (136).
  2042. MOVQ ctx+16(FP), AX
  2043. MOVQ 24(SP), CX
  2044. MOVQ CX, 208(AX)
  2045. MOVQ 16(SP), CX
  2046. MOVQ CX, 216(AX)
  2047. MOVQ R12, 136(AX)
  2048. MOVQ $0x00000005, ret+24(FP)
  2049. RET
  2050. // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2051. // Requires: BMI, BMI2, CMOV, SSE
  2052. TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
  2053. MOVQ br+8(FP), BX
  2054. MOVQ 24(BX), AX
  2055. MOVBQZX 32(BX), DX
  2056. MOVQ (BX), CX
  2057. MOVQ 8(BX), BX
  2058. ADDQ BX, CX
  2059. MOVQ CX, (SP)
  2060. MOVQ ctx+16(FP), CX
  2061. MOVQ 72(CX), SI
  2062. MOVQ 80(CX), DI
  2063. MOVQ 88(CX), R8
  2064. XORQ R9, R9
  2065. MOVQ R9, 8(SP)
  2066. MOVQ R9, 16(SP)
  2067. MOVQ R9, 24(SP)
  2068. MOVQ 112(CX), R9
  2069. MOVQ 128(CX), R10
  2070. MOVQ R10, 32(SP)
  2071. MOVQ 144(CX), R10
  2072. MOVQ 136(CX), R11
  2073. MOVQ 200(CX), R12
  2074. MOVQ R12, 56(SP)
  2075. MOVQ 176(CX), R12
  2076. MOVQ R12, 48(SP)
  2077. MOVQ 184(CX), CX
  2078. MOVQ CX, 40(SP)
  2079. MOVQ 40(SP), CX
  2080. ADDQ CX, 48(SP)
  2081. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2082. ADDQ R9, 32(SP)
  2083. // outBase += outPosition
  2084. ADDQ R11, R9
  2085. sequenceDecs_decodeSync_bmi2_main_loop:
  2086. MOVQ (SP), R12
  2087. // Fill bitreader to have enough for the offset and match length.
  2088. CMPQ BX, $0x08
  2089. JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2090. MOVQ DX, CX
  2091. SHRQ $0x03, CX
  2092. SUBQ CX, R12
  2093. MOVQ (R12), AX
  2094. SUBQ CX, BX
  2095. ANDQ $0x07, DX
  2096. JMP sequenceDecs_decodeSync_bmi2_fill_end
  2097. sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
  2098. CMPQ BX, $0x00
  2099. JLE sequenceDecs_decodeSync_bmi2_fill_check_overread
  2100. CMPQ DX, $0x07
  2101. JLE sequenceDecs_decodeSync_bmi2_fill_end
  2102. SHLQ $0x08, AX
  2103. SUBQ $0x01, R12
  2104. SUBQ $0x01, BX
  2105. SUBQ $0x08, DX
  2106. MOVBQZX (R12), CX
  2107. ORQ CX, AX
  2108. JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
  2109. sequenceDecs_decodeSync_bmi2_fill_check_overread:
  2110. CMPQ DX, $0x40
  2111. JA error_overread
  2112. sequenceDecs_decodeSync_bmi2_fill_end:
  2113. // Update offset
  2114. MOVQ $0x00000808, CX
  2115. BEXTRQ CX, R8, R13
  2116. MOVQ AX, R14
  2117. LEAQ (DX)(R13*1), CX
  2118. ROLQ CL, R14
  2119. BZHIQ R13, R14, R14
  2120. MOVQ CX, DX
  2121. MOVQ R8, CX
  2122. SHRQ $0x20, CX
  2123. ADDQ R14, CX
  2124. MOVQ CX, 8(SP)
  2125. // Update match length
  2126. MOVQ $0x00000808, CX
  2127. BEXTRQ CX, DI, R13
  2128. MOVQ AX, R14
  2129. LEAQ (DX)(R13*1), CX
  2130. ROLQ CL, R14
  2131. BZHIQ R13, R14, R14
  2132. MOVQ CX, DX
  2133. MOVQ DI, CX
  2134. SHRQ $0x20, CX
  2135. ADDQ R14, CX
  2136. MOVQ CX, 16(SP)
  2137. // Fill bitreader to have enough for the remaining
  2138. CMPQ BX, $0x08
  2139. JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2140. MOVQ DX, CX
  2141. SHRQ $0x03, CX
  2142. SUBQ CX, R12
  2143. MOVQ (R12), AX
  2144. SUBQ CX, BX
  2145. ANDQ $0x07, DX
  2146. JMP sequenceDecs_decodeSync_bmi2_fill_2_end
  2147. sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
  2148. CMPQ BX, $0x00
  2149. JLE sequenceDecs_decodeSync_bmi2_fill_2_check_overread
  2150. CMPQ DX, $0x07
  2151. JLE sequenceDecs_decodeSync_bmi2_fill_2_end
  2152. SHLQ $0x08, AX
  2153. SUBQ $0x01, R12
  2154. SUBQ $0x01, BX
  2155. SUBQ $0x08, DX
  2156. MOVBQZX (R12), CX
  2157. ORQ CX, AX
  2158. JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
  2159. sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
  2160. CMPQ DX, $0x40
  2161. JA error_overread
  2162. sequenceDecs_decodeSync_bmi2_fill_2_end:
  2163. // Update literal length
  2164. MOVQ $0x00000808, CX
  2165. BEXTRQ CX, SI, R13
  2166. MOVQ AX, R14
  2167. LEAQ (DX)(R13*1), CX
  2168. ROLQ CL, R14
  2169. BZHIQ R13, R14, R14
  2170. MOVQ CX, DX
  2171. MOVQ SI, CX
  2172. SHRQ $0x20, CX
  2173. ADDQ R14, CX
  2174. MOVQ CX, 24(SP)
  2175. // Fill bitreader for state updates
  2176. MOVQ R12, (SP)
  2177. MOVQ $0x00000808, CX
  2178. BEXTRQ CX, R8, R12
  2179. MOVQ ctx+16(FP), CX
  2180. CMPQ 96(CX), $0x00
  2181. JZ sequenceDecs_decodeSync_bmi2_skip_update
  2182. LEAQ (SI)(DI*1), R13
  2183. ADDQ R8, R13
  2184. MOVBQZX R13, R13
  2185. LEAQ (DX)(R13*1), CX
  2186. MOVQ AX, R14
  2187. MOVQ CX, DX
  2188. ROLQ CL, R14
  2189. BZHIQ R13, R14, R14
  2190. // Update Offset State
  2191. BZHIQ R8, R14, CX
  2192. SHRXQ R8, R14, R14
  2193. SHRL $0x10, R8
  2194. ADDQ CX, R8
  2195. // Load ctx.ofTable
  2196. MOVQ ctx+16(FP), CX
  2197. MOVQ 48(CX), CX
  2198. MOVQ (CX)(R8*8), R8
  2199. // Update Match Length State
  2200. BZHIQ DI, R14, CX
  2201. SHRXQ DI, R14, R14
  2202. SHRL $0x10, DI
  2203. ADDQ CX, DI
  2204. // Load ctx.mlTable
  2205. MOVQ ctx+16(FP), CX
  2206. MOVQ 24(CX), CX
  2207. MOVQ (CX)(DI*8), DI
  2208. // Update Literal Length State
  2209. BZHIQ SI, R14, CX
  2210. SHRL $0x10, SI
  2211. ADDQ CX, SI
  2212. // Load ctx.llTable
  2213. MOVQ ctx+16(FP), CX
  2214. MOVQ (CX), CX
  2215. MOVQ (CX)(SI*8), SI
  2216. sequenceDecs_decodeSync_bmi2_skip_update:
  2217. // Adjust offset
  2218. MOVQ s+0(FP), CX
  2219. MOVQ 8(SP), R13
  2220. CMPQ R12, $0x01
  2221. JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
  2222. MOVUPS 144(CX), X0
  2223. MOVQ R13, 144(CX)
  2224. MOVUPS X0, 152(CX)
  2225. JMP sequenceDecs_decodeSync_bmi2_after_adjust
  2226. sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
  2227. CMPQ 24(SP), $0x00000000
  2228. JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
  2229. INCQ R13
  2230. JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2231. sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
  2232. TESTQ R13, R13
  2233. JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
  2234. MOVQ 144(CX), R13
  2235. JMP sequenceDecs_decodeSync_bmi2_after_adjust
  2236. sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
  2237. MOVQ R13, R12
  2238. XORQ R14, R14
  2239. MOVQ $-1, R15
  2240. CMPQ R13, $0x03
  2241. CMOVQEQ R14, R12
  2242. CMOVQEQ R15, R14
  2243. ADDQ 144(CX)(R12*8), R14
  2244. JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
  2245. MOVQ $0x00000001, R14
  2246. sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
  2247. CMPQ R13, $0x01
  2248. JZ sequenceDecs_decodeSync_bmi2_adjust_skip
  2249. MOVQ 152(CX), R12
  2250. MOVQ R12, 160(CX)
  2251. sequenceDecs_decodeSync_bmi2_adjust_skip:
  2252. MOVQ 144(CX), R12
  2253. MOVQ R12, 152(CX)
  2254. MOVQ R14, 144(CX)
  2255. MOVQ R14, R13
  2256. sequenceDecs_decodeSync_bmi2_after_adjust:
  2257. MOVQ R13, 8(SP)
  2258. // Check values
  2259. MOVQ 16(SP), CX
  2260. MOVQ 24(SP), R12
  2261. LEAQ (CX)(R12*1), R14
  2262. MOVQ s+0(FP), R15
  2263. ADDQ R14, 256(R15)
  2264. MOVQ ctx+16(FP), R14
  2265. SUBQ R12, 104(R14)
  2266. JS error_not_enough_literals
  2267. CMPQ CX, $0x00020002
  2268. JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
  2269. TESTQ R13, R13
  2270. JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
  2271. TESTQ CX, CX
  2272. JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
  2273. sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
  2274. MOVQ 24(SP), CX
  2275. MOVQ 8(SP), R12
  2276. MOVQ 16(SP), R13
  2277. // Check if we have enough space in s.out
  2278. LEAQ (CX)(R13*1), R14
  2279. ADDQ R9, R14
  2280. CMPQ R14, 32(SP)
  2281. JA error_not_enough_space
  2282. // Copy literals
  2283. TESTQ CX, CX
  2284. JZ check_offset
  2285. XORQ R14, R14
  2286. copy_1:
  2287. MOVUPS (R10)(R14*1), X0
  2288. MOVUPS X0, (R9)(R14*1)
  2289. ADDQ $0x10, R14
  2290. CMPQ R14, CX
  2291. JB copy_1
  2292. ADDQ CX, R10
  2293. ADDQ CX, R9
  2294. ADDQ CX, R11
  2295. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  2296. check_offset:
  2297. MOVQ R11, CX
  2298. ADDQ 40(SP), CX
  2299. CMPQ R12, CX
  2300. JG error_match_off_too_big
  2301. CMPQ R12, 56(SP)
  2302. JG error_match_off_too_big
  2303. // Copy match from history
  2304. MOVQ R12, CX
  2305. SUBQ R11, CX
  2306. JLS copy_match
  2307. MOVQ 48(SP), R14
  2308. SUBQ CX, R14
  2309. CMPQ R13, CX
  2310. JG copy_all_from_history
  2311. MOVQ R13, CX
  2312. SUBQ $0x10, CX
  2313. JB copy_4_small
  2314. copy_4_loop:
  2315. MOVUPS (R14), X0
  2316. MOVUPS X0, (R9)
  2317. ADDQ $0x10, R14
  2318. ADDQ $0x10, R9
  2319. SUBQ $0x10, CX
  2320. JAE copy_4_loop
  2321. LEAQ 16(R14)(CX*1), R14
  2322. LEAQ 16(R9)(CX*1), R9
  2323. MOVUPS -16(R14), X0
  2324. MOVUPS X0, -16(R9)
  2325. JMP copy_4_end
  2326. copy_4_small:
  2327. CMPQ R13, $0x03
  2328. JE copy_4_move_3
  2329. CMPQ R13, $0x08
  2330. JB copy_4_move_4through7
  2331. JMP copy_4_move_8through16
  2332. copy_4_move_3:
  2333. MOVW (R14), CX
  2334. MOVB 2(R14), R12
  2335. MOVW CX, (R9)
  2336. MOVB R12, 2(R9)
  2337. ADDQ R13, R14
  2338. ADDQ R13, R9
  2339. JMP copy_4_end
  2340. copy_4_move_4through7:
  2341. MOVL (R14), CX
  2342. MOVL -4(R14)(R13*1), R12
  2343. MOVL CX, (R9)
  2344. MOVL R12, -4(R9)(R13*1)
  2345. ADDQ R13, R14
  2346. ADDQ R13, R9
  2347. JMP copy_4_end
  2348. copy_4_move_8through16:
  2349. MOVQ (R14), CX
  2350. MOVQ -8(R14)(R13*1), R12
  2351. MOVQ CX, (R9)
  2352. MOVQ R12, -8(R9)(R13*1)
  2353. ADDQ R13, R14
  2354. ADDQ R13, R9
  2355. copy_4_end:
  2356. ADDQ R13, R11
  2357. JMP handle_loop
  2358. JMP loop_finished
  2359. copy_all_from_history:
  2360. MOVQ CX, R15
  2361. SUBQ $0x10, R15
  2362. JB copy_5_small
  2363. copy_5_loop:
  2364. MOVUPS (R14), X0
  2365. MOVUPS X0, (R9)
  2366. ADDQ $0x10, R14
  2367. ADDQ $0x10, R9
  2368. SUBQ $0x10, R15
  2369. JAE copy_5_loop
  2370. LEAQ 16(R14)(R15*1), R14
  2371. LEAQ 16(R9)(R15*1), R9
  2372. MOVUPS -16(R14), X0
  2373. MOVUPS X0, -16(R9)
  2374. JMP copy_5_end
  2375. copy_5_small:
  2376. CMPQ CX, $0x03
  2377. JE copy_5_move_3
  2378. JB copy_5_move_1or2
  2379. CMPQ CX, $0x08
  2380. JB copy_5_move_4through7
  2381. JMP copy_5_move_8through16
  2382. copy_5_move_1or2:
  2383. MOVB (R14), R15
  2384. MOVB -1(R14)(CX*1), BP
  2385. MOVB R15, (R9)
  2386. MOVB BP, -1(R9)(CX*1)
  2387. ADDQ CX, R14
  2388. ADDQ CX, R9
  2389. JMP copy_5_end
  2390. copy_5_move_3:
  2391. MOVW (R14), R15
  2392. MOVB 2(R14), BP
  2393. MOVW R15, (R9)
  2394. MOVB BP, 2(R9)
  2395. ADDQ CX, R14
  2396. ADDQ CX, R9
  2397. JMP copy_5_end
  2398. copy_5_move_4through7:
  2399. MOVL (R14), R15
  2400. MOVL -4(R14)(CX*1), BP
  2401. MOVL R15, (R9)
  2402. MOVL BP, -4(R9)(CX*1)
  2403. ADDQ CX, R14
  2404. ADDQ CX, R9
  2405. JMP copy_5_end
  2406. copy_5_move_8through16:
  2407. MOVQ (R14), R15
  2408. MOVQ -8(R14)(CX*1), BP
  2409. MOVQ R15, (R9)
  2410. MOVQ BP, -8(R9)(CX*1)
  2411. ADDQ CX, R14
  2412. ADDQ CX, R9
  2413. copy_5_end:
  2414. ADDQ CX, R11
  2415. SUBQ CX, R13
  2416. // Copy match from the current buffer
  2417. copy_match:
  2418. MOVQ R9, CX
  2419. SUBQ R12, CX
  2420. // ml <= mo
  2421. CMPQ R13, R12
  2422. JA copy_overlapping_match
  2423. // Copy non-overlapping match
  2424. ADDQ R13, R11
  2425. MOVQ R9, R12
  2426. ADDQ R13, R9
  2427. copy_2:
  2428. MOVUPS (CX), X0
  2429. MOVUPS X0, (R12)
  2430. ADDQ $0x10, CX
  2431. ADDQ $0x10, R12
  2432. SUBQ $0x10, R13
  2433. JHI copy_2
  2434. JMP handle_loop
  2435. // Copy overlapping match
  2436. copy_overlapping_match:
  2437. ADDQ R13, R11
  2438. copy_slow_3:
  2439. MOVB (CX), R12
  2440. MOVB R12, (R9)
  2441. INCQ CX
  2442. INCQ R9
  2443. DECQ R13
  2444. JNZ copy_slow_3
  2445. handle_loop:
  2446. MOVQ ctx+16(FP), CX
  2447. DECQ 96(CX)
  2448. JNS sequenceDecs_decodeSync_bmi2_main_loop
  2449. loop_finished:
  2450. MOVQ br+8(FP), CX
  2451. MOVQ AX, 24(CX)
  2452. MOVB DL, 32(CX)
  2453. MOVQ BX, 8(CX)
  2454. // Update the context
  2455. MOVQ ctx+16(FP), AX
  2456. MOVQ R11, 136(AX)
  2457. MOVQ 144(AX), CX
  2458. SUBQ CX, R10
  2459. MOVQ R10, 168(AX)
  2460. // Return success
  2461. MOVQ $0x00000000, ret+24(FP)
  2462. RET
  2463. // Return with match length error
  2464. sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
  2465. MOVQ 16(SP), AX
  2466. MOVQ ctx+16(FP), CX
  2467. MOVQ AX, 216(CX)
  2468. MOVQ $0x00000001, ret+24(FP)
  2469. RET
  2470. // Return with match too long error
  2471. sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
  2472. MOVQ ctx+16(FP), AX
  2473. MOVQ 16(SP), CX
  2474. MOVQ CX, 216(AX)
  2475. MOVQ $0x00000002, ret+24(FP)
  2476. RET
  2477. // Return with match offset too long error
  2478. error_match_off_too_big:
  2479. MOVQ ctx+16(FP), AX
  2480. MOVQ 8(SP), CX
  2481. MOVQ CX, 224(AX)
  2482. MOVQ R11, 136(AX)
  2483. MOVQ $0x00000003, ret+24(FP)
  2484. RET
  2485. // Return with not enough literals error
  2486. error_not_enough_literals:
  2487. MOVQ ctx+16(FP), AX
  2488. MOVQ 24(SP), CX
  2489. MOVQ CX, 208(AX)
  2490. MOVQ $0x00000004, ret+24(FP)
  2491. RET
  2492. // Return with overread error
  2493. error_overread:
  2494. MOVQ $0x00000006, ret+24(FP)
  2495. RET
  2496. // Return with not enough output space error
  2497. error_not_enough_space:
  2498. MOVQ ctx+16(FP), AX
  2499. MOVQ 24(SP), CX
  2500. MOVQ CX, 208(AX)
  2501. MOVQ 16(SP), CX
  2502. MOVQ CX, 216(AX)
  2503. MOVQ R11, 136(AX)
  2504. MOVQ $0x00000005, ret+24(FP)
  2505. RET
  2506. // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  2507. // Requires: CMOV, SSE
  //
  // Decodes sequences from the bit reader and executes each one immediately
  // (literal copy followed by match copy) until the counter at ctx+96 runs out
  // or an error is detected. All copy helpers in this variant are length-exact:
  // they never read or write beyond the requested byte count.
  //
  // Return codes written to ret+24(FP):
  //   0 success                      1 match length / offset mismatch
  //   2 match length too big         3 match offset too big
  //   4 not enough literals          5 not enough output space
  //   6 bit reader overread
  //
  // Register roles inside the main loop:
  //   DX  bit accumulator            BX  number of bits already consumed from DX
  //   SI  input bytes still unread   R13 input read pointer (moves backwards)
  //   DI  literal-length state       R8  match-length state
  //   R9  offset state               R10 output write pointer
  //   R11 literals read pointer      R12 output position (byte count)
  //
  // Stack slots:
  //   0(SP)  saved input read pointer   8(SP)  match offset (mo)
  //   16(SP) match length (ml)          24(SP) literal length (ll)
  //   32(SP) pointer one past the output capacity
  //   40(SP) history length             48(SP) pointer one past the history
  //   56(SP) window size
  //
  // State entry layout, as consumed by the code below:
  //   bits 0-7   bit count read for the next-state transition
  //   bits 8-15  extra-bit count for the decoded value
  //   bits 16-31 next-state base
  //   bits 32-63 baseline of the decoded value
  //
  // Struct field names mentioned in comments are inferred from offsets and the
  // generator's own comments; confirm against the Go struct definitions.
  2508. TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
  2509. MOVQ br+8(FP), CX
  2510. MOVQ 24(CX), DX // DX = bit accumulator
  2511. MOVBQZX 32(CX), BX // BX = bits already consumed (single byte, zero-extended)
  2512. MOVQ (CX), AX // AX = input base pointer
  2513. MOVQ 8(CX), SI // SI = unread input byte count
  2514. ADDQ SI, AX
  2515. MOVQ AX, (SP) // 0(SP) = base + unread count = current read pointer
  2516. MOVQ ctx+16(FP), AX
  2517. MOVQ 72(AX), DI // literal-length state
  2518. MOVQ 80(AX), R8 // match-length state
  2519. MOVQ 88(AX), R9 // offset state
  2520. XORQ CX, CX
  2521. MOVQ CX, 8(SP) // mo = 0
  2522. MOVQ CX, 16(SP) // ml = 0
  2523. MOVQ CX, 24(SP) // ll = 0
  2524. MOVQ 112(AX), R10 // output base pointer
  2525. MOVQ 128(AX), CX // output capacity
  2526. MOVQ CX, 32(SP)
  2527. MOVQ 144(AX), R11 // literals pointer
  2528. MOVQ 136(AX), R12 // output position
  2529. MOVQ 200(AX), CX // window size
  2530. MOVQ CX, 56(SP)
  2531. MOVQ 176(AX), CX // history base pointer
  2532. MOVQ CX, 48(SP)
  2533. MOVQ 184(AX), AX // history length
  2534. MOVQ AX, 40(SP)
  2535. MOVQ 40(SP), AX
  2536. ADDQ AX, 48(SP) // 48(SP) = history base + length = one past the history
  2537. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  2538. ADDQ R10, 32(SP)
  2539. // outBase += outPosition
  2540. ADDQ R12, R10
  2541. sequenceDecs_decodeSync_safe_amd64_main_loop:
  2542. MOVQ (SP), R13 // reload the input read pointer
  2543. // Fill bitreader to have enough for the offset and match length.
  2544. CMPQ SI, $0x08
  2545. JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte // fewer than 8 bytes left: refill one byte at a time
  2546. MOVQ BX, AX
  2547. SHRQ $0x03, AX // AX = whole bytes consumed so far
  2548. SUBQ AX, R13 // step the read pointer back by that many bytes
  2549. MOVQ (R13), DX // reload 8 bytes into the accumulator
  2550. SUBQ AX, SI
  2551. ANDQ $0x07, BX // keep only the sub-byte part of the consumed count
  2552. JMP sequenceDecs_decodeSync_safe_amd64_fill_end
  2553. sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
  2554. CMPQ SI, $0x00
  2555. JLE sequenceDecs_decodeSync_safe_amd64_fill_check_overread // input exhausted
  2556. CMPQ BX, $0x07
  2557. JLE sequenceDecs_decodeSync_safe_amd64_fill_end // less than a whole byte consumed: nothing to refill
  2558. SHLQ $0x08, DX // make room for one byte at the bottom
  2559. SUBQ $0x01, R13
  2560. SUBQ $0x01, SI
  2561. SUBQ $0x08, BX
  2562. MOVBQZX (R13), AX
  2563. ORQ AX, DX
  2564. JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
  2565. sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
  2566. CMPQ BX, $0x40
  2567. JA error_overread // more than 64 bits consumed with no input left
  2568. sequenceDecs_decodeSync_safe_amd64_fill_end:
  2569. // Update offset
  2570. MOVQ R9, AX
  2571. MOVQ BX, CX
  2572. MOVQ DX, R14
  2573. SHLQ CL, R14 // drop the bits that were already consumed
  2574. MOVB AH, CL // CL = extra-bit count (bits 8-15 of the state entry)
  2575. SHRQ $0x20, AX // AX = baseline (upper 32 bits of the state entry)
  2576. TESTQ CX, CX
  2577. JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero // no extra bits: value is the baseline
  2578. ADDQ CX, BX // account for the bits being read
  2579. CMPQ BX, $0x40
  2580. JA sequenceDecs_decodeSync_safe_amd64_of_update_zero // would read past the accumulator: add nothing
  2581. CMPQ CX, $0x40
  2582. JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
  2583. NEGQ CX
  2584. SHRQ CL, R14 // shift count is taken mod 64, so this keeps the top n bits
  2585. ADDQ R14, AX
  2586. sequenceDecs_decodeSync_safe_amd64_of_update_zero:
  2587. MOVQ AX, 8(SP) // mo
  2588. // Update match length
  2589. MOVQ R8, AX
  2590. MOVQ BX, CX
  2591. MOVQ DX, R14
  2592. SHLQ CL, R14 // drop the bits that were already consumed
  2593. MOVB AH, CL // CL = extra-bit count
  2594. SHRQ $0x20, AX // AX = baseline
  2595. TESTQ CX, CX
  2596. JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2597. ADDQ CX, BX
  2598. CMPQ BX, $0x40
  2599. JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2600. CMPQ CX, $0x40
  2601. JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
  2602. NEGQ CX
  2603. SHRQ CL, R14
  2604. ADDQ R14, AX
  2605. sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
  2606. MOVQ AX, 16(SP) // ml
  2607. // Fill bitreader to have enough for the remaining
  2608. CMPQ SI, $0x08
  2609. JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  2610. MOVQ BX, AX
  2611. SHRQ $0x03, AX
  2612. SUBQ AX, R13
  2613. MOVQ (R13), DX
  2614. SUBQ AX, SI
  2615. ANDQ $0x07, BX
  2616. JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2617. sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
  2618. CMPQ SI, $0x00
  2619. JLE sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
  2620. CMPQ BX, $0x07
  2621. JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
  2622. SHLQ $0x08, DX
  2623. SUBQ $0x01, R13
  2624. SUBQ $0x01, SI
  2625. SUBQ $0x08, BX
  2626. MOVBQZX (R13), AX
  2627. ORQ AX, DX
  2628. JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
  2629. sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
  2630. CMPQ BX, $0x40
  2631. JA error_overread
  2632. sequenceDecs_decodeSync_safe_amd64_fill_2_end:
  2633. // Update literal length
  2634. MOVQ DI, AX
  2635. MOVQ BX, CX
  2636. MOVQ DX, R14
  2637. SHLQ CL, R14 // drop the bits that were already consumed
  2638. MOVB AH, CL // CL = extra-bit count
  2639. SHRQ $0x20, AX // AX = baseline
  2640. TESTQ CX, CX
  2641. JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2642. ADDQ CX, BX
  2643. CMPQ BX, $0x40
  2644. JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2645. CMPQ CX, $0x40
  2646. JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
  2647. NEGQ CX
  2648. SHRQ CL, R14
  2649. ADDQ R14, AX
  2650. sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
  2651. MOVQ AX, 24(SP) // ll
  2652. // Save the input pointer and remember the offset extra-bit count for the
  // offset adjustment below (no refill happens here), then update the states.
  2653. MOVQ R13, (SP)
  2654. MOVQ R9, AX
  2655. SHRQ $0x08, AX
  2656. MOVBQZX AL, AX // AX = extra-bit count of the current offset state
  2657. MOVQ ctx+16(FP), CX
  2658. CMPQ 96(CX), $0x00
  2659. JZ sequenceDecs_decodeSync_safe_amd64_skip_update // counter is zero: last sequence, states are not advanced
  2660. // Update Literal Length State
  2661. MOVBQZX DI, R13 // R13 = number of bits to read for the next state
  2662. SHRL $0x10, DI // DI = next-state base (32-bit shift clears the upper half)
  2663. LEAQ (BX)(R13*1), CX
  2664. MOVQ DX, R14
  2665. MOVQ CX, BX // consumed bits += R13
  2666. ROLQ CL, R14 // rotate the freshly consumed bits into the low end
  2667. MOVL $0x00000001, R15
  2668. MOVB R13, CL
  2669. SHLL CL, R15
  2670. DECL R15 // R15 = (1 << nbits) - 1
  2671. ANDQ R15, R14
  2672. ADDQ R14, DI // table index = base + bits read
  2673. // Load ctx.llTable
  2674. MOVQ ctx+16(FP), CX
  2675. MOVQ (CX), CX
  2676. MOVQ (CX)(DI*8), DI // new literal-length state entry
  2677. // Update Match Length State
  2678. MOVBQZX R8, R13
  2679. SHRL $0x10, R8
  2680. LEAQ (BX)(R13*1), CX
  2681. MOVQ DX, R14
  2682. MOVQ CX, BX
  2683. ROLQ CL, R14
  2684. MOVL $0x00000001, R15
  2685. MOVB R13, CL
  2686. SHLL CL, R15
  2687. DECL R15
  2688. ANDQ R15, R14
  2689. ADDQ R14, R8
  2690. // Load ctx.mlTable
  2691. MOVQ ctx+16(FP), CX
  2692. MOVQ 24(CX), CX
  2693. MOVQ (CX)(R8*8), R8 // new match-length state entry
  2694. // Update Offset State
  2695. MOVBQZX R9, R13
  2696. SHRL $0x10, R9
  2697. LEAQ (BX)(R13*1), CX
  2698. MOVQ DX, R14
  2699. MOVQ CX, BX
  2700. ROLQ CL, R14
  2701. MOVL $0x00000001, R15
  2702. MOVB R13, CL
  2703. SHLL CL, R15
  2704. DECL R15
  2705. ANDQ R15, R14
  2706. ADDQ R14, R9
  2707. // Load ctx.ofTable
  2708. MOVQ ctx+16(FP), CX
  2709. MOVQ 48(CX), CX
  2710. MOVQ (CX)(R9*8), R9 // new offset state entry
  2711. sequenceDecs_decodeSync_safe_amd64_skip_update:
  2712. // Adjust offset
  // s+144, s+152 and s+160 are three consecutive 8-byte slots holding the most
  // recent offsets (presumably s.prevOffset[0..2]).
  2713. MOVQ s+0(FP), CX
  2714. MOVQ 8(SP), R13 // R13 = mo
  2715. CMPQ AX, $0x01
  2716. JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0 // 0 or 1 offset extra bits: repeat-offset handling
  // More than one extra bit: mo is a new offset; push it onto the history.
  2717. MOVUPS 144(CX), X0 // X0 = slots 0 and 1
  2718. MOVQ R13, 144(CX) // slot 0 = mo
  2719. MOVUPS X0, 152(CX) // slots 1 and 2 = old slots 0 and 1
  2720. JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
  2721. sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
  2722. CMPQ 24(SP), $0x00000000
  2723. JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
  2724. INCQ R13 // ll == 0: the repeat code is shifted by one
  2725. JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  2726. sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
  2727. TESTQ R13, R13
  2728. JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
  2729. MOVQ 144(CX), R13 // mo == 0: reuse slot 0, history unchanged
  2730. JMP sequenceDecs_decodeSync_safe_amd64_after_adjust
  2731. sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
  2732. MOVQ R13, AX // AX = slot index to read
  2733. XORQ R14, R14
  2734. MOVQ $-1, R15
  2735. CMPQ R13, $0x03
  2736. CMOVQEQ R14, AX // mo == 3: read slot 0 ...
  2737. CMOVQEQ R15, R14 // ... and add -1 to it
  2738. ADDQ 144(CX)(AX*8), R14 // R14 = candidate offset; flags feed the JNZ below
  2739. JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
  2740. MOVQ $0x00000001, R14 // a zero candidate is replaced by 1
  2741. sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
  2742. CMPQ R13, $0x01
  2743. JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip // mo == 1: slot 2 is left untouched
  2744. MOVQ 152(CX), AX
  2745. MOVQ AX, 160(CX) // slot 2 = slot 1
  2746. sequenceDecs_decodeSync_safe_amd64_adjust_skip:
  2747. MOVQ 144(CX), AX
  2748. MOVQ AX, 152(CX) // slot 1 = slot 0
  2749. MOVQ R14, 144(CX) // slot 0 = candidate
  2750. MOVQ R14, R13
  2751. sequenceDecs_decodeSync_safe_amd64_after_adjust:
  2752. MOVQ R13, 8(SP) // store the final mo
  2753. // Check values
  2754. MOVQ 16(SP), AX // AX = ml
  2755. MOVQ 24(SP), CX // CX = ll
  2756. LEAQ (AX)(CX*1), R14
  2757. MOVQ s+0(FP), R15
  2758. ADDQ R14, 256(R15) // accumulate ll + ml at s+256 (presumably s.seqSize)
  2759. MOVQ ctx+16(FP), R14
  2760. SUBQ CX, 104(R14) // remaining literals -= ll
  2761. JS error_not_enough_literals
  2762. CMPQ AX, $0x00020002
  2763. JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
  2764. TESTQ R13, R13
  2765. JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
  2766. TESTQ AX, AX
  2767. JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch // mo == 0 but ml != 0
  2768. sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
  2769. MOVQ 24(SP), AX // AX = ll
  2770. MOVQ 8(SP), CX // CX = mo
  2771. MOVQ 16(SP), R13 // R13 = ml
  2772. // Check if we have enough space in s.out
  2773. LEAQ (AX)(R13*1), R14
  2774. ADDQ R10, R14 // R14 = write pointer after this sequence
  2775. CMPQ R14, 32(SP)
  2776. JA error_not_enough_space
  2777. // Copy literals
  2778. TESTQ AX, AX
  2779. JZ check_offset
  2780. MOVQ AX, R14
  2781. SUBQ $0x10, R14
  2782. JB copy_1_small // ll < 16
  2783. copy_1_loop:
  2784. MOVUPS (R11), X0
  2785. MOVUPS X0, (R10)
  2786. ADDQ $0x10, R11
  2787. ADDQ $0x10, R10
  2788. SUBQ $0x10, R14
  2789. JAE copy_1_loop
  // R14 is now in [-16, -1]; move both pointers to the exact end and copy the
  // final 16 bytes again, overlapping what the loop already wrote.
  2790. LEAQ 16(R11)(R14*1), R11
  2791. LEAQ 16(R10)(R14*1), R10
  2792. MOVUPS -16(R11), X0
  2793. MOVUPS X0, -16(R10)
  2794. JMP copy_1_end
  2795. copy_1_small:
  2796. CMPQ AX, $0x03
  2797. JE copy_1_move_3
  2798. JB copy_1_move_1or2
  2799. CMPQ AX, $0x08
  2800. JB copy_1_move_4through7
  2801. JMP copy_1_move_8through16
  2802. copy_1_move_1or2:
  2803. MOVB (R11), R14 // first byte
  2804. MOVB -1(R11)(AX*1), R15 // last byte (same byte when ll == 1)
  2805. MOVB R14, (R10)
  2806. MOVB R15, -1(R10)(AX*1)
  2807. ADDQ AX, R11
  2808. ADDQ AX, R10
  2809. JMP copy_1_end
  2810. copy_1_move_3:
  2811. MOVW (R11), R14
  2812. MOVB 2(R11), R15
  2813. MOVW R14, (R10)
  2814. MOVB R15, 2(R10)
  2815. ADDQ AX, R11
  2816. ADDQ AX, R10
  2817. JMP copy_1_end
  2818. copy_1_move_4through7:
  2819. MOVL (R11), R14 // first 4 bytes
  2820. MOVL -4(R11)(AX*1), R15 // last 4 bytes, may overlap the first
  2821. MOVL R14, (R10)
  2822. MOVL R15, -4(R10)(AX*1)
  2823. ADDQ AX, R11
  2824. ADDQ AX, R10
  2825. JMP copy_1_end
  2826. copy_1_move_8through16:
  2827. MOVQ (R11), R14 // first 8 bytes
  2828. MOVQ -8(R11)(AX*1), R15 // last 8 bytes, may overlap the first
  2829. MOVQ R14, (R10)
  2830. MOVQ R15, -8(R10)(AX*1)
  2831. ADDQ AX, R11
  2832. ADDQ AX, R10
  2833. copy_1_end:
  2834. ADDQ AX, R12 // output position += ll
  2835. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  2836. check_offset:
  2837. MOVQ R12, AX
  2838. ADDQ 40(SP), AX // AX = output position + history length
  2839. CMPQ CX, AX
  2840. JG error_match_off_too_big
  2841. CMPQ CX, 56(SP)
  2842. JG error_match_off_too_big
  2843. // Copy match from history
  2844. MOVQ CX, AX
  2845. SUBQ R12, AX // AX = mo - output position = bytes that lie in the history
  2846. JLS copy_match // mo <= output position: match is entirely in the current output
  2847. MOVQ 48(SP), R14
  2848. SUBQ AX, R14 // R14 = source pointer, AX bytes before the end of the history
  2849. CMPQ R13, AX
  2850. JG copy_all_from_history // match runs past the end of the history
  // The whole match (ml bytes) comes from the history.
  2851. MOVQ R13, AX
  2852. SUBQ $0x10, AX
  2853. JB copy_4_small // ml < 16
  2854. copy_4_loop:
  2855. MOVUPS (R14), X0
  2856. MOVUPS X0, (R10)
  2857. ADDQ $0x10, R14
  2858. ADDQ $0x10, R10
  2859. SUBQ $0x10, AX
  2860. JAE copy_4_loop
  2861. LEAQ 16(R14)(AX*1), R14 // AX in [-16, -1]: step to the exact end
  2862. LEAQ 16(R10)(AX*1), R10
  2863. MOVUPS -16(R14), X0 // re-copy the final 16 bytes, overlapping the loop's output
  2864. MOVUPS X0, -16(R10)
  2865. JMP copy_4_end
  // NOTE(review): unlike copy_1/copy_2/copy_5 there is no 1-2 byte case here, so
  // ml < 3 would take the 4-7 byte path; presumably ml is always >= 3 at this
  // point - confirm against the generator.
  2866. copy_4_small:
  2867. CMPQ R13, $0x03
  2868. JE copy_4_move_3
  2869. CMPQ R13, $0x08
  2870. JB copy_4_move_4through7
  2871. JMP copy_4_move_8through16
  2872. copy_4_move_3:
  2873. MOVW (R14), AX
  2874. MOVB 2(R14), CL
  2875. MOVW AX, (R10)
  2876. MOVB CL, 2(R10)
  2877. ADDQ R13, R14
  2878. ADDQ R13, R10
  2879. JMP copy_4_end
  2880. copy_4_move_4through7:
  2881. MOVL (R14), AX
  2882. MOVL -4(R14)(R13*1), CX
  2883. MOVL AX, (R10)
  2884. MOVL CX, -4(R10)(R13*1)
  2885. ADDQ R13, R14
  2886. ADDQ R13, R10
  2887. JMP copy_4_end
  2888. copy_4_move_8through16:
  2889. MOVQ (R14), AX
  2890. MOVQ -8(R14)(R13*1), CX
  2891. MOVQ AX, (R10)
  2892. MOVQ CX, -8(R10)(R13*1)
  2893. ADDQ R13, R14
  2894. ADDQ R13, R10
  2895. copy_4_end:
  2896. ADDQ R13, R12 // output position += ml
  2897. JMP handle_loop
  2898. JMP loop_finished // unreachable: follows an unconditional JMP and has no label
  2899. copy_all_from_history:
  // Copy the AX bytes that remain in the history; the rest of the match is
  // taken from the current output by copy_match below.
  2900. MOVQ AX, R15
  2901. SUBQ $0x10, R15
  2902. JB copy_5_small // fewer than 16 history bytes
  2903. copy_5_loop:
  2904. MOVUPS (R14), X0
  2905. MOVUPS X0, (R10)
  2906. ADDQ $0x10, R14
  2907. ADDQ $0x10, R10
  2908. SUBQ $0x10, R15
  2909. JAE copy_5_loop
  2910. LEAQ 16(R14)(R15*1), R14 // R15 in [-16, -1]: step to the exact end
  2911. LEAQ 16(R10)(R15*1), R10
  2912. MOVUPS -16(R14), X0
  2913. MOVUPS X0, -16(R10)
  2914. JMP copy_5_end
  2915. copy_5_small:
  2916. CMPQ AX, $0x03
  2917. JE copy_5_move_3
  2918. JB copy_5_move_1or2
  2919. CMPQ AX, $0x08
  2920. JB copy_5_move_4through7
  2921. JMP copy_5_move_8through16
  2922. copy_5_move_1or2:
  2923. MOVB (R14), R15
  2924. MOVB -1(R14)(AX*1), BP // BP is used as a plain scratch register here
  2925. MOVB R15, (R10)
  2926. MOVB BP, -1(R10)(AX*1)
  2927. ADDQ AX, R14
  2928. ADDQ AX, R10
  2929. JMP copy_5_end
  2930. copy_5_move_3:
  2931. MOVW (R14), R15
  2932. MOVB 2(R14), BP
  2933. MOVW R15, (R10)
  2934. MOVB BP, 2(R10)
  2935. ADDQ AX, R14
  2936. ADDQ AX, R10
  2937. JMP copy_5_end
  2938. copy_5_move_4through7:
  2939. MOVL (R14), R15
  2940. MOVL -4(R14)(AX*1), BP
  2941. MOVL R15, (R10)
  2942. MOVL BP, -4(R10)(AX*1)
  2943. ADDQ AX, R14
  2944. ADDQ AX, R10
  2945. JMP copy_5_end
  2946. copy_5_move_8through16:
  2947. MOVQ (R14), R15
  2948. MOVQ -8(R14)(AX*1), BP
  2949. MOVQ R15, (R10)
  2950. MOVQ BP, -8(R10)(AX*1)
  2951. ADDQ AX, R14
  2952. ADDQ AX, R10
  2953. copy_5_end:
  2954. ADDQ AX, R12 // output position += bytes taken from the history
  2955. SUBQ AX, R13 // ml -= bytes taken from the history
  2956. // Copy match from the current buffer
  2957. copy_match:
  2958. MOVQ R10, AX
  2959. SUBQ CX, AX // AX = source pointer = write pointer - mo
  2960. // ml <= mo
  2961. CMPQ R13, CX
  2962. JA copy_overlapping_match // ml > mo: source and destination overlap
  2963. // Copy non-overlapping match
  2964. ADDQ R13, R12 // output position += ml
  2965. MOVQ R13, CX
  2966. SUBQ $0x10, CX
  2967. JB copy_2_small // ml < 16
  2968. copy_2_loop:
  2969. MOVUPS (AX), X0
  2970. MOVUPS X0, (R10)
  2971. ADDQ $0x10, AX
  2972. ADDQ $0x10, R10
  2973. SUBQ $0x10, CX
  2974. JAE copy_2_loop
  2975. LEAQ 16(AX)(CX*1), AX // CX in [-16, -1]: step to the exact end
  2976. LEAQ 16(R10)(CX*1), R10
  2977. MOVUPS -16(AX), X0
  2978. MOVUPS X0, -16(R10)
  2979. JMP copy_2_end
  2980. copy_2_small:
  2981. CMPQ R13, $0x03
  2982. JE copy_2_move_3
  2983. JB copy_2_move_1or2
  2984. CMPQ R13, $0x08
  2985. JB copy_2_move_4through7
  2986. JMP copy_2_move_8through16
  2987. copy_2_move_1or2:
  2988. MOVB (AX), CL
  2989. MOVB -1(AX)(R13*1), R14
  2990. MOVB CL, (R10)
  2991. MOVB R14, -1(R10)(R13*1)
  2992. ADDQ R13, AX
  2993. ADDQ R13, R10
  2994. JMP copy_2_end
  2995. copy_2_move_3:
  2996. MOVW (AX), CX
  2997. MOVB 2(AX), R14
  2998. MOVW CX, (R10)
  2999. MOVB R14, 2(R10)
  3000. ADDQ R13, AX
  3001. ADDQ R13, R10
  3002. JMP copy_2_end
  3003. copy_2_move_4through7:
  3004. MOVL (AX), CX
  3005. MOVL -4(AX)(R13*1), R14
  3006. MOVL CX, (R10)
  3007. MOVL R14, -4(R10)(R13*1)
  3008. ADDQ R13, AX
  3009. ADDQ R13, R10
  3010. JMP copy_2_end
  3011. copy_2_move_8through16:
  3012. MOVQ (AX), CX
  3013. MOVQ -8(AX)(R13*1), R14
  3014. MOVQ CX, (R10)
  3015. MOVQ R14, -8(R10)(R13*1)
  3016. ADDQ R13, AX
  3017. ADDQ R13, R10
  3018. copy_2_end:
  3019. JMP handle_loop
  3020. // Copy overlapping match
  3021. copy_overlapping_match:
  3022. ADDQ R13, R12 // output position += ml
  3023. copy_slow_3:
  // Byte-by-byte so that bytes written earlier in this loop can be re-read.
  3024. MOVB (AX), CL
  3025. MOVB CL, (R10)
  3026. INCQ AX
  3027. INCQ R10
  3028. DECQ R13
  3029. JNZ copy_slow_3
  3030. handle_loop:
  3031. MOVQ ctx+16(FP), AX
  3032. DECQ 96(AX) // one sequence done
  3033. JNS sequenceDecs_decodeSync_safe_amd64_main_loop // keep going while the counter is still >= 0
  3034. loop_finished:
  // Write the bit reader state back to br.
  3035. MOVQ br+8(FP), AX
  3036. MOVQ DX, 24(AX) // bit accumulator
  3037. MOVB BL, 32(AX) // bits consumed
  3038. MOVQ SI, 8(AX) // unread input byte count
  3039. // Update the context
  3040. MOVQ ctx+16(FP), AX
  3041. MOVQ R12, 136(AX) // output position
  3042. MOVQ 144(AX), CX
  3043. SUBQ CX, R11
  3044. MOVQ R11, 168(AX) // literals consumed = literals pointer - value at ctx+144
  3045. // Return success
  3046. MOVQ $0x00000000, ret+24(FP)
  3047. RET
  3048. // Return with match length error
  3049. sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
  3050. MOVQ 16(SP), AX
  3051. MOVQ ctx+16(FP), CX
  3052. MOVQ AX, 216(CX) // report ml
  3053. MOVQ $0x00000001, ret+24(FP)
  3054. RET
  3055. // Return with match too long error
  3056. sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
  3057. MOVQ ctx+16(FP), AX
  3058. MOVQ 16(SP), CX
  3059. MOVQ CX, 216(AX) // report ml
  3060. MOVQ $0x00000002, ret+24(FP)
  3061. RET
  3062. // Return with match offset too long error
  3063. error_match_off_too_big:
  3064. MOVQ ctx+16(FP), AX
  3065. MOVQ 8(SP), CX
  3066. MOVQ CX, 224(AX) // report mo
  3067. MOVQ R12, 136(AX) // report the output position reached
  3068. MOVQ $0x00000003, ret+24(FP)
  3069. RET
  3070. // Return with not enough literals error
  3071. error_not_enough_literals:
  3072. MOVQ ctx+16(FP), AX
  3073. MOVQ 24(SP), CX
  3074. MOVQ CX, 208(AX) // report ll
  3075. MOVQ $0x00000004, ret+24(FP)
  3076. RET
  3077. // Return with overread error
  3078. error_overread:
  3079. MOVQ $0x00000006, ret+24(FP)
  3080. RET
  3081. // Return with not enough output space error
  3082. error_not_enough_space:
  3083. MOVQ ctx+16(FP), AX
  3084. MOVQ 24(SP), CX
  3085. MOVQ CX, 208(AX) // report ll
  3086. MOVQ 16(SP), CX
  3087. MOVQ CX, 216(AX) // report ml
  3088. MOVQ R12, 136(AX) // report the output position reached
  3089. MOVQ $0x00000005, ret+24(FP)
  3090. RET
  3091. // func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
  3092. // Requires: BMI, BMI2, CMOV, SSE
  3093. TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
  3094. MOVQ br+8(FP), BX
  3095. MOVQ 24(BX), AX
  3096. MOVBQZX 32(BX), DX
  3097. MOVQ (BX), CX
  3098. MOVQ 8(BX), BX
  3099. ADDQ BX, CX
  3100. MOVQ CX, (SP)
  3101. MOVQ ctx+16(FP), CX
  3102. MOVQ 72(CX), SI
  3103. MOVQ 80(CX), DI
  3104. MOVQ 88(CX), R8
  3105. XORQ R9, R9
  3106. MOVQ R9, 8(SP)
  3107. MOVQ R9, 16(SP)
  3108. MOVQ R9, 24(SP)
  3109. MOVQ 112(CX), R9
  3110. MOVQ 128(CX), R10
  3111. MOVQ R10, 32(SP)
  3112. MOVQ 144(CX), R10
  3113. MOVQ 136(CX), R11
  3114. MOVQ 200(CX), R12
  3115. MOVQ R12, 56(SP)
  3116. MOVQ 176(CX), R12
  3117. MOVQ R12, 48(SP)
  3118. MOVQ 184(CX), CX
  3119. MOVQ CX, 40(SP)
  3120. MOVQ 40(SP), CX
  3121. ADDQ CX, 48(SP)
  3122. // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
  3123. ADDQ R9, 32(SP)
  3124. // outBase += outPosition
  3125. ADDQ R11, R9
  3126. sequenceDecs_decodeSync_safe_bmi2_main_loop:
  3127. MOVQ (SP), R12
  3128. // Fill bitreader to have enough for the offset and match length.
  3129. CMPQ BX, $0x08
  3130. JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3131. MOVQ DX, CX
  3132. SHRQ $0x03, CX
  3133. SUBQ CX, R12
  3134. MOVQ (R12), AX
  3135. SUBQ CX, BX
  3136. ANDQ $0x07, DX
  3137. JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
  3138. sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
  3139. CMPQ BX, $0x00
  3140. JLE sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
  3141. CMPQ DX, $0x07
  3142. JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
  3143. SHLQ $0x08, AX
  3144. SUBQ $0x01, R12
  3145. SUBQ $0x01, BX
  3146. SUBQ $0x08, DX
  3147. MOVBQZX (R12), CX
  3148. ORQ CX, AX
  3149. JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
  3150. sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
  3151. CMPQ DX, $0x40
  3152. JA error_overread
  3153. sequenceDecs_decodeSync_safe_bmi2_fill_end:
  3154. // Update offset
  3155. MOVQ $0x00000808, CX
  3156. BEXTRQ CX, R8, R13
  3157. MOVQ AX, R14
  3158. LEAQ (DX)(R13*1), CX
  3159. ROLQ CL, R14
  3160. BZHIQ R13, R14, R14
  3161. MOVQ CX, DX
  3162. MOVQ R8, CX
  3163. SHRQ $0x20, CX
  3164. ADDQ R14, CX
  3165. MOVQ CX, 8(SP)
  3166. // Update match length
  3167. MOVQ $0x00000808, CX
  3168. BEXTRQ CX, DI, R13
  3169. MOVQ AX, R14
  3170. LEAQ (DX)(R13*1), CX
  3171. ROLQ CL, R14
  3172. BZHIQ R13, R14, R14
  3173. MOVQ CX, DX
  3174. MOVQ DI, CX
  3175. SHRQ $0x20, CX
  3176. ADDQ R14, CX
  3177. MOVQ CX, 16(SP)
  3178. // Fill bitreader to have enough for the remaining
  3179. CMPQ BX, $0x08
  3180. JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3181. MOVQ DX, CX
  3182. SHRQ $0x03, CX
  3183. SUBQ CX, R12
  3184. MOVQ (R12), AX
  3185. SUBQ CX, BX
  3186. ANDQ $0x07, DX
  3187. JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3188. sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
  3189. CMPQ BX, $0x00
  3190. JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
  3191. CMPQ DX, $0x07
  3192. JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
  3193. SHLQ $0x08, AX
  3194. SUBQ $0x01, R12
  3195. SUBQ $0x01, BX
  3196. SUBQ $0x08, DX
  3197. MOVBQZX (R12), CX
  3198. ORQ CX, AX
  3199. JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
  3200. sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
  3201. CMPQ DX, $0x40
  3202. JA error_overread
  3203. sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
  3204. // Update literal length
  3205. MOVQ $0x00000808, CX
  3206. BEXTRQ CX, SI, R13
  3207. MOVQ AX, R14
  3208. LEAQ (DX)(R13*1), CX
  3209. ROLQ CL, R14
  3210. BZHIQ R13, R14, R14
  3211. MOVQ CX, DX
  3212. MOVQ SI, CX
  3213. SHRQ $0x20, CX
  3214. ADDQ R14, CX
  3215. MOVQ CX, 24(SP)
  3216. // Fill bitreader for state updates
  3217. MOVQ R12, (SP)
  3218. MOVQ $0x00000808, CX
  3219. BEXTRQ CX, R8, R12
  3220. MOVQ ctx+16(FP), CX
  3221. CMPQ 96(CX), $0x00
  3222. JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
  3223. LEAQ (SI)(DI*1), R13
  3224. ADDQ R8, R13
  3225. MOVBQZX R13, R13
  3226. LEAQ (DX)(R13*1), CX
  3227. MOVQ AX, R14
  3228. MOVQ CX, DX
  3229. ROLQ CL, R14
  3230. BZHIQ R13, R14, R14
  3231. // Update Offset State
  3232. BZHIQ R8, R14, CX
  3233. SHRXQ R8, R14, R14
  3234. SHRL $0x10, R8
  3235. ADDQ CX, R8
  3236. // Load ctx.ofTable
  3237. MOVQ ctx+16(FP), CX
  3238. MOVQ 48(CX), CX
  3239. MOVQ (CX)(R8*8), R8
  3240. // Update Match Length State
  3241. BZHIQ DI, R14, CX
  3242. SHRXQ DI, R14, R14
  3243. SHRL $0x10, DI
  3244. ADDQ CX, DI
  3245. // Load ctx.mlTable
  3246. MOVQ ctx+16(FP), CX
  3247. MOVQ 24(CX), CX
  3248. MOVQ (CX)(DI*8), DI
  3249. // Update Literal Length State
  3250. BZHIQ SI, R14, CX
  3251. SHRL $0x10, SI
  3252. ADDQ CX, SI
  3253. // Load ctx.llTable
  3254. MOVQ ctx+16(FP), CX
  3255. MOVQ (CX), CX
  3256. MOVQ (CX)(SI*8), SI
  3257. sequenceDecs_decodeSync_safe_bmi2_skip_update:
  3258. // Adjust offset
  3259. MOVQ s+0(FP), CX
  3260. MOVQ 8(SP), R13
  3261. CMPQ R12, $0x01
  3262. JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
  3263. MOVUPS 144(CX), X0
  3264. MOVQ R13, 144(CX)
  3265. MOVUPS X0, 152(CX)
  3266. JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3267. sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
  3268. CMPQ 24(SP), $0x00000000
  3269. JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
  3270. INCQ R13
  3271. JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3272. sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
  3273. TESTQ R13, R13
  3274. JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
  3275. MOVQ 144(CX), R13
  3276. JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust
  3277. sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
  3278. MOVQ R13, R12
  3279. XORQ R14, R14
  3280. MOVQ $-1, R15
  3281. CMPQ R13, $0x03
  3282. CMOVQEQ R14, R12
  3283. CMOVQEQ R15, R14
  3284. ADDQ 144(CX)(R12*8), R14
  3285. JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
  3286. MOVQ $0x00000001, R14
  3287. sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
  3288. CMPQ R13, $0x01
  3289. JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
  3290. MOVQ 152(CX), R12
  3291. MOVQ R12, 160(CX)
  3292. sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
  3293. MOVQ 144(CX), R12
  3294. MOVQ R12, 152(CX)
  3295. MOVQ R14, 144(CX)
  3296. MOVQ R14, R13
  3297. sequenceDecs_decodeSync_safe_bmi2_after_adjust:
  3298. MOVQ R13, 8(SP)
  3299. // Check values
  3300. MOVQ 16(SP), CX
  3301. MOVQ 24(SP), R12
  3302. LEAQ (CX)(R12*1), R14
  3303. MOVQ s+0(FP), R15
  3304. ADDQ R14, 256(R15)
  3305. MOVQ ctx+16(FP), R14
  3306. SUBQ R12, 104(R14)
  3307. JS error_not_enough_literals
  3308. CMPQ CX, $0x00020002
  3309. JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
  3310. TESTQ R13, R13
  3311. JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
  3312. TESTQ CX, CX
  3313. JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
  3314. sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
  3315. MOVQ 24(SP), CX
  3316. MOVQ 8(SP), R12
  3317. MOVQ 16(SP), R13
  3318. // Check if we have enough space in s.out
  3319. LEAQ (CX)(R13*1), R14
  3320. ADDQ R9, R14
  3321. CMPQ R14, 32(SP)
  3322. JA error_not_enough_space
  3323. // Copy literals
  3324. TESTQ CX, CX
  3325. JZ check_offset
  3326. MOVQ CX, R14
  3327. SUBQ $0x10, R14
  3328. JB copy_1_small
  3329. copy_1_loop:
  3330. MOVUPS (R10), X0
  3331. MOVUPS X0, (R9)
  3332. ADDQ $0x10, R10
  3333. ADDQ $0x10, R9
  3334. SUBQ $0x10, R14
  3335. JAE copy_1_loop
  3336. LEAQ 16(R10)(R14*1), R10
  3337. LEAQ 16(R9)(R14*1), R9
  3338. MOVUPS -16(R10), X0
  3339. MOVUPS X0, -16(R9)
  3340. JMP copy_1_end
  3341. copy_1_small:
  3342. CMPQ CX, $0x03
  3343. JE copy_1_move_3
  3344. JB copy_1_move_1or2
  3345. CMPQ CX, $0x08
  3346. JB copy_1_move_4through7
  3347. JMP copy_1_move_8through16
  3348. copy_1_move_1or2:
  3349. MOVB (R10), R14
  3350. MOVB -1(R10)(CX*1), R15
  3351. MOVB R14, (R9)
  3352. MOVB R15, -1(R9)(CX*1)
  3353. ADDQ CX, R10
  3354. ADDQ CX, R9
  3355. JMP copy_1_end
  3356. copy_1_move_3:
  3357. MOVW (R10), R14
  3358. MOVB 2(R10), R15
  3359. MOVW R14, (R9)
  3360. MOVB R15, 2(R9)
  3361. ADDQ CX, R10
  3362. ADDQ CX, R9
  3363. JMP copy_1_end
  3364. copy_1_move_4through7:
  3365. MOVL (R10), R14
  3366. MOVL -4(R10)(CX*1), R15
  3367. MOVL R14, (R9)
  3368. MOVL R15, -4(R9)(CX*1)
  3369. ADDQ CX, R10
  3370. ADDQ CX, R9
  3371. JMP copy_1_end
  3372. copy_1_move_8through16:
  3373. MOVQ (R10), R14
  3374. MOVQ -8(R10)(CX*1), R15
  3375. MOVQ R14, (R9)
  3376. MOVQ R15, -8(R9)(CX*1)
  3377. ADDQ CX, R10
  3378. ADDQ CX, R9
  3379. copy_1_end:
  3380. ADDQ CX, R11
  3381. // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
  3382. check_offset:
  3383. MOVQ R11, CX
  3384. ADDQ 40(SP), CX
  3385. CMPQ R12, CX
  3386. JG error_match_off_too_big
  3387. CMPQ R12, 56(SP)
  3388. JG error_match_off_too_big
  3389. // Copy match from history
  3390. MOVQ R12, CX
  3391. SUBQ R11, CX
  3392. JLS copy_match
  3393. MOVQ 48(SP), R14
  3394. SUBQ CX, R14
  3395. CMPQ R13, CX
  3396. JG copy_all_from_history
  3397. MOVQ R13, CX
  3398. SUBQ $0x10, CX
  3399. JB copy_4_small
  3400. copy_4_loop:
  3401. MOVUPS (R14), X0
  3402. MOVUPS X0, (R9)
  3403. ADDQ $0x10, R14
  3404. ADDQ $0x10, R9
  3405. SUBQ $0x10, CX
  3406. JAE copy_4_loop
  3407. LEAQ 16(R14)(CX*1), R14
  3408. LEAQ 16(R9)(CX*1), R9
  3409. MOVUPS -16(R14), X0
  3410. MOVUPS X0, -16(R9)
  3411. JMP copy_4_end
  3412. copy_4_small:
  3413. CMPQ R13, $0x03
  3414. JE copy_4_move_3
  3415. CMPQ R13, $0x08
  3416. JB copy_4_move_4through7
  3417. JMP copy_4_move_8through16
  3418. copy_4_move_3:
  3419. MOVW (R14), CX
  3420. MOVB 2(R14), R12
  3421. MOVW CX, (R9)
  3422. MOVB R12, 2(R9)
  3423. ADDQ R13, R14
  3424. ADDQ R13, R9
  3425. JMP copy_4_end
  3426. copy_4_move_4through7:
  3427. MOVL (R14), CX
  3428. MOVL -4(R14)(R13*1), R12
  3429. MOVL CX, (R9)
  3430. MOVL R12, -4(R9)(R13*1)
  3431. ADDQ R13, R14
  3432. ADDQ R13, R9
  3433. JMP copy_4_end
  3434. copy_4_move_8through16:
  3435. MOVQ (R14), CX
  3436. MOVQ -8(R14)(R13*1), R12
  3437. MOVQ CX, (R9)
  3438. MOVQ R12, -8(R9)(R13*1)
  3439. ADDQ R13, R14
  3440. ADDQ R13, R9
  3441. copy_4_end:
  3442. ADDQ R13, R11
  3443. JMP handle_loop
  3444. JMP loop_finished
  3445. copy_all_from_history:
  3446. MOVQ CX, R15
  3447. SUBQ $0x10, R15
  3448. JB copy_5_small
  3449. copy_5_loop:
  3450. MOVUPS (R14), X0
  3451. MOVUPS X0, (R9)
  3452. ADDQ $0x10, R14
  3453. ADDQ $0x10, R9
  3454. SUBQ $0x10, R15
  3455. JAE copy_5_loop
  3456. LEAQ 16(R14)(R15*1), R14
  3457. LEAQ 16(R9)(R15*1), R9
  3458. MOVUPS -16(R14), X0
  3459. MOVUPS X0, -16(R9)
  3460. JMP copy_5_end
  3461. copy_5_small:
  3462. CMPQ CX, $0x03
  3463. JE copy_5_move_3
  3464. JB copy_5_move_1or2
  3465. CMPQ CX, $0x08
  3466. JB copy_5_move_4through7
  3467. JMP copy_5_move_8through16
  3468. copy_5_move_1or2:
  3469. MOVB (R14), R15
  3470. MOVB -1(R14)(CX*1), BP
  3471. MOVB R15, (R9)
  3472. MOVB BP, -1(R9)(CX*1)
  3473. ADDQ CX, R14
  3474. ADDQ CX, R9
  3475. JMP copy_5_end
  3476. copy_5_move_3:
  3477. MOVW (R14), R15
  3478. MOVB 2(R14), BP
  3479. MOVW R15, (R9)
  3480. MOVB BP, 2(R9)
  3481. ADDQ CX, R14
  3482. ADDQ CX, R9
  3483. JMP copy_5_end
  3484. copy_5_move_4through7:
  3485. MOVL (R14), R15
  3486. MOVL -4(R14)(CX*1), BP
  3487. MOVL R15, (R9)
  3488. MOVL BP, -4(R9)(CX*1)
  3489. ADDQ CX, R14
  3490. ADDQ CX, R9
  3491. JMP copy_5_end
  3492. copy_5_move_8through16:
  3493. MOVQ (R14), R15
  3494. MOVQ -8(R14)(CX*1), BP
  3495. MOVQ R15, (R9)
  3496. MOVQ BP, -8(R9)(CX*1)
  3497. ADDQ CX, R14
  3498. ADDQ CX, R9
  3499. copy_5_end:
  3500. ADDQ CX, R11
  3501. SUBQ CX, R13
  3502. // Copy match from the current buffer
  3503. copy_match:
  3504. MOVQ R9, CX
  3505. SUBQ R12, CX
  3506. // If ml <= mo the source and destination ranges do not overlap; otherwise use the byte-wise overlapping copy
  3507. CMPQ R13, R12
  3508. JA copy_overlapping_match
  3509. // Copy non-overlapping match
  3510. ADDQ R13, R11
  3511. MOVQ R13, R12
  3512. SUBQ $0x10, R12
  3513. JB copy_2_small
  3514. copy_2_loop:
  3515. MOVUPS (CX), X0
  3516. MOVUPS X0, (R9)
  3517. ADDQ $0x10, CX
  3518. ADDQ $0x10, R9
  3519. SUBQ $0x10, R12
  3520. JAE copy_2_loop
  3521. LEAQ 16(CX)(R12*1), CX
  3522. LEAQ 16(R9)(R12*1), R9
  3523. MOVUPS -16(CX), X0
  3524. MOVUPS X0, -16(R9)
  3525. JMP copy_2_end
  3526. copy_2_small:
  3527. CMPQ R13, $0x03
  3528. JE copy_2_move_3
  3529. JB copy_2_move_1or2
  3530. CMPQ R13, $0x08
  3531. JB copy_2_move_4through7
  3532. JMP copy_2_move_8through16
  3533. copy_2_move_1or2:
  3534. MOVB (CX), R12
  3535. MOVB -1(CX)(R13*1), R14
  3536. MOVB R12, (R9)
  3537. MOVB R14, -1(R9)(R13*1)
  3538. ADDQ R13, CX
  3539. ADDQ R13, R9
  3540. JMP copy_2_end
  3541. copy_2_move_3:
  3542. MOVW (CX), R12
  3543. MOVB 2(CX), R14
  3544. MOVW R12, (R9)
  3545. MOVB R14, 2(R9)
  3546. ADDQ R13, CX
  3547. ADDQ R13, R9
  3548. JMP copy_2_end
  3549. copy_2_move_4through7:
  3550. MOVL (CX), R12
  3551. MOVL -4(CX)(R13*1), R14
  3552. MOVL R12, (R9)
  3553. MOVL R14, -4(R9)(R13*1)
  3554. ADDQ R13, CX
  3555. ADDQ R13, R9
  3556. JMP copy_2_end
  3557. copy_2_move_8through16:
  3558. MOVQ (CX), R12
  3559. MOVQ -8(CX)(R13*1), R14
  3560. MOVQ R12, (R9)
  3561. MOVQ R14, -8(R9)(R13*1)
  3562. ADDQ R13, CX
  3563. ADDQ R13, R9
  3564. copy_2_end:
  3565. JMP handle_loop
  3566. // Copy overlapping match
  3567. copy_overlapping_match:
  3568. ADDQ R13, R11
  3569. copy_slow_3:
  3570. MOVB (CX), R12
  3571. MOVB R12, (R9)
  3572. INCQ CX
  3573. INCQ R9
  3574. DECQ R13
  3575. JNZ copy_slow_3
  3576. handle_loop:
  3577. MOVQ ctx+16(FP), CX
  3578. DECQ 96(CX)
  3579. JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
  3580. loop_finished:
  3581. MOVQ br+8(FP), CX
  3582. MOVQ AX, 24(CX)
  3583. MOVB DL, 32(CX)
  3584. MOVQ BX, 8(CX)
  3585. // Update the context
  3586. MOVQ ctx+16(FP), AX
  3587. MOVQ R11, 136(AX)
  3588. MOVQ 144(AX), CX
  3589. SUBQ CX, R10
  3590. MOVQ R10, 168(AX)
  3591. // Return success
  3592. MOVQ $0x00000000, ret+24(FP)
  3593. RET
  3594. // Return with match length error
  3595. sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
  3596. MOVQ 16(SP), AX
  3597. MOVQ ctx+16(FP), CX
  3598. MOVQ AX, 216(CX)
  3599. MOVQ $0x00000001, ret+24(FP)
  3600. RET
  3601. // Return with match too long error
  3602. sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
  3603. MOVQ ctx+16(FP), AX
  3604. MOVQ 16(SP), CX
  3605. MOVQ CX, 216(AX)
  3606. MOVQ $0x00000002, ret+24(FP)
  3607. RET
  3608. // Return with match offset too long error
  3609. error_match_off_too_big:
  3610. MOVQ ctx+16(FP), AX
  3611. MOVQ 8(SP), CX
  3612. MOVQ CX, 224(AX)
  3613. MOVQ R11, 136(AX)
  3614. MOVQ $0x00000003, ret+24(FP)
  3615. RET
  3616. // Return with not enough literals error
  3617. error_not_enough_literals:
  3618. MOVQ ctx+16(FP), AX
  3619. MOVQ 24(SP), CX
  3620. MOVQ CX, 208(AX)
  3621. MOVQ $0x00000004, ret+24(FP)
  3622. RET
  3623. // Return with overread error
  3624. error_overread:
  3625. MOVQ $0x00000006, ret+24(FP)
  3626. RET
  3627. // Return with not enough output space error
  3628. error_not_enough_space:
  3629. MOVQ ctx+16(FP), AX
  3630. MOVQ 24(SP), CX
  3631. MOVQ CX, 208(AX)
  3632. MOVQ 16(SP), CX
  3633. MOVQ CX, 216(AX)
  3634. MOVQ R11, 136(AX)
  3635. MOVQ $0x00000005, ret+24(FP)
  3636. RET