123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379 |
- // Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT.
- //go:build amd64 && gc && !purego
- // +build amd64,gc,!purego
- #include "textflag.h"
- // func feMul(out *Element, a *Element, b *Element)
- TEXT ·feMul(SB), NOSPLIT, $0-24
- MOVQ a+8(FP), CX
- MOVQ b+16(FP), BX
- // r0 = a0×b0
- MOVQ (CX), AX
- MULQ (BX)
- MOVQ AX, DI
- MOVQ DX, SI
- // r0 += 19×a1×b4
- MOVQ 8(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 32(BX)
- ADDQ AX, DI
- ADCQ DX, SI
- // r0 += 19×a2×b3
- MOVQ 16(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 24(BX)
- ADDQ AX, DI
- ADCQ DX, SI
- // r0 += 19×a3×b2
- MOVQ 24(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 16(BX)
- ADDQ AX, DI
- ADCQ DX, SI
- // r0 += 19×a4×b1
- MOVQ 32(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 8(BX)
- ADDQ AX, DI
- ADCQ DX, SI
- // r1 = a0×b1
- MOVQ (CX), AX
- MULQ 8(BX)
- MOVQ AX, R9
- MOVQ DX, R8
- // r1 += a1×b0
- MOVQ 8(CX), AX
- MULQ (BX)
- ADDQ AX, R9
- ADCQ DX, R8
- // r1 += 19×a2×b4
- MOVQ 16(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 32(BX)
- ADDQ AX, R9
- ADCQ DX, R8
- // r1 += 19×a3×b3
- MOVQ 24(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 24(BX)
- ADDQ AX, R9
- ADCQ DX, R8
- // r1 += 19×a4×b2
- MOVQ 32(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 16(BX)
- ADDQ AX, R9
- ADCQ DX, R8
- // r2 = a0×b2
- MOVQ (CX), AX
- MULQ 16(BX)
- MOVQ AX, R11
- MOVQ DX, R10
- // r2 += a1×b1
- MOVQ 8(CX), AX
- MULQ 8(BX)
- ADDQ AX, R11
- ADCQ DX, R10
- // r2 += a2×b0
- MOVQ 16(CX), AX
- MULQ (BX)
- ADDQ AX, R11
- ADCQ DX, R10
- // r2 += 19×a3×b4
- MOVQ 24(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 32(BX)
- ADDQ AX, R11
- ADCQ DX, R10
- // r2 += 19×a4×b3
- MOVQ 32(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 24(BX)
- ADDQ AX, R11
- ADCQ DX, R10
- // r3 = a0×b3
- MOVQ (CX), AX
- MULQ 24(BX)
- MOVQ AX, R13
- MOVQ DX, R12
- // r3 += a1×b2
- MOVQ 8(CX), AX
- MULQ 16(BX)
- ADDQ AX, R13
- ADCQ DX, R12
- // r3 += a2×b1
- MOVQ 16(CX), AX
- MULQ 8(BX)
- ADDQ AX, R13
- ADCQ DX, R12
- // r3 += a3×b0
- MOVQ 24(CX), AX
- MULQ (BX)
- ADDQ AX, R13
- ADCQ DX, R12
- // r3 += 19×a4×b4
- MOVQ 32(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 32(BX)
- ADDQ AX, R13
- ADCQ DX, R12
- // r4 = a0×b4
- MOVQ (CX), AX
- MULQ 32(BX)
- MOVQ AX, R15
- MOVQ DX, R14
- // r4 += a1×b3
- MOVQ 8(CX), AX
- MULQ 24(BX)
- ADDQ AX, R15
- ADCQ DX, R14
- // r4 += a2×b2
- MOVQ 16(CX), AX
- MULQ 16(BX)
- ADDQ AX, R15
- ADCQ DX, R14
- // r4 += a3×b1
- MOVQ 24(CX), AX
- MULQ 8(BX)
- ADDQ AX, R15
- ADCQ DX, R14
- // r4 += a4×b0
- MOVQ 32(CX), AX
- MULQ (BX)
- ADDQ AX, R15
- ADCQ DX, R14
- // First reduction chain
- MOVQ $0x0007ffffffffffff, AX
- SHLQ $0x0d, DI, SI
- SHLQ $0x0d, R9, R8
- SHLQ $0x0d, R11, R10
- SHLQ $0x0d, R13, R12
- SHLQ $0x0d, R15, R14
- ANDQ AX, DI
- IMUL3Q $0x13, R14, R14
- ADDQ R14, DI
- ANDQ AX, R9
- ADDQ SI, R9
- ANDQ AX, R11
- ADDQ R8, R11
- ANDQ AX, R13
- ADDQ R10, R13
- ANDQ AX, R15
- ADDQ R12, R15
- // Second reduction chain (carryPropagate)
- MOVQ DI, SI
- SHRQ $0x33, SI
- MOVQ R9, R8
- SHRQ $0x33, R8
- MOVQ R11, R10
- SHRQ $0x33, R10
- MOVQ R13, R12
- SHRQ $0x33, R12
- MOVQ R15, R14
- SHRQ $0x33, R14
- ANDQ AX, DI
- IMUL3Q $0x13, R14, R14
- ADDQ R14, DI
- ANDQ AX, R9
- ADDQ SI, R9
- ANDQ AX, R11
- ADDQ R8, R11
- ANDQ AX, R13
- ADDQ R10, R13
- ANDQ AX, R15
- ADDQ R12, R15
- // Store output
- MOVQ out+0(FP), AX
- MOVQ DI, (AX)
- MOVQ R9, 8(AX)
- MOVQ R11, 16(AX)
- MOVQ R13, 24(AX)
- MOVQ R15, 32(AX)
- RET
- // func feSquare(out *Element, a *Element)
- TEXT ·feSquare(SB), NOSPLIT, $0-16
- MOVQ a+8(FP), CX
- // r0 = l0×l0
- MOVQ (CX), AX
- MULQ (CX)
- MOVQ AX, SI
- MOVQ DX, BX
- // r0 += 38×l1×l4
- MOVQ 8(CX), AX
- IMUL3Q $0x26, AX, AX
- MULQ 32(CX)
- ADDQ AX, SI
- ADCQ DX, BX
- // r0 += 38×l2×l3
- MOVQ 16(CX), AX
- IMUL3Q $0x26, AX, AX
- MULQ 24(CX)
- ADDQ AX, SI
- ADCQ DX, BX
- // r1 = 2×l0×l1
- MOVQ (CX), AX
- SHLQ $0x01, AX
- MULQ 8(CX)
- MOVQ AX, R8
- MOVQ DX, DI
- // r1 += 38×l2×l4
- MOVQ 16(CX), AX
- IMUL3Q $0x26, AX, AX
- MULQ 32(CX)
- ADDQ AX, R8
- ADCQ DX, DI
- // r1 += 19×l3×l3
- MOVQ 24(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 24(CX)
- ADDQ AX, R8
- ADCQ DX, DI
- // r2 = 2×l0×l2
- MOVQ (CX), AX
- SHLQ $0x01, AX
- MULQ 16(CX)
- MOVQ AX, R10
- MOVQ DX, R9
- // r2 += l1×l1
- MOVQ 8(CX), AX
- MULQ 8(CX)
- ADDQ AX, R10
- ADCQ DX, R9
- // r2 += 38×l3×l4
- MOVQ 24(CX), AX
- IMUL3Q $0x26, AX, AX
- MULQ 32(CX)
- ADDQ AX, R10
- ADCQ DX, R9
- // r3 = 2×l0×l3
- MOVQ (CX), AX
- SHLQ $0x01, AX
- MULQ 24(CX)
- MOVQ AX, R12
- MOVQ DX, R11
- // r3 += 2×l1×l2
- MOVQ 8(CX), AX
- IMUL3Q $0x02, AX, AX
- MULQ 16(CX)
- ADDQ AX, R12
- ADCQ DX, R11
- // r3 += 19×l4×l4
- MOVQ 32(CX), AX
- IMUL3Q $0x13, AX, AX
- MULQ 32(CX)
- ADDQ AX, R12
- ADCQ DX, R11
- // r4 = 2×l0×l4
- MOVQ (CX), AX
- SHLQ $0x01, AX
- MULQ 32(CX)
- MOVQ AX, R14
- MOVQ DX, R13
- // r4 += 2×l1×l3
- MOVQ 8(CX), AX
- IMUL3Q $0x02, AX, AX
- MULQ 24(CX)
- ADDQ AX, R14
- ADCQ DX, R13
- // r4 += l2×l2
- MOVQ 16(CX), AX
- MULQ 16(CX)
- ADDQ AX, R14
- ADCQ DX, R13
- // First reduction chain
- MOVQ $0x0007ffffffffffff, AX
- SHLQ $0x0d, SI, BX
- SHLQ $0x0d, R8, DI
- SHLQ $0x0d, R10, R9
- SHLQ $0x0d, R12, R11
- SHLQ $0x0d, R14, R13
- ANDQ AX, SI
- IMUL3Q $0x13, R13, R13
- ADDQ R13, SI
- ANDQ AX, R8
- ADDQ BX, R8
- ANDQ AX, R10
- ADDQ DI, R10
- ANDQ AX, R12
- ADDQ R9, R12
- ANDQ AX, R14
- ADDQ R11, R14
- // Second reduction chain (carryPropagate)
- MOVQ SI, BX
- SHRQ $0x33, BX
- MOVQ R8, DI
- SHRQ $0x33, DI
- MOVQ R10, R9
- SHRQ $0x33, R9
- MOVQ R12, R11
- SHRQ $0x33, R11
- MOVQ R14, R13
- SHRQ $0x33, R13
- ANDQ AX, SI
- IMUL3Q $0x13, R13, R13
- ADDQ R13, SI
- ANDQ AX, R8
- ADDQ BX, R8
- ANDQ AX, R10
- ADDQ DI, R10
- ANDQ AX, R12
- ADDQ R9, R12
- ANDQ AX, R14
- ADDQ R11, R14
- // Store output
- MOVQ out+0(FP), AX
- MOVQ SI, (AX)
- MOVQ R8, 8(AX)
- MOVQ R10, 16(AX)
- MOVQ R12, 24(AX)
- MOVQ R14, 32(AX)
- RET
|