diff options
author | MITSUNARI Shigeo <herumi@nifty.com> | 2018-10-31 15:34:00 +0800 |
---|---|---|
committer | MITSUNARI Shigeo <herumi@nifty.com> | 2018-10-31 15:34:00 +0800 |
commit | a1011661126e99f4f407956baccd2865a27c6f41 (patch) | |
tree | 197771b72ba095fee558d40c1478a5e8b24e413c | |
parent | d281a723170dd46ccdc07f3a8d47fa98320a88b8 (diff) | |
download | tangerine-mcl-a1011661126e99f4f407956baccd2865a27c6f41.tar.gz tangerine-mcl-a1011661126e99f4f407956baccd2865a27c6f41.tar.zst tangerine-mcl-a1011661126e99f4f407956baccd2865a27c6f41.zip |
fix mulPre4
-rw-r--r-- | src/fp_generator.hpp | 77 | ||||
-rw-r--r-- | test/fp_tower_test.cpp | 17 |
2 files changed, 60 insertions, 34 deletions
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 9532762..d69c531 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -1713,6 +1713,7 @@ private: } /* py[7..0] = px[3..0] ^ 2 + use xmm0 */ void sqrPre4(const RegExp& py, const RegExp& px, const Pack& t) { @@ -1726,34 +1727,46 @@ private: const Reg64& t7 = t[7]; const Reg64& t8 = t[8]; const Reg64& t9 = t[9]; + const Reg64& t10 = t[10]; + const Reg64& a = rax; + const Reg64& d = rdx; - // (AN + B)^2 = A^2N^2 + 2AB + B^2 - - mul2x2(px + 8 * 0, px + 8 * 2, t4, t3, t2, t1, t0); - // [t3:t2:t1:t0] = AB - xor_(t4, t4); - add_rr(Pack(t4, t3, t2, t1, t0), Pack(t4, t3, t2, t1, t0)); - // [t4:t3:t2:t1:t0] = 2AB - store_mr(py + 8 * 2, Pack(t4, t3, t2, t1, t0)); - - mov(t8, ptr [px + 8 * 0]); - mov(t9, ptr [px + 8 * 1]); - sqr2(t1, t0, t7, t6, t9, t8, rax, rcx); - // B^2 = [t1:t0:t7:t6] - store_mr(py + 8 * 0, Pack(t7, t6)); - // [t1:t0] - - mov(t8, ptr [px + 8 * 2]); - mov(t9, ptr [px + 8 * 3]); - sqr2(t5, t4, t3, t2, t9, t8, rax, rcx); - // [t5:t4:t3:t2] - add_rm(Pack(t4, t3, t2, t1, t0), py + 8 * 2); - adc(t5, 0); - store_mr(py + 8 * 2, Pack(t5, t4, t3, t2, t1, t0)); + /* + (aN + b)^2 = a^2 N^2 + 2ab N + b^2 + */ + load_rm(Pack(t9, t8), px); + sqr2(t3, t2, t1, t0, t9, t8, t7, t6); + // [t3:t2:t1:t0] = b^2 + store_mr(py, Pack(t1, t0)); + movq(xm0, t2); + mul2x2(px, px + 2 * 8, t6, t5, t4, t1, t0); + // [t5:t4:t1:t0] = ab + xor_(t6, t6); + add_rr(Pack(t6, t5, t4, t1, t0), Pack(t6, t5, t4, t1, t0)); + // [t6:t5:t4:t1:t0] = 2ab + load_rm(Pack(t8, t7), px + 2 * 8); + // free t10, t9, rax, rdx + /* + [d:t8:t10:t9] = [t8:t7]^2 + */ + mov(d, t7); + mulx(t10, t9, t7); // [t10:t9] = t7^2 + mulx(t7, t2, t8); // [t7:t2] = t7 t8 + xor_(a, a); + add_rr(Pack(a, t7, t2), Pack(a, t7, t2)); + // [a:t7:t2] = 2 t7 t8 + mov(d, t8); + mulx(d, t8, t8); // [d:t8] = t8^2 + add_rr(Pack(d, t8, t10), Pack(a, t7, t2)); + // [d:t8:t10:t9] = [t8:t7]^2 + movq(t2, xm0); + add_rr(Pack(t8, t10, t9, t3, t2), Pack(t6, t5, t4, t1, t0)); + adc(d, 0); + store_mr(py + 2 * 8, Pack(d, t8, t10, t9, t3, t2)); } /* py[11..0] = px[5..0] ^ 2 - use stack[6 * 8] + use rax, rdx, stack[6 * 8] */ void sqrPre6(const RegExp& py, const RegExp& px, const Pack& t) { @@ -1767,7 +1780,8 @@ private: sqrPre3(py + 6 * 8, px + 3 * 8, t); // [py + 6 * 8] <- a^2 mulPre3(rsp, px, px + 3 * 8, t); // ab Pack ab = t.sub(0, 6); - load_rm(ab, py + 3 * 8); + load_rm(ab, rsp); + xor_(rax, rax); for (int i = 0; i < 6; i++) { if (i == 0) { add(ab[i], ab[i]); @@ -1775,10 +1789,11 @@ private: adc(ab[i], ab[i]); } } - add_rm(ab, rsp); + adc(rax, rax); + add_rm(ab, py + 3 * 8); store_mr(py + 3 * 8, ab); load_rm(Pack(t2, t1, t0), py + 9 * 8); - adc(t0, 0); + adc(t0, rax); adc(t1, 0); adc(t2, 0); store_mr(py + 9 * 8, Pack(t2, t1, t0)); @@ -2114,11 +2129,15 @@ private: sqrPre3(sf.p[0], sf.p[1], t); return func; } +#if 1 if (pn_ == 4 && useMulx_) { - StackFrame sf(this, 2, 10 | UseRDX); - sqrPre4(sf.p[0], sf.p[1], sf.t); + StackFrame sf(this, 3, 10 | UseRDX); + Pack t = sf.t; + t.append(sf.p[2]); + sqrPre4(sf.p[0], sf.p[1], t); return func; } +#endif if (pn_ == 6 && useMulx_ && useAdx_) { StackFrame sf(this, 3, 10 | UseRDX, 6 * 8); Pack t = sf.t; diff --git a/test/fp_tower_test.cpp b/test/fp_tower_test.cpp index 1428137..25f51e3 100644 --- a/test/fp_tower_test.cpp +++ b/test/fp_tower_test.cpp @@ -14,7 +14,7 @@ #if MCL_MAX_BIT_SIZE >= 768 typedef mcl::FpT<mcl::FpTag, MCL_MAX_BIT_SIZE> Fp; #else -typedef mcl::FpT<mcl::FpTag, 256> Fp; +typedef mcl::FpT<mcl::FpTag, 384> Fp; #endif typedef mcl::Fp2T<Fp> Fp2; typedef mcl::FpDblT<Fp> FpDbl; @@ -28,10 +28,11 @@ void testFp2() using namespace mcl; puts(__FUNCTION__); #if MCL_MAX_BIT_SIZE < 768 - CYBOZU_TEST_EQUAL(sizeof(Fp), 32); - CYBOZU_TEST_EQUAL(sizeof(Fp2), 32 * 2); - CYBOZU_TEST_EQUAL(sizeof(Fp6), 32 * 6); - CYBOZU_TEST_EQUAL(sizeof(Fp12), 32 * 12); + const size_t FpSize = 48; + CYBOZU_TEST_EQUAL(sizeof(Fp), FpSize); + CYBOZU_TEST_EQUAL(sizeof(Fp2), FpSize * 2); + CYBOZU_TEST_EQUAL(sizeof(Fp6), FpSize * 6); + CYBOZU_TEST_EQUAL(sizeof(Fp12), FpSize * 12); #endif Fp2 x, y, z; x.a = 1; @@ -335,6 +336,7 @@ void testFpDbl() tx.getMpz(mtx); mo = mtx * mtx; } +std::cout << std::hex; CYBOZU_TEST_EQUAL(mz, mo); FpDbl::mod(z, d); @@ -434,6 +436,11 @@ void testAll() "0x7523648240000001ba344d80000000086121000000000013a700000000000017", "0x800000000000000000000000000000000000000000000000000000000000005f", "0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff43", // max prime +#if MCL_MAX_BIT_SIZE >= 384 + // N = 6 + "0x1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab", + "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000ffffffff", +#endif #if MCL_MAX_BIT_SIZE >= 768 "776259046150354467574489744231251277628443008558348305569526019013025476343188443165439204414323238975243865348565536603085790022057407195722143637520590569602227488010424952775132642815799222412631499596858234375446423426908029627", #endif |