diff options
author | MITSUNARI Shigeo <herumi@nifty.com> | 2016-11-22 14:45:33 +0800 |
---|---|---|
committer | MITSUNARI Shigeo <herumi@nifty.com> | 2016-11-22 14:46:11 +0800 |
commit | c1d390c254adbe12454143d1659cdb9a88d62619 (patch) | |
tree | af02ff924cc2a32a3661458172ac4ae23a25871e | |
parent | d98aeec6ead138c483d1fda0176acdb1cc500f28 (diff) | |
download | tangerine-mcl-c1d390c254adbe12454143d1659cdb9a88d62619.tar.gz tangerine-mcl-c1d390c254adbe12454143d1659cdb9a88d62619.tar.zst tangerine-mcl-c1d390c254adbe12454143d1659cdb9a88d62619.zip |
optimize Fp6::mul
-rw-r--r-- | include/mcl/fp_tower.hpp | 119 | ||||
-rw-r--r-- | test/bn_test.cpp | 14 | ||||
-rw-r--r-- | test/fp_tower_test.cpp | 2 |
3 files changed, 120 insertions, 15 deletions
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp index 2edf072..3987c0a 100644 --- a/include/mcl/fp_tower.hpp +++ b/include/mcl/fp_tower.hpp @@ -24,6 +24,21 @@ public: } printf("\n"); } + void clear() + { + const size_t n = getUnitSize(); + for (size_t i = 0; i < n; i++) { + v_[i] = 0; + } + } + FpDblT& operator=(const FpDblT& rhs) + { + const size_t n = getUnitSize(); + for (size_t i = 0; i < n; i++) { + v_[i] = rhs.v_[i]; + } + return *this; + } // QQQ : does not check range of x strictly(use for debug) void setMpz(const mpz_class& x) { @@ -50,6 +65,11 @@ public: static void mulPre(FpDblT& xy, const Fp& x, const Fp& y) { Fp::op_.fpDbl_mulPre(xy.v_, x.v_, y.v_); } static void sqrPre(FpDblT& xx, const Fp& x) { Fp::op_.fpDbl_sqrPre(xx.v_, x.v_); } static void mod(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); } + static void mulUnit(FpDblT& z, const FpDblT& x, Unit y) + { + if (mulSmallUnit(z, x, y)) return; + throw cybozu::Exception("mulUnit:not supported") << y; + } }; /* @@ -161,7 +181,7 @@ public: Fp::add(y, aa, bb); } - static uint32_t getXi_a() { return xi_a_; } + static uint32_t get_xi_a() { return xi_a_; } static void init(uint32_t xi_a) { // assert(Fp::maxSize <= 256); @@ -369,40 +389,57 @@ private: Fp::mul(py[1], b, aa); Fp::neg(py[1], py[1]); } - struct Dbl; }; template<class Fp> -struct Fp2T<Fp>::Dbl { - typedef fp::Unit Unit; - typedef typename Fp::Dbl FpDbl; +struct Fp2DblT { +// typedef fp::Unit Unit; + typedef FpDblT<Fp> FpDbl; + typedef Fp2T<Fp> Fp2; FpDbl a, b; - static void add(Dbl& z, const Dbl& x, const Dbl& y) + static void add(Fp2DblT& z, const Fp2DblT& x, const Fp2DblT& y) { FpDbl::add(z.a, x.a, y.a); FpDbl::add(z.b, x.b, y.b); } - static void addPre(Dbl& z, const Dbl& x, const Dbl& y) + static void addPre(Fp2DblT& z, const Fp2DblT& x, const Fp2DblT& y) { FpDbl::addPre(z.a, x.a, y.a); FpDbl::addPre(z.b, x.b, y.b); } - static void sub(Dbl& z, const Dbl& x, const Dbl& y) + static void sub(Fp2DblT& z, const Fp2DblT& x, const Fp2DblT& y) { FpDbl::sub(z.a, x.a, y.a); FpDbl::sub(z.b, x.b, y.b); } - static void subPre(Dbl& z, const Dbl& x, const Dbl& y) + static void subPre(Fp2DblT& z, const Fp2DblT& x, const Fp2DblT& y) { FpDbl::subPre(z.a, x.a, y.a); FpDbl::subPre(z.b, x.b, y.b); } - static void neg(Dbl& y, const Dbl& x) + static void neg(Fp2DblT& y, const Fp2DblT& x) { FpDbl::neg(y.a, x.a); FpDbl::neg(y.b, x.b); } - static void sqr(Dbl& y, const Fp2T& x) + static void mul_xi(Fp2DblT& y, const Fp2DblT& x) + { + const uint32_t xi_a = Fp2::get_xi_a(); + if (xi_a == 1) { + FpDbl t; + FpDbl::add(t, x.a, x.b); + FpDbl::sub(y.a, x.a, x.b); + y.b = t; + } else { + FpDbl t; + FpDbl::mulUnit(t, x.a, xi_a); + FpDbl::sub(t, t, x.b); + FpDbl::mulUnit(y.b, x.b, xi_a); + FpDbl::add(y.b, y.b, x.a); + y.a = t; + } + } + static void sqrPre(Fp2DblT& y, const Fp2& x) { Fp t1, t2; Fp::addPre(t1, x.b, x.b); // 2b @@ -411,7 +448,24 @@ struct Fp2T<Fp>::Dbl { Fp::sub(t2, x.a, x.b); // a - b FpDbl::mulPre(y.a, t1, t2); // (a + b)(a - b) } - static void mod(Fp2T& y, const Dbl& x) + static void mulPre(Fp2DblT& z, const Fp2& x, const Fp2& y) + { + const Fp& a = x.a; + const Fp& b = x.b; + const Fp& c = y.a; + const Fp& d = y.b; + FpDbl BD; + Fp s, t; + Fp::addPre(s, a, b); // s = a + b + Fp::addPre(t, c, d); // t = c + d + FpDbl::mulPre(BD, b, d); // BD = bd + FpDbl::mulPre(z.a, a, c); // z.a = ac + FpDbl::mulPre(z.b, s, t); // z.b = st + FpDbl::subPre(z.b, z.b, z.a); // z.b = st - ac + FpDbl::subPre(z.b, z.b, BD); // z.b = st - ac - bd = ad + bc + FpDbl::sub(z.a, z.a, BD); // ac - bd + } + static void mod(Fp2& y, const Fp2DblT& x) { FpDbl::mod(y.a, x.a); FpDbl::mod(y.b, x.b); @@ -427,6 +481,7 @@ template<class Fp> uint32_t Fp2T<Fp>::xi_a_; template<class Fp> struct Fp6T : public fp::Operator<Fp6T<Fp> > { typedef Fp2T<Fp> Fp2; + typedef Fp2DblT<Fp> Fp2Dbl; Fp2 a, b, c; Fp6T() { } Fp6T(int64_t a) : a(a) , b(0) , c(0) { } @@ -513,12 +568,50 @@ struct Fp6T : public fp::Operator<Fp6T<Fp> > { */ static void mul(Fp6T& z, const Fp6T& x, const Fp6T& y) { +//clk.begin(); const Fp2& a = x.a; const Fp2& b = x.b; const Fp2& c = x.c; const Fp2& d = y.a; const Fp2& e = y.b; const Fp2& f = y.c; +#if 1 + Fp2Dbl AD, BE, CF; + Fp2Dbl::mulPre(AD, a, d); + Fp2Dbl::mulPre(BE, b, e); + Fp2Dbl::mulPre(CF, c, f); + + Fp2 t1, t2, t3, t4; + Fp2::add(t1, b, c); + Fp2::add(t2, e, f); + Fp2Dbl T1; + Fp2Dbl::mulPre(T1, t1, t2); + Fp2Dbl::sub(T1, T1, BE); + Fp2Dbl::sub(T1, T1, CF); + Fp2Dbl::mul_xi(T1, T1); + + Fp2::add(t2, a, b); + Fp2::add(t3, e, d); + Fp2Dbl T2; + Fp2Dbl::mulPre(T2, t2, t3); + Fp2Dbl::sub(T2, T2, AD); + Fp2Dbl::sub(T2, T2, BE); + + Fp2::add(t3, a, c); + Fp2::add(t4, d, f); + Fp2Dbl T3; + Fp2Dbl::mulPre(T3, t3, t4); + Fp2Dbl::sub(T3, T3, AD); + Fp2Dbl::sub(T3, T3, CF); + + Fp2Dbl::add(AD, AD, T1); + Fp2Dbl::mod(z.a, AD); + Fp2Dbl::mul_xi(CF, CF); + Fp2Dbl::add(CF, CF, T2); + Fp2Dbl::mod(z.b, CF); + Fp2Dbl::add(T3, T3, BE); + Fp2Dbl::mod(z.c, T3); +#else Fp2 ad, be, cf; Fp2::mul(ad, a, d); Fp2::mul(be, b, e); @@ -548,6 +641,8 @@ struct Fp6T : public fp::Operator<Fp6T<Fp> > { Fp2::mul_xi(z.b, cf); z.b += t2; Fp2::add(z.c, t3, be); +#endif +//clk.end(); } /* x = a + bv + cv^2, v^3 = xi diff --git a/test/bn_test.cpp b/test/bn_test.cpp index 9ca9f2d..36a4724 100644 --- a/test/bn_test.cpp +++ b/test/bn_test.cpp @@ -1,7 +1,7 @@ #define PUT(x) std::cout << #x "=" << x << std::endl; #define CYBOZU_TEST_DISABLE_AUTO_RUN #include <cybozu/benchmark.hpp> -//cybozu::CpuClock clk; +cybozu::CpuClock clk; #include <cybozu/test.hpp> #include <mcl/bn.hpp> #include <cybozu/option.hpp> @@ -152,9 +152,14 @@ void test(const TestSet& ts) miller loop : 700Kclk final exp : 460Kclk */ +#if 0 + for (int i = 0; i < 100; i++) { + BN::pairing(e1, Q, P); + } +#else CYBOZU_BENCH("pairing", BN::pairing, e1, Q, P); // 2.4Mclk CYBOZU_BENCH("finalExp", BN::finalExp, e1, e1); // 1.3Mclk - +#endif } CYBOZU_TEST_AUTO(naive) @@ -167,6 +172,11 @@ CYBOZU_TEST_AUTO(naive) testMapToG2(); test(ts); } + int count = (int)clk.getCount(); + if (count) { + printf("count=%d ", count); + clk.put(); + } } int main(int argc, char *argv[]) diff --git a/test/fp_tower_test.cpp b/test/fp_tower_test.cpp index 3e93a3a..b249bf3 100644 --- a/test/fp_tower_test.cpp +++ b/test/fp_tower_test.cpp @@ -62,7 +62,7 @@ void testFp2() */ z = Fp2(1, -2); Fp2::mul_xi(z, z); - Fp a = Fp2::getXi_a(); + Fp a = Fp2::get_xi_a(); CYBOZU_TEST_EQUAL(z, Fp2(a + 2, a * (-2) + 1)); z = x * x; Fp2::sqr(y, x); |