diff options
-rw-r--r-- | include/mcl/fp_tower.hpp | 21 | ||||
-rw-r--r-- | include/mcl/op.hpp | 2 | ||||
-rw-r--r-- | src/fp_generator.hpp | 17 |
3 files changed, 35 insertions, 5 deletions
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp index 052b550..3eec138 100644 --- a/include/mcl/fp_tower.hpp +++ b/include/mcl/fp_tower.hpp @@ -242,9 +242,9 @@ public: static void (*neg)(Fp2T& y, const Fp2T& x); static void (*mul)(Fp2T& z, const Fp2T& x, const Fp2T& y); static void (*sqr)(Fp2T& y, const Fp2T& x); + static void (*mul_xi)(Fp2T& y, const Fp2T& x); static void addPre(Fp2T& z, const Fp2T& x, const Fp2T& y) { Fp::addPre(z.a, x.a, y.a); Fp::addPre(z.b, x.b, y.b); } static void inv(Fp2T& y, const Fp2T& x) { Fp::op_.fp2_inv(y.a.v_, x.a.v_); } - static void mul_xi(Fp2T& y, const Fp2T& x) { Fp::op_.fp2_mul_xi(y.a.v_, x.a.v_); } static void divBy2(Fp2T& y, const Fp2T& x) { Fp::divBy2(y.a, x.a); @@ -405,11 +405,21 @@ public: sqr = (void (*)(Fp2T& y, const Fp2T& x))op.fp2_sqrA_; if (sqr == 0) sqr = fp2_sqrC; op.fp2_inv = fp2_invW; - if (xi_a == 1) { - op.fp2_mul_xi = fp2_mul_xi_1_1i; - } else { - op.fp2_mul_xi = fp2_mul_xiW; + if (op.fp2_mul_xi == 0) { + if (xi_a == 1) { + /* + current fp_generator.hpp generates mul_xi for xi_a = 1 + */ + if (op.fp2_mul_xiA_) { + op.fp2_mul_xi = op.fp2_mul_xiA_; + } else { + op.fp2_mul_xi = fp2_mul_xi_1_1i; + } + } else { + op.fp2_mul_xi = fp2_mul_xiW; + } } + mul_xi = (void (*)(Fp2T&, const Fp2T&))op.fp2_mul_xi; const Fp2T xi(xi_a, 1); const mpz_class& p = Fp::getOp().mp; Fp2T::pow(g[0], xi, (p - 1) / 6); // g = xi^((p-1)/6) @@ -621,6 +631,7 @@ template<class Fp_> void (*Fp2T<Fp_>::sub)(Fp2T& z, const Fp2T& x, const Fp2T& y template<class Fp_> void (*Fp2T<Fp_>::neg)(Fp2T& y, const Fp2T& x); template<class Fp_> void (*Fp2T<Fp_>::mul)(Fp2T& z, const Fp2T& x, const Fp2T& y); template<class Fp_> void (*Fp2T<Fp_>::sqr)(Fp2T& y, const Fp2T& x); +template<class Fp_> void (*Fp2T<Fp_>::mul_xi)(Fp2T& y, const Fp2T& x); template<class Fp> struct Fp2DblT { diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp index df6dc4b..0c61643 100644 --- a/include/mcl/op.hpp +++ b/include/mcl/op.hpp @@ -227,6 +227,7 @@ struct Op { void4u fp2_mulNF; void2u fp2_inv; void2u fp2_mul_xi; + void2u fp2_mul_xiA_; uint32_t (*hash)(void *out, uint32_t maxOutSize, const void *msg, uint32_t msgSize); PrimeMode primeMode; @@ -306,6 +307,7 @@ struct Op { fp2_mulNF = 0; fp2_inv = 0; fp2_mul_xi = 0; + fp2_mul_xiA_ = 0; primeMode = PM_GENERIC; isFullBit = false; diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 564391c..d1b48d2 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -386,6 +386,9 @@ struct Code : Xbyak::CodeGenerator { align(16); op.fp2_sqrA_ = getCurr<void2u>(); gen_fp2_sqr4(); + align(16); + op.fp2_mul_xiA_ = getCurr<void2u>(); + gen_fp2_mul_xi4(); } } void gen_addSubPre(bool isAdd, int n) @@ -2873,6 +2876,20 @@ private: gen_raw_fp_sub(sf.p[0], sf.p[1], sf.p[2], sf.t, false); gen_raw_fp_sub(sf.p[0] + FpByte_, sf.p[1] + FpByte_, sf.p[2] + FpByte_, sf.t, false); } + /* + for only xi_a = 1 + */ + void gen_fp2_mul_xi4() + { + assert(!isFullBit_); + StackFrame sf(this, 2, 8, 8 * 4); + gen_raw_fp_add(rsp, sf.p[1], sf.p[1] + FpByte_, sf.t, false); + gen_raw_fp_sub(sf.p[0], sf.p[1], sf.p[1] + FpByte_, sf.t, false); + for (int i = 0; i < 4; i++) { + mov(rax, ptr [rsp + i * 8]); + mov(ptr[sf.p[0] + FpByte_ + i * 8], rax); + } + } void gen_fp2_neg4() { assert(!isFullBit_); |