diff options
author | MITSUNARI Shigeo <herumi@nifty.com> | 2018-10-14 18:34:43 +0800 |
---|---|---|
committer | MITSUNARI Shigeo <herumi@nifty.com> | 2018-10-14 18:34:43 +0800 |
commit | d4aa05ae1a155190f12134f04153e773a2015a80 (patch) | |
tree | 18d52b2d119971dba54a12a4f0b1877e20236a43 | |
parent | f3776baa350dc4d5d534736197231f16a8b9697f (diff) | |
download | tangerine-mcl-d4aa05ae1a155190f12134f04153e773a2015a80.tar.gz tangerine-mcl-d4aa05ae1a155190f12134f04153e773a2015a80.tar.zst tangerine-mcl-d4aa05ae1a155190f12134f04153e773a2015a80.zip |
add fp_sub6 for bls12
-rw-r--r-- | src/fp.cpp | 2 | ||||
-rw-r--r-- | src/fp_generator.hpp | 51 |
2 files changed, 39 insertions, 14 deletions
@@ -388,7 +388,7 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, Mode mode, size_t mclMaxBi */ #ifdef MCL_USE_XBYAK if (mode == FP_AUTO) mode = FP_XBYAK; - if (mode == FP_XBYAK && bitSize > 256) { + if (mode == FP_XBYAK && bitSize > 384) { mode = FP_AUTO; } if (!isEnableJIT()) { diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp index 9e75f37..c203ac8 100644 --- a/src/fp_generator.hpp +++ b/src/fp_generator.hpp @@ -275,14 +275,6 @@ private: isFullBit_ = op.isFullBit; // printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_); - op.fp_add = getCurr<void4u>(); - op.fp_addA_ = getCurr<void3u>(); - gen_fp_add(); - align(16); - op.fp_sub = getCurr<void4u>(); - op.fp_subA_ = getCurr<void3u>(); - gen_fp_sub(); - align(16); op.fp_addPre = getCurr<u3u>(); gen_addSubPre(true, pn_); @@ -290,6 +282,17 @@ private: op.fp_subPre = getCurr<u3u>(); gen_addSubPre(false, pn_); align(16); + op.fp_sub = getCurr<void4u>(); + op.fp_subA_ = getCurr<void3u>(); + gen_fp_sub(); + if (op.N > 4) return; + align(16); + op.fp_add = getCurr<void4u>(); + op.fp_addA_ = getCurr<void3u>(); + gen_fp_add(); + if (op.N > 4) return; + + align(16); op.fp_shr1 = getCurr<void2u>(); gen_shr1(); @@ -700,25 +703,47 @@ private: gen_raw_sub(pz, px, py, rax, pn_); gen_raw_fp_sub(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, sf.t, true); } + void gen_fp_sub6() + { + StackFrame sf(this, 3, 4); + const Reg64& pz = sf.p[0]; + const Reg64& px = sf.p[1]; + const Reg64& py = sf.p[2]; + Pack t = sf.t; + t.append(rax); + t.append(px); // |t| = 6 + load_rm(t, px); // destroy px + sub_rm(t, py); + /* + jmp is faster than and-mask without jmp + */ + jnc("@f"); + mov(py, (size_t)p_); // destory py + add_rm(t, py); + L("@@"); + store_mr(pz, t); + } void gen_fp_sub() { if (pn_ <= 4) { gen_fp_sub_le4(); return; } + if (pn_ == 6) { + gen_fp_sub6(); + return; + } StackFrame sf(this, 3); const Reg64& pz = sf.p[0]; const Reg64& px = sf.p[1]; const Reg64& py = sf.p[2]; const Xbyak::CodeGenerator::LabelType jmpMode = pn_ < 5 ? T_AUTO : T_NEAR; - - inLocalLabel(); + Label exit; gen_raw_sub(pz, px, py, rax, pn_); - jnc(".exit", jmpMode); + jnc(exit, jmpMode); mov(px, (size_t)p_); gen_raw_add(pz, pz, px, rax, pn_); - L(".exit"); - outLocalLabel(); + L(exit); } void gen_fp_neg() { |