aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: MITSUNARI Shigeo <herumi@nifty.com> 2018-10-14 18:34:43 +0800
committer: MITSUNARI Shigeo <herumi@nifty.com> 2018-10-14 18:34:43 +0800
commit: d4aa05ae1a155190f12134f04153e773a2015a80 (patch)
tree: 18d52b2d119971dba54a12a4f0b1877e20236a43
parent: f3776baa350dc4d5d534736197231f16a8b9697f (diff)
download: tangerine-mcl-d4aa05ae1a155190f12134f04153e773a2015a80.tar.gz
tangerine-mcl-d4aa05ae1a155190f12134f04153e773a2015a80.tar.zst
tangerine-mcl-d4aa05ae1a155190f12134f04153e773a2015a80.zip
add fp_sub6 for bls12
-rw-r--r-- src/fp.cpp 2
-rw-r--r-- src/fp_generator.hpp 51
2 files changed, 39 insertions, 14 deletions
diff --git a/src/fp.cpp b/src/fp.cpp
index dfa6e11..bea1dff 100644
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -388,7 +388,7 @@ bool Op::init(const mpz_class& _p, size_t maxBitSize, Mode mode, size_t mclMaxBi
*/
#ifdef MCL_USE_XBYAK
if (mode == FP_AUTO) mode = FP_XBYAK;
- if (mode == FP_XBYAK && bitSize > 256) {
+ if (mode == FP_XBYAK && bitSize > 384) {
mode = FP_AUTO;
}
if (!isEnableJIT()) {
diff --git a/src/fp_generator.hpp b/src/fp_generator.hpp
index 9e75f37..c203ac8 100644
--- a/src/fp_generator.hpp
+++ b/src/fp_generator.hpp
@@ -275,14 +275,6 @@ private:
isFullBit_ = op.isFullBit;
// printf("p=%p, pn_=%d, isFullBit_=%d\n", p_, pn_, isFullBit_);
- op.fp_add = getCurr<void4u>();
- op.fp_addA_ = getCurr<void3u>();
- gen_fp_add();
- align(16);
- op.fp_sub = getCurr<void4u>();
- op.fp_subA_ = getCurr<void3u>();
- gen_fp_sub();
-
align(16);
op.fp_addPre = getCurr<u3u>();
gen_addSubPre(true, pn_);
@@ -290,6 +282,17 @@ private:
op.fp_subPre = getCurr<u3u>();
gen_addSubPre(false, pn_);
align(16);
+ op.fp_sub = getCurr<void4u>();
+ op.fp_subA_ = getCurr<void3u>();
+ gen_fp_sub();
+ if (op.N > 4) return;
+ align(16);
+ op.fp_add = getCurr<void4u>();
+ op.fp_addA_ = getCurr<void3u>();
+ gen_fp_add();
+ if (op.N > 4) return;
+
+ align(16);
op.fp_shr1 = getCurr<void2u>();
gen_shr1();
@@ -700,25 +703,47 @@ private:
gen_raw_sub(pz, px, py, rax, pn_);
gen_raw_fp_sub(pz + 8 * pn_, px + 8 * pn_, py + 8 * pn_, sf.t, true);
}
+ void gen_fp_sub6() // emits code for z = x - y on six 64-bit words held in registers, adding p_ back when the subtraction borrows
+ {
+ StackFrame sf(this, 3, 4);
+ const Reg64& pz = sf.p[0]; // pointer the result is stored through
+ const Reg64& px = sf.p[1]; // pointer the first operand is loaded from
+ const Reg64& py = sf.p[2]; // pointer the second operand is subtracted from
+ Pack t = sf.t;
+ t.append(rax);
+ t.append(px); // |t| = 6: sf.t extended with rax and px
+ load_rm(t, px); // presumably t = [px]; destroys px because px is one of the registers in t
+ sub_rm(t, py); // presumably t -= [py], leaving the borrow in the carry flag tested by jnc below
+ /*
+ no carry: skip the add-back of p_; jmp is faster than and-mask without jmp
+ */
+ jnc("@f");
+ mov(py, (size_t)p_); // destroys py: it now holds the address p_
+ add_rm(t, py); // presumably t += [p_]
+ L("@@");
+ store_mr(pz, t); // presumably [pz] = t
+ }
void gen_fp_sub()
{
if (pn_ <= 4) {
gen_fp_sub_le4();
return;
}
+ if (pn_ == 6) { // pn_ == 6 is handled by the dedicated generator gen_fp_sub6()
+ gen_fp_sub6();
+ return;
+ }
StackFrame sf(this, 3);
const Reg64& pz = sf.p[0];
const Reg64& px = sf.p[1];
const Reg64& py = sf.p[2];
const Xbyak::CodeGenerator::LabelType jmpMode = pn_ < 5 ? T_AUTO : T_NEAR;
-
- inLocalLabel();
+ Label exit; // jump target bound by L(exit) after the add-back of p_
gen_raw_sub(pz, px, py, rax, pn_);
- jnc(".exit", jmpMode);
+ jnc(exit, jmpMode); // carry clear after the raw subtraction: skip the add-back
mov(px, (size_t)p_);
gen_raw_add(pz, pz, px, rax, pn_);
- L(".exit");
- outLocalLabel();
+ L(exit);
}
void gen_fp_neg()
{