refactor fp_tower

author: MITSUNARI Shigeo <herumi@nifty.com> 2018-08-22 15:18:22 +0800
committer: MITSUNARI Shigeo <herumi@nifty.com> 2018-08-22 15:18:22 +0800
commit: 2ef8c5eb20a4b67b8aecbbd7a933cc45cc60d150 (patch)
tree: 814b7dce33a0db8a639c3863e0b4b4093bd64b9b /include
parent: b4cdb6a7f1cab49e6f65417d38c3ca7b8b47d3ee (diff)
download: tangerine-mcl-2ef8c5eb20a4b67b8aecbbd7a933cc45cc60d150.tar.gz
tangerine-mcl-2ef8c5eb20a4b67b8aecbbd7a933cc45cc60d150.tar.zst
tangerine-mcl-2ef8c5eb20a4b67b8aecbbd7a933cc45cc60d150.zip
2 files changed, 142 insertions, 71 deletions
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index 8c28a8e..ebd1aac 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -117,76 +117,79 @@ public:
 	{
 		gmp::setArray(pb, x, v_, Fp::op_.N * 2);
 	}
+#ifdef MCL_XBYAK_DIRECT_CALL
+	static void (*add)(FpDblT& z, const FpDblT& x, const FpDblT& y);
+	static void (*sub)(FpDblT& z, const FpDblT& x, const FpDblT& y);
+	static void (*mod)(Fp& z, const FpDblT& xy);
+	static void addC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_add(z.v_, x.v_, y.v_, Fp::op_.p); }
+	static void subC(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p); }
+	static void modC(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); }
+#else
 	static void add(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_add(z.v_, x.v_, y.v_, Fp::op_.p); }
 	static void sub(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_sub(z.v_, x.v_, y.v_, Fp::op_.p); }
-	static void addPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_addPre(z.v_, x.v_, y.v_); }
-	static void subPre(FpDblT& z, const FpDblT& x, const FpDblT& y) { Fp::op_.fpDbl_subPre(z.v_, x.v_, y.v_); }
+	static void mod(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); }
+#endif
+	static void (*addPre)(FpDblT& z, const FpDblT& x, const FpDblT& y);
+	static void (*subPre)(FpDblT& z, const FpDblT& x, const FpDblT& y);
 	/*
 		mul(z, x, y) = mulPre(xy, x, y) + mod(z, xy)
 	*/
-	static void mulPre(FpDblT& xy, const Fp& x, const Fp& y) { Fp::op_.fpDbl_mulPre(xy.v_, x.v_, y.v_); }
-	static void sqrPre(FpDblT& xx, const Fp& x) { Fp::op_.fpDbl_sqrPre(xx.v_, x.v_); }
-	static void mod(Fp& z, const FpDblT& xy) { Fp::op_.fpDbl_mod(z.v_, xy.v_, Fp::op_.p); }
+	static void (*mulPre)(FpDblT& xy, const Fp& x, const Fp& y);
+	static void (*sqrPre)(FpDblT& xx, const Fp& x);
 	static void mulUnit(FpDblT& z, const FpDblT& x, Unit y)
 	{
 		if (mulSmallUnit(z, x, y)) return;
 		assert(0); // not supported y
 	}
-	void operator+=(const FpDblT& x) { add(*this, *this, x); }
-	void operator-=(const FpDblT& x) { sub(*this, *this, x); }
-	/*
-		Fp2Dbl::mulPre by FpDblT
-		@note mod of NIST_P192 is fast
-	*/
-	static void fp2Dbl_mulPreW(Unit *z, const Unit *x, const Unit *y)
-	{
-		const Fp *px = reinterpret_cast<const Fp*>(x);
-		const Fp *py = reinterpret_cast<const Fp*>(y);
-		const Fp& a = px[0];
-		const Fp& b = px[1];
-		const Fp& c = py[0];
-		const Fp& d = py[1];
-		FpDblT& d0 = reinterpret_cast<FpDblT*>(z)[0];
-		FpDblT& d1 = reinterpret_cast<FpDblT*>(z)[1];
-		FpDblT d2;
-		Fp s, t;
-		Fp::add(s, a, b);
-		Fp::add(t, c, d);
-		FpDblT::mulPre(d1, s, t); // (a + b)(c + d)
-		FpDblT::mulPre(d0, a, c);
-		FpDblT::mulPre(d2, b, d);
-		FpDblT::sub(d1, d1, d0); // (a + b)(c + d) - ac
-		FpDblT::sub(d1, d1, d2); // (a + b)(c + d) - ac - bd
-		FpDblT::sub(d0, d0, d2); // ac - bd
-	}
-	/*
-		Fp2Dbl::mulPre by FpDblT with No Carry
-	*/
-	static void fp2Dbl_mulPreNoCarryW(Unit *z, const Unit *x, const Unit *y)
+	static void init()
 	{
-		const Fp *px = reinterpret_cast<const Fp*>(x);
-		const Fp *py = reinterpret_cast<const Fp*>(y);
-		const Fp& a = px[0];
-		const Fp& b = px[1];
-		const Fp& c = py[0];
-		const Fp& d = py[1];
-		FpDblT& d0 = reinterpret_cast<FpDblT*>(z)[0];
-		FpDblT& d1 = reinterpret_cast<FpDblT*>(z)[1];
-		FpDblT d2;
-		Fp s, t;
-		Fp::addPre(s, a, b);
-		Fp::addPre(t, c, d);
-		FpDblT::mulPre(d1, s, t); // (a + b)(c + d)
-		FpDblT::mulPre(d0, a, c);
-		FpDblT::mulPre(d2, b, d);
-		FpDblT::subPre(d1, d1, d0); // (a + b)(c + d) - ac
-		FpDblT::subPre(d1, d1, d2); // (a + b)(c + d) - ac - bd
-		FpDblT::sub(d0, d0, d2); // ac - bd
+		const mcl::fp::Op& op = Fp::getOp();
+#ifdef MCL_XBYAK_DIRECT_CALL
+		add = (void (*)(FpDblT&, const FpDblT&, const FpDblT&))op.fpDbl_addA_;
+		if (add == 0) add = addC;
+		sub = (void (*)(FpDblT&, const FpDblT&, const FpDblT&))op.fpDbl_subA_;
+		if (sub == 0) sub = subC;
+		mod = (void (*)(Fp&, const FpDblT&))op.fpDbl_modA_;
+		if (mod == 0) mod = modC;
+#endif
+		if (op.fpDbl_addPreA_) {
+			addPre = (void (*)(FpDblT&, const FpDblT&, const FpDblT&))op.fpDbl_addPreA_;
+		} else {
+			addPre = (void (*)(FpDblT&, const FpDblT&, const FpDblT&))op.fpDbl_addPre;
+		}
+		if (op.fpDbl_subPreA_) {
+			subPre = (void (*)(FpDblT&, const FpDblT&, const FpDblT&))op.fpDbl_subPreA_;
+		} else {
+			subPre = (void (*)(FpDblT&, const FpDblT&, const FpDblT&))op.fpDbl_subPre;
+		}
+		if (op.fpDbl_mulPreA_) {
+			mulPre = (void (*)(FpDblT&, const Fp&, const Fp&))op.fpDbl_mulPreA_;
+		} else {
+			mulPre = (void (*)(FpDblT&, const Fp&, const Fp&))op.fpDbl_mulPre;
+		}
+		if (op.fpDbl_sqrPreA_) {
+			sqrPre = (void (*)(FpDblT&, const Fp&))op.fpDbl_sqrPreA_;
+		} else {
+			sqrPre = (void (*)(FpDblT&, const Fp&))op.fpDbl_sqrPre;
+		}
 	}
+	void operator+=(const FpDblT& x) { add(*this, *this, x); }
+	void operator-=(const FpDblT& x) { sub(*this, *this, x); }
 };
 
+#ifdef MCL_XBYAK_DIRECT_CALL
+template<class Fp> void (*FpDblT<Fp>::add)(FpDblT&, const FpDblT&, const FpDblT&);
+template<class Fp> void (*FpDblT<Fp>::sub)(FpDblT&, const FpDblT&, const FpDblT&);
+template<class Fp> void (*FpDblT<Fp>::mod)(Fp&, const FpDblT&);
+#endif
+template<class Fp> void (*FpDblT<Fp>::addPre)(FpDblT&, const FpDblT&, const FpDblT&);
+template<class Fp> void (*FpDblT<Fp>::subPre)(FpDblT&, const FpDblT&, const FpDblT&);
+template<class Fp> void (*FpDblT<Fp>::mulPre)(FpDblT&, const Fp&, const Fp&);
+template<class Fp> void (*FpDblT<Fp>::sqrPre)(FpDblT&, const Fp&);
+
 template<class Fp> struct Fp12T;
 template<class Fp> class BNT;
+template<class Fp> struct Fp2DblT;
 /*
 	beta = -1
 	Fp2 = F[i] / (i^2 + 1)
@@ -198,6 +201,7 @@ class Fp2T : public fp::Serializable<Fp2T<_Fp>,
 	typedef _Fp Fp;
 	typedef fp::Unit Unit;
 	typedef FpDblT<Fp> FpDbl;
+	typedef Fp2DblT<Fp> Fp2Dbl;
 	static uint32_t xi_a_;
 	static const size_t gN = 5;
 	/*
@@ -381,13 +385,6 @@ public:
 		if (sub == 0) sub = fp2_subC;
 		neg = (void (*)(Fp2T& y, const Fp2T& x))op.fp2_negA_;
 		if (neg == 0) neg = fp2_negC;
-		if (op.fp2Dbl_mulPre == 0) {
-			if (op.isFullBit) {
-				op.fp2Dbl_mulPre = FpDblT<Fp>::fp2Dbl_mulPreW;
-			} else {
-				op.fp2Dbl_mulPre = FpDblT<Fp>::fp2Dbl_mulPreNoCarryW;
-			}
-		}
 		mul = (void (*)(Fp2T& z, const Fp2T& x, const Fp2T& y))op.fp2_mulA_;
 		if (mul == 0) {
 			if (op.isFastMod) {
@@ -444,6 +441,8 @@ public:
 			Fp2T::mul(g2[i], t, g[i]);
 			g3[i] = g[i] * g2[i];
 		}
+		FpDblT<Fp>::init();
+		Fp2DblT<Fp>::init();
 	}
 #ifndef CYBOZU_DONT_USE_EXCEPTION
 	template<class InputStream>
@@ -535,10 +534,10 @@ private:
 	}
 	static void fp2_mulC(Fp2T& z, const Fp2T& x, const Fp2T& y)
 	{
-		FpDbl d[2];
-		Fp::getOp().fp2Dbl_mulPre((Unit*)d, (const Unit*)&x, (const Unit*)&y);
-		FpDbl::mod(z.a, d[0]);
-		FpDbl::mod(z.b, d[1]);
+		Fp2Dbl d;
+		Fp2Dbl::mulPre(d, x, y);
+		FpDbl::mod(z.a, d.a);
+		FpDbl::mod(z.b, d.b);
 	}
 	/*
 		x = a + bi, i^2 = -1
@@ -630,6 +629,7 @@ template<class Fp>
 struct Fp2DblT {
 	typedef FpDblT<Fp> FpDbl;
 	typedef Fp2T<Fp> Fp2;
+	typedef fp::Unit Unit;
 	FpDbl a, b;
 	static void add(Fp2DblT& z, const Fp2DblT& x, const Fp2DblT& y)
 	{
@@ -687,10 +687,7 @@ struct Fp2DblT {
 		Fp::sub(t1, x.a, x.b); // a - b
 		FpDbl::mulPre(y.a, t1, t2); // (a + b)(a - b)
 	}
-	static void mulPre(Fp2DblT& z, const Fp2& x, const Fp2& y)
-	{
-		Fp::getOp().fp2Dbl_mulPre((fp::Unit*)&z, (const fp::Unit*)&x, (const fp::Unit*)&y);
-	}
+	static void (*mulPre)(Fp2DblT&, const Fp2&, const Fp2&);
 	static void mod(Fp2& y, const Fp2DblT& x)
 	{
 		FpDbl::mod(y.a, x.a);
@@ -704,8 +701,68 @@ struct Fp2DblT {
 #endif
 	void operator+=(const Fp2DblT& x) { add(*this, *this, x); }
 	void operator-=(const Fp2DblT& x) { sub(*this, *this, x); }
+	static void init()
+ 	{
+		const mcl::fp::Op& op = Fp::getOp();
+		if (op.fp2Dbl_mulPreA_) {
+			mulPre = (void (*)(Fp2DblT&, const Fp2&, const Fp2&))op.fp2Dbl_mulPreA_;
+		} else {
+			if (op.isFullBit) {
+				mulPre = fp2Dbl_mulPreW;
+			} else {
+				mulPre = fp2Dbl_mulPreNoCarryW;
+			}
+		}
+	}
+	/*
+		Fp2Dbl::mulPre by FpDblT
+		@note mod of NIST_P192 is fast
+	*/
+	static void fp2Dbl_mulPreW(Fp2DblT& z, const Fp2& x, const Fp2& y)
+	{
+		const Fp& a = x.a;
+		const Fp& b = x.b;
+		const Fp& c = y.a;
+		const Fp& d = y.b;
+		FpDbl& d0 = z.a;
+		FpDbl& d1 = z.b;
+		FpDbl d2;
+		Fp s, t;
+		Fp::add(s, a, b);
+		Fp::add(t, c, d);
+		FpDbl::mulPre(d1, s, t); // (a + b)(c + d)
+		FpDbl::mulPre(d0, a, c);
+		FpDbl::mulPre(d2, b, d);
+		FpDbl::sub(d1, d1, d0); // (a + b)(c + d) - ac
+		FpDbl::sub(d1, d1, d2); // (a + b)(c + d) - ac - bd
+		FpDbl::sub(d0, d0, d2); // ac - bd
+	}
+	/*
+		Fp2Dbl::mulPre by FpDbl with No Carry
+	*/
+	static void fp2Dbl_mulPreNoCarryW(Fp2DblT& z, const Fp2& x, const Fp2& y)
+	{
+		const Fp& a = x.a;
+		const Fp& b = x.b;
+		const Fp& c = y.a;
+		const Fp& d = y.b;
+		FpDbl& d0 = z.a;
+		FpDbl& d1 = z.b;
+		FpDbl d2;
+		Fp s, t;
+		Fp::addPre(s, a, b);
+		Fp::addPre(t, c, d);
+		FpDbl::mulPre(d1, s, t); // (a + b)(c + d)
+		FpDbl::mulPre(d0, a, c);
+		FpDbl::mulPre(d2, b, d);
+		FpDbl::subPre(d1, d1, d0); // (a + b)(c + d) - ac
+		FpDbl::subPre(d1, d1, d2); // (a + b)(c + d) - ac - bd
+		FpDbl::sub(d0, d0, d2); // ac - bd
+	}
 };
 
+template<class Fp> void (*Fp2DblT<Fp>::mulPre)(Fp2DblT&, const Fp2T<Fp>&, const Fp2T<Fp>&);
+
 template<class Fp> uint32_t Fp2T<Fp>::xi_a_;
 template<class Fp> Fp2T<Fp> Fp2T<Fp>::g[Fp2T<Fp>::gN];
 template<class Fp> Fp2T<Fp> Fp2T<Fp>::g2[Fp2T<Fp>::gN];
diff --git a/include/mcl/op.hpp b/include/mcl/op.hpp
index 16c1050..7aff131 100644
--- a/include/mcl/op.hpp
+++ b/include/mcl/op.hpp
@@ -189,6 +189,14 @@ struct Op {
 	void2u fp2_negA_;
 	void3u fp2_mulA_;
 	void2u fp2_sqrA_;
+	void3u fpDbl_addA_;
+	void3u fpDbl_subA_;
+	void3u fpDbl_addPreA_;
+	void3u fpDbl_subPreA_;
+	void3u fpDbl_mulPreA_;
+	void2u fpDbl_sqrPreA_;
+	void2u fpDbl_modA_;
+	void3u fp2Dbl_mulPreA_;
 	size_t maxN;
 	size_t N;
 	size_t bitSize;
@@ -218,7 +226,6 @@ struct Op {
 	u3u fp_subPre; // without modulo p
 	u3u fpDbl_addPre;
 	u3u fpDbl_subPre;
-	void3u fp2Dbl_mulPre;
 	/*
 		for Fp2 = F[u] / (u^2 + 1)
 		x = a + bu
@@ -271,6 +278,14 @@ struct Op {
 		fp2_negA_ = 0;
 		fp2_mulA_ = 0;
 		fp2_sqrA_ = 0;
+		fpDbl_addA_ = 0;
+		fpDbl_subA_ = 0;
+		fpDbl_addPreA_ = 0;
+		fpDbl_subPreA_ = 0;
+		fpDbl_mulPreA_ = 0;
+		fpDbl_sqrPreA_ = 0;
+		fpDbl_modA_ = 0;
+		fp2Dbl_mulPreA_ = 0;
 		maxN = 0;
 		N = 0;
 		bitSize = 0;
@@ -300,7 +315,6 @@ struct Op {
 		fp_subPre = 0;
 		fpDbl_addPre = 0;
 		fpDbl_subPre = 0;
-		fp2Dbl_mulPre = 0;
 
 		xi_a = 0;
 		fp2_mulNF = 0;
author	MITSUNARI Shigeo <herumi@nifty.com>	2018-08-22 15:18:22 +0800
committer	MITSUNARI Shigeo <herumi@nifty.com>	2018-08-22 15:18:22 +0800
commit	2ef8c5eb20a4b67b8aecbbd7a933cc45cc60d150 (patch)
tree	814b7dce33a0db8a639c3863e0b4b4093bd64b9b /include
parent	b4cdb6a7f1cab49e6f65417d38c3ca7b8b47d3ee (diff)
download	tangerine-mcl-2ef8c5eb20a4b67b8aecbbd7a933cc45cc60d150.tar.gz tangerine-mcl-2ef8c5eb20a4b67b8aecbbd7a933cc45cc60d150.tar.zst tangerine-mcl-2ef8c5eb20a4b67b8aecbbd7a933cc45cc60d150.zip