aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMITSUNARI Shigeo <herumi@nifty.com>2016-09-05 16:11:03 +0800
committerMITSUNARI Shigeo <herumi@nifty.com>2016-09-05 16:11:03 +0800
commite312097f27bcdee838175a23085ddeb59e10a031 (patch)
tree76c229f9b3b37a2aa2b3dd895dd08bd3fe4c4094
parent5d2d435b16e350597c38d0251deaddc1073b975c (diff)
downloadtangerine-mcl-e312097f27bcdee838175a23085ddeb59e10a031.tar.gz
tangerine-mcl-e312097f27bcdee838175a23085ddeb59e10a031.tar.zst
tangerine-mcl-e312097f27bcdee838175a23085ddeb59e10a031.zip
Add mcl_fp_addNC for x86-64, implemented with NASM
-rw-r--r--Makefile9
-rw-r--r--common.mk1
-rw-r--r--include/mcl/fp_tower.hpp11
-rw-r--r--src/asm/low_x86-64.asm88
4 files changed, 104 insertions, 5 deletions
diff --git a/Makefile b/Makefile
index d9539bf..b1abdd8 100644
--- a/Makefile
+++ b/Makefile
@@ -41,6 +41,11 @@ ifeq ($(HAS_BMI2),1)
LLVM_FLAGS+=-mattr=bmi2
endif
+ifneq ($(ASM),)
+ LOW_ASM_OBJ=$(LOW_ASM_SRC:.asm=.o)
+ LIB_OBJ+=$(LOW_ASM_OBJ)
+endif
+
$(MCL_LIB): $(LIB_OBJ)
-$(MKDIR) $(@D)
$(AR) $@ $(LIB_OBJ)
@@ -57,6 +62,7 @@ $(LLVM_SRC): $(GEN_EXE) $(FUNC_LIST)
$(FUNC_LIST): $(LOW_ASM_SRC)
$(shell awk '/global/ { print $$2}' $(LOW_ASM_SRC) > $(FUNC_LIST) || touch $(FUNC_LIST))
+ $(shell awk '/proc/ { print $$2}' $(LOW_ASM_SRC) >> $(FUNC_LIST))
$(GEN_EXE): src/gen.cpp src/llvm_gen.hpp
$(CXX) -o $@ $< $(CFLAGS) -O0
@@ -64,6 +70,9 @@ $(GEN_EXE): src/gen.cpp src/llvm_gen.hpp
asm: $(LLVM_SRC)
$(LLVM_OPT) -O3 -o - $(LLVM_SRC) | $(LLVM_LLC) -O3 $(LLVM_FLAGS) -x86-asm-syntax=intel
+$(LOW_ASM_OBJ): $(LOW_ASM_SRC)
+ $(ASM) $<
+
##################################################################
VPATH=test sample src
diff --git a/common.mk b/common.mk
index ee1bb2d..d8ecaf5 100644
--- a/common.mk
+++ b/common.mk
@@ -9,6 +9,7 @@ ifeq ($(ARCH),x86_64)
BIT=64
BIT_OPT=-m64
LOW_ASM_SRC=src/asm/low_x86-64.asm
+ ASM=nasm -felf64
endif
ifeq ($(ARCH),x86)
CPU=x86
diff --git a/include/mcl/fp_tower.hpp b/include/mcl/fp_tower.hpp
index a86894b..22618d0 100644
--- a/include/mcl/fp_tower.hpp
+++ b/include/mcl/fp_tower.hpp
@@ -63,6 +63,7 @@ class Fp2T : public fp::Operator<Fp2T<Fp> > {
typedef FpDblT<Fp> FpDbl;
static Fp xi_a_;
public:
+ typedef typename Fp::BaseFp BaseFp;
Fp a, b;
Fp2T() { }
Fp2T(int64_t a) : a(a), b(0) { }
@@ -93,15 +94,15 @@ public:
*/
friend std::ostream& operator<<(std::ostream& os, const Fp2T& self)
{
- return os << self.a << mcl::getIoSeparator() << self.b;
+ return os << self.a << Fp::getIoSeparator() << self.b;
}
friend std::istream& operator>>(std::istream& is, Fp2T& self)
{
return is >> self.a >> self.b;
}
- std::string getStr(int base = 10, bool withPrefix = false)
+ std::string getStr(int ioMode)
{
- return a.getStr(base, withPrefix) + ' ' + b.getStr(base, withPrefix);
+ return a.getStr(ioMode) + fp::getIoSeparator(ioMode) + b.getStr(ioMode);
}
bool isZero() const { return a.isZero() && b.isZero(); }
bool isOne() const { return a.isOne() && b.isZero(); }
@@ -416,7 +417,7 @@ struct Fp6T : public fp::Operator<Fp6T<Fp> > {
bool operator!=(const Fp6T& rhs) const { return !operator==(rhs); }
friend std::ostream& operator<<(std::ostream& os, const Fp6T& x)
{
- const char *sep = mcl::getIoSeparator();
+ const char *sep = Fp::getIoSeparator();
return os << x.a << sep << x.b << sep << x.c;
}
friend std::istream& operator>>(std::istream& is, Fp6T& x)
@@ -682,7 +683,7 @@ struct Fp12T : public fp::Operator<Fp12T<Fp> > {
}
friend std::ostream& operator<<(std::ostream& os, const Fp12T& self)
{
- return os << self.a << mcl::getIoSeparator() << self.b;
+ return os << self.a << Fp::getIoSeparator() << self.b;
}
friend std::istream& operator>>(std::istream& is, Fp12T& self)
{
diff --git a/src/asm/low_x86-64.asm b/src/asm/low_x86-64.asm
index e69de29..72faa69 100644
--- a/src/asm/low_x86-64.asm
+++ b/src/asm/low_x86-64.asm
@@ -0,0 +1,88 @@
+
+; Linux rdi rsi rdx rcx
+; Win rcx rdx r8 r9
+
+%ifdef _WIN64 ; Windows x64: arguments 1-4 arrive in rcx, rdx, r8, r9
+ %define p1org rcx
+ %define p2org rdx
+ %define p3org r8
+ %define p4org r9
+%else ; otherwise (Linux): arguments 1-4 arrive in rdi, rsi, rdx, rcx
+ %define p1org rdi
+ %define p2org rsi
+ %define p3org rdx
+ %define p4org rcx
+%endif ; p1org..p4org now name the 1st..4th argument register on either platform
+
+%imacro proc 1 ; proc <name>: open an exported function called <name>
+global %1 ; make the symbol visible to the linker
+%1: ; the label itself; the function body follows the macro invocation
+%endmacro
+
+segment .text
+
+%imacro addNC 1 ; addNC <k>: emit [p1org] = [p2org] + [p3org] over (k + 1) 64-bit limbs, then return the carry
+ mov rax, [p2org] ; limb 0 uses a plain add, which starts the carry chain
+ add rax, [p3org]
+ mov [p1org], rax ; mov leaves the flags alone, so CF survives into the next limb
+%assign i 1
+%rep %1 ; unrolled at assembly time: limbs 1..k, 8 bytes apart
+ mov rax, [p2org + i * 8]
+ adc rax, [p3org + i * 8] ; adc folds in the carry left by the previous limb
+ mov [p1org + i * 8], rax
+%assign i (i+1)
+%endrep
+ setc al ; al = carry out of the most significant limb
+ movzx eax, al ; zero-extend so the return value in eax is exactly 0 or 1
+ ret
+%endmacro
+
+proc mcl_fp_addNC64 ; one entry point per width: name suffix in bits = 64 * (addNC argument + 1)
+ addNC 0
+proc mcl_fp_addNC128
+ addNC 1
+proc mcl_fp_addNC192
+ addNC 2
+proc mcl_fp_addNC256
+ addNC 3
+proc mcl_fp_addNC320
+ addNC 4
+proc mcl_fp_addNC384
+ addNC 5
+proc mcl_fp_addNC448
+ addNC 6
+proc mcl_fp_addNC512
+ addNC 7
+proc mcl_fp_addNC576
+ addNC 8
+proc mcl_fp_addNC640
+ addNC 9
+proc mcl_fp_addNC704
+ addNC 10
+proc mcl_fp_addNC768
+ addNC 11
+proc mcl_fp_addNC832
+ addNC 12
+proc mcl_fp_addNC896
+ addNC 13
+proc mcl_fp_addNC960
+ addNC 14
+proc mcl_fp_addNC1024
+ addNC 15
+proc mcl_fp_addNC1088
+ addNC 16
+proc mcl_fp_addNC1152
+ addNC 17
+proc mcl_fp_addNC1216
+ addNC 18
+proc mcl_fp_addNC1280
+ addNC 19
+proc mcl_fp_addNC1344
+ addNC 20
+proc mcl_fp_addNC1408
+ addNC 21
+proc mcl_fp_addNC1472
+ addNC 22
+proc mcl_fp_addNC1536
+ addNC 23
+