diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e88df1d441cb1cb847d01de13b0ca352f61ffae..cb5bed620459c51b22a249eafc441e4c85198dba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,7 @@ if(CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64") set(KML_LIB ${lib_blas64} ${lib_openblas}) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I ${CMAKE_CURRENT_SOURCE_DIR} -pthread -fvisibility=hidden -fvisibility-inlines-hidden") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I ${CMAKE_CURRENT_SOURCE_DIR}/submods/plink-ng/2.0/simde/ -pthread -fvisibility=hidden -fvisibility-inlines-hidden") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG") #-flto set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -fno-inline -fno-implicit-inline-templates -g3") add_compile_options(-fopenmp) diff --git a/include/FastFAM.h b/include/FastFAM.h index b81008e8739d918e681d86e7eb363b2112d7bc7c..3cd3eeba4a2831da68331c7b715ef18e5660bbe5 100644 --- a/include/FastFAM.h +++ b/include/FastFAM.h @@ -19,12 +19,6 @@ #ifndef GCTA2_FASTFAM_H #define GCTA2_FASTFAM_H -#if !defined(__aarch64__) - #ifndef EIGEN_USE_MKL_ALL - #define EIGEN_USE_MKL_ALL - #endif -#endif - #include "Logger.h" #include "Geno.h" #include "Pheno.h" @@ -34,6 +28,13 @@ #include #include #include +#include "cpu.h" + +#if !defined(GCTA_CPU_ARM) + #ifndef EIGEN_USE_MKL_ALL + #define EIGEN_USE_MKL_ALL + #endif +#endif using Eigen::Map; using Eigen::MatrixXd; diff --git a/include/Matrix.hpp b/include/Matrix.hpp index 3e3038be646544c64f81b6bd4b5e475276ad5711..165553f9313ba215c7559518d7ed0b7721a0c31e 100644 --- a/include/Matrix.hpp +++ b/include/Matrix.hpp @@ -2,11 +2,11 @@ #define GCTA2_MATRIX_H #include -#include #include #include +#include "cpu.h" -#if defined(__aarch64__) +#if defined(GCTA_CPU_ARM) #include #else #include @@ -35,7 +35,7 @@ bool _LLT(MatrixType &A, double &logdet){ int info, cols = (int)A.cols(); char uplo = 'L'; LOGGER.ts("LLT"); - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dpotrf_(&uplo, &cols, vi, &cols, &info); #else dpotrf(&uplo, &cols, vi, &cols, &info); @@ -44,7 +44,7 @@ bool _LLT(MatrixType &A, double &logdet){ if(info == 0){ logdet = A.diagonal().array().square().log().sum(); //LOGGER.ts("LLT_INV"); - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dpotri_(&uplo, &cols, vi, &cols, &info); #else dpotri(&uplo, &cols, vi, &cols, &info); diff --git a/include/cpu.h b/include/cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..a433f5efd8ba9e8da34d4fe5c3477d1c3a12beb3 --- /dev/null +++ b/include/cpu.h @@ -0,0 +1,25 @@ +#ifndef GCTA_CPU_H +#define GCTA_CPU_H + +#if defined( __i386__ ) || defined(i386) || defined(_M_IX86) + /* + * __i386__ is defined by gcc and Intel compiler on Linux, + * _M_IX86 by VS compiler, + * i386 by Sun compilers on opensolaris at least + */ + #define GCTA_CPU_X86 +#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64) + /* + * both __x86_64__ and __amd64__ are defined by gcc + * __x86_64 defined by sun compiler on opensolaris at least + * _M_AMD64 defined by MS compiler + */ + #define GCTA_CPU_AMD64 +#elif defined(__arm__) || defined(__aarch64__) + #define GCTA_CPU_ARM +#else + #error Unknown CPU +#endif + +#endif + diff --git a/main/eigen_func.h b/main/eigen_func.h index 01f770e02659ec8a1b7e8209726804444ec336ea..b321621551e940faa1a9050cfaab8c9db0632982 100644 --- a/main/eigen_func.h +++ b/main/eigen_func.h @@ -12,14 +12,9 @@ #ifndef _EIGENFUNC_H #define _EIGENFUNC_H -#if !defined(__aarch64__) - #ifndef EIGEN_USE_MKL_ALL - #define EIGEN_USE_MKL_ALL - #endif -#endif - #include "CommFunc.h" #include "StatFunc.h" +#include "cpu.h" #include #include #include @@ -29,6 +24,12 @@ #include #include +#if !defined(GCTA_CPU_ARM) + #ifndef EIGEN_USE_MKL_ALL + #define EIGEN_USE_MKL_ALL + #endif +#endif + using namespace Eigen; using namespace std; diff --git a/main/gcta.h b/main/gcta.h index 872c4f0a120b56bcac24735af0a103fa6792d9aa..f2c01b1c3b9d84925993f9710a9c0e6068d259bb 100644 --- a/main/gcta.h +++ b/main/gcta.h @@ -17,12 +17,6 @@ #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET #endif -#if !defined(__aarch64__) - #ifndef EIGEN_USE_MKL_ALL - #define EIGEN_USE_MKL_ALL - #endif -#endif - #include #include "CommFunc.h" #include "StrFunc.h" @@ -41,8 +35,15 @@ #include #include "Logger.h" #include "Matrix.hpp" +#include "cpu.h" + +#if !defined(GCTA_CPU_ARM) + #ifndef EIGEN_USE_MKL_ALL + #define EIGEN_USE_MKL_ALL + #endif +#endif -#if defined(__aarch64__) +#if defined(GCTA_CPU_ARM) #include #include #else diff --git a/main/mkl.cpp b/main/mkl.cpp index c126fc71851b439dbe98320ea038034d109125bd..ae4aafbe2ee68dfbf37327d870a88b279551d6fe 100644 --- a/main/mkl.cpp +++ b/main/mkl.cpp @@ -362,7 +362,7 @@ bool gcta::comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet) // MKL's Cholesky decomposition int info = 0, int_n = (int) n; char uplo = 'L'; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dpotrf_(&uplo, &int_n, Vi_mkl, &int_n, &info); #else dpotrf(&uplo, &int_n, Vi_mkl, &int_n, &info); @@ -383,7 +383,7 @@ bool gcta::comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet) //LOGGER << "start inverse" << endl; // Calcualte V inverse - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dpotri_(&uplo, &int_n, Vi_mkl, &int_n, &info); #else dpotri(&uplo, &int_n, Vi_mkl, &int_n, &info); @@ -428,7 +428,7 @@ bool gcta::comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet) int LWORK = N*N; double *WORK = new double[n * n]; int INFO; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dgetrf_(&N, &N, Vi_mkl, &N, IPIV, &INFO); #else dgetrf(&N, &N, Vi_mkl, &N, IPIV, &INFO); @@ -447,7 +447,7 @@ bool gcta::comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet) } // Calcualte V inverse - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dgetri_(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); #else dgetri(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); @@ -490,7 +490,7 @@ bool gcta::comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet) int LWORK = N*N; double *WORK = new double[n * n]; int INFO; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dgetrf_(&N, &N, Vi_mkl, &N, IPIV, &INFO); #else dgetrf(&N, &N, Vi_mkl, &N, IPIV, &INFO); @@ -511,7 +511,7 @@ bool gcta::comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet) } // Calcualte V inverse - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dgetri_(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); #else dgetri(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); diff --git a/src/FastFAM.cpp b/src/FastFAM.cpp index d4b8fcb0be9bfda44c2397370a75a633a9f0525e..5e0886ad8d952550000581389b8255edb7f5caec 100644 --- a/src/FastFAM.cpp +++ b/src/FastFAM.cpp @@ -21,11 +21,6 @@ #include #include #include - -#if !defined(__aarch64__) - #include -#endif - #include #include #include @@ -45,6 +40,7 @@ #include #include #include +#include "cpu.h" #include #include @@ -52,6 +48,10 @@ #include #include +#if !defined(GCTA_CPU_ARM) + #include +#endif + struct InvItem{ int32_t row; int32_t col; diff --git a/src/GRM.cpp b/src/GRM.cpp index 02c3866f3c2dfd870d290b779b7afef84e7c11aa..f9d6e73711cf8dd07551e538c4e41b34332623ae 100644 --- a/src/GRM.cpp +++ b/src/GRM.cpp @@ -34,28 +34,9 @@ #include #include #include +#include "cpu.h" -#if defined( __i386__ ) || defined(i386) || defined(_M_IX86) - /* - * __i386__ is defined by gcc and Intel compiler on Linux, - * _M_IX86 by VS compiler, - * i386 by Sun compilers on opensolaris at least - */ - #define CPU_X86 -#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64) - /* - * both __x86_64__ and __amd64__ are defined by gcc - * __x86_64 defined by sun compiler on opensolaris at least - * _M_AMD64 defined by MS compiler - */ - #define CPU_AMD64 -#elif defined(__arm__) || defined(__aarch64__) - #define CPU_ARM -#else - #error Unknown CPU -#endif - -#if defined(CPU_ARM) +#if defined(GCTA_CPU_ARM) #include #else #include @@ -944,14 +925,14 @@ void flip64(uint64_t a[64]) { //#pragma message("multiple target of N thread") //__attribute__((target_clones("popcnt","default"))) //#endif -#if defined(__linux__) && !defined(CPU_ARM) +#if defined(__linux__) && !defined(GCTA_CPU_ARM) __attribute__((target("default"))) #endif uint32_t popcounts(uint64_t dw){ return popcount(dw); } -#if defined(__linux__) && !defined(CPU_ARM) +#if defined(__linux__) && !defined(GCTA_CPU_ARM) __attribute__((target("popcnt"))) uint32_t popcounts(uint64_t dw){ return popcount(dw); @@ -1002,20 +983,20 @@ void GRM::calculate_GRM_blas(uintptr_t *buf, const vector &markerIndex static const char uplo='L'; // A * At if(part_keep_indices.first == 0){ - #if defined(CPU_ARM) + #if defined(GCTA_CPU_ARM) dsyrk_(&uplo, ¬rans, &n, &curNumValidMarkers, &alpha, stdGeno, &n_sample, &beta, grm, &m); #else dsyrk(&uplo, ¬rans, &n, &curNumValidMarkers, &alpha, stdGeno, &n_sample, &beta, grm, &m); #endif }else{ - #if defined(CPU_ARM) + #if defined(GCTA_CPU_ARM) dgemm_(¬rans, &trans, &m, &s_n, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); #else //dgemm(¬rans, &trans, &m, &n, &num_marker, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); dgemm(¬rans, &trans, &m, &s_n, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); #endif double * grm_start = grm + ((uint64_t)s_n) * m; - #if defined(CPU_ARM) + #if defined(GCTA_CPU_ARM) dsyrk_(&uplo, ¬rans, &m, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, &beta, grm_start, &m); #else dsyrk(&uplo, ¬rans, &m, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, &beta, grm_start, &m); diff --git a/src/Geno.cpp b/src/Geno.cpp index 77dc4a8a0ac1c51ea830795b8671c02b5114af35..42fc916442abee3161928e7ed69fc9e0b2f100e9 100644 --- a/src/Geno.cpp +++ b/src/Geno.cpp @@ -42,26 +42,7 @@ #include #include "submods/Pgenlib/PgenReader.h" #include - -#if defined( __i386__ ) || defined(i386) || defined(_M_IX86) - /* - * __i386__ is defined by gcc and Intel compiler on Linux, - * _M_IX86 by VS compiler, - * i386 by Sun compilers on opensolaris at least - */ - #define CPU_X86 -#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64) - /* - * both __x86_64__ and __amd64__ are defined by gcc - * __x86_64 defined by sun compiler on opensolaris at least - * _M_AMD64 defined by MS compiler - */ - #define CPU_AMD64 -#elif defined(__arm__) || defined(__aarch64__) - #define CPU_ARM -#else - #error Unknown CPU -#endif +#include "cpu.h" #ifdef _WIN64 #include @@ -79,13 +60,13 @@ #else //#define CTZU __builtin_ctz //#define CLZU __builtin_clz - #if defined(__linux__) && !defined(CPU_ARM) + #if defined(__linux__) && !defined(GCTA_CPU_ARM) __attribute__((target("default"))) #endif uint32_t CTZ64U(uint64_t value){ return __builtin_ctzll(value); } - #if defined(__linux__) && !defined(CPU_ARM) + #if defined(__linux__) && !defined(GCTA_CPU_ARM) __attribute__((target("popcnt"))) uint32_t CTZ64U(uint64_t value){ return __builtin_ctzll(value); @@ -94,7 +75,7 @@ #endif -#if defined(__linux__) && !defined(CPU_ARM) +#if defined(__linux__) && !defined(GCTA_CPU_ARM) __attribute__((target("default"))) #endif uint64_t fill_inter_zero(uint64_t x) { @@ -111,7 +92,7 @@ uint64_t fill_inter_zero(uint64_t x) { x = x ^ t ^ (t << 1); return x; } -#if defined(__linux__) && !defined(CPU_ARM) +#if defined(__linux__) && !defined(GCTA_CPU_ARM) #include __attribute__((target("bmi2"))) uint64_t fill_inter_zero(uint64_t x) { diff --git a/src/LD.cpp b/src/LD.cpp index 319b07d9a8182e87b0dd7f04684e34712b5eb10b..e8cfca0b7db0a4b426ae6a8605cd52b7dc0baa8b 100644 --- a/src/LD.cpp +++ b/src/LD.cpp @@ -9,8 +9,9 @@ #include #include #include +#include "cpu.h" -#if defined(__aarch64__) +#if defined(GCTA_CPU_ARM) #include #else #include @@ -67,7 +68,7 @@ void LD::calcLD(){ double alpha = 1.0 / (nr - 1); double *ptr1 = geno_buffer[cacl_index_buffer].get(); double *res1 = new double[nc1 * nc1]; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dsyrk_(uplo, trans, &nc1, &nr, &alpha, ptr1, &nr, &zero, res1, &nc1); #else dsyrk(uplo, trans, &nc1, &nr, &alpha, ptr1, &nr, &zero, res1, &nc1); @@ -81,7 +82,7 @@ void LD::calcLD(){ nc2 = cur_buffer_offset[!cacl_index_buffer] / nr; double *ptr2 = geno_buffer[!cacl_index_buffer].get(); res2 = new double[nc2 * nc1]; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dgemm_(trans, notrans, &nc2, &nc1, &nr, &alpha, ptr2, &nr, ptr1, &nr, &zero, res2, &nc2); #else dgemm(trans, notrans, &nc2, &nc1, &nr, &alpha, ptr2, &nr, ptr1, &nr, &zero, res2, &nc2); diff --git a/src/StatLib.cpp b/src/StatLib.cpp index d1116360a5b9df55106a3394ef38048a8e03f8ed..664b996f973c0381de7de55d80b0e2495f306347 100644 --- a/src/StatLib.cpp +++ b/src/StatLib.cpp @@ -18,8 +18,9 @@ #include #include #include +#include "cpu.h" -#if defined(__aarch64__) +#if defined(GCTA_CPU_ARM) #include #else #include @@ -57,7 +58,11 @@ namespace StatLib{ int info = 0; int lda = n; int lwork = n; - dgeqrf(&n, &n, X, &lda, tau, work, &lwork, &info); + #if defined(GCTA_CPU_ARM) + dgeqrf_(&n, &n, X, &lda, tau, work, &lwork, &info); + #else + dgeqrf(&n, &n, X, &lda, tau, work, &lwork, &info); + #endif if(info != 0){ return false; } @@ -78,7 +83,7 @@ namespace StatLib{ char side = 'L'; char t = 'N'; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dormqr_(&side, &t, &n, &n, &n, X, &lda, tau, c, &lda, work, &lwork, &info); #else