From 5e0f9b4f263274f8b6b5bde5ae672107d1adcb42 Mon Sep 17 00:00:00 2001 From: zhouzhongyuan1 Date: Wed, 7 Apr 2021 16:41:54 +0800 Subject: [PATCH] add cpu judge head file and use it --- CMakeLists.txt | 2 +- include/FastFAM.h | 13 +++++++------ include/Matrix.hpp | 8 ++++---- include/cpu.h | 25 +++++++++++++++++++++++++ main/eigen_func.h | 13 +++++++------ main/gcta.h | 15 ++++++++------- main/mkl.cpp | 12 ++++++------ src/FastFAM.cpp | 10 +++++----- src/GRM.cpp | 33 +++++++-------------------------- src/Geno.cpp | 29 +++++------------------------ src/LD.cpp | 7 ++++--- src/StatLib.cpp | 11 ++++++++--- 12 files changed, 87 insertions(+), 91 deletions(-) create mode 100644 include/cpu.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e88df1..cb5bed6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,7 @@ if(CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64") set(KML_LIB ${lib_blas64} ${lib_openblas}) if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I ${CMAKE_CURRENT_SOURCE_DIR} -pthread -fvisibility=hidden -fvisibility-inlines-hidden") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I ${CMAKE_CURRENT_SOURCE_DIR}/submods/plink-ng/2.0/simde/ -pthread -fvisibility=hidden -fvisibility-inlines-hidden") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG") #-flto set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -fno-inline -fno-implicit-inline-templates -g3") add_compile_options(-fopenmp) diff --git a/include/FastFAM.h b/include/FastFAM.h index b81008e..3cd3eeb 100644 --- a/include/FastFAM.h +++ b/include/FastFAM.h @@ -19,12 +19,6 @@ #ifndef GCTA2_FASTFAM_H #define GCTA2_FASTFAM_H -#if !defined(__aarch64__) - #ifndef EIGEN_USE_MKL_ALL - #define EIGEN_USE_MKL_ALL - #endif -#endif - #include "Logger.h" #include "Geno.h" #include "Pheno.h" @@ -34,6 +28,13 @@ #include #include #include +#include "cpu.h" + +#if !defined(GCTA_CPU_ARM) + #ifndef EIGEN_USE_MKL_ALL + #define EIGEN_USE_MKL_ALL + #endif +#endif using Eigen::Map; using Eigen::MatrixXd; diff --git a/include/Matrix.hpp b/include/Matrix.hpp index 3e3038b..165553f 100644 --- a/include/Matrix.hpp +++ b/include/Matrix.hpp @@ -2,11 +2,11 @@ #define GCTA2_MATRIX_H #include -#include #include #include +#include "cpu.h" -#if defined(__aarch64__) +#if defined(GCTA_CPU_ARM) #include #else #include @@ -35,7 +35,7 @@ bool _LLT(MatrixType &A, double &logdet){ int info, cols = (int)A.cols(); char uplo = 'L'; LOGGER.ts("LLT"); - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dpotrf_(&uplo, &cols, vi, &cols, &info); #else dpotrf(&uplo, &cols, vi, &cols, &info); @@ -44,7 +44,7 @@ bool _LLT(MatrixType &A, double &logdet){ if(info == 0){ logdet = A.diagonal().array().square().log().sum(); //LOGGER.ts("LLT_INV"); - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dpotri_(&uplo, &cols, vi, &cols, &info); #else dpotri(&uplo, &cols, vi, &cols, &info); diff --git a/include/cpu.h b/include/cpu.h new file mode 100644 index 0000000..a433f5e --- /dev/null +++ b/include/cpu.h @@ -0,0 +1,25 @@ +#ifndef GCTA_CPU_H +#define GCTA_CPU_H + +#if defined( __i386__ ) || defined(i386) || defined(_M_IX86) + /* + * __i386__ is defined by gcc and Intel compiler on Linux, + * _M_IX86 by VS compiler, + * i386 by Sun compilers on opensolaris at least + */ + #define GCTA_CPU_X86 +#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64) + /* + * both __x86_64__ and __amd64__ are defined by gcc + * __x86_64 defined by sun compiler on opensolaris at least + * _M_AMD64 defined by MS compiler + */ + #define GCTA_CPU_AMD64 +#elif defined(__arm__) || defined(__aarch64__) + #define GCTA_CPU_ARM +#else + #error Unknown CPU +#endif + +#endif + diff --git a/main/eigen_func.h b/main/eigen_func.h index 01f770e..b321621 100644 --- a/main/eigen_func.h +++ b/main/eigen_func.h @@ -12,14 +12,9 @@ #ifndef _EIGENFUNC_H #define _EIGENFUNC_H -#if !defined(__aarch64__) - #ifndef EIGEN_USE_MKL_ALL - #define EIGEN_USE_MKL_ALL - #endif -#endif - #include "CommFunc.h" #include "StatFunc.h" +#include "cpu.h" #include #include #include @@ -29,6 +24,12 @@ #include #include +#if !defined(GCTA_CPU_ARM) + #ifndef EIGEN_USE_MKL_ALL + #define EIGEN_USE_MKL_ALL + #endif +#endif + using namespace Eigen; using namespace std; diff --git a/main/gcta.h b/main/gcta.h index 872c4f0..f2c01b1 100644 --- a/main/gcta.h +++ b/main/gcta.h @@ -17,12 +17,6 @@ #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET #endif -#if !defined(__aarch64__) - #ifndef EIGEN_USE_MKL_ALL - #define EIGEN_USE_MKL_ALL - #endif -#endif - #include #include "CommFunc.h" #include "StrFunc.h" @@ -41,8 +35,15 @@ #include #include "Logger.h" #include "Matrix.hpp" +#include "cpu.h" + +#if !defined(GCTA_CPU_ARM) + #ifndef EIGEN_USE_MKL_ALL + #define EIGEN_USE_MKL_ALL + #endif +#endif -#if defined(__aarch64__) +#if defined(GCTA_CPU_ARM) #include #include #else diff --git a/main/mkl.cpp b/main/mkl.cpp index c126fc7..ae4aafb 100644 --- a/main/mkl.cpp +++ b/main/mkl.cpp @@ -362,7 +362,7 @@ bool gcta::comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet) // MKL's Cholesky decomposition int info = 0, int_n = (int) n; char uplo = 'L'; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dpotrf_(&uplo, &int_n, Vi_mkl, &int_n, &info); #else dpotrf(&uplo, &int_n, Vi_mkl, &int_n, &info); @@ -383,7 +383,7 @@ bool gcta::comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet) //LOGGER << "start inverse" << endl; // Calcualte V inverse - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dpotri_(&uplo, &int_n, Vi_mkl, &int_n, &info); #else dpotri(&uplo, &int_n, Vi_mkl, &int_n, &info); @@ -428,7 +428,7 @@ bool gcta::comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet) int LWORK = N*N; double *WORK = new double[n * n]; int INFO; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dgetrf_(&N, &N, Vi_mkl, &N, IPIV, &INFO); #else dgetrf(&N, &N, Vi_mkl, &N, IPIV, &INFO); @@ -447,7 +447,7 @@ bool gcta::comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet) } // Calcualte V inverse - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dgetri_(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); #else dgetri(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); @@ -490,7 +490,7 @@ bool gcta::comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet) int LWORK = N*N; double *WORK = new double[n * n]; int INFO; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dgetrf_(&N, &N, Vi_mkl, &N, IPIV, &INFO); #else dgetrf(&N, &N, Vi_mkl, &N, IPIV, &INFO); @@ -511,7 +511,7 @@ bool gcta::comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet) } // Calcualte V inverse - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dgetri_(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); #else dgetri(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO); diff --git a/src/FastFAM.cpp b/src/FastFAM.cpp index d4b8fcb..5e0886a 100644 --- a/src/FastFAM.cpp +++ b/src/FastFAM.cpp @@ -21,11 +21,6 @@ #include #include #include - -#if !defined(__aarch64__) - #include -#endif - #include #include #include @@ -45,6 +40,7 @@ #include #include #include +#include "cpu.h" #include #include @@ -52,6 +48,10 @@ #include #include +#if !defined(GCTA_CPU_ARM) + #include +#endif + struct InvItem{ int32_t row; int32_t col; diff --git a/src/GRM.cpp b/src/GRM.cpp index 02c3866..f9d6e73 100644 --- a/src/GRM.cpp +++ b/src/GRM.cpp @@ -34,28 +34,9 @@ #include #include #include +#include "cpu.h" -#if defined( __i386__ ) || defined(i386) || defined(_M_IX86) - /* - * __i386__ is defined by gcc and Intel compiler on Linux, - * _M_IX86 by VS compiler, - * i386 by Sun compilers on opensolaris at least - */ - #define CPU_X86 -#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64) - /* - * both __x86_64__ and __amd64__ are defined by gcc - * __x86_64 defined by sun compiler on opensolaris at least - * _M_AMD64 defined by MS compiler - */ - #define CPU_AMD64 -#elif defined(__arm__) || defined(__aarch64__) - #define CPU_ARM -#else - #error Unknown CPU -#endif - -#if defined(CPU_ARM) +#if defined(GCTA_CPU_ARM) #include #else #include @@ -944,14 +925,14 @@ void flip64(uint64_t a[64]) { //#pragma message("multiple target of N thread") //__attribute__((target_clones("popcnt","default"))) //#endif -#if defined(__linux__) && !defined(CPU_ARM) +#if defined(__linux__) && !defined(GCTA_CPU_ARM) __attribute__((target("default"))) #endif uint32_t popcounts(uint64_t dw){ return popcount(dw); } -#if defined(__linux__) && !defined(CPU_ARM) +#if defined(__linux__) && !defined(GCTA_CPU_ARM) __attribute__((target("popcnt"))) uint32_t popcounts(uint64_t dw){ return popcount(dw); @@ -1002,20 +983,20 @@ void GRM::calculate_GRM_blas(uintptr_t *buf, const vector &markerIndex static const char uplo='L'; // A * At if(part_keep_indices.first == 0){ - #if defined(CPU_ARM) + #if defined(GCTA_CPU_ARM) dsyrk_(&uplo, ¬rans, &n, &curNumValidMarkers, &alpha, stdGeno, &n_sample, &beta, grm, &m); #else dsyrk(&uplo, ¬rans, &n, &curNumValidMarkers, &alpha, stdGeno, &n_sample, &beta, grm, &m); #endif }else{ - #if defined(CPU_ARM) + #if defined(GCTA_CPU_ARM) dgemm_(¬rans, &trans, &m, &s_n, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); #else //dgemm(¬rans, &trans, &m, &n, &num_marker, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); dgemm(¬rans, &trans, &m, &s_n, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m); #endif double * grm_start = grm + ((uint64_t)s_n) * m; - #if defined(CPU_ARM) + #if defined(GCTA_CPU_ARM) dsyrk_(&uplo, ¬rans, &m, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, &beta, grm_start, &m); #else dsyrk(&uplo, ¬rans, &m, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, &beta, grm_start, &m); diff --git a/src/Geno.cpp b/src/Geno.cpp index 77dc4a8..42fc916 100644 --- a/src/Geno.cpp +++ b/src/Geno.cpp @@ -42,26 +42,7 @@ #include #include "submods/Pgenlib/PgenReader.h" #include - -#if defined( __i386__ ) || defined(i386) || defined(_M_IX86) - /* - * __i386__ is defined by gcc and Intel compiler on Linux, - * _M_IX86 by VS compiler, - * i386 by Sun compilers on opensolaris at least - */ - #define CPU_X86 -#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64) - /* - * both __x86_64__ and __amd64__ are defined by gcc - * __x86_64 defined by sun compiler on opensolaris at least - * _M_AMD64 defined by MS compiler - */ - #define CPU_AMD64 -#elif defined(__arm__) || defined(__aarch64__) - #define CPU_ARM -#else - #error Unknown CPU -#endif +#include "cpu.h" #ifdef _WIN64 #include @@ -79,13 +60,13 @@ #else //#define CTZU __builtin_ctz //#define CLZU __builtin_clz - #if defined(__linux__) && !defined(CPU_ARM) + #if defined(__linux__) && !defined(GCTA_CPU_ARM) __attribute__((target("default"))) #endif uint32_t CTZ64U(uint64_t value){ return __builtin_ctzll(value); } - #if defined(__linux__) && !defined(CPU_ARM) + #if defined(__linux__) && !defined(GCTA_CPU_ARM) __attribute__((target("popcnt"))) uint32_t CTZ64U(uint64_t value){ return __builtin_ctzll(value); @@ -94,7 +75,7 @@ #endif -#if defined(__linux__) && !defined(CPU_ARM) +#if defined(__linux__) && !defined(GCTA_CPU_ARM) __attribute__((target("default"))) #endif uint64_t fill_inter_zero(uint64_t x) { @@ -111,7 +92,7 @@ uint64_t fill_inter_zero(uint64_t x) { x = x ^ t ^ (t << 1); return x; } -#if defined(__linux__) && !defined(CPU_ARM) +#if defined(__linux__) && !defined(GCTA_CPU_ARM) #include __attribute__((target("bmi2"))) uint64_t fill_inter_zero(uint64_t x) { diff --git a/src/LD.cpp b/src/LD.cpp index 319b07d..e8cfca0 100644 --- a/src/LD.cpp +++ b/src/LD.cpp @@ -9,8 +9,9 @@ #include #include #include +#include "cpu.h" -#if defined(__aarch64__) +#if defined(GCTA_CPU_ARM) #include #else #include @@ -67,7 +68,7 @@ void LD::calcLD(){ double alpha = 1.0 / (nr - 1); double *ptr1 = geno_buffer[cacl_index_buffer].get(); double *res1 = new double[nc1 * nc1]; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dsyrk_(uplo, trans, &nc1, &nr, &alpha, ptr1, &nr, &zero, res1, &nc1); #else dsyrk(uplo, trans, &nc1, &nr, &alpha, ptr1, &nr, &zero, res1, &nc1); @@ -81,7 +82,7 @@ void LD::calcLD(){ nc2 = cur_buffer_offset[!cacl_index_buffer] / nr; double *ptr2 = geno_buffer[!cacl_index_buffer].get(); res2 = new double[nc2 * nc1]; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dgemm_(trans, notrans, &nc2, &nc1, &nr, &alpha, ptr2, &nr, ptr1, &nr, &zero, res2, &nc2); #else dgemm(trans, notrans, &nc2, &nc1, &nr, &alpha, ptr2, &nr, ptr1, &nr, &zero, res2, &nc2); diff --git a/src/StatLib.cpp b/src/StatLib.cpp index d111636..664b996 100644 --- a/src/StatLib.cpp +++ b/src/StatLib.cpp @@ -18,8 +18,9 @@ #include #include #include +#include "cpu.h" -#if defined(__aarch64__) +#if defined(GCTA_CPU_ARM) #include #else #include @@ -57,7 +58,11 @@ namespace StatLib{ int info = 0; int lda = n; int lwork = n; - dgeqrf(&n, &n, X, &lda, tau, work, &lwork, &info); + #if defined(GCTA_CPU_ARM) + dgeqrf_(&n, &n, X, &lda, tau, work, &lwork, &info); + #else + dgeqrf(&n, &n, X, &lda, tau, work, &lwork, &info); + #endif if(info != 0){ return false; } @@ -78,7 +83,7 @@ namespace StatLib{ char side = 'L'; char t = 'N'; - #if defined(__aarch64__) + #if defined(GCTA_CPU_ARM) dormqr_(&side, &t, &n, &n, &n, X, &lda, tau, c, &lda, work, &lwork, &info); #else -- Gitee