From 5e0f9b4f263274f8b6b5bde5ae672107d1adcb42 Mon Sep 17 00:00:00 2001
From: zhouzhongyuan1 <zhouzhongyuan96@gmail.com>
Date: Wed, 7 Apr 2021 16:41:54 +0800
Subject: [PATCH] add CPU detection header (cpu.h) and use it

---
 CMakeLists.txt     |  2 +-
 include/FastFAM.h  | 13 +++++++------
 include/Matrix.hpp |  8 ++++----
 include/cpu.h      | 25 +++++++++++++++++++++++++
 main/eigen_func.h  | 13 +++++++------
 main/gcta.h        | 15 ++++++++-------
 main/mkl.cpp       | 12 ++++++------
 src/FastFAM.cpp    | 10 +++++-----
 src/GRM.cpp        | 33 +++++++--------------------------
 src/Geno.cpp       | 29 +++++------------------------
 src/LD.cpp         |  7 ++++---
 src/StatLib.cpp    | 11 ++++++++---
 12 files changed, 87 insertions(+), 91 deletions(-)
 create mode 100644 include/cpu.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e88df1..cb5bed6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,7 +91,7 @@ if(CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64")
     set(KML_LIB ${lib_blas64} ${lib_openblas})
 
     if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I ${CMAKE_CURRENT_SOURCE_DIR} -pthread -fvisibility=hidden -fvisibility-inlines-hidden")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I ${CMAKE_CURRENT_SOURCE_DIR}/submods/plink-ng/2.0/simde/ -pthread -fvisibility=hidden -fvisibility-inlines-hidden")
         set(CMAKE_CXX_FLAGS_RELEASE  "${CMAKE_CXX_FLAGS_RELEASE} -O3 -DNDEBUG") #-flto 
         set(CMAKE_CXX_FLAGS_DEBUG  "${CMAKE_CXX_FLAGS_DEBUG} -O0 -fno-inline -fno-implicit-inline-templates -g3")
         add_compile_options(-fopenmp)
diff --git a/include/FastFAM.h b/include/FastFAM.h
index b81008e..3cd3eeb 100644
--- a/include/FastFAM.h
+++ b/include/FastFAM.h
@@ -19,12 +19,16 @@
 #ifndef GCTA2_FASTFAM_H
 #define GCTA2_FASTFAM_H
 
-#if !defined(__aarch64__)
+#include "cpu.h"
+
+// EIGEN_USE_MKL_ALL is only honored when it is defined before the first Eigen
+// header is seen, so keep this block ahead of every other #include in this file.
+#if !defined(GCTA_CPU_ARM)
     #ifndef EIGEN_USE_MKL_ALL
     #define EIGEN_USE_MKL_ALL
     #endif
 #endif
 
 #include "Logger.h"
 #include "Geno.h"
 #include "Pheno.h"
diff --git a/include/Matrix.hpp b/include/Matrix.hpp
index 3e3038b..165553f 100644
--- a/include/Matrix.hpp
+++ b/include/Matrix.hpp
@@ -2,11 +2,11 @@
 #define GCTA2_MATRIX_H
 
 #include <Eigen/Eigen>
-#include <mkl.h>
 #include <iostream>
 #include <Logger.h>
+#include "cpu.h"
 
-#if defined(__aarch64__)
+#if defined(GCTA_CPU_ARM)
     #include <lapack.h>
 #else
     #include <mkl.h>
@@ -35,7 +35,7 @@ bool _LLT(MatrixType &A, double &logdet){
     int info, cols = (int)A.cols();
     char uplo = 'L';
     LOGGER.ts("LLT");
-    #if defined(__aarch64__)
+    #if defined(GCTA_CPU_ARM)
         dpotrf_(&uplo, &cols, vi, &cols, &info);
     #else
         dpotrf(&uplo, &cols, vi, &cols, &info);
@@ -44,7 +44,7 @@ bool _LLT(MatrixType &A, double &logdet){
     if(info == 0){
         logdet = A.diagonal().array().square().log().sum();
         //LOGGER.ts("LLT_INV");
-        #if defined(__aarch64__)
+        #if defined(GCTA_CPU_ARM)
             dpotri_(&uplo, &cols, vi, &cols, &info);
         #else
             dpotri(&uplo, &cols, vi, &cols, &info);
diff --git a/include/cpu.h b/include/cpu.h
new file mode 100644
index 0000000..a433f5e
--- /dev/null
+++ b/include/cpu.h
@@ -0,0 +1,25 @@
+#ifndef GCTA_CPU_H
+#define GCTA_CPU_H /* defines exactly one of GCTA_CPU_X86, GCTA_CPU_AMD64, GCTA_CPU_ARM */
+
+#if defined( __i386__ ) || defined(i386) || defined(_M_IX86)
+    /*
+     * 32-bit x86: __i386__ is defined by GCC and the Intel compiler on Linux,
+     * _M_IX86 is defined by the MSVC compiler,
+     * i386 is defined by Sun compilers (at least on OpenSolaris).
+     */
+    #define GCTA_CPU_X86
+#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64)
+    /*
+     * 64-bit x86: GCC defines both __x86_64__ and __amd64__,
+     * __x86_64 is defined by Sun compilers (at least on OpenSolaris),
+     * _M_AMD64 is defined by the MSVC compiler.
+     */
+    #define GCTA_CPU_AMD64
+#elif defined(__arm__) || defined(__aarch64__)
+    #define GCTA_CPU_ARM /* set for both __arm__ and __aarch64__ targets */
+#else /* none of the macros above matched: stop the build */
+    #error Unknown CPU
+#endif /* CPU family detection */
+
+#endif /* GCTA_CPU_H */
+
diff --git a/main/eigen_func.h b/main/eigen_func.h
index 01f770e..b321621 100644
--- a/main/eigen_func.h
+++ b/main/eigen_func.h
@@ -12,14 +12,18 @@
 #ifndef _EIGENFUNC_H
 #define _EIGENFUNC_H
 
-#if !defined(__aarch64__)
+#include "cpu.h"
+
+// EIGEN_USE_MKL_ALL is only honored when it is defined before the first Eigen
+// header is seen, so keep this block ahead of the Eigen includes below.
+#if !defined(GCTA_CPU_ARM)
     #ifndef EIGEN_USE_MKL_ALL
     #define EIGEN_USE_MKL_ALL
     #endif
 #endif
 
 #include "CommFunc.h"
 #include "StatFunc.h"
 #include <Eigen/Dense>
 #include <Eigen/Sparse>
 #include <unsupported/Eigen/SparseExtra>
diff --git a/main/gcta.h b/main/gcta.h
index 872c4f0..f2c01b1 100644
--- a/main/gcta.h
+++ b/main/gcta.h
@@ -17,12 +17,16 @@
 #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
 #endif
 
-#if !defined(__aarch64__)
+#include "cpu.h"
+
+// EIGEN_USE_MKL_ALL is only honored when it is defined before the first Eigen
+// header is seen, so keep this block ahead of every other #include in this file.
+#if !defined(GCTA_CPU_ARM)
     #ifndef EIGEN_USE_MKL_ALL
     #define EIGEN_USE_MKL_ALL
     #endif
 #endif
 
 #include <cstdio>
 #include "CommFunc.h"
 #include "StrFunc.h"
@@ -41,8 +45,8 @@
 #include <omp.h>
 #include "Logger.h"
 #include "Matrix.hpp"
 
-#if defined(__aarch64__)
+#if defined(GCTA_CPU_ARM)
     #include <kblas.h>
     #include <lapack.h>
 #else
diff --git a/main/mkl.cpp b/main/mkl.cpp
index c126fc7..ae4aafb 100644
--- a/main/mkl.cpp
+++ b/main/mkl.cpp
@@ -362,7 +362,7 @@ bool gcta::comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet)
     // MKL's Cholesky decomposition
     int info = 0, int_n = (int) n;
     char uplo = 'L';
-    #if defined(__aarch64__)
+    #if defined(GCTA_CPU_ARM)
         dpotrf_(&uplo, &int_n, Vi_mkl, &int_n, &info);
     #else
         dpotrf(&uplo, &int_n, Vi_mkl, &int_n, &info);
@@ -383,7 +383,7 @@ bool gcta::comput_inverse_logdet_LDLT_mkl(eigenMatrix &Vi, double &logdet)
 
         //LOGGER << "start inverse" << endl;
         // Calcualte V inverse
-        #if defined(__aarch64__)
+        #if defined(GCTA_CPU_ARM)
             dpotri_(&uplo, &int_n, Vi_mkl, &int_n, &info);
         #else
             dpotri(&uplo, &int_n, Vi_mkl, &int_n, &info);
@@ -428,7 +428,7 @@ bool gcta::comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet)
     int LWORK = N*N;
     double *WORK = new double[n * n];
     int INFO;
-    #if defined(__aarch64__)
+    #if defined(GCTA_CPU_ARM)
         dgetrf_(&N, &N, Vi_mkl, &N, IPIV, &INFO);
     #else
         dgetrf(&N, &N, Vi_mkl, &N, IPIV, &INFO);
@@ -447,7 +447,7 @@ bool gcta::comput_inverse_logdet_LU_mkl(eigenMatrix &Vi, double &logdet)
         }
 
         // Calcualte V inverse
-        #if defined(__aarch64__)
+        #if defined(GCTA_CPU_ARM)
             dgetri_(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO);
         #else
             dgetri(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO);
@@ -490,7 +490,7 @@ bool gcta::comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet)
     int LWORK = N*N;
     double *WORK = new double[n * n];
     int INFO;
-    #if defined(__aarch64__)
+    #if defined(GCTA_CPU_ARM)
         dgetrf_(&N, &N, Vi_mkl, &N, IPIV, &INFO);
     #else
         dgetrf(&N, &N, Vi_mkl, &N, IPIV, &INFO);
@@ -511,7 +511,7 @@ bool gcta::comput_inverse_logdet_LU_mkl_array(int n, float *Vi, double &logdet)
         }
 
         // Calcualte V inverse
-        #if defined(__aarch64__)
+        #if defined(GCTA_CPU_ARM)
             dgetri_(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO);
         #else
             dgetri(&N, Vi_mkl, &N, IPIV, WORK, &LWORK, &INFO);
diff --git a/src/FastFAM.cpp b/src/FastFAM.cpp
index d4b8fcb..5e0886a 100644
--- a/src/FastFAM.cpp
+++ b/src/FastFAM.cpp
@@ -21,11 +21,6 @@
 #include <cmath>
 #include <algorithm>
 #include <Eigen/SparseCholesky>
-
-#if !defined(__aarch64__)
-    #include <Eigen/PardisoSupport>
-#endif
-
 #include <Eigen/IterativeLinearSolvers>
 #include <Eigen/Sparse>
 #include <sstream>
@@ -45,6 +40,7 @@
 #include <cstdio>
 #include <random>
 #include <chrono>
+#include "cpu.h"
 
 #include <Eigen/Core>
 #include <Eigen/SparseCore>
@@ -52,6 +48,10 @@
 #include <Spectra/SymEigsSolver.h>
 #include <Spectra/MatOp/SparseSymMatProd.h>
 
+#if !defined(GCTA_CPU_ARM)
+    #include <Eigen/PardisoSupport>
+#endif
+
 struct InvItem{
     int32_t row;
     int32_t col;
diff --git a/src/GRM.cpp b/src/GRM.cpp
index 02c3866..f9d6e73 100644
--- a/src/GRM.cpp
+++ b/src/GRM.cpp
@@ -34,28 +34,9 @@
 #include <boost/algorithm/string/join.hpp>
 #include <sstream>
 #include <csignal>
+#include "cpu.h"
 
-#if defined( __i386__ ) || defined(i386) || defined(_M_IX86)
-    /*
-     * __i386__ is defined by gcc and Intel compiler on Linux,
-     * _M_IX86 by VS compiler,
-     * i386 by Sun compilers on opensolaris at least
-     */
-    #define CPU_X86
-#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64)
-    /*
-     * both __x86_64__ and __amd64__ are defined by gcc
-     * __x86_64 defined by sun compiler on opensolaris at least
-     * _M_AMD64 defined by MS compiler
-     */
-    #define CPU_AMD64
-#elif defined(__arm__) || defined(__aarch64__)
-    #define CPU_ARM
-#else
-    #error Unknown CPU
-#endif
-
-#if defined(CPU_ARM)
+#if defined(GCTA_CPU_ARM)
     #include <f77blas.h>
 #else
     #include <mkl.h>
@@ -944,14 +925,14 @@ void flip64(uint64_t a[64]) {
 //#pragma message("multiple target of N thread")
 //__attribute__((target_clones("popcnt","default")))
 //#endif
-#if defined(__linux__) && !defined(CPU_ARM)
+#if defined(__linux__) && !defined(GCTA_CPU_ARM)
 __attribute__((target("default")))
 #endif
 uint32_t popcounts(uint64_t dw){
     return popcount(dw);
 }
 
-#if defined(__linux__) && !defined(CPU_ARM)
+#if defined(__linux__) && !defined(GCTA_CPU_ARM)
 __attribute__((target("popcnt")))
 uint32_t popcounts(uint64_t dw){
     return popcount(dw);
@@ -1002,20 +983,20 @@ void GRM::calculate_GRM_blas(uintptr_t *buf, const vector<uint32_t> &markerIndex
     static const char uplo='L';
    // A * At 
     if(part_keep_indices.first == 0){
-        #if defined(CPU_ARM)
+        #if defined(GCTA_CPU_ARM)
             dsyrk_(&uplo, &notrans, &n, &curNumValidMarkers, &alpha, stdGeno, &n_sample, &beta, grm, &m);
         #else
             dsyrk(&uplo, &notrans, &n, &curNumValidMarkers, &alpha, stdGeno, &n_sample, &beta, grm, &m);
         #endif
     }else{
-        #if defined(CPU_ARM)
+        #if defined(GCTA_CPU_ARM)
             dgemm_(&notrans, &trans, &m, &s_n, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m);
         #else
             //dgemm(&notrans, &trans, &m, &n, &num_marker, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m);
             dgemm(&notrans, &trans, &m, &s_n, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, stdGeno, &n_sample, &beta, grm, &m);
         #endif
         double * grm_start = grm + ((uint64_t)s_n) * m;
-        #if defined(CPU_ARM)
+        #if defined(GCTA_CPU_ARM)
             dsyrk_(&uplo, &notrans, &m, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, &beta, grm_start, &m); 
         #else
             dsyrk(&uplo, &notrans, &m, &curNumValidMarkers, &alpha, stdGeno + part_keep_indices.first, &n_sample, &beta, grm_start, &m); 
diff --git a/src/Geno.cpp b/src/Geno.cpp
index 77dc4a8..42fc916 100644
--- a/src/Geno.cpp
+++ b/src/Geno.cpp
@@ -42,26 +42,7 @@
 #include <algorithm>
 #include "submods/Pgenlib/PgenReader.h"
 #include <numeric>
-
-#if defined( __i386__ ) || defined(i386) || defined(_M_IX86)
-    /*
-     * __i386__ is defined by gcc and Intel compiler on Linux,
-     * _M_IX86 by VS compiler,
-     * i386 by Sun compilers on opensolaris at least
-     */
-    #define CPU_X86
-#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64)
-    /*
-     * both __x86_64__ and __amd64__ are defined by gcc
-     * __x86_64 defined by sun compiler on opensolaris at least
-     * _M_AMD64 defined by MS compiler
-     */
-    #define CPU_AMD64
-#elif defined(__arm__) || defined(__aarch64__)
-    #define CPU_ARM
-#else
-    #error Unknown CPU
-#endif
+#include "cpu.h"
 
 #ifdef _WIN64
   #include <intrin.h>
@@ -79,13 +60,13 @@
 #else
   //#define CTZU __builtin_ctz
   //#define CLZU __builtin_clz
-  #if defined(__linux__) && !defined(CPU_ARM)
+  #if defined(__linux__) && !defined(GCTA_CPU_ARM)
   __attribute__((target("default")))
   #endif
   uint32_t CTZ64U(uint64_t value){
       return __builtin_ctzll(value);
   }
-  #if defined(__linux__) && !defined(CPU_ARM)
+  #if defined(__linux__) && !defined(GCTA_CPU_ARM)
   __attribute__((target("popcnt")))
   uint32_t CTZ64U(uint64_t value){
       return __builtin_ctzll(value);
@@ -94,7 +75,7 @@
  
 #endif
 
-#if defined(__linux__) && !defined(CPU_ARM)
+#if defined(__linux__) && !defined(GCTA_CPU_ARM)
 __attribute__((target("default")))
 #endif
 uint64_t fill_inter_zero(uint64_t x) {
@@ -111,7 +92,7 @@ uint64_t fill_inter_zero(uint64_t x) {
    x = x ^ t ^ (t << 1);
    return x;
 }
-#if defined(__linux__) && !defined(CPU_ARM)
+#if defined(__linux__) && !defined(GCTA_CPU_ARM)
 #include <x86intrin.h>
 __attribute__((target("bmi2")))
 uint64_t fill_inter_zero(uint64_t x) {
diff --git a/src/LD.cpp b/src/LD.cpp
index 319b07d..e8cfca0 100644
--- a/src/LD.cpp
+++ b/src/LD.cpp
@@ -9,8 +9,9 @@
 #include <omp.h>
 #include <cstdio>
 #include <algorithm>
+#include "cpu.h"
 
-#if defined(__aarch64__)
+#if defined(GCTA_CPU_ARM)
     #include <f77blas.h>
 #else
     #include <mkl.h>
@@ -67,7 +68,7 @@ void LD::calcLD(){
     double alpha = 1.0 / (nr - 1);
     double *ptr1 = geno_buffer[cacl_index_buffer].get();
     double *res1 = new double[nc1 * nc1];
-    #if defined(__aarch64__)
+    #if defined(GCTA_CPU_ARM)
         dsyrk_(uplo, trans, &nc1, &nr, &alpha, ptr1, &nr, &zero, res1, &nc1);
     #else
         dsyrk(uplo, trans, &nc1, &nr, &alpha, ptr1, &nr, &zero, res1, &nc1);
@@ -81,7 +82,7 @@ void LD::calcLD(){
         nc2 = cur_buffer_offset[!cacl_index_buffer] / nr;
         double *ptr2 = geno_buffer[!cacl_index_buffer].get();
         res2 = new double[nc2 * nc1];
-        #if defined(__aarch64__)
+        #if defined(GCTA_CPU_ARM)
             dgemm_(trans, notrans, &nc2, &nc1, &nr, &alpha, ptr2, &nr, ptr1, &nr, &zero, res2, &nc2);
         #else
             dgemm(trans, notrans, &nc2, &nc1, &nr, &alpha, ptr2, &nr, ptr1, &nr, &zero, res2, &nc2);
diff --git a/src/StatLib.cpp b/src/StatLib.cpp
index d111636..664b996 100644
--- a/src/StatLib.cpp
+++ b/src/StatLib.cpp
@@ -18,8 +18,9 @@
 #include <cstdio>
 #include <limits>
 #include <boost/math/distributions/chi_squared.hpp>
+#include "cpu.h"
 
-#if defined(__aarch64__)
+#if defined(GCTA_CPU_ARM)
     #include <lapack.h>
 #else
     #include <mkl.h>
@@ -57,7 +58,11 @@ namespace StatLib{
         int info = 0;
         int lda = n;
         int lwork = n;
-        dgeqrf(&n, &n, X, &lda, tau, work, &lwork, &info);
+        #if defined(GCTA_CPU_ARM)
+            dgeqrf_(&n, &n, X, &lda, tau, work, &lwork, &info);
+        #else
+            dgeqrf(&n, &n, X, &lda, tau, work, &lwork, &info);
+        #endif
         if(info != 0){
             return false;
         }
@@ -78,7 +83,7 @@ namespace StatLib{
         char side = 'L';
         char t = 'N';
 
-        #if defined(__aarch64__)
+        #if defined(GCTA_CPU_ARM)
             dormqr_(&side, &t, &n, &n, &n, X, &lda, tau, c, 
                     &lda, work, &lwork, &info);
         #else
-- 
Gitee