diff --git a/Affinity/0-env/env.md b/Affinity/0-env/env.md new file mode 100644 index 0000000000000000000000000000000000000000..6ea34a16620ef0d8db8f57418c7799f69fe6f54e --- /dev/null +++ b/Affinity/0-env/env.md @@ -0,0 +1,166 @@ +# 实验环境准备 + +Preparation of the Experimental Environment + +我们以鲲鹏处理器的华为云ECS openEuler 22.03 64bit with ARM为实验演示环境,由于是在虚拟机上进行实验,安全要求不是很严格,为方便起见,我们以root用户登录进行演示。 + +以下是实验环境的一些展示: + +```shell +uname -m # aarch64 +uname -r # 5.10.0-60.139.0.166.oe2203.aarch64 +cat /etc/os-release # openEuler 22.03 LTS +whoami # root +``` + +我们首先安装开发工具: + +```shell +yum group install -y "Development Tools" +yum install -y zlib-devel +``` + +🔔*注意* + +如果在安装的过程中出现“Error: GPG check FAILED”错误则可在yum install命令后增加--nogpgcheck选项,例如: + +```shell +yum group install -y "Development Tools" --nogpgcheck +``` + +检查GCC版本: + +```shell +gcc --version # gcc (GCC) 10.3.1 +``` + + + +我们现在要安装[鲲鹏DevKit](https://www.hikunpeng.com/developer/devkit)工具包,在这里我们安装其命令行工具包,目前的版本是25.2.RC1,其安装步骤如下: + +🎶步骤 1:在[鲲鹏DevKit下载中心](https://www.hikunpeng.com/zh/developer/devkit/download)将命令行工具包下载到PC本地 + +本实验中工具包的名称:**DevKit-CLI-25.2.RC1-Linux-Kunpeng.tar.gz** + +🎶步骤 2:在PC的命令窗中用scp命令将其上传至华为云鲲鹏ECS + +```shell +scp DevKit-CLI-25.2.RC1-Linux-Kunpeng.tar.gz root@1.92.92.92:~/ +``` + +*注意:*请将“*1.92.92.92*”替换成您自己ECS的EIP。 + +🎶步骤 3:在PC的命令窗中以root身份登录华为云鲲鹏ECS + +```shell +ssh root@1.92.92.92 +``` + +🔔*注意* + +执行上述命令时,假设您已经在[华为云](https://console.huaweicloud.com/)创建了一台华为云鲲鹏ECS,这里的IP地址是一个示例,应该替换成您自己华为云鲲鹏ECS的IP地址。在这里华为云鲲鹏ECS也是一个示例,您可以使用自己的鲲鹏平台。 + + + +🎶步骤 4:在华为云鲲鹏ECS中,将工具包解压至/opt/DevKit/ + +```shell +mkdir /opt/DevKit/ && tar -xzf DevKit-CLI-25.2.RC1-Linux-Kunpeng.tar.gz -C $_ +``` + +🎶步骤 5:设置PATH环境变量 + +```shell +cp ~/.bashrc{,.origin} +echo >> ~/.bashrc +echo "export PATH=$PATH:/opt/DevKit/DevKit-CLI-25.2.RC1-Linux-Kunpeng" >> ~/.bashrc +source ~/.bashrc +``` + +🎶步骤 6:验证安装 + +```shell +devkit --version # devkit version 25.2.RC1 +devkit advisor -h +``` + +后一个命令显示如下: + +```txt + + Usage: devkit advisor [-h|--help] TASK 
[ARGS] + + The most commonly used devkit advisor sub tasks are: + run-mode Run the 64-bit running mode check task. + addr-align Run the address alignment of the structure task. + cacheline Run the cache line alignment check task. + bc-gen Run the BC file generation task. + mm-check Run the memory mode check task. + vec-check Run the vectorization check task. + affi-check Run the affinity check task. + matrix-check Run the matricization check task. + dr-check Run the dynamic memory consistency check task. + precision Run the precision analysis task. + knet Run the K-NET analysis tools task. + + See 'devkit advisor TASK -h/--help' for more information on a specific task. + +``` + +可以进一步寻求子命令的帮助,例如: + +```shell +devkit advisor run-mode -h +``` + + + +🎶步骤 7:启动HTTP服务器 + +我们可以利用Python的http.server模块启动一个简单的HTTP服务器: + +```shell +mkdir ~/Reports && cd $_ +ip addr # For instance: eth0: ... inet 192.168.0.64/24 +python3 -m http.server 9000 --bind 192.168.0.64 --directory ~/Reports +``` + +上述命令用Python启动了一个简单的HTTP服务器,它的端口号是*9000*(所以需要在您的ECS上开放TCP协议*9000*端口),*192.168.0.64*是该ECS的本地IP地址,服务器的工作目录被设置为*~/Reports*。要结束该程序,可以按“Ctrl+c”。 + +我们可以在PC端的浏览器里输入ECS的EIP地址访问这个服务器: + +```txt +http://1.92.92.92:9000/ +``` + +目前还没有任何内容可以显示,在后面的实验中我们可以把DevKit工具生成的HTML格式的报告“扔”到其工作目录,这样我们就可以很方便的读到报告的内容了。 + +🔔*注意* + +要PC端的浏览器可以访问这个HTTP服务器,请确保您的ECS开放了TCP协议的*9000*端口。对于华为云ECS,请在其所用“安全组”中进行设置。 + + + +🎶步骤 8:另起一个终端并创建工作目录 + +我们在PC端另外启一个命令窗,登录到ECS后创建一个工作目录: + +```shell +mkdir ~/workspace && cd $_ +pwd # /root/workspace +``` + +我们将在此工作目录下进行实验。 + + + +📄术语表 + +| 缩略语 | 英文全称 | 中文全称 | +| :----- | :-------------------- | :----------- | +| ECS | Elastic Cloud Server | 弹性云服务器 | +| EIP | Elastic IP | 弹性IP地址 | +| PC | Personal Computer | 个人电脑 | +| SG | Security Groups | 安全组 | +| VPC | Virtual Private Cloud | 虚拟私有云 | + diff --git a/Affinity/1-64bit-check/64bit-check.md b/Affinity/1-64bit-check/64bit-check.md new file mode 100644 index 0000000000000000000000000000000000000000..3e72f035cbad08fa57af8b2f2bc039d61c2683f7 --- 
/dev/null +++ b/Affinity/1-64bit-check/64bit-check.md @@ -0,0 +1,129 @@ +# 64位运行模式检查 + +64-bit Running Mode Check + +鲲鹏亲和分析工具的一个特性,对用户C/C++软件从32位模式迁移到64位模式进行检查。工具强制以64位模式编译用户软件,并通过编译选项发现从32位模式迁移到64位模式的必要修改,并提示用户进行进一步检查。 + +🎶步骤 1:创建工作目录 + +```shell +mkdir -p ~/workspace/1-64bit-check/src && cd $_ +``` + +🎶步骤 2:64位运行模式确认 + +按precheck.c准备源代码,关键代码: + +```c +printf("pointer size: %zu bytes\n", sizeof(void*)); +printf("uintptr_t size: %zu bytes\n", sizeof(uintptr_t)); +printf("int / unsigned int size: %zu / %zu bytes\n", sizeof(int), sizeof(unsigned int)); +printf("long / unsigned long size: %zu / %zu bytes\n", sizeof(long), sizeof(unsigned long)); +``` + +编译运行: + +```shell +gcc precheck.c # or +gcc -march=armv8-a precheck.c +``` + +检查运行结果: + +```shell +./a.out +``` + +输出如下: + +```txt +pointer size: 8 bytes +uintptr_t size: 8 bytes +int / unsigned int size: 4 / 4 bytes +long / unsigned long size: 8 / 8 bytes +``` + +我们可以看到地址指针是8个字节长,所以可以确定当前系统是运行在64位模式下的。 + +🎶步骤 3:64位运行模式检查 + +按func1.c准备源代码,关键代码: + +```c +long l = 0x1fffffff; +int i = l; +``` + +按func2.c准备源代码,关键代码: + +```c +char s[] = "hello"; +unsigned int p = s; +``` + +准备Makefile文件,关键脚本: + +```makefile +CC = gcc +CFLAGS = -Wconversion -Wint-to-pointer-cast +``` + +编译源代码: + +```shell +make +``` + +输出如下: + +```txt +gcc -Wconversion -Wint-to-pointer-cast -c -o func1.o func1.c +func1.c: In function ‘main’: +func1.c:15:14: warning: conversion from ‘long int’ to ‘int’ may change value [-Wconversion] + 15 | int i = l; + | ^ +gcc -Wconversion -Wint-to-pointer-cast -o func1 func1.c +func1.c: In function ‘main’: +func1.c:15:14: warning: conversion from ‘long int’ to ‘int’ may change value [-Wconversion] + 15 | int i = l; + | ^ +gcc -Wconversion -Wint-to-pointer-cast -c -o func2.o func2.c +func2.c: In function ‘main’: +func2.c:15:22: warning: initialization of ‘unsigned int’ from ‘char *’ makes integer from pointer without a cast [-Wint-conversion] + 15 | unsigned int p = s; + | ^ +gcc -Wconversion -Wint-to-pointer-cast -o func2 
func2.c +func2.c: In function ‘main’: +func2.c:15:22: warning: initialization of ‘unsigned int’ from ‘char *’ makes integer from pointer without a cast [-Wint-conversion] + 15 | unsigned int p = s; + | ^ +``` + +从以上结果可以看出存在类型转换的警告信息。 + +进行鲲鹏亲和“run-mode”检查: + +```shell +cd .. +pwd # /root/workspace/1-64bit-check +devkit advisor run-mode -i ./src -c make -o ~/Reports -l 0 +``` + +根据report文件的提示,修改源代码如下: + +func1.c + +```c +long l = 0x1fffffff; +int i = (int)l; // 进行显式类型转换,以提醒可能的数据截断 +``` + +func2.c + +```c +char s[] = "hello"; +uintptr_t p = (uintptr_t)s; // 使用uintptr_t类型存储指针 +``` + +重新用`make`命令编译修改后的源代码,则无先前的警告信息。 + diff --git a/Affinity/1-64bit-check/log/func1.log b/Affinity/1-64bit-check/log/func1.log new file mode 100644 index 0000000000000000000000000000000000000000..a46d48f4b5fac5f579141b41b684ce913c961025 --- /dev/null +++ b/Affinity/1-64bit-check/log/func1.log @@ -0,0 +1,5 @@ +# gcc -Wconversion -Wint-to-pointer-cast func1.c +func1.c: In function ‘main’: +func1.c:15:14: warning: conversion from ‘long int’ to ‘int’ may change value [-Wconversion] + 15 | int i = l; + | ^ diff --git a/Affinity/1-64bit-check/log/func2.log b/Affinity/1-64bit-check/log/func2.log new file mode 100644 index 0000000000000000000000000000000000000000..f29a2cc2d98a93cd88cba5a5c778e28a29f72bd6 --- /dev/null +++ b/Affinity/1-64bit-check/log/func2.log @@ -0,0 +1,5 @@ +# gcc -Wconversion -Wint-to-pointer-cast func2.c +func2.c: In function ‘main’: +func2.c:15:22: warning: initialization of ‘unsigned int’ from ‘char *’ makes integer from pointer without a cast [-Wint-conversion] + 15 | unsigned int p = s; + | ^ diff --git a/Affinity/1-64bit-check/src/Makefile b/Affinity/1-64bit-check/src/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..fcc73761f0a68d3678e83d18728406e3f363b611 --- /dev/null +++ b/Affinity/1-64bit-check/src/Makefile @@ -0,0 +1,16 @@ +programs = func1 func2 +cleanfiles = *.o *.out +CC = gcc +CFLAGS = -Wconversion -Wint-to-pointer-cast + 
+all:${programs} + +func1: func1.o + ${CC} ${CFLAGS} -o $@ func1.c + +func2: func2.o + ${CC} ${CFLAGS} -o $@ func2.c + +.PHONY: clean +clean: + rm -f ${cleanfiles} ${programs} diff --git a/Affinity/1-64bit-check/src/func1.c b/Affinity/1-64bit-check/src/func1.c new file mode 100644 index 0000000000000000000000000000000000000000..aa04b1e651a5e95c69e050e32ba25d02e48fa6c9 --- /dev/null +++ b/Affinity/1-64bit-check/src/func1.c @@ -0,0 +1,18 @@ +/* + * func1.c + * + * 64-bit running mode check task + * gcc -Wconversion -Wint-to-pointer-cast thisfile + * + */ + +#include + +// -Wconversion +int main(int argc, char *argv[]) +{ + long l = 0x1fffffff; + int i = l; + + return 0; +} diff --git a/Affinity/1-64bit-check/src/func2.c b/Affinity/1-64bit-check/src/func2.c new file mode 100644 index 0000000000000000000000000000000000000000..7179ea7d48d43edede844a1acfb13ee74224c3a5 --- /dev/null +++ b/Affinity/1-64bit-check/src/func2.c @@ -0,0 +1,21 @@ +/* + * func2.c + * + * 64-bit running mode check task + * gcc -Wconversion -Wint-to-pointer-cast thisfile + * + */ + +#include + +// -Wint-conversion +int main(int argc, char *argv[]) +{ + char s[] = "hello"; + unsigned int p = s; + + printf("%s\n", s); + printf("%s\n", p); + + return 0; +} diff --git a/Affinity/1-64bit-check/src/precheck.c b/Affinity/1-64bit-check/src/precheck.c new file mode 100644 index 0000000000000000000000000000000000000000..006c5a7713641ba11d0a2c72bd6cad08472905b4 --- /dev/null +++ b/Affinity/1-64bit-check/src/precheck.c @@ -0,0 +1,36 @@ +// +// precheck.c +// gcc thisfile +// + +#include +#include +#include + +int main() +{ + printf("+-----------+----------------+-----------------+\n"); + printf("| | sizeof (bytes) | alignof (bytes) |\n"); + printf("+-----------+----------------+-----------------+\n"); + printf("| void * | %14zu | %14zu |\n", sizeof(void *), alignof(void *)); + printf("+-----------+----------------+-----------------+\n"); + printf("| uintptr_t | %14zu | %14zu |\n", sizeof(uintptr_t), 
alignof(uintptr_t)); + printf("+-----------+----------------+-----------------+\n"); + printf("| char | %14zu | %14zu |\n", sizeof(char), alignof(char)); + printf("+-----------+----------------+-----------------+\n"); + printf("| short | %14zu | %14zu |\n", sizeof(short), alignof(short)); + printf("+-----------+----------------+-----------------+\n"); + printf("| int | %14zu | %14zu |\n", sizeof(int), alignof(int)); + printf("+-----------+----------------+-----------------+\n"); + printf("| long | %14zu | %14zu |\n", sizeof(long), alignof(long)); + printf("+-----------+----------------+-----------------+\n"); + printf("| long long | %14zu | %14zu |\n", sizeof(long long), alignof(long long)); + printf("+-----------+----------------+-----------------+\n"); + printf("| float | %14zu | %14zu |\n", sizeof(float), alignof(float)); + printf("+-----------+----------------+-----------------+\n"); + printf("| double | %14zu | %14zu |\n", sizeof(double), alignof(double)); + printf("+-----------+----------------+-----------------+\n"); + printf("\n"); + + return 0; +} \ No newline at end of file diff --git a/Affinity/1-64bit-check/src/update/func1.c b/Affinity/1-64bit-check/src/update/func1.c new file mode 100644 index 0000000000000000000000000000000000000000..dc2524d7f7313c1bd2b3bc5a3aaac901f6af8465 --- /dev/null +++ b/Affinity/1-64bit-check/src/update/func1.c @@ -0,0 +1,18 @@ +/* + * func1.c + * + * 64-bit running mode check task + * gcc -Wconversion -Wint-to-pointer-cast thisfile + * + */ + +#include + +// -Wconversion +int main(int argc, char *argv[]) +{ + long l = 0x1fffffff; + int i = (int)l; + + return 0; +} diff --git a/Affinity/1-64bit-check/src/update/func2.c b/Affinity/1-64bit-check/src/update/func2.c new file mode 100644 index 0000000000000000000000000000000000000000..48f8b653422b6064fc2c06b1a38c2cdd0f7a6952 --- /dev/null +++ b/Affinity/1-64bit-check/src/update/func2.c @@ -0,0 +1,23 @@ +/* + * func2.c + * + * 64-bit running mode check task + * gcc -Wconversion 
-Wint-to-pointer-cast thisfile + * + */ + +#include +#include + +// -Wint-conversion +int main(int argc, char *argv[]) +{ + char s[] = "hello"; + uintptr_t p = (uintptr_t)s; + + printf("%s\n", s); + printf("%s\n", p); + + return 0; +} + diff --git a/Affinity/2-byte-check/align/align.c b/Affinity/2-byte-check/align/align.c new file mode 100644 index 0000000000000000000000000000000000000000..bd74a8dae03cafdf8d009face983026cbc6e6a97 --- /dev/null +++ b/Affinity/2-byte-check/align/align.c @@ -0,0 +1,78 @@ +// +// align.c +// gcc -O0 thisfile +// + +#include +#include +#include +#include + +typedef struct { + int a; + char c; + char s[3]; + double d; +} my_st_t1; + +typedef struct { + int a; + char c; + char s[10]; + double d; +} my_st_t2; + +typedef struct { + int a; + char c; + short s[3]; + double d; +} my_st_t3; + +typedef struct { + int a; + char c; + int b; + double d; +} my_st_t4; + +typedef struct { + int a; + char c; + my_st_t3 s; + double d; +} my_st_t5; + +int main() +{ + // 1 + my_st_t1 t1; + printf("%zu %zu %p %p %p %p\n", sizeof(t1), alignof(t1), &t1.a, &t1.c, &t1.s, &t1.d); + printf("\n"); + + // 2 + my_st_t2 t2; + printf("%zu %zu %p %p %p %p %p %p\n", sizeof(t2), alignof(t2), &t2.a, &t2.c, &t2.s, &t2.s[3], &t2.s[9], &t2.d); + + strcpy(t2.s, "0123456789\0"); // 此处故意多写了一个字节 + t2.d = 3.1415926; + printf("%s %d\n", t2.s, strlen(t2.s)); + printf("%f\n", t2.d); + printf("\n"); + + // 3 + my_st_t3 t3; + printf("%zu %zu %p %p %p %p %p %p\n", sizeof(t3), alignof(t3), &t3.a, &t3.c, &t3.s, &t3.s[1], &t3.s[2], &t3.d); + printf("\n"); + + // 4 + my_st_t4 t4; + printf("%zu %zu %zu %p %p %p %p\n", sizeof(t4), sizeof(t4.b), alignof(t4), &t4.a, &t4.c, &t4.b, &t4.d); + printf("\n"); + + // 5 + my_st_t5 t5; + printf("%zu %zu %zu %p %p %p %p %p %p %p %p %p %p\n", sizeof(t5), sizeof(t5.s), alignof(t5), &t5.a, &t5.c, &t5.s, &t5.s.a, &t5.s.c, &t5.s.s, &t5.s.s[1], &t5.s.s[2], &t5.s.d, &t5.d); + printf("\n"); +} + diff --git a/Affinity/2-byte-check/align/align.log 
b/Affinity/2-byte-check/align/align.log new file mode 100644 index 0000000000000000000000000000000000000000..c8ce2ea4498f8dddc8ee4d997636724b39fd2995 --- /dev/null +++ b/Affinity/2-byte-check/align/align.log @@ -0,0 +1,11 @@ +16 8 0xffffea1f0dd0 0xffffea1f0dd4 0xffffea1f0dd5 0xffffea1f0dd8 + +24 8 0xffffea1f0db8 0xffffea1f0dbc 0xffffea1f0dbd 0xffffea1f0dc0 0xffffea1f0dc6 0xffffea1f0dc8 +0123456789 10 +3.141593 + +24 8 0xffffea1f0da0 0xffffea1f0da4 0xffffea1f0da6 0xffffea1f0da8 0xffffea1f0daa 0xffffea1f0db0 + +24 4 8 0xffffea1f0d88 0xffffea1f0d8c 0xffffea1f0d90 0xffffea1f0d98 + +40 24 8 0xffffea1f0d60 0xffffea1f0d64 0xffffea1f0d68 0xffffea1f0d68 0xffffea1f0d6c 0xffffea1f0d6e 0xffffea1f0d70 0xffffea1f0d72 0xffffea1f0d78 0xffffea1f0d80 diff --git a/Affinity/2-byte-check/align/align.xlsx b/Affinity/2-byte-check/align/align.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..2a7a3b41cf251ed4ce75b0235633271c1d1ac6c5 Binary files /dev/null and b/Affinity/2-byte-check/align/align.xlsx differ diff --git a/Affinity/2-byte-check/byte-check.md b/Affinity/2-byte-check/byte-check.md new file mode 100644 index 0000000000000000000000000000000000000000..264d2db10b66312647197266b70a0a40a8fdd627 --- /dev/null +++ b/Affinity/2-byte-check/byte-check.md @@ -0,0 +1,139 @@ +# 字节对齐检查 + +Address Alignment of the Structure + +字节对齐检查是鲲鹏亲和分析工具的一个特性,对用户软件中的结构体变量进行检查,分析其内存分配情况,并反馈给用户。 + +本实验将要用到的wtdbg是一种使用C语言编写的更快更好的基因组算法。 + +🎶步骤 1:创建工作目录 + +```shell +mkdir ~/workspace/2-byte-check && cd $_ +``` + +🎶步骤 2:获取源代码 + +```shell +git clone https://github.com/ruanjue/wtdbg2.git && cd wtdbg2 +``` + +🎶步骤 3:检出到特定版本(选做) + +```shell +git log # commit b77c5657c8095412317e4a20fe3668f5bde6b1ac (HEAD -> master, origin/master, origin/HEAD) +git checkout b77c5657c8095412317e4a20fe3668f5bde6b1ac # 需要时选做 +``` + +在进行本实验时,该仓库的最新版本如以上命令`git log`所示,后续代码仓如有更新而又要在此版本上做实验,可以检出到此版本。 + +🎶步骤 4:适配到aarch64处理器架构 + +为更加贴近实际应用,在这里,我们将原仓库的代码适配到鲲鹏处理器的aarch64架构(需遵循原代码仓的GNU GENERAL PUBLIC LICENSE许可协议)。 + 
+```shell +pwd # ...wtdbg2 +wget https://raw.githubusercontent.com/DLTcollab/sse2neon/refs/heads/master/sse2neon.h +``` + +🔔*注意* + +若由于网络原因无法下载sse2neon.h文件请看附件所附文件(请尊重原作者版权)。 + +然后对源代码进行以下修改: + +① 将 poacns.h 中的以下头文件都注释,并包含"sse2neon.h": + +```c +#include <emmintrin.h> +#include <tmmintrin.h> +``` + +更改后的内容为: + +```c +//#include <emmintrin.h> +//#include <tmmintrin.h> +#include "sse2neon.h" +``` + +② 将 ksw.c 中的以下头文件都注释,并包含"sse2neon.h": + +```c +#include <emmintrin.h> +``` + +更改后的内容为: + +```c +//#include <emmintrin.h> +#include "sse2neon.h" +``` + +③ 将Makefile中CFLAGS中的“-mpopcnt”和“-msse4.2”编译选项都删掉,即将: + +```makefile +ifeq (1, ${DEBUG}) +CFLAGS=-g3 -W -Wall -Wno-unused-but-set-variable -O0 -DDEBUG=1 -DVERSION="$(VERSION)" -DRELEASE="$(RELEASE)" -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -mpopcnt -msse4.2 +else +CFLAGS=-g3 -W -Wall -Wno-unused-but-set-variable -O4 -DVERSION="$(VERSION)" -DRELEASE="$(RELEASE)" -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -mpopcnt -msse4.2 +endif +``` + +变更为: + +```makefile +ifeq (1, ${DEBUG}) +CFLAGS=-g3 -W -Wall -Wno-unused-but-set-variable -O0 -DDEBUG=1 -DVERSION="$(VERSION)" -DRELEASE="$(RELEASE)" -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE +else +CFLAGS=-g3 -W -Wall -Wno-unused-but-set-variable -O4 -DVERSION="$(VERSION)" -DRELEASE="$(RELEASE)" -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE +endif +``` + +🎶步骤 5:验证 + +现在应该可以在鲲鹏平台编译通过了: + +```shell +make +``` + +编译完成后进行清理: + +```shell +make clean +``` + + + +🎶步骤 6:进行字节对齐检查 + +```shell +pwd # wtdbg2 +devkit advisor addr-align -i . 
-c make -o ~/Reports +``` + +这里需要等待好几分钟时间(和我们ECS的配置有关)。 + +执行完成后我们可以查看检查报告给出的建议,比如对于ksw.c中的一个结构体: + +```c +struct _kswq_t { + int qlen, slen; + uint8_t shift, mdiff, max, size; + __m128i *qp, *H0, *H1, *E, *Hmax; +}; +``` + +在64位模式下发现了一个4B大小的“hole”: + +```txt +qlen: 4B, slen: 4B, shift: 1B, mdiff: 1B, max: 1B, size: 1B 4B hole, qp: 8B, H0: 8B, H1: 8B, E: 8B, Hmax: 8B +``` + + + +通过鲲鹏DevKit的VS Code IDE插件进行此项检查会以更加形象化的方式给出建议,请参见[wtdbg源码字节对齐检查](https://www.hikunpeng.com/document/detail/zh/kunpengdevps/handon_tutorials/plugbestpractice/wtdbgpt_21_0001.html)。 + +这里只是给出了一个简单的示例,更多的示例可以参考[DevKit Demo代码仓](https://gitee.com/kunpengcompute/devkitdemo)中的[byte_check](https://gitee.com/kunpengcompute/devkitdemo/tree/main/Porting_advisor/testdemo/byte_check)。 + diff --git a/Affinity/3-cacheline/cacheline.md b/Affinity/3-cacheline/cacheline.md new file mode 100644 index 0000000000000000000000000000000000000000..49504e296c9c34d69dd9fd59a57b3a8fc755a796 --- /dev/null +++ b/Affinity/3-cacheline/cacheline.md @@ -0,0 +1,208 @@ +# 缓存行对齐检查 + +Cache Line Alignment Check + +Taking an array of structures as an example + +缓存行对齐检查是鲲鹏亲和分析工具的一个特性,对C/C++源码中结构体变量进行128字节对齐检查,提升访存性能。 + + + +🔔*注意* + +本实验在“鲲鹏处理器+openEuler 22.03 LTS”平台上进行,所用鲲鹏处理器至少要有2个CPU核。 + + + +🎶步骤 1:创建工作目录 + +```shell +mkdir -p ~/workspace/3-cacheline/src && cd $_ +``` + +🎶步骤 2:准备源代码 + +按照abst.c准备源代码,关键代码: + +```c +struct my_data_s { + volatile int a; + volatile int b; +}; + +struct my_data_s data[] = {{1, 2}, {3, 4}}; + +#define LOOP_NUM (0x1ffffffff) + +void thread0(void *arg) { + for (unsigned long long n = 0; n < LOOP_NUM; n++) { + data[0].a = 101; + } +} + +void thread1(void *arg) { + for (unsigned long long n = 0; n < LOOP_NUM; n++) { + data[1].b = 104; + } +} + +int main() { + // ...... 
+ + pthread_create(&t0, NULL, (void*)thread0, NULL); + pthread_create(&t1, NULL, (void*)thread1, NULL); + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + pthread_setaffinity_np(t0, sizeof(cpuset), &cpuset); + + CPU_ZERO(&cpuset); + CPU_SET(1, &cpuset); + pthread_setaffinity_np(t1, sizeof(cpuset), &cpuset); + + pthread_join(t0, NULL); + pthread_join(t1, NULL); + + // ...... +} +``` + +这个程序是在同一时间段内,用运行在不同CPU核上的两个线程分别对两个结构体中的两个整型变量进行8,589,934,591(0x1ffffffff)次写操作,并计算出花费的总的时间。 + + + +🎶步骤 3:编译、运行并测试基线 + +```shell +gcc -O0 -lpthread -o abst abst.c +./abst +``` + +在运行程序的同时,可以另开一个命令行终端,使用`top`命令观察程序的运行情况。具体过程如下: + +① 在新开的终端中通过`ps`命令得到该进程的进程号 + +```shell +ps -ef | grep abst # For instance, 2222 +``` + +② 执行`top`命令 + +```shell +top -H -p 2222 +``` + +注意:在这里要将“2222”换成实际运行的进程ID。 + +然后在top命令的主界面中按下 f 键,添加 nTH(Number of Threads)和 P (Last used CPU) 字段到显示列中(具体的做法是移动光标到nTH和P字段,按空格键,然后按ESC键回到主界面)。 + +![top-nTH-P](./img/top-nTH-P.png) + +③ 观察结构体占用空间大小并统计时间 + +以下是该程序在2核华为云鲲鹏ECS上一次运行的输出: + +```txt ++---------+------------+------------+------------+------------+------------+------------+ +| | data[0] | a | b | data[1] | a | b | ++---------+------------+------------+------------+------------+------------+------------+ +| address | 0x420060 | 0x420060 | 0x420064 | 0x420068 | 0x420068 | 0x42006c | ++---------+------------+------------+------------+------------+------------+------------+ +| size | 8 | 4 | 4 | 8 | 4 | 4 | ++---------+------------+------------+------------+------------+------------+------------+ +| value | | 1 | 2 | | 3 | 4 | ++---------+------------+------------+------------+------------+------------+------------+ + +The execution time took 24.069 seconds. 
+ ++---------+------------+------------+------------+------------+------------+------------+ +| | data[0] | a | b | data[1] | a | b | ++---------+------------+------------+------------+------------+------------+------------+ +| value | | 101 | 2 | | 3 | 104 | ++---------+------------+------------+------------+------------+------------+------------+ + +``` + +可以看出每个结构体的大小是8字节。 + +可以多运行几次然后统计出它的平均执行时间。 + + + +🎶步骤 4:缓存行对齐检查 + +```shell +cd .. +pwd # /root/workspace/3-cacheline +devkit advisor cacheline -i ./src -o ~/Reports/ +``` + +得到的检查报告如下所示: + +![report](./img/report.jpeg) + +可以看出,其建议在16到19行中(即定义my_data_s结构体的代码)使用`__attribute__((__aligned__(128)))`编译器指令,以强制进行128字节的内存地址对齐。 + + + +💡*Tips* +① `__attribute__((__aligned__(128)))`是GCC编译器的一个扩展语法(Clang等其他编译器也支持),它强制要求一个变量、结构体或结构体成员在内存中的起始地址必须是128的倍数。简单来说,它强制进行了128字节对齐。 + +② 鲲鹏920处理器L1和L2缓存行大小是64字节,L3缓存行大小是128字节,因此需要按128字节对齐。 + +在Linux系统中可以对cache的信息进行查询(参见cache_info.sh脚本),其输出信息如下: + +```txt +Level 1 Data: + cache linesize: 64 + cache size: 64K +Level 1 Instruction: + cache linesize: 64 + cache size: 64K +Level 2 Unified: + cache linesize: 64 + cache size: 512K +Level 3 Unified: + cache linesize: 128 + cache size: 32768K +``` + + + + +现在我们按照建议对代码进行优化: + +```c +struct my_data_s { + volatile int a; + volatile int b; +} __attribute__((__aligned__(128))); +``` + +然后重新编译后运行该程序,其输出如下: + +```txt ++---------+------------+------------+------------+------------+------------+------------+ +| | data[0] | a | b | data[1] | a | b | ++---------+------------+------------+------------+------------+------------+------------+ +| address | 0x420100 | 0x420100 | 0x420104 | 0x420180 | 0x420180 | 0x420184 | ++---------+------------+------------+------------+------------+------------+------------+ +| size | 128 | 4 | 4 | 128 | 4 | 4 | ++---------+------------+------------+------------+------------+------------+------------+ +| value | | 1 | 2 | | 3 | 4 | 
++---------+------------+------------+------------+------------+------------+------------+ + +The execution time took 23.707 seconds. + ++---------+------------+------------+------------+------------+------------+------------+ +| | data[0] | a | b | data[1] | a | b | ++---------+------------+------------+------------+------------+------------+------------+ +| value | | 101 | 2 | | 3 | 104 | ++---------+------------+------------+------------+------------+------------+------------+ + +``` + +可以看出虽然结构体中成员a和成员b的长度还是4字节,但是整个结构体的大小变成了128字节。 + +多次运行几次,统计出平均运行时间。(可以看出在本示例的输出中,其运行时间有0.362秒的减少,即程序的运行效率得到了提升。) + diff --git a/Affinity/3-cacheline/img/report.jpeg b/Affinity/3-cacheline/img/report.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..e57b232b5fbac5d44001629ac1ca86b43fe0b7f3 Binary files /dev/null and b/Affinity/3-cacheline/img/report.jpeg differ diff --git a/Affinity/3-cacheline/img/top-nTH-P.png b/Affinity/3-cacheline/img/top-nTH-P.png new file mode 100644 index 0000000000000000000000000000000000000000..eaf06801a26ed8719b28af3f85424eea9118a6ad Binary files /dev/null and b/Affinity/3-cacheline/img/top-nTH-P.png differ diff --git a/Affinity/3-cacheline/src/abst.c b/Affinity/3-cacheline/src/abst.c new file mode 100644 index 0000000000000000000000000000000000000000..45a639f5a5cf226735005c2bf69950982b5c10b2 --- /dev/null +++ b/Affinity/3-cacheline/src/abst.c @@ -0,0 +1,93 @@ +// +// abst.c +// + +// suppose that there are CPU0 and CPU1 in system +// gcc -lpthread -o abst abst.c +// + +#include +#include +#include +#define __USE_GNU +#include +#include + +struct my_data_s { + volatile int a; + volatile int b; +} /*__attribute__((__aligned__(128)))*/; + +struct my_data_s data[] = {{1, 2}, {3, 4}}; + +#define LOOP_NUM (0x1ffffffff) + +void thread0(void *arg) { + for (unsigned long long n = 0; n < LOOP_NUM; n++) { + data[0].a = 101; + } +} + +void thread1(void *arg) { + for (unsigned long long n = 0; n < LOOP_NUM; n++) { + data[1].b = 104; + } +} + + 
+int main() { + pthread_t t0, t1; + cpu_set_t cpuset; + + struct timeval tstart, tend; + double exectime; + + printf("+---------+------------+------------+------------+------------+------------+------------+\n"); + printf("| | data[0] | a | b | data[1] | a | b |\n"); + printf("+---------+------------+------------+------------+------------+------------+------------+\n"); + printf("| address | %-10p | %-10p | %-10p | %-10p | %-10p | %-10p |\n", &data[0], &data[0].a, &data[0].b, &data[1], &data[1].a, &data[1].b); + printf("+---------+------------+------------+------------+------------+------------+------------+\n"); + printf("| size | %-10zu | %-10zu | %-10zu | %-10zu | %-10zu | %-10zu |\n", sizeof(data[0]), sizeof(data[0].a), sizeof(data[0].b), sizeof(data[1]), sizeof(data[1].a), sizeof(data[1].b)); + printf("+---------+------------+------------+------------+------------+------------+------------+\n"); + printf("| value | %*s | %-10d | %-10d | %*s | %-10d | %-10d |\n", 8, "", data[0].a, data[0].b, 8, "", data[1].a, data[1].b); + printf("+---------+------------+------------+------------+------------+------------+------------+\n"); + printf("\n"); + + gettimeofday(&tstart, NULL); + + pthread_create(&t0, NULL, (void*)thread0, NULL); + pthread_create(&t1, NULL, (void*)thread1, NULL); + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + if (pthread_setaffinity_np(t0, sizeof(cpuset), &cpuset) < 0) { + printf("pthread_setaffinity_np() for thread 0 err!\n"); + return -1; + } + + CPU_ZERO(&cpuset); + CPU_SET(1, &cpuset); + if (pthread_setaffinity_np(t1, sizeof(cpuset), &cpuset) < 0) { + printf("pthread_setaffinity_np() for thread 1 err!\n"); + return -1; + } + + pthread_join(t0, NULL); + pthread_join(t1, NULL); + + gettimeofday(&tend, NULL); + + exectime = (tend.tv_sec - tstart.tv_sec) * 1000.0; // sec to ms + exectime += (tend.tv_usec - tstart.tv_usec) / 1000.0; // us to ms + + printf("The execution time took %.3lf seconds.\n", exectime / 1000.0); + printf("\n"); + 
printf("+---------+------------+------------+------------+------------+------------+------------+\n"); + printf("| | data[0] | a | b | data[1] | a | b |\n"); + printf("+---------+------------+------------+------------+------------+------------+------------+\n"); + printf("| value | %*s | %-10d | %-10d | %*s | %-10d | %-10d |\n", 10, "", data[0].a, data[0].b, 10, "", data[1].a, data[1].b); + printf("+---------+------------+------------+------------+------------+------------+------------+\n"); + printf("\n"); + + return 0; +} diff --git a/Affinity/3-cacheline/src/cache_info.sh b/Affinity/3-cacheline/src/cache_info.sh new file mode 100755 index 0000000000000000000000000000000000000000..5223ba910cac7820f1894bd628519bb92b857967 --- /dev/null +++ b/Affinity/3-cacheline/src/cache_info.sh @@ -0,0 +1,9 @@ +#!/usr/bin/bash + +for index in index0 index1 index2 index3; do + if test -e /sys/devices/system/cpu/cpu0/cache/$index/level; then + echo Level `cat /sys/devices/system/cpu/cpu0/cache/$index/level` `cat /sys/devices/system/cpu/cpu0/cache/$index/type`: + echo -e '\t'cache linesize: `cat /sys/devices/system/cpu/cpu0/cache/$index/coherency_line_size` + echo -e '\t'cache size: `cat /sys/devices/system/cpu/cpu0/cache/$index/size` + fi +done diff --git a/Affinity/3-cacheline/src/cmdline.sh b/Affinity/3-cacheline/src/cmdline.sh new file mode 100644 index 0000000000000000000000000000000000000000..81f3d7a9b1f9f6d06918785f93e7b8408f6faa86 --- /dev/null +++ b/Affinity/3-cacheline/src/cmdline.sh @@ -0,0 +1,9 @@ +# @ Terminal 1 +gcc -O0 -lpthread -o abst abst.c +./abst + +# @ Terminal 2 +ps -ef | grep abst # For instance, 2222 +top -H -p 2222 +# 然后按下 f 键,添加 nTH(Number of Threads)和 P (Last used CPU) 字段到显示列中。 +# Press 'f' and add nTH, P field to the list, Esc back. 
diff --git a/Affinity/4-bc-gen/bc-gen.md b/Affinity/4-bc-gen/bc-gen.md new file mode 100644 index 0000000000000000000000000000000000000000..ce043f11e974c219c513ed6aa60265ca7f51c88c --- /dev/null +++ b/Affinity/4-bc-gen/bc-gen.md @@ -0,0 +1,23 @@ +# BC文件生成 + +BC File Generation + +BC(BitCode)文件是LLVM(Low Level Virtual Machine)编译源代码生成的中间文件的二进制表示,它包含了程序编译的中间表示IR(Intermediate Representation),比源代码更加紧凑和高效。 + +DevKit基于LLVM框架,利用BC文件避免了直接分析程序源代码的解析开销,可以提高程序分析的效率和准确度。 + +在DevKit的鲲鹏亲和分析中,BC文件一般用于内存一致性检查和向量化检查。 + + + +以下是一些生成BC文件的示例: + +```shell +devkit advisor bc-gen -c make +devkit advisor bc-gen -c "mkdir build && cd build; cmake ..; make" +devkit advisor bc-gen -c "gcc -lpthread -o simd simd.c" +``` + +说明:“-c”是必选参数,后跟源代码构建命令。 + +更多资料请参考:[BC文件生成](https://www.hikunpeng.com/document/detail/zh/kunpengdevps/userguide/cliuserguide/KunpengDevKitCli_0043.html)。 diff --git a/Affinity/5-mm-check/img/top-nTH-P.png b/Affinity/5-mm-check/img/top-nTH-P.png new file mode 100644 index 0000000000000000000000000000000000000000..d543f14ebace34d2ab8395d9601fbbc1095cf48c Binary files /dev/null and b/Affinity/5-mm-check/img/top-nTH-P.png differ diff --git a/Affinity/5-mm-check/mm-check.md b/Affinity/5-mm-check/mm-check.md new file mode 100644 index 0000000000000000000000000000000000000000..2f1aaff3cfb806998ee28a0a894139c859b03454 --- /dev/null +++ b/Affinity/5-mm-check/mm-check.md @@ -0,0 +1,211 @@ +# 内存一致性静态检查 + +Memory Consistency Static Check + +内存一致性静态检查就是采用静态检查模式检查C/C++源码在鲲鹏平台运行时是否存在内存一致性问题,并提供内存一致性的检测结果与插入内存屏障的建议。 + +具体地说,就是在程序运行之前,通过分析程序的源代码或中间代码(如LLVM IR)来验证其内存访问操作(读/写)是否符合预设的内存一致性模型规则。 + + + +🔔*注意* + +本实验在“鲲鹏处理器+openEuler 22.03 LTS”平台上进行,所用鲲鹏处理器至少要有3个CPU核。 + + + +🎶步骤 1:创建工作目录 + +```shell +mkdir -p ~/workspace/5-mm-check/src && cd $_ +``` + +🎶步骤 2:准备源代码 + +按照dmb.cpp准备源代码,核心代码如下(这里仅仅是示例,实际运行的代码请参照源代码文件): + +```c++ +int x=0, y=0, r1=1, r2=1; + +void thread1() { // @CPU1 + while(true) { + x = 1; + r1 = y; + } +} + +void thread2() { // @CPU2 + while(true) { + y = 1; + r2 = x; + } +} + +if((r1 
== 0) && (r2 == 0)) printf("reorder detected\n"); +``` + + + +上述程序首先对变量x和变量y都赋值为0; + +在线程thread1(假设运行在CPU1核上)中,对变量x写1,然后读变量y; + +在线程thread2(假设运行在CPU2核上)中,对变量y写1,然后读变量x; + +如果变量r1和r2出现都为0的情况,则说明CPU对这段指令的执行有乱序行为。 + +源代码dmb.cpp中用信号量对thread1、thread2进行了同步,以使它们大致上是同时开始、同时结束的。 + + + +🎶步骤 3:编译并运行程序 + +编写Makefile,核心脚本如下(这里仅仅是示例,实际运行的脚本请参照源文件): + +```makefile +g++ -c -o dmb.o dmb.cpp +g++ --std=c++20 -o dmb dmb.o -lpthread +./dmb +``` + +在运行程序的同时,可以另开一个命令行终端,使用`top`命令观察程序的运行情况(具体的过程请参照“缓存行对齐检查”实验中说明),以下是一个截图: + +![top-nTH-P](./img/top-nTH-P.png) + +可以看出,主线程和线程thread1、thread2分别位于不同的核上。 + +在本例中,程序在命令行终端的输出如下所示: + +```txt +reorder detected @ 1978975 +reorder detected @ 2143011 +reorder detected @ 2927776 +reorder detected @ 3730335 +reorder detected @ 4473329 +reorder detected @ 7702269 +reorder detected @ 8589436 +reorder detected @ 10469871 +``` + +可以看出,在这里,在10469871次的运行中共发生了8次乱序。 + + + +🎶步骤 4:生成BC文件 + +```shell +pwd # /root/workspace/5-mm-check/src +devkit advisor bc-gen -c make +``` + +在本例中,该命令有如下输出: + +```txt + +Executing bc gen task, please wait... +Current progress: ###### [20%] +Start to get the compile database +Current progress: ################ [50%] +Start to generate the bc files +Current progress: ################################# [100%] +Configuration: + Generate bc files path: /root/workspace/5-mm-check/src + Compile command: make + Threads: The threads is not set, using the half number of cores. + Task Timeout Interval: The timeout period is not set. + Log level: info + +Summary: + Scanned all source files, there are 1 linked bc files and 1 object bc files generated. + +For the detail information, please check: +Output path of linked bc files: /root/workspace/5-mm-check/src/bc-gen_20250921114707_9e6b/bin +Output path of object bc files: /root/workspace/5-mm-check/src/bc-gen_20250921114707_9e6b/object + +There are 0 linked bc files fail to be generated. 
For the causes of the failure to generate all the bc files and their intermediates, please check: +Log path: /opt/DevKit/DevKit-CLI-25.1.0-Linux-Kunpeng/advisor/logs/affinity/affinity.log + +``` + +在本例中,在当前目录生成了“*bc-gen_20250921114707_9e6b*”文件夹,其中有bin和object两个子目录,其中有BC文件: + +```shell +ls bc-gen_20250921114707_9e6b/bin/ # dmb.bc +ls bc-gen_20250921114707_9e6b/object/ # dmb.o.bc +``` + + + +🎶步骤 5:内存一致性静态检查 + +```shell +cd .. +pwd # /root/workspace/5-mm-check +devkit advisor mm-check -i ./src -f ./src/bc-gen_20250921114707_9e6b/ -o ~/Reports/ +``` + +在命令行终端有如下输出信息: + +```txt +Executing static memory consistency check task, please wait... +Current progress: ################################# [100%]Scanned time: 2025/09/21 11:58:08 + +Configuration: + Scan bc files path: /root/workspace/5-mm-check/src/bc-gen_20250921114707_9e6b + Scan source code path: /root/workspace/5-mm-check/src + Autofix: false + Generate report path: /root/Reports + Generate report type: all + Task Timeout Interval: The timeout period is not set. + Log level: info + +Summary: + Scanned 2 bc files, there are 14 recommended modifications. 
+ +For the details information, please check: + /root/Reports/mem-check_20250921115808_187d.json + /root/Reports/mem-check_20250921115808_187d.html + /root/Reports/mem-check_20250921115808_187d.csv +``` + +生成的检查报告提示在相关的部分加内存屏障指令`__asm__ volatile("dmb sy")`。 + +💡*Tips* + +内存屏障指令`__asm__ volatile("dmb sy")`解释: + +- `__asm__`关键字:内嵌汇编指令 +- `volatile`类型修饰符:防止编译器优化 +- `dmb`:数据内存屏障(Data Memory Barrier) +- `sy`:作用域,全系统(Full System) + + + +根据我们对内存顺序模型的了解,我们只需要在程序的以下位置加内存屏障指令即可: + +```c +void thread1() { // @CPU1 + while(true) { + x = 1; + __asm__ volatile("dmb sy"); + r1 = y; + } +} + +void thread2() { // @CPU2 + while(true) { + y = 1; + __asm__ volatile("dmb sy"); + r2 = x; + } +} +``` + +内存屏障指令有很多种形式,在这里也可写成这种形式: + +```c +asm ("dmb sy" ::: "memory"); +``` + +添加完内存屏障之后我们重新编译、运行程序,可以观察到经过多轮(即程序中的1FFFFFFH共约3355万轮)“写-读”操作都没有发生乱序。 + diff --git a/Affinity/5-mm-check/src/Makefile b/Affinity/5-mm-check/src/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..637178d1f9e580c8c94d45a98f6638e2a48af83c --- /dev/null +++ b/Affinity/5-mm-check/src/Makefile @@ -0,0 +1,4 @@ +dmb: dmb.o + g++ --std=c++20 -o $@ dmb.o -lpthread +clean: + rm -f dmb *.o diff --git a/Affinity/5-mm-check/src/cmdline.sh b/Affinity/5-mm-check/src/cmdline.sh new file mode 100644 index 0000000000000000000000000000000000000000..7432eecd0caa7904ce27434285206690473b8cab --- /dev/null +++ b/Affinity/5-mm-check/src/cmdline.sh @@ -0,0 +1,9 @@ +# @ Terminal 1 +make +./dmb + +# @ Terminal 2 +ps -ef | grep dmb # For instance, 2223 +top -H -p 2223 +# 然后按下 f 键,添加 nTH(Number of Threads)和 P (Last used CPU) 字段到显示列中。 +# Press 'f' and add nTH, P field to the list, Esc back. 
diff --git a/Affinity/5-mm-check/src/dmb.cpp b/Affinity/5-mm-check/src/dmb.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0b099141ed563579e4a8eddbb59810b76fc6fa06 --- /dev/null +++ b/Affinity/5-mm-check/src/dmb.cpp @@ -0,0 +1,96 @@ +// +// dmb.cpp +// +// suppose that there are CPU0, CPU1 and CPU2 in system +// g++ --std=c++20 -lpthread -o dmb dmb.cpp +// g++ --std=c++20 -lpthread -o dmb dmb.cpp -O3 +// + +#include +#include +#include +#include +#include + +int x, y, r1, r2; +sem_t start1, start2, complete; + +void thread1() { + while(true) { + sem_wait(&start1); // wait for start + x = 1; + + //__asm__ volatile("dmb sy"); + // or + //asm ("dmb sy" ::: "memory"); + + r1 = y; + sem_post(&complete); // complete & trigger a signal + } +} + +void thread2() { + while(true) { + sem_wait(&start2); // wait for start + y = 1; + + //__asm__ volatile("dmb sy"); + // or + //asm ("dmb sy" ::: "memory"); + + r2 = x; + sem_post(&complete); // complete & trigger a signal + } +} + + +int main() { + sem_init(&start1, 0, 0); + sem_init(&start2, 0, 0); + sem_init(&complete, 0, 0); + + cpu_set_t cpuset; + + // 绑定主线程到CPU核0 + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); // 设置目标CPU核0 + if (pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset) < 0) { + printf("pthread_setaffinity_np() for main thread error!\n"); + } + + std::thread t1(thread1); + // 绑定到CPU核1 + CPU_ZERO(&cpuset); + CPU_SET(1, &cpuset); // 设置目标CPU核1 + if (pthread_setaffinity_np(t1.native_handle(), sizeof(cpuset), &cpuset) != 0) { + printf("pthread_setaffinity_np() for thread 1 error!\n"); + } + + std::thread t2(thread2); + // 绑定到CPU核2 + CPU_ZERO(&cpuset); + CPU_SET(2, &cpuset); // 设置目标CPU核2 + if (pthread_setaffinity_np(t2.native_handle(), sizeof(cpuset), &cpuset) != 0) { + printf("pthread_setaffinity_np() for thread 2 error!\n"); + } + + // 主循环 + for (unsigned long n = 0; n < 0x1ffffff; n++) { + r1 = r2 = 1; + x = y = 0; + + sem_post(&start1); // start t1 + sem_post(&start2); // start t2 + + 
// wait for t1 & t2 completion + sem_wait(&complete); + sem_wait(&complete); + + if((r1 == 0) && (r2 == 0)) { + printf("reorder detected @ %lu\n", n); + } + } + + t1.detach(); + t2.detach(); +} diff --git a/Affinity/6-vec-check/src/cmdline.sh b/Affinity/6-vec-check/src/cmdline.sh new file mode 100644 index 0000000000000000000000000000000000000000..7bd9c6ec9577db4b2320854c5bbe34b082530f2f --- /dev/null +++ b/Affinity/6-vec-check/src/cmdline.sh @@ -0,0 +1,11 @@ +# Case 1 - GCC +gcc --version # gcc (GCC) 10.3.1 +gcc -O2 -ftree-vectorize loop_invariant.c +objdump -S a.out + +# Case 2 - GCC for openEuler +# https://www.hikunpeng.com/document/detail/zh/kunpengdevps/compiler/ug-hgcc/kunpenghgcc_06_0001.html +# https://www.hikunpeng.com/zh/developer/devkit/download/gcc +gcc --version # gcc (gcc for openEuler 3.0.3) 12.3.1 +gcc -O3 -march=armv8-a+sve loop_invariant.c +objdump -S a.out diff --git a/Affinity/6-vec-check/src/loop_invariant.c b/Affinity/6-vec-check/src/loop_invariant.c new file mode 100644 index 0000000000000000000000000000000000000000..08b63aac715257654d0e224d83f353b8826c2008 --- /dev/null +++ b/Affinity/6-vec-check/src/loop_invariant.c @@ -0,0 +1,25 @@ +/* + * loop_invariant.c + */ + +#define BUF_SIZE (1024) + +struct Data { + int n; +}; + +void func(struct Data *pdata, int *array) { + for (int i = 0; i < pdata->n; i++) { + array[i] += i; + } +} + +int main(void) { + struct Data data = { BUF_SIZE }; + int array[BUF_SIZE] = { 0 }; + + func(&data, array); + + return 0; +} + diff --git a/Affinity/6-vec-check/vec-check.md b/Affinity/6-vec-check/vec-check.md new file mode 100644 index 0000000000000000000000000000000000000000..5204b7090caea8320b7aa04bc3ac1fb8cf4df0bc --- /dev/null +++ b/Affinity/6-vec-check/vec-check.md @@ -0,0 +1,134 @@ +# 向量化检查 + +vectorization check + +向量化检查功能是通过静态代码分析技术,自动扫描用户提供的源代码(如C/C++等),识别出其中可以被向量化但尚未被向量化的代码片段(通常是循环结构),并提供向量化修改建议。 + + + +📢*说明* + +本实验在“鲲鹏处理器+openEuler 22.03 LTS”平台上进行。 + + + +🎶步骤 1:创建工作目录 + +```shell +mkdir -p 
~/workspace/6-vec-check/src && cd $_ +``` + +🎶步骤 2:准备源代码 + +按照loop_invariant.c准备源代码,核心代码如下: + +```c +struct Data { + int n; +}; + +void func(struct Data *pdata, int *array) { + for (int i = 0; i < pdata->n; i++) { + array[i] += i; + } +} +``` + + +🎶步骤 3:生成BC文件 + +```shell +pwd # /root/workspace/6-vec-check/src +devkit advisor bc-gen -c "gcc -O2 -ftree-vectorize loop_invariant.c" +``` + +在本例中,在当前目录生成了“*bc-gen_20250921163409_35bd*”文件夹,查看其中的BC文件: + +```shell +ls bc-gen_20250921163409_35bd/bin/ # a.out.bc +ls bc-gen_20250921163409_35bd/object/ # loop_invariant.o.bc +``` + + +🎶步骤 4:向量化检查 + +```shell +cd .. +pwd # /root/workspace/6-vec-check +devkit advisor vec-check -i ./src -f ./src/bc-gen_20250921163409_35bd/ -c "gcc -O2 -ftree-vectorize loop_invariant.c" -p gcc -o ~/Reports/ -l 0 +``` + +生成的检查报告通过示例提示“提取循环控制变量”,即for循环的循环控制变量为结构体的成员时,编译器无法确定循环结束条件,导致无法自动向量化循环,这时需要将循环控制变量提取到循环外。即: + +```c +struct Data { + int n; +}; + +void func(struct Data *pdata, int *array) { + int len = pdata->n; + for (int i = 0; i < len; i++) { + array[i] += i; + } +} +``` +修改完成后再次生成BC文件、进行向量化检查,则不会有先前的提示了。 + +这个时候我们可以查看其可执行程序的反汇编代码,看是否进行了向量化: +```shell +objdump -S a.out +``` + +本示例func()函数的反汇编代码是: + +```asm +0000000000400650 : + 400650: b9400003 ldr w3, [x0] + 400654: 7100007f cmp w3, #0x0 + 400658: 5400038d b.le 4006c8 + 40065c: 51000460 sub w0, w3, #0x1 + 400660: 7100081f cmp w0, #0x2 + 400664: 54000349 b.ls 4006cc // b.plast + 400668: 90000004 adrp x4, 400000 <__abi_tag-0x278> + 40066c: 53027c62 lsr w2, w3, #2 + 400670: 4f000483 movi v3.4s, #0x4 + 400674: aa0103e0 mov x0, x1 + 400678: 3dc1c081 ldr q1, [x4, #1792] + 40067c: 8b225022 add x2, x1, w2, uxtw #4 + 400680: 4ea11c22 mov v2.16b, v1.16b + 400684: 3dc00000 ldr q0, [x0] + 400688: 4ea38421 add v1.4s, v1.4s, v3.4s + 40068c: 4ea28400 add v0.4s, v0.4s, v2.4s + 400690: 3c810400 str q0, [x0], #16 + 400694: eb02001f cmp x0, x2 + 400698: 54ffff41 b.ne 400680 // b.any + 40069c: 121e7460 and w0, w3, #0xfffffffc + 4006a0: f240047f tst x3, #0x3 + 
4006a4: 54000120 b.eq 4006c8 // b.none + 4006a8: 93407c00 sxtw x0, w0 + 4006ac: d503201f nop + 4006b0: b8607822 ldr w2, [x1, x0, lsl #2] + 4006b4: 0b000042 add w2, w2, w0 + 4006b8: b8207822 str w2, [x1, x0, lsl #2] + 4006bc: 91000400 add x0, x0, #0x1 + 4006c0: 6b00007f cmp w3, w0 + 4006c4: 54ffff6c b.gt 4006b0 + 4006c8: d65f03c0 ret + 4006cc: 52800000 mov w0, #0x0 // #0 + 4006d0: 17fffff6 b 4006a8 +``` + +可以看到,其中的指令使用了向量寄存器,如: +`movi v3.4s, #0x4`:将立即数4移动到向量寄存器v3的每个32位元素中,用于步长计算。 +`add v1.4s, v1.4s, v3.4s`:向量加法,将v1的每个32位元素与v3的对应元素相加(用于生成索引向量)。 + +等等。 + + + + +💡*Tips* + +编译该程序的命令也可以用`gcc -O3 -march=armv8-a+sve loop_invariant.c`。 + +注意:使用该选项时,产生的向量化指令有所不同,请读者自行实验和分析。 diff --git a/Affinity/7-affi-check/affi-check.md b/Affinity/7-affi-check/affi-check.md new file mode 100644 index 0000000000000000000000000000000000000000..4f0b79835b52ae6779feb5d9743cf90020e3b932 --- /dev/null +++ b/Affinity/7-affi-check/affi-check.md @@ -0,0 +1,77 @@ +# 构建亲和 + +Affinity Check + +构建亲和功能用于分析makefile、CMakeLists.txt中可以替换鲲鹏加速库的内容,并提供替换建议和功能修复。 + + + +📢*说明* + +① 本实验的实验环境是“鲲鹏处理器+openEuler 22.03 LTS”。 + +② 本实验将要用到的wtdbg是一种使用C语言编写的更快更好的基因组算法。 + + + +🎶步骤 1:创建工作目录 + +```shell +mkdir ~/workspace/7-affi-check && cd $_ +``` + +🎶步骤 2:准备好源代码 + +请参照“**字节对齐检查**”实验准备好源代码并适配到鲲鹏处理器的aarch64指令集架构。 + + + +🎶步骤 3:进行构建亲和检查 + +```shell +pwd # /root/workspace/7-affi-check +ls -F # wtdbg2/ +devkit advisor affi-check -i ./wtdbg2/ -c make -o ~/Reports/ +``` + +终端输出信息如下: + +```txt +Executing Build Affinity Analysis task, please wait... +Data initialization. +Current progress: ### [10%] +Compiling the project. It may take a long time. Please wait... +Current progress: #################### [60%] +Optimization suggestions for the matching acceleration library. +Current progress: ################################# [100%] +Build Affinity Affinity task has been executed. +Scanned time: 2025/09/22 01:46:24 +Scan status: Succeed. Some items are recommended to be modified. 
+ +Configuration: + Source code file path: /root/workspace/7-affi-check/wtdbg2 + Log level: info + Report type: all + Timeout interval setting: The timeout period is not set. + +Summary: + 3 dependency files can be accelerated. + +For the detailed information, please check: + /root/Reports/affi-check_20250922014624_1071.json + /root/Reports/affi-check_20250922014624_1071.html + /root/Reports/affi-check_20250922014624_1071.csv +``` + +本例中生成的报告如下所示: + +![Report](./img/report.png) + +可以看出,该报告建议使用鲲鹏BoostKit加速库提供的基于ARM指令深度优化和基于鲲鹏KAE(鲲鹏加速引擎)开发的加速库。详情可以参照报告中“获取资源”的链接:[鲲鹏BoostKit基础加速软件包](https://www.hikunpeng.com/boostkit/library)。 + + + +🔗更多学习资源 + +关于“构建亲和”的更多学习资源请参考[鲲鹏社区DevKit“构建亲和”文档](https://www.hikunpeng.com/document/detail/zh/kunpengdevps/userguide/cliuserguide/KunpengDevKitCli_0041.html)。 + diff --git a/Affinity/7-affi-check/img/report.png b/Affinity/7-affi-check/img/report.png new file mode 100644 index 0000000000000000000000000000000000000000..2be2a8ea88f69ba0c8c501a350bf8a25de861d1a Binary files /dev/null and b/Affinity/7-affi-check/img/report.png differ diff --git a/Affinity/8-matrix-check/doc/matrix-check_20250922033829_d903.html b/Affinity/8-matrix-check/doc/matrix-check_20250922033829_d903.html new file mode 100644 index 0000000000000000000000000000000000000000..fa16d8b1cdd41bb1a5978217a1d307e118b82ff2 --- /dev/null +++ b/Affinity/8-matrix-check/doc/matrix-check_20250922033829_d903.html @@ -0,0 +1,17201 @@ + + + + + + + Document + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/Affinity/8-matrix-check/img/report.png b/Affinity/8-matrix-check/img/report.png new file mode 100644 index 0000000000000000000000000000000000000000..136cffde5a0852e5cc3af2b55b61f8eb2af9f27c Binary files /dev/null and b/Affinity/8-matrix-check/img/report.png differ diff --git a/Affinity/8-matrix-check/matrix-check.md b/Affinity/8-matrix-check/matrix-check.md new file mode 100644 index 0000000000000000000000000000000000000000..b48e43228a2ed1bbdfdb0a011b7f71a7a49426e8 
--- /dev/null +++ b/Affinity/8-matrix-check/matrix-check.md @@ -0,0 +1,181 @@ +# 矩阵化检查 + +Matricization Check + +矩阵化检查是鲲鹏DevKit亲和分析工具中的一项高级静态代码分析功能。它专门为识别和优化计算密集型核心(计算热点)而设计,旨在充分挖掘鲲鹏处理器(尤其是其SME等高级指令集扩展)的硬件潜力。 + +- 支持的语言:C、C++和Fortran。 +- 分析基础:通过构建抽象语法树(AST)进行分析。 +- 检查的技术点:其SME(Scalable Matrix Extension)矩阵化主要针对以下3类计算模式进行检查和优化建议: + - Stencil计算:广泛应用于科学计算中的网格更新算法; + - GEMV:通用矩阵-向量乘法; + - FFT:快速傅里叶变换。 + + + +我们针对一个计算密集型任务的源代码——基因组组装软件[Hifiasm](https://github.com/chhylp123/hifiasm)来进行本实验。 + + + +🎶步骤 1:创建工作目录 + +```shell +mkdir ~/workspace/8-matrix-check && cd $_ +``` + +🎶步骤 2:获取源代码 + +```shell +git clone https://github.com/chhylp123/hifiasm.git && cd hifiasm +``` + +🎶步骤 3:检出到特定版本(选做) + +```shell +git log # commit commit ec9a8b222d149d25b7355e83765698640d59b189 (HEAD -> master, tag: 0.25.0, origin/master, origin/HEAD) +git checkout ec9a8b222d149d25b7355e83765698640d59b189 # 需要时选做 +``` + +在进行本实验时,该仓库的最新版本如以上命令`git log`所示,后续代码仓如有更新而又要在此版本上做实验,可以检出到此版本。 + +🎶步骤 4:适配到aarch64处理器架构 + +为更加贴近实际应用,在这里,我们将原仓库的代码适配到鲲鹏处理器的aarch64架构(需遵循原代码仓的MIT License许可协议)。 + +```shell +pwd # ...hifiasm +wget https://raw.githubusercontent.com/DLTcollab/sse2neon/refs/heads/master/sse2neon.h +``` + +🔔*注意* + +若由于网络原因无法下载sse2neon.h文件请看附件所附文件(请尊重原作者版权)。 + +然后对源代码进行以下修改: + +① 将 Levenshtein_distance.h 中的以下头文件都注释,并包含"sse2neon.h": + +```c +#include "emmintrin.h" +#include "nmmintrin.h" +#include "smmintrin.h" +#include +``` + +更改后的内容为: + +```c +//#include "emmintrin.h" +//#include "nmmintrin.h" +//#include "smmintrin.h" +//#include +#include "sse2neon.h" +``` + +② 将 Makefile 中`CXXFLAGS`中的`-msse4.2`和`-mpopcnt`选项删掉,即将以下脚本: + +```makefile +CXXFLAGS= -g -O3 -msse4.2 -mpopcnt -fomit-frame-pointer -Wall +``` + +变更为: + +```makefile +CXXFLAGS= -g -O3 -fomit-frame-pointer -Wall +``` + +🎶步骤 5:验证 + +现在应该可以在鲲鹏平台编译通过了: + +```shell +make -j8 +``` + +编译完成后进行清理: + +```shell +make clean +``` + + + +🎶步骤 6:进行矩阵化检查 + +```shell +cd .. 
+pwd # /root/workspace/8-matrix-check +devkit advisor matrix-check -i ~/workspace/8-matrix-check/hifiasm -c make -b make -p domain,sme -m compute -o ~/Reports/ +``` + +该示例进行了矩阵化和计算优化检查(见参数`-p domain,sme -m compute`),该任务各参数的详细含义请参考[鲲鹏社区DevKit亲和分析矩阵化检查说明文档](https://www.hikunpeng.com/document/detail/zh/kunpengdevps/userguide/cliuserguide/KunpengDevKitCli_0058.html)。 + +在本例中该命令输出如下: + +```txt + +Executing matricization check task, please wait... +Current progress: ################################# [100%] +Scanned time: 2025/09/22 03:38:29 + +Configuration: + Scan source code path: /root/workspace/8-matrix-check/hifiasm + Generate report path: /root/Reports + Generate report type: all + Task Timeout Interval: The timeout period is not set. + Log level: info + +Summary: + Scanned 15 files, there are 139 suggestions. + +For the details information, please check: + /root/Reports/matrix-check_20250922033829_d903.html + /root/Reports/matrix-check_20250922033829_d903.csv + +``` + +在本例中任务完成后给出的[检查报告](./doc/matrix-check_20250922033829_d903.html)如下图所示: + +![Report](./img/report.png) + +上面的截图表示该报告建议让编译器对Hash_Table.cpp文件的305~311行的一个循环启用自动向量化。 + +```c++ + 305 for (i = 0; i < fn; i++) { + 306 dd = get_fake_gap_shift(&(t->f_cigar), i); + 307 if(dd != pdd) { + 308 pdd = dd; + 309 add_fake_cigar(&(o->f_cigar), xl-get_fake_gap_pos(&(t->f_cigar), i)-1, pdd, NULL); + 310 } + 311 } +``` + +然后可以对sketch.cpp文件第23行的代码进行精度一致除转乘(即外提计算倒数,将除法转为同精度的乘法计算): + +```c++ + 17 void debug_refine(ha_mz1_t *ma, uint64_t *mmt, int32_t sn, int32_t n, int32_t m, int32_t end) + 18 { + 19 uint64_t ks = end; + 20 int64_t t = 0, i, k, sp = -1, ep = -1, ovlp, tot = mmt[end]&0xffffffff, nt = 0;; + 21 while (ks != 0xffffffff) + 22 { + 23 i = ks/m; k = ks%m; + 24 ks = mmt[ks]>>32; + 25 if(ks == 0xffffffff || (int32_t)(ks/m) == (i-1)) + 26 { + 27 t++; + 28 ovlp = ((MIN(ep, (int64_t)ma[k].pos) >= MAX(sp, (int64_t)(ma[k].pos+1-ma[k].span)))? 
+ 29 MIN(ep, (int64_t)ma[k].pos) - MAX(sp, (int64_t)(ma[k].pos+1-ma[k].span)) + 1:0); + 30 if(ovlp != 0) fprintf(stderr, "ERROR-OVLP\n"); + 31 if(sp == -1 || sp > (ma[k].pos+1-ma[k].span)) sp = ma[k].pos+1-ma[k].span; + 32 if(ep == -1 || ep < ma[k].pos) ep = ma[k].pos; + 33 nt += (ma[k].rid); + 34 } + 35 } + 36 if(t != sn) fprintf(stderr, "ERROR-TN, t: %ld, sn: %d\n", t, sn); + 37 if(nt != tot) fprintf(stderr, "ERROR-TOT, nt: %ld, tot: %ld\n", nt, tot); + 38 } +``` + +我们可以依据这些建议对源代码进行可能的优化。 + diff --git a/Affinity/9-dr-check/doc/dr-check_ddmc_20251014034802_6eb9.html b/Affinity/9-dr-check/doc/dr-check_ddmc_20251014034802_6eb9.html new file mode 100644 index 0000000000000000000000000000000000000000..1a02d1942d200d50af2c311c242083949632f7e7 --- /dev/null +++ b/Affinity/9-dr-check/doc/dr-check_ddmc_20251014034802_6eb9.html @@ -0,0 +1,3143 @@ + + + + + + + + dr-check_ddmc_20251014034802_6eb9 + + + + + +
+
+ +
+
+ +
+
+
+
+
+
+
+
+
+
dr-check_ddmc_20251014034802_6eb9
+
+ + Data provider + + Kunpeng DevKit + + +
+
+
+
+
+
+
EN
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 3 + + + +
+
+
+
+ 1 + + + +
+
+
+
+ 6 + +
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ +
+ + + + PC + + + + PC
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+ +
+
+ 1 +
+
+
+ 0x420080 + + + + - + + + + + 1879 + + Main + + + + + + thread1 + + + + + + + + + + + + + + /root/workspace/devkit-affinity-exp/9-dr-check/src/ddmc.c + + + + + + + + 18 + + 0x4007d8 + + 1874 + + Main + + + + + + + + + + main + + + + + + + + + + /root/workspace/devkit-affinity-exp/9-dr-check/src/ddmc.c + + + + + + + + 46 + + 0x400990 + + Add "__asm__ volatile("dmb sy");" in the position indicated by this item +
+
+
+ +
+
+ 2 +
+
+
+ 0x4222a0 + + + + - + + + + + 1880 + + Main + + + + + + + + thread2 + + + + + + + + + + + + /root/workspace/devkit-affinity-exp/9-dr-check/src/ddmc.c + + + + + + + + 26 + + 0x400850 + + 1879 + + Main + + + + + + thread1 + + + + + + + + + + + + + + /root/workspace/devkit-affinity-exp/9-dr-check/src/ddmc.c + + + + + + + + 21 + + 0x400808 + + Add "__asm__ volatile("dmb sy");" in the position indicated by this item +
+
+
+ +
+
+ 3 +
+
+
+ 0x4222c0 + + + + - + + + + + 1880 + + Main + + + + + + + + thread2 + + + + + + + + + + + + /root/workspace/devkit-affinity-exp/9-dr-check/src/ddmc.c + + + + + + + + 28 + + 0x400870 + + 1879 + + Main + + + + + + thread1 + + + + + + + + + + + + + + /root/workspace/devkit-affinity-exp/9-dr-check/src/ddmc.c + + + + + + + + 19 + + 0x4007f4 + + Add "__asm__ volatile("dmb sy");" in the position indicated by this item +
+
+
+
+ +
+ + + + + \ No newline at end of file diff --git a/Affinity/9-dr-check/dr-check.md b/Affinity/9-dr-check/dr-check.md new file mode 100644 index 0000000000000000000000000000000000000000..4d3388c7803ce96086f3bf7462e7118a694167b7 --- /dev/null +++ b/Affinity/9-dr-check/dr-check.md @@ -0,0 +1,182 @@ +# 内存一致性动态检查 + +Dynamic Memory Consistency Checking + +内存一致性动态检查就是采用动态检查模式检查C/C++源码在鲲鹏平台运行时是否存在内存一致性问题,并提供内存一致性的检测结果与插入内存屏障的建议。 + +数据竞争比内存一致性问题的范围大,这里dr-check任务是动态检查源码中内存一致性的情况。要消除内存一致性问题,需根据操作的依赖关系,在“需要保证顺序的两个操作之间”放置屏障。 + +所谓屏障,就是屏障之前的所有内存操作(读/写)必须在屏障之后的内存操作开始前完成。 + +📢*说明* + +目前支持对单进程应用和共享内存模式的多进程应用进行内存一致性动态检查,详情请见鲲鹏社区[内存一致性动态检查功能说明](https://www.hikunpeng.com/document/detail/zh/kunpengdevps/userguide/cliuserguide/KunpengDevKitCli_0094.html)。 + +🔔*注意* + +本实验在“鲲鹏处理器+openEuler 22.03 LTS”平台上进行,所用鲲鹏处理器至少要有2个CPU核。 + + + +🎶步骤 1:创建工作目录 + +```shell +mkdir -p ~/workspace/9-dr-check/src && cd $_ +``` + +🎶步骤 2:准备源代码 + +按照ddmc.c准备源代码,核心代码如下(这里仅仅是示例,实际运行的代码请参照源代码文件): + +```c +volatile int *x, *y, *r1, *r2; + x = (int *)malloc(sizeof(int)); + y = (int *)malloc(sizeof(int)); +r1 = (int *)malloc(sizeof(int)); +r2 = (int *)malloc(sizeof(int)); +*x = *y = *r1 = *r2 = 0; + +volatile int flag = 0; + +void thread1(void *arg) { + while(flag == 0); + *y = 1; + *x = 1; +} + +void thread2(void *arg) { + while(flag == 0); + *r2 = *x; + *r1 = *y; +} + +flag = 1; +if (*r1 == 0 && *r2 == 1) printf("reorder", n); +``` + +上述程序中x、y、r1、r2所指向的内存是在程序运行时动态分配的,初始值都为0; + +线程thread1对变量x和y指向的内存进行写操作; + +线程thread2对变量x和y指向的内存进行读操作; + +如果r1指向的内存为0而r2指向的内存为1则说明有乱序操作。 + +通过变量flag可以对thread1、thread2进行大致同步但也引入了对flag变量的读写乱序问题(为简单起见,这里暂不分析)。 + + + +🎶步骤 3:编译并运行程序 + +```makefile +gcc ddmc.c -o ddmc -lpthread +./ddmc +``` + +在有4个CPU核的鲲鹏ECS上,输出如下: + +```txt +reorder @ 14 +reorder @ 103 +reorder @ 424 +reorder @ 459 +reorder @ 532 +reorder @ 720 +reorder @ 928 +reorder @ 1484 +reorder @ 1864 +reorder @ 2088 +reorder @ 2218 +reorder @ 2337 +...... 
+``` + +可见有频繁的乱序发生。 + +在有2个CPU核的鲲鹏ECS上,输出如下: + +```txt +reorder @ 19431 +``` + +可见这时候CPU乱序现象骤减。 + + + +🎶步骤 4:内存一致性动态检查 + +```shell +gcc -g ddmc.c -o ddmc -lpthread # 带调试参数编译程序 +cd .. +pwd # /root/workspace/9-dr-check +devkit advisor dr-check -i ./src -f ./src/ddmc -o ~/Reports/ +``` + +在上述`devkit advisor dr-check`命令中: + +`-i`参数指明了ELF文件对应的源码文件夹路径; + +`-f`是必选参数,指示ELF文件路径。待测ELF文件必须带有调试信息(编译时增加-g选项)。 + + + +命令行终端有如下输出信息: + +```txt +Executing dynamic memory consistency check task, please wait... +The pid of the memtracer process is 1874 +Scanned time: 2025/10/14 03:48:02 + +Configuration: + ELF filepath: /root/workspace/devkit-affinity-exp/9-dr-check/src/ddmc + Scan source code path: /root/workspace/devkit-affinity-exp/9-dr-check/src + Generate report path: /root/Reports + Generate report type: all + Task Timeout Interval: The timeout period is not set. + Log level: info + +Summary: +There are 6 recommended code lines to modify. + +For the detailed information on multi-thread races, please check: + /root/Reports/dr-check_ddmc_20251014034802_6eb9.json + /root/Reports/dr-check_ddmc_20251014034802_6eb9.html + /root/Reports/dr-check_ddmc_20251014034802_6eb9.csv +``` + +生成的[检查报告](./doc/dr-check_ddmc_20251014034802_6eb9.html)显示了数据竞争源码冲突位置: + +![Report](./img/report.jpeg) + +将鼠标移到红色小火箭图标的上方,会有如下提示: + +![little-rocket](./img/little-rocket.jpeg) + +即建议在相应位置设置`__asm__ volatile("dmb sy")`内存屏障(关于此内存屏障指令的解释请参见“内存一致性静态检查”示例中的说明)。 + +我们暂时不管变量flag的数据竞争冲突,而将相关源代码做如下修改: + +```c +void thread1(void *arg) { + while(flag == 0); + *y = 1; + __asm__ volatile("dmb sy"); + *x = 1; +} + +void thread2(void *arg) { + while(flag == 0); + *r2 = *x; + __asm__ volatile("dmb sy"); + *r1 = *y; +} +``` + +然后我们重新编译、运行程序,可以观察到经过多轮(这里是1000*1000共100万)循环的“写-写”和“读-读”操作都没有发生乱序: + +```txt +Run 1000000 times, with reorders occurring 0 times. 
+``` + + + diff --git a/Affinity/9-dr-check/img/little-rocket.jpeg b/Affinity/9-dr-check/img/little-rocket.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..13edae59cdef022792ed031c42b9d5a64db8ea70 Binary files /dev/null and b/Affinity/9-dr-check/img/little-rocket.jpeg differ diff --git a/Affinity/9-dr-check/img/report.jpeg b/Affinity/9-dr-check/img/report.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..db59b4a582aafb6c9450993dd8011f045fcf8251 Binary files /dev/null and b/Affinity/9-dr-check/img/report.jpeg differ diff --git a/Affinity/9-dr-check/src/ddmc.c b/Affinity/9-dr-check/src/ddmc.c new file mode 100644 index 0000000000000000000000000000000000000000..b182fcc320d7384df31f35b46e508825d3895c05 --- /dev/null +++ b/Affinity/9-dr-check/src/ddmc.c @@ -0,0 +1,67 @@ +// +// ddmc.c +// Dynamic Detection of Memory Consistency +// +// gcc -g ddmc.c -o ddmc -lpthread +// devkit advisor dr-check -i . -f ./ddmc +// + +#include +#include +#include +#include + +volatile int *x, *y, *r1, *r2; +volatile int flag; + +void thread1(void *arg) { + while(flag == 0); + *y = 1; + //__asm__ volatile("dmb sy"); + *x = 1; +} + +void thread2(void *arg) { + while(flag == 0); + *r2 = *x; + //__asm__ volatile("dmb sy"); + *r1 = *y; +} + +int main() { + unsigned int n = 0; + unsigned int count = 0; + do { + x = (int *)malloc(sizeof(int)); + y = (int *)malloc(sizeof(int)); + r1 = (int *)malloc(sizeof(int)); + r2 = (int *)malloc(sizeof(int)); + *x = *y = *r1 = *r2 = flag = 0; + + pthread_t t1, t2; + pthread_create(&t1, NULL, (void *)thread1, NULL); + pthread_create(&t2, NULL, (void *)thread2, NULL); + + usleep(10); + flag = 1; + + pthread_join(t1, NULL); + pthread_join(t2, NULL); + + if (*r1 == 0 && *r2 == 1) { + printf("reorder @ %d\n", n); + count ++; + } + + free((void *)x); + free((void *)y); + free((void *)r1); + free((void *)r2); + + n ++; + } while (n < 1000*1000); + + printf("Run %u times, with reorders occurring %u times.\n", n, 
count); + + return 0; +} diff --git a/Affinity/readme.txt b/Affinity/readme.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3e2df662ec40c9ef1b1b5ed006d077257484011 --- /dev/null +++ b/Affinity/readme.txt @@ -0,0 +1,2 @@ +DevKit鲲鹏亲和分析体验 +DevKit Kunpeng Affinity Analysis Experience