mirror of
https://github.com/cirosantilli/linux-kernel-module-cheat.git
synced 2026-01-27 20:14:27 +01:00
stream: play with the STREAM benchmark
This commit is contained in:
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -39,3 +39,6 @@
|
|||||||
[submodule "submodules/xen"]
|
[submodule "submodules/xen"]
|
||||||
path = submodules/xen
|
path = submodules/xen
|
||||||
url = https://github.com/cirosantilli/xen
|
url = https://github.com/cirosantilli/xen
|
||||||
|
[submodule "submodules/stream-benchmark"]
|
||||||
|
path = submodules/stream-benchmark
|
||||||
|
url = https://github.com/cirosantilli/stream-benchmark
|
||||||
|
|||||||
82
README.adoc
82
README.adoc
@@ -12115,6 +12115,8 @@ Indirect leak of 1346 byte(s) in 2 object(s) allocated from:
|
|||||||
|
|
||||||
From the message, this appears however to be a Python / pyenv11 bug however and not in gem5 specifically. I think it worked when I tried it in the past in an older gem5 / Ubuntu.
|
From the message, this appears however to be a Python / pyenv11 bug however and not in gem5 specifically. I think it worked when I tried it in the past in an older gem5 / Ubuntu.
|
||||||
|
|
||||||
|
`--without-tcmalloc` is needed / a good idea when using `--with-asan`: https://stackoverflow.com/questions/42712555/address-sanitizer-fsanitize-address-works-with-tcmalloc since both do more or less similar jobs, see also <<memory-leaks>>.
|
||||||
|
|
||||||
==== gem5 Ruby build
|
==== gem5 Ruby build
|
||||||
|
|
||||||
Ruby is a system that includes the SLICC domain specific language to describe memory systems: http://gem5.org/Ruby
|
Ruby is a system that includes the SLICC domain specific language to describe memory systems: http://gem5.org/Ruby
|
||||||
@@ -12666,6 +12668,8 @@ Tested at b4879ae5b0b6644e6836b0881e4da05c64a6550d.
|
|||||||
|
|
||||||
===== gem5 event queue TimingSimpleCPU syscall emulation freestanding example analysis
|
===== gem5 event queue TimingSimpleCPU syscall emulation freestanding example analysis
|
||||||
|
|
||||||
|
TODO: analyze better what each of the memory event mean. For now, we have just collected a bunch of data there, but needs interpreting. The CPU specifics in this section are already insightful however.
|
||||||
|
|
||||||
<<gem5-basesimplecpu,TimingSimpleCPU>> should be the second simplest CPU to analyze, so let's give it a try:
|
<<gem5-basesimplecpu,TimingSimpleCPU>> should be the second simplest CPU to analyze, so let's give it a try:
|
||||||
|
|
||||||
....
|
....
|
||||||
@@ -14506,20 +14510,26 @@ Build and run on QEMU <<user-mode-simulation>>:
|
|||||||
|
|
||||||
....
|
....
|
||||||
git submodule update --init submodules/dhrystone
|
git submodule update --init submodules/dhrystone
|
||||||
./build-dhrystone --mode userland
|
./build-dhrystone --optimization-level 3
|
||||||
./run --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"
|
./run --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"
|
||||||
....
|
....
|
||||||
|
|
||||||
|
Increase the number of loops to try and reach more meaningful results:
|
||||||
|
|
||||||
|
....
|
||||||
|
./run --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone" --userland-args 100000000
|
||||||
|
....
|
||||||
|
|
||||||
Build and run on gem5 user mode:
|
Build and run on gem5 user mode:
|
||||||
|
|
||||||
....
|
....
|
||||||
./build-dhrystone --mode userland --static --force-rebuild
|
./build-dhrystone --optimization-level 3 --static
|
||||||
./run --emulator gem5 --userland "$(./getvar userland_build_dir)/submodules/dhrystone/dhrystone"
|
./run --emulator gem5 --userland "$(./getvar --static userland_build_dir)/submodules/dhrystone/dhrystone"
|
||||||
....
|
....
|
||||||
|
|
||||||
TODO automate run more nicely.
|
TODO automate run more nicely.
|
||||||
|
|
||||||
Build for <<baremetal>> execution and run it in baremetal QEMU:
|
Build for <<baremetal>> execution and run it in baremetal QEMU. TODO: fix the build, just need to factor out all run arguments from link:build-baremetal[] into link:common.py[] and it should just work, no missing syscalls.
|
||||||
|
|
||||||
....
|
....
|
||||||
# Build our Newlib stubs.
|
# Build our Newlib stubs.
|
||||||
@@ -14528,8 +14538,6 @@ Build for <<baremetal>> execution and run it in baremetal QEMU:
|
|||||||
./run --arch aarch64 --baremetal "$(./getvar baremetal_build_dir)/submodules/dhrystone/dhrystone"
|
./run --arch aarch64 --baremetal "$(./getvar baremetal_build_dir)/submodules/dhrystone/dhrystone"
|
||||||
....
|
....
|
||||||
|
|
||||||
TODO: fix the build, just need to factor out all run arguments from link:build-baremetal[] into link:common.py[] and it should just work, no missing syscalls.
|
|
||||||
|
|
||||||
If you really want the Buildroot package for some reason, build it with:
|
If you really want the Buildroot package for some reason, build it with:
|
||||||
|
|
||||||
....
|
....
|
||||||
@@ -14542,6 +14550,68 @@ and run inside the guest from `PATH` with:
|
|||||||
dhrystone
|
dhrystone
|
||||||
....
|
....
|
||||||
|
|
||||||
|
==== STREAM benchmark
|
||||||
|
|
||||||
|
http://www.cs.virginia.edu/stream/ref.html
|
||||||
|
|
||||||
|
Very simple memory width benchmark with one C and one Fortran version, originally published in 1991, and the latest version at the time of writing is from 2013.
|
||||||
|
|
||||||
|
Its operation is very simple: fork one thread for each CPU in the system (using OpenMP) and do the following four array operations (4 separate loops of individual operations):
|
||||||
|
|
||||||
|
....
|
||||||
|
/* Copy. */
|
||||||
|
times[0 * ntimes + k] = mysecond();
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<stream_array_size; j++)
|
||||||
|
c[j] = a[j];
|
||||||
|
times[0 * ntimes + k] = mysecond() - times[0 * ntimes + k];
|
||||||
|
|
||||||
|
/* Scale. */
|
||||||
|
times[1 * ntimes + k] = mysecond();
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<stream_array_size; j++)
|
||||||
|
b[j] = scalar*c[j];
|
||||||
|
times[1 * ntimes + k] = mysecond() - times[1 * ntimes + k];
|
||||||
|
|
||||||
|
/* Add. */
|
||||||
|
times[2 * ntimes + k] = mysecond();
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<stream_array_size; j++)
|
||||||
|
c[j] = a[j]+b[j];
|
||||||
|
times[2 * ntimes + k] = mysecond() - times[2 * ntimes + k];
|
||||||
|
|
||||||
|
/* Triad. */
|
||||||
|
times[3 * ntimes + k] = mysecond();
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (j=0; j<stream_array_size; j++)
|
||||||
|
a[j] = b[j]+scalar*c[j];
|
||||||
|
times[3 * ntimes + k] = mysecond() - times[3 * ntimes + k];
|
||||||
|
}
|
||||||
|
....
|
||||||
|
|
||||||
|
See also: https://stackoverflow.com/questions/56086993/what-does-stream-memory-bandwidth-benchmark-really-measure
|
||||||
|
|
||||||
|
The LKMC usage of STREAM is analogous to that of <<dhrystone>>. Build and run on QEMU <<user-mode-simulation>>:
|
||||||
|
|
||||||
|
....
|
||||||
|
git submodule update --init submodules/stream-benchmark
|
||||||
|
./build-stream --optimization-level 3
|
||||||
|
./run --userland "$(./getvar userland_build_dir)/submodules/stream-benchmark/stream_c.exe"
|
||||||
|
....
|
||||||
|
|
||||||
|
Decrease the benchmark size and the retry count to finish simulation faster, but possibly have a less representative result:
|
||||||
|
|
||||||
|
....
|
||||||
|
./run --userland "$(./getvar userland_build_dir)/submodules/stream-benchmark/stream_c.exe" --userland-args '100 2'
|
||||||
|
....
|
||||||
|
|
||||||
|
Build and run on gem5 user mode:
|
||||||
|
|
||||||
|
....
|
||||||
|
./build-stream --optimization-level 3 --static
|
||||||
|
./run --emulator gem5 --userland "$(./getvar --static userland_build_dir)/submodules/stream-benchmark/stream_c.exe" --userland-args '1000 2'
|
||||||
|
....
|
||||||
|
|
||||||
==== PARSEC benchmark
|
==== PARSEC benchmark
|
||||||
|
|
||||||
We have ported parts of the http://parsec.cs.princeton.edu[PARSEC benchmark] for cross compilation at: https://github.com/cirosantilli/parsec-benchmark See the documentation on that repo to find out which benchmarks have been ported. Some of the benchmarks were are segfaulting, they are documented in that repo.
|
We have ported parts of the http://parsec.cs.princeton.edu[PARSEC benchmark] for cross compilation at: https://github.com/cirosantilli/parsec-benchmark See the documentation on that repo to find out which benchmarks have been ported. Some of the benchmarks were are segfaulting, they are documented in that repo.
|
||||||
|
|||||||
@@ -42,13 +42,13 @@ https://cirosantilli.com/linux-kernel-module-cheat#dhrystone
|
|||||||
'-j', str(self.env['nproc']), LF,
|
'-j', str(self.env['nproc']), LF,
|
||||||
'-C', os.path.join(self.env['submodules_dir'], 'dhrystone'), LF,
|
'-C', os.path.join(self.env['submodules_dir'], 'dhrystone'), LF,
|
||||||
'CC={}'.format(self.env['gcc_path']), LF,
|
'CC={}'.format(self.env['gcc_path']), LF,
|
||||||
'CFLAGS={}'.format(' '.join(cflags)), LF,
|
'CFLAGS_EXTRA={}'.format(' '.join(cflags)), LF,
|
||||||
'EXTRA_OBJS={}'.format(' '.join(extra_objs)), LF,
|
'EXTRA_OBJS={}'.format(' '.join(extra_objs)), LF,
|
||||||
'OUT_DIR={}'.format(build_dir), LF,
|
'OUT_DIR={}'.format(build_dir), LF,
|
||||||
]
|
]
|
||||||
+ extra_flags
|
+ extra_flags
|
||||||
)
|
)
|
||||||
if ret == 0 and env['copy_overlay']:
|
if ret == 0 and self.env['copy_overlay']:
|
||||||
self.sh.copy_file_if_update(
|
self.sh.copy_file_if_update(
|
||||||
os.path.join(build_dir, 'dhrystone'),
|
os.path.join(build_dir, 'dhrystone'),
|
||||||
os.path.join(self.env['out_rootfs_overlay_lkmc_dir'], self.root_relpath, 'dhrystone'),
|
os.path.join(self.env['out_rootfs_overlay_lkmc_dir'], self.root_relpath, 'dhrystone'),
|
||||||
|
|||||||
63
build-stream
Executable file
63
build-stream
Executable file
@@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
import common
|
||||||
|
import shlex
|
||||||
|
from shell_helpers import LF
|
||||||
|
|
||||||
|
class Main(common.BuildCliFunction):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__(
|
||||||
|
description='''\
|
||||||
|
https://cirosantilli.com/linux-kernel-module-cheat#stream-benchmark
|
||||||
|
'''
|
||||||
|
)
|
||||||
|
self._add_argument('--ccflags')
|
||||||
|
self._add_argument('--force-rebuild')
|
||||||
|
self._add_argument('--optimization-level')
|
||||||
|
|
||||||
|
def setup(self, env):
|
||||||
|
self.root_relpath = os.path.join('submodules', 'stream-benchmark')
|
||||||
|
|
||||||
|
def build(self):
|
||||||
|
build_dir = self.get_build_dir()
|
||||||
|
cflags = ['-O{}'.format(self.env['optimization_level'])]
|
||||||
|
extra_flags = []
|
||||||
|
if self.env['static']:
|
||||||
|
cflags.extend(['-static'])
|
||||||
|
if self.env['force_rebuild']:
|
||||||
|
extra_flags.extend(['-B', LF])
|
||||||
|
if self.env['mode'] == 'baremetal':
|
||||||
|
extra_objs = [
|
||||||
|
self.env['baremetal_syscalls_obj'],
|
||||||
|
self.env['baremetal_syscalls_asm_obj']
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
extra_objs = []
|
||||||
|
ret = self.sh.run_cmd(
|
||||||
|
[
|
||||||
|
'make', LF,
|
||||||
|
'-j', str(self.env['nproc']), LF,
|
||||||
|
'-C', os.path.join(self.env['submodules_dir'], 'stream-benchmark'), LF,
|
||||||
|
'CC={}'.format(self.env['gcc_path']), LF,
|
||||||
|
'CFLAGS_EXTRA={}'.format(' '.join(cflags)), LF,
|
||||||
|
'EXTRA_OBJS={}'.format(' '.join(extra_objs)), LF,
|
||||||
|
'FC={}'.format(self.env['gfortran_path']), LF,
|
||||||
|
'OUT_DIR={}'.format(build_dir), LF,
|
||||||
|
]
|
||||||
|
+ extra_flags
|
||||||
|
)
|
||||||
|
if ret == 0 and self.env['copy_overlay']:
|
||||||
|
self.sh.copy_file_if_update(
|
||||||
|
os.path.join(build_dir, 'stream_c.exe'),
|
||||||
|
os.path.join(self.env['out_rootfs_overlay_lkmc_dir'], self.root_relpath, 'stream-benchmark'),
|
||||||
|
)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def get_build_dir(self):
|
||||||
|
return os.path.join(self.env['build_dir'], self.root_relpath)
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
Main().cli()
|
||||||
@@ -1011,7 +1011,7 @@ Incompatible archs are skipped.
|
|||||||
)
|
)
|
||||||
if env['mode'] == 'baremetal':
|
if env['mode'] == 'baremetal':
|
||||||
env['build_dir'] = env['baremetal_build_dir']
|
env['build_dir'] = env['baremetal_build_dir']
|
||||||
elif env['mode'] == 'userland':
|
else:
|
||||||
env['build_dir'] = env['userland_build_dir']
|
env['build_dir'] = env['userland_build_dir']
|
||||||
|
|
||||||
# Docker
|
# Docker
|
||||||
@@ -1119,6 +1119,7 @@ lunch aosp_{}-eng
|
|||||||
env['toolchain_prefix_dash'] = ''
|
env['toolchain_prefix_dash'] = ''
|
||||||
else:
|
else:
|
||||||
env['toolchain_prefix_dash'] = '{}-'.format(env['toolchain_prefix'])
|
env['toolchain_prefix_dash'] = '{}-'.format(env['toolchain_prefix'])
|
||||||
|
env['gfortran_path'] = self.get_toolchain_tool('gfortran')
|
||||||
env['gcc_path'] = self.get_toolchain_tool('gcc')
|
env['gcc_path'] = self.get_toolchain_tool('gcc')
|
||||||
env['gxx_path'] = self.get_toolchain_tool('g++')
|
env['gxx_path'] = self.get_toolchain_tool('g++')
|
||||||
env['ld_path'] = self.get_toolchain_tool('ld')
|
env['ld_path'] = self.get_toolchain_tool('ld')
|
||||||
|
|||||||
1
submodules/stream-benchmark
Submodule
1
submodules/stream-benchmark
Submodule
Submodule submodules/stream-benchmark added at eaa0a90ded
34
userland/c/multidimentional_array.c
Normal file
34
userland/c/multidimentional_array.c
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
/* https://cirosantilli.com/linux-kernel-module-cheat#c
|
||||||
|
*
|
||||||
|
* Multidimentional arrays are generally a bad idea that confuses
|
||||||
|
* everyone, use single dimentional arrays + indexing if possible.
|
||||||
|
* But here goes nothing.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
int main(void) {
|
||||||
|
/* Initialized in the code. */
|
||||||
|
{
|
||||||
|
/* We can skip the first dimension as it is inferred. */
|
||||||
|
int is[][3] = {
|
||||||
|
{1, 2, 3,},
|
||||||
|
{4, 5, 5,},
|
||||||
|
};
|
||||||
|
assert(is[0][0] == 1);
|
||||||
|
assert(is[0][1] == 2);
|
||||||
|
assert(is[1][0] == 4);
|
||||||
|
|
||||||
|
/* We can get the total sizes of either the entire array,
|
||||||
|
* or of just on row. */
|
||||||
|
assert(sizeof(is) == 6 * sizeof(is[0][0]));
|
||||||
|
assert(sizeof(is[0]) == 3 * sizeof(is[0][0]));
|
||||||
|
|
||||||
|
/* Multi dimentional arrays are contiguous and row major. */
|
||||||
|
assert(&is[0][1] - &is[0][0] == 1);
|
||||||
|
assert(&is[1][0] - &is[0][0] == 3);
|
||||||
|
}
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user