mirror of
https://github.com/python/cpython.git
synced 2024-11-23 01:45:25 +08:00
gh-118518: Allow perf to work without frame pointers (#112254)
This commit is contained in:
parent
999f0c5122
commit
1b22d801b8
@ -1251,7 +1251,10 @@ PyConfig
|
||||
for more information.
|
||||
|
||||
Set by :option:`-X perf <-X>` command line option and by the
|
||||
:envvar:`PYTHONPERFSUPPORT` environment variable.
|
||||
:envvar:`PYTHONPERFSUPPORT` environment variable for perf support
|
||||
with frame pointers and :option:`-X perfjit <-X>` command line option
|
||||
and by the :envvar:`PYTHONPERFJITSUPPORT` environment variable for perf
|
||||
support with DWARF JIT information.
|
||||
|
||||
Default: ``-1``.
|
||||
|
||||
|
@ -205,3 +205,36 @@ You can check if your system has been compiled with this flag by running::
|
||||
If you don't see any output it means that your interpreter has not been compiled with
|
||||
frame pointers and therefore it may not be able to show Python functions in the output
|
||||
of ``perf``.
|
||||
|
||||
|
||||
How to work without frame pointers
|
||||
----------------------------------
|
||||
|
||||
If you are working with a Python interpreter that has been compiled without frame pointers
|
||||
you can still use the ``perf`` profiler but the overhead will be a bit higher because Python
|
||||
needs to generate unwinding information for every Python function call on the fly. Additionally,
|
||||
``perf`` will take more time to process the data because it will need to use the DWARF debugging
|
||||
information to unwind the stack and this is a slow process.
|
||||
|
||||
To enable this mode, you can use the environment variable :envvar:`PYTHONPERFJITSUPPORT` or the
|
||||
:option:`-X perfjit <-X>` option, which will enable the JIT mode for the ``perf`` profiler.
|
||||
|
||||
When using the perf JIT mode, you need an extra step before you can run ``perf report``. You need to
|
||||
call the ``perf inject`` command to inject the JIT information into the ``perf.data`` file::
|
||||
|
||||
$ perf record -F 9999 -g --call-graph dwarf -o perf.data python -Xperfjit my_script.py
|
||||
$ perf inject -i perf.data --jit
|
||||
$ perf report -g -i perf.data
|
||||
|
||||
or using the environment variable::
|
||||
|
||||
$ PYTHONPERFJITSUPPORT=1 perf record -F 9999 -g --call-graph dwarf -o perf.data python my_script.py
|
||||
$ perf inject -i perf.data --jit
|
||||
$ perf report -g -i perf.data
|
||||
|
||||
Notice that when using ``--call-graph dwarf`` the ``perf`` tool will take snapshots of the stack of
|
||||
the process being profiled and save the information in the ``perf.data`` file. By default the size of
|
||||
the stack dump is 8192 bytes, but the user can change it by passing the size after a comma, like
|
||||
``--call-graph dwarf,4096``. The size of the stack dump is important because if the size is too small
|
||||
``perf`` will not be able to unwind the stack and the output will be incomplete.
|
||||
|
||||
|
@ -586,6 +586,15 @@ Miscellaneous options
|
||||
|
||||
.. versionadded:: 3.12
|
||||
|
||||
* ``-X perfjit`` enables support for the Linux ``perf`` profiler with DWARF
|
||||
support. When this option is provided, the ``perf`` profiler will be able
|
||||
to report Python calls using DWARF information. This option is only available on
|
||||
some platforms and will do nothing if it is not supported on the current
|
||||
system. The default value is "off". See also :envvar:`PYTHONPERFJITSUPPORT`
|
||||
and :ref:`perf_profiling`.
|
||||
|
||||
.. versionadded:: 3.13
|
||||
|
||||
* :samp:`-X cpu_count={n}` overrides :func:`os.cpu_count`,
|
||||
:func:`os.process_cpu_count`, and :func:`multiprocessing.cpu_count`.
|
||||
*n* must be greater than or equal to 1.
|
||||
@ -1127,6 +1136,21 @@ conflict.
|
||||
|
||||
.. versionadded:: 3.12
|
||||
|
||||
.. envvar:: PYTHONPERFJITSUPPORT
|
||||
|
||||
If this variable is set to a nonzero value, it enables support for
|
||||
the Linux ``perf`` profiler so Python calls can be detected by it
|
||||
using DWARF information.
|
||||
|
||||
If set to ``0``, disable Linux ``perf`` profiler support.
|
||||
|
||||
See also the :option:`-X perfjit <-X>` command-line option
|
||||
and :ref:`perf_profiling`.
|
||||
|
||||
.. versionadded:: 3.13
|
||||
|
||||
|
||||
|
||||
.. envvar:: PYTHON_CPU_COUNT
|
||||
|
||||
If this variable is set to a positive integer, it overrides the return
|
||||
|
@ -231,6 +231,11 @@ Other Language Changes
|
||||
equivalent of the :option:`-X frozen_modules <-X>` command-line option.
|
||||
(Contributed by Yilei Yang in :gh:`111374`.)
|
||||
|
||||
* Add :ref:`support for the perf profiler <perf_profiling>` working without
|
||||
frame pointers through the new environment variable
|
||||
:envvar:`PYTHONPERFJITSUPPORT` and command-line option :option:`-X perfjit
|
||||
<-X>`. (Contributed by Pablo Galindo in :gh:`118518`.)
|
||||
|
||||
* The new :envvar:`PYTHON_HISTORY` environment variable can be used to change
|
||||
the location of a ``.python_history`` file.
|
||||
(Contributed by Levi Sabah, Zackery Spytz and Hugo van Kemenade in
|
||||
|
@ -108,6 +108,7 @@ extern int _PyIsPerfTrampolineActive(void);
|
||||
extern PyStatus _PyPerfTrampoline_AfterFork_Child(void);
|
||||
#ifdef PY_HAVE_PERF_TRAMPOLINE
|
||||
extern _PyPerf_Callbacks _Py_perfmap_callbacks;
|
||||
extern _PyPerf_Callbacks _Py_perfmap_jit_callbacks;
|
||||
#endif
|
||||
|
||||
static inline PyObject*
|
||||
|
@ -75,6 +75,7 @@ struct trampoline_api_st {
|
||||
unsigned int code_size, PyCodeObject* code);
|
||||
int (*free_state)(void* state);
|
||||
void *state;
|
||||
Py_ssize_t code_padding;
|
||||
};
|
||||
#endif
|
||||
|
||||
@ -83,6 +84,7 @@ struct _ceval_runtime_state {
|
||||
struct {
|
||||
#ifdef PY_HAVE_PERF_TRAMPOLINE
|
||||
perf_status_t status;
|
||||
int perf_trampoline_type;
|
||||
Py_ssize_t extra_code_index;
|
||||
struct code_arena_st *code_arena;
|
||||
struct trampoline_api_st trampoline_api;
|
||||
|
@ -5,6 +5,7 @@ import sys
|
||||
import sysconfig
|
||||
import os
|
||||
import pathlib
|
||||
import shutil
|
||||
from test import support
|
||||
from test.support.script_helper import (
|
||||
make_script,
|
||||
@ -76,14 +77,27 @@ class TestPerfTrampoline(unittest.TestCase):
|
||||
perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
|
||||
self.assertTrue(perf_file.exists())
|
||||
perf_file_contents = perf_file.read_text()
|
||||
perf_lines = perf_file_contents.splitlines();
|
||||
expected_symbols = [f"py::foo:{script}", f"py::bar:{script}", f"py::baz:{script}"]
|
||||
perf_lines = perf_file_contents.splitlines()
|
||||
expected_symbols = [
|
||||
f"py::foo:{script}",
|
||||
f"py::bar:{script}",
|
||||
f"py::baz:{script}",
|
||||
]
|
||||
for expected_symbol in expected_symbols:
|
||||
perf_line = next((line for line in perf_lines if expected_symbol in line), None)
|
||||
self.assertIsNotNone(perf_line, f"Could not find {expected_symbol} in perf file")
|
||||
perf_line = next(
|
||||
(line for line in perf_lines if expected_symbol in line), None
|
||||
)
|
||||
self.assertIsNotNone(
|
||||
perf_line, f"Could not find {expected_symbol} in perf file"
|
||||
)
|
||||
perf_addr = perf_line.split(" ")[0]
|
||||
self.assertFalse(perf_addr.startswith("0x"), "Address should not be prefixed with 0x")
|
||||
self.assertTrue(set(perf_addr).issubset(string.hexdigits), "Address should contain only hex characters")
|
||||
self.assertFalse(
|
||||
perf_addr.startswith("0x"), "Address should not be prefixed with 0x"
|
||||
)
|
||||
self.assertTrue(
|
||||
set(perf_addr).issubset(string.hexdigits),
|
||||
"Address should contain only hex characters",
|
||||
)
|
||||
|
||||
def test_trampoline_works_with_forks(self):
|
||||
code = """if 1:
|
||||
@ -212,7 +226,7 @@ class TestPerfTrampoline(unittest.TestCase):
|
||||
assert_python_ok("-c", code)
|
||||
|
||||
|
||||
def is_unwinding_reliable():
|
||||
def is_unwinding_reliable_with_frame_pointers():
|
||||
cflags = sysconfig.get_config_var("PY_CORE_CFLAGS")
|
||||
if not cflags:
|
||||
return False
|
||||
@ -259,14 +273,27 @@ def perf_command_works():
|
||||
return True
|
||||
|
||||
|
||||
def run_perf(cwd, *args, **env_vars):
|
||||
def run_perf(cwd, *args, use_jit=False, **env_vars):
|
||||
if env_vars:
|
||||
env = os.environ.copy()
|
||||
env.update(env_vars)
|
||||
else:
|
||||
env = None
|
||||
output_file = cwd + "/perf_output.perf"
|
||||
base_cmd = ("perf", "record", "-g", "--call-graph=fp", "-o", output_file, "--")
|
||||
if not use_jit:
|
||||
base_cmd = ("perf", "record", "-g", "--call-graph=fp", "-o", output_file, "--")
|
||||
else:
|
||||
base_cmd = (
|
||||
"perf",
|
||||
"record",
|
||||
"-g",
|
||||
"--call-graph=dwarf,65528",
|
||||
"-F99",
|
||||
"-k1",
|
||||
"-o",
|
||||
output_file,
|
||||
"--",
|
||||
)
|
||||
proc = subprocess.run(
|
||||
base_cmd + args,
|
||||
stdout=subprocess.PIPE,
|
||||
@ -274,9 +301,21 @@ def run_perf(cwd, *args, **env_vars):
|
||||
env=env,
|
||||
)
|
||||
if proc.returncode:
|
||||
print(proc.stderr)
|
||||
print(proc.stderr, file=sys.stderr)
|
||||
raise ValueError(f"Perf failed with return code {proc.returncode}")
|
||||
|
||||
if use_jit:
|
||||
jit_output_file = cwd + "/jit_output.dump"
|
||||
command = ("perf", "inject", "-j", "-i", output_file, "-o", jit_output_file)
|
||||
proc = subprocess.run(
|
||||
command, stderr=subprocess.PIPE, stdout=subprocess.PIPE, env=env
|
||||
)
|
||||
if proc.returncode:
|
||||
print(proc.stderr)
|
||||
raise ValueError(f"Perf failed with return code {proc.returncode}")
|
||||
# Copy the jit_output_file to the output_file
|
||||
os.rename(jit_output_file, output_file)
|
||||
|
||||
base_cmd = ("perf", "script")
|
||||
proc = subprocess.run(
|
||||
("perf", "script", "-i", output_file),
|
||||
@ -290,20 +329,9 @@ def run_perf(cwd, *args, **env_vars):
|
||||
)
|
||||
|
||||
|
||||
@unittest.skipUnless(perf_command_works(), "perf command doesn't work")
|
||||
@unittest.skipUnless(is_unwinding_reliable(), "Unwinding is unreliable")
|
||||
class TestPerfProfiler(unittest.TestCase):
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map"))
|
||||
|
||||
def tearDown(self) -> None:
|
||||
super().tearDown()
|
||||
files_to_delete = (
|
||||
set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files
|
||||
)
|
||||
for file in files_to_delete:
|
||||
file.unlink()
|
||||
class TestPerfProfilerMixin:
|
||||
def run_perf(self, script_dir, perf_mode, script):
|
||||
raise NotImplementedError()
|
||||
|
||||
def test_python_calls_appear_in_the_stack_if_perf_activated(self):
|
||||
with temp_dir() as script_dir:
|
||||
@ -322,14 +350,14 @@ class TestPerfProfiler(unittest.TestCase):
|
||||
baz(10000000)
|
||||
"""
|
||||
script = make_script(script_dir, "perftest", code)
|
||||
stdout, stderr = run_perf(script_dir, sys.executable, "-Xperf", script)
|
||||
stdout, stderr = self.run_perf(script_dir, script)
|
||||
self.assertEqual(stderr, "")
|
||||
|
||||
self.assertIn(f"py::foo:{script}", stdout)
|
||||
self.assertIn(f"py::bar:{script}", stdout)
|
||||
self.assertIn(f"py::baz:{script}", stdout)
|
||||
|
||||
def test_python_calls_do_not_appear_in_the_stack_if_perf_activated(self):
|
||||
def test_python_calls_do_not_appear_in_the_stack_if_perf_deactivated(self):
|
||||
with temp_dir() as script_dir:
|
||||
code = """if 1:
|
||||
def foo(n):
|
||||
@ -346,13 +374,38 @@ class TestPerfProfiler(unittest.TestCase):
|
||||
baz(10000000)
|
||||
"""
|
||||
script = make_script(script_dir, "perftest", code)
|
||||
stdout, stderr = run_perf(script_dir, sys.executable, script)
|
||||
stdout, stderr = self.run_perf(
|
||||
script_dir, script, activate_trampoline=False
|
||||
)
|
||||
self.assertEqual(stderr, "")
|
||||
|
||||
self.assertNotIn(f"py::foo:{script}", stdout)
|
||||
self.assertNotIn(f"py::bar:{script}", stdout)
|
||||
self.assertNotIn(f"py::baz:{script}", stdout)
|
||||
|
||||
@unittest.skipUnless(perf_command_works(), "perf command doesn't work")
|
||||
@unittest.skipUnless(
|
||||
is_unwinding_reliable_with_frame_pointers(),
|
||||
"Unwinding is unreliable with frame pointers",
|
||||
)
|
||||
class TestPerfProfiler(unittest.TestCase, TestPerfProfilerMixin):
|
||||
def run_perf(self, script_dir, script, activate_trampoline=True):
|
||||
if activate_trampoline:
|
||||
return run_perf(script_dir, sys.executable, "-Xperf", script)
|
||||
return run_perf(script_dir, sys.executable, script)
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map"))
|
||||
|
||||
def tearDown(self) -> None:
|
||||
super().tearDown()
|
||||
files_to_delete = (
|
||||
set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files
|
||||
)
|
||||
for file in files_to_delete:
|
||||
file.unlink()
|
||||
|
||||
def test_pre_fork_compile(self):
|
||||
code = """if 1:
|
||||
import sys
|
||||
@ -370,7 +423,7 @@ class TestPerfProfiler(unittest.TestCase):
|
||||
foo_fork()
|
||||
|
||||
def foo():
|
||||
pass
|
||||
import time; time.sleep(1)
|
||||
|
||||
def bar():
|
||||
foo()
|
||||
@ -423,12 +476,41 @@ class TestPerfProfiler(unittest.TestCase):
|
||||
# identical in both the parent and child perf-map files.
|
||||
perf_file_lines = perf_file_contents.split("\n")
|
||||
for line in perf_file_lines:
|
||||
if (
|
||||
f"py::foo_fork:{script}" in line
|
||||
or f"py::bar_fork:{script}" in line
|
||||
):
|
||||
if f"py::foo_fork:{script}" in line or f"py::bar_fork:{script}" in line:
|
||||
self.assertIn(line, child_perf_file_contents)
|
||||
|
||||
def _is_kernel_version_at_least(major, minor):
|
||||
try:
|
||||
with open("/proc/version") as f:
|
||||
version = f.readline().split()[2]
|
||||
except FileNotFoundError:
|
||||
return False
|
||||
version = version.split(".")
|
||||
return int(version[0]) > major or (int(version[0]) == major and int(version[1]) >= minor)
|
||||
|
||||
@unittest.skipUnless(perf_command_works(), "perf command doesn't work")
|
||||
@unittest.skipUnless(_is_kernel_version_at_least(6, 6), "perf command may not work due to a perf bug")
|
||||
class TestPerfProfilerWithDwarf(unittest.TestCase, TestPerfProfilerMixin):
|
||||
def run_perf(self, script_dir, script, activate_trampoline=True):
|
||||
if activate_trampoline:
|
||||
return run_perf(
|
||||
script_dir, sys.executable, "-Xperfjit", script, use_jit=True
|
||||
)
|
||||
return run_perf(script_dir, sys.executable, script, use_jit=True)
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.perf_files = set(pathlib.Path("/tmp/").glob("jit*.dump"))
|
||||
self.perf_files |= set(pathlib.Path("/tmp/").glob("jitted-*.so"))
|
||||
|
||||
def tearDown(self) -> None:
|
||||
super().tearDown()
|
||||
files_to_delete = set(pathlib.Path("/tmp/").glob("jit*.dump"))
|
||||
files_to_delete |= set(pathlib.Path("/tmp/").glob("jitted-*.so"))
|
||||
files_to_delete = files_to_delete - self.perf_files
|
||||
for file in files_to_delete:
|
||||
file.unlink()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
@ -488,6 +488,7 @@ PYTHON_OBJS= \
|
||||
Python/fileutils.o \
|
||||
Python/suggestions.o \
|
||||
Python/perf_trampoline.o \
|
||||
Python/perf_jit_trampoline.o \
|
||||
Python/$(DYNLOADFILE) \
|
||||
$(LIBOBJS) \
|
||||
$(MACHDEP_OBJS) \
|
||||
|
@ -0,0 +1,4 @@
|
||||
Allow the Linux perf support to work without frame pointers using perf's
|
||||
advanced JIT support. The feature is activated when using the
|
||||
``PYTHONPERFJITSUPPORT`` environment variable or when running Python with
|
||||
``-Xperfjit``. Patch by Pablo Galindo.
|
@ -240,6 +240,7 @@
|
||||
<ClCompile Include="..\Python\parking_lot.c" />
|
||||
<ClCompile Include="..\Python\pathconfig.c" />
|
||||
<ClCompile Include="..\Python\perf_trampoline.c" />
|
||||
<ClCompile Include="..\Python\perf_jit_trampoline.c" />
|
||||
<ClCompile Include="..\Python\preconfig.c" />
|
||||
<ClCompile Include="..\Python\pyarena.c" />
|
||||
<ClCompile Include="..\Python\pyctype.c" />
|
||||
|
@ -94,6 +94,9 @@
|
||||
<ClCompile Include="..\Python\perf_trampoline.c">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Python\perf_jit_trampoline.c">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Python\compile.c">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
@ -609,6 +609,7 @@
|
||||
<ClCompile Include="..\Python\parking_lot.c" />
|
||||
<ClCompile Include="..\Python\pathconfig.c" />
|
||||
<ClCompile Include="..\Python\perf_trampoline.c" />
|
||||
<ClCompile Include="..\Python\perf_jit_trampoline.c" />
|
||||
<ClCompile Include="..\Python\preconfig.c" />
|
||||
<ClCompile Include="..\Python\pyarena.c" />
|
||||
<ClCompile Include="..\Python\pyctype.c" />
|
||||
|
@ -1403,6 +1403,9 @@
|
||||
<ClCompile Include="..\Python\perf_trampoline.c">
|
||||
<Filter>Python</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Python\perf_jit_trampoline.c">
|
||||
<Filter>Python</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Python\preconfig.c">
|
||||
<Filter>Python</Filter>
|
||||
</ClCompile>
|
||||
|
@ -1703,6 +1703,20 @@ config_init_perf_profiling(PyConfig *config)
|
||||
if (xoption) {
|
||||
config->perf_profiling = 1;
|
||||
}
|
||||
env = config_get_env(config, "PYTHONPERFJITSUPPORT");
|
||||
if (env) {
|
||||
if (_Py_str_to_int(env, &active) != 0) {
|
||||
active = 0;
|
||||
}
|
||||
if (active) {
|
||||
config->perf_profiling = 2;
|
||||
}
|
||||
}
|
||||
xoption = config_get_xoption(config, L"perfjit");
|
||||
if (xoption) {
|
||||
config->perf_profiling = 2;
|
||||
}
|
||||
|
||||
return _PyStatus_OK();
|
||||
|
||||
}
|
||||
|
615
Python/perf_jit_trampoline.c
Normal file
615
Python/perf_jit_trampoline.c
Normal file
@ -0,0 +1,615 @@
|
||||
#include "Python.h"
|
||||
#include "pycore_ceval.h" // _PyPerf_Callbacks
|
||||
#include "pycore_frame.h"
|
||||
#include "pycore_interp.h"
|
||||
|
||||
|
||||
#ifdef PY_HAVE_PERF_TRAMPOLINE
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/mman.h> // mmap()
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h> // sysconf()
|
||||
#include <sys/time.h> // gettimeofday()
|
||||
|
||||
// ----------------------------------
|
||||
// Perf jitdump API
|
||||
// ----------------------------------
|
||||
|
||||
/* Process-wide state of the perf jitdump writer; populated by perf_map_jit_init(). */
typedef struct {
|
||||
FILE* perf_map;  /* Stream opened with fdopen() over the /tmp/jit-<pid>.dump file. */
|
||||
PyThread_type_lock map_lock;  /* Lock obtained from PyThread_allocate_lock() during init. */
|
||||
void* mapped_buffer;  /* Result of mmap()ing the first page of the dump file. */
|
||||
size_t mapped_size;  /* Length of that mapping (the system page size). */
|
||||
int code_id;  /* Reset to 0 during init. NOTE(review): presumably a running id for emitted code entries -- confirm against the event writer. */
|
||||
} PerfMapJitState;
|
||||
|
||||
static PerfMapJitState perf_jit_map_state;
|
||||
|
||||
/*
|
||||
Usually the binary and libraries are mapped in separate regions, as shown below:
|
||||
|
||||
address ->
|
||||
--+---------------------+--//--+---------------------+--
|
||||
| .text | .data | ... | | .text | .data | ... |
|
||||
--+---------------------+--//--+---------------------+--
|
||||
myprog libc.so
|
||||
|
||||
So it'd be easy and straight-forward to find a mapped binary or library from an
|
||||
address.
|
||||
|
||||
But for JIT code, the code arena only cares about the code section. But the
|
||||
resulting DSOs (which are generated by perf inject -j) contain ELF headers and
|
||||
unwind info too. Then it'd generate following address space with synthesized
|
||||
MMAP events. Let's say it has a sample between address B and C.
|
||||
|
||||
sample
|
||||
|
|
||||
address -> A B v C
|
||||
---------------------------------------------------------------------------------------------------
|
||||
/tmp/jitted-PID-0.so | (headers) | .text | unwind info |
|
||||
/tmp/jitted-PID-1.so | (headers) | .text | unwind info |
|
||||
/tmp/jitted-PID-2.so | (headers) | .text | unwind info |
|
||||
...
|
||||
---------------------------------------------------------------------------------------------------
|
||||
|
||||
If it only maps the .text section, it'd find the jitted-PID-1.so but cannot see
|
||||
the unwind info. If it maps both .text section and unwind sections, the sample
|
||||
could be mapped to either jitted-PID-0.so or jitted-PID-1.so and it's confusing
|
||||
which one is right. So to make perf happy we have non-overlapping ranges for each
|
||||
DSO:
|
||||
|
||||
address ->
|
||||
-------------------------------------------------------------------------------------------------------
|
||||
/tmp/jitted-PID-0.so | (headers) | .text | unwind info |
|
||||
/tmp/jitted-PID-1.so | (headers) | .text | unwind info |
|
||||
/tmp/jitted-PID-2.so | (headers) | .text | unwind info |
|
||||
...
|
||||
-------------------------------------------------------------------------------------------------------
|
||||
|
||||
As the trampolines are constant, we add a constant padding but in general the padding needs to have the
|
||||
size of the unwind info rounded to 16 bytes. In general, for our trampolines this is 0x50
|
||||
*/
|
||||
|
||||
#define PERF_JIT_CODE_PADDING 0x100
|
||||
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
|
||||
|
||||
typedef uint64_t uword;
|
||||
typedef const char* CodeComments;
|
||||
|
||||
#define Pd "d"
|
||||
#define MB (1024 * 1024)
|
||||
|
||||
#define EM_386 3
|
||||
#define EM_X86_64 62
|
||||
#define EM_ARM 40
|
||||
#define EM_AARCH64 183
|
||||
#define EM_RISCV 243
|
||||
|
||||
#define TARGET_ARCH_IA32 0
|
||||
#define TARGET_ARCH_X64 0
|
||||
#define TARGET_ARCH_ARM 0
|
||||
#define TARGET_ARCH_ARM64 0
|
||||
#define TARGET_ARCH_RISCV32 0
|
||||
#define TARGET_ARCH_RISCV64 0
|
||||
|
||||
#define FLAG_generate_perf_jitdump 0
|
||||
#define FLAG_write_protect_code 0
|
||||
#define FLAG_write_protect_vm_isolate 0
|
||||
#define FLAG_code_comments 0
|
||||
|
||||
#define UNREACHABLE()
|
||||
|
||||
static uword GetElfMachineArchitecture(void) {
|
||||
#if TARGET_ARCH_IA32
|
||||
return EM_386;
|
||||
#elif TARGET_ARCH_X64
|
||||
return EM_X86_64;
|
||||
#elif TARGET_ARCH_ARM
|
||||
return EM_ARM;
|
||||
#elif TARGET_ARCH_ARM64
|
||||
return EM_AARCH64;
|
||||
#elif TARGET_ARCH_RISCV32 || TARGET_ARCH_RISCV64
|
||||
return EM_RISCV;
|
||||
#else
|
||||
UNREACHABLE();
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Jit dump file header; filled in and written by perf_map_jit_write_header(). */
typedef struct {
|
||||
uint32_t magic;  /* Written as 0x4A695444. */
|
||||
uint32_t version;  /* Written as 1. */
|
||||
uint32_t size;  /* Written as sizeof(Header). */
|
||||
uint32_t elf_mach_target;  /* Value returned by GetElfMachineArchitecture(). */
|
||||
uint32_t reserved;  /* Reserved; no meaning is assigned to it in this file. */
|
||||
uint32_t process_id;  /* PID handed to the header writer. */
|
||||
uint64_t time_stamp;  /* get_current_time_microseconds() when the header is written. */
|
||||
uint64_t flags;  /* Written as 0. */
|
||||
} Header;
|
||||
|
||||
enum PerfEvent {
|
||||
PerfLoad = 0,
|
||||
PerfMove = 1,
|
||||
PerfDebugInfo = 2,
|
||||
PerfClose = 3,
|
||||
PerfUnwindingInfo = 4
|
||||
};
|
||||
|
||||
struct BaseEvent {
|
||||
uint32_t event;
|
||||
uint32_t size;
|
||||
uint64_t time_stamp;
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
struct BaseEvent base;
|
||||
uint32_t process_id;
|
||||
uint32_t thread_id;
|
||||
uint64_t vma;
|
||||
uint64_t code_address;
|
||||
uint64_t code_size;
|
||||
uint64_t code_id;
|
||||
} CodeLoadEvent;
|
||||
|
||||
typedef struct {
|
||||
struct BaseEvent base;
|
||||
uint64_t unwind_data_size;
|
||||
uint64_t eh_frame_hdr_size;
|
||||
uint64_t mapped_size;
|
||||
} CodeUnwindingInfoEvent;
|
||||
|
||||
static const intptr_t nanoseconds_per_second = 1000000000;
|
||||
|
||||
// Dwarf encoding constants
|
||||
|
||||
static const uint8_t DwarfUData4 = 0x03;
|
||||
static const uint8_t DwarfSData4 = 0x0b;
|
||||
static const uint8_t DwarfPcRel = 0x10;
|
||||
static const uint8_t DwarfDataRel = 0x30;
|
||||
// static uint8_t DwarfOmit = 0xff;
|
||||
typedef struct {
|
||||
unsigned char version;
|
||||
unsigned char eh_frame_ptr_enc;
|
||||
unsigned char fde_count_enc;
|
||||
unsigned char table_enc;
|
||||
int32_t eh_frame_ptr;
|
||||
int32_t eh_fde_count;
|
||||
int32_t from;
|
||||
int32_t to;
|
||||
} EhFrameHeader;
|
||||
|
||||
static int64_t get_current_monotonic_ticks(void) {
|
||||
struct timespec ts;
|
||||
if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
|
||||
UNREACHABLE();
|
||||
return 0;
|
||||
}
|
||||
// Convert to nanoseconds.
|
||||
int64_t result = ts.tv_sec;
|
||||
result *= nanoseconds_per_second;
|
||||
result += ts.tv_nsec;
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
 * Return the wall-clock time reported by gettimeofday() as a count of
 * microseconds.  Returns 0 if gettimeofday() fails.
 */
static int64_t get_current_time_microseconds(void) {
    struct timeval now;
    const int rc = gettimeofday(&now, NULL);
    if (rc >= 0) {
        // gettimeofday() already has microsecond resolution.
        const int64_t seconds_as_us = (int64_t)(now.tv_sec) * 1000000;
        return seconds_as_us + now.tv_usec;
    }
    UNREACHABLE();
    return 0;
}
|
||||
|
||||
|
||||
/*
 * Round `value` up to the next multiple of `multiple`.
 *
 * A `multiple` of 0 and a `value` that is already aligned are both returned
 * unchanged.
 */
static size_t round_up(int64_t value, int64_t multiple) {
    // Treat "no alignment requested" like "already aligned" (and avoid % 0).
    const int64_t excess = (multiple != 0) ? (value % multiple) : 0;
    if (excess != 0) {
        // Step forward by the distance to the next multiple.
        const int64_t aligned = value + (multiple - excess);
        return aligned;
    }
    return value;
}
|
||||
|
||||
|
||||
static void perf_map_jit_write_fully(const void* buffer, size_t size) {
|
||||
FILE* out_file = perf_jit_map_state.perf_map;
|
||||
const char* ptr = (const char*)(buffer);
|
||||
while (size > 0) {
|
||||
const size_t written = fwrite(ptr, 1, size, out_file);
|
||||
if (written == 0) {
|
||||
UNREACHABLE();
|
||||
break;
|
||||
}
|
||||
size -= written;
|
||||
ptr += written;
|
||||
}
|
||||
}
|
||||
|
||||
static void perf_map_jit_write_header(int pid, FILE* out_file) {
|
||||
Header header;
|
||||
header.magic = 0x4A695444;
|
||||
header.version = 1;
|
||||
header.size = sizeof(Header);
|
||||
header.elf_mach_target = GetElfMachineArchitecture();
|
||||
header.process_id = pid;
|
||||
header.time_stamp = get_current_time_microseconds();
|
||||
header.flags = 0;
|
||||
perf_map_jit_write_fully(&header, sizeof(header));
|
||||
}
|
||||
|
||||
static void* perf_map_jit_init(void) {
|
||||
char filename[100];
|
||||
int pid = getpid();
|
||||
snprintf(filename, sizeof(filename) - 1, "/tmp/jit-%d.dump", pid);
|
||||
const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666);
|
||||
if (fd == -1) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const long page_size = sysconf(_SC_PAGESIZE); // NOLINT(runtime/int)
|
||||
if (page_size == -1) {
|
||||
close(fd);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// The perf jit interface forces us to map the first page of the file
|
||||
// to signal that we are using the interface.
|
||||
perf_jit_map_state.mapped_buffer = mmap(NULL, page_size, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0);
|
||||
if (perf_jit_map_state.mapped_buffer == NULL) {
|
||||
close(fd);
|
||||
return NULL;
|
||||
}
|
||||
perf_jit_map_state.mapped_size = page_size;
|
||||
perf_jit_map_state.perf_map = fdopen(fd, "w+");
|
||||
if (perf_jit_map_state.perf_map == NULL) {
|
||||
close(fd);
|
||||
return NULL;
|
||||
}
|
||||
setvbuf(perf_jit_map_state.perf_map, NULL, _IOFBF, 2 * MB);
|
||||
perf_map_jit_write_header(pid, perf_jit_map_state.perf_map);
|
||||
|
||||
perf_jit_map_state.map_lock = PyThread_allocate_lock();
|
||||
if (perf_jit_map_state.map_lock == NULL) {
|
||||
fclose(perf_jit_map_state.perf_map);
|
||||
return NULL;
|
||||
}
|
||||
perf_jit_map_state.code_id = 0;
|
||||
|
||||
// trampoline_api.code_padding = PERF_JIT_CODE_PADDING;
|
||||
return &perf_jit_map_state;
|
||||
}
|
||||
|
||||
/* DWARF definitions. */
|
||||
|
||||
#define DWRF_CIE_VERSION 1
|
||||
|
||||
enum {
|
||||
DWRF_CFA_nop = 0x0,
|
||||
DWRF_CFA_offset_extended = 0x5,
|
||||
DWRF_CFA_def_cfa = 0xc,
|
||||
DWRF_CFA_def_cfa_offset = 0xe,
|
||||
DWRF_CFA_offset_extended_sf = 0x11,
|
||||
DWRF_CFA_advance_loc = 0x40,
|
||||
DWRF_CFA_offset = 0x80
|
||||
};
|
||||
|
||||
enum
|
||||
{
|
||||
DWRF_EH_PE_absptr = 0x00,
|
||||
DWRF_EH_PE_omit = 0xff,
|
||||
|
||||
/* FDE data encoding. */
|
||||
DWRF_EH_PE_uleb128 = 0x01,
|
||||
DWRF_EH_PE_udata2 = 0x02,
|
||||
DWRF_EH_PE_udata4 = 0x03,
|
||||
DWRF_EH_PE_udata8 = 0x04,
|
||||
DWRF_EH_PE_sleb128 = 0x09,
|
||||
DWRF_EH_PE_sdata2 = 0x0a,
|
||||
DWRF_EH_PE_sdata4 = 0x0b,
|
||||
DWRF_EH_PE_sdata8 = 0x0c,
|
||||
DWRF_EH_PE_signed = 0x08,
|
||||
|
||||
/* FDE flags. */
|
||||
DWRF_EH_PE_pcrel = 0x10,
|
||||
DWRF_EH_PE_textrel = 0x20,
|
||||
DWRF_EH_PE_datarel = 0x30,
|
||||
DWRF_EH_PE_funcrel = 0x40,
|
||||
DWRF_EH_PE_aligned = 0x50,
|
||||
|
||||
DWRF_EH_PE_indirect = 0x80
|
||||
};
|
||||
|
||||
enum { DWRF_TAG_compile_unit = 0x11 };
|
||||
|
||||
enum { DWRF_children_no = 0, DWRF_children_yes = 1 };
|
||||
|
||||
enum { DWRF_AT_name = 0x03, DWRF_AT_stmt_list = 0x10, DWRF_AT_low_pc = 0x11, DWRF_AT_high_pc = 0x12 };
|
||||
|
||||
enum { DWRF_FORM_addr = 0x01, DWRF_FORM_data4 = 0x06, DWRF_FORM_string = 0x08 };
|
||||
|
||||
enum { DWRF_LNS_extended_op = 0, DWRF_LNS_copy = 1, DWRF_LNS_advance_pc = 2, DWRF_LNS_advance_line = 3 };
|
||||
|
||||
enum { DWRF_LNE_end_sequence = 1, DWRF_LNE_set_address = 2 };
|
||||
|
||||
enum {
|
||||
#ifdef __x86_64__
|
||||
/* Yes, the order is strange, but correct. */
|
||||
DWRF_REG_AX,
|
||||
DWRF_REG_DX,
|
||||
DWRF_REG_CX,
|
||||
DWRF_REG_BX,
|
||||
DWRF_REG_SI,
|
||||
DWRF_REG_DI,
|
||||
DWRF_REG_BP,
|
||||
DWRF_REG_SP,
|
||||
DWRF_REG_8,
|
||||
DWRF_REG_9,
|
||||
DWRF_REG_10,
|
||||
DWRF_REG_11,
|
||||
DWRF_REG_12,
|
||||
DWRF_REG_13,
|
||||
DWRF_REG_14,
|
||||
DWRF_REG_15,
|
||||
DWRF_REG_RA,
|
||||
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
|
||||
DWRF_REG_SP = 31,
|
||||
DWRF_REG_RA = 30,
|
||||
#else
|
||||
# error "Unsupported target architecture"
|
||||
#endif
|
||||
};
|
||||
|
||||
/* Write cursor and bookkeeping shared by the elfctx_append_* helpers and elf_init_ehframe(). */
typedef struct ELFObjectContext
|
||||
{
|
||||
uint8_t* p; /* Pointer to next address in obj.space. */
|
||||
uint8_t* startp; /* Pointer to start address in obj.space. */
|
||||
uint8_t* eh_frame_p; /* Set by elf_init_ehframe() to the position just after the CIE, where the FDE is emitted. */
|
||||
uint32_t code_size; /* Size of machine code. */
|
||||
} ELFObjectContext;
|
||||
|
||||
/* Append a null-terminated string. */
|
||||
static uint32_t
|
||||
elfctx_append_string(ELFObjectContext* ctx, const char* str)
|
||||
{
|
||||
uint8_t* p = ctx->p;
|
||||
uint32_t ofs = (uint32_t)(p - ctx->startp);
|
||||
do {
|
||||
*p++ = (uint8_t)*str;
|
||||
} while (*str++);
|
||||
ctx->p = p;
|
||||
return ofs;
|
||||
}
|
||||
|
||||
/* Append a SLEB128 value. */
|
||||
static void
|
||||
elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v)
|
||||
{
|
||||
uint8_t* p = ctx->p;
|
||||
for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) {
|
||||
*p++ = (uint8_t)((v & 0x7f) | 0x80);
|
||||
}
|
||||
*p++ = (uint8_t)(v & 0x7f);
|
||||
ctx->p = p;
|
||||
}
|
||||
|
||||
/* Append a ULEB128 to buffer. */
|
||||
static void
|
||||
elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v)
|
||||
{
|
||||
uint8_t* p = ctx->p;
|
||||
for (; v >= 0x80; v >>= 7) {
|
||||
*p++ = (char)((v & 0x7f) | 0x80);
|
||||
}
|
||||
*p++ = (char)v;
|
||||
ctx->p = p;
|
||||
}
|
||||
|
||||
/* Shortcuts to generate DWARF structures. */
|
||||
#define DWRF_U8(x) (*p++ = (x))
|
||||
#define DWRF_I8(x) (*(int8_t*)p = (x), p++)
|
||||
#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2)
|
||||
#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4)
|
||||
#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t))
|
||||
#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p)
|
||||
#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p)
|
||||
#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p)
|
||||
#define DWRF_ALIGNNOP(s) \
|
||||
while ((uintptr_t)p & ((s)-1)) { \
|
||||
*p++ = DWRF_CFA_nop; \
|
||||
}
|
||||
#define DWRF_SECTION(name, stmt) \
|
||||
{ \
|
||||
uint32_t* szp_##name = (uint32_t*)p; \
|
||||
p += 4; \
|
||||
stmt; \
|
||||
*szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \
|
||||
}
|
||||
|
||||
/* Initialize .eh_frame section. */
|
||||
static void
|
||||
elf_init_ehframe(ELFObjectContext* ctx)
|
||||
{
|
||||
uint8_t* p = ctx->p;
|
||||
uint8_t* framep = p;
|
||||
|
||||
/* Emit DWARF EH CIE. */
|
||||
DWRF_SECTION(CIE, DWRF_U32(0); /* Offset to CIE itself. */
|
||||
DWRF_U8(DWRF_CIE_VERSION);
|
||||
DWRF_STR("zR"); /* Augmentation. */
|
||||
DWRF_UV(1); /* Code alignment factor. */
|
||||
DWRF_SV(-(int64_t)sizeof(uintptr_t)); /* Data alignment factor. */
|
||||
DWRF_U8(DWRF_REG_RA); /* Return address register. */
|
||||
DWRF_UV(1);
|
||||
DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); /* Augmentation data. */
|
||||
DWRF_U8(DWRF_CFA_def_cfa); DWRF_UV(DWRF_REG_SP); DWRF_UV(sizeof(uintptr_t));
|
||||
DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); DWRF_UV(1);
|
||||
DWRF_ALIGNNOP(sizeof(uintptr_t));
|
||||
)
|
||||
|
||||
ctx->eh_frame_p = p;
|
||||
|
||||
/* Emit DWARF EH FDE. */
|
||||
DWRF_SECTION(FDE, DWRF_U32((uint32_t)(p - framep)); /* Offset to CIE. */
|
||||
DWRF_U32(-0x30); /* Machine code offset relative to .text. */
|
||||
DWRF_U32(ctx->code_size); /* Machine code length. */
|
||||
DWRF_U8(0); /* Augmentation data. */
|
||||
/* Registers saved in CFRAME. */
|
||||
#ifdef __x86_64__
|
||||
DWRF_U8(DWRF_CFA_advance_loc | 4);
|
||||
DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
|
||||
DWRF_U8(DWRF_CFA_advance_loc | 6);
|
||||
DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(8);
|
||||
/* Extra registers saved for JIT-compiled code. */
|
||||
#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__)
|
||||
DWRF_U8(DWRF_CFA_advance_loc | 1);
|
||||
DWRF_U8(DWRF_CFA_def_cfa_offset); DWRF_UV(16);
|
||||
DWRF_U8(DWRF_CFA_offset | 29); DWRF_UV(2);
|
||||
DWRF_U8(DWRF_CFA_offset | 30); DWRF_UV(1);
|
||||
DWRF_U8(DWRF_CFA_advance_loc | 3);
|
||||
DWRF_U8(DWRF_CFA_offset | -(64 - 29));
|
||||
DWRF_U8(DWRF_CFA_offset | -(64 - 30));
|
||||
DWRF_U8(DWRF_CFA_def_cfa_offset);
|
||||
DWRF_UV(0);
|
||||
#else
|
||||
# error "Unsupported target architecture"
|
||||
#endif
|
||||
DWRF_ALIGNNOP(sizeof(uintptr_t));)
|
||||
|
||||
ctx->p = p;
|
||||
}
|
||||
|
||||
/* Write the jitdump records for one piece of generated code.
 *
 * Two records are written through perf_map_jit_write_fully():
 *   1. an unwinding-info record (PerfUnwindingInfo) carrying a freshly
 *      generated .eh_frame blob followed by its EhFrameHeader and padding;
 *   2. a code-load record (PerfLoad) carrying the symbol name
 *      "py::<qualname>:<filename>" and a copy of the machine code.
 *
 * Parameters:
 *   state      unused by this implementation.
 *   code_addr  start address of the generated code.
 *   code_size  size of the generated code in bytes.
 *   co         code object whose co_qualname / co_filename name the entry.
 *
 * If the jitdump state is not initialised yet it is initialised lazily; on
 * any failure (initialisation, name formatting, allocation) the function
 * returns without writing anything. */
static void perf_map_jit_write_entry(void *state, const void *code_addr,
                                     unsigned int code_size, PyCodeObject *co)
{
    if (perf_jit_map_state.perf_map == NULL) {
        void* ret = perf_map_jit_init();
        if (ret == NULL) {
            return;
        }
    }

    /* PyUnicode_AsUTF8() can return NULL; passing NULL to "%s" is undefined,
     * so fall back to the empty string in that case.
     * NOTE(review): a failed PyUnicode_AsUTF8() presumably leaves an
     * exception set; decide whether it should be cleared here. */
    const char *entry = "";
    if (co->co_qualname != NULL) {
        entry = PyUnicode_AsUTF8(co->co_qualname);
        if (entry == NULL) {
            entry = "";
        }
    }
    const char *filename = "";
    if (co->co_filename != NULL) {
        filename = PyUnicode_AsUTF8(co->co_filename);
        if (filename == NULL) {
            filename = "";
        }
    }

    /* First pass measures the formatted name, second pass writes it. */
    int formatted_length = snprintf(NULL, 0, "py::%s:%s", entry, filename);
    if (formatted_length < 0) {
        return;
    }
    size_t perf_map_entry_size = (size_t)formatted_length + 1;
    char* perf_map_entry = (char*) PyMem_RawMalloc(perf_map_entry_size);
    if (perf_map_entry == NULL) {
        return;
    }
    snprintf(perf_map_entry, perf_map_entry_size, "py::%s:%s", entry, filename);

    const size_t name_length = strlen(perf_map_entry);
    uword base = (uword)code_addr;
    uword size = code_size;

    // Write the code unwinding info event.

    // Create unwinding information (eh frame)
    ELFObjectContext ctx;
    char buffer[1024];
    ctx.code_size = code_size;
    ctx.startp = ctx.p = (uint8_t*)buffer;
    elf_init_ehframe(&ctx);
    int eh_frame_size = ctx.p - ctx.startp;

    // Populate the unwind info event for perf
    CodeUnwindingInfoEvent ev2;
    ev2.base.event = PerfUnwindingInfo;
    ev2.base.time_stamp = get_current_monotonic_ticks();
    ev2.unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size;
    // Ensure we have enough space between DSOs when perf maps them
    assert(ev2.unwind_data_size <= PERF_JIT_CODE_PADDING);
    ev2.eh_frame_hdr_size = sizeof(EhFrameHeader);
    ev2.mapped_size = round_up(ev2.unwind_data_size, 16);
    // The record is padded so that its total size is a multiple of 8
    int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size;
    int padding_size = round_up(content_size, 8) - content_size;
    ev2.base.size = content_size + padding_size;
    perf_map_jit_write_fully(&ev2, sizeof(ev2));

    // Populate the eh Frame header
    EhFrameHeader f;
    f.version = 1;
    f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel;
    f.fde_count_enc = DwarfUData4;
    f.table_enc = DwarfSData4 | DwarfDataRel;
    f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char));
    f.eh_fde_count = 1;
    f.from = -(round_up(code_size, 8) + eh_frame_size);
    // ctx.eh_frame_p was set by elf_init_ehframe() to the end of the CIE
    int cie_size = ctx.eh_frame_p - ctx.startp;
    f.to = -(eh_frame_size - cie_size);

    // The .eh_frame payload is written first, then its header
    perf_map_jit_write_fully(ctx.startp, eh_frame_size);
    perf_map_jit_write_fully(&f, sizeof(f));

    char padding_bytes[] = "\0\0\0\0\0\0\0\0";
    perf_map_jit_write_fully(&padding_bytes, padding_size);

    // Write the code load event.
    CodeLoadEvent ev;
    ev.base.event = PerfLoad;
    ev.base.size = sizeof(ev) + (name_length+1) + size;
    ev.base.time_stamp = get_current_monotonic_ticks();
    ev.process_id = getpid();
    ev.thread_id = gettid();
    ev.vma = base;
    ev.code_address = base;
    ev.code_size = size;
    perf_jit_map_state.code_id += 1;
    ev.code_id = perf_jit_map_state.code_id;

    perf_map_jit_write_fully(&ev, sizeof(ev));
    perf_map_jit_write_fully(perf_map_entry, name_length+1);
    perf_map_jit_write_fully((void*)(base), size);

    // The name has been copied into the dump; release it so it does not
    // leak on every call.
    PyMem_RawFree(perf_map_entry);
}
|
||||
|
||||
static int perf_map_jit_fini(void* state) {
|
||||
if (perf_jit_map_state.perf_map != NULL) {
|
||||
// close the file
|
||||
PyThread_acquire_lock(perf_jit_map_state.map_lock, 1);
|
||||
fclose(perf_jit_map_state.perf_map);
|
||||
PyThread_release_lock(perf_jit_map_state.map_lock);
|
||||
|
||||
// clean up the lock and state
|
||||
PyThread_free_lock(perf_jit_map_state.map_lock);
|
||||
perf_jit_map_state.perf_map = NULL;
|
||||
}
|
||||
if (perf_jit_map_state.mapped_buffer != NULL) {
|
||||
munmap(perf_jit_map_state.mapped_buffer, perf_jit_map_state.mapped_size);
|
||||
}
|
||||
trampoline_api.state = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
_PyPerf_Callbacks _Py_perfmap_jit_callbacks = {
|
||||
&perf_map_jit_init,
|
||||
&perf_map_jit_write_entry,
|
||||
&perf_map_jit_fini,
|
||||
};
|
||||
|
||||
#endif
|
@ -143,6 +143,8 @@ any DWARF information available for them).
|
||||
#include <sys/mman.h> // mmap()
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h> // sysconf()
|
||||
#include <sys/time.h> // gettimeofday()
|
||||
|
||||
|
||||
#if defined(__arm__) || defined(__arm64__) || defined(__aarch64__)
|
||||
#define PY_HAVE_INVALIDATE_ICACHE
|
||||
@ -187,12 +189,19 @@ struct code_arena_st {
|
||||
typedef struct code_arena_st code_arena_t;
|
||||
typedef struct trampoline_api_st trampoline_api_t;
|
||||
|
||||
enum perf_trampoline_type {
|
||||
PERF_TRAMPOLINE_UNSET = 0,
|
||||
PERF_TRAMPOLINE_TYPE_MAP = 1,
|
||||
PERF_TRAMPOLINE_TYPE_JITDUMP = 2,
|
||||
};
|
||||
|
||||
#define perf_status _PyRuntime.ceval.perf.status
|
||||
#define extra_code_index _PyRuntime.ceval.perf.extra_code_index
|
||||
#define perf_code_arena _PyRuntime.ceval.perf.code_arena
|
||||
#define trampoline_api _PyRuntime.ceval.perf.trampoline_api
|
||||
#define perf_map_file _PyRuntime.ceval.perf.map_file
|
||||
#define persist_after_fork _PyRuntime.ceval.perf.persist_after_fork
|
||||
#define perf_trampoline_type _PyRuntime.ceval.perf.perf_trampoline_type
|
||||
|
||||
static void
|
||||
perf_map_write_entry(void *state, const void *code_addr,
|
||||
@ -220,6 +229,8 @@ static void*
|
||||
perf_map_init_state(void)
|
||||
{
|
||||
PyUnstable_PerfMapState_Init();
|
||||
trampoline_api.code_padding = 0;
|
||||
perf_trampoline_type = PERF_TRAMPOLINE_TYPE_MAP;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -236,6 +247,30 @@ _PyPerf_Callbacks _Py_perfmap_callbacks = {
|
||||
&perf_map_free_state,
|
||||
};
|
||||
|
||||
|
||||
/* Round `value` up to the next multiple of `multiple`.
 *
 * A `value` that is already a multiple is returned unchanged, and so is any
 * `value` when `multiple` is 0 (which would otherwise divide by zero). The
 * result is computed in int64_t and converted to size_t on return. */
static size_t round_up(int64_t value, int64_t multiple) {
    if (multiple == 0) {
        return value;
    }

    const int64_t excess = value % multiple;

    // Nothing to add when already aligned; otherwise step to the next multiple
    return (excess == 0) ? value : value + (multiple - excess);
}
|
||||
|
||||
// TRAMPOLINE MANAGEMENT API
|
||||
|
||||
static int
|
||||
new_code_arena(void)
|
||||
{
|
||||
@ -256,6 +291,7 @@ new_code_arena(void)
|
||||
void *start = &_Py_trampoline_func_start;
|
||||
void *end = &_Py_trampoline_func_end;
|
||||
size_t code_size = end - start;
|
||||
size_t chunk_size = round_up(code_size + trampoline_api.code_padding, 16);
|
||||
// TODO: Check the effect of alignment of the code chunks. Initial investigation
|
||||
// showed that this has no effect on performance in x86-64 or aarch64 and the current
|
||||
// version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
|
||||
@ -264,9 +300,9 @@ new_code_arena(void)
|
||||
// measurable performance improvement by rounding trampolines up to 32-bit
|
||||
// or 64-bit alignment.
|
||||
|
||||
size_t n_copies = mem_size / code_size;
|
||||
size_t n_copies = mem_size / chunk_size;
|
||||
for (size_t i = 0; i < n_copies; i++) {
|
||||
memcpy(memory + i * code_size, start, code_size * sizeof(char));
|
||||
memcpy(memory + i * chunk_size, start, code_size * sizeof(char));
|
||||
}
|
||||
// Some systems may prevent us from creating executable code on the fly.
|
||||
int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC);
|
||||
@ -320,16 +356,18 @@ static inline py_trampoline
|
||||
code_arena_new_code(code_arena_t *code_arena)
|
||||
{
|
||||
py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
|
||||
code_arena->size_left -= code_arena->code_size;
|
||||
code_arena->current_addr += code_arena->code_size;
|
||||
size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding, 16);
|
||||
code_arena->size_left -= total_code_size;
|
||||
code_arena->current_addr += total_code_size;
|
||||
return trampoline;
|
||||
}
|
||||
|
||||
static inline py_trampoline
|
||||
compile_trampoline(void)
|
||||
{
|
||||
size_t total_code_size = round_up(perf_code_arena->code_size + trampoline_api.code_padding, 16);
|
||||
if ((perf_code_arena == NULL) ||
|
||||
(perf_code_arena->size_left <= perf_code_arena->code_size)) {
|
||||
(perf_code_arena->size_left <= total_code_size)) {
|
||||
if (new_code_arena() < 0) {
|
||||
return NULL;
|
||||
}
|
||||
@ -480,6 +518,7 @@ _PyPerfTrampoline_Fini(void)
|
||||
}
|
||||
if (perf_status == PERF_STATUS_OK) {
|
||||
trampoline_api.free_state(trampoline_api.state);
|
||||
perf_trampoline_type = PERF_TRAMPOLINE_UNSET;
|
||||
}
|
||||
extra_code_index = -1;
|
||||
perf_status = PERF_STATUS_NO_INIT;
|
||||
@ -508,6 +547,9 @@ _PyPerfTrampoline_AfterFork_Child(void)
|
||||
{
|
||||
#ifdef PY_HAVE_PERF_TRAMPOLINE
|
||||
if (persist_after_fork) {
|
||||
if (perf_trampoline_type != PERF_TRAMPOLINE_TYPE_MAP) {
|
||||
return PyStatus_Error("Failed to copy perf map file as perf trampoline type is not type map.");
|
||||
}
|
||||
_PyPerfTrampoline_Fini();
|
||||
char filename[256];
|
||||
pid_t parent_pid = getppid();
|
||||
|
@ -1210,7 +1210,14 @@ init_interp_main(PyThreadState *tstate)
|
||||
|
||||
#ifdef PY_HAVE_PERF_TRAMPOLINE
|
||||
if (config->perf_profiling) {
|
||||
if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_callbacks) < 0 ||
|
||||
_PyPerf_Callbacks *cur_cb;
|
||||
if (config->perf_profiling == 1) {
|
||||
cur_cb = &_Py_perfmap_callbacks;
|
||||
}
|
||||
else {
|
||||
cur_cb = &_Py_perfmap_jit_callbacks;
|
||||
}
|
||||
if (_PyPerfTrampoline_SetCallbacks(cur_cb) < 0 ||
|
||||
_PyPerfTrampoline_Init(config->perf_profiling) < 0) {
|
||||
return _PyStatus_ERR("can't initialize the perf trampoline");
|
||||
}
|
||||
|
@ -2282,6 +2282,16 @@ sys_activate_stack_trampoline_impl(PyObject *module, const char *backend)
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else if (strcmp(backend, "perfjit") == 0) {
|
||||
_PyPerf_Callbacks cur_cb;
|
||||
_PyPerfTrampoline_GetCallbacks(&cur_cb);
|
||||
if (cur_cb.write_state != _Py_perfmap_jit_callbacks.write_state) {
|
||||
if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_jit_callbacks) < 0 ) {
|
||||
PyErr_SetString(PyExc_ValueError, "can't activate perf jit trampoline");
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
PyErr_Format(PyExc_ValueError, "invalid backend: %s", backend);
|
||||
|
@ -365,6 +365,8 @@ Python/intrinsics.c - _PyIntrinsics_BinaryFunctions -
|
||||
Python/lock.c - TIME_TO_BE_FAIR_NS -
|
||||
Python/opcode_targets.h - opcode_targets -
|
||||
Python/perf_trampoline.c - _Py_perfmap_callbacks -
|
||||
Python/perf_jit_trampoline.c - _Py_perfmap_jit_callbacks -
|
||||
Python/perf_jit_trampoline.c - perf_jit_map_state -
|
||||
Python/pyhash.c - PyHash_Func -
|
||||
Python/pylifecycle.c - _C_LOCALE_WARNING -
|
||||
Python/pylifecycle.c - _PyOS_mystrnicmp_hack -
|
||||
|
Can't render this file because it has a wrong number of fields in line 4.
|
Loading…
Reference in New Issue
Block a user