diff --git a/teuthology/task/internal/__init__.py b/teuthology/task/internal/__init__.py
index 15b8f81f54..bc63e265f0 100644
--- a/teuthology/task/internal/__init__.py
+++ b/teuthology/task/internal/__init__.py
@@ -299,46 +299,153 @@ def check_conflict(ctx, config):
             raise RuntimeError('Stale jobs detected, aborting.')


-def fetch_binaries_for_coredumps(path, remote):
+def fetch_binaries_for_coredumps(path: str, remote: remote.Remote) -> None:
     """
-    Pul ELFs (debug and stripped) for each coredump found
+    Pull ELFs (debug and stripped) for each coredump found.
+
+    The coredumps may be compressed, either with gzip or with zstd (CentOS 9).
+    The following are examples of the output of the 'file' command:
+    # 1422917770.7450.core: ELF 64-bit LSB core file x86-64, version 1 (SYSV), SVR4-style, \
+    # from 'radosgw --rgw-socket-path /home/ubuntu/cephtest/apache/tmp.client.0/fastcgi_soc'
+    # CentOS 9:
+    # core.ceph_test_neora.0.fb62b98.zst: Zstandard compressed data (v0.8+), Dictionary ID: None
+    # ELF 64-bit LSB core file, x86-64, version 1 (SYSV), SVR4-style, \
+    # from 'bin/ceph_test_neorados_snapshots --gtest_break_on_failure', real uid: 0, \
+    # effective uid: 0, real gid: 0, effective gid: 0, execfn: 'bin/ceph_test_neorados_snapshots', platform: 'x86_64'
     """
+    def _is_core_gzipped(dump_path: str) -> bool:
+        """
+        Return True if the core file is gzip-compressed
+        """
+        with open(dump_path, 'rb') as f:
+            magic = f.read(2)
+        return magic == b'\x1f\x8b'
+
+    def _is_core_zstded(dump_path: str) -> bool:
+        """
+        Return True if the core file is zstd-compressed
+        """
+        with open(dump_path, 'rb') as f:
+            magic = f.read(4)
+        return magic == b'\x28\xb5\x2f\xfd'
+
+    # Auxiliary dict of compression types: a callback to check the compression
+    # type, the command used to uncompress, and a regex matching 'file' output
+    csdict = {
+        'gzip': {
+            'check': _is_core_gzipped,
+            'uncompress': ['gzip', '-d'],
+            'regex': r'.*gzip compressed data.*'
+        },
+        'zstd': {
+            'check': _is_core_zstded,
+            'uncompress': ['zstd', '-d'],
+            'regex': r'.*Zstandard compressed data.*'
+        }
+    }
+
+    def _get_compressed_type(dump_path: str) -> str | None:
+        """
+        Identify the compression type of the core file from its magic bytes
+        """
+        for ck, cs in csdict.items():
+            if cs['check'](dump_path):
+                return ck
+        return None
+
+    def _looks_compressed(dump_out: str) -> bool:
+        """
+        Tell from the 'file' output whether the core file looks compressed
+        """
+        for cs in csdict.values():
+            if re.match(cs['regex'], dump_out):
+                return True
+        return False
+
+    def _uncompress_file(dump_path: str, cs_type: str | None) -> str | None:
+        """
+        Uncompress the core file on the remote machine, based on its compression type
+        """
+        if cs_type is None:
+            return None
+        # Construct the command to uncompress the file based on its type
+        try:
+            cmd = csdict[cs_type]['uncompress'] + [dump_path]
+            log.info(f'Uncompressing via {cmd} on remote ...')
+            remote.sh(cmd)
+            # After uncompressing, the new file path is the original path without the compression suffix
+            uncompressed_path = dump_path.rsplit('.', 1)[0]
+            log.info(f'Uncompressed file path: {uncompressed_path}')
+            return uncompressed_path
+        except Exception as e:
+            log.info('Something went wrong while attempting to uncompress the file')
+            log.error(e)
+            return None
+
+    def _get_file_info(dump_path: str) -> str:
+        """
+        Get the 'file' command output for the core file
+        """
+        dump_info = remote.sh(['file', dump_path]).rstrip()
+        return dump_info
+
+    def get_backtraces_from_coredumps(coredump_path: str, dump_path: str,
+                                      dump_program: str, dump: str) -> None:
+        """
+        Get a backtrace from a coredump found in path.
+        In a future iteration, this could be expanded to inject gdb commands from the
+        test plan yaml. An alternative would be to craft a bash script that runs on the
+        remote machine and wraps the uncompression, pipes the core to gdb and collects
+        the backtraces; see for example internal/vm_setup.py, which executes
+        edit_sudoers.sh on the remote.
+        """
+        gdb_output_path = os.path.join(coredump_path,
+                                       dump + '.gdb.txt')
+        log.info(f'Getting backtrace from core {dump} on remote ...')
+        cmd = ['gdb', '--batch', '-ex', 'set pagination 0',
+               '-ex', 'thread apply all bt full',
+               dump_program, dump_path]
+        gdb_out = remote.sh(cmd).rstrip()
+        with open(gdb_output_path, 'w') as gdb_out_file:
+            gdb_out_file.write(gdb_out)
+        log.info(f"core {dump} backtrace saved to {gdb_output_path}")
+
+    # Check for coredumps:
     coredump_path = os.path.join(path, 'coredump')
     if os.path.isdir(coredump_path):
         log.info('Transferring binaries for coredumps...')
         for dump in os.listdir(coredump_path):
-            # Pull program from core file
+            # Pull the program (that dropped the core) from the core file info
             dump_path = os.path.join(coredump_path, dump)
-            dump_info = subprocess.Popen(['file', dump_path],
-                                         stdout=subprocess.PIPE)
-            dump_out = dump_info.communicate()[0].decode()
-
-            # Parse file output to get program, Example output:
-            # 1422917770.7450.core: ELF 64-bit LSB core file x86-64, version 1 (SYSV), SVR4-style, \
-            # from 'radosgw --rgw-socket-path /home/ubuntu/cephtest/apache/tmp.client.0/fastcgi_soc'
+            dump_out = _get_file_info(dump_path)
             log.info(f' core looks like: {dump_out}')

-            if 'gzip' in dump_out:
+            if _looks_compressed(dump_out):
+                # If the core is compressed, recognise the type and uncompress it
+                cs_type = _get_compressed_type(dump_path)
                 try:
-                    log.info("core is compressed, try accessing gzip file ...")
-                    with gzip.open(dump_path, 'rb') as f_in, \
-                            tempfile.NamedTemporaryFile(mode='w+b') as f_out:
-                        shutil.copyfileobj(f_in, f_out)
-                        dump_info = subprocess.Popen(['file', f_out.name],
-                                                     stdout=subprocess.PIPE)
-                        dump_out = dump_info.communicate()[0].decode()
-                        log.info(f' core looks like: {dump_out}')
+                    log.info(f"core is compressed ({cs_type}), uncompressing ...")
+                    uncompressed_path = _uncompress_file(dump_path, cs_type)
+                    if uncompressed_path is None:
+                        log.info(f"Could not uncompress {dump}, moving on ...")
+                        continue
                 except Exception as e:
                     log.info('Something went wrong while opening the compressed file')
                     log.error(e)
                     continue
+                dump_path = uncompressed_path
+                dump_out = _get_file_info(dump_path)
+                log.info(f' after uncompressing, core looks like: {dump_out}')

             try:
                 dump_command = re.findall("from '([^ ']+)", dump_out)[0]
                 dump_program = dump_command.split()[0]
                 log.info(f' dump_program: {dump_program}')
             except Exception as e:
-                log.info("core doesn't have the desired format, moving on ...")
+                log.info(f"core {dump} doesn't have the desired format, moving on ...")
                 log.error(e)
                 continue
@@ -362,7 +469,23 @@ def fetch_binaries_for_coredumps(path, remote):
                 debug_path = '{debug_path}.debug'.format(debug_path=debug_path)
             remote.get_file(debug_path, coredump_path)
-
+            # If debug symbols were found, rename them to match the binary
+            debug_filename = os.path.basename(debug_path)
+            local_debug_path = os.path.join(coredump_path, debug_filename)
+            if os.path.exists(local_debug_path):
+                new_debug_path = os.path.join(
+                    coredump_path,
+                    dump_program.lstrip(os.path.sep) + '.debug'
+                )
+                os.rename(local_debug_path, new_debug_path)
+
+            # Execute gdb to get the backtrace and locals
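+            # (The helper runs gdb in batch mode on the remote: 'set pagination 0'
+            # disables the interactive pager and 'thread apply all bt full' prints a
+            # backtrace with local variables for every thread; the output is written
+            # next to the core as '<core>.gdb.txt'.)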
+            get_backtraces_from_coredumps(coredump_path, dump_path,
+                                          dump_program, dump)
+            # Always compress the core file to save space
+            with open(dump_path, 'rb') as f_in, \
+                    gzip.open(dump_path + '.gz', 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)


 def gzip_if_too_large(compress_min_size, src, tarinfo, local_path):
     if tarinfo.size >= compress_min_size:
@@ -521,6 +644,17 @@ def coredump(ctx, config):
             if 'failure_reason' not in ctx.summary:
                 ctx.summary['failure_reason'] = \
                     'Found coredumps on {rem}'.format(rem=rem)
+                # Add the backtraces of the coredumps to the failure reason
+                coredump_path = os.path.join(archive_dir, 'coredump')
+                for dump in os.listdir(coredump_path):
+                    if dump.endswith('.gdb.txt'):
+                        with open(os.path.join(coredump_path, dump), 'r') as f:
+                            backtrace = f.read()
+                        ctx.summary['failure_reason'] += \
+                            '\n\nBacktrace from core {dump}:\n{bt}'.format(
+                                dump=dump,
+                                bt=backtrace
+                            )


 @contextlib.contextmanager
diff --git a/teuthology/task/tests/test_fetch_coredumps.py b/teuthology/task/tests/test_fetch_coredumps.py
index 2a9a7de094..f509bb7d2d 100644
--- a/teuthology/task/tests/test_fetch_coredumps.py
+++ b/teuthology/task/tests/test_fetch_coredumps.py
@@ -1,4 +1,5 @@
 from teuthology.task.internal import fetch_binaries_for_coredumps
+from teuthology.task.internal import get_backtraces_from_coredumps
 from unittest.mock import patch, Mock
 import gzip
 import os
@@ -19,6 +20,36 @@ def __init__(self, ret):
         def communicate(self, input=None):
             return [TestFetchCoreDumps.MockDecode(self.ret)]

+    class MockCompletedProcess(object):
+        def __init__(self, ret):
+            self.ret = ret
+
+        @property
+        def stdout(self):
+            return self.ret
+
+    class MockGdb(object):
+        def __init__(self, ret):
+            self.ret = ret
+
+        def run(self, *args, **kwargs):
+            return TestFetchCoreDumps.MockCompletedProcess(self.ret)
+
+    class TestGetBacktracesFromCoreDumps(object):
+        @patch('teuthology.task.internal.subprocess.run')
+        def test_get_backtraces_from_coredumps(self, mock_run):
+            mock_run.return_value = TestFetchCoreDumps.MockCompletedProcess(
+                "Backtrace line 1\nBacktrace line 2\nBacktrace line 3\n"
+            )
+            backtraces = get_backtraces_from_coredumps(coredump_path="core_dump_path", dump_path="binary_path",
+                                                       dump_program="ceph_test_rados_api_io", dump="core_dump")
+            expected_backtraces = [
+                "Backtrace line 1",
+                "Backtrace line 2",
+                "Backtrace line 3"
+            ]
+            assert backtraces == expected_backtraces
+
     def setup_method(self):
         self.the_function = fetch_binaries_for_coredumps
         with gzip.open('file.gz', 'wb') as f:
@@ -44,6 +75,20 @@ def setup_method(self):
             " 19:56:56 2022, from Unix, original size modulo 2^32 11"
         )

+        # CentOS 9 coredumps are zstd-compressed:
+        self.zstd_compressed_correct = self.MockPopen(
+            "Zstandard compressed data, was "\
+            "'correct.format.core', last modified: Wed Jun 29"\
+            " 19:55:29 2022, from Unix, original size modulo 2^32 3167080"
+        )
+
+        self.zstd_compressed_incorrect = self.MockPopen(
+            "Zstandard compressed data, was "\
+            "'incorrect.format.core', last modified: Wed Jun 29"\
+            " 19:56:56 2022, from Unix, original size modulo 2^32 11"
+        )
+
     # Core is not compressed and file is in the correct format
     @patch('teuthology.task.internal.subprocess.Popen')
     @patch('teuthology.task.internal.os')
@@ -112,5 +157,39 @@ def test_compressed_incorrect_format(self, m_os, m_subproc_popen):
         self.the_function(None, self.m_remote)
         assert self.m_remote.get_file.called == False

+    # Core is zstd-compressed and file is in the correct format
+    @patch('teuthology.task.internal.subprocess.Popen')
+    @patch('teuthology.task.internal.os')
+    def test_zstd_compressed_correct_format(self, m_os, m_subproc_popen):
+        m_subproc_popen.side_effect = [
+            self.zstd_compressed_correct,
+            self.uncompressed_correct
+        ]
+        m_os.path.join.return_value = self.core_dump_path
+        m_os.path.sep = self.core_dump_path
+        m_os.path.isdir.return_value = True
+        m_os.path.dirname.return_value = self.core_dump_path
+        m_os.path.exists.return_value = True
+        m_os.listdir.return_value = [self.core_dump_path]
+        self.the_function(None, self.m_remote)
+        assert self.m_remote.get_file.called
+
+    # Core is zstd-compressed and file is in the wrong format
+    @patch('teuthology.task.internal.subprocess.Popen')
+    @patch('teuthology.task.internal.os')
+    def test_zstd_compressed_incorrect_format(self, m_os, m_subproc_popen):
+        m_subproc_popen.side_effect = [
+            self.zstd_compressed_incorrect,
+            self.uncompressed_incorrect
+        ]
+        m_os.path.join.return_value = self.core_dump_path
+        m_os.path.sep = self.core_dump_path
+        m_os.path.isdir.return_value = True
+        m_os.path.dirname.return_value = self.core_dump_path
+        m_os.path.exists.return_value = True
+        m_os.listdir.return_value = [self.core_dump_path]
+        self.the_function(None, self.m_remote)
+        assert self.m_remote.get_file.called == False
+
     def teardown(self):
-        os.remove(self.core_dump_path)
\ No newline at end of file
+        os.remove(self.core_dump_path)
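For reference, below is a minimal, self-contained sketch of the magic-byte detection that the _is_core_gzipped/_is_core_zstded helpers rely on: gzip streams start with bytes 1f 8b and zstd frames with 28 b5 2f fd, as used in the patch. The detect_core_compression name, the MAGIC_PREFIXES table, and the command-line usage are illustrative only and are not part of this change.

#!/usr/bin/env python3
"""Standalone sketch: classify a (possibly compressed) core file by its magic bytes."""
import sys

# Magic prefixes checked by the helpers in this change:
# gzip streams start with 0x1f 0x8b, zstd frames with 0x28 0xb5 0x2f 0xfd.
MAGIC_PREFIXES = {
    'gzip': b'\x1f\x8b',
    'zstd': b'\x28\xb5\x2f\xfd',
}


def detect_core_compression(path: str) -> str | None:
    """Return 'gzip', 'zstd', or None for an uncompressed core."""
    with open(path, 'rb') as f:
        header = f.read(4)  # the longest magic checked here is 4 bytes
    for name, magic in MAGIC_PREFIXES.items():
        if header.startswith(magic):
            return name
    return None


if __name__ == '__main__':
    # Usage: python3 detect_core_compression.py <core> [<core> ...]
    for path in sys.argv[1:]:
        print(f'{path}: {detect_core_compression(path) or "not compressed"}')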