Skip to content

Commit 6332e5c

Browse files
committed
Refactor NVMLProvider for error handling
1 parent 26e4309 commit 6332e5c

File tree

8 files changed

+274
-264
lines changed

8 files changed

+274
-264
lines changed

profiling/energy-profiler/kokkos-tools/kp_nvml_direct_power.cpp

Lines changed: 43 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,9 @@
3939
namespace KokkosTools {
4040
namespace DirectPower {
4141

42+
using EnergyProfiler::log_message;
43+
using EnergyProfiler::LogLevel;
4244
using EnergyProfiler::NVMLProvider;
43-
using EnergyProfiler::Result;
4445

4546
// --- Configuration ---
4647
// The interval in milliseconds for power sampling.
@@ -84,12 +85,11 @@ void power_monitoring_tick() {
8485

8586
// Collect power for each device
8687
for (size_t i = 0; i < g_device_count; ++i) {
87-
double power = 0.0;
88-
Result result = g_nvml_provider->get_device_power_usage_direct(i, power);
89-
if (!result) {
90-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_WARNING(
91-
COMPONENT_NAME, "Failed to get direct power for device " +
92-
std::to_string(i) + ": " + result.message);
88+
double power = 0.0;
89+
bool success = g_nvml_provider->get_device_power_usage_direct(i, power);
90+
if (!success) {
91+
log_message(LogLevel::WARNING, COMPONENT_NAME,
92+
"Failed to get direct power for device " + std::to_string(i));
9393
power = -1.0; // Use sentinel value for missing data
9494
}
9595
sample.device_powers_watts.push_back(power);
@@ -203,53 +203,52 @@ void export_direct_power_data_csv(const std::string& filename) {
203203
void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
204204
const uint32_t devInfoCount,
205205
Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) {
206-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(COMPONENT_NAME, "Initializing...");
207-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
208-
COMPONENT_NAME,
206+
log_message(LogLevel::INFO, COMPONENT_NAME, "Initializing...");
207+
log_message(
208+
LogLevel::INFO, COMPONENT_NAME,
209209
"Sampling Interval: " + std::to_string(SAMPLING_INTERVAL_MS) + " ms");
210210

211211
// Initialize the timer tool
212212
g_timer.init_library(loadSeq, interfaceVer, devInfoCount, deviceInfo);
213213

214-
g_nvml_provider = std::make_unique<NVMLProvider>();
215-
Result init_result = g_nvml_provider->initialize();
216-
if (!init_result) {
217-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_ERROR(
218-
COMPONENT_NAME,
219-
"Failed to initialize NVML provider: " + init_result.message +
220-
". Direct power profiling disabled.");
214+
g_nvml_provider = std::make_unique<NVMLProvider>();
215+
bool init_success = g_nvml_provider->initialize();
216+
if (!init_success) {
217+
log_message(
218+
LogLevel::ERROR, COMPONENT_NAME,
219+
"Failed to initialize NVML provider. Direct power profiling disabled.");
221220
g_nvml_provider.reset(); // Release the provider
222221
return;
223222
}
224223

225224
g_device_count = g_nvml_provider->get_device_count();
226-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
227-
COMPONENT_NAME, "NVML provider initialized with " +
228-
std::to_string(g_device_count) + " device(s)");
225+
log_message(LogLevel::INFO, COMPONENT_NAME,
226+
"NVML provider initialized with " +
227+
std::to_string(g_device_count) + " device(s)");
229228

230229
// Print device information
231230
for (size_t i = 0; i < g_device_count; ++i) {
232-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
233-
COMPONENT_NAME, "Device " + std::to_string(i) + ": " +
234-
g_nvml_provider->get_device_name(i));
231+
log_message(LogLevel::INFO, COMPONENT_NAME,
232+
"Device " + std::to_string(i) + ": " +
233+
g_nvml_provider->get_device_name(i));
235234
}
236235

237236
// Start the monitoring daemon
238237
g_power_daemon =
239238
std::make_unique<Daemon>(power_monitoring_tick, SAMPLING_INTERVAL_MS);
240239
g_start_time = std::chrono::high_resolution_clock::now();
241240
g_power_daemon->start();
242-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
243-
COMPONENT_NAME, "Direct power monitoring daemon started");
241+
log_message(LogLevel::INFO, COMPONENT_NAME,
242+
"Direct power monitoring daemon started");
244243
}
245244

246245
void kokkosp_finalize_library() {
247-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(COMPONENT_NAME, "Finalizing...");
246+
log_message(LogLevel::INFO, COMPONENT_NAME, "Finalizing...");
248247

249248
if (g_power_daemon) {
250249
g_power_daemon->stop();
251-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
252-
COMPONENT_NAME, "Direct power monitoring daemon stopped");
250+
log_message(LogLevel::INFO, COMPONENT_NAME,
251+
"Direct power monitoring daemon stopped");
253252
}
254253

255254
// Finalize the timer
@@ -300,19 +299,25 @@ void kokkosp_finalize_library() {
300299
std::string prefix = generate_prefix();
301300

302301
const auto& kernels = g_timer.get_kernel_timings();
303-
KokkosTools::EnergyProfiler::print_kernels_summary(kernels);
304-
KokkosTools::EnergyProfiler::export_kernels_csv(kernels,
305-
prefix + "_kernels.csv");
302+
KokkosTools::EnergyProfiler::print_timings_summary(
303+
kernels, KokkosTools::EnergyProfiler::DataCategory::Kernels);
304+
KokkosTools::EnergyProfiler::export_timings_csv(
305+
kernels, prefix + "_kernels.csv",
306+
KokkosTools::EnergyProfiler::DataCategory::Kernels);
306307

307308
const auto& regions = g_timer.get_region_timings();
308-
KokkosTools::EnergyProfiler::print_regions_summary(regions);
309-
KokkosTools::EnergyProfiler::export_regions_csv(regions,
310-
prefix + "_regions.csv");
309+
KokkosTools::EnergyProfiler::print_timings_summary(
310+
regions, KokkosTools::EnergyProfiler::DataCategory::Regions);
311+
KokkosTools::EnergyProfiler::export_timings_csv(
312+
regions, prefix + "_regions.csv",
313+
KokkosTools::EnergyProfiler::DataCategory::Regions);
311314

312315
const auto& deepcopies = g_timer.get_deep_copy_timings();
313-
KokkosTools::EnergyProfiler::print_deepcopies_summary(deepcopies);
314-
KokkosTools::EnergyProfiler::export_deepcopies_csv(
315-
deepcopies, prefix + "_deepcopies.csv");
316+
KokkosTools::EnergyProfiler::print_timings_summary(
317+
deepcopies, KokkosTools::EnergyProfiler::DataCategory::DeepCopies);
318+
KokkosTools::EnergyProfiler::export_timings_csv(
319+
deepcopies, prefix + "_deepcopies.csv",
320+
KokkosTools::EnergyProfiler::DataCategory::DeepCopies);
316321

317322
if (g_nvml_provider) {
318323
g_nvml_provider->finalize();
@@ -372,4 +377,4 @@ EXPOSE_POP_REGION(impl::kokkosp_pop_profile_region)
372377
EXPOSE_BEGIN_DEEP_COPY(impl::kokkosp_begin_deep_copy)
373378
EXPOSE_END_DEEP_COPY(impl::kokkosp_end_deep_copy)
374379

375-
} // extern "C"
380+
} // extern "C"

profiling/energy-profiler/kokkos-tools/kp_nvml_energy_consumption.cpp

Lines changed: 62 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,9 @@
4040
namespace KokkosTools {
4141
namespace EnergyConsumption {
4242

43-
using EnergyProfiler::ErrorCode;
43+
using EnergyProfiler::log_message;
44+
using EnergyProfiler::LogLevel;
4445
using EnergyProfiler::NVMLProvider;
45-
using EnergyProfiler::Result;
4646

4747
// --- Global State for the Profiler ---
4848
static constexpr const char* COMPONENT_NAME = "EnergyConsumption";
@@ -115,23 +115,21 @@ EnergySnapshot capture_energy_snapshot() {
115115
for (size_t i = 0; i < g_device_count; ++i) {
116116
snapshot.device_energies_joules.push_back(-1.0);
117117
}
118-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_WARNING(
119-
COMPONENT_NAME,
120-
"Provider not initialized, using invalid energy values");
118+
log_message(LogLevel::WARNING, COMPONENT_NAME,
119+
"Provider not initialized, using invalid energy values");
121120
return snapshot;
122121
}
123122

124123
// Collect energy for each device
125124
for (size_t i = 0; i < g_device_count; ++i) {
126125
double energy = 0.0;
127-
Result result = g_nvml_provider->get_current_energy_consumption(i, energy);
128-
if (result) {
126+
bool success = g_nvml_provider->get_current_energy_consumption(i, energy);
127+
if (success) {
129128
snapshot.device_energies_joules.push_back(energy);
130129
} else {
131130
snapshot.device_energies_joules.push_back(-1.0);
132-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_WARNING(
133-
COMPONENT_NAME, "Failed to get energy for device " +
134-
std::to_string(i) + ": " + result.message);
131+
log_message(LogLevel::WARNING, COMPONENT_NAME,
132+
"Failed to get energy for device " + std::to_string(i));
135133
}
136134
}
137135

@@ -175,12 +173,12 @@ double calculate_duration_seconds(const EnergySnapshot& start,
175173
/**
176174
* @brief Exports energy consumption data to CSV file with error handling
177175
*/
178-
Result export_energy_consumption_csv(const std::string& filename) {
176+
bool export_energy_consumption_csv(const std::string& filename) {
179177
std::ofstream file(filename);
180178
if (!file.is_open()) {
181-
std::string error_msg = "Unable to open file " + filename + " for writing";
182-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_ERROR(COMPONENT_NAME, error_msg);
183-
return Result(ErrorCode::FILE_WRITE_FAILED, error_msg);
179+
log_message(LogLevel::ERROR, COMPONENT_NAME,
180+
"Unable to open file " + filename + " for writing");
181+
return false;
184182
}
185183

186184
try {
@@ -235,20 +233,20 @@ Result export_energy_consumption_csv(const std::string& filename) {
235233
file.close();
236234

237235
if (file.fail()) {
238-
std::string error_msg = "Failed to write data to file " + filename;
239-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_ERROR(COMPONENT_NAME, error_msg);
240-
return Result(ErrorCode::FILE_WRITE_FAILED, error_msg);
236+
log_message(LogLevel::ERROR, COMPONENT_NAME,
237+
"Failed to write data to file " + filename);
238+
return false;
241239
}
242240

243-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
244-
COMPONENT_NAME, "Energy consumption data exported to " + filename);
245-
return Result(ErrorCode::SUCCESS);
241+
log_message(LogLevel::INFO, COMPONENT_NAME,
242+
"Energy consumption data exported to " + filename);
243+
return true;
246244

247245
} catch (const std::exception& e) {
248-
std::string error_msg =
249-
"Exception while writing to file " + filename + ": " + e.what();
250-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_ERROR(COMPONENT_NAME, error_msg);
251-
return Result(ErrorCode::FILE_WRITE_FAILED, error_msg);
246+
log_message(
247+
LogLevel::ERROR, COMPONENT_NAME,
248+
"Exception while writing to file " + filename + ": " + e.what());
249+
return false;
252250
}
253251
}
254252

@@ -323,41 +321,40 @@ void print_energy_summary() {
323321
void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
324322
const uint32_t devInfoCount,
325323
Kokkos_Profiling_KokkosPDeviceInfo* deviceInfo) {
326-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(COMPONENT_NAME, "Initializing...");
324+
log_message(LogLevel::INFO, COMPONENT_NAME, "Initializing...");
327325

328326
// Initialize the timer tool
329327
g_timer.init_library(loadSeq, interfaceVer, devInfoCount, deviceInfo);
330328

331-
g_nvml_provider = std::make_unique<NVMLProvider>();
332-
Result init_result = g_nvml_provider->initialize();
333-
if (!init_result) {
334-
std::string error_msg =
335-
"Failed to initialize NVML provider: " + init_result.message;
336-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_ERROR(
337-
COMPONENT_NAME, error_msg + ". Energy consumption profiling disabled.");
329+
g_nvml_provider = std::make_unique<NVMLProvider>();
330+
bool init_success = g_nvml_provider->initialize();
331+
if (!init_success) {
332+
log_message(LogLevel::ERROR, COMPONENT_NAME,
333+
"Failed to initialize NVML provider. Energy consumption "
334+
"profiling disabled.");
338335
g_nvml_provider.reset(); // Release the provider
339336
return;
340337
}
341338

342339
g_device_count = g_nvml_provider->get_device_count();
343-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
344-
COMPONENT_NAME, "NVML provider initialized with " +
345-
std::to_string(g_device_count) + " device(s)");
340+
log_message(LogLevel::INFO, COMPONENT_NAME,
341+
"NVML provider initialized with " +
342+
std::to_string(g_device_count) + " device(s)");
346343

347344
// Print device information
348345
for (size_t i = 0; i < g_device_count; ++i) {
349-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
350-
COMPONENT_NAME, "Device " + std::to_string(i) + ": " +
351-
g_nvml_provider->get_device_name(i));
346+
log_message(LogLevel::INFO, COMPONENT_NAME,
347+
"Device " + std::to_string(i) + ": " +
348+
g_nvml_provider->get_device_name(i));
352349
}
353350

354351
g_start_time = std::chrono::high_resolution_clock::now();
355-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
356-
COMPONENT_NAME, "Energy consumption monitoring initialized");
352+
log_message(LogLevel::INFO, COMPONENT_NAME,
353+
"Energy consumption monitoring initialized");
357354
}
358355

359356
void kokkosp_finalize_library() {
360-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(COMPONENT_NAME, "Finalizing...");
357+
log_message(LogLevel::INFO, COMPONENT_NAME, "Finalizing...");
361358

362359
// Finalize the timer
363360
g_timer.finalize_library();
@@ -366,8 +363,8 @@ void kokkosp_finalize_library() {
366363
auto total_duration_s =
367364
std::chrono::duration<double>(end_time - g_start_time).count();
368365

369-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
370-
COMPONENT_NAME,
366+
log_message(
367+
LogLevel::INFO, COMPONENT_NAME,
371368
"Total Monitoring Duration: " + std::to_string(total_duration_s) + " s");
372369

373370
print_energy_summary();
@@ -376,35 +373,39 @@ void kokkosp_finalize_library() {
376373

377374
// Export energy data
378375
std::string csv_filename = prefix + "_nvml_energy_consumption.csv";
379-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(
380-
COMPONENT_NAME, "Exporting energy consumption data to " + csv_filename);
381-
Result export_result = export_energy_consumption_csv(csv_filename);
382-
if (!export_result) {
383-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_ERROR(
384-
COMPONENT_NAME,
385-
"Failed to export energy data: " + export_result.message);
376+
log_message(LogLevel::INFO, COMPONENT_NAME,
377+
"Exporting energy consumption data to " + csv_filename);
378+
bool export_success = export_energy_consumption_csv(csv_filename);
379+
if (!export_success) {
380+
log_message(LogLevel::ERROR, COMPONENT_NAME,
381+
"Failed to export energy data");
386382
}
387383

388384
// Export timing data
389385
const auto& kernels = g_timer.get_kernel_timings();
390-
KokkosTools::EnergyProfiler::print_kernels_summary(kernels);
391-
KokkosTools::EnergyProfiler::export_kernels_csv(kernels,
392-
prefix + "_kernels.csv");
386+
KokkosTools::EnergyProfiler::print_timings_summary(
387+
kernels, KokkosTools::EnergyProfiler::DataCategory::Kernels);
388+
KokkosTools::EnergyProfiler::export_timings_csv(
389+
kernels, prefix + "_kernels.csv",
390+
KokkosTools::EnergyProfiler::DataCategory::Kernels);
393391

394392
const auto& regions = g_timer.get_region_timings();
395-
KokkosTools::EnergyProfiler::print_regions_summary(regions);
396-
KokkosTools::EnergyProfiler::export_regions_csv(regions,
397-
prefix + "_regions.csv");
393+
KokkosTools::EnergyProfiler::print_timings_summary(
394+
regions, KokkosTools::EnergyProfiler::DataCategory::Regions);
395+
KokkosTools::EnergyProfiler::export_timings_csv(
396+
regions, prefix + "_regions.csv",
397+
KokkosTools::EnergyProfiler::DataCategory::Regions);
398398

399399
const auto& deepcopies = g_timer.get_deep_copy_timings();
400-
KokkosTools::EnergyProfiler::print_deepcopies_summary(deepcopies);
401-
KokkosTools::EnergyProfiler::export_deepcopies_csv(
402-
deepcopies, prefix + "_deepcopies.csv");
400+
KokkosTools::EnergyProfiler::print_timings_summary(
401+
deepcopies, KokkosTools::EnergyProfiler::DataCategory::DeepCopies);
402+
KokkosTools::EnergyProfiler::export_timings_csv(
403+
deepcopies, prefix + "_deepcopies.csv",
404+
KokkosTools::EnergyProfiler::DataCategory::DeepCopies);
403405

404406
if (g_nvml_provider) {
405407
g_nvml_provider->finalize();
406-
KOKKOS_TOOLS_ENERGY_PROFILER_LOG_INFO(COMPONENT_NAME,
407-
"NVML provider finalized");
408+
log_message(LogLevel::INFO, COMPONENT_NAME, "NVML provider finalized");
408409
}
409410
}
410411

0 commit comments

Comments
 (0)