Skip to content

Commit 1bc9ff5

Browse files
committed
GH-48858: [C++][Parquet] Avoid re-serializing footer for signature verification
1 parent 34045db commit 1bc9ff5

File tree

3 files changed

+101
-70
lines changed

3 files changed

+101
-70
lines changed

cpp/src/parquet/file_reader.cc

Lines changed: 53 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ using arrow::internal::AddWithOverflow;
5555

5656
namespace parquet {
5757

58+
using ::arrow::Future;
59+
using ::arrow::Result;
60+
using ::arrow::Status;
61+
5862
namespace {
5963
bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) {
6064
// Check the encoding_stats to see if all data pages are dictionary encoded.
@@ -398,7 +402,7 @@ class SerializedFile : public ParquetFileReader::Contents {
398402
PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
399403
}
400404

401-
::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
405+
Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
402406
const std::vector<int>& row_groups, const std::vector<int>& column_indices,
403407
int64_t hole_size_limit, int64_t range_size_limit) {
404408
std::vector<::arrow::io::ReadRange> ranges;
@@ -413,10 +417,10 @@ class SerializedFile : public ParquetFileReader::Contents {
413417
range_size_limit);
414418
}
415419

416-
::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
417-
const std::vector<int>& column_indices) const {
420+
Future<> WhenBuffered(const std::vector<int>& row_groups,
421+
const std::vector<int>& column_indices) const {
418422
if (!cached_source_) {
419-
return ::arrow::Status::Invalid("Must call PreBuffer before WhenBuffered");
423+
return Status::Invalid("Must call PreBuffer before WhenBuffered");
420424
}
421425
std::vector<::arrow::io::ReadRange> ranges;
422426
for (int row : row_groups) {
@@ -465,23 +469,8 @@ class SerializedFile : public ParquetFileReader::Contents {
465469
// Fall through
466470
}
467471

468-
const uint32_t read_metadata_len = ParseUnencryptedFileMetadata(
469-
metadata_buffer, metadata_len, std::move(file_decryptor));
470-
auto file_decryption_properties = properties_.file_decryption_properties();
471-
if (is_encrypted_footer) {
472-
// Nothing else to do here.
473-
return;
474-
} else if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file.
475-
if (file_decryption_properties != nullptr) {
476-
if (!file_decryption_properties->plaintext_files_allowed()) {
477-
throw ParquetException("Applying decryption properties on plaintext file");
478-
}
479-
}
480-
} else {
481-
// Encrypted file with plaintext footer mode.
482-
ParseMetaDataOfEncryptedFileWithPlaintextFooter(
483-
file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
484-
}
472+
ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, is_encrypted_footer,
473+
std::move(file_decryptor));
485474
}
486475

487476
// Validate the source size and get the initial read size.
@@ -522,16 +511,15 @@ class SerializedFile : public ParquetFileReader::Contents {
522511
}
523512

524513
// Does not throw.
525-
::arrow::Future<> ParseMetaDataAsync() {
514+
Future<> ParseMetaDataAsync() {
526515
int64_t footer_read_size;
527516
BEGIN_PARQUET_CATCH_EXCEPTIONS
528517
footer_read_size = GetFooterReadSize();
529518
END_PARQUET_CATCH_EXCEPTIONS
530519
// Assumes this is kept alive externally
531520
return source_->ReadAsync(source_size_ - footer_read_size, footer_read_size)
532-
.Then([this,
533-
footer_read_size](const std::shared_ptr<::arrow::Buffer>& footer_buffer)
534-
-> ::arrow::Future<> {
521+
.Then([this, footer_read_size](
522+
const std::shared_ptr<::arrow::Buffer>& footer_buffer) -> Future<> {
535523
uint32_t metadata_len;
536524
BEGIN_PARQUET_CATCH_EXCEPTIONS
537525
metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
@@ -557,7 +545,7 @@ class SerializedFile : public ParquetFileReader::Contents {
557545
}
558546

559547
// Continuation
560-
::arrow::Future<> ParseMaybeEncryptedMetaDataAsync(
548+
Future<> ParseMaybeEncryptedMetaDataAsync(
561549
std::shared_ptr<::arrow::Buffer> footer_buffer,
562550
std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t footer_read_size,
563551
uint32_t metadata_len) {
@@ -580,26 +568,30 @@ class SerializedFile : public ParquetFileReader::Contents {
580568
file_decryptor = std::move(file_decryptor)](
581569
const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
582570
// Continue and read the file footer
583-
return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer,
584-
file_decryptor);
571+
BEGIN_PARQUET_CATCH_EXCEPTIONS
572+
ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer,
573+
file_decryptor);
574+
END_PARQUET_CATCH_EXCEPTIONS
575+
return Status::OK();
585576
});
586577
}
587-
return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len,
588-
is_encrypted_footer, std::move(file_decryptor));
578+
BEGIN_PARQUET_CATCH_EXCEPTIONS
579+
ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, is_encrypted_footer,
580+
std::move(file_decryptor));
581+
END_PARQUET_CATCH_EXCEPTIONS
582+
return Status::OK();
589583
}
590584

591585
// Continuation
592-
::arrow::Status ParseMetaDataFinal(
593-
std::shared_ptr<::arrow::Buffer> metadata_buffer, uint32_t metadata_len,
594-
const bool is_encrypted_footer,
595-
std::shared_ptr<InternalFileDecryptor> file_decryptor) {
596-
BEGIN_PARQUET_CATCH_EXCEPTIONS
586+
void ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer,
587+
uint32_t metadata_len, const bool is_encrypted_footer,
588+
std::shared_ptr<InternalFileDecryptor> file_decryptor) {
597589
const uint32_t read_metadata_len = ParseUnencryptedFileMetadata(
598590
metadata_buffer, metadata_len, std::move(file_decryptor));
599591
auto file_decryption_properties = properties_.file_decryption_properties();
600592
if (is_encrypted_footer) {
601593
// Nothing else to do here.
602-
return ::arrow::Status::OK();
594+
return;
603595
} else if (!file_metadata_->is_encryption_algorithm_set()) { // Non encrypted file.
604596
if (file_decryption_properties != nullptr) {
605597
if (!file_decryption_properties->plaintext_files_allowed()) {
@@ -611,8 +603,6 @@ class SerializedFile : public ParquetFileReader::Contents {
611603
ParseMetaDataOfEncryptedFileWithPlaintextFooter(
612604
file_decryption_properties, metadata_buffer, metadata_len, read_metadata_len);
613605
}
614-
END_PARQUET_CATCH_EXCEPTIONS
615-
return ::arrow::Status::OK();
616606
}
617607

618608
private:
@@ -704,23 +694,18 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
704694
EncryptionAlgorithm algo = file_metadata_->encryption_algorithm();
705695
// Handle AAD prefix
706696
std::string file_aad = HandleAadPrefix(file_decryption_properties, algo);
697+
// Set the InternalFileDecryptor in the metadata as well, as it's used
698+
// for signature verification and for ColumnChunkMetaData creation.
707699
auto file_decryptor = std::make_shared<InternalFileDecryptor>(
708700
file_decryption_properties, file_aad, algo.algorithm,
709701
file_metadata_->footer_signing_key_metadata(), properties_.memory_pool());
710-
// set the InternalFileDecryptor in the metadata as well, as it's used
711-
// for signature verification and for ColumnChunkMetaData creation.
712702
file_metadata_->set_file_decryptor(std::move(file_decryptor));
713703

714704
if (file_decryption_properties->check_plaintext_footer_integrity()) {
715-
if (metadata_len - read_metadata_len !=
716-
(parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength)) {
717-
throw ParquetInvalidOrCorruptedFileException(
718-
"Failed reading metadata for encryption signature (requested ",
719-
parquet::encryption::kGcmTagLength + parquet::encryption::kNonceLength,
720-
" bytes but have ", metadata_len - read_metadata_len, " bytes)");
721-
}
722-
723-
if (!file_metadata_->VerifySignature(metadata_buffer->data() + read_metadata_len)) {
705+
auto serialized_metadata =
706+
metadata_buffer->span_as<uint8_t>().subspan(0, read_metadata_len);
707+
auto signature = metadata_buffer->span_as<uint8_t>().subspan(read_metadata_len);
708+
if (!file_metadata_->VerifySignature(serialized_metadata, signature)) {
724709
throw ParquetInvalidOrCorruptedFileException(
725710
"Parquet crypto signature verification failed");
726711
}
@@ -804,7 +789,7 @@ std::unique_ptr<ParquetFileReader::Contents> ParquetFileReader::Contents::Open(
804789
return result;
805790
}
806791

807-
::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>
792+
Future<std::unique_ptr<ParquetFileReader::Contents>>
808793
ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
809794
const ReaderProperties& props,
810795
std::shared_ptr<FileMetaData> metadata) {
@@ -815,7 +800,7 @@ ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
815800
if (metadata == nullptr) {
816801
// TODO(ARROW-12259): workaround since we have Future<(move-only type)>
817802
struct {
818-
::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
803+
Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
819804
return std::move(result);
820805
}
821806

@@ -825,7 +810,7 @@ ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
825810
return file->ParseMetaDataAsync().Then(std::move(Continuation));
826811
} else {
827812
file->set_metadata(std::move(metadata));
828-
return ::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
813+
return Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
829814
std::move(result));
830815
}
831816
END_PARQUET_CATCH_EXCEPTIONS
@@ -855,24 +840,24 @@ std::unique_ptr<ParquetFileReader> ParquetFileReader::OpenFile(
855840
return Open(std::move(source), props, std::move(metadata));
856841
}
857842

858-
::arrow::Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
843+
Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
859844
std::shared_ptr<::arrow::io::RandomAccessFile> source, const ReaderProperties& props,
860845
std::shared_ptr<FileMetaData> metadata) {
861846
BEGIN_PARQUET_CATCH_EXCEPTIONS
862847
auto fut = SerializedFile::OpenAsync(std::move(source), props, std::move(metadata));
863848
// TODO(ARROW-12259): workaround since we have Future<(move-only type)>
864-
auto completed = ::arrow::Future<std::unique_ptr<ParquetFileReader>>::Make();
865-
fut.AddCallback([fut, completed](
866-
const ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>>&
867-
contents) mutable {
868-
if (!contents.ok()) {
869-
completed.MarkFinished(contents.status());
870-
return;
871-
}
872-
std::unique_ptr<ParquetFileReader> result = std::make_unique<ParquetFileReader>();
873-
result->Open(fut.MoveResult().MoveValueUnsafe());
874-
completed.MarkFinished(std::move(result));
875-
});
849+
auto completed = Future<std::unique_ptr<ParquetFileReader>>::Make();
850+
fut.AddCallback(
851+
[fut, completed](
852+
const Result<std::unique_ptr<ParquetFileReader::Contents>>& contents) mutable {
853+
if (!contents.ok()) {
854+
completed.MarkFinished(contents.status());
855+
return;
856+
}
857+
std::unique_ptr<ParquetFileReader> result = std::make_unique<ParquetFileReader>();
858+
result->Open(fut.MoveResult().MoveValueUnsafe());
859+
completed.MarkFinished(std::move(result));
860+
});
876861
return completed;
877862
END_PARQUET_CATCH_EXCEPTIONS
878863
}
@@ -919,7 +904,7 @@ void ParquetFileReader::PreBuffer(const std::vector<int>& row_groups,
919904
file->PreBuffer(row_groups, column_indices, ctx, options);
920905
}
921906

922-
::arrow::Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadRanges(
907+
Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadRanges(
923908
const std::vector<int>& row_groups, const std::vector<int>& column_indices,
924909
int64_t hole_size_limit, int64_t range_size_limit) {
925910
// Access private methods here
@@ -929,8 +914,8 @@ ::arrow::Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadR
929914
range_size_limit);
930915
}
931916

932-
::arrow::Future<> ParquetFileReader::WhenBuffered(
933-
const std::vector<int>& row_groups, const std::vector<int>& column_indices) const {
917+
Future<> ParquetFileReader::WhenBuffered(const std::vector<int>& row_groups,
918+
const std::vector<int>& column_indices) const {
934919
// Access private methods here
935920
SerializedFile* file =
936921
::arrow::internal::checked_cast<SerializedFile*>(contents_.get());

cpp/src/parquet/metadata.cc

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -834,6 +834,43 @@ class FileMetaData::FileMetaDataImpl {
834834
tag, encryption::kGcmTagLength);
835835
}
836836

837+
bool VerifySignature(std::span<const uint8_t> serialized_metadata,
838+
std::span<const uint8_t> signature) {
839+
// Verify decryption properties are set
840+
if (file_decryptor_ == nullptr) {
841+
throw ParquetException("Decryption not set properly. cannot verify signature");
842+
}
843+
844+
if (signature.size() != encryption::kGcmTagLength + encryption::kNonceLength) {
845+
throw ParquetInvalidOrCorruptedFileException(
846+
"Invalid footer encryption signature (expected ",
847+
encryption::kGcmTagLength + encryption::kNonceLength, " bytes, got ",
848+
signature.size(), ")");
849+
}
850+
851+
// Encrypt plaintext serialized metadata so as to compute its signature
852+
auto nonce = signature.subspan(0, encryption::kNonceLength);
853+
auto tag = signature.subspan(encryption::kNonceLength);
854+
const SecureString& key = file_decryptor_->GetFooterKey();
855+
const std::string& aad = encryption::CreateFooterAad(file_decryptor_->file_aad());
856+
857+
auto aes_encryptor = encryption::AesEncryptor::Make(file_decryptor_->algorithm(),
858+
static_cast<int>(key.size()),
859+
true, false /*write_length*/);
860+
861+
std::shared_ptr<Buffer> encrypted_buffer =
862+
AllocateBuffer(file_decryptor_->pool(),
863+
aes_encryptor->CiphertextLength(serialized_metadata.size()));
864+
int32_t encrypted_len = aes_encryptor->SignedFooterEncrypt(
865+
serialized_metadata, key.as_span(), str2span(aad), nonce,
866+
encrypted_buffer->mutable_span_as<uint8_t>());
867+
DCHECK_EQ(encrypted_len, encrypted_buffer->size());
868+
// Check computed signature against expected
869+
return 0 ==
870+
memcmp(encrypted_buffer->data() + encrypted_len - encryption::kGcmTagLength,
871+
tag.data(), encryption::kGcmTagLength);
872+
}
873+
837874
inline uint32_t size() const { return metadata_len_; }
838875
inline int num_columns() const { return schema_.num_columns(); }
839876
inline int64_t num_rows() const { return metadata_->num_rows; }
@@ -1083,6 +1120,11 @@ bool FileMetaData::VerifySignature(const void* signature) {
10831120
return impl_->VerifySignature(signature);
10841121
}
10851122

1123+
bool FileMetaData::VerifySignature(std::span<const uint8_t> serialized_metadata,
1124+
std::span<const uint8_t> signature) {
1125+
return impl_->VerifySignature(serialized_metadata, signature);
1126+
}
1127+
10861128
uint32_t FileMetaData::size() const { return impl_->size(); }
10871129

10881130
int FileMetaData::num_columns() const { return impl_->num_columns(); }

cpp/src/parquet/metadata.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <map>
2222
#include <memory>
2323
#include <optional>
24+
#include <span>
2425
#include <string>
2526
#include <vector>
2627

@@ -322,9 +323,12 @@ class PARQUET_EXPORT FileMetaData {
322323
EncryptionAlgorithm encryption_algorithm() const;
323324
const std::string& footer_signing_key_metadata() const;
324325

325-
/// \brief Verify signature of FileMetaData when file is encrypted but footer
326-
/// is not encrypted (plaintext footer).
326+
PARQUET_DEPRECATED("Deprecated in 24.0.0. Use the two-argument overload instead.")
327327
bool VerifySignature(const void* signature);
328+
/// \brief Verify footer signature when file is encrypted but footer is not
329+
/// encrypted (plaintext footer).
330+
bool VerifySignature(std::span<const uint8_t> serialized_metadata,
331+
std::span<const uint8_t> signature);
328332

329333
void WriteTo(::arrow::io::OutputStream* dst,
330334
const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;

0 commit comments

Comments
 (0)