Fix the schema transform to not rely on frame views (#2180)

jviotti · web-flow · commit 8fb7ba6f57bf · 2026-01-08T13:53:05.000-04:00
Signed-off-by: Juan Cruz Viotti &lt;jv@jviotti.com&gt;
diff --git a/src/core/jsonschema/frame.cc b/src/core/jsonschema/frame.cc
@@ -429,6 +429,7 @@ auto SchemaFrame::analyse(const JSON &root, const SchemaWalker &walker,
                           std::string_view default_dialect,
                           std::string_view default_id,
                           const SchemaFrame::Paths &paths) -> void {
+  this->reset();
   assert(std::unordered_set<WeakPointer>(paths.cbegin(), paths.cend()).size() ==
          paths.size());
   std::vector<InternalEntry> subschema_entries;
@@ -1056,10 +1057,21 @@ auto SchemaFrame::traverse(const std::string_view uri) const
   return std::nullopt;
 }
 
+auto SchemaFrame::traverse(const Pointer &pointer) const
+    -> std::optional<std::reference_wrapper<const Location>> {
+  // TODO: This is slow. Consider adding a pointer-indexed secondary
+  // lookup structure to SchemaFrame
+  for (const auto &entry : this->locations_) {
+    if (entry.second.pointer == pointer) {
+      return entry.second;
+    }
+  }
+
+  return std::nullopt;
+}
+
 auto SchemaFrame::uri(const Pointer &pointer) const
     -> std::optional<std::reference_wrapper<const JSON::String>> {
-  // TODO: This is potentially very slow. Traversing by pointer shouldn't
-  // require an O(N) operation
   for (const auto &entry : this->locations_) {
     if (entry.second.pointer == pointer) {
       return entry.first.second;
@@ -1180,4 +1192,14 @@ auto SchemaFrame::relative_instance_location(const Location &location) const
   return location.pointer.slice(location.relative_pointer);
 }
 
+auto SchemaFrame::empty() const noexcept -> bool {
+  return this->locations_.empty() && this->references_.empty();
+}
+
+auto SchemaFrame::reset() -> void {
+  this->root_.clear();
+  this->locations_.clear();
+  this->references_.clear();
+}
+
 } // namespace sourcemeta::core
diff --git a/src/core/jsonschema/include/sourcemeta/core/jsonschema_frame.h b/src/core/jsonschema/include/sourcemeta/core/jsonschema_frame.h
@@ -180,6 +180,10 @@ class SOURCEMETA_CORE_JSONSCHEMA_EXPORT SchemaFrame {
   [[nodiscard]] auto traverse(const std::string_view uri) const
       -> std::optional<std::reference_wrapper<const Location>>;
 
+  /// Get the location associated with a given pointer
+  [[nodiscard]] auto traverse(const Pointer &pointer) const
+      -> std::optional<std::reference_wrapper<const Location>>;
+
   /// Turn an absolute pointer into a location URI
   [[nodiscard]] auto uri(const Pointer &pointer) const
       -> std::optional<std::reference_wrapper<const JSON::String>>;
@@ -212,6 +216,12 @@ class SOURCEMETA_CORE_JSONSCHEMA_EXPORT SchemaFrame {
   [[nodiscard]] auto relative_instance_location(const Location &location) const
       -> Pointer;
 
+  /// Check if the frame has no analysed data
+  [[nodiscard]] auto empty() const noexcept -> bool;
+
+  /// Reset the frame, clearing all analysed data
+  auto reset() -> void;
+
 private:
   Mode mode_;
 // Exporting symbols that depends on the standard C++ library is considered
diff --git a/src/core/jsonschema/include/sourcemeta/core/jsonschema_transform.h b/src/core/jsonschema/include/sourcemeta/core/jsonschema_transform.h
@@ -109,13 +109,6 @@ class SOURCEMETA_CORE_JSONSCHEMA_EXPORT SchemaTransformRule {
     std::optional<JSON::String> description;
   };
 
-  /// Apply the rule to a schema
-  auto apply(JSON &schema, const JSON &root, const Vocabularies &vocabularies,
-             const SchemaWalker &walker, const SchemaResolver &resolver,
-             const SchemaFrame &frame,
-             const SchemaFrame::Location &location) const
-      -> std::pair<bool, Result>;
-
   /// Check if the rule applies to a schema
   [[nodiscard]] auto
   check(const JSON &schema, const JSON &root, const Vocabularies &vocabularies,
@@ -129,7 +122,6 @@ class SOURCEMETA_CORE_JSONSCHEMA_EXPORT SchemaTransformRule {
   rereference(const std::string_view reference, const Pointer &origin,
               const Pointer &target, const Pointer &current) const -> Pointer;
 
-private:
   /// The rule condition
   [[nodiscard]] virtual auto
   condition(const JSON &schema, const JSON &root,
@@ -141,6 +133,7 @@ class SOURCEMETA_CORE_JSONSCHEMA_EXPORT SchemaTransformRule {
   /// then the rule condition is considered to not be fixable.
   virtual auto transform(JSON &schema, const Result &result) const -> void;
 
+private:
 // Exporting symbols that depends on the standard C++ library is considered
 // safe.
 // https://learn.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4275?view=msvc-170&redirectedfrom=MSDN
diff --git a/src/core/jsonschema/transformer.cc b/src/core/jsonschema/transformer.cc
@@ -3,14 +3,26 @@
 
 #include <algorithm>     // std::erase_if
 #include <cassert>       // assert
-#include <set>           // std::set
+#include <functional>    // std::hash
 #include <sstream>       // std::ostringstream
 #include <tuple>         // std::tuple
 #include <unordered_set> // std::unordered_set
 #include <utility>       // std::move, std::pair
+#include <vector>        // std::vector
 
 namespace {
 
+struct ProcessedRuleHasher {
+  auto
+  operator()(const std::tuple<const sourcemeta::core::JSON *, std::string_view,
+                              std::uint64_t> &value) const noexcept
+      -> std::size_t {
+    return std::hash<const void *>{}(std::get<0>(value)) ^
+           (std::hash<std::string_view>{}(std::get<1>(value)) << 1) ^
+           (std::hash<std::uint64_t>{}(std::get<2>(value)) << 2);
+  }
+};
+
 auto calculate_health_percentage(const std::size_t subschemas,
                                  const std::size_t failed_subschemas)
     -> std::uint8_t {
@@ -56,40 +68,6 @@ auto SchemaTransformRule::rereference(const std::string_view reference,
                                    "The reference broke after transformation");
 }
 
-auto SchemaTransformRule::apply(JSON &schema, const JSON &root,
-                                const Vocabularies &vocabularies,
-                                const SchemaWalker &walker,
-                                const SchemaResolver &resolver,
-                                const SchemaFrame &frame,
-                                const SchemaFrame::Location &location) const
-    -> std::pair<bool, Result> {
-  auto outcome{this->condition(schema, root, vocabularies, frame, location,
-                               walker, resolver)};
-  if (!outcome.applies) {
-    return {true, std::move(outcome)};
-  }
-
-  try {
-    this->transform(schema, outcome);
-  } catch (const SchemaAbortError &) {
-    return {false, std::move(outcome)};
-  }
-
-  // The condition must always be false after applying the
-  // transformation in order to avoid infinite loops
-  if (this->condition(schema, root, vocabularies, frame, location, walker,
-                      resolver)
-          .applies) {
-    // TODO: Throw a better custom error that also highlights the schema
-    // location
-    std::ostringstream error;
-    error << "Rule condition holds after application: " << this->name();
-    throw std::runtime_error(error.str());
-  }
-
-  return {true, std::move(outcome)};
-}
-
 auto SchemaTransformRule::check(const JSON &schema, const JSON &root,
                                 const Vocabularies &vocabularies,
                                 const SchemaWalker &walker,
@@ -163,22 +141,37 @@ auto SchemaTransformer::apply(JSON &schema, const SchemaWalker &walker,
                               std::string_view default_dialect,
                               std::string_view default_id) const
     -> std::pair<bool, std::uint8_t> {
-  // There is no point in applying an empty bundle
   assert(!this->rules.empty());
-  std::set<std::tuple<const JSON *, std::string_view, std::uint64_t>>
+  std::unordered_set<std::tuple<const JSON *, std::string_view, std::uint64_t>,
+                     ProcessedRuleHasher>
       processed_rules;
 
   bool result{true};
   std::size_t subschema_count{0};
   std::size_t subschema_failures{0};
+
+  SchemaFrame frame{SchemaFrame::Mode::References};
+
+  struct PotentiallyBrokenReference {
+    Pointer origin;
+    JSON::String original;
+    JSON::String destination;
+    Pointer target_pointer;
+    std::size_t target_relative_pointer;
+  };
+
+  std::vector<PotentiallyBrokenReference> potentially_broken_references;
+
   while (true) {
-    SchemaFrame frame{SchemaFrame::Mode::References};
-    frame.analyse(schema, walker, resolver, default_dialect, default_id);
-    std::unordered_set<Pointer> visited;
+    if (frame.empty()) {
+      frame.analyse(schema, walker, resolver, default_dialect, default_id);
+    }
 
+    std::unordered_set<Pointer> visited;
     bool applied{false};
     subschema_count = 0;
     subschema_failures = 0;
+
     for (const auto &entry : frame.locations()) {
       if (entry.second.type != SchemaFrame::LocationType::Resource &&
           entry.second.type != SchemaFrame::LocationType::Subschema) {
@@ -199,71 +192,116 @@ auto SchemaTransformer::apply(JSON &schema, const SchemaWalker &walker,
 
       bool subschema_failed{false};
       for (const auto &rule : this->rules) {
-        const auto subresult{rule->apply(current, schema, current_vocabularies,
-                                         walker, resolver, frame,
-                                         entry.second)};
-        // This means the rule is fixable
-        if (subresult.first) {
-          applied = subresult.second.applies || applied;
-        } else {
-          result = false;
-          subschema_failed = true;
-          callback(entry.second.pointer, rule->name(), rule->message(),
-                   subresult.second);
-        }
+        auto outcome{rule->condition(current, schema, current_vocabularies,
+                                     frame, entry.second, walker, resolver)};
 
-        if (!applied) {
+        if (!outcome.applies) {
           continue;
         }
 
-        std::tuple<const JSON *, std::string_view, std::uint64_t> mark{
-            &current, rule->name(),
-            // Allow applying the same rule to the same location if the schema
-            // has changed, which means we are still "making progress". The
-            // hashing is not perfect, but its enough
-            current.fast_hash()};
-        if (processed_rules.contains(mark)) {
-          throw SchemaTransformRuleProcessedTwiceError(rule->name(),
-                                                       entry.second.pointer);
-        }
+        // Store data we need before invalidating the frame
+        const auto transformed_pointer{entry.second.pointer};
+        const auto transformed_relative_pointer{entry.second.relative_pointer};
 
-        // Identify and try to address broken references, if any
+        // Collect reference information BEFORE invalidating the frame.
+        // We need to save this data because after the transform, the old
+        // frame's views may point to invalid memory, and a new frame won't
+        // have location entries for paths that no longer exist.
+        potentially_broken_references.clear();
         for (const auto &reference : frame.references()) {
           const auto destination{frame.traverse(reference.second.destination)};
           if (!destination.has_value() ||
-              // We only care about references with JSON Pointer fragments,
-              // as these are the only cases, by definition, where the target
-              // is location-dependent.
               !reference.second.fragment.has_value() ||
               !reference.second.fragment.value().starts_with('/')) {
             continue;
           }
 
           const auto &target{destination.value().get()};
+          potentially_broken_references.push_back(
+              {reference.first.second, JSON::String{reference.second.original},
+               reference.second.destination, target.pointer,
+               target.relative_pointer});
+        }
+
+        try {
+          rule->transform(current, outcome);
+        } catch (const SchemaAbortError &) {
+          result = false;
+          subschema_failed = true;
+          callback(transformed_pointer, rule->name(), rule->message(), outcome);
+          continue;
+        }
+
+        applied = true;
+
+        frame.analyse(schema, walker, resolver, default_dialect, default_id);
+
+        const auto new_location{frame.traverse(transformed_pointer)};
+        // The location should still exist after transform
+        assert(new_location.has_value());
+
+        // Get vocabularies from the new frame
+        const auto new_vocabularies{
+            frame.vocabularies(new_location.value().get(), resolver)};
+
+        // The condition must always be false after applying the
+        // transformation in order to avoid infinite loops
+        if (rule->condition(current, schema, new_vocabularies, frame,
+                            new_location.value().get(), walker, resolver)
+                .applies) {
+          std::ostringstream error;
+          error << "Rule condition holds after application: " << rule->name();
+          throw std::runtime_error(error.str());
+        }
+
+        // Identify and fix broken references using the saved data
+        bool references_fixed{false};
+        for (const auto &saved_reference : potentially_broken_references) {
           // The destination still exists, so we don't have to do anything
-          if (try_get(schema, target.pointer)) {
+          if (try_get(schema, saved_reference.target_pointer)) {
             continue;
           }
 
           // If the source no longer exists, we don't need to fix the reference
-          if (!try_get(schema, reference.first.second.initial())) {
+          if (!try_get(schema, saved_reference.origin.initial())) {
             continue;
           }
 
           const auto new_fragment{rule->rereference(
-              reference.second.destination, reference.first.second,
-              frame.relative_instance_location(target),
-              frame.relative_instance_location(entry.second))};
+              saved_reference.destination, saved_reference.origin,
+              saved_reference.target_pointer.slice(
+                  saved_reference.target_relative_pointer),
+              transformed_pointer.slice(transformed_relative_pointer))};
 
           // Note we use the base from the original reference before any
           // canonicalisation takes place so that we don't overly change
           // user's references when only fixing up their pointer fragments
-          URI original{reference.second.original};
+          URI original{saved_reference.original};
           original.fragment(to_string(new_fragment));
-          set(schema, reference.first.second, JSON{original.recompose()});
+          set(schema, saved_reference.origin, JSON{original.recompose()});
+          references_fixed = true;
+        }
+
+        std::tuple<const JSON *, std::string_view, std::uint64_t> mark{
+            &current, rule->name(),
+            // Allow applying the same rule to the same location if the schema
+            // has changed, which means we are still "making progress". The
+            // hashing is not perfect, but its enough
+            current.fast_hash()};
+        if (processed_rules.contains(mark)) {
+          throw SchemaTransformRuleProcessedTwiceError(rule->name(),
+                                                       transformed_pointer);
         }
 
         processed_rules.emplace(std::move(mark));
+
+        // If we fixed references, the schema changed again, so we need to
+        // invalidate the frame. Otherwise, we can reuse it for the next
+        // iteration.
+        if (references_fixed) {
+          frame.reset();
+        }
+
         goto core_transformer_start_again;
       }