Skip to content

Commit 8fb7ba6

Browse files
authored
Fix the schema transform to not rely on frame views (#2180)
Signed-off-by: Juan Cruz Viotti <[email protected]>
1 parent 58eb063 commit 8fb7ba6

File tree

4 files changed

+147
-84
lines changed

4 files changed

+147
-84
lines changed

src/core/jsonschema/frame.cc

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ auto SchemaFrame::analyse(const JSON &root, const SchemaWalker &walker,
429429
std::string_view default_dialect,
430430
std::string_view default_id,
431431
const SchemaFrame::Paths &paths) -> void {
432+
this->reset();
432433
assert(std::unordered_set<WeakPointer>(paths.cbegin(), paths.cend()).size() ==
433434
paths.size());
434435
std::vector<InternalEntry> subschema_entries;
@@ -1056,10 +1057,21 @@ auto SchemaFrame::traverse(const std::string_view uri) const
10561057
return std::nullopt;
10571058
}
10581059

1060+
auto SchemaFrame::traverse(const Pointer &pointer) const
1061+
-> std::optional<std::reference_wrapper<const Location>> {
1062+
// TODO: This is slow. Consider adding a pointer-indexed secondary
1063+
// lookup structure to SchemaFrame
1064+
for (const auto &entry : this->locations_) {
1065+
if (entry.second.pointer == pointer) {
1066+
return entry.second;
1067+
}
1068+
}
1069+
1070+
return std::nullopt;
1071+
}
1072+
10591073
auto SchemaFrame::uri(const Pointer &pointer) const
10601074
-> std::optional<std::reference_wrapper<const JSON::String>> {
1061-
// TODO: This is potentially very slow. Traversing by pointer shouldn't
1062-
// require an O(N) operation
10631075
for (const auto &entry : this->locations_) {
10641076
if (entry.second.pointer == pointer) {
10651077
return entry.first.second;
@@ -1180,4 +1192,14 @@ auto SchemaFrame::relative_instance_location(const Location &location) const
11801192
return location.pointer.slice(location.relative_pointer);
11811193
}
11821194

1195+
auto SchemaFrame::empty() const noexcept -> bool {
1196+
return this->locations_.empty() && this->references_.empty();
1197+
}
1198+
1199+
auto SchemaFrame::reset() -> void {
1200+
this->root_.clear();
1201+
this->locations_.clear();
1202+
this->references_.clear();
1203+
}
1204+
11831205
} // namespace sourcemeta::core

src/core/jsonschema/include/sourcemeta/core/jsonschema_frame.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,10 @@ class SOURCEMETA_CORE_JSONSCHEMA_EXPORT SchemaFrame {
180180
[[nodiscard]] auto traverse(const std::string_view uri) const
181181
-> std::optional<std::reference_wrapper<const Location>>;
182182

183+
/// Get the location associated with a given pointer
184+
[[nodiscard]] auto traverse(const Pointer &pointer) const
185+
-> std::optional<std::reference_wrapper<const Location>>;
186+
183187
/// Turn an absolute pointer into a location URI
184188
[[nodiscard]] auto uri(const Pointer &pointer) const
185189
-> std::optional<std::reference_wrapper<const JSON::String>>;
@@ -212,6 +216,12 @@ class SOURCEMETA_CORE_JSONSCHEMA_EXPORT SchemaFrame {
212216
[[nodiscard]] auto relative_instance_location(const Location &location) const
213217
-> Pointer;
214218

219+
/// Check if the frame has no analysed data
220+
[[nodiscard]] auto empty() const noexcept -> bool;
221+
222+
/// Reset the frame, clearing all analysed data
223+
auto reset() -> void;
224+
215225
private:
216226
Mode mode_;
217227
// Exporting symbols that depends on the standard C++ library is considered

src/core/jsonschema/include/sourcemeta/core/jsonschema_transform.h

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,6 @@ class SOURCEMETA_CORE_JSONSCHEMA_EXPORT SchemaTransformRule {
109109
std::optional<JSON::String> description;
110110
};
111111

112-
/// Apply the rule to a schema
113-
auto apply(JSON &schema, const JSON &root, const Vocabularies &vocabularies,
114-
const SchemaWalker &walker, const SchemaResolver &resolver,
115-
const SchemaFrame &frame,
116-
const SchemaFrame::Location &location) const
117-
-> std::pair<bool, Result>;
118-
119112
/// Check if the rule applies to a schema
120113
[[nodiscard]] auto
121114
check(const JSON &schema, const JSON &root, const Vocabularies &vocabularies,
@@ -129,7 +122,6 @@ class SOURCEMETA_CORE_JSONSCHEMA_EXPORT SchemaTransformRule {
129122
rereference(const std::string_view reference, const Pointer &origin,
130123
const Pointer &target, const Pointer &current) const -> Pointer;
131124

132-
private:
133125
/// The rule condition
134126
[[nodiscard]] virtual auto
135127
condition(const JSON &schema, const JSON &root,
@@ -141,6 +133,7 @@ class SOURCEMETA_CORE_JSONSCHEMA_EXPORT SchemaTransformRule {
141133
/// then the rule condition is considered to not be fixable.
142134
virtual auto transform(JSON &schema, const Result &result) const -> void;
143135

136+
private:
144137
// Exporting symbols that depends on the standard C++ library is considered
145138
// safe.
146139
// https://learn.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4275?view=msvc-170&redirectedfrom=MSDN

src/core/jsonschema/transformer.cc

Lines changed: 112 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,26 @@
33

44
#include <algorithm> // std::erase_if
55
#include <cassert> // assert
6-
#include <set> // std::set
6+
#include <functional> // std::hash
77
#include <sstream> // std::ostringstream
88
#include <tuple> // std::tuple
99
#include <unordered_set> // std::unordered_set
1010
#include <utility> // std::move, std::pair
11+
#include <vector> // std::vector
1112

1213
namespace {
1314

15+
struct ProcessedRuleHasher {
16+
auto
17+
operator()(const std::tuple<const sourcemeta::core::JSON *, std::string_view,
18+
std::uint64_t> &value) const noexcept
19+
-> std::size_t {
20+
return std::hash<const void *>{}(std::get<0>(value)) ^
21+
(std::hash<std::string_view>{}(std::get<1>(value)) << 1) ^
22+
(std::hash<std::uint64_t>{}(std::get<2>(value)) << 2);
23+
}
24+
};
25+
1426
auto calculate_health_percentage(const std::size_t subschemas,
1527
const std::size_t failed_subschemas)
1628
-> std::uint8_t {
@@ -56,40 +68,6 @@ auto SchemaTransformRule::rereference(const std::string_view reference,
5668
"The reference broke after transformation");
5769
}
5870

59-
auto SchemaTransformRule::apply(JSON &schema, const JSON &root,
60-
const Vocabularies &vocabularies,
61-
const SchemaWalker &walker,
62-
const SchemaResolver &resolver,
63-
const SchemaFrame &frame,
64-
const SchemaFrame::Location &location) const
65-
-> std::pair<bool, Result> {
66-
auto outcome{this->condition(schema, root, vocabularies, frame, location,
67-
walker, resolver)};
68-
if (!outcome.applies) {
69-
return {true, std::move(outcome)};
70-
}
71-
72-
try {
73-
this->transform(schema, outcome);
74-
} catch (const SchemaAbortError &) {
75-
return {false, std::move(outcome)};
76-
}
77-
78-
// The condition must always be false after applying the
79-
// transformation in order to avoid infinite loops
80-
if (this->condition(schema, root, vocabularies, frame, location, walker,
81-
resolver)
82-
.applies) {
83-
// TODO: Throw a better custom error that also highlights the schema
84-
// location
85-
std::ostringstream error;
86-
error << "Rule condition holds after application: " << this->name();
87-
throw std::runtime_error(error.str());
88-
}
89-
90-
return {true, std::move(outcome)};
91-
}
92-
9371
auto SchemaTransformRule::check(const JSON &schema, const JSON &root,
9472
const Vocabularies &vocabularies,
9573
const SchemaWalker &walker,
@@ -163,22 +141,37 @@ auto SchemaTransformer::apply(JSON &schema, const SchemaWalker &walker,
163141
std::string_view default_dialect,
164142
std::string_view default_id) const
165143
-> std::pair<bool, std::uint8_t> {
166-
// There is no point in applying an empty bundle
167144
assert(!this->rules.empty());
168-
std::set<std::tuple<const JSON *, std::string_view, std::uint64_t>>
145+
std::unordered_set<std::tuple<const JSON *, std::string_view, std::uint64_t>,
146+
ProcessedRuleHasher>
169147
processed_rules;
170148

171149
bool result{true};
172150
std::size_t subschema_count{0};
173151
std::size_t subschema_failures{0};
152+
153+
SchemaFrame frame{SchemaFrame::Mode::References};
154+
155+
struct PotentiallyBrokenReference {
156+
Pointer origin;
157+
JSON::String original;
158+
JSON::String destination;
159+
Pointer target_pointer;
160+
std::size_t target_relative_pointer;
161+
};
162+
163+
std::vector<PotentiallyBrokenReference> potentially_broken_references;
164+
174165
while (true) {
175-
SchemaFrame frame{SchemaFrame::Mode::References};
176-
frame.analyse(schema, walker, resolver, default_dialect, default_id);
177-
std::unordered_set<Pointer> visited;
166+
if (frame.empty()) {
167+
frame.analyse(schema, walker, resolver, default_dialect, default_id);
168+
}
178169

170+
std::unordered_set<Pointer> visited;
179171
bool applied{false};
180172
subschema_count = 0;
181173
subschema_failures = 0;
174+
182175
for (const auto &entry : frame.locations()) {
183176
if (entry.second.type != SchemaFrame::LocationType::Resource &&
184177
entry.second.type != SchemaFrame::LocationType::Subschema) {
@@ -199,71 +192,116 @@ auto SchemaTransformer::apply(JSON &schema, const SchemaWalker &walker,
199192

200193
bool subschema_failed{false};
201194
for (const auto &rule : this->rules) {
202-
const auto subresult{rule->apply(current, schema, current_vocabularies,
203-
walker, resolver, frame,
204-
entry.second)};
205-
// This means the rule is fixable
206-
if (subresult.first) {
207-
applied = subresult.second.applies || applied;
208-
} else {
209-
result = false;
210-
subschema_failed = true;
211-
callback(entry.second.pointer, rule->name(), rule->message(),
212-
subresult.second);
213-
}
195+
auto outcome{rule->condition(current, schema, current_vocabularies,
196+
frame, entry.second, walker, resolver)};
214197

215-
if (!applied) {
198+
if (!outcome.applies) {
216199
continue;
217200
}
218201

219-
std::tuple<const JSON *, std::string_view, std::uint64_t> mark{
220-
&current, rule->name(),
221-
// Allow applying the same rule to the same location if the schema
222-
// has changed, which means we are still "making progress". The
223-
// hashing is not perfect, but its enough
224-
current.fast_hash()};
225-
if (processed_rules.contains(mark)) {
226-
throw SchemaTransformRuleProcessedTwiceError(rule->name(),
227-
entry.second.pointer);
228-
}
202+
// Store data we need before invalidating the frame
203+
const auto transformed_pointer{entry.second.pointer};
204+
const auto transformed_relative_pointer{entry.second.relative_pointer};
229205

230-
// Identify and try to address broken references, if any
206+
// Collect reference information BEFORE invalidating the frame.
207+
// We need to save this data because after the transform, the old
208+
// frame's views may point to invalid memory, and a new frame won't
209+
// have location entries for paths that no longer exist.
210+
potentially_broken_references.clear();
231211
for (const auto &reference : frame.references()) {
232212
const auto destination{frame.traverse(reference.second.destination)};
233213
if (!destination.has_value() ||
234-
// We only care about references with JSON Pointer fragments,
235-
// as these are the only cases, by definition, where the target
236-
// is location-dependent.
237214
!reference.second.fragment.has_value() ||
238215
!reference.second.fragment.value().starts_with('/')) {
239216
continue;
240217
}
241218

242219
const auto &target{destination.value().get()};
220+
potentially_broken_references.push_back(
221+
{reference.first.second, JSON::String{reference.second.original},
222+
reference.second.destination, target.pointer,
223+
target.relative_pointer});
224+
}
225+
226+
try {
227+
rule->transform(current, outcome);
228+
} catch (const SchemaAbortError &) {
229+
result = false;
230+
subschema_failed = true;
231+
callback(transformed_pointer, rule->name(), rule->message(), outcome);
232+
continue;
233+
}
234+
235+
applied = true;
236+
237+
frame.analyse(schema, walker, resolver, default_dialect, default_id);
238+
239+
const auto new_location{frame.traverse(transformed_pointer)};
240+
// The location should still exist after transform
241+
assert(new_location.has_value());
242+
243+
// Get vocabularies from the new frame
244+
const auto new_vocabularies{
245+
frame.vocabularies(new_location.value().get(), resolver)};
246+
247+
// The condition must always be false after applying the
248+
// transformation in order to avoid infinite loops
249+
if (rule->condition(current, schema, new_vocabularies, frame,
250+
new_location.value().get(), walker, resolver)
251+
.applies) {
252+
std::ostringstream error;
253+
error << "Rule condition holds after application: " << rule->name();
254+
throw std::runtime_error(error.str());
255+
}
256+
257+
// Identify and fix broken references using the saved data
258+
bool references_fixed{false};
259+
for (const auto &saved_reference : potentially_broken_references) {
243260
// The destination still exists, so we don't have to do anything
244-
if (try_get(schema, target.pointer)) {
261+
if (try_get(schema, saved_reference.target_pointer)) {
245262
continue;
246263
}
247264

248265
// If the source no longer exists, we don't need to fix the reference
249-
if (!try_get(schema, reference.first.second.initial())) {
266+
if (!try_get(schema, saved_reference.origin.initial())) {
250267
continue;
251268
}
252269

253270
const auto new_fragment{rule->rereference(
254-
reference.second.destination, reference.first.second,
255-
frame.relative_instance_location(target),
256-
frame.relative_instance_location(entry.second))};
271+
saved_reference.destination, saved_reference.origin,
272+
saved_reference.target_pointer.slice(
273+
saved_reference.target_relative_pointer),
274+
transformed_pointer.slice(transformed_relative_pointer))};
257275

258276
// Note we use the base from the original reference before any
259277
// canonicalisation takes place so that we don't overly change
260278
// user's references when only fixing up their pointer fragments
261-
URI original{reference.second.original};
279+
URI original{saved_reference.original};
262280
original.fragment(to_string(new_fragment));
263-
set(schema, reference.first.second, JSON{original.recompose()});
281+
set(schema, saved_reference.origin, JSON{original.recompose()});
282+
references_fixed = true;
283+
}
284+
285+
std::tuple<const JSON *, std::string_view, std::uint64_t> mark{
286+
&current, rule->name(),
287+
// Allow applying the same rule to the same location if the schema
288+
// has changed, which means we are still "making progress". The
289+
// hashing is not perfect, but its enough
290+
current.fast_hash()};
291+
if (processed_rules.contains(mark)) {
292+
throw SchemaTransformRuleProcessedTwiceError(rule->name(),
293+
transformed_pointer);
264294
}
265295

266296
processed_rules.emplace(std::move(mark));
297+
298+
// If we fixed references, the schema changed again, so we need to
299+
// invalidate the frame. Otherwise, we can reuse it for the next
300+
// iteration.
301+
if (references_fixed) {
302+
frame.reset();
303+
}
304+
267305
goto core_transformer_start_again;
268306
}
269307

0 commit comments

Comments
 (0)