olpc_cjson/
lib.rs

1// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! `olpc-cjson` provides a [`serde_json::Formatter`] to serialize data as [canonical JSON], as
5//! defined by OLPC and used in [TUF].
6//!
7//! [`serde_json::Formatter`]: ../serde_json/ser/trait.Formatter.html
8//! [canonical JSON]: http://wiki.laptop.org/go/Canonical_JSON
9//! [TUF]: https://theupdateframework.github.io/
10//!
11//! OLPC's canonical JSON specification is subtly different from other "canonical JSON"
12//! specifications, and is also not a strict subset of JSON (specifically, ASCII control characters
13//! 0x00–0x1f are printed literally, which is not valid JSON). Therefore, `serde_json` cannot
14//! necessarily deserialize JSON produced by this formatter.
15//!
16//! This crate is not developed or endorsed by OLPC; use of the term is solely to distinguish this
17//! specification of canonical JSON from [other specifications of canonical JSON][xkcd].
18//!
19//! [xkcd]: https://xkcd.com/927/
20//!
21//! ```rust
22//! use olpc_cjson::CanonicalFormatter;
23//! use serde::Serialize;
24//! use serde_json::json;
25//!
26//! let value = json!({"b": 12, "a": "qwerty"});
27//! let mut buf = Vec::new();
28//! let mut ser = serde_json::Serializer::with_formatter(&mut buf, CanonicalFormatter::new());
29//! value.serialize(&mut ser).unwrap();
30//! assert_eq!(buf, br#"{"a":"qwerty","b":12}"#);
31//! ```
32
33#![deny(rust_2018_idioms)]
34#![warn(clippy::pedantic)]
35#![allow(clippy::must_use_candidate)]
36
37use serde::Serialize;
38use serde_json::ser::{CharEscape, CompactFormatter, Formatter, Serializer};
39use std::collections::BTreeMap;
40use std::io::{Error, ErrorKind, Result, Write};
41use unicode_normalization::UnicodeNormalization;
42
43/// A [`Formatter`] that produces canonical JSON.
44///
45/// See the [crate-level documentation](../index.html) for more detail.
46///
47/// [`Formatter`]: ../serde_json/ser/trait.Formatter.html
48#[derive(Debug, Default)]
49pub struct CanonicalFormatter {
50    object_stack: Vec<Object>,
51}
52
/// Bookkeeping for one JSON object while its members are still being collected.
///
/// While an object is open, `CanonicalFormatter::writer` redirects everything that is written
/// into `next_key` or `next_value` instead of the caller's writer. The `Formatter` methods drive
/// this struct as follows:
///
/// 1. `begin_object` creates the struct (all fields empty, `key_done == false`).
/// 2. `begin_object_key` sets `key_done = false`, so the following writes land in `next_key`.
/// 3. `end_object_key` sets `key_done = true`, so the following writes land in `next_value`.
/// 4. `begin_object_value` does nothing.
/// 5. `end_object_value` moves `next_key` and `next_value` into `obj`, leaving both empty.
///    Steps 2 to 5 repeat for every member.
/// 6. `end_object` removes the struct from the stack and writes the collected members to the
///    writer that is current at that point.
#[derive(Debug, Default)]
struct Object {
    // Finished members, serialized key -> serialized value, ordered by the serialized key bytes.
    obj: BTreeMap<Vec<u8>, Vec<u8>>,
    // Serialized bytes of the key currently being written.
    next_key: Vec<u8>,
    // Serialized bytes of the value currently being written.
    next_value: Vec<u8>,
    // `false` while a key is being written, `true` once the key is complete.
    key_done: bool,
}
79
80impl CanonicalFormatter {
81    /// Create a new `CanonicalFormatter` object.
82    pub fn new() -> Self {
83        Self::default()
84    }
85
86    /// Convenience method to return the appropriate writer given the current context.
87    ///
88    /// If we are currently writing an object (that is, if `!self.object_stack.is_empty()`), we
89    /// need to write the value to either the next key or next value depending on that state
90    /// machine. See the docstrings for `Object` for more detail.
91    ///
92    /// If we are not currently writing an object, pass through `writer`.
93    fn writer<'a, W: Write + ?Sized>(&'a mut self, writer: &'a mut W) -> Box<dyn Write + 'a> {
94        self.object_stack
95            .last_mut()
96            .map_or(Box::new(writer), |object| {
97                if object.key_done {
98                    Box::new(&mut object.next_value)
99                } else {
100                    Box::new(&mut object.next_key)
101                }
102            })
103    }
104
105    /// Returns a mutable reference to the top of the object stack.
106    fn obj_mut(&mut self) -> Result<&mut Object> {
107        self.object_stack.last_mut().ok_or_else(|| {
108            Error::new(
109                ErrorKind::Other,
110                "serde_json called an object method without calling begin_object first",
111            )
112        })
113    }
114}
115
/// Generates a `Formatter` method that delegates to the `serde_json::CompactFormatter` method of
/// the same name, but writes to whatever `CanonicalFormatter::writer` selects instead of the
/// writer the method received directly.
macro_rules! wrapper {
    // Methods whose only parameter is the writer.
    ($method:ident) => {
        fn $method<W: Write + ?Sized>(&mut self, writer: &mut W) -> Result<()> {
            let mut target = self.writer(writer);
            CompactFormatter.$method(&mut target)
        }
    };

    // Methods that take one further argument, which is forwarded unchanged.
    ($method:ident, $arg_ty:ty) => {
        fn $method<W: Write + ?Sized>(&mut self, writer: &mut W, arg: $arg_ty) -> Result<()> {
            let mut target = self.writer(writer);
            CompactFormatter.$method(&mut target, arg)
        }
    };
}
131
/// Expands to the `Err` value returned whenever a floating point number is encountered. Several
/// methods need the exact same error, so it is spelled out only here.
macro_rules! float_err {
    () => {{
        let reason = "floating point numbers are not allowed in canonical JSON";
        Err(Error::new(ErrorKind::InvalidInput, reason))
    }};
}
141
142impl Formatter for CanonicalFormatter {
143    wrapper!(write_null);
144    wrapper!(write_bool, bool);
145    wrapper!(write_i8, i8);
146    wrapper!(write_i16, i16);
147    wrapper!(write_i32, i32);
148    wrapper!(write_i64, i64);
149    wrapper!(write_i128, i128);
150    wrapper!(write_u8, u8);
151    wrapper!(write_u16, u16);
152    wrapper!(write_u32, u32);
153    wrapper!(write_u64, u64);
154    wrapper!(write_u128, u128);
155
156    fn write_f32<W: Write + ?Sized>(&mut self, _writer: &mut W, _value: f32) -> Result<()> {
157        float_err!()
158    }
159
160    fn write_f64<W: Write + ?Sized>(&mut self, _writer: &mut W, _value: f64) -> Result<()> {
161        float_err!()
162    }
163
164    // By default this is only used for u128/i128. If serde_json's `arbitrary_precision` feature is
165    // enabled, all numbers are internally stored as strings, and this method is always used (even
166    // for floating point values).
167    fn write_number_str<W: Write + ?Sized>(&mut self, writer: &mut W, value: &str) -> Result<()> {
168        if value.chars().any(|c| c == '.' || c == 'e' || c == 'E') {
169            float_err!()
170        } else {
171            CompactFormatter.write_number_str(&mut self.writer(writer), value)
172        }
173    }
174
175    wrapper!(begin_string);
176    wrapper!(end_string);
177
178    // Strings are normalized as Normalization Form C (NFC). `str::nfc` is provided by the
179    // `UnicodeNormalization` trait and returns an iterator of `char`s.
180    fn write_string_fragment<W: Write + ?Sized>(
181        &mut self,
182        writer: &mut W,
183        fragment: &str,
184    ) -> Result<()> {
185        fragment.nfc().try_for_each(|ch| {
186            self.writer(writer)
187                .write_all(ch.encode_utf8(&mut [0; 4]).as_bytes())
188        })
189    }
190
191    // Only quotes and backslashes are escaped in canonical JSON.
192    fn write_char_escape<W: Write + ?Sized>(
193        &mut self,
194        writer: &mut W,
195        char_escape: CharEscape,
196    ) -> Result<()> {
197        match char_escape {
198            CharEscape::Quote | CharEscape::ReverseSolidus => {
199                self.writer(writer).write_all(b"\\")?;
200            }
201            _ => {}
202        }
203        self.writer(writer).write_all(&[match char_escape {
204            CharEscape::Quote => b'\"',
205            CharEscape::ReverseSolidus => b'\\',
206            CharEscape::Solidus => b'/',
207            CharEscape::Backspace => b'\x08',
208            CharEscape::FormFeed => b'\x0c',
209            CharEscape::LineFeed => b'\n',
210            CharEscape::CarriageReturn => b'\r',
211            CharEscape::Tab => b'\t',
212            CharEscape::AsciiControl(byte) => byte,
213        }])
214    }
215
216    wrapper!(begin_array);
217    wrapper!(end_array);
218    wrapper!(begin_array_value, bool); // hack: this passes through the `first` argument
219    wrapper!(end_array_value);
220
221    // Here are the object methods. Because keys must be sorted, we serialize the object's keys and
222    // values in memory as a `BTreeMap`, then write it all out when `end_object_value` is called.
223
224    fn begin_object<W: Write + ?Sized>(&mut self, writer: &mut W) -> Result<()> {
225        CompactFormatter.begin_object(&mut self.writer(writer))?;
226        self.object_stack.push(Object::default());
227        Ok(())
228    }
229
230    fn end_object<W: Write + ?Sized>(&mut self, writer: &mut W) -> Result<()> {
231        let object = self.object_stack.pop().ok_or_else(|| {
232            Error::new(
233                ErrorKind::Other,
234                "serde_json called Formatter::end_object object method
235                 without calling begin_object first",
236            )
237        })?;
238        let mut writer = self.writer(writer);
239        let mut first = true;
240
241        for (key, value) in object.obj {
242            CompactFormatter.begin_object_key(&mut writer, first)?;
243            writer.write_all(&key)?;
244            CompactFormatter.end_object_key(&mut writer)?;
245
246            CompactFormatter.begin_object_value(&mut writer)?;
247            writer.write_all(&value)?;
248            CompactFormatter.end_object_value(&mut writer)?;
249
250            first = false;
251        }
252
253        CompactFormatter.end_object(&mut writer)
254    }
255
256    fn begin_object_key<W: Write + ?Sized>(&mut self, _writer: &mut W, _first: bool) -> Result<()> {
257        let object = self.obj_mut()?;
258        object.key_done = false;
259        Ok(())
260    }
261
262    fn end_object_key<W: Write + ?Sized>(&mut self, _writer: &mut W) -> Result<()> {
263        let object = self.obj_mut()?;
264        object.key_done = true;
265        Ok(())
266    }
267
268    fn begin_object_value<W: Write + ?Sized>(&mut self, _writer: &mut W) -> Result<()> {
269        Ok(())
270    }
271
272    fn end_object_value<W: Write + ?Sized>(&mut self, _writer: &mut W) -> Result<()> {
273        let object = self.obj_mut()?;
274        let key = std::mem::take(&mut object.next_key);
275        let value = std::mem::take(&mut object.next_value);
276        object.obj.insert(key, value);
277        Ok(())
278    }
279
280    // This is for serde_json's `raw_value` feature, which provides a RawValue type that is passed
281    // through as-is. That's not good enough for canonical JSON, so we parse it and immediately
282    // write it back out... as canonical JSON.
283    fn write_raw_fragment<W: Write + ?Sized>(
284        &mut self,
285        writer: &mut W,
286        fragment: &str,
287    ) -> Result<()> {
288        let mut ser = Serializer::with_formatter(self.writer(writer), Self::new());
289        serde_json::from_str::<serde_json::Value>(fragment)?.serialize(&mut ser)?;
290        Ok(())
291    }
292}
293
#[cfg(test)]
mod tests {
    use crate::CanonicalFormatter;
    use serde::Serialize;
    use serde_json::Serializer;
    use std::io::Result;

    /// Serializes `value` with a fresh `CanonicalFormatter` and returns the bytes produced.
    fn to_canonical(value: &serde_json::Value) -> Result<Vec<u8>> {
        let mut out = Vec::new();
        let mut serializer = Serializer::with_formatter(&mut out, CanonicalFormatter::new());
        value.serialize(&mut serializer)?;
        Ok(out)
    }

    /// Builds a value with the `serde_json` json! macro and encodes it as canonical JSON.
    macro_rules! encode {
        ($($tt:tt)+) => {
            to_canonical(&serde_json::json!($($tt)+))
        };
    }

    /// Smoke tests taken from securesystemslib, the library used by the TUF reference
    /// implementation.
    ///
    /// `<https://github.com/secure-systems-lab/securesystemslib/blob/f466266014aff529510216b8c2f8c8f39de279ec/tests/test_formats.py#L354-L389>`
    #[test]
    fn securesystemslib_asserts() -> Result<()> {
        assert_eq!(encode!([1, 2, 3])?, b"[1,2,3]");
        assert_eq!(encode!([1, 2, 3])?, b"[1,2,3]");
        assert_eq!(encode!([])?, b"[]");
        assert_eq!(encode!({})?, b"{}");
        assert_eq!(encode!({"A": [99]})?, br#"{"A":[99]}"#);
        assert_eq!(encode!({"A": true})?, br#"{"A":true}"#);
        assert_eq!(encode!({"B": false})?, br#"{"B":false}"#);
        assert_eq!(encode!({"x": 3, "y": 2})?, br#"{"x":3,"y":2}"#);
        assert_eq!(encode!({"x": 3, "y": null})?, br#"{"x":3,"y":null}"#);

        // Floating point values must be rejected, at the top level and inside an object.
        assert!(encode!(8.0).is_err());
        assert!(encode!({"x": 8.0}).is_err());

        Ok(())
    }

    /// Canonical JSON prints literal ASCII control characters instead of escaping them. Check
    /// ASCII 0x00 - 0x1f, plus backslash and double quote (the only escaped characters).
    ///
    /// The accepted strings were validated with securesystemslib, commit
    /// f466266014aff529510216b8c2f8c8f39de279ec.
    ///
    /// ```python
    /// import securesystemslib.formats
    /// encode = securesystemslib.formats.encode_canonical
    /// for c in range(0x20):
    ///     print(repr(encode(chr(c))))
    /// print(repr(encode('\\')))
    /// print(repr(encode('"')))
    /// ```
    ///
    /// This can be a little difficult to wrap a mental string parser around. But you can verify
    /// that all the control characters result in a 3-byte JSON string:
    ///
    /// ```python
    /// >>> all(map(lambda c: len(encode(chr(c))) == 3, range(0x20)))
    /// True
    /// ```
    #[test]
    fn ascii_control_characters() -> Result<()> {
        // Each control character must come out as itself between two double quotes.
        for byte in 0x00_u8..0x20 {
            let input = char::from(byte).to_string();
            assert_eq!(
                encode!(input)?,
                [b'"', byte, b'"'],
                "control character {:#04x}",
                byte
            );
        }

        // Control characters must also survive unescaped as an object key and as a value.
        assert_eq!(encode!({"\t": "\n"})?, b"{\"\t\":\"\n\"}");

        assert_eq!(encode!("\\")?, b"\"\\\\\"");
        assert_eq!(encode!("\"")?, b"\"\\\"\"");

        Ok(())
    }

    /// A more involved test than any of the above for olpc-cjson's core competency: ordering
    /// things.
    #[test]
    fn ordered_nested_object() -> Result<()> {
        let encoded = encode!({
            "nested": {
                "bad": true,
                "good": false
            },
            "b": 2,
            "a": 1,
            "c": {
                "h": {
                    "h": -5,
                    "i": 3
                },
                "a": null,
                "x": {}
            }
        })?;

        assert_eq!(
            encoded,
            br#"{"a":1,"b":2,"c":{"a":null,"h":{"h":-5,"i":3},"x":{}},"nested":{"bad":true,"good":false}}"#.to_vec(),
        );

        Ok(())
    }

    /// This test asserts that the canonical representation of some real-world data always comes
    /// out the same.
    #[allow(clippy::unreadable_literal)]
    #[test]
    fn actual_tuf_signed() {
        let encoded = encode!(
        {
          "signed": {
            "_type": "timestamp",
            "spec_version": "1.0.0",
            "version": 1604605512,
            "expires": "2020-11-12T19:45:12.613154979Z",
            "meta": {
              "snapshot.json": {
                "length": 1278,
                "hashes": {
                  "sha256": "56c4ecc3b331f6154d9a5005f6e2978e4198cc8c3b79746c25a592043a2d83d4"
                },
                "version": 1604605512
              }
            }
          }
        }
        )
        .unwrap();

        let expected: Vec<u8> = vec![
            123, 34, 115, 105, 103, 110, 101, 100, 34, 58, 123, 34, 95, 116, 121, 112, 101, 34, 58,
            34, 116, 105, 109, 101, 115, 116, 97, 109, 112, 34, 44, 34, 101, 120, 112, 105, 114,
            101, 115, 34, 58, 34, 50, 48, 50, 48, 45, 49, 49, 45, 49, 50, 84, 49, 57, 58, 52, 53,
            58, 49, 50, 46, 54, 49, 51, 49, 53, 52, 57, 55, 57, 90, 34, 44, 34, 109, 101, 116, 97,
            34, 58, 123, 34, 115, 110, 97, 112, 115, 104, 111, 116, 46, 106, 115, 111, 110, 34, 58,
            123, 34, 104, 97, 115, 104, 101, 115, 34, 58, 123, 34, 115, 104, 97, 50, 53, 54, 34,
            58, 34, 53, 54, 99, 52, 101, 99, 99, 51, 98, 51, 51, 49, 102, 54, 49, 53, 52, 100, 57,
            97, 53, 48, 48, 53, 102, 54, 101, 50, 57, 55, 56, 101, 52, 49, 57, 56, 99, 99, 56, 99,
            51, 98, 55, 57, 55, 52, 54, 99, 50, 53, 97, 53, 57, 50, 48, 52, 51, 97, 50, 100, 56,
            51, 100, 52, 34, 125, 44, 34, 108, 101, 110, 103, 116, 104, 34, 58, 49, 50, 55, 56, 44,
            34, 118, 101, 114, 115, 105, 111, 110, 34, 58, 49, 54, 48, 52, 54, 48, 53, 53, 49, 50,
            125, 125, 44, 34, 115, 112, 101, 99, 95, 118, 101, 114, 115, 105, 111, 110, 34, 58, 34,
            49, 46, 48, 46, 48, 34, 44, 34, 118, 101, 114, 115, 105, 111, 110, 34, 58, 49, 54, 48,
            52, 54, 48, 53, 53, 49, 50, 125, 125,
        ];
        assert_eq!(expected, encoded);
    }

    /// 128-bit integers must be written out in full, with the fields in key order.
    #[test]
    fn encode_u128_i128() {
        #[derive(serde_derive::Serialize)]
        struct Object {
            u128: u128,
            i128: i128,
        }

        let value = Object {
            u128: u128::MAX,
            i128: i128::MIN,
        };

        let mut buf = Vec::new();
        let mut ser = Serializer::with_formatter(&mut buf, CanonicalFormatter::new());
        value.serialize(&mut ser).unwrap();

        let expected = [
            123, 34, 105, 49, 50, 56, 34, 58, 45, 49, 55, 48, 49, 52, 49, 49, 56, 51, 52, 54, 48,
            52, 54, 57, 50, 51, 49, 55, 51, 49, 54, 56, 55, 51, 48, 51, 55, 49, 53, 56, 56, 52, 49,
            48, 53, 55, 50, 56, 44, 34, 117, 49, 50, 56, 34, 58, 51, 52, 48, 50, 56, 50, 51, 54,
            54, 57, 50, 48, 57, 51, 56, 52, 54, 51, 52, 54, 51, 51, 55, 52, 54, 48, 55, 52, 51, 49,
            55, 54, 56, 50, 49, 49, 52, 53, 53, 125,
        ];

        assert_eq!(buf, expected);
    }
}