iri_string/normalize/
pct_case.rs

1//! Percent-encoding normalization and case normalization.
2
3use core::fmt::{self, Write as _};
4use core::marker::PhantomData;
5
6use crate::format::eq_str_display;
7use crate::parser::char::{is_ascii_unreserved, is_unreserved, is_utf8_byte_continue};
8use crate::parser::str::{find_split_hole, take_first_char};
9use crate::parser::trusted::take_xdigits2;
10use crate::spec::Spec;
11
12/// Returns true if the given string is percent-encoding normalized and case
13/// normalized.
14///
15/// Note that normalization of ASCII-only host requires additional case
16/// normalization, so checking by this function is not sufficient for that case.
17pub(crate) fn is_pct_case_normalized<S: Spec>(s: &str) -> bool {
18    eq_str_display(s, &PctCaseNormalized::<S>::new(s))
19}
20
/// Decodes a single multi-byte UTF-8 sequence into a character.
///
/// This function trusts that `bytes` has the shape of exactly one 2-, 3-, or
/// 4-byte UTF-8 sequence: the lead byte and the continue bytes are only masked,
/// not validated.
///
/// # Errors
///
/// Returns `Err(())` when the sequence is a redundant (overlong) encoding, or
/// when the assembled value is not a valid Unicode scalar value.
///
/// # Panics
///
/// Panics if `bytes` is not 2, 3, or 4 bytes long.
fn into_char_trusted(bytes: &[u8]) -> Result<char, ()> {
    /// Returns the six payload bits carried by a continue byte.
    #[inline]
    fn tail_bits(byte: u8) -> u32 {
        u32::from(byte & 0b_0011_1111)
    }

    // Assemble the code point, paired with the smallest value that
    // legitimately needs a sequence of this length.
    let (code_point, minimum): (u32, u32) = match *bytes {
        [b0, b1] => ((u32::from(b0 & 0b_0001_1111) << 6) | tail_bits(b1), 0x80),
        [b0, b1, b2] => (
            (u32::from(b0 & 0b_0000_1111) << 12) | (tail_bits(b1) << 6) | tail_bits(b2),
            0x800,
        ),
        [b0, b1, b2, b3] => (
            (u32::from(b0 & 0b_0000_0111) << 18)
                | (tail_bits(b1) << 12)
                | (tail_bits(b2) << 6)
                | tail_bits(b3),
            0x1_0000,
        ),
        _ => {
            let len = bytes.len();
            unreachable!(
                "[consistency] expected 2, 3, or 4 bytes for a character, but got {len} as the length"
            )
        }
    };

    if code_point < minimum {
        // Redundant UTF-8 encoding.
        return Err(());
    }
    // The value can still be an invalid Unicode code point.
    char::from_u32(code_point).ok_or(())
}
57
/// Display adapter that writes a string in percent-encoding-normalized and
/// case-normalized form.
///
/// When formatted, this wrapper:
///
/// * decodes percent-encoded triplets that encode unreserved characters, and
/// * writes every remaining percent-encoded triplet with uppercase hex digits.
///
/// Characters that appear raw in the source are written as they are; nothing
/// is newly percent-encoded.
///
/// # Preconditions
///
/// The given string should be a valid path segment.
#[derive(Clone, Copy, Debug)]
pub(crate) struct PctCaseNormalized<'a, S> {
    /// Valid segment name to normalize.
    segname: &'a str,
    /// Spec marker; no value of `S` is stored.
    _spec: PhantomData<fn() -> S>,
}
77
78impl<'a, S: Spec> PctCaseNormalized<'a, S> {
79    /// Creates a new `PctCaseNormalized` value.
80    #[inline]
81    #[must_use]
82    pub(crate) fn new(source: &'a str) -> Self {
83        Self {
84            segname: source,
85            _spec: PhantomData,
86        }
87    }
88}
89
90impl<S: Spec> fmt::Display for PctCaseNormalized<'_, S> {
91    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
92        let mut rest = self.segname;
93
94        'outer_loop: while !rest.is_empty() {
95            // Scan the next percent-encoded triplet.
96            let (prefix, after_percent) = match find_split_hole(rest, b'%') {
97                Some(v) => v,
98                None => return f.write_str(rest),
99            };
100            // Write the string before the percent-encoded triplet.
101            f.write_str(prefix)?;
102            // Decode the percent-encoded triplet.
103            let (first_decoded, after_first_triplet) = take_xdigits2(after_percent);
104            rest = after_first_triplet;
105
106            let expected_char_len = match first_decoded {
107                0x00..=0x7F => {
108                    // An ASCII character.
109                    debug_assert!(first_decoded.is_ascii());
110                    if is_ascii_unreserved(first_decoded) {
111                        // Unreserved. Print the decoded.
112                        f.write_char(char::from(first_decoded))?;
113                    } else {
114                        write!(f, "%{:02X}", first_decoded)?;
115                    }
116                    continue 'outer_loop;
117                }
118                0xC2..=0xDF => 2,
119                0xE0..=0xEF => 3,
120                0xF0..=0xF4 => 4,
121                0x80..=0xC1 | 0xF5..=0xFF => {
122                    // Cannot appear as a first byte.
123                    //
124                    //  * 0x80..=0xBF: continue byte.
125                    //  * 0xC0..=0xC1: redundant encoding.
126                    //  * 0xF5..=0xFF: above the maximum value for U+10FFFF.
127                    write!(f, "%{:02X}", first_decoded)?;
128                    continue 'outer_loop;
129                }
130            };
131
132            // Get continue bytes.
133            let c_buf = &mut [first_decoded, 0, 0, 0][..expected_char_len];
134            for (i, buf_dest) in c_buf[1..].iter_mut().enumerate() {
135                match take_first_char(rest) {
136                    Some(('%', after_percent)) => {
137                        let (byte, after_triplet) = take_xdigits2(after_percent);
138                        if !is_utf8_byte_continue(byte) {
139                            // Note that `byte` can start the new string.
140                            // Leave the byte in the `rest` for next try (i.e.
141                            // don't update `rest` in this case).
142                            c_buf[..=i]
143                                .iter()
144                                .try_for_each(|b| write!(f, "%{:02X}", b))?;
145                            continue 'outer_loop;
146                        }
147                        *buf_dest = byte;
148                        rest = after_triplet;
149                    }
150                    // If the next character is not `%`, decoded bytes so far
151                    // won't be valid UTF-8 byte sequence.
152                    // Write the read percent-encoded triplets without decoding.
153                    // Note that all characters in `&c_buf[1..]` (if available)
154                    // will be decoded to "continue byte" of UTF-8, so they
155                    // cannot be the start of a valid UTF-8 byte sequence if
156                    // decoded.
157                    Some((c, after_percent)) => {
158                        c_buf[..=i]
159                            .iter()
160                            .try_for_each(|b| write!(f, "%{:02X}", b))?;
161                        f.write_char(c)?;
162                        rest = after_percent;
163                        continue 'outer_loop;
164                    }
165                    None => {
166                        c_buf[..=i]
167                            .iter()
168                            .try_for_each(|b| write!(f, "%{:02X}", b))?;
169                        // Reached the end of the string.
170                        break 'outer_loop;
171                    }
172                }
173            }
174
175            // Decode the bytes into a character.
176            match into_char_trusted(&c_buf[..expected_char_len]) {
177                Ok(decoded_c) => {
178                    if is_unreserved::<S>(decoded_c) {
179                        // Unreserved. Print the decoded.
180                        f.write_char(decoded_c)?;
181                    } else {
182                        c_buf[0..expected_char_len]
183                            .iter()
184                            .try_for_each(|b| write!(f, "%{:02X}", b))?;
185                    }
186                }
187                Err(_) => {
188                    // Skip decoding of the entire sequence of pct-encoded triplets loaded
189                    // in `c_buf`. This is valid from the reasons below.
190                    //
191                    // * The first byte in `c_buf` is valid as the first byte, and it tells the
192                    //   expected number of bytes for a code unit. The cases the bytes being too
193                    //   short and the sequence being incomplete have already been handled, and
194                    //   the execution does not reach here then.
195                    // * All of the non-first bytes are checked if they are valid as UTF8 continue
196                    //   bytes by `is_utf8_byte_continue()`. If they're not, the decoding of
197                    //   that codepoint is aborted and the bytes in the buffer are immediately
198                    //   emitted as pct-encoded, and the execution does not reach here. This
199                    //   means that the bytes in the current `c_buf` have passed these tests.
200                    // * Since all of the the non-first bytes are UTF8 continue bytes, any of
201                    //   them cannot start the new valid UTF-8 byte sequence. This means that
202                    //   if the bytes in the buffer does not consitute a valid UTF-8 bytes
203                    //   sequence, the whole buffer can immediately be emmitted as pct-encoded.
204
205                    debug_assert!(
206                        c_buf[1..expected_char_len]
207                            .iter()
208                            .copied()
209                            .all(is_utf8_byte_continue),
210                        "[consistency] all non-first bytes have been \
211                         confirmed that they are UTF-8 continue bytes"
212                    );
213                    // Note that the first pct-encoded triplet is stripped from
214                    // `after_first_triplet`.
215                    rest = &after_first_triplet[((expected_char_len - 1) * 3)..];
216                    c_buf[0..expected_char_len]
217                        .iter()
218                        .try_for_each(|b| write!(f, "%{:02X}", b))?;
219                }
220            }
221        }
222
223        Ok(())
224    }
225}
226
/// Display adapter that writes an ASCII-only `host` (optionally followed by a
/// `port`) in normalized form.
#[derive(Clone, Copy, Debug)]
pub(crate) struct NormalizedAsciiOnlyHost<'a> {
    /// Valid host (and optionally port) to normalize.
    host_port: &'a str,
}
233
234impl<'a> NormalizedAsciiOnlyHost<'a> {
235    /// Creates a new `NormalizedAsciiOnlyHost` value.
236    ///
237    /// # Preconditions
238    ///
239    /// The given string should be the valid ASCII-only `host` or
240    /// `host ":" port` after percent-encoding normalization.
241    /// In other words, [`parser::trusted::is_ascii_only_host`] should return
242    /// true for the given value.
243    ///
244    /// [`parser::trusted::is_ascii_only_host`]: `crate::parser::trusted::is_ascii_only_host`
245    #[inline]
246    #[must_use]
247    pub(crate) fn new(host_port: &'a str) -> Self {
248        Self { host_port }
249    }
250}
251
252impl fmt::Display for NormalizedAsciiOnlyHost<'_> {
253    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
254        let mut rest = self.host_port;
255
256        while !rest.is_empty() {
257            // Scan the next percent-encoded triplet.
258            let (prefix, after_percent) = match find_split_hole(rest, b'%') {
259                Some(v) => v,
260                None => {
261                    return rest
262                        .chars()
263                        .try_for_each(|c| f.write_char(c.to_ascii_lowercase()));
264                }
265            };
266            // Write the string before the percent-encoded triplet.
267            prefix
268                .chars()
269                .try_for_each(|c| f.write_char(c.to_ascii_lowercase()))?;
270            // Decode the percent-encoded triplet.
271            let (first_decoded, after_triplet) = take_xdigits2(after_percent);
272            rest = after_triplet;
273
274            assert!(
275                first_decoded.is_ascii(),
276                "[consistency] this function requires ASCII-only host as an argument"
277            );
278
279            if is_ascii_unreserved(first_decoded) {
280                // Unreserved. Convert to lowercase and print.
281                f.write_char(char::from(first_decoded.to_ascii_lowercase()))?;
282            } else {
283                write!(f, "%{:02X}", first_decoded)?;
284            }
285        }
286
287        Ok(())
288    }
289}
290
#[cfg(test)]
#[cfg(feature = "alloc")]
mod tests {
    use super::*;

    #[cfg(all(feature = "alloc", not(feature = "std")))]
    use alloc::string::ToString;

    use crate::spec::{IriSpec, UriSpec};

    /// Asserts the normalization result of `input` under the URI spec and
    /// under the IRI spec, in this order.
    fn assert_normalized(input: &str, expected_uri: &str, expected_iri: &str) {
        assert_eq!(
            PctCaseNormalized::<UriSpec>::new(input).to_string(),
            expected_uri
        );
        assert_eq!(
            PctCaseNormalized::<IriSpec>::new(input).to_string(),
            expected_iri
        );
    }

    #[test]
    fn invalid_utf8() {
        assert_normalized("%80%cc%cc%cc", "%80%CC%CC%CC", "%80%CC%CC%CC");
    }

    #[test]
    fn iri_unreserved() {
        assert_normalized("%ce%b1", "%CE%B1", "\u{03B1}");
    }

    #[test]
    fn iri_middle_decode() {
        assert_normalized("%ce%ce%b1%b1", "%CE%CE%B1%B1", "%CE\u{03B1}%B1");
    }

    #[test]
    fn ascii_reserved() {
        assert_normalized("%3f", "%3F", "%3F");
    }

    #[test]
    fn ascii_forbidden() {
        assert_normalized("%3c%3e", "%3C%3E", "%3C%3E");
    }

    #[test]
    fn ascii_unreserved() {
        assert_normalized("%7ea", "~a", "~a");
    }
}
iri_string/normalize/pct_case.rs

iri_string/normalize/
pct_case.rs