wit_parser/ast/
lex.rs

1use anyhow::{bail, Result};
2use std::char;
3use std::fmt;
4use std::str;
5use unicode_xid::UnicodeXID;
6
7use self::Token::*;
8
/// A lexer over a single WIT source string.
///
/// The tokenizer is `Clone` so that callers (and `eat`) can perform lookahead
/// by lexing on a copy and only committing the copy when the lookahead
/// succeeds.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete source text being tokenized.
    input: &'a str,
    /// Added to every byte index into `input` when building a `Span`, and
    /// subtracted again when a `Span` is resolved back to text.
    span_offset: u32,
    /// The characters of `input` not yet consumed, with `\r\n` folded to `\n`.
    chars: CrlfFold<'a>,
    /// When `false`, the legacy spellings `float32`/`float64` are also lexed
    /// as the `F32`/`F64` keywords; when `true` they lex as plain identifiers.
    require_f32_f64: bool,
}
16
/// Character iterator over the source that reports a `\r\n` pair as a single
/// `\n` (see its `Iterator` implementation); a lone `\r` is passed through.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// Underlying `(byte index, char)` iterator over the source text.
    chars: str::CharIndices<'a>,
}
21
/// A span, designating a range of bytes where a token is located.
///
/// Offsets produced by `Tokenizer` include the tokenizer's `span_offset`, so
/// they are not necessarily direct indices into the tokenizer's input string.
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct Span {
    /// The start of the range.
    pub start: u32,
    /// The end of the range (exclusive).
    pub end: u32,
}
30
/// The kinds of token produced by `Tokenizer`.
///
/// A `Token` carries no text of its own; the text is recovered from the
/// accompanying `Span` (for example via `Tokenizer::get_span`).
#[derive(Eq, PartialEq, Debug, Copy, Clone)]
pub enum Token {
    // Trivia: returned by `Tokenizer::next_raw` but skipped by `Tokenizer::next`.
    Whitespace,
    Comment,

    // Punctuation and operators.
    Equals,
    Comma,
    Colon,
    Period,
    Semicolon,
    LeftParen,
    RightParen,
    LeftBrace,
    RightBrace,
    LessThan,
    GreaterThan,
    RArrow,
    Star,
    At,
    Slash,
    Plus,
    Minus,

    // Keywords. Variants with a trailing underscore avoid clashing with
    // commonly used Rust names (`String`, `Option`, `Result`, `From`).
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    F32,
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    String_,
    Option_,
    Result_,
    Future,
    Stream,
    ErrorContext,
    List,
    Underscore,
    As,
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,
    Async,

    // Identifiers: `Id` is any key-like word that is not a keyword, and
    // `ExplicitId` is a `%`-prefixed identifier.
    Id,
    ExplicitId,

    // A run of ASCII digits.
    Integer,

    // More keywords.
    Include,
    With,
}
104
/// Errors reported by the tokenizer and by identifier validation.
///
/// The leading `u32` of each tuple variant (and `at` in `Wanted`) is the
/// source offset the error is attributed to.
#[derive(Eq, PartialEq, Debug)]
// Not every variant/field is read within this file (e.g. `InvalidEscape` is
// never constructed here).
#[allow(dead_code)]
pub enum Error {
    /// An identifier contains a character that is not allowed at that position.
    InvalidCharInId(u32, char),
    /// An identifier is empty or has an empty `-`-separated part.
    IdPartEmpty(u32),
    /// An invalid escape was found in a string.
    InvalidEscape(u32, char),
    /// A character that cannot start any token was encountered.
    Unexpected(u32, char),
    /// A `/* ...` block comment was still open at end of input.
    UnterminatedComment(u32),
    /// A specific token was required but something else (or end of input)
    /// was found.
    Wanted {
        /// Offset of the token that was found, or of end of input.
        at: u32,
        /// Human-readable description of the required token.
        expected: &'static str,
        /// Human-readable description of what was found instead.
        found: &'static str,
    },
}
119
// NB: keep in sync with `crates/wit-component/src/printing.rs`.
/// Fallback for the tokenizer's `require_f32_f64` setting, used by
/// `Tokenizer::new` when the caller passes `None` and the
/// `WIT_REQUIRE_F32_F64` environment variable cannot be read.
const REQUIRE_F32_F64_BY_DEFAULT: bool = true;
122
123impl<'a> Tokenizer<'a> {
124    pub fn new(
125        input: &'a str,
126        span_offset: u32,
127        require_f32_f64: Option<bool>,
128    ) -> Result<Tokenizer<'a>> {
129        detect_invalid_input(input)?;
130
131        let mut t = Tokenizer {
132            input,
133            span_offset,
134            chars: CrlfFold {
135                chars: input.char_indices(),
136            },
137            require_f32_f64: require_f32_f64.unwrap_or_else(|| {
138                match std::env::var("WIT_REQUIRE_F32_F64") {
139                    Ok(s) => s == "1",
140                    Err(_) => REQUIRE_F32_F64_BY_DEFAULT,
141                }
142            }),
143        };
144        // Eat utf-8 BOM
145        t.eatc('\u{feff}');
146        Ok(t)
147    }
148
149    pub fn expect_semicolon(&mut self) -> Result<()> {
150        self.expect(Token::Semicolon)?;
151        Ok(())
152    }
153
154    pub fn get_span(&self, span: Span) -> &'a str {
155        let start = usize::try_from(span.start - self.span_offset).unwrap();
156        let end = usize::try_from(span.end - self.span_offset).unwrap();
157        &self.input[start..end]
158    }
159
160    pub fn parse_id(&self, span: Span) -> Result<&'a str> {
161        let ret = self.get_span(span);
162        validate_id(span.start, &ret)?;
163        Ok(ret)
164    }
165
166    pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> {
167        let token = self.get_span(span);
168        let id_part = token.strip_prefix('%').unwrap();
169        validate_id(span.start, id_part)?;
170        Ok(id_part)
171    }
172
173    pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> {
174        loop {
175            match self.next_raw()? {
176                Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {}
177                other => break Ok(other),
178            }
179        }
180    }
181
    /// Lexes the next token, including whitespace and comments.
    ///
    /// Three possibilities when calling this method: an `Err(...)` indicates that lexing failed, an
    /// `Ok(Some(...))` produces the next token, and `Ok(None)` indicates that there are no more
    /// tokens available.
    ///
    /// The returned `Span` has `span_offset` added to both of its ends.
    pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> {
        let (str_start, ch) = match self.chars.next() {
            Some(pair) => pair,
            None => return Ok(None),
        };
        let start = self.span_offset + u32::try_from(str_start).unwrap();
        let token = match ch {
            '\n' | '\t' | ' ' => {
                // Eat all contiguous whitespace tokens
                while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {}
                Whitespace
            }
            '/' => {
                // Eat a line comment if it's `//...`
                if self.eatc('/') {
                    // Consumes up to and including the terminating newline
                    // (or to end of input if there is none).
                    for (_, ch) in &mut self.chars {
                        if ch == '\n' {
                            break;
                        }
                    }
                    Comment
                // eat a block comment if it's `/*...`
                } else if self.eatc('*') {
                    // Block comments nest: every `/*` must be closed by its
                    // own `*/` before the comment ends.
                    let mut depth = 1;
                    while depth > 0 {
                        let (_, ch) = match self.chars.next() {
                            Some(pair) => pair,
                            None => return Err(Error::UnterminatedComment(start)),
                        };
                        match ch {
                            '/' if self.eatc('*') => depth += 1,
                            '*' if self.eatc('/') => depth -= 1,
                            _ => {}
                        }
                    }
                    Comment
                } else {
                    Slash
                }
            }
            '=' => Equals,
            ',' => Comma,
            ':' => Colon,
            '.' => Period,
            ';' => Semicolon,
            '(' => LeftParen,
            ')' => RightParen,
            '{' => LeftBrace,
            '}' => RightBrace,
            '<' => LessThan,
            '>' => GreaterThan,
            '*' => Star,
            '@' => At,
            '-' => {
                if self.eatc('>') {
                    RArrow
                } else {
                    Minus
                }
            }
            '+' => Plus,
            '%' => {
                // Lex on a clone and commit it to `self.chars` only after each
                // accepted character, so the first rejected character stays
                // unconsumed. A `%` not followed by a key-like start character
                // yields an `ExplicitId` covering just the `%`.
                let mut iter = self.chars.clone();
                if let Some((_, ch)) = iter.next() {
                    if is_keylike_start(ch) {
                        self.chars = iter.clone();
                        while let Some((_, ch)) = iter.next() {
                            if !is_keylike_continue(ch) {
                                break;
                            }
                            self.chars = iter.clone();
                        }
                    }
                }
                ExplicitId
            }
            ch if is_keylike_start(ch) => {
                // Bytes left after the first character; compared against the
                // bytes left after the loop to find how long the word is.
                let remaining = self.chars.chars.as_str().len();
                let mut iter = self.chars.clone();
                while let Some((_, ch)) = iter.next() {
                    if !is_keylike_continue(ch) {
                        break;
                    }
                    self.chars = iter.clone();
                }
                let str_end =
                    str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len());
                // Keywords are recognised by exact text; anything else that is
                // key-like is an `Id` (validity is checked later, not here).
                match &self.input[str_start..str_end] {
                    "use" => Use,
                    "type" => Type,
                    "func" => Func,
                    "u8" => U8,
                    "u16" => U16,
                    "u32" => U32,
                    "u64" => U64,
                    "s8" => S8,
                    "s16" => S16,
                    "s32" => S32,
                    "s64" => S64,
                    "f32" => F32,
                    "f64" => F64,
                    // Legacy float spellings, accepted only when not required
                    // to use `f32`/`f64`.
                    "float32" if !self.require_f32_f64 => F32,
                    "float64" if !self.require_f32_f64 => F64,
                    "char" => Char,
                    "resource" => Resource,
                    "own" => Own,
                    "borrow" => Borrow,
                    "record" => Record,
                    "flags" => Flags,
                    "variant" => Variant,
                    "enum" => Enum,
                    "bool" => Bool,
                    "string" => String_,
                    "option" => Option_,
                    "result" => Result_,
                    "future" => Future,
                    "stream" => Stream,
                    "error-context" => ErrorContext,
                    "list" => List,
                    "_" => Underscore,
                    "as" => As,
                    "from" => From_,
                    "static" => Static,
                    "interface" => Interface,
                    "tuple" => Tuple,
                    "world" => World,
                    "import" => Import,
                    "export" => Export,
                    "package" => Package,
                    "constructor" => Constructor,
                    "include" => Include,
                    "with" => With,
                    "async" => Async,
                    _ => Id,
                }
            }

            ch if ch.is_ascii_digit() => {
                // Consume the rest of the run of ASCII digits.
                let mut iter = self.chars.clone();
                while let Some((_, ch)) = iter.next() {
                    if !ch.is_ascii_digit() {
                        break;
                    }
                    self.chars = iter.clone();
                }

                Integer
            }

            ch => return Err(Error::Unexpected(start, ch)),
        };
        // The token ends where the next unconsumed character begins, or at
        // the end of the input if everything has been consumed.
        let end = match self.chars.clone().next() {
            Some((i, _)) => i,
            None => self.input.len(),
        };

        let end = self.span_offset + u32::try_from(end).unwrap();
        Ok(Some((Span { start, end }, token)))
    }
344
345    pub fn eat(&mut self, expected: Token) -> Result<bool, Error> {
346        let mut other = self.clone();
347        match other.next()? {
348            Some((_span, found)) if expected == found => {
349                *self = other;
350                Ok(true)
351            }
352            Some(_) => Ok(false),
353            None => Ok(false),
354        }
355    }
356
357    pub fn expect(&mut self, expected: Token) -> Result<Span, Error> {
358        match self.next()? {
359            Some((span, found)) => {
360                if expected == found {
361                    Ok(span)
362                } else {
363                    Err(Error::Wanted {
364                        at: span.start,
365                        expected: expected.describe(),
366                        found: found.describe(),
367                    })
368                }
369            }
370            None => Err(Error::Wanted {
371                at: self.span_offset + u32::try_from(self.input.len()).unwrap(),
372                expected: expected.describe(),
373                found: "eof",
374            }),
375        }
376    }
377
378    fn eatc(&mut self, ch: char) -> bool {
379        let mut iter = self.chars.clone();
380        match iter.next() {
381            Some((_, ch2)) if ch == ch2 => {
382                self.chars = iter;
383                true
384            }
385            _ => false,
386        }
387    }
388
389    pub fn eof_span(&self) -> Span {
390        let end = self.span_offset + u32::try_from(self.input.len()).unwrap();
391        Span { start: end, end }
392    }
393}
394
395impl<'a> Iterator for CrlfFold<'a> {
396    type Item = (usize, char);
397
398    fn next(&mut self) -> Option<(usize, char)> {
399        self.chars.next().map(|(i, c)| {
400            if c == '\r' {
401                let mut attempt = self.chars.clone();
402                if let Some((_, '\n')) = attempt.next() {
403                    self.chars = attempt;
404                    return (i, '\n');
405                }
406            }
407            (i, c)
408        })
409    }
410}
411
412fn detect_invalid_input(input: &str) -> Result<()> {
413    // Disallow specific codepoints.
414    let mut line = 1;
415    for ch in input.chars() {
416        match ch {
417            '\n' => line += 1,
418            '\r' | '\t' => {}
419
420            // Bidirectional override codepoints can be used to craft source code that
421            // appears to have a different meaning than its actual meaning. See
422            // [CVE-2021-42574] for background and motivation.
423            //
424            // [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
425            '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
426            | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
427                bail!(
428                    "Input contains bidirectional override codepoint {:?} at line {}",
429                    ch.escape_unicode(),
430                    line
431                );
432            }
433
434            // Disallow several characters which are deprecated or discouraged in Unicode.
435            //
436            // U+149 deprecated; see Unicode 13.0.0, sec. 7.1 Latin, Compatibility Digraphs.
437            // U+673 deprecated; see Unicode 13.0.0, sec. 9.2 Arabic, Additional Vowel Marks.
438            // U+F77 and U+F79 deprecated; see Unicode 13.0.0, sec. 13.4 Tibetan, Vowels.
439            // U+17A3 and U+17A4 deprecated, and U+17B4 and U+17B5 discouraged; see
440            // Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
441            '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
442            | '\u{17b4}' | '\u{17b5}' => {
443                bail!(
444                    "Codepoint {:?} at line {} is discouraged by Unicode",
445                    ch.escape_unicode(),
446                    line
447                );
448            }
449
450            // Disallow control codes other than the ones explicitly recognized above,
451            // so that viewing a wit file on a terminal doesn't have surprising side
452            // effects or appear to have a different meaning than its actual meaning.
453            ch if ch.is_control() => {
454                bail!("Control code '{}' at line {}", ch.escape_unicode(), line);
455            }
456
457            _ => {}
458        }
459    }
460
461    Ok(())
462}
463
464fn is_keylike_start(ch: char) -> bool {
465    // Lex any XID start, `_`, or '-'. These aren't all valid identifier chars,
466    // but we'll diagnose that after we've lexed the full string.
467    UnicodeXID::is_xid_start(ch) || ch == '_' || ch == '-'
468}
469
470fn is_keylike_continue(ch: char) -> bool {
471    // Lex any XID continue (which includes `_`) or '-'.
472    UnicodeXID::is_xid_continue(ch) || ch == '-'
473}
474
475pub fn validate_id(start: u32, id: &str) -> Result<(), Error> {
476    // IDs must have at least one part.
477    if id.is_empty() {
478        return Err(Error::IdPartEmpty(start));
479    }
480
481    // Ids consist of parts separated by '-'s.
482    for part in id.split('-') {
483        // Parts must be non-empty and contain either all ASCII lowercase or
484        // all ASCII uppercase.
485        let upper = match part.chars().next() {
486            None => return Err(Error::IdPartEmpty(start)),
487            Some(first) => {
488                if first.is_ascii_lowercase() {
489                    false
490                } else if first.is_ascii_uppercase() {
491                    true
492                } else {
493                    return Err(Error::InvalidCharInId(start, first));
494                }
495            }
496        };
497
498        for ch in part.chars() {
499            if ch.is_ascii_digit() {
500                // Digits are accepted in both uppercase and lowercase segments.
501            } else if upper {
502                if !ch.is_ascii_uppercase() {
503                    return Err(Error::InvalidCharInId(start, ch));
504                }
505            } else if !ch.is_ascii_lowercase() {
506                return Err(Error::InvalidCharInId(start, ch));
507            }
508        }
509    }
510
511    Ok(())
512}
513
514impl Token {
515    pub fn describe(&self) -> &'static str {
516        match self {
517            Whitespace => "whitespace",
518            Comment => "a comment",
519            Equals => "'='",
520            Comma => "','",
521            Colon => "':'",
522            Period => "'.'",
523            Semicolon => "';'",
524            LeftParen => "'('",
525            RightParen => "')'",
526            LeftBrace => "'{'",
527            RightBrace => "'}'",
528            LessThan => "'<'",
529            GreaterThan => "'>'",
530            Use => "keyword `use`",
531            Type => "keyword `type`",
532            Func => "keyword `func`",
533            U8 => "keyword `u8`",
534            U16 => "keyword `u16`",
535            U32 => "keyword `u32`",
536            U64 => "keyword `u64`",
537            S8 => "keyword `s8`",
538            S16 => "keyword `s16`",
539            S32 => "keyword `s32`",
540            S64 => "keyword `s64`",
541            F32 => "keyword `f32`",
542            F64 => "keyword `f64`",
543            Char => "keyword `char`",
544            Own => "keyword `own`",
545            Borrow => "keyword `borrow`",
546            Resource => "keyword `resource`",
547            Record => "keyword `record`",
548            Flags => "keyword `flags`",
549            Variant => "keyword `variant`",
550            Enum => "keyword `enum`",
551            Bool => "keyword `bool`",
552            String_ => "keyword `string`",
553            Option_ => "keyword `option`",
554            Result_ => "keyword `result`",
555            Future => "keyword `future`",
556            Stream => "keyword `stream`",
557            ErrorContext => "keyword `error-context`",
558            List => "keyword `list`",
559            Underscore => "keyword `_`",
560            Id => "an identifier",
561            ExplicitId => "an '%' identifier",
562            RArrow => "`->`",
563            Star => "`*`",
564            At => "`@`",
565            Slash => "`/`",
566            Plus => "`+`",
567            Minus => "`-`",
568            As => "keyword `as`",
569            From_ => "keyword `from`",
570            Static => "keyword `static`",
571            Interface => "keyword `interface`",
572            Tuple => "keyword `tuple`",
573            Import => "keyword `import`",
574            Export => "keyword `export`",
575            World => "keyword `world`",
576            Package => "keyword `package`",
577            Constructor => "keyword `constructor`",
578            Integer => "an integer",
579            Include => "keyword `include`",
580            With => "keyword `with`",
581            Async => "keyword `async`",
582        }
583    }
584}
585
586impl std::error::Error for Error {}
587
588impl fmt::Display for Error {
589    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
590        match self {
591            Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
592            Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
593            Error::Wanted {
594                expected, found, ..
595            } => write!(f, "expected {expected}, found {found}"),
596            Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"),
597            Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
598            Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"),
599        }
600    }
601}
602
#[test]
fn test_validate_id() {
    // Identifiers that must be accepted.
    let accepted = [
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ];
    for &id in &accepted {
        validate_id(0, id).unwrap();
    }

    // Identifiers that must be rejected.
    let rejected = [
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "apple-0",
        "()()",
        "",
        "*",
        "apple\u{5f3}pear",
        "apple\u{200c}pear",
        "apple\u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
    ];
    for &id in &rejected {
        assert!(validate_id(0, id).is_err());
    }

    // Rejections that carry an explanatory message.
    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
660
#[test]
fn test_tokenizer() {
    /// Lexes `src` completely and returns the kinds of its non-trivia tokens.
    fn lex(src: &str) -> Result<Vec<Token>> {
        let mut tokenizer = Tokenizer::new(src, 0, None)?;
        let mut kinds = Vec::new();
        while let Some((_span, kind)) = tokenizer.next()? {
            kinds.push(kind);
        }
        Ok(kinds)
    }

    assert_eq!(lex("").unwrap(), vec![]);
    assert_eq!(lex("_").unwrap(), vec![Token::Underscore]);

    // Anything key-like that is not a keyword lexes as one `Id`, whether or
    // not it would later pass identifier validation.
    let single_ids = [
        "apple",
        "apple-pear",
        "apple--pear",
        "apple-Pear",
        "apple-pear-grape",
        "_a_p_p_l_e_",
        "garçon",
        "hühnervögel",
        "москва",
        "東京",
        "garçon-hühnervögel-москва-東京",
        "a0",
        "a",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ];
    for &src in &single_ids {
        assert_eq!(lex(src).unwrap(), vec![Token::Id]);
    }
    assert_eq!(lex("apple pear").unwrap(), vec![Token::Id, Token::Id]);

    // `%`-prefixed words (including a bare `%`) lex as one `ExplicitId`.
    let explicit_ids = ["%a", "%a-a", "%bool", "%"];
    for &src in &explicit_ids {
        assert_eq!(lex(src).unwrap(), vec![Token::ExplicitId]);
    }

    // Keywords on their own and in context.
    assert_eq!(lex("func").unwrap(), vec![Token::Func]);
    assert_eq!(
        lex("a: func()").unwrap(),
        vec![
            Token::Id,
            Token::Colon,
            Token::Func,
            Token::LeftParen,
            Token::RightParen
        ]
    );

    assert_eq!(lex("resource").unwrap(), vec![Token::Resource]);

    assert_eq!(lex("own").unwrap(), vec![Token::Own]);
    assert_eq!(
        lex("own<some-id>").unwrap(),
        vec![Token::Own, Token::LessThan, Token::Id, Token::GreaterThan]
    );

    assert_eq!(lex("borrow").unwrap(), vec![Token::Borrow]);
    assert_eq!(
        lex("borrow<some-id>").unwrap(),
        vec![
            Token::Borrow,
            Token::LessThan,
            Token::Id,
            Token::GreaterThan
        ]
    );

    // Inputs that must be refused, paired with the reason.
    let refused = [
        ("\u{149}", "strongly discouraged"),
        ("\u{673}", "strongly discouraged"),
        ("\u{17a3}", "strongly discouraged"),
        ("\u{17a4}", "strongly discouraged"),
        ("\u{202a}", "bidirectional override"),
        ("\u{2068}", "bidirectional override"),
        ("\u{0}", "control code"),
        ("\u{b}", "control code"),
        ("\u{c}", "control code"),
        ("\u{85}", "control code"),
    ];
    for &(src, reason) in &refused {
        assert!(lex(src).is_err(), "{}", reason);
    }
}