1use anyhow::{bail, Result};
2use std::char;
3use std::fmt;
4use std::str;
5use unicode_xid::UnicodeXID;
6
7use self::Token::*;
8
9#[derive(Clone)]
10pub struct Tokenizer<'a> {
11 input: &'a str,
12 span_offset: u32,
13 chars: CrlfFold<'a>,
14 require_f32_f64: bool,
15}
16
/// Character cursor whose `Iterator` implementation reports a `\r\n` pair as
/// one `\n`, so the lexer only ever has to recognise `\n` as a line break.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte index, char)` cursor over the source text.
    chars: str::CharIndices<'a>,
}
21
22#[derive(Eq, PartialEq, Debug, Clone, Copy)]
24pub struct Span {
25 pub start: u32,
27 pub end: u32,
29}
30
/// The kind of a lexed token.
///
/// A token carries no text of its own; the text is recovered from the
/// accompanying `Span`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Token {
    // Trivia: produced by `Tokenizer::next_raw`, skipped by `Tokenizer::next`.
    /// A run of spaces, tabs and newlines.
    Whitespace,
    /// A `//` line comment or a (nestable) `/* ... */` block comment.
    Comment,

    // Punctuation and operators.
    /// `=`
    Equals,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `.`
    Period,
    /// `;`
    Semicolon,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `<`
    LessThan,
    /// `>`
    GreaterThan,
    /// `->`
    RArrow,
    /// `*`
    Star,
    /// `@`
    At,
    /// `/` (when it does not open a comment)
    Slash,
    /// `+`
    Plus,
    /// `-` (when not followed by `>`)
    Minus,

    // Keywords. Each variant is the keyword of the same name in lower case;
    // a trailing underscore only keeps the variant name distinct from a
    // well-known Rust name.
    Use,
    Type,
    Func,
    U8,
    U16,
    U32,
    U64,
    S8,
    S16,
    S32,
    S64,
    /// `f32` (and `float32` when that spelling is enabled).
    F32,
    /// `f64` (and `float64` when that spelling is enabled).
    F64,
    Char,
    Record,
    Resource,
    Own,
    Borrow,
    Flags,
    Variant,
    Enum,
    Bool,
    /// The keyword `string`.
    String_,
    /// The keyword `option`.
    Option_,
    /// The keyword `result`.
    Result_,
    Future,
    Stream,
    /// The keyword `error-context`.
    ErrorContext,
    List,
    /// The keyword `_`.
    Underscore,
    As,
    /// The keyword `from`.
    From_,
    Static,
    Interface,
    Tuple,
    Import,
    Export,
    World,
    Package,
    Constructor,
    Async,

    // Identifiers.
    /// Any other identifier-like word; not yet validated.
    Id,
    /// A `%`-prefixed identifier (a bare `%` also lexes as this).
    ExplicitId,

    /// A run of ASCII digits.
    Integer,

    // Further keywords.
    Include,
    With,
}
104
/// Errors reported by the tokenizer and by identifier validation.
///
/// The leading `u32` (or `at`) of every variant is the position the error is
/// attributed to, in the same coordinates as `Span`.
// `dead_code` is allowed because not every variant is constructed in this
// file (for example `InvalidEscape`).
#[derive(Debug, PartialEq, Eq)]
#[allow(dead_code)]
pub enum Error {
    /// An identifier contains a character that is not permitted there.
    InvalidCharInId(u32, char),
    /// An identifier is empty or has an empty part between `-` separators.
    IdPartEmpty(u32),
    /// An invalid escape character was found in a string.
    InvalidEscape(u32, char),
    /// A character that cannot start any token.
    Unexpected(u32, char),
    /// A `/*` block comment was still open at end of input.
    UnterminatedComment(u32),
    /// A specific token was required but something else was found.
    Wanted {
        /// Position of the offending token (or of end of input).
        at: u32,
        /// Description of the token that was required.
        expected: &'static str,
        /// Description of what was found instead (`"eof"` at end of input).
        found: &'static str,
    },
}
119
/// Fallback for `Tokenizer::new` when the caller passes `None` and the
/// `WIT_REQUIRE_F32_F64` environment variable cannot be read: `true` means
/// `float32`/`float64` are not accepted as keywords.
const REQUIRE_F32_F64_BY_DEFAULT: bool = true;
122
123impl<'a> Tokenizer<'a> {
124 pub fn new(
125 input: &'a str,
126 span_offset: u32,
127 require_f32_f64: Option<bool>,
128 ) -> Result<Tokenizer<'a>> {
129 detect_invalid_input(input)?;
130
131 let mut t = Tokenizer {
132 input,
133 span_offset,
134 chars: CrlfFold {
135 chars: input.char_indices(),
136 },
137 require_f32_f64: require_f32_f64.unwrap_or_else(|| {
138 match std::env::var("WIT_REQUIRE_F32_F64") {
139 Ok(s) => s == "1",
140 Err(_) => REQUIRE_F32_F64_BY_DEFAULT,
141 }
142 }),
143 };
144 t.eatc('\u{feff}');
146 Ok(t)
147 }
148
149 pub fn expect_semicolon(&mut self) -> Result<()> {
150 self.expect(Token::Semicolon)?;
151 Ok(())
152 }
153
154 pub fn get_span(&self, span: Span) -> &'a str {
155 let start = usize::try_from(span.start - self.span_offset).unwrap();
156 let end = usize::try_from(span.end - self.span_offset).unwrap();
157 &self.input[start..end]
158 }
159
160 pub fn parse_id(&self, span: Span) -> Result<&'a str> {
161 let ret = self.get_span(span);
162 validate_id(span.start, &ret)?;
163 Ok(ret)
164 }
165
166 pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> {
167 let token = self.get_span(span);
168 let id_part = token.strip_prefix('%').unwrap();
169 validate_id(span.start, id_part)?;
170 Ok(id_part)
171 }
172
173 pub fn next(&mut self) -> Result<Option<(Span, Token)>, Error> {
174 loop {
175 match self.next_raw()? {
176 Some((_, Token::Whitespace)) | Some((_, Token::Comment)) => {}
177 other => break Ok(other),
178 }
179 }
180 }
181
182 pub fn next_raw(&mut self) -> Result<Option<(Span, Token)>, Error> {
186 let (str_start, ch) = match self.chars.next() {
187 Some(pair) => pair,
188 None => return Ok(None),
189 };
190 let start = self.span_offset + u32::try_from(str_start).unwrap();
191 let token = match ch {
192 '\n' | '\t' | ' ' => {
193 while self.eatc(' ') || self.eatc('\t') || self.eatc('\n') {}
195 Whitespace
196 }
197 '/' => {
198 if self.eatc('/') {
200 for (_, ch) in &mut self.chars {
201 if ch == '\n' {
202 break;
203 }
204 }
205 Comment
206 } else if self.eatc('*') {
208 let mut depth = 1;
209 while depth > 0 {
210 let (_, ch) = match self.chars.next() {
211 Some(pair) => pair,
212 None => return Err(Error::UnterminatedComment(start)),
213 };
214 match ch {
215 '/' if self.eatc('*') => depth += 1,
216 '*' if self.eatc('/') => depth -= 1,
217 _ => {}
218 }
219 }
220 Comment
221 } else {
222 Slash
223 }
224 }
225 '=' => Equals,
226 ',' => Comma,
227 ':' => Colon,
228 '.' => Period,
229 ';' => Semicolon,
230 '(' => LeftParen,
231 ')' => RightParen,
232 '{' => LeftBrace,
233 '}' => RightBrace,
234 '<' => LessThan,
235 '>' => GreaterThan,
236 '*' => Star,
237 '@' => At,
238 '-' => {
239 if self.eatc('>') {
240 RArrow
241 } else {
242 Minus
243 }
244 }
245 '+' => Plus,
246 '%' => {
247 let mut iter = self.chars.clone();
248 if let Some((_, ch)) = iter.next() {
249 if is_keylike_start(ch) {
250 self.chars = iter.clone();
251 while let Some((_, ch)) = iter.next() {
252 if !is_keylike_continue(ch) {
253 break;
254 }
255 self.chars = iter.clone();
256 }
257 }
258 }
259 ExplicitId
260 }
261 ch if is_keylike_start(ch) => {
262 let remaining = self.chars.chars.as_str().len();
263 let mut iter = self.chars.clone();
264 while let Some((_, ch)) = iter.next() {
265 if !is_keylike_continue(ch) {
266 break;
267 }
268 self.chars = iter.clone();
269 }
270 let str_end =
271 str_start + ch.len_utf8() + (remaining - self.chars.chars.as_str().len());
272 match &self.input[str_start..str_end] {
273 "use" => Use,
274 "type" => Type,
275 "func" => Func,
276 "u8" => U8,
277 "u16" => U16,
278 "u32" => U32,
279 "u64" => U64,
280 "s8" => S8,
281 "s16" => S16,
282 "s32" => S32,
283 "s64" => S64,
284 "f32" => F32,
285 "f64" => F64,
286 "float32" if !self.require_f32_f64 => F32,
287 "float64" if !self.require_f32_f64 => F64,
288 "char" => Char,
289 "resource" => Resource,
290 "own" => Own,
291 "borrow" => Borrow,
292 "record" => Record,
293 "flags" => Flags,
294 "variant" => Variant,
295 "enum" => Enum,
296 "bool" => Bool,
297 "string" => String_,
298 "option" => Option_,
299 "result" => Result_,
300 "future" => Future,
301 "stream" => Stream,
302 "error-context" => ErrorContext,
303 "list" => List,
304 "_" => Underscore,
305 "as" => As,
306 "from" => From_,
307 "static" => Static,
308 "interface" => Interface,
309 "tuple" => Tuple,
310 "world" => World,
311 "import" => Import,
312 "export" => Export,
313 "package" => Package,
314 "constructor" => Constructor,
315 "include" => Include,
316 "with" => With,
317 "async" => Async,
318 _ => Id,
319 }
320 }
321
322 ch if ch.is_ascii_digit() => {
323 let mut iter = self.chars.clone();
324 while let Some((_, ch)) = iter.next() {
325 if !ch.is_ascii_digit() {
326 break;
327 }
328 self.chars = iter.clone();
329 }
330
331 Integer
332 }
333
334 ch => return Err(Error::Unexpected(start, ch)),
335 };
336 let end = match self.chars.clone().next() {
337 Some((i, _)) => i,
338 None => self.input.len(),
339 };
340
341 let end = self.span_offset + u32::try_from(end).unwrap();
342 Ok(Some((Span { start, end }, token)))
343 }
344
345 pub fn eat(&mut self, expected: Token) -> Result<bool, Error> {
346 let mut other = self.clone();
347 match other.next()? {
348 Some((_span, found)) if expected == found => {
349 *self = other;
350 Ok(true)
351 }
352 Some(_) => Ok(false),
353 None => Ok(false),
354 }
355 }
356
357 pub fn expect(&mut self, expected: Token) -> Result<Span, Error> {
358 match self.next()? {
359 Some((span, found)) => {
360 if expected == found {
361 Ok(span)
362 } else {
363 Err(Error::Wanted {
364 at: span.start,
365 expected: expected.describe(),
366 found: found.describe(),
367 })
368 }
369 }
370 None => Err(Error::Wanted {
371 at: self.span_offset + u32::try_from(self.input.len()).unwrap(),
372 expected: expected.describe(),
373 found: "eof",
374 }),
375 }
376 }
377
378 fn eatc(&mut self, ch: char) -> bool {
379 let mut iter = self.chars.clone();
380 match iter.next() {
381 Some((_, ch2)) if ch == ch2 => {
382 self.chars = iter;
383 true
384 }
385 _ => false,
386 }
387 }
388
389 pub fn eof_span(&self) -> Span {
390 let end = self.span_offset + u32::try_from(self.input.len()).unwrap();
391 Span { start: end, end }
392 }
393}
394
395impl<'a> Iterator for CrlfFold<'a> {
396 type Item = (usize, char);
397
398 fn next(&mut self) -> Option<(usize, char)> {
399 self.chars.next().map(|(i, c)| {
400 if c == '\r' {
401 let mut attempt = self.chars.clone();
402 if let Some((_, '\n')) = attempt.next() {
403 self.chars = attempt;
404 return (i, '\n');
405 }
406 }
407 (i, c)
408 })
409 }
410}
411
412fn detect_invalid_input(input: &str) -> Result<()> {
413 let mut line = 1;
415 for ch in input.chars() {
416 match ch {
417 '\n' => line += 1,
418 '\r' | '\t' => {}
419
420 '\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
426 | '\u{2067}' | '\u{2068}' | '\u{2069}' => {
427 bail!(
428 "Input contains bidirectional override codepoint {:?} at line {}",
429 ch.escape_unicode(),
430 line
431 );
432 }
433
434 '\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
442 | '\u{17b4}' | '\u{17b5}' => {
443 bail!(
444 "Codepoint {:?} at line {} is discouraged by Unicode",
445 ch.escape_unicode(),
446 line
447 );
448 }
449
450 ch if ch.is_control() => {
454 bail!("Control code '{}' at line {}", ch.escape_unicode(), line);
455 }
456
457 _ => {}
458 }
459 }
460
461 Ok(())
462}
463
464fn is_keylike_start(ch: char) -> bool {
465 UnicodeXID::is_xid_start(ch) || ch == '_' || ch == '-'
468}
469
470fn is_keylike_continue(ch: char) -> bool {
471 UnicodeXID::is_xid_continue(ch) || ch == '-'
473}
474
475pub fn validate_id(start: u32, id: &str) -> Result<(), Error> {
476 if id.is_empty() {
478 return Err(Error::IdPartEmpty(start));
479 }
480
481 for part in id.split('-') {
483 let upper = match part.chars().next() {
486 None => return Err(Error::IdPartEmpty(start)),
487 Some(first) => {
488 if first.is_ascii_lowercase() {
489 false
490 } else if first.is_ascii_uppercase() {
491 true
492 } else {
493 return Err(Error::InvalidCharInId(start, first));
494 }
495 }
496 };
497
498 for ch in part.chars() {
499 if ch.is_ascii_digit() {
500 } else if upper {
502 if !ch.is_ascii_uppercase() {
503 return Err(Error::InvalidCharInId(start, ch));
504 }
505 } else if !ch.is_ascii_lowercase() {
506 return Err(Error::InvalidCharInId(start, ch));
507 }
508 }
509 }
510
511 Ok(())
512}
513
514impl Token {
515 pub fn describe(&self) -> &'static str {
516 match self {
517 Whitespace => "whitespace",
518 Comment => "a comment",
519 Equals => "'='",
520 Comma => "','",
521 Colon => "':'",
522 Period => "'.'",
523 Semicolon => "';'",
524 LeftParen => "'('",
525 RightParen => "')'",
526 LeftBrace => "'{'",
527 RightBrace => "'}'",
528 LessThan => "'<'",
529 GreaterThan => "'>'",
530 Use => "keyword `use`",
531 Type => "keyword `type`",
532 Func => "keyword `func`",
533 U8 => "keyword `u8`",
534 U16 => "keyword `u16`",
535 U32 => "keyword `u32`",
536 U64 => "keyword `u64`",
537 S8 => "keyword `s8`",
538 S16 => "keyword `s16`",
539 S32 => "keyword `s32`",
540 S64 => "keyword `s64`",
541 F32 => "keyword `f32`",
542 F64 => "keyword `f64`",
543 Char => "keyword `char`",
544 Own => "keyword `own`",
545 Borrow => "keyword `borrow`",
546 Resource => "keyword `resource`",
547 Record => "keyword `record`",
548 Flags => "keyword `flags`",
549 Variant => "keyword `variant`",
550 Enum => "keyword `enum`",
551 Bool => "keyword `bool`",
552 String_ => "keyword `string`",
553 Option_ => "keyword `option`",
554 Result_ => "keyword `result`",
555 Future => "keyword `future`",
556 Stream => "keyword `stream`",
557 ErrorContext => "keyword `error-context`",
558 List => "keyword `list`",
559 Underscore => "keyword `_`",
560 Id => "an identifier",
561 ExplicitId => "an '%' identifier",
562 RArrow => "`->`",
563 Star => "`*`",
564 At => "`@`",
565 Slash => "`/`",
566 Plus => "`+`",
567 Minus => "`-`",
568 As => "keyword `as`",
569 From_ => "keyword `from`",
570 Static => "keyword `static`",
571 Interface => "keyword `interface`",
572 Tuple => "keyword `tuple`",
573 Import => "keyword `import`",
574 Export => "keyword `export`",
575 World => "keyword `world`",
576 Package => "keyword `package`",
577 Constructor => "keyword `constructor`",
578 Integer => "an integer",
579 Include => "keyword `include`",
580 With => "keyword `with`",
581 Async => "keyword `async`",
582 }
583 }
584}
585
586impl std::error::Error for Error {}
587
588impl fmt::Display for Error {
589 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
590 match self {
591 Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
592 Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
593 Error::Wanted {
594 expected, found, ..
595 } => write!(f, "expected {expected}, found {found}"),
596 Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"),
597 Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
598 Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"),
599 }
600 }
601}
602
/// Exercises `validate_id` with identifiers that must be accepted and
/// strings that must be rejected.
#[test]
fn test_validate_id() {
    const VALID: &[&str] = &[
        "apple",
        "apple-pear",
        "apple-pear-grape",
        "a0",
        "a",
        "a-a",
        "bool",
        "APPLE",
        "APPLE-PEAR",
        "APPLE-PEAR-GRAPE",
        "apple-PEAR-grape",
        "APPLE-pear-GRAPE",
        "ENOENT",
        "is-XML",
    ];
    const INVALID: &[&str] = &[
        "",
        "0",
        "%",
        "$",
        "0a",
        ".",
        "·",
        "a a",
        "_",
        "-",
        "a-",
        "-a",
        "Apple",
        "applE",
        "-apple-pear",
        "apple-pear-",
        "apple_pear",
        "apple.pear",
        "apple pear",
        "apple/pear",
        "apple|pear",
        "apple-Pear",
        "apple-0",
        "()()",
        "",
        "*",
        "apple\u{5f3}pear",
        "apple\u{200c}pear",
        "apple\u{200d}pear",
        "apple--pear",
        "_apple",
        "apple_",
        "_Znwj",
        "__i386",
        "__i386__",
        "Москва",
        "garçon-hühnervögel-Москва-東京",
    ];

    for &id in VALID {
        validate_id(0, id).unwrap_or_else(|err| panic!("{id:?} should be valid: {err:?}"));
    }
    for &id in INVALID {
        assert!(validate_id(0, id).is_err(), "{id:?} should be rejected");
    }

    assert!(validate_id(0, "😼").is_err(), "non-identifier");
    assert!(validate_id(0, "\u{212b}").is_err(), "non-ascii");
}
660
/// Exercises the tokenizer end to end: token classification for accepted
/// inputs, and rejection of disallowed codepoints.
#[test]
fn test_tokenizer() {
    /// Lexes `s` completely and returns the significant token kinds.
    fn collect(s: &str) -> Result<Vec<Token>> {
        let mut lexer = Tokenizer::new(s, 0, None)?;
        let mut kinds = Vec::new();
        while let Some((_, kind)) = lexer.next()? {
            kinds.push(kind);
        }
        Ok(kinds)
    }

    let accepted: &[(&str, &[Token])] = &[
        ("", &[]),
        ("_", &[Token::Underscore]),
        ("apple", &[Token::Id]),
        ("apple-pear", &[Token::Id]),
        ("apple--pear", &[Token::Id]),
        ("apple-Pear", &[Token::Id]),
        ("apple-pear-grape", &[Token::Id]),
        ("apple pear", &[Token::Id, Token::Id]),
        ("_a_p_p_l_e_", &[Token::Id]),
        ("garçon", &[Token::Id]),
        ("hühnervögel", &[Token::Id]),
        ("москва", &[Token::Id]),
        ("東京", &[Token::Id]),
        ("garçon-hühnervögel-москва-東京", &[Token::Id]),
        ("a0", &[Token::Id]),
        ("a", &[Token::Id]),
        ("%a", &[Token::ExplicitId]),
        ("%a-a", &[Token::ExplicitId]),
        ("%bool", &[Token::ExplicitId]),
        ("%", &[Token::ExplicitId]),
        ("APPLE", &[Token::Id]),
        ("APPLE-PEAR", &[Token::Id]),
        ("APPLE-PEAR-GRAPE", &[Token::Id]),
        ("apple-PEAR-grape", &[Token::Id]),
        ("APPLE-pear-GRAPE", &[Token::Id]),
        ("ENOENT", &[Token::Id]),
        ("is-XML", &[Token::Id]),
        ("func", &[Token::Func]),
        (
            "a: func()",
            &[
                Token::Id,
                Token::Colon,
                Token::Func,
                Token::LeftParen,
                Token::RightParen,
            ],
        ),
        ("resource", &[Token::Resource]),
        ("own", &[Token::Own]),
        (
            "own<some-id>",
            &[Token::Own, Token::LessThan, Token::Id, Token::GreaterThan],
        ),
        ("borrow", &[Token::Borrow]),
        (
            "borrow<some-id>",
            &[
                Token::Borrow,
                Token::LessThan,
                Token::Id,
                Token::GreaterThan,
            ],
        ),
    ];
    for &(src, expected) in accepted {
        assert_eq!(collect(src).unwrap(), expected, "tokens for {src:?}");
    }

    let rejected: &[(&str, &str)] = &[
        ("\u{149}", "strongly discouraged"),
        ("\u{673}", "strongly discouraged"),
        ("\u{17a3}", "strongly discouraged"),
        ("\u{17a4}", "strongly discouraged"),
        ("\u{202a}", "bidirectional override"),
        ("\u{2068}", "bidirectional override"),
        ("\u{0}", "control code"),
        ("\u{b}", "control code"),
        ("\u{c}", "control code"),
        ("\u{85}", "control code"),
    ];
    for &(src, reason) in rejected {
        assert!(collect(src).is_err(), "{reason}");
    }
}