icu_collections/char16trie/trie.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use zerofrom::ZeroFrom;
6use zerovec::{ZeroSlice, ZeroVec};
7
8// Match-node lead unit values, after masking off intermediate-value bits:
9
10// 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
11// the length is one more than the next byte.
12
13// For a branch sub-node with at most this many entries, we drop down
14// to a linear search.
15const MAX_BRANCH_LINEAR_SUB_NODE_LENGTH: usize = 5;
16
17// 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
18const MIN_LINEAR_MATCH: u16 = 0x30;
19const MAX_LINEAR_MATCH_LENGTH: u16 = 0x10;
20
21// Match-node lead unit bits 14..6 for the optional intermediate value.
22// If these bits are 0, then there is no intermediate value.
23// Otherwise, see the *NodeValue* constants below.
24const MIN_VALUE_LEAD: u16 = MIN_LINEAR_MATCH + MAX_LINEAR_MATCH_LENGTH; // 0x40
25const NODE_TYPE_MASK: u16 = MIN_VALUE_LEAD - 1; // 0x003f
26
27// A final-value node has bit 15 set.
28const VALUE_IS_FINAL: u16 = 0x8000;
29
30// Compact value: After testing bit 0, shift right by 15 and then use the following thresholds.
31const MAX_ONE_UNIT_VALUE: u16 = 0x3fff;
32
33const MIN_TWO_UNIT_VALUE_LEAD: u16 = MAX_ONE_UNIT_VALUE + 1; // 0x4000
34
35const MAX_ONE_UNIT_NODE_VALUE: u16 = 0xff;
36
37const MIN_TWO_UNIT_NODE_VALUE_LEAD: u16 = MIN_VALUE_LEAD + ((MAX_ONE_UNIT_NODE_VALUE + 1) << 6); // 0x4040
38
39const THREE_UNIT_NODE_VALUE_LEAD: u16 = 0x7fc0;
40
41const THREE_UNIT_VALUE_LEAD: u16 = 0x7fff;
42
43// Compact delta integers.
44const MAX_ONE_UNIT_DELTA: u16 = 0xfbff;
45const MIN_TWO_UNIT_DELTA_LEAD: u16 = MAX_ONE_UNIT_DELTA + 1; // 0xfc00
46const THREE_UNIT_DELTA_LEAD: u16 = 0xffff;
47
48fn skip_value(pos: usize, lead: u16) -> usize {
49 if lead < MIN_TWO_UNIT_VALUE_LEAD {
50 pos
51 } else if lead < THREE_UNIT_VALUE_LEAD {
52 pos + 1
53 } else {
54 pos + 2
55 }
56}
57
58fn skip_node_value(pos: usize, lead: u16) -> usize {
59 if lead < MIN_TWO_UNIT_NODE_VALUE_LEAD {
60 pos
61 } else if lead < THREE_UNIT_NODE_VALUE_LEAD {
62 pos + 1
63 } else {
64 pos + 2
65 }
66}
67
68/// This struct represents a de-serialized `Char16Trie` that was exported from
69/// ICU binary data.
70///
71/// Light-weight, non-const reader class for a `CharsTrie`. Traverses a
72/// char-serialized data structure with minimal state, for mapping 16-bit-unit
73/// sequences to non-negative integer values.
74///
75/// For more information:
76/// - [ICU4C UCharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UCharsTrie.html)
77/// - [ICU4J CharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/util/CharsTrie.html) API.
78#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
79#[cfg_attr(feature = "databake", derive(databake::Bake))]
80#[cfg_attr(feature = "databake", databake(path = icu_collections::char16trie))]
81#[derive(Clone, Debug, PartialEq, Eq, ZeroFrom)]
82pub struct Char16Trie<'data> {
83 /// An array of u16 containing the trie data.
84 #[cfg_attr(feature = "serde", serde(borrow))]
85 #[doc(hidden)] // #2417
86 pub data: ZeroVec<'data, u16>,
87}
88
89impl<'data> Char16Trie<'data> {
90 /// Returns a new [`Char16Trie`] with ownership of the provided data.
91 #[inline]
92 pub fn new(data: ZeroVec<'data, u16>) -> Self {
93 Self { data }
94 }
95
96 /// Returns a new [`Char16TrieIterator`] backed by borrowed data from the `trie` data
97 #[inline]
98 pub fn iter(&self) -> Char16TrieIterator<'_> {
99 Char16TrieIterator::new(&self.data)
100 }
101}
102
103/// This struct represents an iterator over a [`Char16Trie`].
104#[derive(Clone)]
105pub struct Char16TrieIterator<'a> {
106 /// A reference to the Char16Trie data to iterate over.
107 trie: &'a ZeroSlice<u16>,
108 /// Index of next trie unit to read, or `None` if there are no more matches.
109 pos: Option<usize>,
110 /// Remaining length of a linear-match node, minus 1, or `None` if not in
111 /// such a node.
112 remaining_match_length: Option<usize>,
113}
114
115/// An enum representing the return value from a lookup in [`Char16Trie`].
116#[derive(Clone, Copy, Debug, PartialEq)]
117pub enum TrieResult {
118 /// The input unit(s) did not continue a matching string.
119 /// Once `next()` returns `TrieResult::NoMatch`, all further calls to `next()`
120 /// will also return `TrieResult::NoMatch`.
121 NoMatch,
122 /// The input unit(s) matched a string but there is no value for the string
123 /// so far. (It is a prefix of a longer string.)
124 NoValue,
125 /// The input unit(s) continued a matching string and there is a value for
126 /// the string so far. No further input byte/unit can continue a matching
127 /// string.
128 FinalValue(i32),
129 /// The input unit(s) continued a matching string and there is a value for
130 /// the string so far. Another input byte/unit can continue a matching
131 /// string.
132 Intermediate(i32),
133}
134
135// Get the lead surrogate (0xd800..0xdbff) for a
136// supplementary code point (0x10000..0x10ffff).
137// @param supplementary 32-bit code point (U+10000..U+10ffff)
138// @return lead surrogate (U+d800..U+dbff) for supplementary
139fn u16_lead(supplementary: i32) -> u16 {
140 (((supplementary) >> 10) + 0xd7c0) as u16
141}
142
143// Get the trail surrogate (0xdc00..0xdfff) for a
144// supplementary code point (0x10000..0x10ffff).
145// @param supplementary 32-bit code point (U+10000..U+10ffff)
146// @return trail surrogate (U+dc00..U+dfff) for supplementary
147fn u16_tail(supplementary: i32) -> u16 {
148 (((supplementary) & 0x3ff) | 0xdc00) as u16
149}
150
151/// A macro that takes an `Option` argument and either unwraps it if it has a value or
152/// causes the function to return `TrieResult::NoMatch` if there is no value.
153/// This could perhaps be done with `std::ops::Try` once stabilized.
154macro_rules! trie_unwrap {
155 ($option:expr) => {
156 match $option {
157 Some(x) => x,
158 None => {
159 // Unexpected
160 debug_assert!(false);
161 return TrieResult::NoMatch;
162 }
163 }
164 };
165}
166
167impl<'a> Char16TrieIterator<'a> {
168 /// Returns a new [`Char16TrieIterator`] backed by borrowed data for the `trie` array
169 #[inline]
170 pub fn new(trie: &'a ZeroSlice<u16>) -> Self {
171 Self {
172 trie,
173 pos: Some(0),
174 remaining_match_length: None,
175 }
176 }
177
178 /// Traverses the trie from the current state for this input char.
179 ///
180 /// # Examples
181 ///
182 /// ```
183 /// use icu::collections::char16trie::{Char16Trie, TrieResult};
184 /// use zerovec::ZeroVec;
185 ///
186 /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
187 /// let trie_data = [48, 97, 176, 98, 32868];
188 /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
189 ///
190 /// let mut iter = trie.iter();
191 /// let res = iter.next('a');
192 /// assert_eq!(res, TrieResult::Intermediate(1));
193 /// let res = iter.next('b');
194 /// assert_eq!(res, TrieResult::FinalValue(100));
195 /// let res = iter.next('c');
196 /// assert_eq!(res, TrieResult::NoMatch);
197 /// ```
198 pub fn next(&mut self, c: char) -> TrieResult {
199 if (c as u32) <= 0xffff {
200 self.next16(c as u16)
201 } else {
202 match self.next16(u16_lead(c as i32)) {
203 TrieResult::NoValue | TrieResult::Intermediate(_) => {
204 self.next16(u16_tail(c as i32))
205 }
206 _ => TrieResult::NoMatch,
207 }
208 }
209 }
210
211 /// Traverses the trie from the current state for this input char.
212 ///
213 /// # Examples
214 ///
215 /// ```
216 /// use icu::collections::char16trie::{Char16Trie, TrieResult};
217 /// use zerovec::ZeroVec;
218 ///
219 /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
220 /// let trie_data = [48, 97, 176, 98, 32868];
221 /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
222 ///
223 /// let mut iter = trie.iter();
224 /// let res = iter.next('a');
225 /// assert_eq!(res, TrieResult::Intermediate(1));
226 /// let res = iter.next('b');
227 /// assert_eq!(res, TrieResult::FinalValue(100));
228 /// let res = iter.next('c');
229 /// assert_eq!(res, TrieResult::NoMatch);
230 /// ```
231 pub fn next32(&mut self, c: u32) -> TrieResult {
232 if c <= 0xffff {
233 self.next16(c as u16)
234 } else {
235 match self.next16(u16_lead(c as i32)) {
236 TrieResult::NoValue | TrieResult::Intermediate(_) => {
237 self.next16(u16_tail(c as i32))
238 }
239 _ => TrieResult::NoMatch,
240 }
241 }
242 }
243
244 /// Traverses the trie from the current state for this input char.
245 ///
246 /// # Examples
247 ///
248 /// ```
249 /// use icu::collections::char16trie::{Char16Trie, TrieResult};
250 /// use zerovec::ZeroVec;
251 ///
252 /// // A Char16Trie containing the ASCII characters 'a' and 'b'.
253 /// let trie_data = [48, 97, 176, 98, 32868];
254 /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data));
255 ///
256 /// let mut iter = trie.iter();
257 /// let res = iter.next16('a' as u16);
258 /// assert_eq!(res, TrieResult::Intermediate(1));
259 /// let res = iter.next16('b' as u16);
260 /// assert_eq!(res, TrieResult::FinalValue(100));
261 /// let res = iter.next16('c' as u16);
262 /// assert_eq!(res, TrieResult::NoMatch);
263 /// ```
264 pub fn next16(&mut self, c: u16) -> TrieResult {
265 let mut pos = match self.pos {
266 Some(p) => p,
267 None => return TrieResult::NoMatch,
268 };
269 if let Some(length) = self.remaining_match_length {
270 // Remaining part of a linear-match node
271 if c == trie_unwrap!(self.trie.get(pos)) {
272 pos += 1;
273 self.pos = Some(pos);
274 if length == 0 {
275 self.remaining_match_length = None;
276 let node = trie_unwrap!(self.trie.get(pos));
277 if node >= MIN_VALUE_LEAD {
278 return self.value_result(pos);
279 }
280 } else {
281 self.remaining_match_length = Some(length - 1);
282 }
283 return TrieResult::NoValue;
284 }
285 self.stop();
286 TrieResult::NoMatch
287 } else {
288 self.next_impl(pos, c)
289 }
290 }
291
292 fn branch_next(&mut self, pos: usize, length: usize, in_unit: u16) -> TrieResult {
293 let mut pos = pos;
294 let mut length = length;
295 if length == 0 {
296 length = trie_unwrap!(self.trie.get(pos)) as usize;
297 pos += 1;
298 }
299 length += 1;
300
301 // The length of the branch is the number of units to select from.
302 // The data structure encodes a binary search.
303 while length > MAX_BRANCH_LINEAR_SUB_NODE_LENGTH {
304 if in_unit < trie_unwrap!(self.trie.get(pos)) {
305 length >>= 1;
306 pos = trie_unwrap!(self.jump_by_delta(pos + 1));
307 } else {
308 length = length - (length >> 1);
309 pos = trie_unwrap!(self.skip_delta(pos + 1));
310 }
311 }
312 // Drop down to linear search for the last few bytes.
313 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
314 // and divides length by 2.
315 loop {
316 if in_unit == trie_unwrap!(self.trie.get(pos)) {
317 pos += 1;
318 let mut node = trie_unwrap!(self.trie.get(pos));
319 if node & VALUE_IS_FINAL != 0 {
320 self.pos = Some(pos);
321 return self.value_result(pos);
322 }
323 // Use the non-final value as the jump delta.
324 pos += 1;
325
326 if node < MIN_TWO_UNIT_VALUE_LEAD {
327 pos += node as usize;
328 } else if node < THREE_UNIT_VALUE_LEAD {
329 pos += (((node - MIN_TWO_UNIT_VALUE_LEAD) as u32) << 16) as usize
330 | trie_unwrap!(self.trie.get(pos)) as usize;
331 pos += 1;
332 } else {
333 pos += ((trie_unwrap!(self.trie.get(pos)) as usize) << 16)
334 | trie_unwrap!(self.trie.get(pos + 1)) as usize;
335 pos += 2;
336 }
337 node = trie_unwrap!(self.trie.get(pos));
338 self.pos = Some(pos);
339
340 if node >= MIN_VALUE_LEAD {
341 return self.value_result(pos);
342 }
343 return TrieResult::NoValue;
344 }
345 length -= 1;
346 pos = trie_unwrap!(self.skip_value(pos + 1));
347 if length <= 1 {
348 break;
349 }
350 }
351
352 if in_unit == trie_unwrap!(self.trie.get(pos)) {
353 pos += 1;
354 self.pos = Some(pos);
355 let node = trie_unwrap!(self.trie.get(pos));
356 if node >= MIN_VALUE_LEAD {
357 return self.value_result(pos);
358 }
359 TrieResult::NoValue
360 } else {
361 self.stop();
362 TrieResult::NoMatch
363 }
364 }
365
366 fn next_impl(&mut self, pos: usize, in_unit: u16) -> TrieResult {
367 let mut node = trie_unwrap!(self.trie.get(pos));
368 let mut pos = pos + 1;
369 loop {
370 if node < MIN_LINEAR_MATCH {
371 return self.branch_next(pos, node as usize, in_unit);
372 } else if node < MIN_VALUE_LEAD {
373 // Match the first of length+1 units.
374 let length = node - MIN_LINEAR_MATCH;
375 if in_unit == trie_unwrap!(self.trie.get(pos)) {
376 pos += 1;
377 if length == 0 {
378 self.remaining_match_length = None;
379 self.pos = Some(pos);
380 node = trie_unwrap!(self.trie.get(pos));
381 if node >= MIN_VALUE_LEAD {
382 return self.value_result(pos);
383 }
384 return TrieResult::NoValue;
385 }
386 self.remaining_match_length = Some(length as usize - 1);
387 self.pos = Some(pos);
388 return TrieResult::NoValue;
389 }
390 // No match
391 break;
392 } else if (node & VALUE_IS_FINAL) != 0 {
393 // No further matching units.
394 break;
395 } else {
396 // Skip intermediate value.
397 pos = skip_node_value(pos, node);
398 node &= NODE_TYPE_MASK;
399 }
400 }
401 self.stop();
402 TrieResult::NoMatch
403 }
404
405 fn stop(&mut self) {
406 self.pos = None;
407 }
408
409 #[inline(always)] // 1 call site and we want the Option to go away
410 fn jump_by_delta(&self, pos: usize) -> Option<usize> {
411 let delta = self.trie.get(pos)?;
412 let v = if delta < MIN_TWO_UNIT_DELTA_LEAD {
413 // nothing to do
414 pos + 1 + delta as usize
415 } else if delta == THREE_UNIT_DELTA_LEAD {
416 let delta =
417 ((self.trie.get(pos + 1)? as usize) << 16) | (self.trie.get(pos + 2)? as usize);
418 pos + delta + 3
419 } else {
420 let delta = (((delta - MIN_TWO_UNIT_DELTA_LEAD) as usize) << 16)
421 | (self.trie.get(pos + 1)? as usize);
422 pos + delta + 2
423 };
424 Some(v)
425 }
426
427 #[inline(always)] // 1 call site and we want the Option to go away
428 fn skip_value(&self, pos: usize) -> Option<usize> {
429 let lead_unit = self.trie.get(pos)?;
430 Some(skip_value(pos + 1, lead_unit & 0x7fff))
431 }
432
433 #[inline(always)] // 1 call site and we want the Option to go away
434 fn skip_delta(&self, pos: usize) -> Option<usize> {
435 let delta = self.trie.get(pos)?;
436 let v = if delta < MIN_TWO_UNIT_DELTA_LEAD {
437 pos + 1
438 } else if delta == THREE_UNIT_DELTA_LEAD {
439 pos + 3
440 } else {
441 pos + 2
442 };
443 Some(v)
444 }
445
446 fn value_result(&self, pos: usize) -> TrieResult {
447 match self.get_value(pos) {
448 Some(result) => result,
449 None => {
450 // Unexpected
451 debug_assert!(false);
452 TrieResult::NoMatch
453 }
454 }
455 }
456
457 #[inline(always)] // 1 call site and we want the Option to go away
458 fn get_value(&self, pos: usize) -> Option<TrieResult> {
459 let lead_unit = self.trie.get(pos)?;
460 if lead_unit & VALUE_IS_FINAL == VALUE_IS_FINAL {
461 self.read_value(pos + 1, lead_unit & 0x7fff)
462 .map(TrieResult::FinalValue)
463 } else {
464 self.read_node_value(pos + 1, lead_unit)
465 .map(TrieResult::Intermediate)
466 }
467 }
468
469 #[inline(always)] // 1 call site and we want the Option to go away
470 fn read_value(&self, pos: usize, lead_unit: u16) -> Option<i32> {
471 let v = if lead_unit < MIN_TWO_UNIT_VALUE_LEAD {
472 lead_unit.into()
473 } else if lead_unit < THREE_UNIT_VALUE_LEAD {
474 (((lead_unit - MIN_TWO_UNIT_VALUE_LEAD) as i32) << 16) | self.trie.get(pos)? as i32
475 } else {
476 ((self.trie.get(pos)? as i32) << 16) | self.trie.get(pos + 1)? as i32
477 };
478 Some(v)
479 }
480
481 #[inline(always)] // 1 call site and we want the Option to go away
482 fn read_node_value(&self, pos: usize, lead_unit: u16) -> Option<i32> {
483 let v = if lead_unit < (MIN_TWO_UNIT_NODE_VALUE_LEAD) {
484 ((lead_unit >> 6) - 1).into()
485 } else if lead_unit < THREE_UNIT_NODE_VALUE_LEAD {
486 ((((lead_unit & 0x7fc0) - MIN_TWO_UNIT_NODE_VALUE_LEAD) as i32) << 10)
487 | self.trie.get(pos)? as i32
488 } else {
489 ((self.trie.get(pos)? as i32) << 16) | self.trie.get(pos + 1)? as i32
490 };
491 Some(v)
492 }
493}