icu_collections/codepointtrie/cptrie.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::codepointtrie::error::Error;
6use crate::codepointtrie::impl_const::*;
7
8#[cfg(feature = "alloc")]
9use crate::codepointinvlist::CodePointInversionList;
10use core::char::CharTryFromError;
11use core::convert::Infallible;
12use core::convert::TryFrom;
13use core::fmt::Display;
14#[cfg(feature = "alloc")]
15use core::iter::FromIterator;
16use core::num::TryFromIntError;
17use core::ops::RangeInclusive;
18use yoke::Yokeable;
19use zerofrom::ZeroFrom;
20use zerovec::ule::AsULE;
21#[cfg(feature = "alloc")]
22use zerovec::ule::UleError;
23use zerovec::ZeroSlice;
24use zerovec::ZeroVec;
25
26/// The type of trie represents whether the trie has an optimization that
27/// would make it smaller or faster.
28///
29/// Regarding performance, a trie being a small or fast type affects the number of array lookups
30/// needed for code points in the range `[0x1000, 0x10000)`. In this range, `Small` tries use 4 array lookups,
31/// while `Fast` tries use 2 array lookups.
32/// Code points before the interval (in `[0, 0x1000)`) will always use 2 array lookups.
33/// Code points after the interval (in `[0x10000, 0x10FFFF]`) will always use 4 array lookups.
34///
35/// Regarding size, `Fast` type tries are larger than `Small` type tries because the minimum size of
36/// the index array is larger. The minimum size is the "fast max" limit, which is the limit of the range
37/// of code points with 2 array lookups.
38///
39/// See the document [Unicode Properties and Code Point Tries in ICU4X](https://github.com/unicode-org/icu4x/blob/main/documents/design/properties_code_point_trie.md).
40///
41/// Also see [`UCPTrieType`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
42#[derive(Clone, Copy, PartialEq, Debug, Eq)]
43#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
44#[cfg_attr(feature = "databake", derive(databake::Bake))]
45#[cfg_attr(feature = "databake", databake(path = icu_collections::codepointtrie))]
46pub enum TrieType {
47 /// Represents the "fast" type code point tries for the
48 /// [`TrieType`] trait. The "fast max" limit is set to `0xffff`.
49 Fast = 0,
50 /// Represents the "small" type code point tries for the
51 /// [`TrieType`] trait. The "fast max" limit is set to `0x0fff`.
52 Small = 1,
53}
54
55// TrieValue trait
56
57// AsULE is AsUnalignedLittleEndian, i.e. "allowed in a zerovec"
58
59/// A trait representing the values stored in the data array of a [`CodePointTrie`].
60/// This trait is used as a type parameter in constructing a `CodePointTrie`.
61///
62/// This trait can be implemented on anything that can be represented as a u32s worth of data.
63pub trait TrieValue: Copy + Eq + PartialEq + zerovec::ule::AsULE + 'static {
64 /// Last-resort fallback value to return if we cannot read data from the trie.
65 ///
66 /// In most cases, the error value is read from the last element of the `data` array,
67 /// this value is used for empty codepointtrie arrays
68 /// Error type when converting from a u32 to this `TrieValue`.
69 type TryFromU32Error: Display;
70 /// A parsing function that is primarily motivated by deserialization contexts.
71 /// When the serialization type width is smaller than 32 bits, then it is expected
72 /// that the call site will widen the value to a `u32` first.
73 fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error>;
74
75 /// A method for converting back to a `u32` that can roundtrip through
76 /// [`Self::try_from_u32()`]. The default implementation of this trait
77 /// method panics in debug mode and returns 0 in release mode.
78 ///
79 /// This method is allowed to have GIGO behavior when fed a value that has
80 /// no corresponding `u32` (since such values cannot be stored in the trie)
81 fn to_u32(self) -> u32;
82}
83
84macro_rules! impl_primitive_trie_value {
85 ($primitive:ty, $error:ty) => {
86 impl TrieValue for $primitive {
87 type TryFromU32Error = $error;
88 fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
89 Self::try_from(i)
90 }
91
92 fn to_u32(self) -> u32 {
93 // bitcast when the same size, zero-extend/sign-extend
94 // when not the same size
95 self as u32
96 }
97 }
98 };
99}
100
101impl_primitive_trie_value!(u8, TryFromIntError);
102impl_primitive_trie_value!(u16, TryFromIntError);
103impl_primitive_trie_value!(u32, Infallible);
104impl_primitive_trie_value!(i8, TryFromIntError);
105impl_primitive_trie_value!(i16, TryFromIntError);
106impl_primitive_trie_value!(i32, TryFromIntError);
107impl_primitive_trie_value!(char, CharTryFromError);
108
109/// Helper function used by [`get_range`]. Converts occurrences of trie's null
110/// value into the provided `null_value`.
111///
112/// Note: the ICU version of this helper function uses a `ValueFilter` function
113/// to apply a transform on a non-null value. But currently, this implementation
114/// stops short of that functionality, and instead leaves the non-null trie value
115/// untouched. This is equivalent to having a `ValueFilter` function that is the
116/// identity function.
117fn maybe_filter_value<T: TrieValue>(value: T, trie_null_value: T, null_value: T) -> T {
118 if value == trie_null_value {
119 null_value
120 } else {
121 value
122 }
123}
124
125/// This struct represents a de-serialized [`CodePointTrie`] that was exported from
126/// ICU binary data.
127///
128/// For more information:
129/// - [ICU Site design doc](http://site.icu-project.org/design/struct/utrie)
130/// - [ICU User Guide section on Properties lookup](https://unicode-org.github.io/icu/userguide/strings/properties.html#lookup)
131// serde impls in crate::serde
132#[derive(Debug, Eq, PartialEq, Yokeable, ZeroFrom)]
133pub struct CodePointTrie<'trie, T: TrieValue> {
134 /// # Safety Invariant
135 ///
136 /// The value of `header.trie_type` must not change after construction.
137 pub(crate) header: CodePointTrieHeader,
138 /// # Safety Invariant
139 ///
140 /// If `header.trie_type == TrieType::Fast`, `index.len()` must be greater
141 /// than `FAST_TYPE_FAST_INDEXING_MAX`. Otherwise, `index.len()`
142 /// must be greater than `SMALL_TYPE_FAST_INDEXING_MAX`. Furthermore,
143 /// this field must not change after construction. (Strictly: It must
144 /// not become shorter than the length requirement stated above and the
145 /// values within the prefix up to the length requirement must not change.)
146 pub(crate) index: ZeroVec<'trie, u16>,
147 /// # Safety Invariant
148 ///
149 /// If `header.trie_type == TrieType::Fast`, `data.len()` must be greater
150 /// than `FAST_TYPE_DATA_MASK` plus the largest value in
151 /// `index[0..FAST_TYPE_FAST_INDEXING_MAX + 1]`. Otherwise, `data.len()`
152 /// must be greater than `FAST_TYPE_DATA_MASK` plus the largest value in
153 /// `index[0..SMALL_TYPE_FAST_INDEXING_MAX + 1]`. Furthermore, this field
154 /// must not change after construction. (Strictly: The stated length
155 /// requirement must continue to hold.)
156 pub(crate) data: ZeroVec<'trie, T>,
157 // serde impl skips this field
158 #[zerofrom(clone)] // TrieValue is Copy, this allows us to avoid
159 // a T: ZeroFrom bound
160 pub(crate) error_value: T,
161}
162
163/// This struct contains the fixed-length header fields of a [`CodePointTrie`].
164///
165/// # Safety Invariant
166///
167/// The `trie_type` field must not change after construction.
168///
169/// (In practice, all the fields here remain unchanged during the lifetime
170/// of the trie.)
171#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
172#[cfg_attr(feature = "databake", derive(databake::Bake))]
173#[cfg_attr(feature = "databake", databake(path = icu_collections::codepointtrie))]
174#[derive(Copy, Clone, Debug, Eq, PartialEq, Yokeable, ZeroFrom)]
175pub struct CodePointTrieHeader {
176 /// The code point of the start of the last range of the trie. A
177 /// range is defined as a partition of the code point space such that the
178 /// value in this trie associated with all code points of the same range is
179 /// the same.
180 ///
181 /// For the property value data for many Unicode properties,
182 /// often times, `high_start` is `U+10000` or lower. In such cases, not
183 /// reserving space in the `index` array for duplicate values is a large
184 /// savings. The "highValue" associated with the `high_start` range is
185 /// stored at the second-to-last position of the `data` array.
186 /// (See `impl_const::HIGH_VALUE_NEG_DATA_OFFSET`.)
187 pub high_start: u32,
188 /// A version of the `high_start` value that is right-shifted 12 spaces,
189 /// but is rounded up to a multiple `0x1000` for easy testing from UTF-8
190 /// lead bytes.
191 pub shifted12_high_start: u16,
192 /// Offset for the null block in the "index-3" table of the `index` array.
193 /// Set to an impossibly high value (e.g., `0xffff`) if there is no
194 /// dedicated index-3 null block.
195 pub index3_null_offset: u16,
196 /// Internal data null block offset, not shifted.
197 /// Set to an impossibly high value (e.g., `0xfffff`) if there is no
198 /// dedicated data null block.
199 pub data_null_offset: u32,
200 /// The value stored in the trie that represents a null value being
201 /// associated to a code point.
202 pub null_value: u32,
203 /// The enum value representing the type of trie, where trie type is as it
204 /// is defined in ICU (ex: Fast, Small).
205 ///
206 /// # Safety Invariant
207 ///
208 /// Must not change after construction.
209 pub trie_type: TrieType,
210}
211
212impl TryFrom<u8> for TrieType {
213 type Error = crate::codepointtrie::error::Error;
214
215 fn try_from(trie_type_int: u8) -> Result<TrieType, crate::codepointtrie::error::Error> {
216 match trie_type_int {
217 0 => Ok(TrieType::Fast),
218 1 => Ok(TrieType::Small),
219 _ => Err(crate::codepointtrie::error::Error::FromDeserialized {
220 reason: "Cannot parse value for trie_type",
221 }),
222 }
223 }
224}
225
226// Helper macro that turns arithmetic into wrapping-in-release, checked-in-debug arithmetic
227//
228// This is rustc's default behavior anyway, however some projects like Android deliberately
229// enable overflow checks. CodePointTrie::get() is intended to be used in Android bionic which
230// cares about codesize and we don't want the pile of panicking infrastructure brought in by overflow
231// checks, so we force wrapping in release.
232// See #6052
233macro_rules! w(
234 // Note: these matchers are not perfect since you cannot have an operator after an expr matcher
235 // Use variables if you need complex first operands.
236 ($a:tt + $b:expr) => {
237 {
238 #[allow(unused_parens)]
239 let a = $a;
240 let b = $b;
241 debug_assert!(a.checked_add(b).is_some());
242 $a.wrapping_add($b)
243 }
244 };
245 ($a:tt - $b:expr) => {
246
247 {
248 #[allow(unused_parens)]
249 let a = $a;
250 let b = $b;
251 debug_assert!(a.checked_sub(b).is_some());
252 $a.wrapping_sub($b)
253 }
254 };
255 ($a:tt * $b:expr) => {
256 {
257 #[allow(unused_parens)]
258 let a = $a;
259 let b = $b;
260 debug_assert!(a.checked_mul(b).is_some());
261 $a.wrapping_mul($b)
262 }
263 };
264);
265
266impl<'trie, T: TrieValue> CodePointTrie<'trie, T> {
267 #[doc(hidden)] // databake internal
268 /// # Safety
269 ///
270 /// `header.trie_type`, `index`, and `data` must
271 /// satisfy the invariants for the fields of the
272 /// same names on `CodePointTrie`.
273 pub const unsafe fn from_parts_unstable_unchecked_v1(
274 header: CodePointTrieHeader,
275 index: ZeroVec<'trie, u16>,
276 data: ZeroVec<'trie, T>,
277 error_value: T,
278 ) -> Self {
279 // Field invariants upheld: The caller is responsible.
280 // In practice, this means that datagen in the databake
281 // mode upholds these invariants when constructing the
282 // `CodePointTrie` that is then baked.
283 Self {
284 header,
285 index,
286 data,
287 error_value,
288 }
289 }
290
291 /// Returns a new [`CodePointTrie`] backed by borrowed data for the `index`
292 /// array and `data` array, whose data values have width `W`.
293 pub fn try_new(
294 header: CodePointTrieHeader,
295 index: ZeroVec<'trie, u16>,
296 data: ZeroVec<'trie, T>,
297 ) -> Result<CodePointTrie<'trie, T>, Error> {
298 let error_value = Self::validate_fields(&header, &index, &data)?;
299 // Field invariants upheld: Checked by `validate_fields` above.
300 let trie: CodePointTrie<'trie, T> = CodePointTrie {
301 header,
302 index,
303 data,
304 error_value,
305 };
306 Ok(trie)
307 }
308
309 /// Checks the invariant on the fields that fast-path access relies on for
310 /// safety in order to omit slice bound checks and upon success returns the
311 /// `error_value` for the trie.
312 ///
313 /// # Safety Usable Invariant
314 ///
315 /// Iff this function returns `Ok(T)`, the arguments satisfy the invariants
316 /// for corresponding fields of `CodePointTrie`. (Other than proving that
317 /// nothing else changes the fields subsequently.)
318 pub(crate) fn validate_fields(
319 header: &CodePointTrieHeader,
320 index: &ZeroSlice<u16>,
321 data: &ZeroSlice<T>,
322 ) -> Result<T, Error> {
323 let error_value = data.last().ok_or(Error::EmptyDataVector)?;
324
325 // `CodePointTrie` lookup has two stages: fast and small (the trie types
326 // are also fast and small; they affect where the boundary between fast
327 // and small lookups is).
328 //
329 // The length requirements for `index` and `data` are checked here only
330 // for the fast lookup case so that the fast lookup can omit bound checks
331 // at the time of access. In the small lookup case, bounds are checked at
332 // the time of access.
333 //
334 // The fast lookup happens on the prefixes of `index` and `data` with
335 // more items for the small lookup case afterwards. The check here
336 // only looks at the prefixes relevant to the fast case.
337 //
338 // In the fast lookup case, the bits of the of the code point are
339 // partitioned into a bit prefix and a bit suffix. First, a value
340 // is read from `index` by indexing into it using the bit prefix.
341 // Then `data` is accessed by the value just read from `index` plus
342 // the bit suffix.
343 //
344 // Therefore, the length of `index` needs to accommodate access
345 // by the maximum possible bit prefix, and the length of `data`
346 // needs to accommodate access by the largest value stored in the part
347 // of `data` reachable by the bit prefix plus the maximum possible bit
348 // suffix.
349 //
350 // The maximum possible bit prefix depends on the trie type.
351
352 // The maximum code point that can be used for fast-path access:
353 let fast_max = match header.trie_type {
354 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
355 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
356 };
357 // Keep only the prefix bits:
358 let max_bit_prefix = fast_max >> FAST_TYPE_SHIFT;
359 // Attempt slice the part of `index` that the fast path can index into.
360 // Since `max_bit_prefix` is the largest possible value used for
361 // indexing (inclusive bound), we need to add one to get the exclusive
362 // bound, which is what `get_subslice` wants.
363 let fast_index = index
364 .get_subslice(0..(max_bit_prefix as usize) + 1)
365 .ok_or(Error::IndexTooShortForFastAccess)?;
366 // Invariant upheld for `index`: If we got this far, the length of `index`
367 // satisfies its length invariant on the assumption that `header.trie_type`
368 // will not change subsequently.
369
370 // Now find the largest offset in the part of `index` reachable by the
371 // bit prefix. `max` can never actually return `None`, since we already
372 // know the slice isn't empty. Hence, reusing the error kind instead of
373 // minting a new one for this check.
374 let max_offset = fast_index
375 .iter()
376 .max()
377 .ok_or(Error::IndexTooShortForFastAccess)?;
378 // `FAST_TYPE_DATA_MASK` is the maximum possible bit suffix, since the
379 // maximum is when all the bits in the suffix are set, and the mask
380 // has that many bits set.
381 if (max_offset) as usize + (FAST_TYPE_DATA_MASK as usize) >= data.len() {
382 return Err(Error::DataTooShortForFastAccess);
383 }
384
385 // Invariant upheld for `data`: If we got this far, the length of `data`
386 // satisfies `data`'s length invariant on the assumption that the contents
387 // of `fast_index` subslice of `index` and `header.trie_type` will not
388 // change subsequently.
389
390 Ok(error_value)
391 }
392
393 /// Turns this trie into a version whose trie type is encoded in the Rust type.
394 #[inline]
395 pub fn to_typed(self) -> Typed<FastCodePointTrie<'trie, T>, SmallCodePointTrie<'trie, T>> {
396 match self.header.trie_type {
397 TrieType::Fast => Typed::Fast(FastCodePointTrie { inner: self }),
398 TrieType::Small => Typed::Small(SmallCodePointTrie { inner: self }),
399 }
400 }
401
402 /// Obtains a reference to this trie as a Rust type that encodes the trie type in the Rust type.
403 #[inline]
404 pub fn as_typed_ref(
405 &self,
406 ) -> Typed<&FastCodePointTrie<'trie, T>, &SmallCodePointTrie<'trie, T>> {
407 // SAFETY: `FastCodePointTrie` and `SmallCodePointTrie` are `repr(transparent)`
408 // with `CodePointTrie`, so transmuting between the references is OK when the
409 // actual trie type agrees with the semantics of the typed wrapper.
410 match self.header.trie_type {
411 TrieType::Fast => Typed::Fast(unsafe {
412 core::mem::transmute::<&CodePointTrie<'trie, T>, &FastCodePointTrie<'trie, T>>(self)
413 }),
414 TrieType::Small => Typed::Small(unsafe {
415 core::mem::transmute::<&CodePointTrie<'trie, T>, &SmallCodePointTrie<'trie, T>>(
416 self,
417 )
418 }),
419 }
420 }
421
422 /// Returns the position in the data array containing the trie's stored
423 /// error value.
424 #[inline(always)] // `always` was based on previous normalizer benchmarking
425 fn trie_error_val_index(&self) -> u32 {
426 // We use wrapping_sub here to avoid panicky overflow checks.
427 // len should always be > 1, but if it isn't this will just cause GIGO behavior of producing
428 // None on `.get()`
429 debug_assert!(self.data.len() as u32 >= ERROR_VALUE_NEG_DATA_OFFSET);
430 w!((self.data.len() as u32) - ERROR_VALUE_NEG_DATA_OFFSET)
431 }
432
433 fn internal_small_index(&self, code_point: u32) -> u32 {
434 // We use wrapping arithmetic here to avoid overflow checks making their way into binaries
435 // with overflow checks enabled. Ultimately this code ends up as a checked index, so any
436 // bugs here will cause GIGO
437 let mut index1_pos: u32 = code_point >> SHIFT_1;
438 if self.header.trie_type == TrieType::Fast {
439 debug_assert!(
440 FAST_TYPE_FAST_INDEXING_MAX < code_point && code_point < self.header.high_start
441 );
442 index1_pos = w!(index1_pos + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH);
443 } else {
444 assert!(code_point < self.header.high_start && self.header.high_start > SMALL_LIMIT);
445 index1_pos = w!(index1_pos + SMALL_INDEX_LENGTH);
446 }
447 let index1_val = if let Some(index1_val) = self.index.get(index1_pos as usize) {
448 index1_val
449 } else {
450 return self.trie_error_val_index();
451 };
452 let index3_block_idx: u32 =
453 w!((index1_val as u32) + (code_point >> SHIFT_2) & INDEX_2_MASK);
454 let mut index3_block: u32 =
455 if let Some(index3_block) = self.index.get(index3_block_idx as usize) {
456 index3_block as u32
457 } else {
458 return self.trie_error_val_index();
459 };
460 let mut index3_pos: u32 = (code_point >> SHIFT_3) & INDEX_3_MASK;
461 let mut data_block: u32;
462 if index3_block & 0x8000 == 0 {
463 // 16-bit indexes
464 data_block =
465 if let Some(data_block) = self.index.get(w!(index3_block + index3_pos) as usize) {
466 data_block as u32
467 } else {
468 return self.trie_error_val_index();
469 };
470 } else {
471 // 18-bit indexes stored in groups of 9 entries per 8 indexes.
472 index3_block = w!((index3_block & 0x7fff) + w!((index3_pos & !7) + index3_pos >> 3));
473 index3_pos &= 7;
474 data_block = if let Some(data_block) = self.index.get(index3_block as usize) {
475 data_block as u32
476 } else {
477 return self.trie_error_val_index();
478 };
479 data_block = (data_block << w!(2u32 + w!(2u32 * index3_pos))) & 0x30000;
480 index3_block += 1;
481 data_block =
482 if let Some(index3_val) = self.index.get(w!(index3_block + index3_pos) as usize) {
483 data_block | (index3_val as u32)
484 } else {
485 return self.trie_error_val_index();
486 };
487 }
488 // Returns data_pos == data_block (offset) +
489 // portion of code_point bit field for last (4th) lookup
490 w!(data_block + code_point & SMALL_DATA_MASK)
491 }
492
493 /// Returns the position in the `data` array for the given code point,
494 /// where this code point is at or above the fast limit associated for the
495 /// `trie_type`. We will refer to that limit as "`fastMax`" here.
496 ///
497 /// A lookup of the value in the code point trie for a code point in the
498 /// code point space range [`fastMax`, `high_start`) will be a 4-step
499 /// lookup: 3 lookups in the `index` array and one lookup in the `data`
500 /// array. Lookups for code points in the range [`high_start`,
501 /// `CODE_POINT_MAX`] are short-circuited to be a single lookup, see
502 /// [`CodePointTrieHeader::high_start`].
503 fn small_index(&self, code_point: u32) -> u32 {
504 if code_point >= self.header.high_start {
505 w!((self.data.len() as u32) - HIGH_VALUE_NEG_DATA_OFFSET)
506 } else {
507 self.internal_small_index(code_point) // helper fn
508 }
509 }
510
511 /// Returns the position in the `data` array for the given code point,
512 /// where this code point is below the fast limit associated for the
513 /// `trie type`. We will refer to that limit as "`fastMax`" here.
514 ///
515 /// A lookup of the value in the code point trie for a code point in the
516 /// code point space range [0, `fastMax`) will be a 2-step lookup: 1
517 /// lookup in the `index` array and one lookup in the `data` array. By
518 /// design, for trie type `T`, there is an element allocated in the `index`
519 /// array for each block of code points in [0, `fastMax`), which in
520 /// turn guarantees that those code points are represented and only need 1
521 /// lookup.
522 fn fast_index(&self, code_point: u32) -> u32 {
523 let index_array_pos: u32 = code_point >> FAST_TYPE_SHIFT;
524 let index_array_val: u16 =
525 if let Some(index_array_val) = self.index.get(index_array_pos as usize) {
526 index_array_val
527 } else {
528 return self.trie_error_val_index();
529 };
530 let masked_cp = code_point & FAST_TYPE_DATA_MASK;
531 let index_array_val = index_array_val as u32;
532 let fast_index_val: u32 = w!(index_array_val + masked_cp);
533 fast_index_val
534 }
535
536 /// Returns the value that is associated with `code_point` in this [`CodePointTrie`]
537 /// if `code_point` uses fast-path lookup or `None` if `code_point`
538 /// should use small-path lookup or is above the supported range.
539 #[inline(always)] // "always" to make the `Option` collapse away
540 fn get32_by_fast_index(&self, code_point: u32) -> Option<T> {
541 let fast_max = match self.header.trie_type {
542 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
543 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
544 };
545 if code_point <= fast_max {
546 // SAFETY: We just checked the invariant of
547 // `get32_assuming_fast_index`,
548 // which is
549 // "If `self.header.trie_type == TrieType::Small`, `code_point` must be at most
550 // `SMALL_TYPE_FAST_INDEXING_MAX`. If `self.header.trie_type ==
551 // TrieType::Fast`, `code_point` must be at most `FAST_TYPE_FAST_INDEXING_MAX`."
552 Some(unsafe { self.get32_assuming_fast_index(code_point) })
553 } else {
554 // The caller needs to call `get32_by_small_index` or determine
555 // that the argument is above the permitted range.
556 None
557 }
558 }
559
560 /// Performs the actual fast-mode lookup
561 ///
562 /// # Safety
563 ///
564 /// If `self.header.trie_type == TrieType::Small`, `code_point` must be at most
565 /// `SMALL_TYPE_FAST_INDEXING_MAX`. If `self.header.trie_type ==
566 /// TrieType::Fast`, `code_point` must be at most `FAST_TYPE_FAST_INDEXING_MAX`.
567 #[inline(always)]
568 unsafe fn get32_assuming_fast_index(&self, code_point: u32) -> T {
569 // Check the safety invariant of this method.
570 debug_assert!(
571 code_point
572 <= match self.header.trie_type {
573 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
574 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
575 }
576 );
577
578 let bit_prefix = (code_point as usize) >> FAST_TYPE_SHIFT;
579 debug_assert!(bit_prefix < self.index.len());
580 // SAFETY: Relying on the length invariant of `self.index` having
581 // been checked and on the unchangedness invariant of `self.index`
582 // and `self.header.trie_type` after construction.
583 let base_offset_to_data: usize = usize::from(u16::from_unaligned(*unsafe {
584 self.index.as_ule_slice().get_unchecked(bit_prefix)
585 }));
586 let bit_suffix = (code_point & FAST_TYPE_DATA_MASK) as usize;
587 // SAFETY: Cannot overflow with supported (32-bit and 64-bit) `usize`
588 // sizes, since `base_offset_to_data` was extended from `u16` and
589 // `bit_suffix` is at most `FAST_TYPE_DATA_MASK`, which is well
590 // under what it takes to reach the 32-bit (or 64-bit) max with
591 // additon from the max of `u16`.
592 let offset_to_data = w!(base_offset_to_data + bit_suffix);
593 debug_assert!(offset_to_data < self.data.len());
594 // SAFETY: Relying on the length invariant of `self.data` having
595 // been checked and on the unchangedness invariant of `self.data`,
596 // `self.index`, and `self.header.trie_type` after construction.
597 T::from_unaligned(*unsafe { self.data.as_ule_slice().get_unchecked(offset_to_data) })
598 }
599
600 /// Coldness wrapper for `get32_by_small_index` to also allow
601 /// calls without the effects of `#[cold]`.
602 #[cold]
603 #[inline(always)]
604 fn get32_by_small_index_cold(&self, code_point: u32) -> T {
605 self.get32_by_small_index(code_point)
606 }
607
608 /// Returns the value that is associated with `code_point` in this [`CodePointTrie`]
609 /// assuming that the small index path should be used.
610 ///
611 /// # Intended Precondition
612 ///
613 /// `code_point` must be at most `CODE_POINT_MAX` AND greter than
614 /// `FAST_TYPE_FAST_INDEXING_MAX` if the trie type is fast or greater
615 /// than `SMALL_TYPE_FAST_INDEXING_MAX` if the trie type is small.
616 /// This is checked when debug assertions are enabled. If this
617 /// precondition is violated, the behavior of this method is
618 /// memory-safe, but the returned value may be bogus (not
619 /// necessarily the designated error value).
620 #[inline(never)]
621 fn get32_by_small_index(&self, code_point: u32) -> T {
622 debug_assert!(code_point <= CODE_POINT_MAX);
623 debug_assert!(
624 code_point
625 > match self.header.trie_type {
626 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
627 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
628 }
629 );
630 self.data
631 .get(self.small_index(code_point) as usize)
632 .unwrap_or(self.error_value)
633 }
634
635 /// Returns the value that is associated with `code_point` in this [`CodePointTrie`].
636 ///
637 /// # Examples
638 ///
639 /// ```
640 /// use icu::collections::codepointtrie::planes;
641 /// let trie = planes::get_planes_trie();
642 ///
643 /// assert_eq!(0, trie.get32(0x41)); // 'A' as u32
644 /// assert_eq!(0, trie.get32(0x13E0)); // 'Ꮰ' as u32
645 /// assert_eq!(1, trie.get32(0x10044)); // '𐁄' as u32
646 /// ```
647 #[inline(always)] // `always` based on normalizer benchmarking
648 pub fn get32(&self, code_point: u32) -> T {
649 if let Some(v) = self.get32_by_fast_index(code_point) {
650 v
651 } else if code_point <= CODE_POINT_MAX {
652 self.get32_by_small_index_cold(code_point)
653 } else {
654 self.error_value
655 }
656 }
657
658 /// Returns the value that is associated with `char` in this [`CodePointTrie`].
659 ///
660 /// # Examples
661 ///
662 /// ```
663 /// use icu::collections::codepointtrie::planes;
664 /// let trie = planes::get_planes_trie();
665 ///
666 /// assert_eq!(0, trie.get('A')); // 'A' as u32
667 /// assert_eq!(0, trie.get('Ꮰ')); // 'Ꮰ' as u32
668 /// assert_eq!(1, trie.get('𐁄')); // '𐁄' as u32
669 /// ```
670 #[inline(always)]
671 pub fn get(&self, c: char) -> T {
672 // LLVM's optimizations have been observed not to be 100%
673 // reliable around collapsing away unnecessary parts of
674 // `get32`, so not just calling `get32` here.
675 let code_point = u32::from(c);
676 if let Some(v) = self.get32_by_fast_index(code_point) {
677 v
678 } else {
679 self.get32_by_small_index_cold(code_point)
680 }
681 }
682
683 /// Returns the value that is associated with `bmp` in this [`CodePointTrie`].
684 #[inline(always)]
685 pub fn get16(&self, bmp: u16) -> T {
686 // LLVM's optimizations have been observed not to be 100%
687 // reliable around collapsing away unnecessary parts of
688 // `get32`, so not just calling `get32` here.
689 let code_point = u32::from(bmp);
690 if let Some(v) = self.get32_by_fast_index(code_point) {
691 v
692 } else {
693 self.get32_by_small_index_cold(code_point)
694 }
695 }
696
697 /// Lookup trie value by non-Basic Multilingual Plane Scalar Value.
698 ///
699 /// The return value may be bogus (not necessarily `error_value`) is the argument is actually in
700 /// the Basic Multilingual Plane or above the Unicode Scalar Value
701 /// range (panics instead with debug assertions enabled).
702 #[inline(always)]
703 pub fn get32_supplementary(&self, supplementary: u32) -> T {
704 debug_assert!(supplementary > 0xFFFF);
705 debug_assert!(supplementary <= CODE_POINT_MAX);
706 self.get32_by_small_index(supplementary)
707 }
708
709 /// Returns a reference to the ULE of the value that is associated with `code_point` in this [`CodePointTrie`].
710 ///
711 /// # Examples
712 ///
713 /// ```
714 /// use icu::collections::codepointtrie::planes;
715 /// let trie = planes::get_planes_trie();
716 ///
717 /// assert_eq!(Some(&0), trie.get32_ule(0x41)); // 'A' as u32
718 /// assert_eq!(Some(&0), trie.get32_ule(0x13E0)); // 'Ꮰ' as u32
719 /// assert_eq!(Some(&1), trie.get32_ule(0x10044)); // '𐁄' as u32
720 /// ```
721 #[inline(always)] // `always` was based on previous normalizer benchmarking
722 pub fn get32_ule(&self, code_point: u32) -> Option<&T::ULE> {
723 // All code points up to the fast max limit are represented
724 // individually in the `index` array to hold their `data` array position, and
725 // thus only need 2 lookups for a [CodePointTrie::get()](`crate::codepointtrie::CodePointTrie::get`).
726 // Code points above the "fast max" limit require 4 lookups.
727 let fast_max = match self.header.trie_type {
728 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
729 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
730 };
731 let data_pos: u32 = if code_point <= fast_max {
732 Self::fast_index(self, code_point)
733 } else if code_point <= CODE_POINT_MAX {
734 Self::small_index(self, code_point)
735 } else {
736 self.trie_error_val_index()
737 };
738 // Returns the trie value (or trie's error value).
739 self.data.as_ule_slice().get(data_pos as usize)
740 }
741
742 /// Converts the [`CodePointTrie`] into one that returns another type of the same size.
743 ///
744 /// Borrowed data remains borrowed, and owned data remains owned.
745 ///
746 /// If the old and new types are not the same size, use
747 /// [`CodePointTrie::try_alloc_map_value`].
748 ///
749 /// # Panics
750 ///
751 /// Panics if `T` and `P` are different sizes.
752 ///
753 /// More specifically, panics if [`ZeroVec::try_into_converted()`] panics when converting
754 /// `ZeroVec<T>` into `ZeroVec<P>`, which happens if `T::ULE` and `P::ULE` differ in size.
755 ///
756 /// ✨ *Enabled with the `alloc` Cargo feature.*
757 ///
758 /// # Examples
759 ///
760 /// ```no_run
761 /// use icu::collections::codepointtrie::planes;
762 /// use icu::collections::codepointtrie::CodePointTrie;
763 ///
764 /// let planes_trie_u8: CodePointTrie<u8> = planes::get_planes_trie();
765 /// let planes_trie_i8: CodePointTrie<i8> =
766 /// planes_trie_u8.try_into_converted().expect("infallible");
767 ///
768 /// assert_eq!(planes_trie_i8.get32(0x30000), 3);
769 /// ```
770 #[cfg(feature = "alloc")]
771 pub fn try_into_converted<P>(self) -> Result<CodePointTrie<'trie, P>, UleError>
772 where
773 P: TrieValue,
774 {
775 let converted_data = self.data.try_into_converted()?;
776 let error_ule = self.error_value.to_unaligned();
777 let slice = &[error_ule];
778 let error_vec = ZeroVec::<T>::new_borrowed(slice);
779 let error_converted = error_vec.try_into_converted::<P>()?;
780 #[expect(clippy::expect_used)] // we know this cannot fail
781 Ok(CodePointTrie {
782 header: self.header,
783 index: self.index,
784 data: converted_data,
785 error_value: error_converted
786 .get(0)
787 .expect("vector known to have one element"),
788 })
789 }
790
791 /// Maps the [`CodePointTrie`] into one that returns a different type.
792 ///
793 /// This function returns owned data.
794 ///
795 /// If the old and new types are the same size, use the more efficient
796 /// [`CodePointTrie::try_into_converted`].
797 ///
798 /// ✨ *Enabled with the `alloc` Cargo feature.*
799 ///
800 /// # Examples
801 ///
802 /// ```
803 /// use icu::collections::codepointtrie::planes;
804 /// use icu::collections::codepointtrie::CodePointTrie;
805 ///
806 /// let planes_trie_u8: CodePointTrie<u8> = planes::get_planes_trie();
807 /// let planes_trie_u16: CodePointTrie<u16> = planes_trie_u8
808 /// .try_alloc_map_value(TryFrom::try_from)
809 /// .expect("infallible");
810 ///
811 /// assert_eq!(planes_trie_u16.get32(0x30000), 3);
812 /// ```
813 #[cfg(feature = "alloc")]
814 pub fn try_alloc_map_value<P, E>(
815 &self,
816 mut f: impl FnMut(T) -> Result<P, E>,
817 ) -> Result<CodePointTrie<'trie, P>, E>
818 where
819 P: TrieValue,
820 {
821 let error_converted = f(self.error_value)?;
822 let converted_data = self.data.iter().map(f).collect::<Result<ZeroVec<P>, E>>()?;
823 Ok(CodePointTrie {
824 header: self.header,
825 index: self.index.clone(),
826 data: converted_data,
827 error_value: error_converted,
828 })
829 }
830
831 /// Returns a [`CodePointMapRange`] struct which represents a range of code
832 /// points associated with the same trie value. The returned range will be
833 /// the longest stretch of consecutive code points starting at `start` that
834 /// share this value.
835 ///
836 /// This method is designed to use the internal details of
837 /// the structure of [`CodePointTrie`] to be optimally efficient. This will
838 /// outperform a naive approach that just uses [`CodePointTrie::get()`].
839 ///
840 /// This method provides lower-level functionality that can be used in the
841 /// implementation of other methods that are more convenient to the user.
842 /// To obtain an optimal partition of the code point space for
843 /// this trie resulting in the fewest number of ranges, see
844 /// [`CodePointTrie::iter_ranges()`].
845 ///
846 /// # Examples
847 ///
848 /// ```
849 /// use icu::collections::codepointtrie::planes;
850 ///
851 /// let trie = planes::get_planes_trie();
852 ///
853 /// const CODE_POINT_MAX: u32 = 0x10ffff;
854 /// let start = 0x1_0000;
855 /// let exp_end = 0x1_ffff;
856 ///
857 /// let start_val = trie.get32(start);
858 /// assert_eq!(trie.get32(exp_end), start_val);
859 /// assert_ne!(trie.get32(exp_end + 1), start_val);
860 ///
861 /// use icu::collections::codepointtrie::CodePointMapRange;
862 ///
863 /// let cpm_range: CodePointMapRange<u8> = trie.get_range(start).unwrap();
864 ///
865 /// assert_eq!(cpm_range.range.start(), &start);
866 /// assert_eq!(cpm_range.range.end(), &exp_end);
867 /// assert_eq!(cpm_range.value, start_val);
868 ///
869 /// // `start` can be any code point, whether or not it lies on the boundary
870 /// // of a maximally large range that still contains `start`
871 ///
872 /// let submaximal_1_start = start + 0x1234;
873 /// let submaximal_1 = trie.get_range(submaximal_1_start).unwrap();
874 /// assert_eq!(submaximal_1.range.start(), &0x1_1234);
875 /// assert_eq!(submaximal_1.range.end(), &0x1_ffff);
876 /// assert_eq!(submaximal_1.value, start_val);
877 ///
878 /// let submaximal_2_start = start + 0xffff;
879 /// let submaximal_2 = trie.get_range(submaximal_2_start).unwrap();
880 /// assert_eq!(submaximal_2.range.start(), &0x1_ffff);
881 /// assert_eq!(submaximal_2.range.end(), &0x1_ffff);
882 /// assert_eq!(submaximal_2.value, start_val);
883 /// ```
884 pub fn get_range(&self, start: u32) -> Option<CodePointMapRange<T>> {
885 // Exit early if the start code point is out of range, or if it is
886 // in the last range of code points in high_start..=CODE_POINT_MAX
887 // (start- and end-inclusive) that all share the same trie value.
888 if CODE_POINT_MAX < start {
889 return None;
890 }
891 if start >= self.header.high_start {
892 let di: usize = self.data.len() - (HIGH_VALUE_NEG_DATA_OFFSET as usize);
893 let value: T = self.data.get(di)?;
894 return Some(CodePointMapRange {
895 range: start..=CODE_POINT_MAX,
896 value,
897 });
898 }
899
900 let null_value: T = T::try_from_u32(self.header.null_value).ok()?;
901
902 let mut prev_i3_block: u32 = u32::MAX; // using u32::MAX (instead of -1 as an i32 in ICU)
903 let mut prev_block: u32 = u32::MAX; // using u32::MAX (instead of -1 as an i32 in ICU)
904 let mut c: u32 = start;
905 let mut trie_value: T = self.error_value();
906 let mut value: T = self.error_value();
907 let mut have_value: bool = false;
908
909 loop {
910 let i3_block: u32;
911 let mut i3: u32;
912 let i3_block_length: u32;
913 let data_block_length: u32;
914
915 // Initialize values before beginning the iteration in the subsequent
916 // `loop` block. In particular, use the "i3*" local variables
917 // (representing the `index` array position's offset + increment
918 // for a 3rd-level trie lookup) to help initialize the data block
919 // variable `block` in the loop for the `data` array.
920 //
921 // When a lookup code point is <= the trie's *_FAST_INDEXING_MAX that
922 // corresponds to its `trie_type`, the lookup only takes 2 steps
923 // (once into the `index`, once into the `data` array); otherwise,
924 // takes 4 steps (3 iterative lookups into the `index`, once more
925 // into the `data` array). So for convenience's sake, when we have the
926 // 2-stage lookup, reuse the "i3*" variable names for the first lookup.
927 if c <= 0xffff
928 && (self.header.trie_type == TrieType::Fast || c <= SMALL_TYPE_FAST_INDEXING_MAX)
929 {
930 i3_block = 0;
931 i3 = c >> FAST_TYPE_SHIFT;
932 i3_block_length = if self.header.trie_type == TrieType::Fast {
933 BMP_INDEX_LENGTH
934 } else {
935 SMALL_INDEX_LENGTH
936 };
937 data_block_length = FAST_TYPE_DATA_BLOCK_LENGTH;
938 } else {
939 // Use the multi-stage index.
940 let mut i1: u32 = c >> SHIFT_1;
941 if self.header.trie_type == TrieType::Fast {
942 debug_assert!(0xffff < c && c < self.header.high_start);
943 i1 = i1 + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH;
944 } else {
945 debug_assert!(
946 c < self.header.high_start && self.header.high_start > SMALL_LIMIT
947 );
948 i1 += SMALL_INDEX_LENGTH;
949 }
950 let i2: u16 = self.index.get(i1 as usize)?;
951 let i3_block_idx: u32 = (i2 as u32) + ((c >> SHIFT_2) & INDEX_2_MASK);
952 i3_block = if let Some(i3b) = self.index.get(i3_block_idx as usize) {
953 i3b as u32
954 } else {
955 return None;
956 };
957 if i3_block == prev_i3_block && (c - start) >= CP_PER_INDEX_2_ENTRY {
958 // The index-3 block is the same as the previous one, and filled with value.
959 debug_assert!((c & (CP_PER_INDEX_2_ENTRY - 1)) == 0);
960 c += CP_PER_INDEX_2_ENTRY;
961
962 if c >= self.header.high_start {
963 break;
964 } else {
965 continue;
966 }
967 }
968 prev_i3_block = i3_block;
969 if i3_block == self.header.index3_null_offset as u32 {
970 // This is the index-3 null block.
971 // All of the `data` array blocks pointed to by the values
972 // in this block of the `index` 3rd-stage subarray will
973 // contain this trie's null_value. So if we are in the middle
974 // of a range, end it and return early, otherwise start a new
975 // range of null values.
976 if have_value {
977 if null_value != value {
978 return Some(CodePointMapRange {
979 range: start..=(c - 1),
980 value,
981 });
982 }
983 } else {
984 trie_value = T::try_from_u32(self.header.null_value).ok()?;
985 value = null_value;
986 have_value = true;
987 }
988 prev_block = self.header.data_null_offset;
989 c = (c + CP_PER_INDEX_2_ENTRY) & !(CP_PER_INDEX_2_ENTRY - 1);
990
991 if c >= self.header.high_start {
992 break;
993 } else {
994 continue;
995 }
996 }
997 i3 = (c >> SHIFT_3) & INDEX_3_MASK;
998 i3_block_length = INDEX_3_BLOCK_LENGTH;
999 data_block_length = SMALL_DATA_BLOCK_LENGTH;
1000 }
1001
1002 // Enumerate data blocks for one index-3 block.
1003 loop {
1004 let mut block: u32;
1005 if (i3_block & 0x8000) == 0 {
1006 block = if let Some(b) = self.index.get((i3_block + i3) as usize) {
1007 b as u32
1008 } else {
1009 return None;
1010 };
1011 } else {
1012 // 18-bit indexes stored in groups of 9 entries per 8 indexes.
1013 let mut group: u32 = (i3_block & 0x7fff) + (i3 & !7) + (i3 >> 3);
1014 let gi: u32 = i3 & 7;
1015 let gi_val: u32 = if let Some(giv) = self.index.get(group as usize) {
1016 giv.into()
1017 } else {
1018 return None;
1019 };
1020 block = (gi_val << (2 + (2 * gi))) & 0x30000;
1021 group += 1;
1022 let ggi_val: u32 = if let Some(ggiv) = self.index.get((group + gi) as usize) {
1023 ggiv as u32
1024 } else {
1025 return None;
1026 };
1027 block |= ggi_val;
1028 }
1029
1030 // If our previous and current return values of the 3rd-stage `index`
1031 // lookup yield the same `data` block offset, and if we already know that
1032 // the entire `data` block / subarray starting at that offset stores
1033 // `value` and nothing else, then we can extend our range by the length
1034 // of a data block and continue.
1035 // Otherwise, we have to iterate over the values stored in the
1036 // new data block to see if they differ from `value`.
1037 if block == prev_block && (c - start) >= data_block_length {
1038 // The block is the same as the previous one, and filled with value.
1039 debug_assert!((c & (data_block_length - 1)) == 0);
1040 c += data_block_length;
1041 } else {
1042 let data_mask: u32 = data_block_length - 1;
1043 prev_block = block;
1044 if block == self.header.data_null_offset {
1045 // This is the data null block.
1046 // If we are in the middle of a range, end it and
1047 // return early, otherwise start a new range of null
1048 // values.
1049 if have_value {
1050 if null_value != value {
1051 return Some(CodePointMapRange {
1052 range: start..=(c - 1),
1053 value,
1054 });
1055 }
1056 } else {
1057 trie_value = T::try_from_u32(self.header.null_value).ok()?;
1058 value = null_value;
1059 have_value = true;
1060 }
1061 c = (c + data_block_length) & !data_mask;
1062 } else {
1063 let mut di: u32 = block + (c & data_mask);
1064 let mut trie_value_2: T = self.data.get(di as usize)?;
1065 if have_value {
1066 if trie_value_2 != trie_value {
1067 if maybe_filter_value(
1068 trie_value_2,
1069 T::try_from_u32(self.header.null_value).ok()?,
1070 null_value,
1071 ) != value
1072 {
1073 return Some(CodePointMapRange {
1074 range: start..=(c - 1),
1075 value,
1076 });
1077 }
1078 // `trie_value` stores the previous value that was retrieved
1079 // from the trie.
1080 // `value` stores the value associated for the range (return
1081 // value) that we are currently building, which is computed
1082 // as a transformation by applying maybe_filter_value()
1083 // to the trie value.
1084 // The current trie value `trie_value_2` within this data block
1085 // differs here from the previous value in `trie_value`.
1086 // But both map to `value` after applying `maybe_filter_value`.
1087 // It is not clear whether the previous or the current trie value
1088 // (or neither) is more likely to match potential subsequent trie
1089 // values that would extend the range by mapping to `value`.
1090 // On the assumption of locality -- often times consecutive
1091 // characters map to the same trie values -- remembering the new
1092 // one might make it faster to extend this range further
1093 // (by increasing the chance that the next `trie_value_2 !=
1094 // trie_value` test will be false).
1095 trie_value = trie_value_2; // may or may not help
1096 }
1097 } else {
1098 trie_value = trie_value_2;
1099 value = maybe_filter_value(
1100 trie_value_2,
1101 T::try_from_u32(self.header.null_value).ok()?,
1102 null_value,
1103 );
1104 have_value = true;
1105 }
1106
1107 c += 1;
1108 while (c & data_mask) != 0 {
1109 di += 1;
1110 trie_value_2 = self.data.get(di as usize)?;
1111 if trie_value_2 != trie_value {
1112 if maybe_filter_value(
1113 trie_value_2,
1114 T::try_from_u32(self.header.null_value).ok()?,
1115 null_value,
1116 ) != value
1117 {
1118 return Some(CodePointMapRange {
1119 range: start..=(c - 1),
1120 value,
1121 });
1122 }
1123 // `trie_value` stores the previous value that was retrieved
1124 // from the trie.
1125 // `value` stores the value associated for the range (return
1126 // value) that we are currently building, which is computed
1127 // as a transformation by applying maybe_filter_value()
1128 // to the trie value.
1129 // The current trie value `trie_value_2` within this data block
1130 // differs here from the previous value in `trie_value`.
1131 // But both map to `value` after applying `maybe_filter_value`.
1132 // It is not clear whether the previous or the current trie value
1133 // (or neither) is more likely to match potential subsequent trie
1134 // values that would extend the range by mapping to `value`.
1135 // On the assumption of locality -- often times consecutive
1136 // characters map to the same trie values -- remembering the new
1137 // one might make it faster to extend this range further
1138 // (by increasing the chance that the next `trie_value_2 !=
1139 // trie_value` test will be false).
1140 trie_value = trie_value_2; // may or may not help
1141 }
1142
1143 c += 1;
1144 }
1145 }
1146 }
1147
1148 i3 += 1;
1149 if i3 >= i3_block_length {
1150 break;
1151 }
1152 }
1153
1154 if c >= self.header.high_start {
1155 break;
1156 }
1157 }
1158
1159 debug_assert!(have_value);
1160
1161 // Now that c >= high_start, compare `value` to `high_value` to see
1162 // if we can merge our current range with the high_value range
1163 // high_start..=CODE_POINT_MAX (start- and end-inclusive), otherwise
1164 // stop at high_start - 1.
1165 let di: u32 = self.data.len() as u32 - HIGH_VALUE_NEG_DATA_OFFSET;
1166 let high_value: T = self.data.get(di as usize)?;
1167 if maybe_filter_value(
1168 high_value,
1169 T::try_from_u32(self.header.null_value).ok()?,
1170 null_value,
1171 ) != value
1172 {
1173 c -= 1;
1174 } else {
1175 c = CODE_POINT_MAX;
1176 }
1177 Some(CodePointMapRange {
1178 range: start..=c,
1179 value,
1180 })
1181 }
1182
1183 /// Yields an [`Iterator`] returning ranges of consecutive code points that
1184 /// share the same value in the [`CodePointTrie`], as given by
1185 /// [`CodePointTrie::get_range()`].
1186 ///
1187 /// # Examples
1188 ///
1189 /// ```
1190 /// use core::ops::RangeInclusive;
1191 /// use icu::collections::codepointtrie::planes;
1192 /// use icu::collections::codepointtrie::CodePointMapRange;
1193 ///
1194 /// let planes_trie = planes::get_planes_trie();
1195 ///
1196 /// let mut ranges = planes_trie.iter_ranges();
1197 ///
1198 /// for plane in 0..=16 {
1199 /// let exp_start = plane * 0x1_0000;
1200 /// let exp_end = exp_start + 0xffff;
1201 /// assert_eq!(
1202 /// ranges.next(),
1203 /// Some(CodePointMapRange {
1204 /// range: exp_start..=exp_end,
1205 /// value: plane as u8
1206 /// })
1207 /// );
1208 /// }
1209 ///
1210 /// // Hitting the end of the iterator returns `None`, as will subsequent
1211 /// // calls to .next().
1212 /// assert_eq!(ranges.next(), None);
1213 /// assert_eq!(ranges.next(), None);
1214 /// ```
1215 pub fn iter_ranges(&self) -> CodePointMapRangeIterator<'_, T> {
1216 let init_range = Some(CodePointMapRange {
1217 range: u32::MAX..=u32::MAX,
1218 value: self.error_value(),
1219 });
1220 CodePointMapRangeIterator::<T> {
1221 cpt: self,
1222 cpm_range: init_range,
1223 }
1224 }
1225
1226 /// Yields an [`Iterator`] returning the ranges of the code points whose values
1227 /// match `value` in the [`CodePointTrie`].
1228 ///
1229 /// # Examples
1230 ///
1231 /// ```
1232 /// use icu::collections::codepointtrie::planes;
1233 ///
1234 /// let trie = planes::get_planes_trie();
1235 ///
1236 /// let plane_val = 2;
1237 /// let mut sip_range_iter = trie.iter_ranges_for_value(plane_val as u8);
1238 ///
1239 /// let start = plane_val * 0x1_0000;
1240 /// let end = start + 0xffff;
1241 ///
1242 /// let sip_range = sip_range_iter.next()
1243 /// .expect("Plane 2 (SIP) should exist in planes data");
1244 /// assert_eq!(start..=end, sip_range);
1245 ///
1246 /// assert!(sip_range_iter.next().is_none());
1247 pub fn iter_ranges_for_value(
1248 &self,
1249 value: T,
1250 ) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
1251 self.iter_ranges()
1252 .filter(move |cpm_range| cpm_range.value == value)
1253 .map(|cpm_range| cpm_range.range)
1254 }
1255
1256 /// Yields an [`Iterator`] returning the ranges of the code points after passing
1257 /// the value through a mapping function.
1258 ///
1259 /// This is preferable to calling `.get_ranges().map()` since it will coalesce
1260 /// adjacent ranges into one.
1261 ///
1262 /// # Examples
1263 ///
1264 /// ```
1265 /// use icu::collections::codepointtrie::planes;
1266 ///
1267 /// let trie = planes::get_planes_trie();
1268 ///
1269 /// let plane_val = 2;
1270 /// let mut sip_range_iter = trie.iter_ranges_mapped(|value| value != plane_val as u8).filter(|range| range.value);
1271 ///
1272 /// let end = plane_val * 0x1_0000 - 1;
1273 ///
1274 /// let sip_range = sip_range_iter.next()
1275 /// .expect("Complemented planes data should have at least one entry");
1276 /// assert_eq!(0..=end, sip_range.range);
1277 pub fn iter_ranges_mapped<'a, U: Eq + 'a>(
1278 &'a self,
1279 mut map: impl FnMut(T) -> U + Copy + 'a,
1280 ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
1281 crate::iterator_utils::RangeListIteratorCoalescer::new(self.iter_ranges().map(
1282 move |range| CodePointMapRange {
1283 range: range.range,
1284 value: map(range.value),
1285 },
1286 ))
1287 }
1288
1289 /// Returns a [`CodePointInversionList`] for the code points that have the given
1290 /// [`TrieValue`] in the trie.
1291 ///
1292 /// ✨ *Enabled with the `alloc` Cargo feature.*
1293 ///
1294 /// # Examples
1295 ///
1296 /// ```
1297 /// use icu::collections::codepointtrie::planes;
1298 ///
1299 /// let trie = planes::get_planes_trie();
1300 ///
1301 /// let plane_val = 2;
1302 /// let sip = trie.get_set_for_value(plane_val as u8);
1303 ///
1304 /// let start = plane_val * 0x1_0000;
1305 /// let end = start + 0xffff;
1306 ///
1307 /// assert!(!sip.contains32(start - 1));
1308 /// assert!(sip.contains32(start));
1309 /// assert!(sip.contains32(end));
1310 /// assert!(!sip.contains32(end + 1));
1311 /// ```
1312 #[cfg(feature = "alloc")]
1313 pub fn get_set_for_value(&self, value: T) -> CodePointInversionList<'static> {
1314 let value_ranges = self.iter_ranges_for_value(value);
1315 CodePointInversionList::from_iter(value_ranges)
1316 }
1317
1318 /// Returns the value used as an error value for this trie
1319 #[inline]
1320 pub fn error_value(&self) -> T {
1321 self.error_value
1322 }
1323}
1324
1325#[cfg(feature = "databake")]
1326impl<T: TrieValue + databake::Bake> databake::Bake for CodePointTrie<'_, T> {
1327 fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
1328 let header = self.header.bake(env);
1329 let index = self.index.bake(env);
1330 let data = self.data.bake(env);
1331 let error_value = self.error_value.bake(env);
1332 databake::quote! { unsafe { icu_collections::codepointtrie::CodePointTrie::from_parts_unstable_unchecked_v1(#header, #index, #data, #error_value) } }
1333 }
1334}
1335
1336#[cfg(feature = "databake")]
1337impl<T: TrieValue + databake::Bake> databake::BakeSize for CodePointTrie<'_, T> {
1338 fn borrows_size(&self) -> usize {
1339 self.header.borrows_size() + self.index.borrows_size() + self.data.borrows_size()
1340 }
1341}
1342
1343impl<T: TrieValue + Into<u32>> CodePointTrie<'_, T> {
1344 /// Returns the value that is associated with `code_point` for this [`CodePointTrie`]
1345 /// as a `u32`.
1346 ///
1347 /// # Examples
1348 ///
1349 /// ```
1350 /// use icu::collections::codepointtrie::planes;
1351 /// let trie = planes::get_planes_trie();
1352 ///
1353 /// let cp = '𑖎' as u32;
1354 /// assert_eq!(cp, 0x1158E);
1355 ///
1356 /// let plane_num: u8 = trie.get32(cp);
1357 /// assert_eq!(trie.get32_u32(cp), plane_num as u32);
1358 /// ```
1359 // Note: This API method maintains consistency with the corresponding
1360 // original ICU APIs.
1361 pub fn get32_u32(&self, code_point: u32) -> u32 {
1362 self.get32(code_point).into()
1363 }
1364}
1365
1366impl<T: TrieValue> Clone for CodePointTrie<'_, T>
1367where
1368 <T as zerovec::ule::AsULE>::ULE: Clone,
1369{
1370 fn clone(&self) -> Self {
1371 CodePointTrie {
1372 header: self.header,
1373 index: self.index.clone(),
1374 data: self.data.clone(),
1375 error_value: self.error_value,
1376 }
1377 }
1378}
1379
1380/// Represents a range of consecutive code points sharing the same value in a
1381/// code point map.
1382///
1383/// The start and end of the interval is represented as a
1384/// `RangeInclusive<u32>`, and the value is represented as `T`.
1385#[derive(PartialEq, Eq, Debug, Clone)]
1386pub struct CodePointMapRange<T> {
1387 /// Range of code points from start to end (inclusive).
1388 pub range: RangeInclusive<u32>,
1389 /// Trie value associated with this range.
1390 pub value: T,
1391}
1392
1393/// A custom [`Iterator`] type specifically for a code point trie that returns
1394/// [`CodePointMapRange`]s.
1395pub struct CodePointMapRangeIterator<'a, T: TrieValue> {
1396 cpt: &'a CodePointTrie<'a, T>,
1397 // Initialize `range` to Some(CodePointMapRange{ start: u32::MAX, end: u32::MAX, value: 0}).
1398 // When `range` is Some(...) and has a start value different from u32::MAX, then we have
1399 // returned at least one code point range due to a call to `next()`.
1400 // When `range` == `None`, it means that we have hit the end of iteration. It would occur
1401 // after a call to `next()` returns a None <=> we attempted to call `get_range()`
1402 // with a start code point that is > CODE_POINT_MAX.
1403 cpm_range: Option<CodePointMapRange<T>>,
1404}
1405
1406impl<T: TrieValue> Iterator for CodePointMapRangeIterator<'_, T> {
1407 type Item = CodePointMapRange<T>;
1408
1409 fn next(&mut self) -> Option<Self::Item> {
1410 self.cpm_range = match &self.cpm_range {
1411 Some(cpmr) => {
1412 if *cpmr.range.start() == u32::MAX {
1413 self.cpt.get_range(0)
1414 } else {
1415 self.cpt.get_range(cpmr.range.end() + 1)
1416 }
1417 }
1418 None => None,
1419 };
1420 // Note: Clone is cheap. We can't Copy because RangeInclusive does not impl Copy.
1421 self.cpm_range.clone()
1422 }
1423}
1424
1425/// For sealing `TypedCodePointTrie`
1426///
1427/// # Safety Usable Invariant
1428///
1429/// All implementations of `TypedCodePointTrie` are reviewable in this module.
1430trait Seal {}
1431
1432/// Trait for writing trait bounds for monomorphizing over either
1433/// `FastCodePointTrie` or `SmallCodePointTrie`.
1434#[allow(private_bounds)] // Permit sealing
1435pub trait TypedCodePointTrie<'trie, T: TrieValue>: Seal {
1436 /// The `TrieType` associated with this `TypedCodePointTrie`
1437 ///
1438 /// # Safety Usable Invariant
1439 ///
1440 /// This constant matches `self.as_untyped_ref().header.trie_type`.
1441 const TRIE_TYPE: TrieType;
1442
1443 /// Lookup trie value as `u32` by Unicode Scalar Value without branching on trie type.
1444 #[inline(always)]
1445 fn get32_u32(&self, code_point: u32) -> u32 {
1446 self.get32(code_point).to_u32()
1447 }
1448
1449 /// Lookup trie value by Basic Multilingual Plane Code Point without branching on trie type.
1450 #[inline(always)]
1451 fn get16(&self, bmp: u16) -> T {
1452 // LLVM's optimizations have been observed not to be 100%
1453 // reliable around collapsing away unnecessary parts of
1454 // `get32`, so not just calling `get32` here.
1455 let code_point = u32::from(bmp);
1456 if let Some(v) = self.get32_by_fast_index(code_point) {
1457 v
1458 } else {
1459 self.as_untyped_ref().get32_by_small_index_cold(code_point)
1460 }
1461 }
1462
1463 /// Lookup trie value by non-Basic Multilingual Plane Scalar Value without branching on trie type.
1464 #[inline(always)]
1465 fn get32_supplementary(&self, supplementary: u32) -> T {
1466 self.as_untyped_ref().get32_supplementary(supplementary)
1467 }
1468
1469 /// Lookup trie value by Unicode Scalar Value without branching on trie type.
1470 #[inline(always)]
1471 fn get(&self, c: char) -> T {
1472 // LLVM's optimizations have been observed not to be 100%
1473 // reliable around collapsing away unnecessary parts of
1474 // `get32`, so not just calling `get32` here.
1475 let code_point = u32::from(c);
1476 if let Some(v) = self.get32_by_fast_index(code_point) {
1477 v
1478 } else {
1479 self.as_untyped_ref().get32_by_small_index_cold(code_point)
1480 }
1481 }
1482
1483 /// Lookup trie value by Unicode Code Point without branching on trie type.
1484 #[inline(always)]
1485 fn get32(&self, code_point: u32) -> T {
1486 if let Some(v) = self.get32_by_fast_index(code_point) {
1487 v
1488 } else if code_point <= CODE_POINT_MAX {
1489 self.as_untyped_ref().get32_by_small_index_cold(code_point)
1490 } else {
1491 self.as_untyped_ref().error_value
1492 }
1493 }
1494
1495 /// Returns the value that is associated with `code_point` in this [`CodePointTrie`]
1496 /// if `code_point` uses fast-path lookup or `None` if `code_point`
1497 /// should use small-path lookup or is above the supported range.
1498 #[inline(always)] // "always" to make the `Option` collapse away
1499 fn get32_by_fast_index(&self, code_point: u32) -> Option<T> {
1500 debug_assert_eq!(Self::TRIE_TYPE, self.as_untyped_ref().header.trie_type);
1501 let fast_max = match Self::TRIE_TYPE {
1502 TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
1503 TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
1504 };
1505 if code_point <= fast_max {
1506 // SAFETY: We just checked the invariant of
1507 // `get32_assuming_fast_index`,
1508 // which is
1509 // "If `self.header.trie_type == TrieType::Small`, `code_point` must be at most
1510 // `SMALL_TYPE_FAST_INDEXING_MAX`. If `self.header.trie_type ==
1511 // TrieType::Fast`, `code_point` must be at most `FAST_TYPE_FAST_INDEXING_MAX`."
1512 // ... assuming that `Self::TRIE_TYPE` always matches
1513 // `self.as_untyped_ref().header.trie_type`, i.e. we're relying on
1514 // `CodePointTrie::to_typed` and `CodePointTrie::as_typed_ref` being correct
1515 // and the exclusive ways of obtaining `Self`.
1516 Some(unsafe { self.as_untyped_ref().get32_assuming_fast_index(code_point) })
1517 } else {
1518 // The caller needs to call `get32_by_small_index` or determine
1519 // that the argument is above the permitted range.
1520 None
1521 }
1522 }
1523
1524 /// Returns a reference to the wrapped `CodePointTrie`.
1525 fn as_untyped_ref(&self) -> &CodePointTrie<'trie, T>;
1526
1527 /// Extracts the wrapped `CodePointTrie`.
1528 fn to_untyped(self) -> CodePointTrie<'trie, T>;
1529}
1530
1531/// Type-safe wrapper for a fast trie guaranteeing
1532/// the the getters don't branch on the trie type
1533/// and for guarenteeing that `get16` is branchless
1534/// in release builds.
1535#[derive(Debug)]
1536#[repr(transparent)]
1537pub struct FastCodePointTrie<'trie, T: TrieValue> {
1538 inner: CodePointTrie<'trie, T>,
1539}
1540
1541impl<'trie, T: TrieValue> TypedCodePointTrie<'trie, T> for FastCodePointTrie<'trie, T> {
1542 const TRIE_TYPE: TrieType = TrieType::Fast;
1543
1544 /// Returns a reference to the wrapped `CodePointTrie`.
1545 #[inline(always)]
1546 fn as_untyped_ref(&self) -> &CodePointTrie<'trie, T> {
1547 &self.inner
1548 }
1549
1550 /// Extracts the wrapped `CodePointTrie`.
1551 #[inline(always)]
1552 fn to_untyped(self) -> CodePointTrie<'trie, T> {
1553 self.inner
1554 }
1555
1556 /// Lookup trie value by Basic Multilingual Plane Code Point without branching on trie type.
1557 #[inline(always)]
1558 fn get16(&self, bmp: u16) -> T {
1559 debug_assert!(u32::from(u16::MAX) <= FAST_TYPE_FAST_INDEXING_MAX);
1560 debug_assert_eq!(Self::TRIE_TYPE, TrieType::Fast);
1561 debug_assert_eq!(self.as_untyped_ref().header.trie_type, TrieType::Fast);
1562 let code_point = u32::from(bmp);
1563 // SAFETY: With `TrieType::Fast`, the `u16` range satisfies
1564 // the invariant of `get32_assuming_fast_index`, which is
1565 // "If `self.header.trie_type == TrieType::Small`, `code_point` must be at most
1566 // `SMALL_TYPE_FAST_INDEXING_MAX`. If `self.header.trie_type ==
1567 // TrieType::Fast`, `code_point` must be at most `FAST_TYPE_FAST_INDEXING_MAX`."
1568 //
1569 // We're relying on `CodePointTrie::to_typed` and `CodePointTrie::as_typed_ref`
1570 // being correct and the exclusive ways of obtaining `Self`.
1571 unsafe { self.as_untyped_ref().get32_assuming_fast_index(code_point) }
1572 }
1573}
1574
1575impl<'trie, T: TrieValue> Seal for FastCodePointTrie<'trie, T> {}
1576
1577impl<'trie, T: TrieValue> TryFrom<&'trie CodePointTrie<'trie, T>>
1578 for &'trie FastCodePointTrie<'trie, T>
1579{
1580 type Error = TypedCodePointTrieError;
1581
1582 fn try_from(
1583 reference: &'trie CodePointTrie<'trie, T>,
1584 ) -> Result<&'trie FastCodePointTrie<'trie, T>, TypedCodePointTrieError> {
1585 match reference.as_typed_ref() {
1586 Typed::Fast(trie) => Ok(trie),
1587 Typed::Small(_) => Err(TypedCodePointTrieError),
1588 }
1589 }
1590}
1591
1592impl<'trie, T: TrieValue> TryFrom<CodePointTrie<'trie, T>> for FastCodePointTrie<'trie, T> {
1593 type Error = TypedCodePointTrieError;
1594
1595 fn try_from(
1596 value: CodePointTrie<'trie, T>,
1597 ) -> Result<FastCodePointTrie<'trie, T>, TypedCodePointTrieError> {
1598 match value.to_typed() {
1599 Typed::Fast(trie) => Ok(trie),
1600 Typed::Small(_) => Err(TypedCodePointTrieError),
1601 }
1602 }
1603}
1604
1605/// Type-safe wrapper for a small trie guaranteeing
1606/// the the getters don't branch on the trie type.
1607#[derive(Debug)]
1608#[repr(transparent)]
1609pub struct SmallCodePointTrie<'trie, T: TrieValue> {
1610 inner: CodePointTrie<'trie, T>,
1611}
1612
1613impl<'trie, T: TrieValue> TypedCodePointTrie<'trie, T> for SmallCodePointTrie<'trie, T> {
1614 const TRIE_TYPE: TrieType = TrieType::Small;
1615
1616 /// Returns a reference to the wrapped `CodePointTrie`.
1617 #[inline(always)]
1618 fn as_untyped_ref(&self) -> &CodePointTrie<'trie, T> {
1619 &self.inner
1620 }
1621
1622 /// Extracts the wrapped `CodePointTrie`.
1623 #[inline(always)]
1624 fn to_untyped(self) -> CodePointTrie<'trie, T> {
1625 self.inner
1626 }
1627}
1628
1629impl<'trie, T: TrieValue> Seal for SmallCodePointTrie<'trie, T> {}
1630
1631impl<'trie, T: TrieValue> TryFrom<&'trie CodePointTrie<'trie, T>>
1632 for &'trie SmallCodePointTrie<'trie, T>
1633{
1634 type Error = TypedCodePointTrieError;
1635
1636 fn try_from(
1637 reference: &'trie CodePointTrie<'trie, T>,
1638 ) -> Result<&'trie SmallCodePointTrie<'trie, T>, TypedCodePointTrieError> {
1639 match reference.as_typed_ref() {
1640 Typed::Fast(_) => Err(TypedCodePointTrieError),
1641 Typed::Small(trie) => Ok(trie),
1642 }
1643 }
1644}
1645
1646impl<'trie, T: TrieValue> TryFrom<CodePointTrie<'trie, T>> for SmallCodePointTrie<'trie, T> {
1647 type Error = TypedCodePointTrieError;
1648
1649 fn try_from(
1650 value: CodePointTrie<'trie, T>,
1651 ) -> Result<SmallCodePointTrie<'trie, T>, TypedCodePointTrieError> {
1652 match value.to_typed() {
1653 Typed::Fast(_) => Err(TypedCodePointTrieError),
1654 Typed::Small(trie) => Ok(trie),
1655 }
1656 }
1657}
1658
1659/// Error indicating that the `TrieType` of an untyped trie
1660/// does not match the requested typed trie type.
1661#[derive(Debug)]
1662#[non_exhaustive]
1663pub struct TypedCodePointTrieError;
1664
1665/// Holder for either fast or small trie with the trie
1666/// type encoded into the Rust type.
1667pub enum Typed<F, S> {
1668 /// The trie type is fast.
1669 Fast(F),
1670 /// The trie type is small.
1671 Small(S),
1672}
1673
1674#[cfg(test)]
1675mod tests {
1676 use super::*;
1677 use crate::codepointtrie::planes;
1678 use alloc::vec::Vec;
1679
1680 #[test]
1681 #[cfg(feature = "serde")]
1682 fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> {
1683 let trie = crate::codepointtrie::planes::get_planes_trie();
1684 let trie_serialized: Vec<u8> = postcard::to_allocvec(&trie).unwrap();
1685
1686 // Assert an expected (golden data) version of the serialized trie.
1687 const EXP_TRIE_SERIALIZED: &[u8] = &[
1688 128, 128, 64, 128, 2, 2, 0, 0, 1, 160, 18, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1689 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1690 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1691 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1692 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 136,
1693 2, 144, 2, 144, 2, 144, 2, 176, 2, 176, 2, 176, 2, 176, 2, 208, 2, 208, 2, 208, 2, 208,
1694 2, 240, 2, 240, 2, 240, 2, 240, 2, 16, 3, 16, 3, 16, 3, 16, 3, 48, 3, 48, 3, 48, 3, 48,
1695 3, 80, 3, 80, 3, 80, 3, 80, 3, 112, 3, 112, 3, 112, 3, 112, 3, 144, 3, 144, 3, 144, 3,
1696 144, 3, 176, 3, 176, 3, 176, 3, 176, 3, 208, 3, 208, 3, 208, 3, 208, 3, 240, 3, 240, 3,
1697 240, 3, 240, 3, 16, 4, 16, 4, 16, 4, 16, 4, 48, 4, 48, 4, 48, 4, 48, 4, 80, 4, 80, 4,
1698 80, 4, 80, 4, 112, 4, 112, 4, 112, 4, 112, 4, 0, 0, 16, 0, 32, 0, 48, 0, 64, 0, 80, 0,
1699 96, 0, 112, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32,
1700 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48,
1701 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 128, 0, 128, 0, 128, 0, 128,
1702 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
1703 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
1704 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1705 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1706 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144,
1707 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1708 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1709 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160,
1710 0, 160, 0, 160, 0, 160, 0, 160, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1711 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1712 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176,
1713 0, 176, 0, 176, 0, 176, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1714 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1715 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192,
1716 0, 192, 0, 192, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1717 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1718 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208,
1719 0, 208, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1720 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1721 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224,
1722 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240,
1723 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240,
1724 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 0,
1725 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
1726 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
1727 1, 0, 1, 0, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1,
1728 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16,
1729 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 32, 1, 32, 1, 32, 1,
1730 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32,
1731 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1,
1732 32, 1, 32, 1, 32, 1, 32, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48,
1733 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1,
1734 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 64, 1, 64,
1735 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1,
1736 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64,
1737 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1,
1738 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80,
1739 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1,
1740 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96,
1741 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1,
1742 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 128, 0, 136, 0, 136, 0, 136, 0, 136,
1743 0, 136, 0, 136, 0, 136, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0,
1744 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
1745 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1746 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1747 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0,
1748 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1749 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1750 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0,
1751 200, 0, 200, 0, 200, 0, 200, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1752 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1753 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0,
1754 232, 0, 232, 0, 232, 0, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8,
1755 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1,
1756 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40,
1757 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1,
1758 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40,
1759 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1,
1760 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72,
1761 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1762 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1763 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1,
1764 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1765 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1766 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1,
1767 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1768 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1769 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1,
1770 168, 1, 168, 1, 168, 1, 168, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1771 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1772 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1,
1773 200, 1, 200, 1, 200, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1774 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1775 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1,
1776 232, 1, 232, 1, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2,
1777 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8,
1778 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40,
1779 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2,
1780 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 72,
1781 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2,
1782 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72,
1783 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1784 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1785 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2,
1786 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 244, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1787 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1788 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1789 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1790 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1791 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1792 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
1793 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
1794 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8,
1795 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10,
1796 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11,
1797 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
1798 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14,
1799 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
1800 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 0,
1801 ];
1802 assert_eq!(trie_serialized, EXP_TRIE_SERIALIZED);
1803
1804 let trie_deserialized = postcard::from_bytes::<CodePointTrie<u8>>(&trie_serialized)?;
1805
1806 assert_eq!(&trie.index, &trie_deserialized.index);
1807 assert_eq!(&trie.data, &trie_deserialized.data);
1808
1809 assert!(!trie_deserialized.index.is_owned());
1810 assert!(!trie_deserialized.data.is_owned());
1811
1812 Ok(())
1813 }
1814
1815 #[test]
1816 fn test_typed() {
1817 let untyped = planes::get_planes_trie();
1818 assert_eq!(untyped.get('\u{10000}'), 1);
1819 let small_ref = <&SmallCodePointTrie<_>>::try_from(&untyped).unwrap();
1820 assert_eq!(small_ref.get('\u{10000}'), 1);
1821 let _ = <&FastCodePointTrie<_>>::try_from(&untyped).is_err();
1822 let small = <SmallCodePointTrie<_>>::try_from(untyped).unwrap();
1823 assert_eq!(small.get('\u{10000}'), 1);
1824 }
1825
1826 #[test]
1827 fn test_get_range() {
1828 let planes_trie = planes::get_planes_trie();
1829
1830 let first_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x0);
1831 assert_eq!(
1832 first_range,
1833 Some(CodePointMapRange {
1834 range: 0x0..=0xffff,
1835 value: 0
1836 })
1837 );
1838
1839 let second_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x1_0000);
1840 assert_eq!(
1841 second_range,
1842 Some(CodePointMapRange {
1843 range: 0x10000..=0x1ffff,
1844 value: 1
1845 })
1846 );
1847
1848 let penultimate_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0xf_0000);
1849 assert_eq!(
1850 penultimate_range,
1851 Some(CodePointMapRange {
1852 range: 0xf_0000..=0xf_ffff,
1853 value: 15
1854 })
1855 );
1856
1857 let last_range: Option<CodePointMapRange<u8>> = planes_trie.get_range(0x10_0000);
1858 assert_eq!(
1859 last_range,
1860 Some(CodePointMapRange {
1861 range: 0x10_0000..=0x10_ffff,
1862 value: 16
1863 })
1864 );
1865 }
1866
1867 #[test]
1868 #[allow(unused_unsafe)] // `unsafe` below is both necessary and unnecessary
1869 fn databake() {
1870 databake::test_bake!(
1871 CodePointTrie<'static, u32>,
1872 const,
1873 unsafe {
1874 crate::codepointtrie::CodePointTrie::from_parts_unstable_unchecked_v1(
1875 crate::codepointtrie::CodePointTrieHeader {
1876 high_start: 1u32,
1877 shifted12_high_start: 2u16,
1878 index3_null_offset: 3u16,
1879 data_null_offset: 4u32,
1880 null_value: 5u32,
1881 trie_type: crate::codepointtrie::TrieType::Small,
1882 },
1883 zerovec::ZeroVec::new(),
1884 zerovec::ZeroVec::new(),
1885 0u32,
1886 )
1887 },
1888 icu_collections,
1889 [zerovec],
1890 );
1891 }
1892}