icu_locale_core/extensions/mod.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Unicode Extensions provide a mechanism to extend the [`LanguageIdentifier`] with
6//! additional bits of information - a combination of a [`LanguageIdentifier`] and [`Extensions`]
7//! is called [`Locale`].
8//!
9//! There are four types of extensions:
10//!
11//! * [`Unicode Extensions`] - marked as `u`.
12//! * [`Transform Extensions`] - marked as `t`.
13//! * [`Private Use Extensions`] - marked as `x`.
//! * [`Other Extensions`] - marked as any `a-z` except for `u`, `t` and `x`.
15//!
//! One can think of extensions as a bag of extra information on top of the four basic [`subtags`].
17//!
18//! Notice: `Other` extension type is currently not supported.
19//!
20//! # Examples
21//!
22//! ```
23//! use icu::locale::extensions::unicode::{Key, Value};
24//! use icu::locale::Locale;
25//!
26//! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo"
27//! .parse()
28//! .expect("Failed to parse.");
29//!
30//! assert_eq!(loc.id.language, "en".parse().unwrap());
31//! assert_eq!(loc.id.script, None);
32//! assert_eq!(loc.id.region, Some("US".parse().unwrap()));
33//! assert_eq!(loc.id.variants.len(), 0);
34//!
35//! let key: Key = "ca".parse().expect("Parsing key failed.");
36//! let value: Value = "buddhist".parse().expect("Parsing value failed.");
37//! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value));
38//! ```
39//!
40//! # Syntactic vs Semantic Extension Handling
41//!
42//! This module is useful when you need to work with Locale extensions at a syntactic level,
43//! perhaps for parsing or generating locale identifiers that include any syntactically valid
44//! extensions.
45//! For handling and validating known CLDR values with semantic meaning, see the
46//! [`crate::preferences::extensions`] module.
47//!
48//! [`LanguageIdentifier`]: super::LanguageIdentifier
49//! [`Locale`]: super::Locale
50//! [`subtags`]: super::subtags
51//! [`Other Extensions`]: other
52//! [`Private Use Extensions`]: private
53//! [`Transform Extensions`]: transform
54//! [`Unicode Extensions`]: unicode
55pub mod other;
56pub mod private;
57pub mod transform;
58pub mod unicode;
59
60use core::cmp::Ordering;
61
62use other::Other;
63use private::{Private, PRIVATE_EXT_CHAR};
64use transform::{Transform, TRANSFORM_EXT_CHAR};
65use unicode::{Unicode, UNICODE_EXT_CHAR};
66
67#[cfg(feature = "alloc")]
68use alloc::vec::Vec;
69
70use crate::parser::ParseError;
71#[cfg(feature = "alloc")]
72use crate::parser::SubtagIterator;
73use crate::subtags;
74
/// Identifies which kind of extension a singleton subtag introduces.
///
/// The derived ordering follows the declaration order of the variants
/// (`Transform`, `Unicode`, `Private`, then `Other` by its byte).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[non_exhaustive]
pub enum ExtensionType {
    /// The transform extension, introduced by the singleton `t`.
    Transform,
    /// The Unicode extension, introduced by the singleton `u`.
    Unicode,
    /// The private-use extension, introduced by the singleton `x`.
    Private,
    /// Any other extension. The payload is the singleton byte; the crate's own
    /// parsing stores it ASCII-lowercased.
    Other(u8),
}
88
89impl ExtensionType {
90 #[allow(dead_code)]
91 pub(crate) const fn try_from_byte_slice(key: &[u8]) -> Result<Self, ParseError> {
92 if let [b] = key {
93 Self::try_from_byte(*b)
94 } else {
95 Err(ParseError::InvalidExtension)
96 }
97 }
98
99 pub(crate) const fn try_from_byte(key: u8) -> Result<Self, ParseError> {
100 let key = key.to_ascii_lowercase();
101 match key as char {
102 UNICODE_EXT_CHAR => Ok(Self::Unicode),
103 TRANSFORM_EXT_CHAR => Ok(Self::Transform),
104 PRIVATE_EXT_CHAR => Ok(Self::Private),
105 'a'..='z' => Ok(Self::Other(key)),
106 _ => Err(ParseError::InvalidExtension),
107 }
108 }
109
110 pub(crate) const fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
111 let &[first] = code_units else {
112 return Err(ParseError::InvalidExtension);
113 };
114
115 Self::try_from_byte(first)
116 }
117}
118
119/// A map of extensions associated with a given [`Locale`](crate::Locale).
120#[derive(Debug, Default, PartialEq, Eq, Clone, Hash)]
121#[non_exhaustive]
122pub struct Extensions {
123 /// A representation of the data for a Unicode extension, when present in the locale identifier.
124 pub unicode: Unicode,
125 /// A representation of the data for a transform extension, when present in the locale identifier.
126 pub transform: Transform,
127 /// A representation of the data for a private-use extension, when present in the locale identifier.
128 pub private: Private,
129 /// A sequence of any other extensions that are present in the locale identifier but are not formally
130 /// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`],
131 /// and [`Private`] are.
132 #[cfg(feature = "alloc")]
133 pub other: Vec<Other>,
134 /// A sequence of any other extensions that are present in the locale identifier but are not formally
135 /// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`],
136 /// and [`Private`] are.
137 #[cfg(not(feature = "alloc"))]
138 pub other: &'static [Other],
139}
140
141impl Extensions {
142 /// Returns a new empty map of extensions. Same as [`default()`](Default::default()), but is `const`.
143 ///
144 /// # Examples
145 ///
146 /// ```
147 /// use icu::locale::extensions::Extensions;
148 ///
149 /// assert_eq!(Extensions::new(), Extensions::default());
150 /// ```
151 #[inline]
152 pub const fn new() -> Self {
153 Self {
154 unicode: Unicode::new(),
155 transform: Transform::new(),
156 private: Private::new(),
157 #[cfg(feature = "alloc")]
158 other: Vec::new(),
159 #[cfg(not(feature = "alloc"))]
160 other: &[],
161 }
162 }
163
164 /// Function to create a new map of extensions containing exactly one unicode extension, callable in `const`
165 /// context.
166 #[inline]
167 pub const fn from_unicode(unicode: Unicode) -> Self {
168 Self {
169 unicode,
170 transform: Transform::new(),
171 private: Private::new(),
172 #[cfg(feature = "alloc")]
173 other: Vec::new(),
174 #[cfg(not(feature = "alloc"))]
175 other: &[],
176 }
177 }
178
179 /// Returns whether there are no extensions present.
180 ///
181 /// # Examples
182 ///
183 /// ```
184 /// use icu::locale::Locale;
185 ///
186 /// let loc: Locale = "en-US-u-foo".parse().expect("Parsing failed.");
187 ///
188 /// assert!(!loc.extensions.is_empty());
189 /// ```
190 pub fn is_empty(&self) -> bool {
191 self.unicode.is_empty()
192 && self.transform.is_empty()
193 && self.private.is_empty()
194 && self.other.is_empty()
195 }
196
197 #[expect(clippy::type_complexity)]
198 pub(crate) fn as_tuple(
199 &self,
200 ) -> (
201 (&unicode::Attributes, &unicode::Keywords),
202 (
203 Option<(
204 subtags::Language,
205 Option<subtags::Script>,
206 Option<subtags::Region>,
207 &subtags::Variants,
208 )>,
209 &transform::Fields,
210 ),
211 &private::Private,
212 &[other::Other],
213 ) {
214 (
215 self.unicode.as_tuple(),
216 self.transform.as_tuple(),
217 &self.private,
218 &self.other,
219 )
220 }
221
222 /// Returns an ordering suitable for use in [`BTreeSet`].
223 ///
224 /// The ordering may or may not be equivalent to string ordering, and it
225 /// may or may not be stable across ICU4X releases.
226 ///
227 /// [`BTreeSet`]: alloc::collections::BTreeSet
228 pub fn total_cmp(&self, other: &Self) -> Ordering {
229 self.as_tuple().cmp(&other.as_tuple())
230 }
231
232 /// Retains the specified extension types, clearing all others.
233 ///
234 /// ✨ *Enabled with the `alloc` Cargo feature.*
235 ///
236 /// # Examples
237 ///
238 /// ```
239 /// use icu::locale::extensions::ExtensionType;
240 /// use icu::locale::Locale;
241 ///
242 /// let loc: Locale =
243 /// "und-a-hello-t-mul-u-world-z-zzz-x-extra".parse().unwrap();
244 ///
245 /// let mut only_unicode = loc.clone();
246 /// only_unicode
247 /// .extensions
248 /// .retain_by_type(|t| t == ExtensionType::Unicode);
249 /// assert_eq!(only_unicode, "und-u-world".parse().unwrap());
250 ///
251 /// let mut only_t_z = loc.clone();
252 /// only_t_z.extensions.retain_by_type(|t| {
253 /// t == ExtensionType::Transform || t == ExtensionType::Other(b'z')
254 /// });
255 /// assert_eq!(only_t_z, "und-t-mul-z-zzz".parse().unwrap());
256 /// ```
257 #[cfg(feature = "alloc")]
258 pub fn retain_by_type<F>(&mut self, mut predicate: F)
259 where
260 F: FnMut(ExtensionType) -> bool,
261 {
262 if !predicate(ExtensionType::Unicode) {
263 self.unicode.clear();
264 }
265 if !predicate(ExtensionType::Transform) {
266 self.transform.clear();
267 }
268 if !predicate(ExtensionType::Private) {
269 self.private.clear();
270 }
271 #[cfg(feature = "alloc")]
272 self.other
273 .retain(|o| predicate(ExtensionType::Other(o.get_ext_byte())));
274 }
275
276 #[cfg(feature = "alloc")]
277 pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
278 let mut unicode = None;
279 let mut transform = None;
280 let mut private = None;
281 let mut other = Vec::new();
282
283 while let Some(subtag) = iter.next() {
284 if subtag.is_empty() {
285 return Err(ParseError::InvalidExtension);
286 }
287
288 let &[subtag] = subtag else {
289 return Err(ParseError::InvalidExtension);
290 };
291
292 match ExtensionType::try_from_byte(subtag) {
293 Ok(ExtensionType::Unicode) => {
294 if unicode.is_some() {
295 return Err(ParseError::DuplicatedExtension);
296 }
297 unicode = Some(Unicode::try_from_iter(iter)?);
298 }
299 Ok(ExtensionType::Transform) => {
300 if transform.is_some() {
301 return Err(ParseError::DuplicatedExtension);
302 }
303 transform = Some(Transform::try_from_iter(iter)?);
304 }
305 Ok(ExtensionType::Private) => {
306 if private.is_some() {
307 return Err(ParseError::DuplicatedExtension);
308 }
309 private = Some(Private::try_from_iter(iter)?);
310 }
311 Ok(ExtensionType::Other(ext)) => {
312 if other.iter().any(|o: &Other| o.get_ext_byte() == ext) {
313 return Err(ParseError::DuplicatedExtension);
314 }
315 let parsed = Other::try_from_iter(ext, iter)?;
316 if let Err(idx) = other.binary_search(&parsed) {
317 other.insert(idx, parsed);
318 } else {
319 return Err(ParseError::InvalidExtension);
320 }
321 }
322 _ => return Err(ParseError::InvalidExtension),
323 }
324 }
325
326 Ok(Self {
327 unicode: unicode.unwrap_or_default(),
328 transform: transform.unwrap_or_default(),
329 private: private.unwrap_or_default(),
330 other,
331 })
332 }
333
334 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
335 where
336 F: FnMut(&str) -> Result<(), E>,
337 {
338 let mut wrote_tu = false;
339 // Alphabetic by singleton
340 self.other.iter().try_for_each(|other| {
341 if other.get_ext() > TRANSFORM_EXT_CHAR && !wrote_tu {
342 // Since 't' and 'u' are next to each other in alphabetical
343 // order, write both now.
344 self.transform.for_each_subtag_str(f, true)?;
345 self.unicode.for_each_subtag_str(f, true)?;
346 wrote_tu = true;
347 }
348 other.for_each_subtag_str(f, true)?;
349 Ok(())
350 })?;
351
352 if !wrote_tu {
353 self.transform.for_each_subtag_str(f, true)?;
354 self.unicode.for_each_subtag_str(f, true)?;
355 }
356
357 // Private must be written last, since it allows single character
358 // keys. Extensions must also be written in alphabetical order,
359 // which would seem to imply that other extensions `y` and `z` are
360 // invalid, but this is not specified.
361 self.private.for_each_subtag_str(f, true)?;
362 Ok(())
363 }
364}
365
// NOTE(review): presumably generates the `writeable::Writeable` implementation
// for `Extensions` on top of `Extensions::for_each_subtag_str`; the `_no_test`
// suffix suggests the macro's own generated test is skipped in favour of the
// hand-written `test_writeable` below — confirm against the macro definition.
impl_writeable_for_each_subtag_str_no_test!(Extensions);
367
/// Checks that `Extensions` serializes to the expected string, both when empty
/// and when parsed out of a full locale identifier.
#[test]
fn test_writeable() {
    use crate::Locale;
    use writeable::assert_writeable_eq;

    // No extensions at all: nothing is written.
    assert_writeable_eq!(Extensions::new(), "");

    // (locale identifier to parse, expected serialization of its extensions)
    let cases: [(&str, &str); 6] = [
        ("my-t-my-d0-zawgyi", "t-my-d0-zawgyi"),
        ("ar-SA-u-ca-islamic-civil", "u-ca-islamic-civil"),
        ("en-001-x-foo-bar", "x-foo-bar"),
        ("und-t-m0-true", "t-m0-true"),
        (
            "und-a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
            "a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
        ),
    ];

    for &(input, expected) in &cases {
        let locale = input.parse::<Locale>().unwrap();
        assert_writeable_eq!(locale.extensions, expected);
    }
}
399}