icu_locale_core/locale.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::parser::*;
6use crate::subtags::Subtag;
7use crate::{extensions, subtags, LanguageIdentifier};
8#[cfg(feature = "alloc")]
9use alloc::borrow::Cow;
10use core::cmp::Ordering;
11#[cfg(feature = "alloc")]
12use core::str::FromStr;
13
/// A core struct representing a [`Unicode Locale Identifier`].
///
/// A locale is made of two parts:
///  * Unicode Language Identifier
///  * A set of Unicode Extensions
///
/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
/// on top of that is able to parse, manipulate and serialize unicode extension fields.
///
/// # Ordering
///
/// This type deliberately does not implement `Ord` or `PartialOrd` because there are
/// multiple possible orderings. Depending on your use case, two orderings are available:
///
/// 1. A string ordering, suitable for stable serialization: [`Locale::strict_cmp`]
/// 2. A struct ordering, suitable for use with a BTreeSet: [`Locale::total_cmp`]
///
/// See issue: <https://github.com/unicode-org/icu4x/issues/1215>
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for a locale:
///
///  * *well-formed* - syntactically correct
///  * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
///  * *canonical* - valid and no deprecated codes or structure.
///
/// Any syntactically invalid subtags will cause the parsing to fail with an error.
///
/// This operation normalizes syntax to be well-formed. No legacy subtag replacements are performed.
/// For validation and canonicalization, see `LocaleCanonicalizer`.
///
/// ICU4X's Locale parsing does not allow for non-BCP-47-compatible locales [allowed by UTS 35 for backwards compatibility][tr35-bcp].
/// Furthermore, it currently does not allow for language tags to have more than three characters.
///
/// # Serde
///
/// This type implements `serde::Serialize` and `serde::Deserialize` if the
/// `"serde"` Cargo feature is enabled on the crate.
///
/// The value will be serialized as a string and parsed when deserialized.
/// For tips on efficient storage and retrieval of locales, see [`crate::zerovec`].
///
/// # Examples
///
/// Simple example:
///
/// ```
/// use icu::locale::{
///     extensions::unicode::{key, value},
///     locale,
///     subtags::{language, region},
/// };
///
/// let loc = locale!("en-US-u-ca-buddhist");
///
/// assert_eq!(loc.id.language, language!("en"));
/// assert_eq!(loc.id.script, None);
/// assert_eq!(loc.id.region, Some(region!("US")));
/// assert_eq!(loc.id.variants.len(), 0);
/// assert_eq!(
///     loc.extensions.unicode.keywords.get(&key!("ca")),
///     Some(&value!("buddhist"))
/// );
/// ```
///
/// More complex example:
///
/// ```
/// use icu::locale::{subtags::*, Locale};
///
/// let loc: Locale = "eN-latn-Us-Valencia-u-hC-H12"
///     .parse()
///     .expect("Failed to parse.");
///
/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
/// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
/// assert_eq!(
///     loc.id.variants.first(),
///     "valencia".parse::<Variant>().ok().as_ref()
/// );
/// ```
///
/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
/// [tr35-bcp]: https://unicode.org/reports/tr35/#BCP_47_Conformance
#[derive(PartialEq, Eq, Clone, Hash)] // no Ord or PartialOrd: see docs
#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
pub struct Locale {
    /// The basic language/script/region components in the locale identifier along with any variants.
    pub id: LanguageIdentifier,
    /// Any extensions present in the locale identifier.
    pub extensions: extensions::Extensions,
}
108
#[test]
// Expected sizes are based on a 64-bit architecture
#[cfg(target_pointer_width = "64")]
fn test_sizes() {
    use core::mem::size_of;

    // Subtags and the language identifier assembled from them.
    assert_eq!(size_of::<subtags::Language>(), 3);
    assert_eq!(size_of::<subtags::Script>(), 4);
    assert_eq!(size_of::<subtags::Region>(), 3);
    assert_eq!(size_of::<subtags::Variant>(), 8);
    assert_eq!(size_of::<subtags::Variants>(), 16);
    assert_eq!(size_of::<LanguageIdentifier>(), 32);

    // Transform extension and its parts.
    assert_eq!(size_of::<extensions::transform::Transform>(), 56);
    assert_eq!(size_of::<Option<LanguageIdentifier>>(), 32);
    assert_eq!(size_of::<extensions::transform::Fields>(), 24);

    // Unicode, other and private extensions, then the aggregate.
    assert_eq!(size_of::<extensions::unicode::Attributes>(), 16);
    assert_eq!(size_of::<extensions::unicode::Keywords>(), 24);
    assert_eq!(size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(size_of::<extensions::private::Private>(), 16);
    assert_eq!(size_of::<extensions::Extensions>(), 136);

    // The full locale: language identifier plus extensions.
    assert_eq!(size_of::<Locale>(), 168);
}
132
133impl Locale {
    /// The unknown locale "und".
    ///
    /// Defined as an associated constant through the `locale!` macro, so it can be
    /// used in `const` contexts.
    pub const UNKNOWN: Self = crate::locale!("und");
136
137 /// A constructor which takes a utf8 slice, parses it and
138 /// produces a well-formed [`Locale`].
139 ///
140 /// ✨ *Enabled with the `alloc` Cargo feature.*
141 ///
142 /// # Examples
143 ///
144 /// ```
145 /// use icu::locale::Locale;
146 ///
147 /// Locale::try_from_str("en-US-u-hc-h12").unwrap();
148 /// ```
149 #[inline]
150 #[cfg(feature = "alloc")]
151 pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
152 Self::try_from_utf8(s.as_bytes())
153 }
154
    /// See [`Self::try_from_str`]
    ///
    /// Takes the input as a UTF-8 byte slice and delegates to the crate's
    /// `parse_locale` parser.
    ///
    /// ✨ *Enabled with the `alloc` Cargo feature.*
    ///
    /// # Errors
    ///
    /// Returns the [`ParseError`] reported by the parser.
    #[cfg(feature = "alloc")]
    pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
        parse_locale(code_units)
    }
162
163 /// Normalize the locale (operating on UTF-8 formatted byte slices)
164 ///
165 /// This operation will normalize casing and the separator.
166 ///
167 /// ✨ *Enabled with the `alloc` Cargo feature.*
168 ///
169 /// # Examples
170 ///
171 /// ```
172 /// use icu::locale::Locale;
173 ///
174 /// assert_eq!(
175 /// Locale::normalize_utf8(b"pL-latn-pl-U-HC-H12").as_deref(),
176 /// Ok("pl-Latn-PL-u-hc-h12")
177 /// );
178 /// ```
179 #[cfg(feature = "alloc")]
180 pub fn normalize_utf8(input: &[u8]) -> Result<Cow<'_, str>, ParseError> {
181 let locale = Self::try_from_utf8(input)?;
182 Ok(writeable::to_string_or_borrow(&locale, input))
183 }
184
185 /// Normalize the locale (operating on strings)
186 ///
187 /// This operation will normalize casing and the separator.
188 ///
189 /// ✨ *Enabled with the `alloc` Cargo feature.*
190 ///
191 /// # Examples
192 ///
193 /// ```
194 /// use icu::locale::Locale;
195 ///
196 /// assert_eq!(
197 /// Locale::normalize("pL-latn-pl-U-HC-H12").as_deref(),
198 /// Ok("pl-Latn-PL-u-hc-h12")
199 /// );
200 /// ```
201 #[cfg(feature = "alloc")]
202 pub fn normalize(input: &str) -> Result<Cow<'_, str>, ParseError> {
203 Self::normalize_utf8(input.as_bytes())
204 }
205
    /// Compare this [`Locale`] with BCP-47 bytes.
    ///
    /// The return value is equivalent to what would happen if you first converted this
    /// [`Locale`] to a BCP-47 string and then performed a byte comparison.
    ///
    /// This function is case-sensitive and results in a *total order*, so it is appropriate for
    /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
    ///
    /// # Examples
    ///
    /// Sorting a list of locales with this method requires converting one of them to a string:
    ///
    /// ```
    /// use icu::locale::Locale;
    /// use std::cmp::Ordering;
    /// use writeable::Writeable;
    ///
    /// // Random input order:
    /// let bcp47_strings: &[&str] = &[
    ///     "und-u-ca-hebrew",
    ///     "ar-Latn",
    ///     "zh-Hant-TW",
    ///     "zh-TW",
    ///     "und-fonipa",
    ///     "zh-Hant",
    ///     "ar-SA",
    /// ];
    ///
    /// let mut locales = bcp47_strings
    ///     .iter()
    ///     .map(|s| s.parse().unwrap())
    ///     .collect::<Vec<Locale>>();
    /// locales.sort_by(|a, b| {
    ///     let b = b.write_to_string();
    ///     a.strict_cmp(b.as_bytes())
    /// });
    /// let strict_cmp_strings = locales
    ///     .iter()
    ///     .map(|l| l.to_string())
    ///     .collect::<Vec<String>>();
    ///
    /// // Output ordering, sorted alphabetically
    /// let expected_ordering: &[&str] = &[
    ///     "ar-Latn",
    ///     "ar-SA",
    ///     "und-fonipa",
    ///     "und-u-ca-hebrew",
    ///     "zh-Hant",
    ///     "zh-Hant-TW",
    ///     "zh-TW",
    /// ];
    ///
    /// assert_eq!(expected_ordering, strict_cmp_strings);
    /// ```
    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
        // The comparison itself is delegated to `writeable::cmp_utf8`.
        writeable::cmp_utf8(self, other)
    }
263
264 #[expect(clippy::type_complexity)]
265 pub(crate) fn as_tuple(
266 &self,
267 ) -> (
268 (
269 subtags::Language,
270 Option<subtags::Script>,
271 Option<subtags::Region>,
272 &subtags::Variants,
273 ),
274 (
275 (
276 &extensions::unicode::Attributes,
277 &extensions::unicode::Keywords,
278 ),
279 (
280 Option<(
281 subtags::Language,
282 Option<subtags::Script>,
283 Option<subtags::Region>,
284 &subtags::Variants,
285 )>,
286 &extensions::transform::Fields,
287 ),
288 &extensions::private::Private,
289 &[extensions::other::Other],
290 ),
291 ) {
292 (self.id.as_tuple(), self.extensions.as_tuple())
293 }
294
295 /// Returns an ordering suitable for use in [`BTreeSet`].
296 ///
297 /// Unlike [`Locale::strict_cmp`], the ordering may or may not be equivalent
298 /// to string ordering, and it may or may not be stable across ICU4X releases.
299 ///
300 /// # Examples
301 ///
302 /// This method returns a nonsensical ordering derived from the fields of the struct:
303 ///
304 /// ```
305 /// use icu::locale::Locale;
306 /// use std::cmp::Ordering;
307 ///
308 /// // Input strings, sorted alphabetically
309 /// let bcp47_strings: &[&str] = &[
310 /// "ar-Latn",
311 /// "ar-SA",
312 /// "und-fonipa",
313 /// "und-u-ca-hebrew",
314 /// "zh-Hant",
315 /// "zh-Hant-TW",
316 /// "zh-TW",
317 /// ];
318 /// assert!(bcp47_strings.windows(2).all(|w| w[0] < w[1]));
319 ///
320 /// let mut locales = bcp47_strings
321 /// .iter()
322 /// .map(|s| s.parse().unwrap())
323 /// .collect::<Vec<Locale>>();
324 /// locales.sort_by(Locale::total_cmp);
325 /// let total_cmp_strings = locales
326 /// .iter()
327 /// .map(|l| l.to_string())
328 /// .collect::<Vec<String>>();
329 ///
330 /// // Output ordering, sorted arbitrarily
331 /// let expected_ordering: &[&str] = &[
332 /// "ar-SA",
333 /// "ar-Latn",
334 /// "und-u-ca-hebrew",
335 /// "und-fonipa",
336 /// "zh-TW",
337 /// "zh-Hant",
338 /// "zh-Hant-TW",
339 /// ];
340 ///
341 /// assert_eq!(expected_ordering, total_cmp_strings);
342 /// ```
343 ///
344 /// Use a wrapper to add a [`Locale`] to a [`BTreeSet`]:
345 ///
346 /// ```no_run
347 /// use icu::locale::Locale;
348 /// use std::cmp::Ordering;
349 /// use std::collections::BTreeSet;
350 ///
351 /// #[derive(PartialEq, Eq)]
352 /// struct LocaleTotalOrd(Locale);
353 ///
354 /// impl Ord for LocaleTotalOrd {
355 /// fn cmp(&self, other: &Self) -> Ordering {
356 /// self.0.total_cmp(&other.0)
357 /// }
358 /// }
359 ///
360 /// impl PartialOrd for LocaleTotalOrd {
361 /// fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
362 /// Some(self.cmp(other))
363 /// }
364 /// }
365 ///
366 /// let _: BTreeSet<LocaleTotalOrd> = unimplemented!();
367 /// ```
368 ///
369 /// [`BTreeSet`]: alloc::collections::BTreeSet
370 pub fn total_cmp(&self, other: &Self) -> Ordering {
371 self.as_tuple().cmp(&other.as_tuple())
372 }
373
374 /// Compare this `Locale` with a potentially unnormalized BCP-47 string.
375 ///
376 /// The return value is equivalent to what would happen if you first parsed the
377 /// BCP-47 string to a `Locale` and then performed a structural comparison.
378 ///
379 /// ✨ *Enabled with the `alloc` Cargo feature.*
380 ///
381 /// # Examples
382 ///
383 /// ```
384 /// use icu::locale::Locale;
385 ///
386 /// let bcp47_strings: &[&str] = &[
387 /// "pl-LaTn-pL",
388 /// "uNd",
389 /// "UND-FONIPA",
390 /// "UnD-t-m0-TrUe",
391 /// "uNd-u-CA-Japanese",
392 /// "ZH",
393 /// ];
394 ///
395 /// for a in bcp47_strings {
396 /// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
397 /// }
398 /// ```
399 #[cfg(feature = "alloc")]
400 pub fn normalizing_eq(&self, other: &str) -> bool {
401 macro_rules! subtag_matches {
402 ($T:ty, $iter:ident, $expected:expr) => {
403 $iter
404 .next()
405 .map(|b| <$T>::try_from_utf8(b) == Ok($expected))
406 .unwrap_or(false)
407 };
408 }
409
410 let mut iter = SubtagIterator::new(other.as_bytes());
411 if !subtag_matches!(subtags::Language, iter, self.id.language) {
412 return false;
413 }
414 if let Some(ref script) = self.id.script {
415 if !subtag_matches!(subtags::Script, iter, *script) {
416 return false;
417 }
418 }
419 if let Some(ref region) = self.id.region {
420 if !subtag_matches!(subtags::Region, iter, *region) {
421 return false;
422 }
423 }
424 for variant in self.id.variants.iter() {
425 if !subtag_matches!(subtags::Variant, iter, *variant) {
426 return false;
427 }
428 }
429 if !self.extensions.is_empty() {
430 match extensions::Extensions::try_from_iter(&mut iter) {
431 Ok(exts) => {
432 if self.extensions != exts {
433 return false;
434 }
435 }
436 Err(_) => {
437 return false;
438 }
439 }
440 }
441 iter.next().is_none()
442 }
443
    /// `const` parsing entry point intended for macro use.
    ///
    /// On success the result is a tuple of: the language, an optional script, an
    /// optional region, an optional single variant, and an optional Unicode extension
    /// keyword made of a key and an optional value subtag.
    ///
    /// Delegates to
    /// `parse_locale_with_single_variant_single_keyword_unicode_keyword_extension`
    /// with [`ParserMode::Locale`].
    ///
    /// # Errors
    ///
    /// Returns the [`ParseError`] reported by the underlying parser.
    #[doc(hidden)] // macro use
    #[expect(clippy::type_complexity)]
    pub const fn try_from_utf8_with_single_variant_single_keyword_unicode_extension(
        code_units: &[u8],
    ) -> Result<
        (
            subtags::Language,
            Option<subtags::Script>,
            Option<subtags::Region>,
            Option<subtags::Variant>,
            Option<(extensions::unicode::Key, Option<Subtag>)>,
        ),
        ParseError,
    > {
        parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
            code_units,
            ParserMode::Locale,
        )
    }
463
464 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
465 where
466 F: FnMut(&str) -> Result<(), E>,
467 {
468 self.id.for_each_subtag_str(f)?;
469 self.extensions.for_each_subtag_str(f)?;
470 Ok(())
471 }
472}
473
/// Parses a [`Locale`] from a string; equivalent to [`Locale::try_from_str`].
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for Locale {
    type Err = ParseError;

    #[inline]
    fn from_str(s: &str) -> Result<Self, ParseError> {
        Locale::try_from_str(s)
    }
}
484
485impl From<LanguageIdentifier> for Locale {
486 fn from(id: LanguageIdentifier) -> Self {
487 Self {
488 id,
489 extensions: extensions::Extensions::default(),
490 }
491 }
492}
493
494impl From<Locale> for LanguageIdentifier {
495 fn from(loc: Locale) -> Self {
496 loc.id
497 }
498}
499
500impl core::fmt::Debug for Locale {
501 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
502 writeable::Writeable::write_to(self, f)
503 }
504}
505
// Crate macro invocation for `Locale`. NOTE(review): presumably this generates the
// `Writeable` (and related) implementations on top of `for_each_subtag_str`, with the
// `extensions.is_empty()` arm reusing the language identifier's borrowed form when there
// are no extensions — confirm against the macro definition.
impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.writeable_borrow());
507
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    assert_writeable_eq!(Locale::UNKNOWN, "und");

    // Every tag below must be written back out exactly as it was given.
    let round_trip_tags = [
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
        "my-t-my-d0-zawgyi",
        "ar-SA-u-ca-islamic-civil",
        "en-001-x-foo-bar",
        "und-t-m0-true",
    ];
    for tag in round_trip_tags {
        let parsed = tag.parse::<Locale>().unwrap();
        assert_writeable_eq!(parsed, tag);
    }
}
537
538/// # Examples
539///
540/// ```
541/// use icu::locale::Locale;
542/// use icu::locale::{locale, subtags::language};
543///
544/// assert_eq!(Locale::from(language!("en")), locale!("en"));
545/// ```
546impl From<subtags::Language> for Locale {
547 fn from(language: subtags::Language) -> Self {
548 Self {
549 id: language.into(),
550 extensions: extensions::Extensions::new(),
551 }
552 }
553}
554
555/// # Examples
556///
557/// ```
558/// use icu::locale::Locale;
559/// use icu::locale::{locale, subtags::script};
560///
561/// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
562/// ```
563impl From<Option<subtags::Script>> for Locale {
564 fn from(script: Option<subtags::Script>) -> Self {
565 Self {
566 id: script.into(),
567 extensions: extensions::Extensions::new(),
568 }
569 }
570}
571
572/// # Examples
573///
574/// ```
575/// use icu::locale::Locale;
576/// use icu::locale::{locale, subtags::region};
577///
578/// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
579/// ```
580impl From<Option<subtags::Region>> for Locale {
581 fn from(region: Option<subtags::Region>) -> Self {
582 Self {
583 id: region.into(),
584 extensions: extensions::Extensions::new(),
585 }
586 }
587}
588
589/// # Examples
590///
591/// ```
592/// use icu::locale::Locale;
593/// use icu::locale::{
594/// locale,
595/// subtags::{language, region, script},
596/// };
597///
598/// assert_eq!(
599/// Locale::from((
600/// language!("en"),
601/// Some(script!("Latn")),
602/// Some(region!("US"))
603/// )),
604/// locale!("en-Latn-US")
605/// );
606/// ```
607impl
608 From<(
609 subtags::Language,
610 Option<subtags::Script>,
611 Option<subtags::Region>,
612 )> for Locale
613{
614 fn from(
615 lsr: (
616 subtags::Language,
617 Option<subtags::Script>,
618 Option<subtags::Region>,
619 ),
620 ) -> Self {
621 Self {
622 id: lsr.into(),
623 extensions: extensions::Extensions::new(),
624 }
625 }
626}