icu_locale_core/data.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::extensions::unicode as unicode_ext;
6use crate::subtags::{Language, Region, Script, Subtag, Variant};
7#[cfg(feature = "alloc")]
8use crate::ParseError;
9use crate::{LanguageIdentifier, Locale};
10use core::cmp::Ordering;
11use core::default::Default;
12use core::fmt;
13use core::hash::Hash;
14#[cfg(feature = "alloc")]
15use core::str::FromStr;
16
/// A locale type optimized for use in fallbacking and the ICU4X data pipeline.
///
/// [`DataLocale`] contains less functionality than [`Locale`] but more than
/// [`LanguageIdentifier`] for better size and performance while still meeting
/// the needs of the ICU4X data pipeline.
///
/// You can create a [`DataLocale`] from a borrowed [`Locale`], which avoids
/// cloning the [`Locale`]:
///
/// ```
/// use icu_locale_core::locale;
/// use icu_provider::DataLocale;
///
/// let locale1 = locale!("en-u-ca-buddhist");
/// let data_locale = DataLocale::from(&locale1);
/// ```
///
/// [`DataLocale`] only supports `-u-sd` keywords, to reflect the current state of CLDR data
/// lookup and fallback. This may change in the future.
///
/// ```
/// use icu_locale_core::{locale, Locale};
/// use icu_provider::DataLocale;
///
/// let locale = "hi-IN-t-en-h0-hybrid-u-attr-ca-buddhist-sd-inas"
///     .parse::<Locale>()
///     .unwrap();
///
/// assert_eq!(
///     DataLocale::from(locale),
///     DataLocale::from(locale!("hi-IN-u-sd-inas"))
/// );
/// ```
///
/// At most one variant is stored. The `From` conversions keep only the first
/// variant of the source, whereas parsing a string with more than one variant
/// is rejected.
#[derive(Clone, Copy, PartialEq, Hash, Eq)]
#[non_exhaustive]
pub struct DataLocale {
    /// Language subtag
    pub language: Language,
    /// Script subtag
    pub script: Option<Script>,
    /// Region subtag
    pub region: Option<Region>,
    /// Variant subtag (at most one is retained)
    pub variant: Option<Variant>,
    /// Subdivision (-u-sd-) subtag
    pub subdivision: Option<Subtag>,
}
65
66impl Default for DataLocale {
67 fn default() -> Self {
68 Self {
69 language: Language::UNKNOWN,
70 script: None,
71 region: None,
72 variant: None,
73 subdivision: None,
74 }
75 }
76}
77
78impl DataLocale {
79 /// `const` version of `Default::default`
80 pub const fn default() -> Self {
81 DataLocale {
82 language: Language::UNKNOWN,
83 script: None,
84 region: None,
85 variant: None,
86 subdivision: None,
87 }
88 }
89}
90
91impl Default for &DataLocale {
92 fn default() -> Self {
93 static DEFAULT: DataLocale = DataLocale::default();
94 &DEFAULT
95 }
96}
97
98impl fmt::Debug for DataLocale {
99 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
100 write!(f, "DataLocale{{{self}}}")
101 }
102}
103
// Presumably generates the string-serialization impls for `DataLocale` on top of
// `for_each_subtag_str` (the `Debug` impl and the tests in this file rely on them).
// NOTE(review): the trailing `condition => Some(..)` argument looks like a fast path that
// hands out the language string directly when no other subtag is set — confirm against
// the macro definition.
impl_writeable_for_each_subtag_str_no_test!(DataLocale, selff, selff.script.is_none() && selff.region.is_none() && selff.variant.is_none() && selff.subdivision.is_none() => Some(selff.language.as_str()));
105
106impl From<LanguageIdentifier> for DataLocale {
107 fn from(langid: LanguageIdentifier) -> Self {
108 Self::from(&langid)
109 }
110}
111
112impl From<Locale> for DataLocale {
113 fn from(locale: Locale) -> Self {
114 Self::from(&locale)
115 }
116}
117
118impl From<&LanguageIdentifier> for DataLocale {
119 fn from(langid: &LanguageIdentifier) -> Self {
120 Self {
121 language: langid.language,
122 script: langid.script,
123 region: langid.region,
124 variant: langid.variants.iter().copied().next(),
125 subdivision: None,
126 }
127 }
128}
129
130impl From<&Locale> for DataLocale {
131 fn from(locale: &Locale) -> Self {
132 let mut r = Self::from(&locale.id);
133
134 r.subdivision = locale
135 .extensions
136 .unicode
137 .keywords
138 .get(&unicode_ext::key!("sd"))
139 .and_then(|v| v.as_single_subtag().copied());
140 r
141 }
142}
143
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[cfg(feature = "alloc")]
impl FromStr for DataLocale {
    type Err = ParseError;

    /// Parses `s` as a [`DataLocale`]; see [`DataLocale::try_from_utf8`] for
    /// the accepted inputs.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Same call that `try_from_str` makes.
        DataLocale::try_from_utf8(s.as_bytes())
    }
}
153
154impl DataLocale {
155 #[inline]
156 /// Parses a [`DataLocale`].
157 ///
158 /// ✨ *Enabled with the `alloc` Cargo feature.*
159 #[cfg(feature = "alloc")]
160 pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
161 Self::try_from_utf8(s.as_bytes())
162 }
163
164 /// Parses a [`DataLocale`] from a UTF-8 byte slice.
165 ///
166 /// ✨ *Enabled with the `alloc` Cargo feature.*
167 #[cfg(feature = "alloc")]
168 pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
169 let locale = Locale::try_from_utf8(code_units)?;
170 if locale.id.variants.len() > 1
171 || !locale.extensions.transform.is_empty()
172 || !locale.extensions.private.is_empty()
173 || !locale.extensions.other.is_empty()
174 || !locale.extensions.unicode.attributes.is_empty()
175 {
176 return Err(ParseError::InvalidExtension);
177 }
178
179 let unicode_extensions_count = locale.extensions.unicode.keywords.iter().count();
180
181 if unicode_extensions_count != 0
182 && (unicode_extensions_count != 1
183 || !locale
184 .extensions
185 .unicode
186 .keywords
187 .contains_key(&unicode_ext::key!("sd")))
188 {
189 return Err(ParseError::InvalidExtension);
190 }
191
192 Ok(locale.into())
193 }
194
195 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
196 where
197 F: FnMut(&str) -> Result<(), E>,
198 {
199 f(self.language.as_str())?;
200 if let Some(ref script) = self.script {
201 f(script.as_str())?;
202 }
203 if let Some(ref region) = self.region {
204 f(region.as_str())?;
205 }
206 if let Some(ref single_variant) = self.variant {
207 f(single_variant.as_str())?;
208 }
209 if let Some(ref subdivision) = self.subdivision {
210 f("u")?;
211 f("sd")?;
212 f(subdivision.as_str())?;
213 }
214 Ok(())
215 }
216
217 fn as_tuple(
218 &self,
219 ) -> (
220 Language,
221 Option<Script>,
222 Option<Region>,
223 Option<Variant>,
224 Option<Subtag>,
225 ) {
226 (
227 self.language,
228 self.script,
229 self.region,
230 self.variant,
231 self.subdivision,
232 )
233 }
234
235 /// Returns an ordering suitable for use in [`BTreeSet`].
236 ///
237 /// [`BTreeSet`]: alloc::collections::BTreeSet
238 pub fn total_cmp(&self, other: &Self) -> Ordering {
239 self.as_tuple().cmp(&other.as_tuple())
240 }
241
    /// Compare this [`DataLocale`] with BCP-47 bytes.
    ///
    /// The return value is equivalent to what would happen if you first converted this
    /// [`DataLocale`] to a BCP-47 string and then performed a byte comparison.
    ///
    /// This function is case-sensitive and results in a *total order*, so it is appropriate for
    /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
    ///
    /// # Examples
    ///
    /// ```
    /// use core::cmp::Ordering;
    /// use icu_provider::DataLocale;
    ///
    /// let bcp47_strings: &[&str] = &[
    ///     "ca",
    ///     "ca-ES",
    ///     "ca-ES-u-sd-esct",
    ///     "ca-ES-valencia",
    ///     "cat",
    ///     "pl-Latn-PL",
    ///     "und",
    ///     "und-fonipa",
    ///     "zh",
    /// ];
    ///
    /// for ab in bcp47_strings.windows(2) {
    ///     let a = ab[0];
    ///     let b = ab[1];
    ///     assert_eq!(a.cmp(b), Ordering::Less, "strings: {} < {}", a, b);
    ///     let a_loc: DataLocale = a.parse().unwrap();
    ///     assert_eq!(
    ///         a_loc.strict_cmp(a.as_bytes()),
    ///         Ordering::Equal,
    ///         "strict_cmp: {} == {}",
    ///         a_loc,
    ///         a
    ///     );
    ///     assert_eq!(
    ///         a_loc.strict_cmp(b.as_bytes()),
    ///         Ordering::Less,
    ///         "strict_cmp: {} < {}",
    ///         a_loc,
    ///         b
    ///     );
    ///     let b_loc: DataLocale = b.parse().unwrap();
    ///     assert_eq!(
    ///         b_loc.strict_cmp(b.as_bytes()),
    ///         Ordering::Equal,
    ///         "strict_cmp: {} == {}",
    ///         b_loc,
    ///         b
    ///     );
    ///     assert_eq!(
    ///         b_loc.strict_cmp(a.as_bytes()),
    ///         Ordering::Greater,
    ///         "strict_cmp: {} > {}",
    ///         b_loc,
    ///         a
    ///     );
    /// }
    /// ```
    ///
    /// Comparison against invalid strings:
    ///
    /// ```
    /// use icu_provider::DataLocale;
    ///
    /// let invalid_strings: &[&str] = &[
    ///     // Less than "ca-ES"
    ///     "CA",
    ///     "ar-x-gbp-FOO",
    ///     // Greater than "ca-ES"
    ///     "ca_ES",
    ///     "ca-ES-x-gbp-FOO",
    /// ];
    ///
    /// let data_locale = "ca-ES".parse::<DataLocale>().unwrap();
    ///
    /// for s in invalid_strings.iter() {
    ///     let expected_ordering = "ca-ES".cmp(s);
    ///     let actual_ordering = data_locale.strict_cmp(s.as_bytes());
    ///     assert_eq!(expected_ordering, actual_ordering, "{}", s);
    /// }
    /// ```
    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
        writeable::cmp_utf8(self, other)
    }
330
331 /// Returns whether this [`DataLocale`] is `und` in the locale and extensions portion.
332 ///
333 /// # Examples
334 ///
335 /// ```
336 /// use icu_provider::DataLocale;
337 ///
338 /// assert!("und".parse::<DataLocale>().unwrap().is_unknown());
339 /// assert!(!"de-u-sd-denw".parse::<DataLocale>().unwrap().is_unknown());
340 /// assert!(!"und-ES".parse::<DataLocale>().unwrap().is_unknown());
341 /// ```
342 pub fn is_unknown(&self) -> bool {
343 self.language.is_unknown()
344 && self.script.is_none()
345 && self.region.is_none()
346 && self.variant.is_none()
347 && self.subdivision.is_none()
348 }
349
350 /// Converts this `DataLocale` into a [`Locale`].
351 pub fn into_locale(self) -> Locale {
352 Locale {
353 id: LanguageIdentifier {
354 language: self.language,
355 script: self.script,
356 region: self.region,
357 variants: self
358 .variant
359 .map(crate::subtags::Variants::from_variant)
360 .unwrap_or_default(),
361 },
362 extensions: {
363 let mut extensions = crate::extensions::Extensions::default();
364 if let Some(sd) = self.subdivision {
365 extensions.unicode = unicode_ext::Unicode {
366 keywords: unicode_ext::Keywords::new_single(
367 unicode_ext::key!("sd"),
368 unicode_ext::Value::from_subtag(Some(sd)),
369 ),
370 ..Default::default()
371 }
372 }
373 extensions
374 },
375 }
376 }
377}
378
// Parsing a `DataLocale` from a string goes through the `FromStr` impl, which only exists
// with the `alloc` feature, so the test is gated on the same feature.
#[cfg(feature = "alloc")]
#[test]
fn test_data_locale_to_string() {
    // Each case is `(input, expected serialization)`.
    let cases: [(&str, &str); 3] = [
        ("und", "und"),
        ("und-u-sd-sdd", "und-u-sd-sdd"),
        ("en-ZA-u-sd-zaa", "en-ZA-u-sd-zaa"),
    ];

    for (input, expected) in cases {
        let locale = input.parse::<DataLocale>().unwrap();
        writeable::assert_writeable_eq!(locale, expected);
    }
}
404
// `DataLocale::from_str` needs both the `FromStr` impl and the `FromStr` import, which only
// exist with the `alloc` feature, so the test is gated on the same feature.
#[cfg(feature = "alloc")]
#[test]
fn test_data_locale_from_string() {
    #[derive(Debug)]
    struct TestCase {
        pub input: &'static str,
        pub success: bool,
    }

    let cases = [
        TestCase {
            input: "und",
            success: true,
        },
        TestCase {
            input: "und-u-cu-gbp",
            success: false,
        },
        TestCase {
            input: "en-ZA-u-sd-zaa",
            success: true,
        },
        TestCase {
            input: "en...",
            success: false,
        },
    ];

    for cas in cases {
        let data_locale = match (DataLocale::from_str(cas.input), cas.success) {
            (Ok(parsed), true) => parsed,
            // An expected failure: nothing further to check.
            (Err(_), false) => continue,
            (Ok(_), false) => {
                panic!("DataLocale parsed but it was supposed to fail: {cas:?}");
            }
            (Err(_), true) => {
                panic!("DataLocale was supposed to parse but it failed: {cas:?}");
            }
        };
        // Every accepted input must serialize back to itself.
        writeable::assert_writeable_eq!(data_locale, cas.input);
    }
}
445}