cap_primitives/rustix/linux/fs/
open_impl.rs

//! Linux 5.6 and later have a syscall `openat2`, with flags that allow it to
//! enforce the sandboxing property we want. See the [LWN article] for an
//! overview and the [`openat2` documentation] for details.
//!
//! [LWN article]: https://lwn.net/Articles/796868/
//! [`openat2` documentation]: https://man7.org/linux/man-pages/man2/openat2.2.html
//!
//! On older Linux, fall back to `manually::open`.
9
10#[cfg(racy_asserts)]
11use crate::fs::is_same_file;
12use crate::fs::{manually, OpenOptions};
13use std::path::Path;
14use std::{fs, io};
15#[cfg(target_os = "linux")]
16use {
17    super::super::super::fs::compute_oflags,
18    crate::fs::errors,
19    io_lifetimes::FromFd,
20    rustix::fs::{openat2, Mode, OFlags, RawMode, ResolveFlags},
21    rustix::path::Arg,
22    std::sync::atomic::AtomicBool,
23    std::sync::atomic::Ordering::Relaxed,
24};
25
26/// Call the `openat2` system call, or use a fallback if that's unavailable.
27pub(crate) fn open_impl(
28    start: &fs::File,
29    path: &Path,
30    options: &OpenOptions,
31) -> io::Result<fs::File> {
32    // On regular Linux, attempt to use `openat2` to accelerate sandboxed
33    // lookups. On Android, the [seccomp policy] prevents us from even
34    // detecting whether `openat2` is supported, so don't even try.
35    //
36    // [seccomp policy]: https://android-developers.googleblog.com/2017/07/seccomp-filter-in-android-o.html
37    #[cfg(target_os = "linux")]
38    {
39        let result = open_beneath(start, path, options);
40
41        // If we got anything other than a `ENOSYS` error, that's our result.
42        match result {
43            Err(err) if err.raw_os_error() == Some(rustix::io::Errno::NOSYS.raw_os_error()) => {}
44            Err(err) => return Err(err),
45            Ok(fd) => return Ok(fd),
46        }
47    }
48
49    manually::open(start, path, options)
50}
51
52/// Call the `openat2` system call with `RESOLVE_BENEATH`. If the syscall is
53/// unavailable, mark it so for future calls. If `openat2` is unavailable
54/// either permanently or temporarily, return `ENOSYS`.
55#[cfg(target_os = "linux")]
56pub(crate) fn open_beneath(
57    start: &fs::File,
58    path: &Path,
59    options: &OpenOptions,
60) -> io::Result<fs::File> {
61    static INVALID: AtomicBool = AtomicBool::new(false);
62    if INVALID.load(Relaxed) {
63        // `openat2` is permanently unavailable.
64        return Err(rustix::io::Errno::NOSYS.into());
65    }
66
67    let oflags = compute_oflags(options)?;
68
69    // Do two `contains` checks because `TMPFILE` may be represented with
70    // multiple flags and we need to ensure they're all set.
71    let mode = if oflags.contains(OFlags::CREATE) || oflags.contains(OFlags::TMPFILE) {
72        Mode::from_bits((options.ext.mode & 0o7777) as RawMode).unwrap()
73    } else {
74        Mode::empty()
75    };
76
77    // We know `openat2` needs a `&CStr` internally; to avoid allocating on
78    // each iteration of the loop below, allocate the `CString` now.
79    path.into_with_c_str(|path_c_str| {
80        // `openat2` fails with `EAGAIN` if a rename happens anywhere on the host
81        // while it's running, so use a loop to retry it a few times. But not too many
82        // times, because there's no limit on how often this can happen. The actual
83        // number here is currently an arbitrarily chosen guess.
84        for _ in 0..4 {
85            match openat2(
86                start,
87                path_c_str,
88                oflags,
89                mode,
90                ResolveFlags::BENEATH | ResolveFlags::NO_MAGICLINKS,
91            ) {
92                Ok(file) => {
93                    let file = fs::File::from_into_fd(file);
94
95                    #[cfg(racy_asserts)]
96                    check_open(start, path, options, &file);
97
98                    return Ok(file);
99                }
100                Err(err) => match err {
101                    // A rename or similar happened. Try again.
102                    rustix::io::Errno::AGAIN => continue,
103
104                    // `EPERM` is used by some `seccomp` sandboxes to indicate
105                    // that `openat2` is unimplemented:
106                    // <https://github.com/systemd/systemd/blob/e2357b1c8a87b610066b8b2a59517bcfb20b832e/src/shared/seccomp-util.c#L2066>
107                    //
108                    // However, `EPERM` may also indicate a failed `O_NOATIME`
109                    // or a file seal prevented the operation, and it's complex
110                    // to detect those cases, so exit the loop and use the
111                    // fallback.
112                    rustix::io::Errno::PERM => break,
113
114                    // `ENOSYS` means `openat2` is permanently unavailable;
115                    // mark it so and exit the loop.
116                    rustix::io::Errno::NOSYS => {
117                        INVALID.store(true, Relaxed);
118                        break;
119                    }
120
121                    _ => return Err(err),
122                },
123            }
124        }
125
126        Err(rustix::io::Errno::NOSYS)
127    })
128    .map_err(|err| match err {
129        rustix::io::Errno::XDEV => errors::escape_attempt(),
130        err => err.into(),
131    })
132}
133
/// Debug cross-check, compiled only under `cfg(racy_asserts)`.
///
/// Re-opens `path` through `manually::open` and asserts that the result is
/// the same file as `file`, as judged by `is_same_file`. The re-open uses a
/// copy of `options` with `create`, `create_new` and `truncate` switched off.
///
/// # Panics
///
/// Panics if the manual open fails, if `is_same_file` returns an error, or
/// if the two handles are not the same file.
#[cfg(racy_asserts)]
fn check_open(start: &fs::File, path: &Path, options: &OpenOptions, file: &fs::File) {
    let mut reopen_options = options.clone();
    reopen_options
        .create(false)
        .create_new(false)
        .truncate(false);

    let reopened = manually::open(start, path, &reopen_options)
        .expect("manually::open failed when open_openat2 succeeded");

    let same_inode = is_same_file(file, &reopened).unwrap();
    assert!(
        same_inode,
        "manually::open should open the same inode as open_openat2"
    );
}