cap_primitives/rustix/linux/fs/
open_impl.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
//! Linux 5.6 and later have a syscall `openat2`, with flags that allow it to
//! enforce the sandboxing property we want. See the [LWN article] for an
//! overview and the [`openat2` documentation] for details.
//!
//! [LWN article]: https://lwn.net/Articles/796868/
//! [`openat2` documentation]: https://man7.org/linux/man-pages/man2/openat2.2.html
//!
//! On older Linux, fall back to `manually::open`.

#[cfg(racy_asserts)]
use crate::fs::is_same_file;
use crate::fs::{manually, OpenOptions};
use std::path::Path;
use std::{fs, io};
#[cfg(target_os = "linux")]
use {
    super::super::super::fs::compute_oflags,
    crate::fs::errors,
    io_lifetimes::FromFd,
    rustix::fs::{openat2, Mode, OFlags, RawMode, ResolveFlags},
    rustix::path::Arg,
    std::sync::atomic::AtomicBool,
    std::sync::atomic::Ordering::Relaxed,
};

/// Open `path` relative to `start`, preferring the `openat2` fast path and
/// falling back to `manually::open` when that path reports `ENOSYS`.
///
/// Any success, and any error other than `ENOSYS`, coming from
/// `open_beneath` is returned to the caller unchanged.
pub(crate) fn open_impl(
    start: &fs::File,
    path: &Path,
    options: &OpenOptions,
) -> io::Result<fs::File> {
    // Only regular Linux tries `openat2`. On Android, the [seccomp policy]
    // prevents us from even probing for `openat2` support, so this block is
    // compiled out there and we go straight to the manual implementation.
    //
    // [seccomp policy]: https://android-developers.googleblog.com/2017/07/seccomp-filter-in-android-o.html
    #[cfg(target_os = "linux")]
    {
        let enosys = rustix::io::Errno::NOSYS.raw_os_error();

        match open_beneath(start, path, options) {
            // The kernel did the sandboxed lookup for us.
            Ok(file) => return Ok(file),

            // A genuine failure; the fallback would not help, so report it.
            Err(err) if err.raw_os_error() != Some(enosys) => return Err(err),

            // `ENOSYS`: `openat2` can't be used right now. Fall through to
            // the manual path below.
            Err(_) => {}
        }
    }

    manually::open(start, path, options)
}

/// Attempt to open `path` beneath `start` using the `openat2` system call
/// with `RESOLVE_BENEATH` and `RESOLVE_NO_MAGICLINKS`.
///
/// Returns `ENOSYS` whenever `openat2` could not be used for this call, so
/// that the caller can switch to a fallback. That happens when:
///
///  - a previous call observed `ENOSYS` from the kernel (this is remembered
///    in a process-wide flag and short-circuits all later calls),
///  - this call observed `ENOSYS` or `EPERM`, or
///  - every retry attempt failed with `EAGAIN`.
///
/// An `EXDEV` error is reported as `errors::escape_attempt()`. All other
/// errors are converted to `io::Error` and returned as-is.
#[cfg(target_os = "linux")]
pub(crate) fn open_beneath(
    start: &fs::File,
    path: &Path,
    options: &OpenOptions,
) -> io::Result<fs::File> {
    // Set once the kernel has told us `openat2` does not exist; never
    // cleared afterwards.
    static INVALID: AtomicBool = AtomicBool::new(false);

    // How many times to issue `openat2` before giving up on `EAGAIN`. The
    // actual number is currently an arbitrarily chosen guess.
    const MAX_ATTEMPTS: usize = 4;

    if INVALID.load(Relaxed) {
        // `openat2` is permanently unavailable.
        return Err(rustix::io::Errno::NOSYS.into());
    }

    let oflags = compute_oflags(options)?;

    // `TMPFILE` may be represented with multiple flag bits and all of them
    // must be set, hence a separate `contains` check for each of the two
    // flags rather than a single intersection test.
    let wants_mode = oflags.contains(OFlags::CREATE) || oflags.contains(OFlags::TMPFILE);
    let mode = if wants_mode {
        Mode::from_bits((options.ext.mode & 0o7777) as RawMode).unwrap()
    } else {
        Mode::empty()
    };

    let resolve = ResolveFlags::BENEATH | ResolveFlags::NO_MAGICLINKS;

    // `openat2` needs a `&CStr` internally; convert the path once up front
    // so that the retry loop below doesn't allocate on every iteration.
    let outcome = path.into_with_c_str(|path_c_str| {
        // `openat2` fails with `EAGAIN` if a rename happens anywhere on the
        // host while it's running, so retry a few times. The count is
        // bounded because there's no limit on how often this can happen.
        for _attempt in 0..MAX_ATTEMPTS {
            let err = match openat2(start, path_c_str, oflags, mode, resolve) {
                Ok(opened) => {
                    let file = fs::File::from_into_fd(opened);

                    #[cfg(racy_asserts)]
                    check_open(start, path, options, &file);

                    return Ok(file);
                }
                Err(err) => err,
            };

            match err {
                // A rename or similar happened. Go around again.
                rustix::io::Errno::AGAIN => {}

                // `EPERM` is used by some `seccomp` sandboxes to indicate
                // that `openat2` is unimplemented:
                // <https://github.com/systemd/systemd/blob/e2357b1c8a87b610066b8b2a59517bcfb20b832e/src/shared/seccomp-util.c#L2066>
                //
                // However, `EPERM` may also indicate a failed `O_NOATIME`
                // or a file seal prevented the operation, and it's complex
                // to tell those cases apart, so stop retrying and let the
                // caller use the fallback.
                rustix::io::Errno::PERM => break,

                // `ENOSYS` means `openat2` is permanently unavailable;
                // remember that for future calls and stop retrying.
                rustix::io::Errno::NOSYS => {
                    INVALID.store(true, Relaxed);
                    break;
                }

                other => return Err(other),
            }
        }

        // Either we ran out of attempts or we broke out of the loop above;
        // both are reported as "`openat2` not usable for this call".
        Err(rustix::io::Errno::NOSYS)
    });

    outcome.map_err(|err| {
        if err == rustix::io::Errno::XDEV {
            errors::escape_attempt()
        } else {
            err.into()
        }
    })
}

/// Debug cross-check, compiled only under `cfg(racy_asserts)`: re-open
/// `path` through `manually::open` and assert that it refers to the same
/// file as `file`.
///
/// The re-open uses a copy of `options` with `create`, `create_new` and
/// `truncate` all switched off.
///
/// # Panics
///
/// Panics if the manual open fails, if the same-file comparison fails, or
/// if the two handles do not refer to the same file.
#[cfg(racy_asserts)]
fn check_open(start: &fs::File, path: &Path, options: &OpenOptions, file: &fs::File) {
    let mut reopen_options = options.clone();
    reopen_options
        .create(false)
        .create_new(false)
        .truncate(false);

    let reopened = manually::open(start, path, &reopen_options)
        .expect("manually::open failed when open_openat2 succeeded");

    let same_inode = is_same_file(file, &reopened).unwrap();
    assert!(
        same_inode,
        "manually::open should open the same inode as open_openat2"
    );
}