wasmtime/runtime/vm/sys/unix/pagemap.rs

//! Module for Linux pagemap-based tracking of dirty pages.
//!
//! For other platforms, a no-op implementation is provided.

#[cfg(feature = "pooling-allocator")]
use crate::prelude::*;

use self::ioctl::{Categories, PageMapScanBuilder};
use crate::runtime::vm::{HostAlignedByteCount, host_page_size};
use rustix::ioctl::ioctl;
use std::fs::File;
use std::mem::MaybeUninit;
use std::ptr;

/// A static file-per-process which represents this process's page map file.
///
/// Note that this is required to be updated on a fork because otherwise it
/// would refer to the parent process's page map instead of the child
/// process's page map. Thus when first initializing this file the
/// `pthread_atfork` function is used to hook the child process to update this.
///
/// Also note that updating this is not done via mutation but rather it's done
/// with `dup2` to replace the file descriptor that `File` points to in-place.
/// The local copy of `File` is then closed in the atfork handler.
#[cfg(feature = "pooling-allocator")]
static PROCESS_PAGEMAP: std::sync::LazyLock<Option<File>> = std::sync::LazyLock::new(|| {
    use rustix::fd::AsRawFd;

    let pagemap = File::open("/proc/self/pagemap").ok()?;

    // SAFETY: all libc functions are unsafe by default, and we're basically
    // going to do our damndest to make sure this invocation of `pthread_atfork`
    // is safe, namely the handler registered here is intentionally quite
    // minimal and only accesses the `PROCESS_PAGEMAP`.
    let rc = unsafe { libc::pthread_atfork(None, None, Some(after_fork_in_child)) };
    if rc != 0 {
        return None;
    }

    return Some(pagemap);

    /// Hook executed as part of `pthread_atfork` in the child process after a
    /// fork.
    ///
    /// # Safety
    ///
    /// This function is not safe to call in general and additionally has its
    /// own stringent safety requirements. This is after a fork but before exec
    /// so all the safety requirements of `Command::pre_exec` in the standard
    /// library apply here. Effectively the standard library primitives are
    /// avoided here as they aren't necessarily safe to execute in this context.
    unsafe extern "C" fn after_fork_in_child() {
        let Some(parent_pagemap) = PROCESS_PAGEMAP.as_ref() else {
            // This should not be reachable, but to avoid panic infrastructure
            // here this is just skipped instead.
            return;
        };

        // SAFETY: see function documentation.
        //
        // Here `/proc/self/pagemap` is opened in the child. If that fails for
        // whatever reason then the pagemap is replaced with `/dev/null` which
        // means that all future ioctls for `PAGEMAP_SCAN` will fail. If that
        // fails then that's left to abort the process for now. If that's
        // problematic we may want to consider opening a local pipe and then
        // installing that here? Unsure.
        //
        // Once a fd is opened the `dup2` syscall is used to replace the
        // previous file descriptor stored in `parent_pagemap`. That'll update
        // the pagemap in-place in this child for all future use in case this is
        // further used in the child.
        //
        // And finally once that's all done the `child_pagemap` is itself
        // closed since we have no more need for it.
        unsafe {
            let flags = libc::O_CLOEXEC | libc::O_RDONLY;
            let mut child_pagemap = libc::open(c"/proc/self/pagemap".as_ptr(), flags);
            if child_pagemap == -1 {
                child_pagemap = libc::open(c"/dev/null".as_ptr(), flags);
            }
            if child_pagemap == -1 {
                libc::abort();
            }

            let rc = libc::dup2(child_pagemap, parent_pagemap.as_raw_fd());
            if rc == -1 {
                libc::abort();
            }
            let rc = libc::close(child_pagemap);
            if rc == -1 {
                libc::abort();
            }
        }
    }
});

#[derive(Debug)]
pub struct PageMap(&'static File);

impl PageMap {
    #[cfg(feature = "pooling-allocator")]
    pub fn new() -> Option<PageMap> {
        let file = PROCESS_PAGEMAP.as_ref()?;

        // Check whether the `PAGEMAP_SCAN` ioctl is supported.
        let mut regions = vec![MaybeUninit::uninit(); 1];
        let pm_scan = PageMapScanBuilder::new(ptr::slice_from_raw_parts(ptr::null_mut(), 0))
            .max_pages(1)
            .return_mask(Categories::empty())
            .category_mask(Categories::all())
            .build(&mut regions);

        // SAFETY: we did our best in the `ioctl` module below to model this
        // ioctl safely, and it's safe to issue the ioctl on
        // `/proc/self/pagemap`.
        unsafe {
            ioctl(&file, pm_scan).ok()?;
        }
        Some(PageMap(file))
    }
}

/// Resets `ptr` for `len` bytes.
///
/// This function is the counterpart of the same-named function in the
/// `pagemap_disabled` module, except it uses the `PAGEMAP_SCAN` [ioctl] on
/// Linux to be more clever about calling the `reset_manually` closure.
/// Semantically this still has the same meaning: all of `ptr` for `len` bytes
/// will be reset, either through `reset_manually` or `decommit`. The
/// optimization here is that `reset_manually` will only be called on regions
/// as necessary and `decommit` can be skipped entirely in some situations.
///
/// The `PAGEMAP_SCAN` [ioctl] scans a region of memory and reports back
/// "regions of interest" as configured by the scan. It also does things with
/// uffd and write-protected pages, but that's not leveraged here. Specifically
/// this function will perform a scan of `ptr` for `len` bytes which will search
/// for pages that:
///
/// * Are present.
/// * Have been written.
/// * Are NOT backed by the "zero" page.
/// * Are NOT backed by a "file" page.
///
/// By default WebAssembly memories/tables are all accessible virtual memory,
/// but paging optimizations on Linux mean they don't actually have a backing
/// page. For example when an instance starts for the first time its entire
/// linear memory will be mapped as anonymous memory where page table entries
/// don't even exist for the new memory. Most modules will then have an initial
/// image mapped in, but that still won't have any page table entries. When
/// memory is accessed for the first time a page fault will be generated and
/// handled by the kernel.
///
/// If memory is read then the page fault will force a PTE to be allocated,
/// pointing either at the zero page (ZFOD behavior) or at a file-backed page
/// if the memory is in the initial image mapping. For ZFOD the kernel uses a
/// single system-wide page of zeros, and for files it uses the kernel's page
/// cache to share the same page across many mappings (as it's all read-only
/// anyway). Note that in this situation the PTE allocated will have the write
/// permission disabled, meaning that a write will later generate a page
/// fault.
///
/// If memory is written then that will allocate a fresh page from the kernel.
/// If the PTE was not previously present then the fresh page is initialized
/// either with zeros or a copy of the contents of the file-backed mapping. If
/// the PTE was previously present then its previous contents are copied into
/// the new page. In all of these cases the PTE ends up pointing at a private
/// page owned by just this process, reflected nowhere else on the system.
///
/// Putting this all together explains the search criteria for `PAGEMAP_SCAN`,
/// notably:
///
/// * `Categories::PRESENT` - we're only interested in present pages, anything
///   unmapped wasn't touched by the guest so no need for the host to touch it
///   either.
///
/// * `Categories::WRITTEN` - if a page was only read by the guest there's no
///   need to look at it, as its contents haven't changed from the initial
///   image.
///
/// * `!Categories::PFNZERO` - if a page is mapped to the zero page then it's
///   guaranteed to be readonly and it means that wasm read the memory but
///   didn't write to it, additionally meaning it doesn't need to be reset.
///
/// * `!Categories::FILE` - similar to `!PFNZERO`, if a page is mapped to a
///   file then for us that means it's readonly, meaning wasm only read the
///   memory and didn't write to it, so the page can be skipped.
///
/// The `PAGEMAP_SCAN` ioctl will report back a set of contiguous regions of
/// memory which match the scan flags we're looking for. Each of these regions
/// is then passed to `reset_manually` as-is. The ioctl will additionally
/// report a "walk_end" address which is the last address it considered before
/// the scan was halted. A scan can stop for three reasons:
///
/// * The end of the region of memory being scanned was reached. In this case
///   the entire region was scanned meaning that all dirty memory was reported
///   through `reset_manually`. This means that `decommit` can be skipped
///   entirely (or invoked with a 0 length here which will also end up with it
///   being skipped).
///
/// * The scan's `max_pages` setting was reached. The `keep_resident` argument
///   indicates the maximal amount of memory to pass to `reset_manually` and
///   this translates to the `max_pages` configuration option of the ioctl. The
///   total size of all regions reported by the ioctl is guaranteed to be at
///   most `max_pages` pages. This means that if a scan reaches the
///   `keep_resident` limit before reaching the end then the ioctl will bail
///   out early. That means that the wasm module's working set of memory was
///   larger than `keep_resident`, and the rest of it will be `decommit`'d
///   away.
///
/// * The scan's returned set of regions exceeds the capacity passed into the
///   ioctl. The ioctl's `pm_scan_arg` takes a `vec` and `vec_len` which
///   describe a region of memory in which to store a list of `page_region`
///   structures. Below this is always `MAX_REGIONS`. If there are more than
///   this number of disjoint regions of memory that need to be reported then
///   the ioctl will also return early without reaching the end of memory. Note
///   that this means that all further memory will be `decommit`'d, with the
///   reported regions still going to `reset_manually`. This is arguably
///   something we should detect and improve in Wasmtime, but for now
///   `MAX_REGIONS` is hardcoded.
///
/// In the end this is a "more clever" version of the function in the
/// `pagemap_disabled` module. By using `PAGEMAP_SCAN` we can search for the
/// first `keep_resident` bytes of dirty memory written by a wasm guest instead
/// of assuming the first `keep_resident` bytes of the region were modified by
/// the guest. This crucially enables the `decommit` operation to be a noop
/// when the wasm guest's working set of memory is less than `keep_resident`,
/// which means that `memcpy` is sufficient to reset a linear memory or table.
/// This directly translates to higher throughput as it avoids IPIs and
/// synchronization when updating page tables, and additionally avoids page
/// faults on future executions of the same module.
///
/// # Safety
///
/// Requires that `ptr` is valid to read and write for `len` bytes.
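///
/// # Example
///
/// A minimal sketch of how a caller might drive this function. The closures
/// and the `decommit_pages` helper are hypothetical stand-ins for the real
/// pooling-allocator callbacks, and `base`, `len`, and `keep_resident` are
/// assumed to describe a page-aligned mapping:
///
/// ```ignore
/// let pagemap = PageMap::new();
/// unsafe {
///     reset_with_pagemap(
///         pagemap.as_ref(),
///         base,
///         len,
///         keep_resident,
///         // Dirty regions within the `keep_resident` budget are reset in
///         // place, e.g. zeroed or re-initialized from the original image.
///         |dirty| dirty.fill(0),
///         // Everything past the scan's `walk_end` is instead returned to
///         // the kernel, e.g. via madvise(MADV_DONTNEED).
///         |ptr, len| decommit_pages(ptr, len),
///     );
/// }
/// ```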
///
/// [ioctl]: https://www.man7.org/linux/man-pages/man2/PAGEMAP_SCAN.2const.html
pub unsafe fn reset_with_pagemap(
    mut pagemap: Option<&PageMap>,
    ptr: *mut u8,
    len: HostAlignedByteCount,
    mut keep_resident: HostAlignedByteCount,
    mut reset_manually: impl FnMut(&mut [u8]),
    mut decommit: impl FnMut(*mut u8, usize),
) {
    keep_resident = keep_resident.min(len);
    let host_page_size = host_page_size();

    if pagemap.is_some() {
        // Nothing to keep resident? Fall back to the default behavior.
        if keep_resident.byte_count() == 0 {
            pagemap = None;
        }

        // Keeping at most one page of memory resident when the original
        // mapping itself is also at most one page? Also fall back to the
        // default behavior as this'll just be a simple memcpy.
        if keep_resident.byte_count() <= host_page_size && len.byte_count() <= host_page_size {
            pagemap = None;
        }
    }

    let pagemap = match pagemap {
        Some(pagemap) => pagemap,

        // Fall back to the default behavior.
        //
        // SAFETY: the safety requirement of
        // `pagemap_disabled::reset_with_pagemap` is the same as this
        // function's.
        _ => unsafe {
            return crate::runtime::vm::pagemap_disabled::reset_with_pagemap(
                None,
                ptr,
                len,
                keep_resident,
                reset_manually,
                decommit,
            );
        },
    };

    // For now use a fixed set of regions on the stack, but in the future this
    // may want to use a dynamically allocated vector to allow for more
    // regions, for example.
    const MAX_REGIONS: usize = 32;
    let mut storage = [MaybeUninit::uninit(); MAX_REGIONS];

    let scan_arg = PageMapScanBuilder::new(ptr::slice_from_raw_parts(ptr, len.byte_count()))
        .max_pages(keep_resident.byte_count() / host_page_size)
        // We specifically want pages that are NOT backed by the zero page or
        // backed by files. Such pages haven't changed from their original
        // contents, so those categories are inverted.
        .category_inverted(Categories::PFNZERO | Categories::FILE)
        // Search for pages that are written and present as those are the dirty
        // pages. Additionally search for the zero page/file page as those are
        // inverted above, meaning we're searching for pages that specifically
        // don't have those flags.
        .category_mask(
            Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO | Categories::FILE,
        )
        // Don't return any categories back. This helps group regions together
        // since the reported set of categories is always empty and we otherwise
        // aren't looking for anything in particular.
        .return_mask(Categories::empty())
        .build(&mut storage);

    // SAFETY: this should be a safe ioctl as we control the fd we're operating
    // on plus all of `scan_arg`, but this relies on the `Ioctl` implementation
    // below being correct.
    let result = match unsafe { ioctl(&pagemap.0, scan_arg) } {
        Ok(result) => result,

        // If the ioctl fails for whatever reason, we at least tried, so fall
        // back to the default behavior.
        //
        // SAFETY: the safety requirement of
        // `pagemap_disabled::reset_with_pagemap` is the same as this
        // function's.
        Err(err) => unsafe {
            log::warn!("failed pagemap scan {err}");
            return crate::runtime::vm::pagemap_disabled::reset_with_pagemap(
                None,
                ptr,
                len,
                keep_resident,
                reset_manually,
                decommit,
            );
        },
    };

    // Manually reset all regions the scan reported as written, then decommit
    // everything else afterwards.
    for region in result.regions() {
        // SAFETY: we're relying on Linux to pass back valid region ranges
        // within the `ptr`/`len` we specified in the original syscall.
        unsafe {
            reset_manually(&mut *region.region().cast_mut());
        }
    }

    // Report everything after `walk_end` to the end of memory as memory that
    // must be decommitted, as the scan didn't reach it. Note that if `walk_end`
    // is already at the end of memory then the byte size of the decommitted
    // memory here will be 0, meaning that this is a noop.
    let scan_size = result.walk_end().addr() - ptr.addr();
    decommit(result.walk_end().cast_mut(), len.byte_count() - scan_size);
}

mod ioctl {
    use rustix::ioctl::*;
    use std::ffi::c_void;
    use std::fmt;
    use std::marker;
    use std::mem::MaybeUninit;
    use std::ptr;

    bitflags::bitflags! {
        /// Categories that can be filtered with [`PageMapScan`].
        #[derive(Copy, Clone, PartialEq, Eq)]
        #[repr(transparent)]
        pub struct Categories: u64 {
            /// The page has asynchronous write-protection enabled.
            const WPALLOWED = 1 << 0;
            /// The page has been written to from the time it was write protected.
            const WRITTEN = 1 << 1;
            /// The page is file backed.
            const FILE = 1 << 2;
            /// The page is present in memory.
            const PRESENT = 1 << 3;
            /// The page is swapped.
            const SWAPPED = 1 << 4;
            /// The page has a zero PFN.
            const PFNZERO = 1 << 5;
            /// The page is THP or Hugetlb backed.
            const HUGE = 1 << 6;
            /// The page is soft-dirty, in the sense of the kernel's soft-dirty
            /// page tracking. This isn't used here and is only included for
            /// completeness with the API that `PAGEMAP_SCAN` provides.
            const SOFT_DIRTY = 1 << 7;
        }
    }

    impl fmt::Debug for Categories {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            bitflags::parser::to_writer(self, f)
        }
    }

    impl fmt::Display for Categories {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            bitflags::parser::to_writer(self, f)
        }
    }

    /// Builder-style structure for building up a [`PageMapScan`] `ioctl` call.
    pub struct PageMapScanBuilder {
        pm_scan_arg: pm_scan_arg,
    }

    impl PageMapScanBuilder {
        /// Creates a new page map scan that will scan the provided range of memory.
        pub fn new(region: *const [u8]) -> PageMapScanBuilder {
            PageMapScanBuilder {
                pm_scan_arg: pm_scan_arg {
                    size: size_of::<pm_scan_arg>() as u64,
                    flags: 0,
                    start: region.cast::<u8>().addr() as u64,
                    end: region.cast::<u8>().addr().wrapping_add(region.len()) as u64,
                    walk_end: 0,
                    vec: 0,
                    vec_len: 0,
                    max_pages: 0,
                    category_inverted: Categories::empty(),
                    category_anyof_mask: Categories::empty(),
                    category_mask: Categories::empty(),
                    return_mask: Categories::empty(),
                },
            }
        }

        /// Configures the maximum number of returned pages in the output regions.
        ///
        /// Setting this to 0 disables this maximum.
        pub fn max_pages(&mut self, max: usize) -> &mut PageMapScanBuilder {
            self.pm_scan_arg.max_pages = max.try_into().unwrap();
            self
        }

        /// Configures categories whose bits must be 0, rather than 1, in
        /// order to match.
        ///
        /// Note that this is a mask which is xor'd with the page's true
        /// categories before testing against `category_mask`. That means that
        /// if a bit needs to be zero then it additionally must be specified in
        /// one of `category_mask` or `category_anyof_mask`.
        ///
        /// For more detail see the `pagemap_scan_is_interesting_page` function
        /// in the Linux kernel source.
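        ///
        /// For example, to select pages that are written but *not* backed by
        /// the zero page (the filter shape `reset_with_pagemap` above uses),
        /// the zero-page bit goes in both the inverted set and the mask;
        /// `builder` here stands for any `PageMapScanBuilder`:
        ///
        /// ```ignore
        /// builder
        ///     .category_inverted(Categories::PFNZERO)
        ///     .category_mask(Categories::WRITTEN | Categories::PFNZERO);
        /// ```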
        pub fn category_inverted(&mut self, flags: Categories) -> &mut PageMapScanBuilder {
            self.pm_scan_arg.category_inverted = flags;
            self
        }

        /// Only consider pages for which all `flags` match.
        ///
        /// This mask is applied after `category_inverted` is used to flip bits
        /// in a page's categories. Only pages which match all bits in `flags`
        /// will be considered.
        ///
        /// For more detail see the `pagemap_scan_is_interesting_page` function
        /// in the Linux kernel source.
        pub fn category_mask(&mut self, flags: Categories) -> &mut PageMapScanBuilder {
            self.pm_scan_arg.category_mask = flags;
            self
        }

        /// Only consider pages for which any bit of `flags` matches.
        ///
        /// After `category_inverted` and `category_mask` have been applied, if
        /// this option is set to a non-empty value then at least one of
        /// `flags` must be in a page's flags for the page to be considered.
        /// That means that flags specified in `category_inverted` will already
        /// be inverted for consideration here. The page categories are and'd
        /// with `flags` and some bit must be set for the page to be
        /// considered.
        ///
        /// For more detail see the `pagemap_scan_is_interesting_page` function
        /// in the Linux kernel source.
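        ///
        /// For example, a hypothetical filter (this option is currently unused
        /// in this module) matching pages that are either written or swapped,
        /// where `builder` stands for any `PageMapScanBuilder`:
        ///
        /// ```ignore
        /// builder.category_anyof_mask(Categories::WRITTEN | Categories::SWAPPED);
        /// ```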
        #[expect(dead_code, reason = "bindings for the future if we need them")]
        pub fn category_anyof_mask(&mut self, flags: Categories) -> &mut PageMapScanBuilder {
            self.pm_scan_arg.category_anyof_mask = flags;
            self
        }

        /// Categories that are to be reported in the returned regions.
        pub fn return_mask(&mut self, flags: Categories) -> &mut PageMapScanBuilder {
            self.pm_scan_arg.return_mask = flags;
            self
        }

        /// Finishes this configuration and flags that the scan results will be
        /// placed within `dst`. The returned object can be used to perform the
        /// pagemap scan ioctl.
        pub fn build<'a>(&self, dst: &'a mut [MaybeUninit<PageRegion>]) -> PageMapScan<'a> {
            let mut ret = PageMapScan {
                pm_scan_arg: self.pm_scan_arg,
                _marker: marker::PhantomData,
            };
            ret.pm_scan_arg.vec = dst.as_ptr() as u64;
            ret.pm_scan_arg.vec_len = dst.len() as u64;
            return ret;
        }
    }

    /// Return result of [`PageMapScanBuilder::build`], used to perform an `ioctl`.
    #[repr(transparent)]
    pub struct PageMapScan<'a> {
        pm_scan_arg: pm_scan_arg,
        _marker: marker::PhantomData<&'a mut [MaybeUninit<PageRegion>]>,
    }

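    // A field-for-field mirror of the kernel's `struct pm_scan_arg` from
    // `include/uapi/linux/fs.h`: the layout must match the kernel's exactly
    // since a pointer to this is what's handed to the `PAGEMAP_SCAN` ioctl.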
    #[derive(Copy, Clone)]
    #[repr(C)]
    struct pm_scan_arg {
        size: u64,
        flags: u64,
        start: u64,
        end: u64,
        walk_end: u64,
        vec: u64,
        vec_len: u64,
        max_pages: u64,
        category_inverted: Categories,
        category_mask: Categories,
        category_anyof_mask: Categories,
        return_mask: Categories,
    }

    /// Return result of a [`PageMapScan`] `ioctl`.
    ///
    /// This reports where the kernel stopped walking with
    /// [`PageMapScanResult::walk_end`] and the description of regions found in
    /// [`PageMapScanResult::regions`].
    #[derive(Debug)]
    pub struct PageMapScanResult<'a> {
        walk_end: *const u8,
        regions: &'a mut [PageRegion],
    }

    impl PageMapScanResult<'_> {
        /// Where the kernel stopped walking pages, which may be earlier than
        /// the end of the requested region.
        pub fn walk_end(&self) -> *const u8 {
            self.walk_end
        }

        /// Regions the kernel reported back with categories and such.
        pub fn regions(&self) -> &[PageRegion] {
            self.regions
        }
    }

    /// Return value of [`PageMapScan`]: a description of regions in the
    /// original scan with the categories queried.
    #[repr(transparent)]
    #[derive(Copy, Clone)]
    pub struct PageRegion(page_region);

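    // Mirrors the kernel's `struct page_region`, also from
    // `include/uapi/linux/fs.h`; this is what the kernel writes into the
    // `vec` array during a scan.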
    #[repr(C)]
    #[derive(Debug, Copy, Clone)]
    struct page_region {
        start: u64,
        end: u64,
        categories: Categories,
    }

    impl PageRegion {
        /// Returns the region of memory this represents as a `*const [u8]`.
        #[inline]
        pub fn region(&self) -> *const [u8] {
            ptr::slice_from_raw_parts(self.start(), self.len())
        }

        /// Returns the base pointer into memory this region represents.
        #[inline]
        pub fn start(&self) -> *const u8 {
            self.0.start as *const u8
        }

        /// Returns the byte length that this region represents.
        #[inline]
        pub fn len(&self) -> usize {
            usize::try_from(self.0.end - self.0.start).unwrap()
        }

        /// Returns the category flags associated with this region.
        ///
        /// Note that this will only contain categories specified in
        /// [`PageMapScanBuilder::return_mask`].
        #[inline]
        #[cfg_attr(
            not(test),
            expect(dead_code, reason = "bindings for the future if we need them")
        )]
        pub fn categories(&self) -> Categories {
            self.0.categories
        }
    }

    impl fmt::Debug for PageRegion {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            f.debug_struct("PageRegion")
                .field("start", &self.start())
                .field("len", &self.len())
                .field("categories", &self.0.categories)
                .finish()
        }
    }

    // SAFETY: this implementation should uphold the various requirements that
    // this trait has, such as `IS_MUTATING` being correct and this only being
    // used on the right platform with the right files.
    unsafe impl<'a> Ioctl for PageMapScan<'a> {
        type Output = PageMapScanResult<'a>;

        const IS_MUTATING: bool = true;

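        // `PAGEMAP_SCAN` is defined in the kernel's uapi headers as
        // `_IOWR('f', 16, struct pm_scan_arg)`, which is what
        // `opcode::read_write` encodes here.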
        fn opcode(&self) -> Opcode {
            opcode::read_write::<pm_scan_arg>(b'f', 16)
        }

        fn as_ptr(&mut self) -> *mut c_void {
            (&raw mut self.pm_scan_arg).cast()
        }

        unsafe fn output_from_ptr(
            out: IoctlOutput,
            extract_output: *mut c_void,
        ) -> rustix::io::Result<Self::Output> {
            let extract_output = extract_output.cast::<pm_scan_arg>();
            let len = usize::try_from(out).unwrap();
            // SAFETY: it's a requirement of this method that
            // `extract_output` is safe to read and indeed a `pm_scan_arg`.
            // Additionally the slice returned here originated from a slice
            // provided to `PageMapScanBuilder::build` threaded through the
            // `vec` field and it should be safe to thread that back out through
            // to the result.
            let regions = unsafe {
                assert!((len as u64) <= (*extract_output).vec_len);
                std::slice::from_raw_parts_mut((*extract_output).vec as *mut PageRegion, len)
            };
            Ok(PageMapScanResult {
                regions,
                // SAFETY: it's a requirement of this method that
                // `extract_output` is safe to read and indeed a `pm_scan_arg`.
                walk_end: unsafe { (*extract_output).walk_end as *const u8 },
            })
        }
    }
}

#[cfg(test)]
mod tests {
    use super::ioctl::*;
    use crate::prelude::*;
    use rustix::ioctl::*;
    use rustix::mm::*;
    use std::fs::File;
    use std::ptr;

    struct MmapAnonymous {
        ptr: *mut std::ffi::c_void,
        len: usize,
    }

    impl MmapAnonymous {
        fn new(pages: usize) -> MmapAnonymous {
            let len = pages * rustix::param::page_size();
            let ptr = unsafe {
                mmap_anonymous(
                    ptr::null_mut(),
                    len,
                    ProtFlags::READ | ProtFlags::WRITE,
                    MapFlags::PRIVATE,
                )
                .unwrap()
            };
            MmapAnonymous { ptr, len }
        }

        fn read(&self, page: usize) {
            unsafe {
                let offset = page * rustix::param::page_size();
                assert!(offset < self.len);
                std::ptr::read_volatile(self.ptr.cast::<u8>().add(offset));
            }
        }

        fn write(&self, page: usize) {
            unsafe {
                let offset = page * rustix::param::page_size();
                assert!(offset < self.len);
                std::ptr::write_volatile(self.ptr.cast::<u8>().add(offset), 1);
            }
        }

        fn region(&self) -> *const [u8] {
            ptr::slice_from_raw_parts(self.ptr.cast(), self.len)
        }

        fn page_region(&self, pages: std::ops::Range<usize>) -> *const [u8] {
            ptr::slice_from_raw_parts(
                self.ptr
                    .cast::<u8>()
                    .wrapping_add(pages.start * rustix::param::page_size()),
                (pages.end - pages.start) * rustix::param::page_size(),
            )
        }

        fn end(&self) -> *const u8 {
            self.ptr.cast::<u8>().wrapping_add(self.len)
        }

        fn page_end(&self, page: usize) -> *const u8 {
            self.ptr
                .cast::<u8>()
                .wrapping_add((page + 1) * rustix::param::page_size())
        }
    }

    impl Drop for MmapAnonymous {
        fn drop(&mut self) {
            unsafe {
                munmap(self.ptr, self.len).unwrap();
            }
        }
    }

    fn ioctl_supported() -> bool {
        let mmap = MmapAnonymous::new(1);
        let mut results = Vec::with_capacity(1);
        let fd = File::open("/proc/self/pagemap").unwrap();
        unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .category_mask(Categories::WRITTEN)
                    .return_mask(Categories::all())
                    .build(results.spare_capacity_mut()),
            )
            .is_ok()
        }
    }

    #[test]
    fn no_pages_returned() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .category_mask(Categories::WRITTEN)
                    .return_mask(Categories::all())
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert!(result.regions().is_empty());
        assert_eq!(result.walk_end(), mmap.end());
    }

    #[test]
    fn empty_region() {
        if !ioctl_supported() {
            return;
        }
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        let empty_region = ptr::slice_from_raw_parts(rustix::param::page_size() as *const u8, 0);
        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(empty_region)
                    .return_mask(Categories::all())
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert!(result.regions().is_empty());
    }

    #[test]
    fn basic_page_flags() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        mmap.read(0);
        mmap.write(1);
        mmap.write(2);
        mmap.read(3);

        mmap.read(5);
        mmap.read(6);

        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .category_mask(Categories::WRITTEN)
                    .return_mask(Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 4);
        assert_eq!(result.walk_end(), mmap.end());
        assert_eq!(result.regions()[0].region(), mmap.page_region(0..1));
        assert_eq!(
            result.regions()[0].categories(),
            Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO
        );

        assert_eq!(result.regions()[1].region(), mmap.page_region(1..3));
        assert_eq!(
            result.regions()[1].categories(),
            Categories::WRITTEN | Categories::PRESENT
        );

        assert_eq!(result.regions()[2].region(), mmap.page_region(3..4));
        assert_eq!(
            result.regions()[2].categories(),
            Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO
        );

        assert_eq!(result.regions()[3].region(), mmap.page_region(5..7));
        assert_eq!(
            result.regions()[3].categories(),
            Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO
        );
    }

    #[test]
    fn only_written_pages() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        mmap.read(0);
        mmap.write(1);
        mmap.write(2);
        mmap.read(3);

        mmap.read(5);
        mmap.read(6);

        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .category_inverted(Categories::PFNZERO)
                    .category_mask(Categories::WRITTEN | Categories::PFNZERO)
                    .return_mask(Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 1);
        assert_eq!(result.walk_end(), mmap.end());

        assert_eq!(result.regions()[0].region(), mmap.page_region(1..3));
        assert_eq!(
            result.regions()[0].categories(),
            Categories::WRITTEN | Categories::PRESENT
        );
    }

    #[test]
    fn region_limit() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(1);
        let fd = File::open("/proc/self/pagemap").unwrap();

        mmap.read(0);
        mmap.write(1);
        mmap.read(2);
        mmap.write(3);

        // Ask for written|pfnzero, meaning only-read pages. Because the
        // results vector only has capacity for one region, the scan should
        // stop after a single region covering the first page.
        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .return_mask(Categories::WRITTEN | Categories::PFNZERO)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 1);
        assert_eq!(result.walk_end(), mmap.page_end(0));

        assert_eq!(result.regions()[0].region(), mmap.page_region(0..1));
        assert_eq!(
            result.regions()[0].categories(),
            Categories::WRITTEN | Categories::PFNZERO
        );

        // If we ask for written pages though (which seems synonymous with
        // present?) then everything should be in one region.
        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .return_mask(Categories::WRITTEN)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 1);
        assert_eq!(result.walk_end(), mmap.page_end(3));

        assert_eq!(result.regions()[0].region(), mmap.page_region(0..4));
        assert_eq!(result.regions()[0].categories(), Categories::WRITTEN);
    }

    #[test]
    fn page_limit() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        mmap.read(0);
        mmap.read(1);
        mmap.read(2);
        mmap.read(3);

        // Ask for written|pfnzero, meaning only-read pages. With `max_pages`
        // set to 2 this should return a single region covering only the first
        // two pages.
        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .return_mask(Categories::WRITTEN | Categories::PFNZERO)
                    .max_pages(2)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 1);
        assert_eq!(result.walk_end(), mmap.page_end(1));

        assert_eq!(result.regions()[0].region(), mmap.page_region(0..2));
        assert_eq!(
            result.regions()[0].categories(),
            Categories::WRITTEN | Categories::PFNZERO
        );
    }

    #[test]
    fn page_limit_with_hole() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        mmap.read(0);
        mmap.read(2);
        mmap.read(3);

        // Ask for written|pfnzero, meaning only-read pages. With a hole at
        // page 1 and `max_pages` set to 2, this should return two single-page
        // regions.
        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .category_mask(Categories::WRITTEN)
                    .return_mask(Categories::WRITTEN | Categories::PFNZERO)
                    .max_pages(2)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 2);
        assert_eq!(result.walk_end(), mmap.page_end(2));

        assert_eq!(result.regions()[0].region(), mmap.page_region(0..1));
        assert_eq!(
            result.regions()[0].categories(),
            Categories::WRITTEN | Categories::PFNZERO
        );
        assert_eq!(result.regions()[1].region(), mmap.page_region(2..3));
        assert_eq!(
            result.regions()[1].categories(),
            Categories::WRITTEN | Categories::PFNZERO
        );
    }
}