wasmtime/runtime/vm/sys/unix/pagemap.rs
//! Module for Linux pagemap-based tracking of dirty pages.
//!
//! For other platforms, a no-op implementation is provided.

#[cfg(feature = "pooling-allocator")]
use crate::prelude::*;

use self::ioctl::{Categories, PageMapScanBuilder};
use crate::runtime::vm::{HostAlignedByteCount, host_page_size};
use rustix::ioctl::ioctl;
use std::fs::File;
use std::mem::MaybeUninit;
use std::ptr;

/// A static file-per-process which represents this process's page map file.
///
/// Note that this is required to be updated on a fork because otherwise this'll
/// refer to the parent process's page map instead of the child process's page
/// map. Thus when first initializing this file the `pthread_atfork` function is
/// used to hook the child process to update this.
///
/// Also note that updating this is not done via mutation but rather it's done
/// with `dup2` to replace the file descriptor that `File` points to in-place.
/// The local copy of `File` is then closed in the atfork handler.
#[cfg(feature = "pooling-allocator")]
static PROCESS_PAGEMAP: std::sync::LazyLock<Option<File>> = std::sync::LazyLock::new(|| {
    use rustix::fd::AsRawFd;

    let pagemap = File::open("/proc/self/pagemap").ok()?;

    // SAFETY: all libc functions are unsafe by default, and we're basically
    // going to do our damnedest to make sure this invocation of
    // `pthread_atfork` is safe, namely the handler registered here is
    // intentionally quite minimal and only accesses the `PROCESS_PAGEMAP`.
    let rc = unsafe { libc::pthread_atfork(None, None, Some(after_fork_in_child)) };
    if rc != 0 {
        return None;
    }

    return Some(pagemap);

    /// Hook executed as part of `pthread_atfork` in the child process after a
    /// fork.
    ///
    /// # Safety
    ///
    /// This function is not safe to call in general and additionally has its
    /// own stringent safety requirements. This is after a fork but before exec
    /// so all the safety requirements of `Command::pre_exec` in the standard
    /// library apply here. Effectively the standard library primitives are
    /// avoided here as they aren't necessarily safe to execute in this context.
    unsafe extern "C" fn after_fork_in_child() {
        let Some(parent_pagemap) = PROCESS_PAGEMAP.as_ref() else {
            // This should not be reachable, but to avoid panic infrastructure
            // here this is just skipped instead.
            return;
        };

        // SAFETY: see function documentation.
        //
        // Here `/proc/self/pagemap` is opened in the child. If that fails for
        // whatever reason then the pagemap is replaced with `/dev/null` which
        // means that all future ioctls for `PAGEMAP_SCAN` will fail. If that
        // fails then that's left to abort the process for now. If that's
        // problematic we may want to consider opening a local pipe and then
        // installing that here? Unsure.
        //
        // Once a fd is opened the `dup2` syscall is used to replace the
        // previous file descriptor stored in `parent_pagemap`. That'll update
        // the pagemap in-place in this child for all future use in case this
        // is further used in the child.
        //
        // And finally once that's all done the `child_pagemap` is itself
        // closed since we have no more need for it.
        unsafe {
            let flags = libc::O_CLOEXEC | libc::O_RDONLY;
            let mut child_pagemap = libc::open(c"/proc/self/pagemap".as_ptr(), flags);
            if child_pagemap == -1 {
                child_pagemap = libc::open(c"/dev/null".as_ptr(), flags);
            }
            if child_pagemap == -1 {
                libc::abort();
            }

            let rc = libc::dup2(child_pagemap, parent_pagemap.as_raw_fd());
            if rc == -1 {
                libc::abort();
            }
            let rc = libc::close(child_pagemap);
            if rc == -1 {
                libc::abort();
            }
        }
    }
});

#[derive(Debug)]
pub struct PageMap(&'static File);

impl PageMap {
    #[cfg(feature = "pooling-allocator")]
    pub fn new() -> Option<PageMap> {
        let file = PROCESS_PAGEMAP.as_ref()?;

        // Check if the `pagemap_scan` ioctl is supported.
        let mut regions = vec![MaybeUninit::uninit(); 1];
        let pm_scan = PageMapScanBuilder::new(ptr::slice_from_raw_parts(ptr::null_mut(), 0))
            .max_pages(1)
            .return_mask(Categories::empty())
            .category_mask(Categories::all())
            .build(&mut regions);

        // SAFETY: we did our best in the `ioctl` code below to model this ioctl
        // safely, and it's safe to issue the ioctl on `/proc/self/pagemap`.
        unsafe {
            ioctl(&file, pm_scan).ok()?;
        }
        Some(PageMap(file))
    }
}

/// Resets `ptr` for `len` bytes.
///
/// This function is a counterpart of the function of the same name in the
/// `pagemap_disabled` module except that it uses the `PAGEMAP_SCAN` [ioctl] on
/// Linux to be more clever about calling the `reset_manually` closure.
/// Semantically though this still has the same meaning where all of `ptr` for
/// `len` bytes will be reset, either through `reset_manually` or `decommit`.
/// The optimization here is that `reset_manually` will only be called on
/// regions as-necessary and `decommit` can be skipped entirely in some
/// situations.
///
/// The `PAGEMAP_SCAN` [ioctl] scans a region of memory and reports back
/// "regions of interest" as configured by the scan. It also does things with
/// uffd and write-protected pages, but that's not leveraged here. Specifically
/// this function will perform a scan of `ptr` for `len` bytes which will search
/// for pages that:
///
/// * Are present.
/// * Have been written.
/// * Are NOT backed by the "zero" page.
/// * Are NOT backed by a "file" page.
///
/// By default WebAssembly memories/tables are all accessible virtual memory,
/// but paging optimizations on Linux mean they don't actually have a backing
/// page. For example when an instance starts for the first time its entire
/// linear memory will be mapped as anonymous memory where page-table-entries
/// don't even exist for the new memory. Most modules will then have an initial
/// image mapped in, but that still won't have any page table entries. When
/// memory is accessed for the first time a page fault will be generated and
/// handled by the kernel.
///
/// If memory is read then the page fault will force a PTE to be allocated to
/// either zero-backed pages (e.g. ZFOD behavior) or a file-backed page if the
/// memory is in the initial image mapping. For ZFOD the kernel uses a single
/// page of zeros for the entire system and for files it uses the kernel's page
/// cache to share the same page across many mappings (as it's all read-only
/// anyway). Note that in this situation the PTE allocated will have the write
/// permission disabled meaning that a write will later generate a page fault.
///
/// If memory is written then that will allocate a fresh page from the kernel.
/// If the PTE was not previously present then the fresh page is initialized
/// either with zeros or a copy of the contents of the file-backed mapping. If
/// the PTE was previously present then its previous contents are copied into
/// the new page. In all of these cases the page finally allocated will be
/// private to just this process and will be reflected nowhere else on the
/// system.
///
/// Putting this all together this helps explain the search criteria for
/// `PAGEMAP_SCAN`, notably:
///
/// * `Categories::PRESENT` - we're only interested in present pages, anything
///   unmapped wasn't touched by the guest so no need for the host to touch it
///   either.
///
/// * `Categories::WRITTEN` - if a page was only read by the guest no need to
///   take a look at it as the contents aren't changed from the initial image.
///
/// * `!Categories::PFNZERO` - if a page is mapped to the zero page then it's
///   guaranteed to be readonly and it means that wasm read the memory but
///   didn't write to it, additionally meaning it doesn't need to be reset.
///
/// * `!Categories::FILE` - similar to `!PFNZERO`, if a page is mapped to a file
///   then for us that means it's readonly meaning wasm only read the memory,
///   didn't write to it, so the page can be skipped.
///
/// The `PAGEMAP_SCAN` ioctl will report back a set of contiguous regions of
/// memory which match the scan flags we're looking for. Each of these regions
/// is then passed to `reset_manually` as-is. The ioctl will additionally
/// report a "walk_end" address which is the last address it considered before
/// the scan was halted. A scan can stop for three reasons:
///
/// * The end of the region of memory being scanned was reached. In this case
///   the entire region was scanned meaning that all dirty memory was reported
///   through `reset_manually`. This means that `decommit` can be skipped
///   entirely (or invoked with a 0 length here which will also end up with it
///   being skipped).
///
/// * The scan's `max_pages` setting was reached. The `keep_resident` argument
///   indicates the maximal amount of memory to pass to `reset_manually` and
///   this translates to the `max_pages` configuration option of the ioctl. The
///   sum total of the size of all regions reported from the ioctl is
///   guaranteed to be no more than `max_pages`. This means that if a scan
///   reaches the `keep_resident` limit before reaching the end then the ioctl
///   will bail out early. That means that the wasm module's working set of
///   memory was larger than `keep_resident` and the rest of it will be
///   `decommit`'d away.
///
/// * The scan's returned set of regions exceeds the capacity passed into the
///   ioctl. The `pm_scan_arg` of the ioctl takes a `vec` and `vec_len` which
///   describe a region of memory to store a list of `page_region` structures.
///   Below, this is always `MAX_REGIONS`. If there are more than this number
///   of disjoint regions of memory that need to be reported then the ioctl
///   will also return early without reaching the end of memory. Note that this
///   means that all further memory will be `decommit`'d with reported regions
///   still going to `reset_manually`. This is arguably something we should
///   detect and improve in Wasmtime, but for now `MAX_REGIONS` is hardcoded.
///
/// In the end this is a "more clever" version of this function than the one in
/// the `pagemap_disabled` module. By using `PAGEMAP_SCAN` we can search for
/// the first `keep_resident` bytes of dirty memory written to by a wasm guest
/// instead of assuming the first `keep_resident` bytes of the region were
/// modified by the guest. This crucially enables the `decommit` operation to
/// be a noop if the wasm guest's set of working memory is less than
/// `keep_resident`, which means that `memcpy` is sufficient to reset a linear
/// memory or table. This directly translates to higher throughput as it avoids
/// IPIs and synchronization when updating page tables and additionally avoids
/// page faults on future executions of the same module.
///
/// # Safety
///
/// Requires that `ptr` is valid to read and write for `len` bytes.
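///
/// # Example
///
/// A hypothetical sketch of how a pooling allocator might call this function
/// when recycling a linear memory; the `base_ptr`, `len`, `keep_resident`, and
/// `decommit_pages` names here are illustrative, not real Wasmtime APIs:
///
/// ```ignore
/// unsafe {
///     reset_with_pagemap(
///         pagemap.as_ref(),      // `Some` when `PAGEMAP_SCAN` is available
///         base_ptr,              // start of the mapping to reset
///         len,                   // host-page-aligned byte length
///         keep_resident,         // budget of memory to reset manually
///         |dirty| dirty.fill(0), // e.g. restore dirty regions by hand
///         |ptr, len| decommit_pages(ptr, len), // e.g. madvise(DONTNEED)
///     );
/// }
/// ```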
233///
234/// [ioctl]: https://www.man7.org/linux/man-pages/man2/PAGEMAP_SCAN.2const.html
pub unsafe fn reset_with_pagemap(
    mut pagemap: Option<&PageMap>,
    ptr: *mut u8,
    len: HostAlignedByteCount,
    mut keep_resident: HostAlignedByteCount,
    mut reset_manually: impl FnMut(&mut [u8]),
    mut decommit: impl FnMut(*mut u8, usize),
) {
    keep_resident = keep_resident.min(len);
    let host_page_size = host_page_size();

    if pagemap.is_some() {
        // Nothing to keep resident? fall back to the default behavior.
        if keep_resident.byte_count() == 0 {
            pagemap = None;
        }

        // Keeping no more than one page of memory resident when the original
        // mapping itself is also no more than a page? Also fall back to the
        // default behavior as this'll just be a simple memcpy.
        if keep_resident.byte_count() <= host_page_size && len.byte_count() <= host_page_size {
            pagemap = None;
        }
    }

    let pagemap = match pagemap {
        Some(pagemap) => pagemap,

        // Fall back to the default behavior.
        //
        // SAFETY: the safety requirement of
        // `pagemap_disabled::reset_with_pagemap` is the same as this function.
        _ => unsafe {
            return crate::runtime::vm::pagemap_disabled::reset_with_pagemap(
                None,
                ptr,
                len,
                keep_resident,
                reset_manually,
                decommit,
            );
        },
    };

    // For now use a fixed set of regions on the stack, but in the future this
    // may want to use a dynamically allocated vector for more regions, for
    // example.
    const MAX_REGIONS: usize = 32;
    let mut storage = [MaybeUninit::uninit(); MAX_REGIONS];

    let scan_arg = PageMapScanBuilder::new(ptr::slice_from_raw_parts(ptr, len.byte_count()))
        .max_pages(keep_resident.byte_count() / host_page_size)
        // We specifically want pages that are NOT backed by the zero page or
        // backed by files. Such pages mean that they haven't changed from
        // their original contents, so they're inverted.
        .category_inverted(Categories::PFNZERO | Categories::FILE)
        // Search for pages that are written and present as those are the dirty
        // pages. Additionally search for the zero page/file page as those are
        // inverted above meaning we're searching for pages that specifically
        // don't have those flags.
        .category_mask(
            Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO | Categories::FILE,
        )
        // Don't return any categories back. This helps group regions together
        // since the reported set of categories is always empty and we
        // otherwise aren't looking for anything in particular.
        .return_mask(Categories::empty())
        .build(&mut storage);

    // SAFETY: this should be a safe ioctl as we control the fd we're operating
    // on plus all of `scan_arg`, but this relies on `Ioctl` below being the
    // correct implementation and such.
    let result = match unsafe { ioctl(&pagemap.0, scan_arg) } {
        Ok(result) => result,

        // If the ioctl fails for whatever reason, we at least tried, so fall
        // back to the default behavior.
        //
        // SAFETY: the safety requirement of
        // `pagemap_disabled::reset_with_pagemap` is the same as this function.
        Err(err) => unsafe {
            log::warn!("failed pagemap scan {err}");
            return crate::runtime::vm::pagemap_disabled::reset_with_pagemap(
                None,
                ptr,
                len,
                keep_resident,
                reset_manually,
                decommit,
            );
        },
    };

    // For all regions that were written in the scan reset them manually, then
    // afterwards decommit everything else.
    for region in result.regions() {
        // SAFETY: we're relying on Linux to pass in valid region ranges within
        // the `ptr`/`len` we specified in the original syscall.
        unsafe {
            reset_manually(&mut *region.region().cast_mut());
        }
    }

    // Report everything after `walk_end` to the end of memory as memory that
    // must be decommitted as the scan didn't reach it. Note that if `walk_end`
    // is already at the end of memory then the byte size of the decommitted
    // memory here will be 0 meaning that this is a noop.
    let scan_size = result.walk_end().addr() - ptr.addr();
    decommit(result.walk_end().cast_mut(), len.byte_count() - scan_size);
}

mod ioctl {
    use rustix::ioctl::*;
    use std::ffi::c_void;
    use std::fmt;
    use std::marker;
    use std::mem::MaybeUninit;
    use std::ptr;

    bitflags::bitflags! {
        /// Categories that can be filtered with [`PageMapScan`].
        #[derive(Copy, Clone, PartialEq, Eq)]
        #[repr(transparent)]
        pub struct Categories: u64 {
            /// The page has asynchronous write-protection enabled.
            const WPALLOWED = 1 << 0;
            /// The page has been written to from the time it was write protected.
            const WRITTEN = 1 << 1;
            /// The page is file backed.
            const FILE = 1 << 2;
            /// The page is present in the memory.
            const PRESENT = 1 << 3;
            /// The page is swapped.
            const SWAPPED = 1 << 4;
            /// The page has zero PFN.
            const PFNZERO = 1 << 5;
            /// The page is THP or Hugetlb backed.
            const HUGE = 1 << 6;
            // NB: the page's soft-dirty bit, i.e. the page was written since
            // soft-dirty tracking was last reset via `/proc/<pid>/clear_refs`.
            // Unused here, but included for completeness with the API that
            // `PAGEMAP_SCAN` provides.
            const SOFT_DIRTY = 1 << 7;
        }
    }

    impl fmt::Debug for Categories {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            bitflags::parser::to_writer(self, f)
        }
    }

    impl fmt::Display for Categories {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            bitflags::parser::to_writer(self, f)
        }
    }

    /// Builder-style structure for building up a [`PageMapScan`] `ioctl` call.
    pub struct PageMapScanBuilder {
        pm_scan_arg: pm_scan_arg,
    }

    impl PageMapScanBuilder {
        /// Creates a new page map scan that will scan the provided range of memory.
        pub fn new(region: *const [u8]) -> PageMapScanBuilder {
            PageMapScanBuilder {
                pm_scan_arg: pm_scan_arg {
                    size: size_of::<pm_scan_arg>() as u64,
                    flags: 0,
                    start: region.cast::<u8>().addr() as u64,
                    end: region.cast::<u8>().addr().wrapping_add(region.len()) as u64,
                    walk_end: 0,
                    vec: 0,
                    vec_len: 0,
                    max_pages: 0,
                    category_inverted: Categories::empty(),
                    category_anyof_mask: Categories::empty(),
                    category_mask: Categories::empty(),
                    return_mask: Categories::empty(),
                },
            }
        }

        /// Configures the maximum number of returned pages in the output regions.
        ///
        /// Setting this to 0 disables this maximum.
        pub fn max_pages(&mut self, max: usize) -> &mut PageMapScanBuilder {
            self.pm_scan_arg.max_pages = max.try_into().unwrap();
            self
        }

        /// Configures categories whose bits must be 0, instead of 1, for a
        /// page to match.
        ///
        /// Note that this is a mask which is xor'd with the page's true
        /// categories before testing against `category_mask`. That means that
        /// if a bit needs to be zero then it additionally must be specified in
        /// one of `category_mask` or `category_anyof_mask`.
        ///
        /// For more detail see the `pagemap_scan_is_interesting_page` function
        /// in the Linux kernel source.
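        ///
        /// As a rough sketch of that matching logic (not the kernel's exact
        /// code), a page is considered interesting when:
        ///
        /// ```ignore
        /// let flipped = page_categories ^ category_inverted;
        /// flipped.contains(category_mask)
        ///     && (category_anyof_mask.is_empty() || flipped.intersects(category_anyof_mask))
        /// ```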
        pub fn category_inverted(&mut self, flags: Categories) -> &mut PageMapScanBuilder {
            self.pm_scan_arg.category_inverted = flags;
            self
        }

        /// Only consider pages for which all `flags` match.
        ///
        /// This mask is applied after `category_inverted` is used to flip bits
        /// in a page's categories. Only pages which match all bits in `flags`
        /// will be considered.
        ///
        /// For more detail see the `pagemap_scan_is_interesting_page` function
        /// in the Linux kernel source.
        pub fn category_mask(&mut self, flags: Categories) -> &mut PageMapScanBuilder {
            self.pm_scan_arg.category_mask = flags;
            self
        }

        /// Only consider pages for which any bit of `flags` matches.
        ///
        /// After `category_inverted` and `category_mask` have been applied, if
        /// this option is set to a non-empty value, then at least one of
        /// `flags` must be in a page's flags for that page to be considered.
        /// That means that flags specified in `category_inverted` will already
        /// be inverted for consideration here. The page categories are and'd
        /// with `flags` and some bit must be set for the page to be
        /// considered.
        ///
        /// For more detail see the `pagemap_scan_is_interesting_page` function
        /// in the Linux kernel source.
        #[expect(dead_code, reason = "bindings for the future if we need them")]
        pub fn category_anyof_mask(&mut self, flags: Categories) -> &mut PageMapScanBuilder {
            self.pm_scan_arg.category_anyof_mask = flags;
            self
        }

        /// Categories that are to be reported in the regions returned.
        pub fn return_mask(&mut self, flags: Categories) -> &mut PageMapScanBuilder {
            self.pm_scan_arg.return_mask = flags;
            self
        }

        /// Finishes this configuration and records that the scan results will
        /// be placed within `dst`. The returned object can be used to perform
        /// the pagemap scan ioctl.
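        ///
        /// A sketch of intended usage, assuming `region` and a pagemap `fd`
        /// are obtained elsewhere:
        ///
        /// ```ignore
        /// let mut storage = [MaybeUninit::uninit(); 8];
        /// let scan = PageMapScanBuilder::new(region)
        ///     .category_mask(Categories::WRITTEN)
        ///     .build(&mut storage);
        /// let result = unsafe { rustix::ioctl::ioctl(&fd, scan)? };
        /// for r in result.regions() {
        ///     // each `r` is a contiguous run of matching pages
        /// }
        /// ```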
        pub fn build<'a>(&self, dst: &'a mut [MaybeUninit<PageRegion>]) -> PageMapScan<'a> {
            let mut ret = PageMapScan {
                pm_scan_arg: self.pm_scan_arg,
                _marker: marker::PhantomData,
            };
            ret.pm_scan_arg.vec = dst.as_ptr() as u64;
            ret.pm_scan_arg.vec_len = dst.len() as u64;
            return ret;
        }
    }

    /// Return result of [`PageMapScanBuilder::build`] used to perform an `ioctl`.
    #[repr(transparent)]
    pub struct PageMapScan<'a> {
        pm_scan_arg: pm_scan_arg,
        _marker: marker::PhantomData<&'a mut [MaybeUninit<PageRegion>]>,
    }

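    // In-memory layout of `struct pm_scan_arg` from the Linux kernel's
    // `include/uapi/linux/fs.h`.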
    #[derive(Copy, Clone)]
    #[repr(C)]
    struct pm_scan_arg {
        size: u64,
        flags: u64,
        start: u64,
        end: u64,
        walk_end: u64,
        vec: u64,
        vec_len: u64,
        max_pages: u64,
        category_inverted: Categories,
        category_mask: Categories,
        category_anyof_mask: Categories,
        return_mask: Categories,
    }

    /// Return result of a [`PageMapScan`] `ioctl`.
    ///
    /// This reports where the kernel stopped walking with
    /// [`PageMapScanResult::walk_end`] and the description of regions found in
    /// [`PageMapScanResult::regions`].
    #[derive(Debug)]
    pub struct PageMapScanResult<'a> {
        walk_end: *const u8,
        regions: &'a mut [PageRegion],
    }

    impl PageMapScanResult<'_> {
        /// Where the kernel stopped walking pages, which may be earlier than
        /// the end of the requested region.
        pub fn walk_end(&self) -> *const u8 {
            self.walk_end
        }

        /// Regions the kernel reported back with categories and such.
        pub fn regions(&self) -> &[PageRegion] {
            self.regions
        }
    }

    /// Return value of [`PageMapScan`], description of regions in the original
    /// scan with the categories queried.
    #[repr(transparent)]
    #[derive(Copy, Clone)]
    pub struct PageRegion(page_region);

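    // In-memory layout of `struct page_region` from the Linux kernel's
    // `include/uapi/linux/fs.h`.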
    #[repr(C)]
    #[derive(Debug, Copy, Clone)]
    struct page_region {
        start: u64,
        end: u64,
        categories: Categories,
    }

    impl PageRegion {
        /// Returns the region of memory this represents as `*const [u8]`.
        #[inline]
        pub fn region(&self) -> *const [u8] {
            ptr::slice_from_raw_parts(self.start(), self.len())
        }

        /// Returns the base pointer into memory this region represents.
        #[inline]
        pub fn start(&self) -> *const u8 {
            self.0.start as *const u8
        }

        /// Returns the byte length that this region represents.
        #[inline]
        pub fn len(&self) -> usize {
            usize::try_from(self.0.end - self.0.start).unwrap()
        }

        /// Returns the category flags associated with this region.
        ///
        /// Note that this will only contain categories specified in
        /// [`PageMapScanBuilder::return_mask`].
        #[inline]
        #[cfg_attr(
            not(test),
            expect(dead_code, reason = "bindings for the future if we need them")
        )]
        pub fn categories(&self) -> Categories {
            self.0.categories
        }
    }

    impl fmt::Debug for PageRegion {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            f.debug_struct("PageRegion")
                .field("start", &self.start())
                .field("len", &self.len())
                .field("categories", &self.0.categories)
                .finish()
        }
    }

    // SAFETY: this implementation should uphold the various requirements that
    // this trait has, such as `IS_MUTATING` is right, it's only used on the
    // right platform with the right files, etc.
    unsafe impl<'a> Ioctl for PageMapScan<'a> {
        type Output = PageMapScanResult<'a>;

        const IS_MUTATING: bool = true;

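        // `PAGEMAP_SCAN` is defined in the kernel's `include/uapi/linux/fs.h`
        // as `_IOWR('f', 16, struct pm_scan_arg)`, which this opcode mirrors.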
        fn opcode(&self) -> Opcode {
            opcode::read_write::<pm_scan_arg>(b'f', 16)
        }

        fn as_ptr(&mut self) -> *mut c_void {
            (&raw mut self.pm_scan_arg).cast()
        }

        unsafe fn output_from_ptr(
            out: IoctlOutput,
            extract_output: *mut c_void,
        ) -> rustix::io::Result<Self::Output> {
            let extract_output = extract_output.cast::<pm_scan_arg>();
            let len = usize::try_from(out).unwrap();
            // SAFETY: it's a requirement of this method that
            // `extract_output` is safe to read and indeed a `pm_scan_arg`.
            // Additionally the slice returned here originated from a slice
            // provided to `PageMapScanBuilder::build`, threaded through the
            // `vec` field, and it should be safe to thread that back out
            // through to the result.
            let regions = unsafe {
                assert!((len as u64) <= (*extract_output).vec_len);
                std::slice::from_raw_parts_mut((*extract_output).vec as *mut PageRegion, len)
            };
            Ok(PageMapScanResult {
                regions,
                // SAFETY: it's a requirement of this method that
                // `extract_output` is safe to read and indeed a `pm_scan_arg`.
                walk_end: unsafe { (*extract_output).walk_end as *const u8 },
            })
        }
    }
}

#[cfg(test)]
mod tests {
    use super::ioctl::*;
    use crate::prelude::*;
    use rustix::ioctl::*;
    use rustix::mm::*;
    use std::fs::File;
    use std::ptr;

    struct MmapAnonymous {
        ptr: *mut std::ffi::c_void,
        len: usize,
    }

    impl MmapAnonymous {
        fn new(pages: usize) -> MmapAnonymous {
            let len = pages * rustix::param::page_size();
            let ptr = unsafe {
                mmap_anonymous(
                    ptr::null_mut(),
                    len,
                    ProtFlags::READ | ProtFlags::WRITE,
                    MapFlags::PRIVATE,
                )
                .unwrap()
            };
            MmapAnonymous { ptr, len }
        }

        fn read(&self, page: usize) {
            unsafe {
                let offset = page * rustix::param::page_size();
                assert!(offset < self.len);
                std::ptr::read_volatile(self.ptr.cast::<u8>().add(offset));
            }
        }

        fn write(&self, page: usize) {
            unsafe {
                let offset = page * rustix::param::page_size();
                assert!(offset < self.len);
                std::ptr::write_volatile(self.ptr.cast::<u8>().add(offset), 1);
            }
        }

        fn region(&self) -> *const [u8] {
            ptr::slice_from_raw_parts(self.ptr.cast(), self.len)
        }

        fn page_region(&self, pages: std::ops::Range<usize>) -> *const [u8] {
            ptr::slice_from_raw_parts(
                self.ptr
                    .cast::<u8>()
                    .wrapping_add(pages.start * rustix::param::page_size()),
                (pages.end - pages.start) * rustix::param::page_size(),
            )
        }

        fn end(&self) -> *const u8 {
            self.ptr.cast::<u8>().wrapping_add(self.len)
        }

        fn page_end(&self, page: usize) -> *const u8 {
            self.ptr
                .cast::<u8>()
                .wrapping_add((page + 1) * rustix::param::page_size())
        }
    }

    impl Drop for MmapAnonymous {
        fn drop(&mut self) {
            unsafe {
                munmap(self.ptr, self.len).unwrap();
            }
        }
    }

    fn ioctl_supported() -> bool {
        let mmap = MmapAnonymous::new(1);
        let mut results = Vec::with_capacity(1);
        let fd = File::open("/proc/self/pagemap").unwrap();
        unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .category_mask(Categories::WRITTEN)
                    .return_mask(Categories::all())
                    .build(results.spare_capacity_mut()),
            )
            .is_ok()
        }
    }

    #[test]
    fn no_pages_returned() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .category_mask(Categories::WRITTEN)
                    .return_mask(Categories::all())
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert!(result.regions().is_empty());
        assert_eq!(result.walk_end(), mmap.end());
    }

    #[test]
    fn empty_region() {
        if !ioctl_supported() {
            return;
        }
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        let empty_region = ptr::slice_from_raw_parts(rustix::param::page_size() as *const u8, 0);
        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(empty_region)
                    .return_mask(Categories::all())
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert!(result.regions().is_empty());
    }

    #[test]
    fn basic_page_flags() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        mmap.read(0);
        mmap.write(1);
        mmap.write(2);
        mmap.read(3);

        mmap.read(5);
        mmap.read(6);

        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .category_mask(Categories::WRITTEN)
                    .return_mask(Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 4);
        assert_eq!(result.walk_end(), mmap.end());
        assert_eq!(result.regions()[0].region(), mmap.page_region(0..1));
        assert_eq!(
            result.regions()[0].categories(),
            Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO
        );

        assert_eq!(result.regions()[1].region(), mmap.page_region(1..3));
        assert_eq!(
            result.regions()[1].categories(),
            Categories::WRITTEN | Categories::PRESENT
        );

        assert_eq!(result.regions()[2].region(), mmap.page_region(3..4));
        assert_eq!(
            result.regions()[2].categories(),
            Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO
        );

        assert_eq!(result.regions()[3].region(), mmap.page_region(5..7));
        assert_eq!(
            result.regions()[3].categories(),
            Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO
        );
    }

    #[test]
    fn only_written_pages() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        mmap.read(0);
        mmap.write(1);
        mmap.write(2);
        mmap.read(3);

        mmap.read(5);
        mmap.read(6);

        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .category_inverted(Categories::PFNZERO)
                    .category_mask(Categories::WRITTEN | Categories::PFNZERO)
                    .return_mask(Categories::WRITTEN | Categories::PRESENT | Categories::PFNZERO)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 1);
        assert_eq!(result.walk_end(), mmap.end());

        assert_eq!(result.regions()[0].region(), mmap.page_region(1..3));
        assert_eq!(
            result.regions()[0].categories(),
            Categories::WRITTEN | Categories::PRESENT
        );
    }

    #[test]
    fn region_limit() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(1);
        let fd = File::open("/proc/self/pagemap").unwrap();

        mmap.read(0);
        mmap.write(1);
        mmap.read(2);
        mmap.write(3);

        // Ask for written|pfnzero back, meaning only-read pages get distinct
        // categories from written pages. With capacity for just one region
        // this should return a single region covering only the first page.
        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .return_mask(Categories::WRITTEN | Categories::PFNZERO)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 1);
        assert_eq!(result.walk_end(), mmap.page_end(0));

        assert_eq!(result.regions()[0].region(), mmap.page_region(0..1));
        assert_eq!(
            result.regions()[0].categories(),
            Categories::WRITTEN | Categories::PFNZERO
        );

        // If we ask for written pages though (which seems synonymous with
        // present?) then everything should be in one region.
        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .return_mask(Categories::WRITTEN)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 1);
        assert_eq!(result.walk_end(), mmap.page_end(3));

        assert_eq!(result.regions()[0].region(), mmap.page_region(0..4));
        assert_eq!(result.regions()[0].categories(), Categories::WRITTEN);
    }

    #[test]
    fn page_limit() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        mmap.read(0);
        mmap.read(1);
        mmap.read(2);
        mmap.read(3);

        // Ask for written|pfnzero back, meaning only-read pages. With
        // `max_pages(2)` this should return a single region covering just the
        // first two pages.
        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .return_mask(Categories::WRITTEN | Categories::PFNZERO)
                    .max_pages(2)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 1);
        assert_eq!(result.walk_end(), mmap.page_end(1));

        assert_eq!(result.regions()[0].region(), mmap.page_region(0..2));
        assert_eq!(
            result.regions()[0].categories(),
            Categories::WRITTEN | Categories::PFNZERO
        );
    }

    #[test]
    fn page_limit_with_hole() {
        if !ioctl_supported() {
            return;
        }
        let mmap = MmapAnonymous::new(10);
        let mut results = Vec::with_capacity(10);
        let fd = File::open("/proc/self/pagemap").unwrap();

        mmap.read(0);
        mmap.read(2);
        mmap.read(3);

        // Ask for written pages with written|pfnzero reported back. With a
        // hole at page 1 and `max_pages(2)` this should return two single-page
        // regions (pages 0 and 2) before the scan stops.
        let result = unsafe {
            ioctl(
                &fd,
                PageMapScanBuilder::new(mmap.region())
                    .category_mask(Categories::WRITTEN)
                    .return_mask(Categories::WRITTEN | Categories::PFNZERO)
                    .max_pages(2)
                    .build(results.spare_capacity_mut()),
            )
            .unwrap()
        };
        assert_eq!(result.regions().len(), 2);
        assert_eq!(result.walk_end(), mmap.page_end(2));

        assert_eq!(result.regions()[0].region(), mmap.page_region(0..1));
        assert_eq!(
            result.regions()[0].categories(),
            Categories::WRITTEN | Categories::PFNZERO
        );
        assert_eq!(result.regions()[1].region(), mmap.page_region(2..3));
        assert_eq!(
            result.regions()[1].categories(),
            Categories::WRITTEN | Categories::PFNZERO
        );
    }
}
994}