regalloc2/moves.rs

/*
 * Released under the terms of the Apache 2.0 license with LLVM
 * exception. See `LICENSE` for details.
 */

use crate::{ion::data_structures::u64_key, Allocation, PReg};
use core::fmt::Debug;
use smallvec::{smallvec, SmallVec};

/// A list of moves to be performed in sequence, with auxiliary data
/// attached to each.
pub type MoveVec<T> = SmallVec<[(Allocation, Allocation, T); 16]>;

/// A list of moves to be performed in sequence, like a
/// `MoveVec<T>`, except that an unchosen scratch space may occur as
/// well, represented by `Allocation::none()`.
#[derive(Clone, Debug)]
pub enum MoveVecWithScratch<T> {
    /// No scratch was actually used.
    NoScratch(MoveVec<T>),
    /// A scratch space was used.
    Scratch(MoveVec<T>),
}

/// A `ParallelMoves` represents a list of alloc-to-alloc moves that
/// must happen in parallel -- i.e., all reads of sources semantically
/// happen before all writes of destinations, and destinations are
/// allowed to overwrite sources. It can compute a list of sequential
/// moves that will produce the equivalent data movement, possibly
/// using a scratch register if one is necessary.
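///
/// For example (an illustrative sketch, not a compiled doctest; the
/// register numbers are arbitrary and imports are assumed):
///
/// ```ignore
/// let r0 = Allocation::reg(PReg::new(0, RegClass::Int));
/// let r1 = Allocation::reg(PReg::new(1, RegClass::Int));
///
/// // Swap r0 and r1 "in parallel": both reads happen before both writes.
/// let mut par: ParallelMoves<()> = ParallelMoves::new();
/// par.add(r0, r1, ());
/// par.add(r1, r0, ());
///
/// // The cycle forces use of a scratch location, represented by
/// // `Allocation::none()` in the resolved sequence.
/// let resolved = par.resolve();
/// assert!(resolved.needs_scratch());
///
/// // Substitute a caller-chosen scratch (here an arbitrary third
/// // register) to obtain three sequential moves that route one of the
/// // values through the scratch.
/// let scratch = Allocation::reg(PReg::new(2, RegClass::Int));
/// let moves = resolved.with_scratch(scratch);
/// assert_eq!(moves.len(), 3);
/// ```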
pub struct ParallelMoves<T: Clone + Copy + Default> {
    parallel_moves: MoveVec<T>,
}

impl<T: Clone + Copy + Default + PartialEq> ParallelMoves<T> {
    pub fn new() -> Self {
        Self {
            parallel_moves: smallvec![],
        }
    }

    pub fn add(&mut self, from: Allocation, to: Allocation, t: T) {
        self.parallel_moves.push((from, to, t));
    }

    fn sources_overlap_dests(&self) -> bool {
        // Assumes `parallel_moves` has already been sorted by `dst`
        // in `resolve()` below. The O(n log n) cost of this loop is no
        // worse than the sort we already did.
        for &(src, _, _) in &self.parallel_moves {
            if self
                .parallel_moves
                .binary_search_by_key(&src, |&(_, dst, _)| dst)
                .is_ok()
            {
                return true;
            }
        }
        false
    }

    /// Resolve the parallel-moves problem to a sequence of separate
    /// moves, such that the combined effect of the sequential moves
    /// is as-if all of the moves added to this `ParallelMoves`
    /// resolver happened in parallel.
    ///
    /// Sometimes, if there is a cycle, a scratch register is
    /// necessary to allow the moves to occur sequentially. In this
    /// case, `Allocation::none()` is returned to represent the
    /// scratch register. The caller may choose to always hold a
    /// separate scratch register unused to allow this to be trivially
    /// rewritten; or may dynamically search for or create a free
    /// register as needed, if none are available.
    pub fn resolve(mut self) -> MoveVecWithScratch<T> {
        // Easy case: zero or one move. Just return our vec.
        if self.parallel_moves.len() <= 1 {
            return MoveVecWithScratch::NoScratch(self.parallel_moves);
        }

        // Sort moves so that we can efficiently test for presence.
        // For that purpose it doesn't matter whether we sort by
        // source or destination, but later we'll want them sorted
        // by destination.
        self.parallel_moves
            .sort_by_key(|&(src, dst, _)| u64_key(dst.bits(), src.bits()));

        // Duplicate moves cannot change the semantics of this
        // parallel move set, so remove them. This is cheap since we
        // just sorted the list.
        self.parallel_moves.dedup();

        // General case: some moves overwrite dests that other moves
        // read as sources. We'll use a general algorithm.
        //
        // *Important property*: because we expect that each register
        // has only one writer (otherwise the effect of the parallel
        // move is undefined), each move needs to come before at most
        // one other move: the unique move, if any, that overwrites
        // its source. Thus, we *can only have simple cycles* (those
        // that are a ring of nodes, i.e., with only one path from a
        // node back to itself); there are no SCCs that are more
        // complex than that. We leverage this fact below to avoid
        // having to do a full Tarjan SCC DFS (with lowest-index
        // computation, etc.): instead, as soon as we find a cycle, we
        // know we have the full cycle and we can do a cyclic move
        // sequence and continue.

        // Check that each destination has only one writer.
        if cfg!(debug_assertions) {
            let mut last_dst = None;
            for &(_, dst, _) in &self.parallel_moves {
                if last_dst.is_some() {
                    debug_assert!(last_dst.unwrap() != dst);
                }
                last_dst = Some(dst);
            }
        }

        // Moving an allocation into itself is technically a cycle but
        // should have no effect, as long as there are no other writes
        // into that destination.
        self.parallel_moves.retain(|&mut (src, dst, _)| src != dst);

        // Do any dests overlap sources? If not, we can also just
        // return the list.
        if !self.sources_overlap_dests() {
            return MoveVecWithScratch::NoScratch(self.parallel_moves);
        }

        // Construct a mapping from move indices to moves they must
        // come before. Any given move must come before the move, if
        // any, that overwrites its source; we have moves sorted by
        // dest above so we can efficiently find such a move.
        const NONE: usize = usize::MAX;
        let must_come_before: SmallVec<[usize; 16]> = self
            .parallel_moves
            .iter()
            .map(|&(src, _, _)| {
                self.parallel_moves
                    .binary_search_by_key(&src, |&(_, dst, _)| dst)
                    .unwrap_or(NONE)
            })
            .collect();
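
        // For example (purely illustrative; `rA` and `rB` stand for any
        // two allocations): a parallel swap, sorted by destination as
        //     [ (rB, rA, _), (rA, rB, _) ]
        // yields `must_come_before == [1, 0]`: move 0 reads rB, which
        // move 1 overwrites, so move 0 must come before move 1, and vice
        // versa -- a cycle, which the DFS below breaks with the scratch.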

        // Do a simple stack-based DFS and emit moves in postorder,
        // then reverse at the end for RPO. Unlike Tarjan's SCC
        // algorithm, we can emit a cycle as soon as we find one, as
        // noted above.
        #[derive(Clone, Copy, Debug, Eq, PartialEq)]
        enum State {
            /// Not on stack, not visited
            ToDo,
            /// On stack, not yet visited
            Pending,
            /// Visited
            Done,
        }
        let mut ret: MoveVec<T> = smallvec![];
        let mut stack: SmallVec<[usize; 16]> = smallvec![];
        let mut state: SmallVec<[State; 16]> = smallvec![State::ToDo; self.parallel_moves.len()];
        let mut scratch_used = false;

        while let Some(next) = state.iter().position(|&state| state == State::ToDo) {
            stack.push(next);
            state[next] = State::Pending;

            while let Some(&top) = stack.last() {
                debug_assert_eq!(state[top], State::Pending);
                let next = must_come_before[top];
                if next == NONE || state[next] == State::Done {
                    ret.push(self.parallel_moves[top]);
                    state[top] = State::Done;
                    stack.pop();
                    while let Some(top) = stack.pop() {
                        ret.push(self.parallel_moves[top]);
                        state[top] = State::Done;
                    }
                } else if state[next] == State::ToDo {
                    stack.push(next);
                    state[next] = State::Pending;
                } else {
                    // Found a cycle -- emit a cyclic-move sequence
                    // for the cycle on the top of stack, then normal
                    // moves below it. Recall that these moves will be
                    // reversed in sequence, so from the original
                    // parallel move set
                    //
                    //     { B := A, C := B, A := C }
                    //
                    // we will generate something like:
                    //
                    //     A := scratch
                    //     B := A
                    //     C := B
                    //     scratch := C
                    //
                    // which will become:
                    //
                    //     scratch := C
                    //     C := B
                    //     B := A
                    //     A := scratch
                    debug_assert_ne!(top, next);
                    state[top] = State::Done;
                    stack.pop();

                    let (scratch_src, dst, dst_t) = self.parallel_moves[top];
                    scratch_used = true;

                    ret.push((Allocation::none(), dst, dst_t));
                    while let Some(move_idx) = stack.pop() {
                        state[move_idx] = State::Done;
                        ret.push(self.parallel_moves[move_idx]);

                        if move_idx == next {
                            break;
                        }
                    }
                    ret.push((scratch_src, Allocation::none(), T::default()));
                }
            }
        }

        ret.reverse();

        if scratch_used {
            MoveVecWithScratch::Scratch(ret)
        } else {
            MoveVecWithScratch::NoScratch(ret)
        }
    }
}

impl<T> MoveVecWithScratch<T> {
    /// Fills in the scratch space, if needed, with the given
    /// register/allocation and returns a final list of moves. The
    /// scratch register must not occur anywhere in the parallel-move
    /// problem given to the resolver that produced this
    /// `MoveVecWithScratch`.
    pub fn with_scratch(self, scratch: Allocation) -> MoveVec<T> {
        match self {
            MoveVecWithScratch::NoScratch(moves) => moves,
            MoveVecWithScratch::Scratch(mut moves) => {
                for (src, dst, _) in &mut moves {
                    debug_assert!(
                        *src != scratch && *dst != scratch,
                        "Scratch register should not also be an actual source or dest of moves"
                    );
                    debug_assert!(
                        !(src.is_none() && dst.is_none()),
                        "Move resolution should not have produced a scratch-to-scratch move"
                    );
                    if src.is_none() {
                        *src = scratch;
                    }
                    if dst.is_none() {
                        *dst = scratch;
                    }
                }
                moves
            }
        }
    }

    /// Unwrap without a scratch register.
    pub fn without_scratch(self) -> Option<MoveVec<T>> {
        match self {
            MoveVecWithScratch::NoScratch(moves) => Some(moves),
            MoveVecWithScratch::Scratch(..) => None,
        }
    }

    /// Do we need a scratch register?
    pub fn needs_scratch(&self) -> bool {
        match self {
            MoveVecWithScratch::NoScratch(..) => false,
            MoveVecWithScratch::Scratch(..) => true,
        }
    }
}

/// Final stage of move resolution: finding or using scratch
/// registers, creating them if necessary by using stackslots, and
/// ensuring that the final list of moves contains no stack-to-stack
/// moves.
///
/// The resolved list of moves may need one or two scratch registers,
/// and maybe a stackslot, to ensure these conditions. Our general
/// strategy is in two steps.
///
/// First, we find a scratch register, so we only have to worry about
/// a list of moves, all with real locations as src and dest. If we're
/// lucky and there are any registers not allocated at this
/// program-point, we can use a real register. Otherwise, we use an
/// extra stackslot. This is fine, because at this step,
/// stack-to-stack moves are OK.
///
/// Then, we resolve stack-to-stack moves into stack-to-reg /
/// reg-to-stack pairs. For this, we try to allocate a second free
/// register. If unavailable, we create a new scratch stackslot to
/// serve as a backup of one of the in-use registers, then borrow that
/// register as the scratch register in the middle of stack-to-stack
/// moves.
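///
/// An illustrative sketch of how an embedder might construct and drive
/// this resolver (the closure bodies and the `alloc_extra_spillslot()`
/// helper are hypothetical placeholders; a real embedder would consult
/// its own liveness and stack-frame state):
///
/// ```ignore
/// let resolver = MoveAndScratchResolver {
///     // Hand out a known-free register at this program point, if any.
///     find_free_reg: || None,
///     // Create a fresh stack slot to use as scratch space
///     // (`alloc_extra_spillslot` is a hypothetical embedder helper).
///     get_stackslot: || alloc_extra_spillslot(),
///     // Report which allocations refer to stack slots.
///     is_stack_alloc: |alloc| alloc.is_stack(),
///     // Last-resort register to borrow (and restore) around
///     // stack-to-stack moves; chosen statically by the embedder.
///     borrowed_scratch_reg: PReg::new(0, RegClass::Int),
/// };
/// // `resolved` is the `MoveVecWithScratch` produced by
/// // `ParallelMoves::resolve()`.
/// let final_moves = resolver.compute(resolved);
/// ```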
pub struct MoveAndScratchResolver<GetReg, GetStackSlot, IsStackAlloc>
where
    GetReg: FnMut() -> Option<Allocation>,
    GetStackSlot: FnMut() -> Allocation,
    IsStackAlloc: Fn(Allocation) -> bool,
{
    /// Closure that finds a free `PReg` at the current location, if one is available.
    pub find_free_reg: GetReg,
    /// Closure that gets us a stackslot, if needed.
    pub get_stackslot: GetStackSlot,
    /// Closure to determine whether an `Allocation` refers to a stack slot.
    pub is_stack_alloc: IsStackAlloc,
    /// Use this register if no free register is available to use as a
    /// temporary in stack-to-stack moves. If we do use this register
    /// for that purpose, its value will be restored by the end of the
    /// move sequence. Provided by caller and statically chosen. This is
    /// a very last-ditch option, so static choice is OK.
    pub borrowed_scratch_reg: PReg,
}

impl<GetReg, GetStackSlot, IsStackAlloc> MoveAndScratchResolver<GetReg, GetStackSlot, IsStackAlloc>
where
    GetReg: FnMut() -> Option<Allocation>,
    GetStackSlot: FnMut() -> Allocation,
    IsStackAlloc: Fn(Allocation) -> bool,
{
    pub fn compute<T: Debug + Default + Copy>(
        mut self,
        moves: MoveVecWithScratch<T>,
    ) -> MoveVec<T> {
        let moves = if moves.needs_scratch() {
            // Now, find a scratch allocation in order to resolve cycles.
            let scratch = (self.find_free_reg)().unwrap_or_else(|| (self.get_stackslot)());
            trace!("scratch resolver: scratch alloc {:?}", scratch);

            moves.with_scratch(scratch)
        } else {
            moves.without_scratch().unwrap()
        };

        // Do we have any stack-to-stack moves? Fast return if not.
        let stack_to_stack = moves
            .iter()
            .any(|&(src, dst, _)| self.is_stack_to_stack_move(src, dst));
        if !stack_to_stack {
            return moves;
        }

        // Allocate a scratch register for stack-to-stack move expansion.
        let (scratch_reg, save_slot) = if let Some(reg) = (self.find_free_reg)() {
            trace!(
                "scratch resolver: have free stack-to-stack scratch preg: {:?}",
                reg
            );
            (reg, None)
        } else {
            let reg = Allocation::reg(self.borrowed_scratch_reg);
            // Stackslot into which we need to save the stack-to-stack
            // scratch reg before doing any stack-to-stack moves, if we stole
            // the reg.
            let save = (self.get_stackslot)();
            trace!(
                "scratch resolver: stack-to-stack borrowing {:?} with save stackslot {:?}",
                reg,
                save
            );
            (reg, Some(save))
        };

        // Mutually exclusive flags for whether either scratch_reg or
        // save_slot need to be restored from the other. Initially,
        // scratch_reg has a value we should preserve and save_slot
        // has garbage.
        let mut scratch_dirty = false;
        let mut save_dirty = true;

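        // For example (illustrative; `stack0`/`stack1` are two stack slots
        // and `r` is the borrowed scratch register): a move
        //     stack0 := stack1
        // expands to
        //     save_slot := r      (only when r was borrowed and not yet saved)
        //     r := stack1
        //     stack0 := r
        // and r is reloaded from save_slot before any later move that reads
        // r, or after all moves if it is still dirty.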
        let mut result = smallvec![];
        for &(src, dst, data) in &moves {
            // Do we have a stack-to-stack move? If so, resolve.
            if self.is_stack_to_stack_move(src, dst) {
                trace!("scratch resolver: stack to stack: {:?} -> {:?}", src, dst);

                // If the selected scratch register is stolen from the
                // set of in-use registers, then we need to save the
                // current contents of the scratch register before using
                // it as a temporary.
                if let Some(save_slot) = save_slot {
                    // However, we may have already done so for an earlier
                    // stack-to-stack move, in which case we don't need
                    // to do it again.
                    if save_dirty {
                        debug_assert!(!scratch_dirty);
                        result.push((scratch_reg, save_slot, T::default()));
                        save_dirty = false;
                    }
                }

                // We can't move directly from one stack slot to another
                // on any architecture we care about, so stack-to-stack
                // moves must go via a scratch register.
                result.push((src, scratch_reg, data));
                result.push((scratch_reg, dst, data));
                scratch_dirty = true;
            } else {
                // This is not a stack-to-stack move, but we need to
                // make sure that the scratch register is in the correct
                // state if this move interacts with that register.
                if src == scratch_reg && scratch_dirty {
                    // We're copying from the scratch register so if
                    // it was stolen for a stack-to-stack move then we
                    // need to make sure it has the correct contents,
                    // not whatever was temporarily copied into it. If
                    // we got scratch_reg from find_free_reg then it
                    // had better not have been used as the source of
                    // a move. So if we're here it's because we fell
                    // back to the caller-provided last-resort scratch
                    // register, and we must therefore have a save-slot
                    // allocated too.
                    debug_assert!(!save_dirty);
                    let save_slot = save_slot.expect("move source should not be a free register");
                    result.push((save_slot, scratch_reg, T::default()));
                    scratch_dirty = false;
                }
                if dst == scratch_reg {
                    // We are writing something to the scratch register
                    // so it doesn't matter what was there before. We
                    // can avoid restoring it, but we will need to save
                    // it again before the next stack-to-stack move.
                    scratch_dirty = false;
                    save_dirty = true;
                }
                result.push((src, dst, data));
            }
        }

        // Now that all the stack-to-stack moves are done, restore the
        // scratch register if necessary.
        if let Some(save_slot) = save_slot {
            if scratch_dirty {
                debug_assert!(!save_dirty);
                result.push((save_slot, scratch_reg, T::default()));
            }
        }

        trace!("scratch resolver: got {:?}", result);
        result
    }

    fn is_stack_to_stack_move(&self, src: Allocation, dst: Allocation) -> bool {
        (self.is_stack_alloc)(src) && (self.is_stack_alloc)(dst)
    }
}