regalloc2/moves.rs
/*
 * Released under the terms of the Apache 2.0 license with LLVM
 * exception. See `LICENSE` for details.
 */

use crate::{ion::data_structures::u64_key, Allocation, PReg};
use core::fmt::Debug;
use smallvec::{smallvec, SmallVec};

/// A list of moves to be performed in sequence, with auxiliary data
/// attached to each.
pub type MoveVec<T> = SmallVec<[(Allocation, Allocation, T); 16]>;

/// A list of moves to be performed in sequence, like a
/// `MoveVec<T>`, except that an unchosen scratch space may occur as
/// well, represented by `Allocation::none()`.
#[derive(Clone, Debug)]
pub enum MoveVecWithScratch<T> {
    /// No scratch was actually used.
    NoScratch(MoveVec<T>),
    /// A scratch space was used.
    Scratch(MoveVec<T>),
}

/// A `ParallelMoves` represents a list of alloc-to-alloc moves that
/// must happen in parallel -- i.e., all reads of sources semantically
/// happen before all writes of destinations, and destinations are
/// allowed to overwrite sources. It can compute a list of sequential
/// moves that will produce the equivalent data movement, possibly
/// using a scratch register if one is necessary.
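///
/// A minimal usage sketch (illustrative only; `PReg::new`, `RegClass::Int`,
/// and `Allocation::reg` are assumed to be available as elsewhere in
/// regalloc2):
///
/// ```ignore
/// let a = Allocation::reg(PReg::new(0, RegClass::Int));
/// let b = Allocation::reg(PReg::new(1, RegClass::Int));
/// let c = Allocation::reg(PReg::new(2, RegClass::Int));
///
/// // Request `b := a` and `c := b` "in parallel": `c` must receive
/// // the *old* value of `b`, even though `b` is overwritten.
/// let mut par: ParallelMoves<()> = ParallelMoves::new();
/// par.add(a, b, ());
/// par.add(b, c, ());
///
/// // No cycle here, so no scratch register is needed; the resolver
/// // orders the moves so that `c := b` happens before `b := a`.
/// let seq = par.resolve().without_scratch().unwrap();
/// assert_eq!(seq.len(), 2);
/// ```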
pub struct ParallelMoves<T: Clone + Copy + Default> {
    parallel_moves: MoveVec<T>,
}

impl<T: Clone + Copy + Default + PartialEq> ParallelMoves<T> {
    pub fn new() -> Self {
        Self {
            parallel_moves: smallvec![],
        }
    }

    pub fn add(&mut self, from: Allocation, to: Allocation, t: T) {
        self.parallel_moves.push((from, to, t));
    }

    fn sources_overlap_dests(&self) -> bool {
        // Assumes `parallel_moves` has already been sorted by `dst`
        // in `resolve()` below. The O(n log n) cost of this loop is no
        // worse than the sort we already did.
        for &(src, _, _) in &self.parallel_moves {
            if self
                .parallel_moves
                .binary_search_by_key(&src, |&(_, dst, _)| dst)
                .is_ok()
            {
                return true;
            }
        }
        false
    }

    /// Resolve the parallel-moves problem to a sequence of separate
    /// moves, such that the combined effect of the sequential moves
    /// is as-if all of the moves added to this `ParallelMoves`
    /// resolver happened in parallel.
    ///
    /// Sometimes, if there is a cycle, a scratch register is
    /// necessary to allow the moves to occur sequentially. In this
    /// case, `Allocation::none()` is returned to represent the
    /// scratch register. The caller may choose to always hold a
    /// separate scratch register unused to allow this to be trivially
    /// rewritten; or may dynamically search for or create a free
    /// register as needed, if none are available.
    pub fn resolve(mut self) -> MoveVecWithScratch<T> {
        // Easy case: zero or one move. Just return our vec.
        if self.parallel_moves.len() <= 1 {
            return MoveVecWithScratch::NoScratch(self.parallel_moves);
        }

        // Sort moves so that we can efficiently test for presence.
        // For that purpose it doesn't matter whether we sort by
        // source or destination, but later we'll want them sorted
        // by destination.
        self.parallel_moves
            .sort_by_key(|&(src, dst, _)| u64_key(dst.bits(), src.bits()));

        // Duplicate moves cannot change the semantics of this
        // parallel move set, so remove them. This is cheap since we
        // just sorted the list.
        self.parallel_moves.dedup();

        // General case: some moves overwrite dests that other moves
        // read as sources. We'll use a general algorithm.
        //
        // *Important property*: because we expect that each register
        // has only one writer (otherwise the effect of the parallel
        // move is undefined), each move can only block one other move
        // (with its one source corresponding to the one writer of
        // that source). Thus, we *can only have simple cycles* (those
        // that are a ring of nodes, i.e., with only one path from a
        // node back to itself); there are no SCCs that are more
        // complex than that. We leverage this fact below to avoid
        // having to do a full Tarjan SCC DFS (with lowest-index
        // computation, etc.): instead, as soon as we find a cycle, we
        // know we have the full cycle and we can do a cyclic move
        // sequence and continue.
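        //
        // For example, in the parallel set { B := A, A := B, C := B },
        // each destination has exactly one writer, so the only cycle is
        // the simple ring A <-> B; the move C := B merely reads B's old
        // value and hangs off the cycle rather than enlarging it.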

        // Check that each destination has only one writer.
        if cfg!(debug_assertions) {
            let mut last_dst = None;
            for &(_, dst, _) in &self.parallel_moves {
                if last_dst.is_some() {
                    debug_assert!(last_dst.unwrap() != dst);
                }
                last_dst = Some(dst);
            }
        }

        // Moving an allocation into itself is technically a cycle but
        // should have no effect, as long as there are no other writes
        // into that destination.
        self.parallel_moves.retain(|&mut (src, dst, _)| src != dst);

        // Do any dests overlap sources? If not, we can also just
        // return the list.
        if !self.sources_overlap_dests() {
            return MoveVecWithScratch::NoScratch(self.parallel_moves);
        }

        // Construct a mapping from move indices to moves they must
        // come before. Any given move must come before a move that
        // overwrites its destination; we have moves sorted by dest
        // above so we can efficiently find such a move, if any.
        const NONE: usize = usize::MAX;
        let must_come_before: SmallVec<[usize; 16]> = self
            .parallel_moves
            .iter()
            .map(|&(src, _, _)| {
                self.parallel_moves
                    .binary_search_by_key(&src, |&(_, dst, _)| dst)
                    .unwrap_or(NONE)
            })
            .collect();
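
        // As an illustrative example: with moves sorted by destination as
        //   index 0: A := B,  index 1: B := A,  index 2: C := B,
        // the mapping is [1, 0, 1] -- moves 0 and 2 both read B, so they
        // must be emitted before move 1, which overwrites B; move 1 reads
        // A and must come before move 0, which overwrites A.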

        // Do a simple stack-based DFS and emit moves in postorder,
        // then reverse at the end for RPO. Unlike Tarjan's SCC
        // algorithm, we can emit a cycle as soon as we find one, as
        // noted above.
        #[derive(Clone, Copy, Debug, Eq, PartialEq)]
        enum State {
            /// Not on stack, not visited
            ToDo,
            /// On stack, not yet visited
            Pending,
            /// Visited
            Done,
        }
        let mut ret: MoveVec<T> = smallvec![];
        let mut stack: SmallVec<[usize; 16]> = smallvec![];
        let mut state: SmallVec<[State; 16]> = smallvec![State::ToDo; self.parallel_moves.len()];
        let mut scratch_used = false;

        while let Some(next) = state.iter().position(|&state| state == State::ToDo) {
            stack.push(next);
            state[next] = State::Pending;

            while let Some(&top) = stack.last() {
                debug_assert_eq!(state[top], State::Pending);
                let next = must_come_before[top];
                if next == NONE || state[next] == State::Done {
                    ret.push(self.parallel_moves[top]);
                    state[top] = State::Done;
                    stack.pop();
                    while let Some(top) = stack.pop() {
                        ret.push(self.parallel_moves[top]);
                        state[top] = State::Done;
                    }
                } else if state[next] == State::ToDo {
                    stack.push(next);
                    state[next] = State::Pending;
                } else {
                    // Found a cycle -- emit a cyclic-move sequence
                    // for the cycle on the top of stack, then normal
                    // moves below it. Recall that these moves will be
                    // reversed in sequence, so from the original
                    // parallel move set
                    //
                    //     { B := A, C := B, A := C }
                    //
                    // we will generate something like:
                    //
                    //     A := scratch
                    //     B := A
                    //     C := B
                    //     scratch := C
                    //
                    // which will become:
                    //
                    //     scratch := C
                    //     C := B
                    //     B := A
                    //     A := scratch
                    debug_assert_ne!(top, next);
                    state[top] = State::Done;
                    stack.pop();

                    let (scratch_src, dst, dst_t) = self.parallel_moves[top];
                    scratch_used = true;

                    ret.push((Allocation::none(), dst, dst_t));
                    while let Some(move_idx) = stack.pop() {
                        state[move_idx] = State::Done;
                        ret.push(self.parallel_moves[move_idx]);

                        if move_idx == next {
                            break;
                        }
                    }
                    ret.push((scratch_src, Allocation::none(), T::default()));
                }
            }
        }

        ret.reverse();

        if scratch_used {
            MoveVecWithScratch::Scratch(ret)
        } else {
            MoveVecWithScratch::NoScratch(ret)
        }
    }
}

impl<T> MoveVecWithScratch<T> {
    /// Fills in the scratch space, if needed, with the given
    /// register/allocation and returns a final list of moves. The
    /// scratch register must not occur anywhere in the parallel-move
    /// problem given to the resolver that produced this
    /// `MoveVecWithScratch`.
    pub fn with_scratch(self, scratch: Allocation) -> MoveVec<T> {
        match self {
            MoveVecWithScratch::NoScratch(moves) => moves,
            MoveVecWithScratch::Scratch(mut moves) => {
                for (src, dst, _) in &mut moves {
                    debug_assert!(
                        *src != scratch && *dst != scratch,
                        "Scratch register should not also be an actual source or dest of moves"
                    );
                    debug_assert!(
                        !(src.is_none() && dst.is_none()),
                        "Move resolution should not have produced a scratch-to-scratch move"
                    );
                    if src.is_none() {
                        *src = scratch;
                    }
                    if dst.is_none() {
                        *dst = scratch;
                    }
                }
                moves
            }
        }
    }

    /// Unwrap without a scratch register.
    pub fn without_scratch(self) -> Option<MoveVec<T>> {
        match self {
            MoveVecWithScratch::NoScratch(moves) => Some(moves),
            MoveVecWithScratch::Scratch(..) => None,
        }
    }

    /// Do we need a scratch register?
    pub fn needs_scratch(&self) -> bool {
        match self {
            MoveVecWithScratch::NoScratch(..) => false,
            MoveVecWithScratch::Scratch(..) => true,
        }
    }
}

/// Final stage of move resolution: finding or using scratch
/// registers, creating them if necessary by using stackslots, and
/// ensuring that the final list of moves contains no stack-to-stack
/// moves.
///
/// The resolved list of moves may need one or two scratch registers,
/// and maybe a stackslot, to ensure these conditions. Our general
/// strategy is in two steps.
///
/// First, we find a scratch register, so we only have to worry about
/// a list of moves, all with real locations as src and dest. If we're
/// lucky and there are any registers not allocated at this
/// program-point, we can use a real register. Otherwise, we use an
/// extra stackslot. This is fine, because at this step,
/// stack-to-stack moves are OK.
///
/// Then, we resolve stack-to-stack moves into stack-to-reg /
/// reg-to-stack pairs. For this, we try to allocate a second free
/// register. If unavailable, we create a new scratch stackslot to
/// serve as a backup of one of the in-use registers, then borrow that
/// register as the scratch register in the middle of stack-to-stack
/// moves.
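///
/// An illustrative construction sketch (not authoritative usage; it assumes
/// `SpillSlot`, `Allocation::stack`, `Allocation::is_stack`, `PReg::new`, and
/// `RegClass::Int` as elsewhere in regalloc2, plus a `resolved_moves` value
/// from `ParallelMoves::resolve`):
///
/// ```ignore
/// let mut next_slot = 32;
/// let resolver = MoveAndScratchResolver {
///     // No free registers at this program point in this example.
///     find_free_reg: || None,
///     // Hand out fresh spill slots on demand.
///     get_stackslot: move || {
///         let slot = Allocation::stack(SpillSlot::new(next_slot));
///         next_slot += 1;
///         slot
///     },
///     is_stack_alloc: |alloc| alloc.is_stack(),
///     // Last-resort register to borrow (and restore) around
///     // stack-to-stack moves.
///     borrowed_scratch_reg: PReg::new(0, RegClass::Int),
/// };
/// let final_moves = resolver.compute(resolved_moves);
/// ```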
pub struct MoveAndScratchResolver<GetReg, GetStackSlot, IsStackAlloc>
where
    GetReg: FnMut() -> Option<Allocation>,
    GetStackSlot: FnMut() -> Allocation,
    IsStackAlloc: Fn(Allocation) -> bool,
{
    /// Closure that finds us a PReg at the current location.
    pub find_free_reg: GetReg,
    /// Closure that gets us a stackslot, if needed.
    pub get_stackslot: GetStackSlot,
    /// Closure to determine whether an `Allocation` refers to a stack slot.
    pub is_stack_alloc: IsStackAlloc,
    /// Use this register if no free register is available to use as a
    /// temporary in stack-to-stack moves. If we do use this register
    /// for that purpose, its value will be restored by the end of the
    /// move sequence. Provided by caller and statically chosen. This is
    /// a very last-ditch option, so static choice is OK.
    pub borrowed_scratch_reg: PReg,
}

impl<GetReg, GetStackSlot, IsStackAlloc> MoveAndScratchResolver<GetReg, GetStackSlot, IsStackAlloc>
where
    GetReg: FnMut() -> Option<Allocation>,
    GetStackSlot: FnMut() -> Allocation,
    IsStackAlloc: Fn(Allocation) -> bool,
{
    pub fn compute<T: Debug + Default + Copy>(
        mut self,
        moves: MoveVecWithScratch<T>,
    ) -> MoveVec<T> {
        let moves = if moves.needs_scratch() {
            // Now, find a scratch allocation in order to resolve cycles.
            let scratch = (self.find_free_reg)().unwrap_or_else(|| (self.get_stackslot)());
            trace!("scratch resolver: scratch alloc {:?}", scratch);

            moves.with_scratch(scratch)
        } else {
            moves.without_scratch().unwrap()
        };

        // Do we have any stack-to-stack moves? Fast return if not.
        let stack_to_stack = moves
            .iter()
            .any(|&(src, dst, _)| self.is_stack_to_stack_move(src, dst));
        if !stack_to_stack {
            return moves;
        }

        // Allocate a scratch register for stack-to-stack move expansion.
        let (scratch_reg, save_slot) = if let Some(reg) = (self.find_free_reg)() {
            trace!(
                "scratch resolver: have free stack-to-stack scratch preg: {:?}",
                reg
            );
            (reg, None)
        } else {
            let reg = Allocation::reg(self.borrowed_scratch_reg);
            // Stackslot into which we need to save the stack-to-stack
            // scratch reg before doing any stack-to-stack moves, if we stole
            // the reg.
            let save = (self.get_stackslot)();
            trace!(
                "scratch resolver: stack-to-stack borrowing {:?} with save stackslot {:?}",
                reg,
                save
            );
            (reg, Some(save))
        };

        // Mutually exclusive flags for whether either scratch_reg or
        // save_slot needs to be restored from the other. Initially,
        // scratch_reg has a value we should preserve and save_slot
        // has garbage.
        let mut scratch_dirty = false;
        let mut save_dirty = true;
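
        // As an illustration: expanding a single stack-to-stack move
        // `stack1 := stack0` when we had to borrow `scratch_reg` (so a
        // `save_slot` exists) produces, in order:
        //
        //   save_slot := scratch_reg   (preserve the borrowed register)
        //   scratch_reg := stack0
        //   stack1 := scratch_reg
        //   scratch_reg := save_slot   (restored after the last such move)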

        let mut result = smallvec![];
        for &(src, dst, data) in &moves {
            // Do we have a stack-to-stack move? If so, resolve.
            if self.is_stack_to_stack_move(src, dst) {
                trace!("scratch resolver: stack to stack: {:?} -> {:?}", src, dst);

                // If the selected scratch register is stolen from the
                // set of in-use registers, then we need to save the
                // current contents of the scratch register before using
                // it as a temporary.
                if let Some(save_slot) = save_slot {
                    // However, we may have already done so for an earlier
                    // stack-to-stack move, in which case we don't need
                    // to do it again.
                    if save_dirty {
                        debug_assert!(!scratch_dirty);
                        result.push((scratch_reg, save_slot, T::default()));
                        save_dirty = false;
                    }
                }

                // We can't move directly from one stack slot to another
                // on any architecture we care about, so stack-to-stack
                // moves must go via a scratch register.
                result.push((src, scratch_reg, data));
                result.push((scratch_reg, dst, data));
                scratch_dirty = true;
            } else {
                // This is not a stack-to-stack move, but we need to
                // make sure that the scratch register is in the correct
                // state if this move interacts with that register.
                if src == scratch_reg && scratch_dirty {
                    // We're copying from the scratch register so if
                    // it was stolen for a stack-to-stack move then we
                    // need to make sure it has the correct contents,
                    // not whatever was temporarily copied into it. If
                    // we got scratch_reg from find_free_reg then it
                    // had better not have been used as the source of
                    // a move. So if we're here it's because we fell
                    // back to the caller-provided last-resort scratch
                    // register, and we must therefore have a save-slot
                    // allocated too.
                    debug_assert!(!save_dirty);
                    let save_slot = save_slot.expect("move source should not be a free register");
                    result.push((save_slot, scratch_reg, T::default()));
                    scratch_dirty = false;
                }
                if dst == scratch_reg {
                    // We are writing something to the scratch register
                    // so it doesn't matter what was there before. We
                    // can avoid restoring it, but we will need to save
                    // it again before the next stack-to-stack move.
                    scratch_dirty = false;
                    save_dirty = true;
                }
                result.push((src, dst, data));
            }
        }

        // Now that all the stack-to-stack moves are done, restore the
        // scratch register if necessary.
        if let Some(save_slot) = save_slot {
            if scratch_dirty {
                debug_assert!(!save_dirty);
                result.push((save_slot, scratch_reg, T::default()));
            }
        }

        trace!("scratch resolver: got {:?}", result);
        result
    }

    fn is_stack_to_stack_move(&self, src: Allocation, dst: Allocation) -> bool {
        (self.is_stack_alloc)(src) && (self.is_stack_alloc)(dst)
    }
}