linux/arch/x86/include/asm/qspinlock_paravirt.h

#ifndef __ASM_QSPINLOCK_PARAVIRT_H
#define __ASM_QSPINLOCK_PARAVIRT_H

/*
 * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
 * registers. For i386, however, only 1 32-bit register needs to be saved
 * and restored. So an optimized version of __pv_queued_spin_unlock() is
 * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
 */
#ifdef CONFIG_64BIT

/*
 * Wrap the unlock slowpath in the register-saving thunk; the hand-coded
 * assembly below calls it through PV_UNLOCK_SLOWPATH.
 */
PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
/*
 * Self-referential define: makes the name testable with #ifdef/#ifndef.
 * NOTE(review): presumably the generic PV qspinlock code checks this to skip
 * its own C version of __pv_queued_spin_unlock() -- confirm at the includer.
 */
#define __pv_queued_spin_unlock	__pv_queued_spin_unlock
/*
 * Assembler symbol names as string literals, for pasting into asm().
 * PV_UNLOCK_SLOWPATH is expected to match the symbol emitted by the
 * PV_CALLEE_SAVE_REGS_THUNK() invocation above (macro body not visible
 * here -- TODO confirm the "__raw_callee_save_" prefix).
 */
#define PV_UNLOCK		"__raw_callee_save___pv_queued_spin_unlock"
#define PV_UNLOCK_SLOWPATH	"__raw_callee_save___pv_queued_spin_unlock_slowpath"

/*
 * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
 * which combines the register-saving thunk and the body of the following
 * C code:
 *
 * void __pv_queued_spin_unlock(struct qspinlock *lock)
 * {
 *	struct __qspinlock *l = (void *)lock;
 *	u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
 *
 *	if (likely(lockval == _Q_LOCKED_VAL))
 *		return;
 *	__pv_queued_spin_unlock_slowpath(lock, lockval);
 * }
 *
 * For x86-64,
 *   rdi = lock              (first argument)
 *   rsi = lockval           (second argument)
 *   rdx = internal variable (set to 0)
 */
/*
 * Emits the global symbol named by PV_UNLOCK into .text.
 *
 * The symbol is marked as an ELF function (.type) so that, together with
 * the .size directive at the end, it shows up as a sized FUNC symbol rather
 * than an untyped one.
 *
 * Only %rdx (and %rsi on the slow path) are saved here; %rax is used as
 * scratch without being saved.
 */
asm    (".pushsection .text;"
	".globl " PV_UNLOCK ";"
	".type " PV_UNLOCK ", @function;"
	".align 4,0x90;"		/* pad with 0x90 (nop) bytes */
	PV_UNLOCK ": "
	"push  %rdx;"			/* rdx is used as scratch below */
	"mov   $0x1,%eax;"		/* al = value expected in the lock byte */
	"xor   %edx,%edx;"		/* dl = 0, the value to store on success */
	"lock cmpxchg %dl,(%rdi);"	/* if byte at (rdi) == al, store dl; else al = byte */
	"cmp   $0x1,%al;"		/* al still 0x1 means the exchange happened */
	"jne   .slowpath;"
	"pop   %rdx;"
	"ret;"
	".slowpath: "
	"push   %rsi;"			/* rsi carries the second argument; preserve it */
	"movzbl %al,%esi;"		/* zero-extend the observed lock byte into rsi */
	"call " PV_UNLOCK_SLOWPATH ";"	/* rdi still holds the lock pointer */
	"pop    %rsi;"
	"pop    %rdx;"
	"ret;"
	".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
	".popsection");

#else /* CONFIG_64BIT */

/*
 * 32-bit: no hand-coded fast path. __pv_queued_spin_unlock() is only
 * declared here (its definition is not in this header) and is wrapped by
 * the generic register-saving thunk.
 */
extern void __pv_queued_spin_unlock(struct qspinlock *lock);
PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);

#endif /* CONFIG_64BIT */
#endif