/* 19_inline_asm.c
 * Concept: Embedding x86_64 assembly directly inside a C program
 *
 * Why do this? Some CPU instructions have NO C equivalent.
 * Rather than calling a library (adding overhead), you can drop
 * one instruction inline and let the compiler handle everything
 * else around it.
 *
 * Syntax:
 *   __asm__ ("instruction" : outputs : inputs : clobbers);
 *
 * Constraint codes used below:
 *   "r"  -> place this value in any general-purpose register (read)
 *   "=r" -> write result into any general-purpose register (output)
 *   "+r" -> read AND write: same register is both input and output
 */

#include <stdio.h>

/* ---------------------------------------------------------------
 * Example 1: Basic syntax — add two numbers via inline asm
 *
 * The operands are referred to by symbolic name rather than by
 * position (%0, %1):
 *   [sum]    "+r" -> read-write register operand; it starts out
 *                    holding 'a' and ends up holding the result.
 *   [addend] "r"  -> read-only register operand holding 'b'.
 *
 * Generated instruction: addq <reg_addend>, <reg_sum>
 *
 * Returns: a + b, as computed by the addq instruction.
 * --------------------------------------------------------------- */
long add_asm(long a, long b) {
    long sum = a;   /* seed the read-write operand with the first term */

    __asm__ ("addq %[addend], %[sum]"
             : [sum] "+r"(sum)
             : [addend] "r"(b));

    return sum;
}

/* ---------------------------------------------------------------
 * Example 2: BSR — Bit Scan Reverse
 *
 * Finds the zero-based bit position of the highest set bit
 * using a single instruction.
 *
 * C equivalent for non-zero input (loop — multiple instructions):
 *   int pos = 0; unsigned long n = x; while (n >>= 1) pos++;
 *
 * Zero input: BSR leaves its destination register undefined when
 * the source operand is 0, so that case is handled in C and the
 * instruction is never executed for it.
 *
 * "=r"(pos) -> output: result written into a register -> pos
 * "r"(n)    -> input:  n is placed in a register
 * "cc"      -> clobber: BSR modifies the flags register
 *
 * Parameters:
 *   n - value to scan.
 * Returns:
 *   index of the most significant 1-bit (0 = least significant
 *   bit), or -1 if n == 0 (no bit is set).
 * --------------------------------------------------------------- */
long highest_bit(unsigned long n) {
    long pos;   /* must be 64-bit: bsrq writes a 64-bit register */

    if (n == 0) {
        return -1;   /* no bit set; bsrq's result would be undefined */
    }

    __asm__ ("bsrq %1, %0" : "=r"(pos) : "r"(n) : "cc");
    return pos;
}

/* ---------------------------------------------------------------
 * Example 3: POPCNT — Population Count
 *
 * Returns how many bits of 'n' are set to 1, using one popcntq
 * instruction. Typical uses: cryptography, compression, and
 * hamming-distance checks.
 *
 * C equivalent (loop):
 *   int count = 0; while (n) { count += n & 1; n >>= 1; }
 *
 * Operands (named rather than positional):
 *   [ones]  "=r" -> output register, stored into 'ones'
 *   [value] "r"  -> input register holding 'n'
 *
 * NOTE(review): popcnt is an ISA extension; this code does no
 * runtime feature check — confirm the target CPUs support it.
 * --------------------------------------------------------------- */
long popcount_asm(unsigned long n) {
    long ones;   /* 64-bit destination for popcntq */

    __asm__ ("popcntq %[value], %[ones]"
             : [ones] "=r"(ones)
             : [value] "r"(n));

    return ones;
}

/* ---------------------------------------------------------------
 * main: demonstrate all three functions
 *
 * Each example first evaluates its inline-asm helper(s) into
 * locals and then prints the results, so the computation and the
 * reporting are visibly separate. Always returns 0.
 * --------------------------------------------------------------- */
int main(void) {
    /* Example 1: addition */
    const long sum = add_asm(10, 32);

    printf("=== Inline Add ===\n");
    printf("add_asm(10, 32) = %ld\n\n", sum);

    /* Example 2: index of the highest set bit for 1, 8, 100, 128 */
    const long msb[] = {
        highest_bit(1),
        highest_bit(8),
        highest_bit(100),
        highest_bit(128),
    };

    printf("=== BSR: Position of Highest Set Bit ===\n");
    printf("highest_bit(0b00000001) = bit %ld  (value   1)\n", msb[0]);
    printf("highest_bit(0b00001000) = bit %ld  (value   8)\n", msb[1]);
    printf("highest_bit(0b01100100) = bit %ld  (value 100)\n", msb[2]);
    printf("highest_bit(0b10000000) = bit %ld  (value 128)\n\n", msb[3]);

    /* Example 3: number of 1-bits in 0x0F, 0xAA, 0xFFFFFFFF */
    const long ones[] = {
        popcount_asm(0x0F),
        popcount_asm(0xAA),
        popcount_asm(0xFFFFFFFF),
    };

    printf("=== POPCNT: Count of 1-Bits ===\n");
    printf("popcount(0b00001111) = %ld  (4 ones)\n", ones[0]);
    printf("popcount(0b10101010) = %ld  (4 ones)\n", ones[1]);
    printf("popcount(0xFFFFFFFF) = %ld  (32 ones)\n", ones[2]);

    return 0;
}
