qmk_firmware/lib/lib8tion/math8.h

#ifndef __INC_LIB8TION_MATH_H
#define __INC_LIB8TION_MATH_H

#include "scale8.h"

///@ingroup lib8tion

///@defgroup Math Basic math operations
/// Fast, efficient 8-bit math functions specifically
/// designed for high-performance LED programming.
///
/// Because of the AVR(Arduino) and ARM assembly language
/// implementations provided, using these functions often
/// results in smaller and faster code than the equivalent
/// program using plain "C" arithmetic and logic.
///@{


/// add one byte to another, saturating at 0xFF
/// @param i - first byte to add
/// @param j - second byte to add
/// @returns the sum of i & j, capped at 0xFF
LIB8STATIC_ALWAYS_INLINE uint8_t qadd8( uint8_t i, uint8_t j)
{
#if QADD8_C == 1
    uint16_t t = i + j;
    if (t > 255) t = 255;
    return t;
#elif QADD8_AVRASM == 1
    asm volatile(
         /* First, add j to i, conditioning the C flag */
         "add %0, %1    \n\t"

         /* Now test the C flag.
           If C is clear, we branch around a load of 0xFF into i.
           If C is set, we go ahead and load 0xFF into i.
         */
         "brcc L_%=     \n\t"
         "ldi %0, 0xFF  \n\t"
         "L_%=: "
         : "+a" (i)
         : "a"  (j) );
    return i;
#elif QADD8_ARM_DSP_ASM == 1
    asm volatile( "uqadd8 %0, %0, %1" : "+r" (i) : "r" (j));
    return i;
#else
#error "No implementation for qadd8 available."
#endif
}

/// Add one byte to another, saturating at 0x7F
/// @param i - first byte to add
/// @param j - second byte to add
/// @returns the sum of i & j, capped at 0xFF
LIB8STATIC_ALWAYS_INLINE int8_t qadd7( int8_t i, int8_t j)
{
#if QADD7_C == 1
    int16_t t = i + j;
    if (t > 127) t = 127;
    return t;
#elif QADD7_AVRASM == 1
    asm volatile(
         /* First, add j to i, conditioning the V flag */
         "add %0, %1    \n\t"

         /* Now test the V flag.
          If V is clear, we branch around a load of 0x7F into i.
          If V is set, we go ahead and load 0x7F into i.
          */
         "brvc L_%=     \n\t"
         "ldi %0, 0x7F  \n\t"
         "L_%=: "
         : "+a" (i)
         : "a"  (j) );

    return i;
#elif QADD7_ARM_DSP_ASM == 1
    asm volatile( "qadd8 %0, %0, %1" : "+r" (i) : "r" (j));
    return i;
#else
#error "No implementation for qadd7 available."
#endif
}

/// subtract one byte from another, saturating at 0x00
/// @returns i - j with a floor of 0
LIB8STATIC_ALWAYS_INLINE uint8_t qsub8( uint8_t i, uint8_t j)
{
#if QSUB8_C == 1
    int16_t t = i - j;
    if (t < 0) t = 0;
    return t;
#elif QSUB8_AVRASM == 1

    asm volatile(
         /* First, subtract j from i, conditioning the C flag */
         "sub %0, %1    \n\t"

         /* Now test the C flag.
          If C is clear, we branch around a load of 0x00 into i.
          If C is set, we go ahead and load 0x00 into i.
          */
         "brcc L_%=     \n\t"
         "ldi %0, 0x00  \n\t"
         "L_%=: "
         : "+a" (i)
         : "a"  (j) );

    return i;
#else
#error "No implementation for qsub8 available."
#endif
}

/// add one byte to another, with one byte result
LIB8STATIC_ALWAYS_INLINE uint8_t add8( uint8_t i, uint8_t j)
{
#if ADD8_C == 1
    uint16_t t = i + j;
    return t;
#elif ADD8_AVRASM == 1
    // Add j to i, period.
    asm volatile( "add %0, %1" : "+a" (i) : "a" (j));
    return i;
#else
#error "No implementation for add8 available."
#endif
}

/// add one byte to another, with one byte result
LIB8STATIC_ALWAYS_INLINE uint16_t add8to16( uint8_t i, uint16_t j)
{
#if ADD8_C == 1
    uint16_t t = i + j;
    return t;
#elif ADD8_AVRASM == 1
    // Add i(one byte) to j(two bytes)
    asm volatile( "add %A[j], %[i]              \n\t"
                  "adc %B[j], __zero_reg__      \n\t"
                 : [j] "+a" (j)
                 : [i] "a"  (i)
                 );
    return i;
#else
#error "No implementation for add8to16 available."
#endif
}


/// subtract one byte from another, 8-bit result
LIB8STATIC_ALWAYS_INLINE uint8_t sub8( uint8_t i, uint8_t j)
{
#if SUB8_C == 1
    int16_t t = i - j;
    return t;
#elif SUB8_AVRASM == 1
    // Subtract j from i, period.
    asm volatile( "sub %0, %1" : "+a" (i) : "a" (j));
    return i;
#else
#error "No implementation for sub8 available."
#endif
}

/// Calculate an integer average of two unsigned
///       8-bit integer values (uint8_t).
///       Fractional results are rounded down, e.g. avg8(20,41) = 30
LIB8STATIC_ALWAYS_INLINE uint8_t avg8( uint8_t i, uint8_t j)
{
#if AVG8_C == 1
    return (i + j) >> 1;
#elif AVG8_AVRASM == 1
    asm volatile(
         /* First, add j to i, 9th bit overflows into C flag */
         "add %0, %1    \n\t"
         /* Divide by two, moving C flag into high 8th bit */
         "ror %0        \n\t"
         : "+a" (i)
         : "a"  (j) );
    return i;
#else
#error "No implementation for avg8 available."
#endif
}

/// Calculate an integer average of two unsigned
///       16-bit integer values (uint16_t).
///       Fractional results are rounded down, e.g. avg16(20,41) = 30
LIB8STATIC_ALWAYS_INLINE uint16_t avg16( uint16_t i, uint16_t j)
{
#if AVG16_C == 1
    return (uint32_t)((uint32_t)(i) + (uint32_t)(j)) >> 1;
#elif AVG16_AVRASM == 1
    asm volatile(
                 /* First, add jLo (heh) to iLo, 9th bit overflows into C flag */
                 "add %A[i], %A[j]    \n\t"
                 /* Now, add C + jHi to iHi, 17th bit overflows into C flag */
                 "adc %B[i], %B[j]    \n\t"
                 /* Divide iHi by two, moving C flag into high 16th bit, old 9th bit now in C */
                 "ror %B[i]        \n\t"
                 /* Divide iLo by two, moving C flag into high 8th bit */
                 "ror %A[i]        \n\t"
                 : [i] "+a" (i)
                 : [j] "a"  (j) );
    return i;
#else
#error "No implementation for avg16 available."
#endif
}


/// Calculate an integer average of two signed 7-bit
///       integers (int8_t)
///       If the first argument is even, result is rounded down.
///       If the first argument is odd, result is result up.
LIB8STATIC_ALWAYS_INLINE int8_t avg7( int8_t i, int8_t j)
{
#if AVG7_C == 1
    return ((i + j) >> 1) + (i & 0x1);
#elif AVG7_AVRASM == 1
    asm volatile(
                 "asr %1        \n\t"
                 "asr %0        \n\t"
                 "adc %0, %1    \n\t"
                 : "+a" (i)
                 : "a"  (j) );
    return i;
#else
#error "No implementation for avg7 available."
#endif
}

/// Calculate an integer average of two signed 15-bit
///       integers (int16_t)
///       If the first argument is even, result is rounded down.
///       If the first argument is odd, result is result up.
LIB8STATIC_ALWAYS_INLINE int16_t avg15( int16_t i, int16_t j)
{
#if AVG15_C == 1
    return ((int32_t)((int32_t)(i) + (int32_t)(j)) >> 1) + (i & 0x1);
#elif AVG15_AVRASM == 1
    asm volatile(
                 /* first divide j by 2, throwing away lowest bit */
                 "asr %B[j]          \n\t"
                 "ror %A[j]          \n\t"
                 /* now divide i by 2, with lowest bit going into C */
                 "asr %B[i]          \n\t"
                 "ror %A[i]          \n\t"
                 /* add j + C to i */
                 "adc %A[i], %A[j]   \n\t"
                 "adc %B[i], %B[j]   \n\t"
                 : [i] "+a" (i)
                 : [j] "a"  (j) );
    return i;
#else
#error "No implementation for avg15 available."
#endif
}


///       Calculate the remainder of one unsigned 8-bit
///       value divided by anoter, aka A % M.
///       Implemented by repeated subtraction, which is
///       very compact, and very fast if A is 'probably'
///       less than M.  If A is a large multiple of M,
///       the loop has to execute multiple times.  However,
///       even in that case, the loop is only two
///       instructions long on AVR, i.e., quick.
LIB8STATIC_ALWAYS_INLINE uint8_t mod8( uint8_t a, uint8_t m)
{
#if defined(__AVR__)
    asm volatile (
                  "L_%=:  sub %[a],%[m]    \n\t"
                  "       brcc L_%=        \n\t"
                  "       add %[a],%[m]    \n\t"
                  : [a] "+r" (a)
                  : [m] "r"  (m)
                  );
#else
    while( a >= m) a -= m;
#endif
    return a;
}

///          Add two numbers, and calculate the modulo
///          of the sum and a third number, M.
///          In other words, it returns (A+B) % M.
///          It is designed as a compact mechanism for
///          incrementing a 'mode' switch and wrapping
///          around back to 'mode 0' when the switch
///          goes past the end of the available range.
///          e.g. if you have seven modes, this switches
///          to the next one and wraps around if needed:
///            mode = addmod8( mode, 1, 7);
///LIB8STATIC_ALWAYS_INLINESee 'mod8' for notes on performance.
LIB8STATIC uint8_t addmod8( uint8_t a, uint8_t b, uint8_t m)
{
#if defined(__AVR__)
    asm volatile (
                  "       add %[a],%[b]    \n\t"
                  "L_%=:  sub %[a],%[m]    \n\t"
                  "       brcc L_%=        \n\t"
                  "       add %[a],%[m]    \n\t"
                  : [a] "+r" (a)
                  : [b] "r"  (b), [m] "r" (m)
                  );
#else
    a += b;
    while( a >= m) a -= m;
#endif
    return a;
}

///          Subtract two numbers, and calculate the modulo
///          of the difference and a third number, M.
///          In other words, it returns (A-B) % M.
///          It is designed as a compact mechanism for
///          incrementing a 'mode' switch and wrapping
///          around back to 'mode 0' when the switch
///          goes past the end of the available range.
///          e.g. if you have seven modes, this switches
///          to the next one and wraps around if needed:
///            mode = addmod8( mode, 1, 7);
///LIB8STATIC_ALWAYS_INLINESee 'mod8' for notes on performance.
LIB8STATIC uint8_t submod8( uint8_t a, uint8_t b, uint8_t m)
{
#if defined(__AVR__)
    asm volatile (
                  "       sub %[a],%[b]    \n\t"
                  "L_%=:  sub %[a],%[m]    \n\t"
                  "       brcc L_%=        \n\t"
                  "       add %[a],%[m]    \n\t"
                  : [a] "+r" (a)
                  : [b] "r"  (b), [m] "r" (m)
                  );
#else
    a -= b;
    while( a >= m) a -= m;
#endif
    return a;
}

/// 8x8 bit multiplication, with 8 bit result
LIB8STATIC_ALWAYS_INLINE uint8_t mul8( uint8_t i, uint8_t j)
{
#if MUL8_C == 1
    return ((uint16_t)i * (uint16_t)(j) ) & 0xFF;
#elif MUL8_AVRASM == 1
    asm volatile(
         /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
         "mul %0, %1          \n\t"
         /* Extract the LOW 8-bits (r0) */
         "mov %0, r0          \n\t"
         /* Restore r1 to "0"; it's expected to always be that */
         "clr __zero_reg__    \n\t"
         : "+a" (i)
         : "a"  (j)
         : "r0", "r1");

    return i;
#else
#error "No implementation for mul8 available."
#endif
}


/// saturating 8x8 bit multiplication, with 8 bit result
/// @returns the product of i * j, capping at 0xFF
LIB8STATIC_ALWAYS_INLINE uint8_t qmul8( uint8_t i, uint8_t j)
{
#if QMUL8_C == 1
    int p = ((uint16_t)i * (uint16_t)(j) );
    if( p > 255) p = 255;
    return p;
#elif QMUL8_AVRASM == 1
    asm volatile(
                 /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
                 "  mul %0, %1          \n\t"
                 /* If high byte of result is zero, all is well. */
                 "  tst r1              \n\t"
                 "  breq Lnospill_%=    \n\t"
                 /* If high byte of result > 0, saturate low byte to 0xFF */
                 "  ldi %0,0xFF         \n\t"
                 "  rjmp Ldone_%=       \n\t"
                 "Lnospill_%=:          \n\t"
                 /* Extract the LOW 8-bits (r0) */
                 "  mov %0, r0          \n\t"
                 "Ldone_%=:             \n\t"
                 /* Restore r1 to "0"; it's expected to always be that */
                 "  clr __zero_reg__    \n\t"
                 : "+a" (i)
                 : "a"  (j)
                 : "r0", "r1");

    return i;
#else
#error "No implementation for qmul8 available."
#endif
}


/// take abs() of a signed 8-bit uint8_t
LIB8STATIC_ALWAYS_INLINE int8_t abs8( int8_t i)
{
#if ABS8_C == 1
    if( i < 0) i = -i;
    return i;
#elif ABS8_AVRASM == 1


    asm volatile(
         /* First, check the high bit, and prepare to skip if it's clear */
         "sbrc %0, 7 \n"

         /* Negate the value */
         "neg %0     \n"

         : "+r" (i) : "r" (i) );
    return i;
#else
#error "No implementation for abs8 available."
#endif
}

///         square root for 16-bit integers
///         About three times faster and five times smaller
///         than Arduino's general sqrt on AVR.
LIB8STATIC uint8_t sqrt16(uint16_t x)
{
    if( x <= 1) {
        return x;
    }

    uint8_t low = 1; // lower bound
    uint8_t hi, mid;

    if( x > 7904) {
        hi = 255;
    } else {
        hi = (x >> 5) + 8; // initial estimate for upper bound
    }

    do {
        mid = (low + hi) >> 1;
        if ((uint16_t)(mid * mid) > x) {
            hi = mid - 1;
        } else {
            if( mid == 255) {
                return 255;
            }
            low = mid + 1;
        }
    } while (hi >= low);

    return low - 1;
}

/// blend a variable proproportion(0-255) of one byte to another
/// @param a - the starting byte value
/// @param b - the byte value to blend toward
/// @param amountOfB - the proportion (0-255) of b to blend
/// @returns a byte value between a and b, inclusive
#if (FASTLED_BLEND_FIXED == 1)
LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
{
#if BLEND8_C == 1
    uint16_t partial;
    uint8_t result;

    uint8_t amountOfA = 255 - amountOfB;

    partial = (a * amountOfA);
#if (FASTLED_SCALE8_FIXED == 1)
    partial += a;
    //partial = add8to16( a, partial);
#endif

    partial += (b * amountOfB);
#if (FASTLED_SCALE8_FIXED == 1)
    partial += b;
    //partial = add8to16( b, partial);
#endif

    result = partial >> 8;

    return result;

#elif BLEND8_AVRASM == 1
    uint16_t partial;
    uint8_t result;

    asm volatile (
        /* partial = b * amountOfB */
        "  mul %[b], %[amountOfB]        \n\t"
        "  movw %A[partial], r0          \n\t"

        /* amountOfB (aka amountOfA) = 255 - amountOfB */
        "  com %[amountOfB]              \n\t"

        /* partial += a * amountOfB (aka amountOfA) */
        "  mul %[a], %[amountOfB]        \n\t"

        "  add %A[partial], r0           \n\t"
        "  adc %B[partial], r1           \n\t"

        "  clr __zero_reg__              \n\t"

#if (FASTLED_SCALE8_FIXED == 1)
        /* partial += a */
        "  add %A[partial], %[a]         \n\t"
        "  adc %B[partial], __zero_reg__ \n\t"

        // partial += b
        "  add %A[partial], %[b]         \n\t"
        "  adc %B[partial], __zero_reg__ \n\t"
#endif

        : [partial] "=r" (partial),
          [amountOfB] "+a" (amountOfB)
        : [a] "a" (a),
          [b] "a" (b)
        : "r0", "r1"
    );

    result = partial >> 8;

    return result;

#else
#error "No implementation for blend8 available."
#endif
}

#else
LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
{
    // This version loses precision in the integer math
    // and can actually return results outside of the range
    // from a to b.  Its use is not recommended.
    uint8_t result;
    uint8_t amountOfA = 255 - amountOfB;
    result = scale8_LEAVING_R1_DIRTY( a, amountOfA)
           + scale8_LEAVING_R1_DIRTY( b, amountOfB);
    cleanup_R1();
    return result;
}
#endif


///@}
#endif
RGB Matrix Overhaul (#5372) * RGB Matrix overhaul Breakout of animations to separate files Integration of optimized int based math lib Overhaul of rgb_matrix.c and animations for performance * Updating effect function api for future extensions * Combined the keypresses \|\| keyreleases define checks into a single define so I stop forgetting it where necessary * Moving define RGB_MATRIX_KEYREACTIVE_ENABLED earlier in the include chain 7 years ago			`#ifndef __INC_LIB8TION_MATH_H`
			`#define __INC_LIB8TION_MATH_H`

			`#include "scale8.h"`

			`///@ingroup lib8tion`

			`///@defgroup Math Basic math operations`
			`/// Fast, efficient 8-bit math functions specifically`
			`/// designed for high-performance LED programming.`
			`///`
			`/// Because of the AVR(Arduino) and ARM assembly language`
			`/// implementations provided, using these functions often`
			`/// results in smaller and faster code than the equivalent`
			`/// program using plain "C" arithmetic and logic.`
			`///@{`


			`/// add one byte to another, saturating at 0xFF`
			`/// @param i - first byte to add`
			`/// @param j - second byte to add`
			`/// @returns the sum of i & j, capped at 0xFF`
			`LIB8STATIC_ALWAYS_INLINE uint8_t qadd8( uint8_t i, uint8_t j)`
			`{`
			`#if QADD8_C == 1`
			`uint16_t t = i + j;`
			`if (t > 255) t = 255;`
			`return t;`
			`#elif QADD8_AVRASM == 1`
			`asm volatile(`
			`/* First, add j to i, conditioning the C flag */`
			`"add %0, %1 \n\t"`

			`/* Now test the C flag.`
			`If C is clear, we branch around a load of 0xFF into i.`
			`If C is set, we go ahead and load 0xFF into i.`
			`*/`
			`"brcc L_%= \n\t"`
			`"ldi %0, 0xFF \n\t"`
			`"L_%=: "`
			`: "+a" (i)`
			`: "a" (j) );`
			`return i;`
			`#elif QADD8_ARM_DSP_ASM == 1`
			`asm volatile( "uqadd8 %0, %0, %1" : "+r" (i) : "r" (j));`
			`return i;`
			`#else`
			`#error "No implementation for qadd8 available."`
			`#endif`
			`}`

			`/// Add one byte to another, saturating at 0x7F`
			`/// @param i - first byte to add`
			`/// @param j - second byte to add`
			`/// @returns the sum of i & j, capped at 0xFF`
			`LIB8STATIC_ALWAYS_INLINE int8_t qadd7( int8_t i, int8_t j)`
			`{`
			`#if QADD7_C == 1`
			`int16_t t = i + j;`
			`if (t > 127) t = 127;`
			`return t;`
			`#elif QADD7_AVRASM == 1`
			`asm volatile(`
			`/* First, add j to i, conditioning the V flag */`
			`"add %0, %1 \n\t"`

			`/* Now test the V flag.`
			`If V is clear, we branch around a load of 0x7F into i.`
			`If V is set, we go ahead and load 0x7F into i.`
			`*/`
			`"brvc L_%= \n\t"`
			`"ldi %0, 0x7F \n\t"`
			`"L_%=: "`
			`: "+a" (i)`
			`: "a" (j) );`

			`return i;`
			`#elif QADD7_ARM_DSP_ASM == 1`
			`asm volatile( "qadd8 %0, %0, %1" : "+r" (i) : "r" (j));`
			`return i;`
			`#else`
			`#error "No implementation for qadd7 available."`
			`#endif`
			`}`

			`/// subtract one byte from another, saturating at 0x00`
			`/// @returns i - j with a floor of 0`
			`LIB8STATIC_ALWAYS_INLINE uint8_t qsub8( uint8_t i, uint8_t j)`
			`{`
			`#if QSUB8_C == 1`
			`int16_t t = i - j;`
			`if (t < 0) t = 0;`
			`return t;`
			`#elif QSUB8_AVRASM == 1`

			`asm volatile(`
			`/* First, subtract j from i, conditioning the C flag */`
			`"sub %0, %1 \n\t"`

			`/* Now test the C flag.`
			`If C is clear, we branch around a load of 0x00 into i.`
			`If C is set, we go ahead and load 0x00 into i.`
			`*/`
			`"brcc L_%= \n\t"`
			`"ldi %0, 0x00 \n\t"`
			`"L_%=: "`
			`: "+a" (i)`
			`: "a" (j) );`

			`return i;`
			`#else`
			`#error "No implementation for qsub8 available."`
			`#endif`
			`}`

			`/// add one byte to another, with one byte result`
			`LIB8STATIC_ALWAYS_INLINE uint8_t add8( uint8_t i, uint8_t j)`
			`{`
			`#if ADD8_C == 1`
			`uint16_t t = i + j;`
			`return t;`
			`#elif ADD8_AVRASM == 1`
			`// Add j to i, period.`
			`asm volatile( "add %0, %1" : "+a" (i) : "a" (j));`
			`return i;`
			`#else`
			`#error "No implementation for add8 available."`
			`#endif`
			`}`

			`/// add one byte to another, with one byte result`
			`LIB8STATIC_ALWAYS_INLINE uint16_t add8to16( uint8_t i, uint16_t j)`
			`{`
			`#if ADD8_C == 1`
			`uint16_t t = i + j;`
			`return t;`
			`#elif ADD8_AVRASM == 1`
			`// Add i(one byte) to j(two bytes)`
			`asm volatile( "add %A[j], %[i] \n\t"`
			`"adc %B[j], __zero_reg__ \n\t"`
			`: [j] "+a" (j)`
			`: [i] "a" (i)`
			`);`
			`return i;`
			`#else`
			`#error "No implementation for add8to16 available."`
			`#endif`
			`}`


			`/// subtract one byte from another, 8-bit result`
			`LIB8STATIC_ALWAYS_INLINE uint8_t sub8( uint8_t i, uint8_t j)`
			`{`
			`#if SUB8_C == 1`
			`int16_t t = i - j;`
			`return t;`
			`#elif SUB8_AVRASM == 1`
			`// Subtract j from i, period.`
			`asm volatile( "sub %0, %1" : "+a" (i) : "a" (j));`
			`return i;`
			`#else`
			`#error "No implementation for sub8 available."`
			`#endif`
			`}`

			`/// Calculate an integer average of two unsigned`
			`/// 8-bit integer values (uint8_t).`
			`/// Fractional results are rounded down, e.g. avg8(20,41) = 30`
			`LIB8STATIC_ALWAYS_INLINE uint8_t avg8( uint8_t i, uint8_t j)`
			`{`
			`#if AVG8_C == 1`
			`return (i + j) >> 1;`
			`#elif AVG8_AVRASM == 1`
			`asm volatile(`
			`/* First, add j to i, 9th bit overflows into C flag */`
			`"add %0, %1 \n\t"`
			`/* Divide by two, moving C flag into high 8th bit */`
			`"ror %0 \n\t"`
			`: "+a" (i)`
			`: "a" (j) );`
			`return i;`
			`#else`
			`#error "No implementation for avg8 available."`
			`#endif`
			`}`

			`/// Calculate an integer average of two unsigned`
			`/// 16-bit integer values (uint16_t).`
			`/// Fractional results are rounded down, e.g. avg16(20,41) = 30`
			`LIB8STATIC_ALWAYS_INLINE uint16_t avg16( uint16_t i, uint16_t j)`
			`{`
			`#if AVG16_C == 1`
			`return (uint32_t)((uint32_t)(i) + (uint32_t)(j)) >> 1;`
			`#elif AVG16_AVRASM == 1`
			`asm volatile(`
			`/* First, add jLo (heh) to iLo, 9th bit overflows into C flag */`
			`"add %A[i], %A[j] \n\t"`
			`/* Now, add C + jHi to iHi, 17th bit overflows into C flag */`
			`"adc %B[i], %B[j] \n\t"`
			`/* Divide iHi by two, moving C flag into high 16th bit, old 9th bit now in C */`
			`"ror %B[i] \n\t"`
			`/* Divide iLo by two, moving C flag into high 8th bit */`
			`"ror %A[i] \n\t"`
			`: [i] "+a" (i)`
			`: [j] "a" (j) );`
			`return i;`
			`#else`
			`#error "No implementation for avg16 available."`
			`#endif`
			`}`


			`/// Calculate an integer average of two signed 7-bit`
			`/// integers (int8_t)`
			`/// If the first argument is even, result is rounded down.`
			`/// If the first argument is odd, result is result up.`
			`LIB8STATIC_ALWAYS_INLINE int8_t avg7( int8_t i, int8_t j)`
			`{`
			`#if AVG7_C == 1`
			`return ((i + j) >> 1) + (i & 0x1);`
			`#elif AVG7_AVRASM == 1`
			`asm volatile(`
			`"asr %1 \n\t"`
			`"asr %0 \n\t"`
			`"adc %0, %1 \n\t"`
			`: "+a" (i)`
			`: "a" (j) );`
			`return i;`
			`#else`
			`#error "No implementation for avg7 available."`
			`#endif`
			`}`

			`/// Calculate an integer average of two signed 15-bit`
			`/// integers (int16_t)`
			`/// If the first argument is even, result is rounded down.`
			`/// If the first argument is odd, result is result up.`
			`LIB8STATIC_ALWAYS_INLINE int16_t avg15( int16_t i, int16_t j)`
			`{`
			`#if AVG15_C == 1`
			`return ((int32_t)((int32_t)(i) + (int32_t)(j)) >> 1) + (i & 0x1);`
			`#elif AVG15_AVRASM == 1`
			`asm volatile(`
			`/* first divide j by 2, throwing away lowest bit */`
			`"asr %B[j] \n\t"`
			`"ror %A[j] \n\t"`
			`/* now divide i by 2, with lowest bit going into C */`
			`"asr %B[i] \n\t"`
			`"ror %A[i] \n\t"`
			`/* add j + C to i */`
			`"adc %A[i], %A[j] \n\t"`
			`"adc %B[i], %B[j] \n\t"`
			`: [i] "+a" (i)`
			`: [j] "a" (j) );`
			`return i;`
			`#else`
			`#error "No implementation for avg15 available."`
			`#endif`
			`}`


			`/// Calculate the remainder of one unsigned 8-bit`
			`/// value divided by anoter, aka A % M.`
			`/// Implemented by repeated subtraction, which is`
			`/// very compact, and very fast if A is 'probably'`
			`/// less than M. If A is a large multiple of M,`
			`/// the loop has to execute multiple times. However,`
			`/// even in that case, the loop is only two`
			`/// instructions long on AVR, i.e., quick.`
			`LIB8STATIC_ALWAYS_INLINE uint8_t mod8( uint8_t a, uint8_t m)`
			`{`
			`#if defined(__AVR__)`
			`asm volatile (`
			`"L_%=: sub %[a],%[m] \n\t"`
			`" brcc L_%= \n\t"`
			`" add %[a],%[m] \n\t"`
			`: [a] "+r" (a)`
			`: [m] "r" (m)`
			`);`
			`#else`
			`while( a >= m) a -= m;`
			`#endif`
			`return a;`
			`}`

			`/// Add two numbers, and calculate the modulo`
			`/// of the sum and a third number, M.`
			`/// In other words, it returns (A+B) % M.`
			`/// It is designed as a compact mechanism for`
			`/// incrementing a 'mode' switch and wrapping`
			`/// around back to 'mode 0' when the switch`
			`/// goes past the end of the available range.`
			`/// e.g. if you have seven modes, this switches`
			`/// to the next one and wraps around if needed:`
			`/// mode = addmod8( mode, 1, 7);`
			`///LIB8STATIC_ALWAYS_INLINESee 'mod8' for notes on performance.`
			`LIB8STATIC uint8_t addmod8( uint8_t a, uint8_t b, uint8_t m)`
			`{`
			`#if defined(__AVR__)`
			`asm volatile (`
			`" add %[a],%[b] \n\t"`
			`"L_%=: sub %[a],%[m] \n\t"`
			`" brcc L_%= \n\t"`
			`" add %[a],%[m] \n\t"`
			`: [a] "+r" (a)`
			`: [b] "r" (b), [m] "r" (m)`
			`);`
			`#else`
			`a += b;`
			`while( a >= m) a -= m;`
			`#endif`
			`return a;`
			`}`

			`/// Subtract two numbers, and calculate the modulo`
			`/// of the difference and a third number, M.`
			`/// In other words, it returns (A-B) % M.`
			`/// It is designed as a compact mechanism for`
			`/// incrementing a 'mode' switch and wrapping`
			`/// around back to 'mode 0' when the switch`
			`/// goes past the end of the available range.`
			`/// e.g. if you have seven modes, this switches`
			`/// to the next one and wraps around if needed:`
			`/// mode = addmod8( mode, 1, 7);`
			`///LIB8STATIC_ALWAYS_INLINESee 'mod8' for notes on performance.`
			`LIB8STATIC uint8_t submod8( uint8_t a, uint8_t b, uint8_t m)`
			`{`
			`#if defined(__AVR__)`
			`asm volatile (`
			`" sub %[a],%[b] \n\t"`
			`"L_%=: sub %[a],%[m] \n\t"`
			`" brcc L_%= \n\t"`
			`" add %[a],%[m] \n\t"`
			`: [a] "+r" (a)`
			`: [b] "r" (b), [m] "r" (m)`
			`);`
			`#else`
			`a -= b;`
			`while( a >= m) a -= m;`
			`#endif`
			`return a;`
			`}`

			`/// 8x8 bit multiplication, with 8 bit result`
			`LIB8STATIC_ALWAYS_INLINE uint8_t mul8( uint8_t i, uint8_t j)`
			`{`
			`#if MUL8_C == 1`
			`return ((uint16_t)i * (uint16_t)(j) ) & 0xFF;`
			`#elif MUL8_AVRASM == 1`
			`asm volatile(`
			`/* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */`
			`"mul %0, %1 \n\t"`
			`/* Extract the LOW 8-bits (r0) */`
			`"mov %0, r0 \n\t"`
			`/* Restore r1 to "0"; it's expected to always be that */`
			`"clr __zero_reg__ \n\t"`
			`: "+a" (i)`
			`: "a" (j)`
			`: "r0", "r1");`

			`return i;`
			`#else`
			`#error "No implementation for mul8 available."`
			`#endif`
			`}`


			`/// saturating 8x8 bit multiplication, with 8 bit result`
			`/// @returns the product of i * j, capping at 0xFF`
			`LIB8STATIC_ALWAYS_INLINE uint8_t qmul8( uint8_t i, uint8_t j)`
			`{`
			`#if QMUL8_C == 1`
			`int p = ((uint16_t)i * (uint16_t)(j) );`
			`if( p > 255) p = 255;`
			`return p;`
			`#elif QMUL8_AVRASM == 1`
			`asm volatile(`
			`/* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */`
			`" mul %0, %1 \n\t"`
			`/* If high byte of result is zero, all is well. */`
			`" tst r1 \n\t"`
			`" breq Lnospill_%= \n\t"`
			`/* If high byte of result > 0, saturate low byte to 0xFF */`
			`" ldi %0,0xFF \n\t"`
			`" rjmp Ldone_%= \n\t"`
			`"Lnospill_%=: \n\t"`
			`/* Extract the LOW 8-bits (r0) */`
			`" mov %0, r0 \n\t"`
			`"Ldone_%=: \n\t"`
			`/* Restore r1 to "0"; it's expected to always be that */`
			`" clr __zero_reg__ \n\t"`
			`: "+a" (i)`
			`: "a" (j)`
			`: "r0", "r1");`

			`return i;`
			`#else`
			`#error "No implementation for qmul8 available."`
			`#endif`
			`}`


			`/// take abs() of a signed 8-bit uint8_t`
			`LIB8STATIC_ALWAYS_INLINE int8_t abs8( int8_t i)`
			`{`
			`#if ABS8_C == 1`
			`if( i < 0) i = -i;`
			`return i;`
			`#elif ABS8_AVRASM == 1`


			`asm volatile(`
			`/* First, check the high bit, and prepare to skip if it's clear */`
			`"sbrc %0, 7 \n"`

			`/* Negate the value */`
			`"neg %0 \n"`

			`: "+r" (i) : "r" (i) );`
			`return i;`
			`#else`
			`#error "No implementation for abs8 available."`
			`#endif`
			`}`

			`/// square root for 16-bit integers`
			`/// About three times faster and five times smaller`
			`/// than Arduino's general sqrt on AVR.`
			`LIB8STATIC uint8_t sqrt16(uint16_t x)`
			`{`
			`if( x <= 1) {`
			`return x;`
			`}`

			`uint8_t low = 1; // lower bound`
			`uint8_t hi, mid;`

			`if( x > 7904) {`
			`hi = 255;`
			`} else {`
			`hi = (x >> 5) + 8; // initial estimate for upper bound`
			`}`

			`do {`
			`mid = (low + hi) >> 1;`
			`if ((uint16_t)(mid * mid) > x) {`
			`hi = mid - 1;`
			`} else {`
			`if( mid == 255) {`
			`return 255;`
			`}`
			`low = mid + 1;`
			`}`
			`} while (hi >= low);`

			`return low - 1;`
			`}`

			`/// blend a variable proproportion(0-255) of one byte to another`
			`/// @param a - the starting byte value`
			`/// @param b - the byte value to blend toward`
			`/// @param amountOfB - the proportion (0-255) of b to blend`
			`/// @returns a byte value between a and b, inclusive`
			`#if (FASTLED_BLEND_FIXED == 1)`
			`LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)`
			`{`
			`#if BLEND8_C == 1`
			`uint16_t partial;`
			`uint8_t result;`

			`uint8_t amountOfA = 255 - amountOfB;`

			`partial = (a * amountOfA);`
			`#if (FASTLED_SCALE8_FIXED == 1)`
			`partial += a;`
			`//partial = add8to16( a, partial);`
			`#endif`

			`partial += (b * amountOfB);`
			`#if (FASTLED_SCALE8_FIXED == 1)`
			`partial += b;`
			`//partial = add8to16( b, partial);`
			`#endif`

			`result = partial >> 8;`

			`return result;`

			`#elif BLEND8_AVRASM == 1`
			`uint16_t partial;`
			`uint8_t result;`

			`asm volatile (`
			`/* partial = b * amountOfB */`
			`" mul %[b], %[amountOfB] \n\t"`
			`" movw %A[partial], r0 \n\t"`

			`/* amountOfB (aka amountOfA) = 255 - amountOfB */`
			`" com %[amountOfB] \n\t"`

			`/* partial += a * amountOfB (aka amountOfA) */`
			`" mul %[a], %[amountOfB] \n\t"`

			`" add %A[partial], r0 \n\t"`
			`" adc %B[partial], r1 \n\t"`

			`" clr __zero_reg__ \n\t"`

			`#if (FASTLED_SCALE8_FIXED == 1)`
			`/* partial += a */`
			`" add %A[partial], %[a] \n\t"`
			`" adc %B[partial], __zero_reg__ \n\t"`

			`// partial += b`
			`" add %A[partial], %[b] \n\t"`
			`" adc %B[partial], __zero_reg__ \n\t"`
			`#endif`

			`: [partial] "=r" (partial),`
			`[amountOfB] "+a" (amountOfB)`
			`: [a] "a" (a),`
			`[b] "a" (b)`
			`: "r0", "r1"`
			`);`

			`result = partial >> 8;`

			`return result;`

			`#else`
			`#error "No implementation for blend8 available."`
			`#endif`
			`}`

			`#else`
			`LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)`
			`{`
			`// This version loses precision in the integer math`
			`// and can actually return results outside of the range`
			`// from a to b. Its use is not recommended.`
			`uint8_t result;`
			`uint8_t amountOfA = 255 - amountOfB;`
			`result = scale8_LEAVING_R1_DIRTY( a, amountOfA)`
			`+ scale8_LEAVING_R1_DIRTY( b, amountOfB);`
			`cleanup_R1();`
			`return result;`
			`}`
			`#endif`


			`///@}`
			`#endif`