qmk_firmware/lib/lib8tion/scale8.h

#ifndef __INC_LIB8TION_SCALE_H
#define __INC_LIB8TION_SCALE_H

///@ingroup lib8tion

///@defgroup Scaling Scaling functions
/// Fast, efficient 8-bit scaling functions specifically
/// designed for high-performance LED programming.
///
/// Because of the AVR(Arduino) and ARM assembly language
/// implementations provided, using these functions often
/// results in smaller and faster code than the equivalent
/// program using plain "C" arithmetic and logic.
///@{

///  Scale one byte by a second one, which is treated as
///  the numerator of a fraction whose denominator is 256.
///  In other words, it computes i * (scale / 256).
///
///  When FASTLED_SCALE8_FIXED == 1 the multiplier is (scale + 1),
///  so a scale of 255 returns i unchanged; otherwise the result is
///  (i * scale) >> 8.
///
///  @param i     the value to scale
///  @param scale the numerator of the scaling fraction
///  @return      the scaled value
///
///  4 clocks AVR with MUL, 2 clocks ARM
LIB8STATIC_ALWAYS_INLINE uint8_t scale8( uint8_t i, fract8 scale)
{
#if SCALE8_C == 1
#if (FASTLED_SCALE8_FIXED == 1)
    return (((uint16_t)i) * (1+(uint16_t)(scale))) >> 8;
#else
    return ((uint16_t)i * (uint16_t)(scale) ) >> 8;
#endif
#elif SCALE8_AVRASM == 1
#if defined(LIB8_ATTINY)
    // Shift-and-add multiply, one pass per bit of scale (8 passes).
#if (FASTLED_SCALE8_FIXED == 1)
    uint8_t work=i;
#else
    uint8_t work=0;
#endif
    uint8_t cnt=0x80;
    asm volatile(
#if (FASTLED_SCALE8_FIXED == 1)
        // scale + 1 wraps to zero only for scale == 255; the result is
        // then i itself, which "work" already holds.
        "  inc %[scale]                 \n\t"
        "  breq DONE_%=                 \n\t"
        "  clr %[work]                  \n\t"
#endif
        // "ror" rotates the carry flag into bit 7 of work.  When the
        // "add" of the first pass is skipped, carry would still hold
        // whatever the preceding code left there, so clear it first.
        "  clc                          \n\t"
        "LOOP_%=:                       \n\t"
        // Add i when the current low bit of scale is set, then shift the
        // partial sum (including the carry out of the add) right by one.
        "  sbrc %[scale], 0             \n\t"
        "  add %[work], %[i]            \n\t"
        "  ror %[work]                  \n\t"
        "  lsr %[scale]                 \n\t"
        // The single set bit of cnt reaches the carry flag on the 8th
        // pass, which ends the loop.
        "  lsr %[cnt]                   \n\t"
        "brcc LOOP_%=                   \n\t"
        "DONE_%=:                       \n\t"
        // work, cnt and scale are all modified by the code above, so they
        // are read-write operands; early-clobber ('&') keeps them out of
        // the register that holds i, even when they start with i's value.
        : [work] "+&r" (work), [cnt] "+&r" (cnt), [scale] "+&r" (scale)
        : [i] "r" (i)
        :
      );
    return work;
#else
    asm volatile(
#if (FASTLED_SCALE8_FIXED==1)
        // Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0
        "mul %0, %1          \n\t"
        // Add i to r0, possibly setting the carry flag
        "add r0, %0         \n\t"
        // load the immediate 0 into i (note, this does _not_ touch any flags)
        "ldi %0, 0x00       \n\t"
        // i = high byte of the product plus the carry from the add above
        "adc %0, r1          \n\t"
#else
         /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
         "mul %0, %1          \n\t"
         /* Move the high 8-bits of the product (r1) back to i */
         "mov %0, r1          \n\t"
#endif
         /* Restore r1 to "0"; it's expected to always be that */
         "clr __zero_reg__    \n\t"

         : "+a" (i)      /* writes to i */
         : "a"  (scale)  /* uses scale */
         : "r0", "r1"    /* clobbers r0, r1 */ );

    /* Return the result */
    return i;
#endif
#else
#error "No implementation for scale8 available."
#endif
}


///  The "video" version of scale8 guarantees that the output will
///  only be zero if one or both of the inputs are zero.  If both
///  inputs are non-zero, the output is guaranteed to be non-zero.
///  This makes for better 'video'/LED dimming, at the cost of
///  several additional cycles.
///
///  @param i     the value to scale
///  @param scale the numerator of the scaling fraction (denominator 256)
///  @return      ((i * scale) >> 8), plus one when both inputs are non-zero
LIB8STATIC_ALWAYS_INLINE uint8_t scale8_video( uint8_t i, fract8 scale)
{
#if SCALE8_C == 1 || defined(LIB8_ATTINY)
    // Multiply as uint16_t: this path is also taken on ATtiny, and
    // 255 * 255 does not fit in a signed 16-bit int.
    uint8_t j = (((uint16_t)i * (uint16_t)scale) >> 8) + ((i&&scale)?1:0);
    return j;
#elif SCALE8_AVRASM == 1
    uint8_t j=0;
    asm volatile(
        // i == 0: skip everything, j stays 0
        "  tst %[i]\n\t"
        "  breq L_%=\n\t"
        // j = high byte of i * scale
        "  mul %[i], %[scale]\n\t"
        "  mov %[j], r1\n\t"
        // restore r1 to zero, then use it to test scale
        "  clr __zero_reg__\n\t"
        // skip the increment when scale == 0
        "  cpse %[scale], r1\n\t"
        // subtracting 0xFF adds one (mod 256)
        "  subi %[j], 0xFF\n\t"
        "L_%=: \n\t"
        : [j] "+a" (j)
        : [i] "a" (i), [scale] "a" (scale)
        : "r0", "r1");

    return j;
#else
#error "No implementation for scale8_video available."
#endif
}


/// This version of scale8 does not clean up the R1 register on AVR.
/// If you are doing several 'scale8's in a row, use this, and
/// then explicitly call cleanup_R1.
///
/// @param i     the value to scale
/// @param scale the numerator of the scaling fraction (denominator 256)
/// @return      the scaled value, computed with the same formula as scale8
LIB8STATIC_ALWAYS_INLINE uint8_t scale8_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
{
#if SCALE8_C == 1
#if (FASTLED_SCALE8_FIXED == 1)
    return (((uint16_t)i) * ((uint16_t)(scale)+1)) >> 8;
#else
    // Multiply as uint16_t, matching scale8: 255 * 255 does not fit in a
    // signed 16-bit int.
    return ((uint16_t)i * (uint16_t)(scale) ) >> 8;
#endif
#elif SCALE8_AVRASM == 1
    asm volatile(
#if (FASTLED_SCALE8_FIXED==1)
         // Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0
         "mul %0, %1          \n\t"
         // Add i to r0, possibly setting the carry flag
         "add r0, %0         \n\t"
         // load the immediate 0 into i (note, this does _not_ touch any flags)
         "ldi %0, 0x00       \n\t"
         // i = high byte of the product plus the carry from the add above
         "adc %0, r1          \n\t"
#else
         /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
         "mul %0, %1    \n\t"
         /* Move the high 8-bits of the product (r1) back to i */
         "mov %0, r1    \n\t"
#endif
         /* R1 IS LEFT DIRTY HERE; YOU MUST ZERO IT OUT YOURSELF  */
         /* (no "clr __zero_reg__" here, unlike scale8)           */

         : "+a" (i)      /* writes to i */
         : "a"  (scale)  /* uses scale */
         : "r0", "r1"    /* clobbers r0, r1 */ );

    // Return the result
    return i;
#else
#error "No implementation for scale8_LEAVING_R1_DIRTY available."
#endif
}


/// This version of scale8_video does not clean up the R1 register on AVR.
/// If you are doing several 'scale8_video's in a row, use this, and
/// then explicitly call cleanup_R1.
///
/// @param i     the value to scale
/// @param scale the numerator of the scaling fraction (denominator 256)
/// @return      ((i * scale) >> 8), plus one when both inputs are non-zero
LIB8STATIC_ALWAYS_INLINE uint8_t scale8_video_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
{
#if SCALE8_C == 1 || defined(LIB8_ATTINY)
    // Multiply as uint16_t: this path is also taken on ATtiny, and
    // 255 * 255 does not fit in a signed 16-bit int.
    uint8_t j = (((uint16_t)i * (uint16_t)scale) >> 8) + ((i&&scale)?1:0);
    return j;
#elif SCALE8_AVRASM == 1
    uint8_t j=0;
    asm volatile(
        // i == 0: skip everything, j stays 0
        "  tst %[i]\n\t"
        "  breq L_%=\n\t"
        // j = high byte of i * scale ("mov" leaves the flags from "mul" intact)
        "  mul %[i], %[scale]\n\t"
        "  mov %[j], r1\n\t"
        // product was zero (i is non-zero here, so scale == 0): no increment
        "  breq L_%=\n\t"
        // subtracting 0xFF adds one (mod 256)
        "  subi %[j], 0xFF\n\t"
        "L_%=: \n\t"
        // r1 is NOT cleared here; the caller must call cleanup_R1
        : [j] "+a" (j)
        : [i] "a" (i), [scale] "a" (scale)
        : "r0", "r1");

    return j;
#else
#error "No implementation for scale8_video_LEAVING_R1_DIRTY available."
#endif
}

/// Clean up the r1 register after a series of *LEAVING_R1_DIRTY calls.
/// The body is empty unless CLEANUP_R1_AVRASM == 1.
LIB8STATIC_ALWAYS_INLINE void cleanup_R1(void)
{
#if CLEANUP_R1_AVRASM == 1
    // Restore r1 to "0"; it's expected to always be that
    asm volatile( "clr __zero_reg__  \n\t" : : : "r1" );
#endif
}


/// scale a 16-bit unsigned value by an 8-bit value,
///         considered as numerator of a fraction whose denominator
///         is 256. In other words, it computes i * (scale / 256)
///
/// When FASTLED_SCALE8_FIXED == 1 the multiplier is (scale + 1), so a
/// scale of 255 returns i unchanged. A scale of 0 always returns 0.
///
/// @param i     the 16-bit value to scale
/// @param scale the numerator of the scaling fraction
/// @return      the scaled 16-bit value
LIB8STATIC_ALWAYS_INLINE uint16_t scale16by8( uint16_t i, fract8 scale )
{
    // With FASTLED_SCALE8_FIXED == 1 the formulas below compute
    // (i * (scale + 1)) >> 8, which would give i >> 8 rather than 0
    // for a scale of zero.
    if( scale == 0 ) {
        return 0;
    }
#if SCALE16BY8_C == 1
    uint16_t result;
    // Multiply in 32 bits: the product needs up to 24 bits, which a
    // 16-bit int/unsigned would silently truncate.
#if FASTLED_SCALE8_FIXED == 1
    result = ((uint32_t)(i) * (1+((uint32_t)scale))) >> 8;
#else
    result = ((uint32_t)(i) * (uint32_t)(scale)) >> 8;
#endif
    return result;
#elif SCALE16BY8_AVRASM == 1
#if FASTLED_SCALE8_FIXED == 1
    uint16_t result = 0;
    asm volatile(
                 // result.A = HighByte( (i.A x scale) + i.A )
                 // (result.A starts at 0, so the adc stores r1 + carry)
                 "  mul %A[i], %[scale]                 \n\t"
                 "  add r0, %A[i]                       \n\t"
                 "  adc %A[result], r1                  \n\t"

                 // result.A-B += i.B x scale
                 "  mul %B[i], %[scale]                 \n\t"
                 "  add %A[result], r0                  \n\t"
                 "  adc %B[result], r1                  \n\t"

                 // cleanup r1
                 "  clr __zero_reg__                    \n\t"

                 // result.A-B += i.B
                 "  add %A[result], %B[i]               \n\t"
                 "  adc %B[result], __zero_reg__        \n\t"

                 : [result] "+r" (result)
                 : [i] "r" (i), [scale] "r" (scale)
                 : "r0", "r1"
                 );
    return result;
#else
    uint16_t result = 0;
    asm volatile(
         // result.A = HighByte(i.A x scale)
         // (result.B is already 0 from the initializer)
         "  mul %A[i], %[scale]                 \n\t"
         "  mov %A[result], r1                  \n\t"

         // result.A-B += i.B x scale
         "  mul %B[i], %[scale]                 \n\t"
         "  add %A[result], r0                  \n\t"
         "  adc %B[result], r1                  \n\t"

         // cleanup r1
         "  clr __zero_reg__                    \n\t"

         : [result] "+r" (result)
         : [i] "r" (i), [scale] "r" (scale)
         : "r0", "r1"
         );
    return result;
#endif
#else
    #error "No implementation for scale16by8 available."
#endif
}

/// scale a 16-bit unsigned value by a 16-bit value,
///         considered as numerator of a fraction whose denominator
///         is 65536. In other words, it computes i * (scale / 65536)
///
/// When FASTLED_SCALE8_FIXED == 1 the multiplier is (scale + 1), so a
/// scale of 65535 returns i unchanged.
///
/// @param i     the 16-bit value to scale
/// @param scale the numerator of the scaling fraction
/// @return      the upper 16 bits of the 32-bit product

LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
{
  #if SCALE16_C == 1
    uint16_t result;
#if FASTLED_SCALE8_FIXED == 1
    result = ((uint32_t)(i) * (1+(uint32_t)(scale))) / 65536;
#else
    result = ((uint32_t)(i) * (uint32_t)(scale)) / 65536;
#endif
    return result;
#elif SCALE16_AVRASM == 1
#if FASTLED_SCALE8_FIXED == 1
    // implemented sort of like
    //   result = ((i * scale) + i ) / 65536
    //
    // why not like this, you may ask?
    //   result = (i * (scale+1)) / 65536
    // the answer is that if scale is 65535, then scale+1
    // will be zero, which is not what we want.
    //
    // The 32-bit product is built from four 8x8 partial products:
    // bytes of "result" are named A (lowest) through D (highest).
    uint32_t result;
    asm volatile(
                 // result.A-B  = i.A x scale.A
                 "  mul %A[i], %A[scale]                 \n\t"
                 //  save results...
                 // basic idea:
                 //"  mov %A[result], r0                 \n\t"
                 //"  mov %B[result], r1                 \n\t"
                 // which can be written as...
                 "  movw %A[result], r0                   \n\t"
                 // Because we're going to add i.A-B to
                 // result.A-D, we DO need to keep both
                 // the r0 and r1 portions of the product
                 // UNlike in the 'unfixed scale8' version.
                 // So the movw here is needed.
                 : [result] "=r" (result)
                 : [i] "r" (i),
                 [scale] "r" (scale)
                 : "r0", "r1"
                 );

    asm volatile(
                 // result.C-D  = i.B x scale.B
                 "  mul %B[i], %B[scale]                 \n\t"
                 //"  mov %C[result], r0                 \n\t"
                 //"  mov %D[result], r1                 \n\t"
                 "  movw %C[result], r0                   \n\t"
                 : [result] "+r" (result)
                 : [i] "r" (i),
                 [scale] "r" (scale)
                 : "r0", "r1"
                 );

    // A register holding zero, used to propagate carries with adc
    // (r1 cannot serve as zero while it holds product bytes).
    const uint8_t  zero = 0;
    asm volatile(
                 // result.B-D += i.B x scale.A
                 "  mul %B[i], %A[scale]                 \n\t"

                 "  add %B[result], r0                   \n\t"
                 "  adc %C[result], r1                   \n\t"
                 "  adc %D[result], %[zero]              \n\t"

                 // result.B-D += i.A x scale.B
                 "  mul %A[i], %B[scale]                 \n\t"

                 "  add %B[result], r0                   \n\t"
                 "  adc %C[result], r1                   \n\t"
                 "  adc %D[result], %[zero]              \n\t"

                 // cleanup r1
                 "  clr r1                               \n\t"

                 : [result] "+r" (result)
                 : [i] "r" (i),
                 [scale] "r" (scale),
                 [zero] "r" (zero)
                 : "r0", "r1"
                 );

    asm volatile(
                 // result.A-D += i.A-B
                 // (this is the "+ i" term of ((i * scale) + i))
                 "  add %A[result], %A[i]                \n\t"
                 "  adc %B[result], %B[i]                \n\t"
                 "  adc %C[result], %[zero]              \n\t"
                 "  adc %D[result], %[zero]              \n\t"
                 : [result] "+r" (result)
                 : [i] "r" (i),
                 [zero] "r" (zero)
                 );

    // Keep only the upper 16 bits, i.e. divide by 65536.
    result = result >> 16;
    return result;
#else
    // The 32-bit product is built from four 8x8 partial products:
    // bytes of "result" are named A (lowest) through D (highest).
    uint32_t result;
    asm volatile(
                 // result.A-B  = i.A x scale.A
                 "  mul %A[i], %A[scale]                 \n\t"
                 //  save results...
                 // basic idea:
                 //"  mov %A[result], r0                 \n\t"
                 //"  mov %B[result], r1                 \n\t"
                 // which can be written as...
                 "  movw %A[result], r0                   \n\t"
                 // We actually don't need to do anything with r0,
                 // as result.A is never used again here, so we
                 // could just move the high byte, but movw is
                 // one clock cycle, just like mov, so might as
                 // well, in case we want to use this code for
                 // a generic 16x16 multiply somewhere.

                 : [result] "=r" (result)
                 : [i] "r" (i),
                   [scale] "r" (scale)
                 : "r0", "r1"
                 );

    asm volatile(
                 // result.C-D  = i.B x scale.B
                 "  mul %B[i], %B[scale]                 \n\t"
                 //"  mov %C[result], r0                 \n\t"
                 //"  mov %D[result], r1                 \n\t"
                 "  movw %C[result], r0                   \n\t"
                 : [result] "+r" (result)
                 : [i] "r" (i),
                   [scale] "r" (scale)
                 : "r0", "r1"
                 );

    // A register holding zero, used to propagate carries with adc
    // (r1 cannot serve as zero while it holds product bytes).
    const uint8_t  zero = 0;
    asm volatile(
                 // result.B-D += i.B x scale.A
                 "  mul %B[i], %A[scale]                 \n\t"

                 "  add %B[result], r0                   \n\t"
                 "  adc %C[result], r1                   \n\t"
                 "  adc %D[result], %[zero]              \n\t"

                 // result.B-D += i.A x scale.B
                 "  mul %A[i], %B[scale]                 \n\t"

                 "  add %B[result], r0                   \n\t"
                 "  adc %C[result], r1                   \n\t"
                 "  adc %D[result], %[zero]              \n\t"

                 // cleanup r1
                 "  clr r1                               \n\t"

                 : [result] "+r" (result)
                 : [i] "r" (i),
                   [scale] "r" (scale),
                   [zero] "r" (zero)
                 : "r0", "r1"
                 );

    // Keep only the upper 16 bits, i.e. divide by 65536.
    result = result >> 16;
    return result;
#endif
#else
    #error "No implementation for scale16 available."
#endif
}
///@}

///@defgroup Dimming Dimming and brightening functions
///
/// Dimming and brightening functions
///
/// The eye does not respond in a linear way to light.
/// High speed PWM'd LEDs at 50% duty cycle appear far
/// brighter than the 'half as bright' you might expect.
///
/// If you want your midpoint brightness level (128) to
/// appear half as bright as 'full' brightness (255), you
/// have to apply a 'dimming function'.
///@{

/// Dim a value by scaling it by itself.
/// @param x the value to dim
/// @return  scale8( x, x)
LIB8STATIC uint8_t dim8_raw( uint8_t x)
{
    const uint8_t dimmed = scale8( x, x);
    return dimmed;
}

/// Dim a value by scaling it by itself, using the "video" scaling
/// function so that a non-zero input never dims all the way to zero.
/// @param x the value to dim
/// @return  scale8_video( x, x)
LIB8STATIC uint8_t dim8_video( uint8_t x)
{
    const uint8_t dimmed = scale8_video( x, x);
    return dimmed;
}

/// Dimming function with a linear lower half: values below 128 are
/// halved (rounding up), values of 128 and above are scaled by
/// themselves as in dim8_raw.
/// @param x the value to dim
/// @return  the dimmed value
LIB8STATIC uint8_t dim8_lin( uint8_t x )
{
    // Top bit clear: x < 128, so x + 1 cannot overflow a byte.
    if( (x & 0x80) == 0 ) {
        return (uint8_t)((x + 1) / 2);
    }
    return scale8( x, x);
}

/// Brighten a value: the dimming function applied to the inverted
/// input, with the result inverted again.
/// @param x the value to brighten
/// @return  255 - scale8( 255 - x, 255 - x)
LIB8STATIC uint8_t brighten8_raw( uint8_t x)
{
    const uint8_t inverted = 255 - x;
    const uint8_t dimmed = scale8( inverted, inverted);
    return 255 - dimmed;
}

/// Brighten a value using the "video" scaling function: the video
/// dimming function applied to the inverted input, inverted again.
/// @param x the value to brighten
/// @return  255 - scale8_video( 255 - x, 255 - x)
LIB8STATIC uint8_t brighten8_video( uint8_t x)
{
    const uint8_t inverted = 255 - x;
    const uint8_t dimmed = scale8_video( inverted, inverted);
    return 255 - dimmed;
}

/// Brighten a value with the linear-lower-half curve: the inverted
/// input is halved (rounding up) when below 128, or scaled by itself
/// otherwise, and the result is inverted again.
/// @param x the value to brighten
/// @return  the brightened value
LIB8STATIC uint8_t brighten8_lin( uint8_t x )
{
    const uint8_t inverted = 255 - x;
    uint8_t dimmed;

    if( (inverted & 0x80) == 0 ) {
        // inverted < 128, so inverted + 1 cannot overflow a byte
        dimmed = (uint8_t)((inverted + 1) / 2);
    } else {
        dimmed = scale8( inverted, inverted);
    }
    return 255 - dimmed;
}

///@}
#endif
RGB Matrix Overhaul (#5372) * RGB Matrix overhaul Breakout of animations to separate files Integration of optimized int based math lib Overhaul of rgb_matrix.c and animations for performance * Updating effect function api for future extensions * Combined the keypresses \|\| keyreleases define checks into a single define so I stop forgetting it where necessary * Moving define RGB_MATRIX_KEYREACTIVE_ENABLED earlier in the include chain 7 years ago			`#ifndef __INC_LIB8TION_SCALE_H`
			`#define __INC_LIB8TION_SCALE_H`

			`///@ingroup lib8tion`

			`///@defgroup Scaling Scaling functions`
			`/// Fast, efficient 8-bit scaling functions specifically`
			`/// designed for high-performance LED programming.`
			`///`
			`/// Because of the AVR(Arduino) and ARM assembly language`
			`/// implementations provided, using these functions often`
			`/// results in smaller and faster code than the equivalent`
			`/// program using plain "C" arithmetic and logic.`
			`///@{`

			`/// scale one byte by a second one, which is treated as`
			`/// the numerator of a fraction whose denominator is 256`
			`/// In other words, it computes i * (scale / 256)`
			`/// 4 clocks AVR with MUL, 2 clocks ARM`
			`LIB8STATIC_ALWAYS_INLINE uint8_t scale8( uint8_t i, fract8 scale)`
			`{`
			`#if SCALE8_C == 1`
			`#if (FASTLED_SCALE8_FIXED == 1)`
			`return (((uint16_t)i) * (1+(uint16_t)(scale))) >> 8;`
			`#else`
			`return ((uint16_t)i * (uint16_t)(scale) ) >> 8;`
			`#endif`
			`#elif SCALE8_AVRASM == 1`
			`#if defined(LIB8_ATTINY)`
			`#if (FASTLED_SCALE8_FIXED == 1)`
			`uint8_t work=i;`
			`#else`
			`uint8_t work=0;`
			`#endif`
			`uint8_t cnt=0x80;`
			`asm volatile(`
			`#if (FASTLED_SCALE8_FIXED == 1)`
			`" inc %[scale] \n\t"`
			`" breq DONE_%= \n\t"`
			`" clr %[work] \n\t"`
			`#endif`
			`"LOOP_%=: \n\t"`
			`/*" sbrc %[scale], 0 \n\t"`
			`" add %[work], %[i] \n\t"`
			`" ror %[work] \n\t"`
			`" lsr %[scale] \n\t"`
			`" clc \n\t"*/`
			`" sbrc %[scale], 0 \n\t"`
			`" add %[work], %[i] \n\t"`
			`" ror %[work] \n\t"`
			`" lsr %[scale] \n\t"`
			`" lsr %[cnt] \n\t"`
			`"brcc LOOP_%= \n\t"`
			`"DONE_%=: \n\t"`
			`: [work] "+r" (work), [cnt] "+r" (cnt)`
			`: [scale] "r" (scale), [i] "r" (i)`
			`:`
			`);`
			`return work;`
			`#else`
			`asm volatile(`
			`#if (FASTLED_SCALE8_FIXED==1)`
			`// Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0`
			`"mul %0, %1 \n\t"`
			`// Add i to r0, possibly setting the carry flag`
			`"add r0, %0 \n\t"`
			`// load the immediate 0 into i (note, this does _not_ touch any flags)`
			`"ldi %0, 0x00 \n\t"`
			`// walk and chew gum at the same time`
			`"adc %0, r1 \n\t"`
			`#else`
			`/* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */`
			`"mul %0, %1 \n\t"`
			`/* Move the high 8-bits of the product (r1) back to i */`
			`"mov %0, r1 \n\t"`
			`/* Restore r1 to "0"; it's expected to always be that */`
			`#endif`
			`"clr __zero_reg__ \n\t"`

			`: "+a" (i) /* writes to i */`
			`: "a" (scale) /* uses scale */`
			`: "r0", "r1" /* clobbers r0, r1 */ );`

			`/* Return the result */`
			`return i;`
			`#endif`
			`#else`
			`#error "No implementation for scale8 available."`
			`#endif`
			`}`


			`/// The "video" version of scale8 guarantees that the output will`
			`/// be only be zero if one or both of the inputs are zero. If both`
			`/// inputs are non-zero, the output is guaranteed to be non-zero.`
			`/// This makes for better 'video'/LED dimming, at the cost of`
			`/// several additional cycles.`
			`LIB8STATIC_ALWAYS_INLINE uint8_t scale8_video( uint8_t i, fract8 scale)`
			`{`
			`#if SCALE8_C == 1 \|\| defined(LIB8_ATTINY)`
			`uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);`
			`// uint8_t nonzeroscale = (scale != 0) ? 1 : 0;`
			`// uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;`
			`return j;`
			`#elif SCALE8_AVRASM == 1`
			`uint8_t j=0;`
			`asm volatile(`
			`" tst %[i]\n\t"`
			`" breq L_%=\n\t"`
			`" mul %[i], %[scale]\n\t"`
			`" mov %[j], r1\n\t"`
			`" clr __zero_reg__\n\t"`
			`" cpse %[scale], r1\n\t"`
			`" subi %[j], 0xFF\n\t"`
			`"L_%=: \n\t"`
			`: [j] "+a" (j)`
			`: [i] "a" (i), [scale] "a" (scale)`
			`: "r0", "r1");`

			`return j;`
			`// uint8_t nonzeroscale = (scale != 0) ? 1 : 0;`
			`// asm volatile(`
			`// " tst %0 \n"`
			`// " breq L_%= \n"`
			`// " mul %0, %1 \n"`
			`// " mov %0, r1 \n"`
			`// " add %0, %2 \n"`
			`// " clr __zero_reg__ \n"`
			`// "L_%=: \n"`

			`// : "+a" (i)`
			`// : "a" (scale), "a" (nonzeroscale)`
			`// : "r0", "r1");`

			`// // Return the result`
			`// return i;`
			`#else`
			`#error "No implementation for scale8_video available."`
			`#endif`
			`}`


			`/// This version of scale8 does not clean up the R1 register on AVR`
			`/// If you are doing several 'scale8's in a row, use this, and`
			`/// then explicitly call cleanup_R1.`
			`LIB8STATIC_ALWAYS_INLINE uint8_t scale8_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)`
			`{`
			`#if SCALE8_C == 1`
			`#if (FASTLED_SCALE8_FIXED == 1)`
			`return (((uint16_t)i) * ((uint16_t)(scale)+1)) >> 8;`
			`#else`
			`return ((int)i * (int)(scale) ) >> 8;`
			`#endif`
			`#elif SCALE8_AVRASM == 1`
			`asm volatile(`
			`#if (FASTLED_SCALE8_FIXED==1)`
			`// Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0`
			`"mul %0, %1 \n\t"`
			`// Add i to r0, possibly setting the carry flag`
			`"add r0, %0 \n\t"`
			`// load the immediate 0 into i (note, this does _not_ touch any flags)`
			`"ldi %0, 0x00 \n\t"`
			`// walk and chew gum at the same time`
			`"adc %0, r1 \n\t"`
			`#else`
			`/* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */`
			`"mul %0, %1 \n\t"`
			`/* Move the high 8-bits of the product (r1) back to i */`
			`"mov %0, r1 \n\t"`
			`#endif`
			`/* R1 IS LEFT DIRTY HERE; YOU MUST ZERO IT OUT YOURSELF */`
			`/* "clr __zero_reg__ \n\t" */`

			`: "+a" (i) /* writes to i */`
			`: "a" (scale) /* uses scale */`
			`: "r0", "r1" /* clobbers r0, r1 */ );`

			`// Return the result`
			`return i;`
			`#else`
			`#error "No implementation for scale8_LEAVING_R1_DIRTY available."`
			`#endif`
			`}`


			`/// This version of scale8_video does not clean up the R1 register on AVR`
			`/// If you are doing several 'scale8_video's in a row, use this, and`
			`/// then explicitly call cleanup_R1.`
			`LIB8STATIC_ALWAYS_INLINE uint8_t scale8_video_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)`
			`{`
			`#if SCALE8_C == 1 \|\| defined(LIB8_ATTINY)`
			`uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);`
			`// uint8_t nonzeroscale = (scale != 0) ? 1 : 0;`
			`// uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;`
			`return j;`
			`#elif SCALE8_AVRASM == 1`
			`uint8_t j=0;`
			`asm volatile(`
			`" tst %[i]\n\t"`
			`" breq L_%=\n\t"`
			`" mul %[i], %[scale]\n\t"`
			`" mov %[j], r1\n\t"`
			`" breq L_%=\n\t"`
			`" subi %[j], 0xFF\n\t"`
			`"L_%=: \n\t"`
			`: [j] "+a" (j)`
			`: [i] "a" (i), [scale] "a" (scale)`
			`: "r0", "r1");`

			`return j;`
			`// uint8_t nonzeroscale = (scale != 0) ? 1 : 0;`
			`// asm volatile(`
			`// " tst %0 \n"`
			`// " breq L_%= \n"`
			`// " mul %0, %1 \n"`
			`// " mov %0, r1 \n"`
			`// " add %0, %2 \n"`
			`// " clr __zero_reg__ \n"`
			`// "L_%=: \n"`

			`// : "+a" (i)`
			`// : "a" (scale), "a" (nonzeroscale)`
			`// : "r0", "r1");`

			`// // Return the result`
			`// return i;`
			`#else`
			`#error "No implementation for scale8_video_LEAVING_R1_DIRTY available."`
			`#endif`
			`}`

			`/// Clean up the r1 register after a series of *LEAVING_R1_DIRTY calls`
			`LIB8STATIC_ALWAYS_INLINE void cleanup_R1(void)`
			`{`
			`#if CLEANUP_R1_AVRASM == 1`
			`// Restore r1 to "0"; it's expected to always be that`
			`asm volatile( "clr __zero_reg__ \n\t" : : : "r1" );`
			`#endif`
			`}`


			`/// scale a 16-bit unsigned value by an 8-bit value,`
			`/// considered as numerator of a fraction whose denominator`
			`/// is 256. In other words, it computes i * (scale / 256)`

			`LIB8STATIC_ALWAYS_INLINE uint16_t scale16by8( uint16_t i, fract8 scale )`
			`{`
			`#if SCALE16BY8_C == 1`
			`uint16_t result;`
			`#if FASTLED_SCALE8_FIXED == 1`
			`result = (i * (1+((uint16_t)scale))) >> 8;`
			`#else`
			`result = (i * scale) / 256;`
			`#endif`
			`return result;`
			`#elif SCALE16BY8_AVRASM == 1`
			`#if FASTLED_SCALE8_FIXED == 1`
			`uint16_t result = 0;`
			`asm volatile(`
			`// result.A = HighByte( (i.A x scale) + i.A )`
			`" mul %A[i], %[scale] \n\t"`
			`" add r0, %A[i] \n\t"`
			`// " adc r1, [zero] \n\t"`
			`// " mov %A[result], r1 \n\t"`
			`" adc %A[result], r1 \n\t"`

			`// result.A-B += i.B x scale`
			`" mul %B[i], %[scale] \n\t"`
			`" add %A[result], r0 \n\t"`
			`" adc %B[result], r1 \n\t"`

			`// cleanup r1`
			`" clr __zero_reg__ \n\t"`

			`// result.A-B += i.B`
			`" add %A[result], %B[i] \n\t"`
			`" adc %B[result], __zero_reg__ \n\t"`

			`: [result] "+r" (result)`
			`: [i] "r" (i), [scale] "r" (scale)`
			`: "r0", "r1"`
			`);`
			`return result;`
			`#else`
			`uint16_t result = 0;`
			`asm volatile(`
			`// result.A = HighByte(i.A x j )`
			`" mul %A[i], %[scale] \n\t"`
			`" mov %A[result], r1 \n\t"`
			`//" clr %B[result] \n\t"`

			`// result.A-B += i.B x j`
			`" mul %B[i], %[scale] \n\t"`
			`" add %A[result], r0 \n\t"`
			`" adc %B[result], r1 \n\t"`

			`// cleanup r1`
			`" clr __zero_reg__ \n\t"`

			`: [result] "+r" (result)`
			`: [i] "r" (i), [scale] "r" (scale)`
			`: "r0", "r1"`
			`);`
			`return result;`
			`#endif`
			`#else`
			`#error "No implementation for scale16by8 available."`
			`#endif`
			`}`

			`/// scale a 16-bit unsigned value by a 16-bit value,`
			`/// considered as numerator of a fraction whose denominator`
			`/// is 65536. In other words, it computes i * (scale / 65536)`

			`LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )`
			`{`
			`#if SCALE16_C == 1`
			`uint16_t result;`
			`#if FASTLED_SCALE8_FIXED == 1`
			`result = ((uint32_t)(i) * (1+(uint32_t)(scale))) / 65536;`
			`#else`
			`result = ((uint32_t)(i) * (uint32_t)(scale)) / 65536;`
			`#endif`
			`return result;`
			`#elif SCALE16_AVRASM == 1`
			`#if FASTLED_SCALE8_FIXED == 1`
			`// implemented sort of like`
			`// result = ((i * scale) + i ) / 65536`
			`//`
			`// why not like this, you may ask?`
			`// result = (i * (scale+1)) / 65536`
			`// the answer is that if scale is 65535, then scale+1`
			`// will be zero, which is not what we want.`
			`uint32_t result;`
			`asm volatile(`
			`// result.A-B = i.A x scale.A`
			`" mul %A[i], %A[scale] \n\t"`
			`// save results...`
			`// basic idea:`
			`//" mov %A[result], r0 \n\t"`
			`//" mov %B[result], r1 \n\t"`
			`// which can be written as...`
			`" movw %A[result], r0 \n\t"`
			`// Because we're going to add i.A-B to`
			`// result.A-D, we DO need to keep both`
			`// the r0 and r1 portions of the product`
			`// UNlike in the 'unfixed scale8' version.`
			`// So the movw here is needed.`
			`: [result] "=r" (result)`
			`: [i] "r" (i),`
			`[scale] "r" (scale)`
			`: "r0", "r1"`
			`);`

			`asm volatile(`
			`// result.C-D = i.B x scale.B`
			`" mul %B[i], %B[scale] \n\t"`
			`//" mov %C[result], r0 \n\t"`
			`//" mov %D[result], r1 \n\t"`
			`" movw %C[result], r0 \n\t"`
			`: [result] "+r" (result)`
			`: [i] "r" (i),`
			`[scale] "r" (scale)`
			`: "r0", "r1"`
			`);`

			`const uint8_t zero = 0;`
			`asm volatile(`
			`// result.B-D += i.B x scale.A`
			`" mul %B[i], %A[scale] \n\t"`

			`" add %B[result], r0 \n\t"`
			`" adc %C[result], r1 \n\t"`
			`" adc %D[result], %[zero] \n\t"`

			`// result.B-D += i.A x scale.B`
			`" mul %A[i], %B[scale] \n\t"`

			`" add %B[result], r0 \n\t"`
			`" adc %C[result], r1 \n\t"`
			`" adc %D[result], %[zero] \n\t"`

			`// cleanup r1`
			`" clr r1 \n\t"`

			`: [result] "+r" (result)`
			`: [i] "r" (i),`
			`[scale] "r" (scale),`
			`[zero] "r" (zero)`
			`: "r0", "r1"`
			`);`

			`asm volatile(`
			`// result.A-D += i.A-B`
			`" add %A[result], %A[i] \n\t"`
			`" adc %B[result], %B[i] \n\t"`
			`" adc %C[result], %[zero] \n\t"`
			`" adc %D[result], %[zero] \n\t"`
			`: [result] "+r" (result)`
			`: [i] "r" (i),`
			`[zero] "r" (zero)`
			`);`

			`result = result >> 16;`
			`return result;`
			`#else`
			`uint32_t result;`
			`asm volatile(`
			`// result.A-B = i.A x scale.A`
			`" mul %A[i], %A[scale] \n\t"`
			`// save results...`
			`// basic idea:`
			`//" mov %A[result], r0 \n\t"`
			`//" mov %B[result], r1 \n\t"`
			`// which can be written as...`
			`" movw %A[result], r0 \n\t"`
			`// We actually don't need to do anything with r0,`
			`// as result.A is never used again here, so we`
			`// could just move the high byte, but movw is`
			`// one clock cycle, just like mov, so might as`
			`// well, in case we want to use this code for`
			`// a generic 16x16 multiply somewhere.`

			`: [result] "=r" (result)`
			`: [i] "r" (i),`
			`[scale] "r" (scale)`
			`: "r0", "r1"`
			`);`

			`asm volatile(`
			`// result.C-D = i.B x scale.B`
			`" mul %B[i], %B[scale] \n\t"`
			`//" mov %C[result], r0 \n\t"`
			`//" mov %D[result], r1 \n\t"`
			`" movw %C[result], r0 \n\t"`
			`: [result] "+r" (result)`
			`: [i] "r" (i),`
			`[scale] "r" (scale)`
			`: "r0", "r1"`
			`);`

			`const uint8_t zero = 0;`
			`asm volatile(`
			`// result.B-D += i.B x scale.A`
			`" mul %B[i], %A[scale] \n\t"`

			`" add %B[result], r0 \n\t"`
			`" adc %C[result], r1 \n\t"`
			`" adc %D[result], %[zero] \n\t"`

			`// result.B-D += i.A x scale.B`
			`" mul %A[i], %B[scale] \n\t"`

			`" add %B[result], r0 \n\t"`
			`" adc %C[result], r1 \n\t"`
			`" adc %D[result], %[zero] \n\t"`

			`// cleanup r1`
			`" clr r1 \n\t"`

			`: [result] "+r" (result)`
			`: [i] "r" (i),`
			`[scale] "r" (scale),`
			`[zero] "r" (zero)`
			`: "r0", "r1"`
			`);`

			`result = result >> 16;`
			`return result;`
			`#endif`
			`#else`
			`#error "No implementation for scale16 available."`
			`#endif`
			`}`
			`///@}`

			`///@defgroup Dimming Dimming and brightening functions`
			`///`
			`/// Dimming and brightening functions`
			`///`
			`/// The eye does not respond in a linear way to light.`
			`/// High speed PWM'd LEDs at 50% duty cycle appear far`
			`/// brighter then the 'half as bright' you might expect.`
			`///`
			`/// If you want your midpoint brightness leve (128) to`
			`/// appear half as bright as 'full' brightness (255), you`
			`/// have to apply a 'dimming function'.`
			`///@{`

			`/// Adjust a scaling value for dimming`
			`LIB8STATIC uint8_t dim8_raw( uint8_t x)`
			`{`
			`return scale8( x, x);`
			`}`

			`/// Adjust a scaling value for dimming for video (value will never go below 1)`
			`LIB8STATIC uint8_t dim8_video( uint8_t x)`
			`{`
			`return scale8_video( x, x);`
			`}`

			`/// Linear version of the dimming function that halves for values < 128`
			`LIB8STATIC uint8_t dim8_lin( uint8_t x )`
			`{`
			`if( x & 0x80 ) {`
			`x = scale8( x, x);`
			`} else {`
			`x += 1;`
			`x /= 2;`
			`}`
			`return x;`
			`}`

			`/// inverse of the dimming function, brighten a value`
			`LIB8STATIC uint8_t brighten8_raw( uint8_t x)`
			`{`
			`uint8_t ix = 255 - x;`
			`return 255 - scale8( ix, ix);`
			`}`

			`/// inverse of the dimming function, brighten a value`
			`LIB8STATIC uint8_t brighten8_video( uint8_t x)`
			`{`
			`uint8_t ix = 255 - x;`
			`return 255 - scale8_video( ix, ix);`
			`}`

			`/// inverse of the dimming function, brighten a value`
			`LIB8STATIC uint8_t brighten8_lin( uint8_t x )`
			`{`
			`uint8_t ix = 255 - x;`
			`if( ix & 0x80 ) {`
			`ix = scale8( ix, ix);`
			`} else {`
			`ix += 1;`
			`ix /= 2;`
			`}`
			`return 255 - ix;`
			`}`

			`///@}`
			`#endif`