/* msm-swfill.c
 *
 * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Code Aurora nor
 *       the names of its contributors may be used to endorse or promote
 *       products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdint.h>

// TODO: Check if this is needed for fills.  Rename it?
// Shared software blit code.
#include "msm-swblits.h"

/* Alignment check macro: evaluates to non-zero when the address held in dst, taken modulo ALIGNMENT_BYTE_SIZE, */
/* equals REQUIRED_ALIGNMENT (pass 0 to ask "is dst aligned to ALIGNMENT_BYTE_SIZE bytes?").                    */
/* The pointer is converted through uintptr_t rather than int, so an address that does not fit in a signed int  */
/* cannot produce a negative remainder (which would make every non-zero REQUIRED_ALIGNMENT test fail).          */
#define SW_CHECK_ALIGNMENT(ALIGNMENT_BYTE_SIZE,dst,REQUIRED_ALIGNMENT) \
   (((uintptr_t) (dst) % (uintptr_t) (ALIGNMENT_BYTE_SIZE)) == (uintptr_t) (REQUIRED_ALIGNMENT))

/* Pitched alignment check macro: evaluates to non-zero when the address held in dst, taken modulo              */
/* ALIGNMENT_BYTE_SIZE, equals REQUIRED_ALIGNMENT and the magnitude of dpitch is a whole multiple of            */
/* ALIGNMENT_BYTE_SIZE.                                                                                         */
/* (Having the pitch aligned, as well as the pointer, ensures that the pointer keeps the same alignment each    */
/* time it is incremented by the pitch.)                                                                        */
/* spitch is accepted but is never examined or evaluated by this macro.                                         */
/* The pointer is converted through uintptr_t rather than int, so an address that does not fit in a signed int  */
/* cannot produce a negative remainder.                                                                         */
#define SW_CHECK_PITCHED_ALIGNMENT(ALIGNMENT_BYTE_SIZE,dst,dpitch,spitch,REQUIRED_ALIGNMENT) \
   ((((uintptr_t) (dst) % (uintptr_t) (ALIGNMENT_BYTE_SIZE)) == (uintptr_t) (REQUIRED_ALIGNMENT)) \
    && (abs(dpitch) % (ALIGNMENT_BYTE_SIZE)) == 0)



/* Store 'count' consecutive 16-bit pixels of value src starting at dst, without assuming anything about how */
/* dst is aligned.  Only counts 1..7 write anything; every other count (including 0) is a no-op.             */
/* Each store is the widest one that the current address allows, so the number of stores stays minimal.      */
static inline void
memset16_NoAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int count)
{
   // Replicate the pixel so that a single store can cover two or four pixels.
   const uint32_t pixelPair = ((uint32_t) src << 16) | (uint32_t) src;
   const uint64_t pixelQuad = ((uint64_t) pixelPair << 32) | (uint64_t) pixelPair;

   uint8_t *cursor = dst;
   int remaining = count;

   if (remaining < 1 || remaining > 7) {
      return;
   }

   if (!SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,0)) {
      // Not word-aligned: a single half-word store comes first, the rest goes out as word stores,
      // and an even count leaves exactly one trailing half-word.
      *(uint16_t *)(cursor) = src;
      cursor += BYTES_PER_UINT16_T;
      remaining -= 1;

      while (remaining >= 2) {
         *(uint32_t *)(cursor) = pixelPair;
         cursor += BYTES_PER_UINT32_T;
         remaining -= 2;
      }

      if (remaining != 0) {
         *(uint16_t *)(cursor) = src;
      }
      return;
   }

   // Word-aligned but not double-word-aligned: one leading word store precedes any double-word store.
   if (remaining >= 2 && !SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,0)) {
      *(uint32_t *)(cursor) = pixelPair;
      cursor += BYTES_PER_UINT32_T;
      remaining -= 2;
   }

   // Finish with at most one double-word, one word and one half-word store, in that order.
   if (remaining >= 4) {
      *(uint64_t *)(cursor) = pixelQuad;
      cursor += BYTES_PER_UINT64_T;
      remaining -= 4;
   }
   if (remaining >= 2) {
      *(uint32_t *)(cursor) = pixelPair;
      cursor += BYTES_PER_UINT32_T;
      remaining -= 2;
   }
   if (remaining != 0) {
      *(uint16_t *)(cursor) = src;
   }
}


/* Store 'count' consecutive 16-bit pixels of value src starting at dst.                                      */
/* No alignment tests are made: dst is assumed by the caller to already be Neon-aligned, which the original   */
/* author notes also guarantees double-word, word and half-word alignment.                                    */
/* Only counts 1..7 write anything; every other count (including 0) is a no-op.                               */
static inline void
memset16_NeonAlignmentAssumptions_UpTo7Count(uint8_t *dst, uint16_t src, int count)
{
   // Replicate the pixel so that a single store can cover two or four pixels.
   const uint32_t pixelPair = ((uint32_t) src << 16) | (uint32_t) src;
   const uint64_t pixelQuad = ((uint64_t) pixelPair << 32) | (uint64_t) pixelPair;

   uint8_t *cursor = dst;

   if (count < 1 || count > 7) {
      return;
   }

   // The binary digits of count select the stores: 4 pixels -> double-word, 2 -> word, 1 -> half-word.
   if ((count & 4) != 0) {
      *(uint64_t *)(cursor) = pixelQuad;
      cursor += BYTES_PER_UINT64_T;
   }
   if ((count & 2) != 0) {
      *(uint32_t *)(cursor) = pixelPair;
      cursor += BYTES_PER_UINT32_T;
   }
   if ((count & 1) != 0) {
      *(uint16_t *)(cursor) = src;
   }
}


static inline void
memset16_AssumesNeonAlignment(uint8_t *dst, uint16_t src, int count)
{
   // Get the number of pixels that can be packed into the eight lanes of a 16x8 vector of uint16_t words.
   const int PIXELS_IN_VECTOR_COLUMN = BYTES_PER_UINT16X8_T / BYTES_PER_UINT16_T;

   // Duplicate the 16-bit source value into 8 lanes of a 16x8 vector (8 pixels).
   uint16x8_t packedSource128 = vdupq_n_u16(src);

   // Quickly branch to customized code for each width.
   switch (count / PIXELS_IN_VECTOR_COLUMN)
   {
      // Cases are designed to be near-optimal in terms of number of operations, but they don't attempt to align memory access.
      // (This can result in slowdowns unless the function is called with an aligned destination pointer.)
      case 0:  break;
      case  1:
               {
                  // If we get here, we can assume there are 8 pixels to copy,
                  // so copy one vector worth of pixels.
                  const int ONE_COLUMN = 1;
                  count -=  PIXELS_IN_VECTOR_COLUMN * ONE_COLUMN;
                  vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
                  dst += BYTES_PER_UINT16X8_T * ONE_COLUMN;
               }
               break;

      case 2:
               {
                  // If we get here, we can assume there are 16 pixels to copy,
                  // so copy two vectors worth of pixels.
                  const int TWO_COLUMNS = 2;
                  count -=  PIXELS_IN_VECTOR_COLUMN * TWO_COLUMNS;
                  vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
                  vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128);
                  dst += BYTES_PER_UINT16X8_T * TWO_COLUMNS;
               }
               break;

      case 3:
               {
                  // If we get here, we can assume there are 24 pixels to copy,
                  // so copy three vectors worth of pixels.
                  const int THREE_COLUMNS = 3;
                  count -=  PIXELS_IN_VECTOR_COLUMN * THREE_COLUMNS;
                  vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
                  vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128);
                  vst1q_u16((uint16_t *)(dst+2*BYTES_PER_UINT16X8_T),packedSource128);
                  dst += BYTES_PER_UINT16X8_T * THREE_COLUMNS;
               }
               break;

      case 4:
               {
                  // If we get here, we can assume there are 32 pixels to copy,
                  // so copy three vectors worth of pixels.
                  const int FOUR_COLUMNS = 4;
                  count -=  PIXELS_IN_VECTOR_COLUMN * FOUR_COLUMNS;
                  vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
                  vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128);
                  vst1q_u16((uint16_t *)(dst+2*BYTES_PER_UINT16X8_T),packedSource128);
                  vst1q_u16((uint16_t *)(dst+3*BYTES_PER_UINT16X8_T),packedSource128);
                  dst += BYTES_PER_UINT16X8_T * FOUR_COLUMNS;
               }
               break;

      default:
               {
                  // Copy multiple columns of a vector -- eight vectors at a time.
                  const int EIGHT_COLUMNS = 8;
                  while (count >= PIXELS_IN_VECTOR_COLUMN * EIGHT_COLUMNS) {
                     count -=  PIXELS_IN_VECTOR_COLUMN * EIGHT_COLUMNS;
                     vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+2*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+3*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+4*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+5*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+6*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+7*BYTES_PER_UINT16X8_T),packedSource128);

                     dst += BYTES_PER_UINT16X8_T * EIGHT_COLUMNS;
                  }

                  // If we get here, we can assume there are less than 64 pixels to copy.
                  // Copy multiple columns of a vector -- up to four vectors (32 pixels).
                  const int FOUR_COLUMNS = 4;
                  if (count >= PIXELS_IN_VECTOR_COLUMN * FOUR_COLUMNS) {
                     count -=  PIXELS_IN_VECTOR_COLUMN * FOUR_COLUMNS;
                     vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+2*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+3*BYTES_PER_UINT16X8_T),packedSource128);
                     dst += BYTES_PER_UINT16X8_T * FOUR_COLUMNS;
                  }

                  // If we get here, we can assume there are less than 32 pixels to copy.
                  // Copy multiple columns of a vector -- up to two vectors (16 pixels).
                  const int TWO_COLUMNS = 2;
                  if (count >= PIXELS_IN_VECTOR_COLUMN * TWO_COLUMNS) {
                     count -=  PIXELS_IN_VECTOR_COLUMN * TWO_COLUMNS;
                     vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
                     vst1q_u16((uint16_t *)(dst+1*BYTES_PER_UINT16X8_T),packedSource128);
                     dst += BYTES_PER_UINT16X8_T * TWO_COLUMNS;
                  }

                  // If we get here, we can assume there are less than 16 pixels to copy.
                  // If there is one vector left (with eight pixels), then copy it.
                  const int ONE_COLUMN = 1;
                  if (count >= PIXELS_IN_VECTOR_COLUMN * ONE_COLUMN) {
                     count -=  PIXELS_IN_VECTOR_COLUMN * ONE_COLUMN;
                     vst1q_u16((uint16_t *)(dst+0*BYTES_PER_UINT16X8_T),packedSource128);
                     dst += BYTES_PER_UINT16X8_T * ONE_COLUMN;
                  }
               }
               break;
   }

   // Quickly fill remaining pixels (up to 7).
   memset16_NeonAlignmentAssumptions_UpTo7Count(dst, src, count);
}


/* Fill 'count' consecutive 16-bit pixels at dst with src.                                                   */
/* Runs shorter than one Neon vector are written entirely with scalar stores.  Longer runs are split into an */
/* unaligned head (scalar stores up to the next vector boundary) and a remainder that is passed to           */
/* memset16_AssumesNeonAlignment.                                                                            */
/* (dst is assumed to be pixel-aligned, i.e. half-word-aligned.)                                             */
static inline void
memset16_Test(uint16_t *dst, uint16_t src, int count)
{
   // Get the number of pixels that can be packed into the eight lanes of a 16x8 vector of uint16_t words.
   const int PIXELS_IN_VECTOR_COLUMN = BYTES_PER_UINT16X8_T / BYTES_PER_UINT16_T;

   // For narrow widths, do an optimized fill that makes no alignment assumptions.
   if (count < PIXELS_IN_VECTOR_COLUMN)
   {
      memset16_NoAlignmentAssumptions_UpTo7Count((uint8_t *) dst, src, count);
   }

   // Otherwise, if the run is not Neon-aligned, first fill the unaligned portion,
   // then fill the rest using Neon operations followed by non-Neon operations for the tail.
   else {
      // Compute the misalignment (in pixels) from the optimal alignment, taken to be the size of a Neon vector.
      // The address is inspected through uintptr_t: converting a pointer to int is not guaranteed to be
      // representable and would truncate the address on targets where pointers are wider than int.
      const int pixelMisalignment =
         (int) (((uintptr_t) dst & (uintptr_t) (BYTES_PER_UINT16X8_T - 1)) / BYTES_PER_UINT16_T);
      if (pixelMisalignment != 0)
      {
         // Compute the number of pixels to fill that would align the rest of the run.
         // NOTE: Since count is guaranteed to be >= PIXELS_IN_VECTOR_COLUMN at this point,
         //       pixelsToCopyForAlignment is guaranteed to be less than count.
         const int pixelsToCopyForAlignment = PIXELS_IN_VECTOR_COLUMN - pixelMisalignment;
         count -= pixelsToCopyForAlignment;

         // Don't assume any pre-existing alignment when filling up to PIXELS_IN_VECTOR_COLUMN - 1 (7 for 16bpp).
         memset16_NoAlignmentAssumptions_UpTo7Count((uint8_t *) dst, src, pixelsToCopyForAlignment);
         dst += pixelsToCopyForAlignment;
      }

      // Fill the remaining pixels using Neon and non-Neon instructions.
      // NOTE: dst is now on a Neon vector boundary, provided it was pixel-aligned to begin with.
      memset16_AssumesNeonAlignment((uint8_t *) dst, src, count);
   }
}

/* Do multiple row fills with a specific memory set function.                                                 */
/* The macro operates directly on the enclosing function's variables dst, src, w, h and dpitch:               */
/* MEMSET_FUNCTION((void *) dst, src, w) is called once per row, h is counted down to zero and dst is         */
/* advanced by dpitch after every row, so both h and dst are modified by the time the macro finishes.         */
/* The two hook arguments are each invoked (with an empty argument list) exactly once: the first before the   */
/* row loop starts and the second after it ends -- they are NOT invoked per row.                              */
/* A zero or negative h fills no rows; the loop tests h > 0 so that a negative height cannot run away         */
/* through memory.                                                                                            */
#define DO_MULTIPLE_FILLS_WITH_MEMSET(MEMSET_FUNCTION,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS) \
do { \
   BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \
\
   /* Draw one row at a time, in the most efficient way. */ \
   while (h > 0) { \
      h -= 1; \
\
      MEMSET_FUNCTION((void *) (dst), src, w); \
\
      dst += dpitch; \
   } \
\
   UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \
\
} while (0)


/* Fill a rectangle of 16bpp pixels, w pixels wide and h rows high, with src.                              */
/* dst addresses the first pixel and dpitch is the byte distance from one row to the next.                 */
/* (dst is assumed to be half-word-aligned, which should be guaranteed for 16bpp.)                         */
/* NOTE(review): blockSignalsForVFP is never named in this body; presumably the signal-blocking macros     */
/* consult it -- confirm against their definitions.                                                        */

static inline void
swFillRect16Bpp_Unaligned(unsigned char *dst, uint16_t src, int w, int h, int dpitch, BOOL blockSignalsForVFP)
{
   // Columns exactly one pixel wide are handed straight to the dedicated 2-byte-wide routine;
   // no further alignment tests are made before calling it.
   if (w == 1) {
      swFill2ByteWideRectangle_HalfWordAligned(dst, src, h, dpitch);
      return;
   }

   if (w >= 64) {
      // Wide rectangles: the signal block/unblock operations are passed to the row-fill macro as its hooks.
      // NOTE(review): as defined in this file the macro runs its hooks once around the whole row loop,
      // not once per row -- confirm which of the two is intended.
      DO_MULTIPLE_FILLS_WITH_MEMSET(memset16_Test,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
   }
   else {
      // Narrow rectangles: block signals here, once for the entire rectangle, and give the macro no-op hooks.
      BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
      DO_MULTIPLE_FILLS_WITH_MEMSET(memset16_Test,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
      UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
   }
}


/* Fill a rectangle of 32bpp pixels, w pixels wide and h rows high, with src.                              */
/* dst addresses the first pixel and dpitch is the byte distance from one row to the next.                 */
/* (dst is assumed to be word-aligned, which should be guaranteed for 32bpp.)                              */
/* NOTE(review): blockSignalsForVFP is never named in this body; presumably the signal-blocking macros     */
/* consult it -- confirm against their definitions.                                                        */

static inline void
swFillRect32Bpp_Unaligned(unsigned char *dst, uint32_t src, int w, int h, int dpitch, BOOL blockSignalsForVFP)
{
   // Columns exactly one pixel wide are handed straight to the dedicated 4-byte-wide routine;
   // no further alignment tests are made before calling it.
   if (w == 1) {
      swFill4ByteWideRectangle_WordAligned(dst, src, h, dpitch);
      return;
   }

   if (w >= 32) {
      // Wide rectangles: the signal block/unblock operations are passed to the row-fill macro as its hooks.
      // NOTE(review): as defined in this file the macro runs its hooks once around the whole row loop,
      // not once per row -- confirm which of the two is intended.
      DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
   }
   else {
      // Narrow rectangles: block signals here, once for the entire rectangle, and give the macro no-op hooks.
      BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
      DO_MULTIPLE_FILLS_WITH_MEMSET(memset32,SIGNAL_BLOCK_NOOP,SIGNAL_BLOCK_NOOP);
      UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
   }
}


/* Perform a solid fill of a w x h pixel rectangle whose top-left corner is at (x, y).                       */
/*                                                                                                           */
/*   pMsm                    driver state; only its fbmem member is read, as the base address.               */
/*   byteOffset              byte offset added to pMsm->fbmem to reach the destination surface.              */
/*   destSurfaceWidthPixels  surface width in pixels; multiplied by the pixel size to get the row pitch.     */
/*   x, y, w, h              rectangle position and size in pixels.                                          */
/*   src                     fill value; only the low 16 bits are used when bitsPerPixel is 16.              */
/*   bitsPerPixel            16 or 32; any other depth draws nothing.                                        */
/*   blockSignalsForVFP      passed through unchanged to the depth-specific fill routines.                   */
/*                                                                                                           */
/* Rectangles with a zero or negative width or height draw nothing.                                          */

void swFill(MSMPtr pMsm, int byteOffset, int destSurfaceWidthPixels,
            int x, int y, int w, int h, uint32_t src,
            int bitsPerPixel, BOOL blockSignalsForVFP)
{
   // Reject empty and negative-sized rectangles up front: the row-fill code below counts h down
   // until it reaches zero and passes w on as a pixel count, so neither may be negative.
   if (w <= 0 || h <= 0) {
      return;
   }

   int bytesPerPixel = bitsPerPixel / 8;

   int dpitch = destSurfaceWidthPixels * bytesPerPixel;

   uint8_t *dst = (uint8_t *)(pMsm->fbmem + byteOffset + y * dpitch + x * bytesPerPixel);

   // This is a trivial one-pixel fill that avoids most overhead.
   // (This makes the 1x1 case significantly faster and there is reason to believe this is a common case.)
   if (h == 1 && w == 1) {
      switch (bitsPerPixel) {
         case 16: *(uint16_t *)dst = (uint16_t) src;   // deliberate truncation to the 16-bit pixel
                  break;
         case 32: *(uint32_t *)dst = src;
                  break;
         default: break;
      }
      return;
   }

   // Call BPP-specific code to draw pixels.
   switch (bitsPerPixel) {
      case 16: swFillRect16Bpp_Unaligned(dst, (uint16_t) src, w, h, dpitch, blockSignalsForVFP);
               break;
      case 32: swFillRect32Bpp_Unaligned(dst, src, w, h, dpitch, blockSignalsForVFP);
               break;
      default: return;
   }
}
