/*
    libfame - Fast Assembly MPEG Encoder Library
    Copyright (C) 2000-2001 Vivien Chappelier

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*********************** compute mean of pixels in block *********************/
#ifndef __MEAN_MMX_H__
#define __MEAN_MMX_H__

#ifdef HAS_MMX
/*  mean_withoutmask                                                         */
/*                                                                           */
/*  Description:                                                             */
/*    Compute mean of pixel values in an 8x8 block.                          */
/*                                                                           */
/*  Arguments:                                                               */
/*    unsigned char *input: the input macroblock                             */
/*    unsigned long pitch: number of pixels to the next line                 */
/*    unsigned long *mean: the returned mean.                                */
/*                                                                           */
/*  Return value:                                                            */
/*    unsigned short : pixel count.                                          */

static inline unsigned short mean_withoutmask(unsigned char *input,
					      unsigned long pitch,
					      unsigned long *mean)
{
  /* Sums the 8x8 bytes starting at `input` (8 bytes per line, lines     */
  /* `pitch` bytes apart), stores sum/64 in *mean and returns 64.        */
  /*                                                                     */
  /* Everything is done in ONE asm statement: the accumulator lives in   */
  /* mm5, and the compiler gives no guarantee that an MMX register keeps */
  /* its value between two separate asm statements.  All MMX registers   */
  /* touched are declared as clobbers for the same reason.               */
  /*                                                                     */
  /* __asm__/__volatile__ are used so the header also builds in strict   */
  /* ISO modes, where plain `asm` is not a keyword.                      */
  /*                                                                     */
  /* No emms is executed here.                                           */
  /* NOTE(review): presumably the caller issues emms before any x87      */
  /* floating point code runs - confirm.                                 */
  /* NOTE(review): `pitch` is used as an index register next to a        */
  /* pointer, so unsigned long must be pointer-sized (ILP32 or LP64).    */
  unsigned char *line; /* running line pointer, advanced by the asm */
  unsigned int sum;    /* sum of the 64 pixels, at most 64 * 255     */

  /* One step consumes two lines: %0 = line pointer, %1 = pitch.         */
  /* `add` carries no size suffix so the assembler picks the operand     */
  /* size from the registers (32 or 64 bits).                            */
#define MEAN_STEP()                                                          \
      "movq (%0),     %%mm0\n"   /* load 1st texture line */                 \
      "movq (%0, %1), %%mm2\n"   /* load 2nd texture line */                 \
      "movq %%mm0, %%mm1\n"      /* copy 1st texture line */                 \
      "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */                 \
      "punpcklbw %%mm7, %%mm0\n" /* low 4 bytes of line 1 -> words */        \
      "punpckhbw %%mm7, %%mm1\n" /* high 4 bytes of line 1 -> words */       \
      "punpcklbw %%mm7, %%mm2\n" /* low 4 bytes of line 2 -> words */        \
      "punpckhbw %%mm7, %%mm3\n" /* high 4 bytes of line 2 -> words */       \
      "paddw %%mm0, %%mm5\n"     /* accumulate texture */                    \
      "paddw %%mm1, %%mm5\n"     /* accumulate texture */                    \
      "paddw %%mm2, %%mm5\n"     /* accumulate texture */                    \
      "paddw %%mm3, %%mm5\n"     /* accumulate texture */                    \
      "add %1, %0\n"             /* move one texture line down */            \
      "add %1, %0\n"             /* move one texture line down */

  __asm__ __volatile__ (
      "pxor %%mm7, %%mm7\n"      /* mm7 = zero (for byte->word unpack) */
      "pxor %%mm5, %%mm5\n"      /* mm5 = 4 x 16-bit texture accumulators */
      MEAN_STEP()
      MEAN_STEP()
      MEAN_STEP()
      MEAN_STEP()
      /* horizontal sum of the four 16-bit partial sums held in mm5 */
      "movq %%mm5, %%mm1\n"      /* copy column pixel partial sums */
      "psrlq $32, %%mm5\n"       /* move high dword low */
      "paddw %%mm5, %%mm1\n"     /* words 0,1 += words 2,3 */
      "movq %%mm1, %%mm0\n"      /* copy column pixel partial sums */
      "psrlq $16, %%mm0\n"       /* move word 1 down to word 0 */
      "paddw %%mm0, %%mm1\n"     /* word 0 = total sum */
      "movd %%mm1, %2\n"         /* get result in a general register */
      "andl $0x0000ffff, %2\n"   /* keep only word 0 */
      : "=r"(line), "=r"(pitch), "=r"(sum)
      : "0"(input), "1"(pitch)
      : "memory", "mm0", "mm1", "mm2", "mm3", "mm5", "mm7");

  *mean = sum >> 6;              /* divide by the 64 pixels */
  return(64);
}

/*  mean_withmask                                                            */
/*                                                                           */
/*  Description:                                                             */
/*    Compute mean of pixel values in an 8x8 block using a binary mask.      */
/*                                                                           */
/*  Arguments:                                                               */
/*    unsigned char *input: the input macroblock                             */
/*    unsigned char *mask: the input binary alpha block                      */
/*    unsigned long pitch: number of pixels to the next line                 */
/*    unsigned long *mean: the returned mean.                                */
/*                                                                           */
/*  Return value:                                                            */
/*    unsigned short : pixel count.                                          */

static inline unsigned short mean_withmask(unsigned char *input,
					   unsigned char *mask,
					   unsigned long pitch,
					   unsigned long *mean)
{
  /* Walks an 8x8 block of `input` and the matching 8x8 block of `mask`  */
  /* (both 8 bytes per line, lines `pitch` bytes apart).  Pixels whose   */
  /* mask byte passes the test below are summed and counted; *mean gets  */
  /* sum/count (0 when count is 0) and the count is returned.            */
  /*                                                                     */
  /* Everything is done in ONE asm statement: the accumulators live in   */
  /* mm5/mm6, and the compiler gives no guarantee that MMX registers     */
  /* keep their value between two separate asm statements.  All MMX      */
  /* registers touched are declared as clobbers for the same reason.     */
  /*                                                                     */
  /* __asm__/__volatile__ are used so the header also builds in strict   */
  /* ISO modes, where plain `asm` is not a keyword.                      */
  /*                                                                     */
  /* NOTE(review): pcmpgtb is a SIGNED byte compare against zero, so     */
  /* only mask bytes 0x01..0x7f count as visible; 0x80..0xff are treated */
  /* like 0.  Confirm that the mask never holds such values.             */
  /* NOTE(review): no emms is executed here; presumably the caller       */
  /* issues it before any x87 floating point code runs - confirm.        */
  /* NOTE(review): `pitch` is used as an index register next to a        */
  /* pointer, so unsigned long must be pointer-sized (ILP32 or LP64).    */
  unsigned char *tex_line;  /* running texture pointer, advanced by asm */
  unsigned char *mask_line; /* running mask pointer, advanced by asm    */
  unsigned int count;       /* number of visible pixels, 0..64          */
  unsigned int sum;         /* sum of visible pixels, at most 64 * 255  */

  /* One step consumes two lines: %0 = texture, %1 = pitch, %2 = mask.   */
  /* `add` carries no size suffix so the assembler picks the operand     */
  /* size from the registers (32 or 64 bits).                            */
#define MEAN_MASK_STEP()                                                     \
      "movq (%0),     %%mm0\n"   /* load 1st texture line */                 \
      "movq (%0, %1), %%mm2\n"   /* load 2nd texture line */                 \
      "movq (%2),     %%mm1\n"   /* load 1st mask line */                    \
      "movq (%2, %1), %%mm3\n"   /* load 2nd mask line */                    \
      "pcmpgtb %%mm7, %%mm1\n"   /* 1st mask line -> 0xff where mask > 0 */  \
      "pcmpgtb %%mm7, %%mm3\n"   /* 2nd mask line -> 0xff where mask > 0 */  \
      "pand %%mm1, %%mm0\n"      /* zero hidden pixels of 1st line */        \
      "pand %%mm3, %%mm2\n"      /* zero hidden pixels of 2nd line */        \
      "psubsb %%mm1, %%mm6\n"    /* count: subtracting 0xff (-1) adds 1 */   \
      "psubsb %%mm3, %%mm6\n"    /* count: subtracting 0xff (-1) adds 1 */   \
      "movq %%mm0, %%mm1\n"      /* copy 1st texture line */                 \
      "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */                 \
      "punpcklbw %%mm7, %%mm0\n" /* low 4 bytes of line 1 -> words */        \
      "punpckhbw %%mm7, %%mm1\n" /* high 4 bytes of line 1 -> words */       \
      "punpcklbw %%mm7, %%mm2\n" /* low 4 bytes of line 2 -> words */        \
      "punpckhbw %%mm7, %%mm3\n" /* high 4 bytes of line 2 -> words */       \
      "paddw %%mm0, %%mm5\n"     /* accumulate texture */                    \
      "paddw %%mm1, %%mm5\n"     /* accumulate texture */                    \
      "paddw %%mm2, %%mm5\n"     /* accumulate texture */                    \
      "paddw %%mm3, %%mm5\n"     /* accumulate texture */                    \
      "add %1, %0\n"             /* move one texture line down */            \
      "add %1, %0\n"             /* move one texture line down */            \
      "add %1, %2\n"             /* move one mask line down */               \
      "add %1, %2\n"             /* move one mask line down */

  __asm__ __volatile__ (
      "pxor %%mm7, %%mm7\n"       /* mm7 = zero */
      "pxor %%mm6, %%mm6\n"       /* mm6 = 8 x 8-bit mask counters */
      "pxor %%mm5, %%mm5\n"       /* mm5 = 4 x 16-bit texture accumulators */
      MEAN_MASK_STEP()            /* lines 1-2 */
      MEAN_MASK_STEP()            /* lines 3-4 */
      MEAN_MASK_STEP()            /* lines 5-6 */
      MEAN_MASK_STEP()            /* lines 7-8 */
      /* horizontal sum of the eight per-column counters held in mm6 */
      "movq %%mm6, %%mm0\n"       /* copy column pixel counts */
      "psrlq $32, %%mm6\n"        /* move high dword low */
      "paddb %%mm6, %%mm0\n"      /* bytes 0..3 += bytes 4..7 */
      "movd %%mm0, %%eax\n"       /* get 4 packed counts to register eax */
      "movl %%eax, %%ecx\n"       /* copy register eax to register ecx */
      "bswap %%ecx\n"             /* cx now holds original bytes 3 and 2 */
      "addw %%cx, %%ax\n"         /* al = b0 + b3, ah = b1 + b2 */
      "addb %%ah, %%al\n"         /* al = b0 + b1 + b2 + b3 */
      "andl $0x000000ff, %%eax\n" /* keep only the final count */
      /* horizontal sum of the four 16-bit partial sums held in mm5 */
      "movq %%mm5, %%mm1\n"       /* copy column pixel partial sums */
      "psrlq $32, %%mm5\n"        /* move high dword low */
      "paddw %%mm5, %%mm1\n"      /* words 0,1 += words 2,3 */
      "movq %%mm1, %%mm0\n"       /* copy column pixel partial sums */
      "psrlq $16, %%mm0\n"        /* move word 1 down to word 0 */
      "paddw %%mm0, %%mm1\n"      /* word 0 = total sum */
      "movd %%mm1, %%ecx\n"       /* get result in ecx */
      "andl $0x0000ffff, %%ecx\n" /* keep only word 0 */
      : "=r"(tex_line), "=r"(pitch), "=r"(mask_line),
        "=&a"(count), "=&c"(sum)
      : "0"(input), "1"(pitch), "2"(mask)
      : "memory", "mm0", "mm1", "mm2", "mm3", "mm5", "mm6", "mm7");
#undef MEAN_MASK_STEP

  /* with no visible pixel the sum is already 0, so the mean stays 0 */
  if(count) sum /= count;

  *mean = sum;
  return(count);
}

#endif
#endif
