/*
    libfame - Fast Assembly MPEG Encoder Library
    Copyright (C) 2000-2001 Vivien Chappelier

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*********************** fetch data from current picture *********************/

/*
 * Fetch an 8x8 block of 8-bit samples and store it as 64 zero-extended
 * 16-bit words (0x80 bytes) at output.
 *
 * input  - top-left sample of the block; rows are pitch bytes apart
 * output - destination, written at offsets 0x00..0x7f
 *          (assumes dct_t is a 16-bit type -- TODO confirm)
 * mask   - ignored; kept so the signature matches the masked variants
 * pitch  - source row stride in bytes
 *
 * Uses mm0-mm3 as scratch and mm7 as a zero register.  No emms is executed
 * here.
 */
static void inline prefetch_withoutmask(unsigned char *input,
					dct_t *output,
					unsigned char *mask, /* unused */
					int pitch)
{
  int src_after; /* receives the advanced source register; never read */

  /* One step handles two source rows: load both, widen every byte to a
     word by interleaving with the zero register, advance the source by
     two rows, then store the four resulting quadwords.  x and y are the
     high hex digits of the output offsets of the first and second row. */
#define PREFETCH_NOMASK_STEP(x, y)					\
    "movq (%0),     %%mm0\n"        /* first row of the pair */	\
    "movq (%0, %2), %%mm2\n"        /* second row of the pair */	\
    "movq %%mm0, %%mm1\n"           /* keep a copy for the high half */ \
    "movq %%mm2, %%mm3\n"						\
    "punpcklbw %%mm7, %%mm0\n"      /* bytes 0-3 -> words */		\
    "punpckhbw %%mm7, %%mm1\n"      /* bytes 4-7 -> words */		\
    "punpcklbw %%mm7, %%mm2\n"						\
    "punpckhbw %%mm7, %%mm3\n"						\
    "addl %2, %0\n"                 /* source += pitch */		\
    "addl %2, %0\n"                 /* source += pitch */		\
    "movq %%mm0, 0x" #x "0(%1)\n"   /* first row, low half */		\
    "movq %%mm1, 0x" #x "8(%1)\n"   /* first row, high half */		\
    "movq %%mm2, 0x" #y "0(%1)\n"   /* second row, low half */		\
    "movq %%mm3, 0x" #y "8(%1)\n"   /* second row, high half */

  asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = 0, used for widening */
		PREFETCH_NOMASK_STEP(0, 1)
		PREFETCH_NOMASK_STEP(2, 3)
		PREFETCH_NOMASK_STEP(4, 5)
		PREFETCH_NOMASK_STEP(6, 7)
		: "=r"(src_after), "=r"(output), "=r"(pitch)
		: "0"(input), "1"(output), "2"(pitch)
		: "memory");

#undef PREFETCH_NOMASK_STEP
}


/*
 * Fetch an 8x8 block of samples under a mask, padding the masked-out
 * samples with the mean of the visible ones.
 *
 * The work is split over four asm statements that hand state to each other
 * through MMX registers (mm7 = zero throughout, mm5/mm6 = accumulators,
 * later mm6 = replicated mean) and through eax/ecx, so their order must
 * not change:
 *   1. sum the samples whose mask byte compares > 0 and count them,
 *   2. reduce the packed sum and count to scalars (mean, count),
 *   3. after mean /= count in C, replicate the mean byte over mm6,
 *   4. re-read the block, substitute the mean where the mask is clear and
 *      store the result to output as zero-extended 16-bit words.
 *
 * input  - top-left sample of the block; rows are pitch bytes apart
 * output - receives 0x80 bytes (offsets 0x00..0x7f)
 * mask   - mask bytes, addressed with the same pitch as input
 * pitch  - row stride in bytes for both input and mask
 *
 * NOTE(review): pcmpgtb is a signed byte compare, so mask bytes >= 0x80
 * are treated as "not visible" -- confirm the mask value range.
 * NOTE(review): pointers are advanced with 32-bit addl and no emms is
 * executed here; presumably 32-bit x86 only, with emms left to the caller.
 */
static void inline prefetch_Y_withmask(unsigned char *input,
				       dct_t *output,
				       unsigned char *mask,
				       int pitch)
{
  int dummy1, dummy2; /* receive the advanced input/mask registers; never read */
  int mean, count;    /* sum (then mean) and number of visible samples */

  /* compute mean of visible pixels */
  asm volatile (
	"pxor %%mm7, %%mm7\n"      /* mm7 = zero        */
	"pxor %%mm6, %%mm6\n"      /* mm6 = mask    accumulator */
	"pxor %%mm5, %%mm5\n"      /* mm5 = texture accumulator */
	/* 1st pass */
	"movq (%0),     %%mm0\n"   /* load 1st texture line */
	"movq (%0, %1), %%mm2\n"   /* load 2nd texture line */
	"movq (%2),     %%mm1\n"   /* load 1st mask line */
	"movq (%2, %1), %%mm3\n"   /* load 2nd mask line */
	"pcmpgtb %%mm7, %%mm1\n"   /* 1st mask line: 0xFF where mask > 0 (signed), else 0 */
	"pcmpgtb %%mm7, %%mm3\n"   /* 2nd mask line: 0xFF where mask > 0 (signed), else 0 */
	"pand %%mm1, %%mm0\n"        /* mask 1st texture line */
	"pand %%mm3, %%mm2\n"        /* mask 2nd texture line */
	"psubsb %%mm1, %%mm6\n"      /* accumulate mask: subtracting 0xFF (-1) adds 1 per visible pixel */
	"psubsb %%mm3, %%mm6\n"      /* accumulate mask */
	"movq %%mm0, %%mm1\n"      /* copy 1st texture line */
	"movq %%mm2, %%mm3\n"      /* copy 2nd texture line */
	"punpcklbw %%mm7, %%mm0\n" /* unpack texture to word for addition */
	"punpckhbw %%mm7, %%mm1\n" /* unpack texture to word for addition */
	"punpcklbw %%mm7, %%mm2\n" /* unpack texture to word for addition */
	"punpckhbw %%mm7, %%mm3\n" /* unpack texture to word for addition */
	"paddw %%mm0, %%mm5\n"       /* accumulate texture */
	"paddw %%mm1, %%mm5\n"       /* accumulate texture */
	"paddw %%mm2, %%mm5\n"       /* accumulate texture */
	"paddw %%mm3, %%mm5\n"       /* accumulate texture */
	"addl %1, %0\n"            /* move one texture line down */
	"addl %1, %0\n"            /* move one texture line down */
	"addl %1, %2\n"            /* move one mask line down */
	"addl %1, %2\n"            /* move one mask line down */
	/* 2nd pass */
	"movq (%0),     %%mm0\n"   /* load 3rd texture line */
	"movq (%0, %1), %%mm2\n"   /* load 4th texture line */
	"movq (%2),     %%mm1\n"   /* load 3rd mask line */
	"movq (%2, %1), %%mm3\n"   /* load 4th mask line */
	"pcmpgtb %%mm7, %%mm1\n"   /* saturate 3rd mask line */
	"pcmpgtb %%mm7, %%mm3\n"   /* saturate 4th mask line */
	"pand %%mm1, %%mm0\n"        /* mask 3rd texture line */
	"pand %%mm3, %%mm2\n"        /* mask 4th texture line */
	"psubsb %%mm1, %%mm6\n"      /* accumulate mask */
	"psubsb %%mm3, %%mm6\n"      /* accumulate mask */
	"movq %%mm0, %%mm1\n"      /* copy 3rd texture line */
	"movq %%mm2, %%mm3\n"      /* copy 4th texture line */
	"punpcklbw %%mm7, %%mm0\n" /* unpack texture to word for addition */
	"punpckhbw %%mm7, %%mm1\n" /* unpack texture to word for addition */
	"punpcklbw %%mm7, %%mm2\n" /* unpack texture to word for addition */
	"punpckhbw %%mm7, %%mm3\n" /* unpack texture to word for addition */
	"paddw %%mm0, %%mm5\n"       /* accumulate texture */
	"paddw %%mm1, %%mm5\n"       /* accumulate texture */
	"paddw %%mm2, %%mm5\n"       /* accumulate texture */
	"paddw %%mm3, %%mm5\n"       /* accumulate texture */
	"addl %1, %0\n"            /* move one texture line down */
	"addl %1, %0\n"            /* move one texture line down */
	"addl %1, %2\n"            /* move one mask line down */
	"addl %1, %2\n"            /* move one mask line down */
	/* 3rd pass */
	"movq (%0),     %%mm0\n"   /* load 5th texture line */
	"movq (%0, %1), %%mm2\n"   /* load 6th texture line */
	"movq (%2),     %%mm1\n"   /* load 5th mask line */
	"movq (%2, %1), %%mm3\n"   /* load 6th mask line */
	"pcmpgtb %%mm7, %%mm1\n"   /* saturate 5th mask line */
	"pcmpgtb %%mm7, %%mm3\n"   /* saturate 6th mask line */
	"pand %%mm1, %%mm0\n"        /* mask 5th texture line */
	"pand %%mm3, %%mm2\n"        /* mask 6th texture line */
	"psubsb %%mm1, %%mm6\n"      /* accumulate mask */
	"psubsb %%mm3, %%mm6\n"      /* accumulate mask */
	"movq %%mm0, %%mm1\n"      /* copy 5th texture line */
	"movq %%mm2, %%mm3\n"      /* copy 6th texture line */
	"punpcklbw %%mm7, %%mm0\n" /* unpack texture to word for addition */
	"punpckhbw %%mm7, %%mm1\n" /* unpack texture to word for addition */
	"punpcklbw %%mm7, %%mm2\n" /* unpack texture to word for addition */
	"punpckhbw %%mm7, %%mm3\n" /* unpack texture to word for addition */
	"paddw %%mm0, %%mm5\n"       /* accumulate texture */
	"paddw %%mm1, %%mm5\n"       /* accumulate texture */
	"paddw %%mm2, %%mm5\n"       /* accumulate texture */
	"paddw %%mm3, %%mm5\n"       /* accumulate texture */
	"addl %1, %0\n"            /* move one texture line down */
	"addl %1, %0\n"            /* move one texture line down */
	"addl %1, %2\n"            /* move one mask line down */
	"addl %1, %2\n"            /* move one mask line down */
	/* 4th pass (pointers are not advanced afterwards: last pair of lines) */
	"movq (%0),     %%mm0\n"   /* load 7th texture line */
	"movq (%0, %1), %%mm2\n"   /* load 8th texture line */
	"movq (%2),     %%mm1\n"   /* load 7th mask line */
	"movq (%2, %1), %%mm3\n"   /* load 8th mask line */
	"pcmpgtb %%mm7, %%mm1\n"   /* saturate 7th mask line */
	"pcmpgtb %%mm7, %%mm3\n"   /* saturate 8th mask line */
	"pand %%mm1, %%mm0\n"        /* mask 7th texture line */
	"pand %%mm3, %%mm2\n"        /* mask 8th texture line */
	"psubsb %%mm1, %%mm6\n"      /* accumulate mask */
	"psubsb %%mm3, %%mm6\n"      /* accumulate mask */
	"movq %%mm0, %%mm1\n"      /* copy 7th texture line */
	"movq %%mm2, %%mm3\n"      /* copy 8th texture line */
	"punpcklbw %%mm7, %%mm0\n" /* unpack texture to word for addition */
	"punpckhbw %%mm7, %%mm1\n" /* unpack texture to word for addition */
	"punpcklbw %%mm7, %%mm2\n" /* unpack texture to word for addition */
	"punpckhbw %%mm7, %%mm3\n" /* unpack texture to word for addition */
	"paddw %%mm0, %%mm5\n"       /* accumulate texture */
	"paddw %%mm1, %%mm5\n"       /* accumulate texture */
	"paddw %%mm2, %%mm5\n"       /* accumulate texture */
	"paddw %%mm3, %%mm5\n"       /* accumulate texture */
	: "=r"(dummy1), "=r"(pitch), "=r"(dummy2)
	: "0"(input), "1"(pitch), "2"(mask)
	: "memory");

  /* Reduce the packed accumulators left in mm6 (8 per-column byte counts)
     and mm5 (4 word partial sums) by the statement above to two scalars:
     count in eax, sum in ecx.  There are no input operands; the MMX
     registers are read implicitly. */
  asm volatile (/* line accumulation */
	"movq %%mm6, %%mm0\n"       /* copy column pixel counts */
	"psrlq $32, %%mm6\n"        /* move high dword low */
	"paddb %%mm6, %%mm0\n"      /* sum column counts */
	"movd %%mm0, %%eax\n"       /* get 4 packed counts to register eax */
	"movl %%eax, %%ecx\n"       /* copy register eax to register ecx */
	"bswap %%ecx\n"             /* reverse bytes so the two high counts land in cx */
	"addw %%cx, %%ax\n"         /* sum low words */
	"addb %%ah, %%al\n"         /* sum low bytes */
	"andl $0x000000ff, %%eax\n" /* mask final value */
	"movq %%mm5, %%mm1\n"       /* copy column pixel partial sums */
	"psrlq $32, %%mm5\n"        /* move high dword low */
	"paddw %%mm5, %%mm1\n"      /* sum partial sums */
	"movq %%mm1, %%mm0\n"       /* copy column pixel partial sums */
	"psrlq $16, %%mm0\n"        /* move high word low */
	"paddw %%mm0, %%mm1\n"      /* sum partial sums */
	"movd %%mm1, %%ecx\n"       /* get result in ecx */
	"andl $0x0000ffff, %%ecx\n" /* mask final value */
	: "=a" (count), "=c" (mean)
	);

  /* mean currently holds the sum; skip the division when nothing is visible */
  if(count) mean /= count;

  asm volatile (/* replicate mean for padding */
	"movd %%ecx, %%mm6\n"       /* mm6 will hold mean value */
	"punpcklbw %%mm6, %%mm6\n"  /* replicate mean byte to word */
	"punpcklwd %%mm6, %%mm6\n"  /* replicate mean to dword */
	"punpckldq %%mm6, %%mm6\n"  /* replicate mean to qword */
	:
	: "c" (mean));

  /* fetch and fill empty pixels with mean value */
  /* One step processes two lines; x and y are the high hex digits of the
     output offsets of the first and second line of the step.  Expects
     mm7 = zero and mm6 = replicated mean; uses mm0-mm5 as scratch. */
#define PREFETCH_Y_MASK_STEP(x, y)					\
    "movq (%0),     %%mm0\n"   /* load 1st texture line */		\
    "movq (%0, %2), %%mm2\n"   /* load 2nd texture line */		\
    "movq (%3),     %%mm1\n"   /* load 1st mask line */			\
    "movq (%3, %2), %%mm3\n"   /* load 2nd mask line */			\
    "pcmpgtb %%mm7, %%mm1\n"   /* saturate 1st mask line */		\
    "pcmpgtb %%mm7, %%mm3\n"   /* saturate 2nd mask line */		\
    "pand %%mm1, %%mm0\n"      /* mask 1st texture line */		\
    "pand %%mm3, %%mm2\n"      /* mask 2nd texture line */		\
    "pcmpeqb %%mm7, %%mm1\n"   /* invert mask */			\
    "pcmpeqb %%mm7, %%mm3\n"   /* invert mask */			\
    "movq %%mm6, %%mm4\n"      /* load mean value */			\
    "movq %%mm6, %%mm5\n"      /* load mean value */			\
    "pand %%mm1, %%mm4\n"      /* mask mean for 1st texture line */	\
    "pand %%mm3, %%mm5\n"      /* mask mean for 2nd texture line */	\
    "por %%mm4, %%mm0\n"       /* join texture and mean */		\
    "por %%mm5, %%mm2\n"       /* join texture and mean */		\
    "movq %%mm0, %%mm1\n"      /* copy 1st texture line */		\
    "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */		\
    "punpcklbw %%mm7, %%mm0\n" /* unpack texture to word */		\
    "punpckhbw %%mm7, %%mm1\n" /* unpack texture to word */		\
    "punpcklbw %%mm7, %%mm2\n" /* unpack texture to word */		\
    "punpckhbw %%mm7, %%mm3\n" /* unpack texture to word */		\
    "movq %%mm0, 0x" #x "0(%1)\n"   /* store texture */			\
    "movq %%mm1, 0x" #x "8(%1)\n"   /* store texture */			\
    "movq %%mm2, 0x" #y "0(%1)\n"   /* store texture */			\
    "movq %%mm3, 0x" #y "8(%1)\n"   /* store texture */			\
    "addl %2, %0\n"            /* move one texture line down */		\
    "addl %2, %0\n"            /* move one texture line down */		\
    "addl %2, %3\n"            /* move one mask line down */		\
    "addl %2, %3\n"            /* move one mask line down */
  
  /* four steps of two lines each cover the 8 lines of the block */
  asm volatile (PREFETCH_Y_MASK_STEP(0, 1)
		PREFETCH_Y_MASK_STEP(2, 3)
		PREFETCH_Y_MASK_STEP(4, 5)
		PREFETCH_Y_MASK_STEP(6, 7)
		: "=r"(dummy1), "=r"(output), "=r"(pitch), "=r"(dummy2)
		: "0"(input), "1"(output), "2"(pitch), "3"(mask)
		: "memory");

  /* TODO: bilinear filtering */
}

static void inline prefetch_C_withmask(unsigned char *input,
				       dct_t *output,
				       unsigned char *mask,
				       int pitch)
{
  int dummy1, dummy2;
  int mean, count;

  /* compute mean of visible pixels */
  asm volatile (
      "pxor %%mm7, %%mm7\n"      /* mm7 = zero        */
      "pxor %%mm6, %%mm6\n"      /* mm6 = mask    accumulator */
      "pxor %%mm5, %%mm5\n"      /* mm5 = texture accumulator */
      /* 1st pass */
      "movq (%2),     %%mm0\n"   /* load 1st mask line up left part */
      "movq (%2, %1, 2), %%mm1\n"   /* load 1st mask line down left part */
      "por %%mm0, %%mm1\n"       /* conservative subsample */
      "movq 8(%2),     %%mm0\n"  /* load 1st mask line up right part */
      "movq 8(%2, %1, 2), %%mm2\n"  /* load 1st mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm1\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %1, %2\n"            /* move one mask line down */
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "movq (%2),     %%mm0\n"   /* load 2nd mask line up left part */
      "movq (%2, %1, 2), %%mm3\n"   /* load 2nd mask line down left part */
      "por %%mm0, %%mm3\n"       /* conservative subsample */
      "movq 8(%2),     %%mm0\n"  /* load 2nd mask line up right part */
      "movq 8(%2, %1, 2), %%mm2\n"  /* load 2nd mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm3\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %1, %2\n"            /* move one mask line down */
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "movq (%0),     %%mm0\n"   /* load 1st texture line */
      "movq (%0, %1), %%mm2\n"   /* load 2nd texture line */
      "pand %%mm1, %%mm0\n"        /* mask 1st texture line */
      "pand %%mm3, %%mm2\n"        /* mask 2nd texture line */
      "psubsb %%mm1, %%mm6\n"      /* accumulate mask */
      "psubsb %%mm3, %%mm6\n"      /* accumulate mask */
      "movq %%mm0, %%mm1\n"      /* copy 1st texture line */
      "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */
      "punpcklbw %%mm7, %%mm0\n" /* unpack texture to word for addition */
      "punpckhbw %%mm7, %%mm1\n" /* unpack texture to word for addition */
      "punpcklbw %%mm7, %%mm2\n" /* unpack texture to word for addition */
      "punpckhbw %%mm7, %%mm3\n" /* unpack texture to word for addition */
      "paddw %%mm0, %%mm5\n"       /* accumulate texture */
      "paddw %%mm1, %%mm5\n"       /* accumulate texture */
      "paddw %%mm2, %%mm5\n"       /* accumulate texture */
      "paddw %%mm3, %%mm5\n"       /* accumulate texture */
      "addl %1, %0\n"            /* move one texture line down */
      "addl %1, %0\n"            /* move one texture line down */
      /* 2nd pass */
      "movq (%2),     %%mm0\n"   /* load 1st mask line up left part */
      "movq (%2, %1, 2), %%mm1\n"   /* load 1st mask line down left part */
      "por %%mm0, %%mm1\n"       /* conservative subsample */
      "movq 8(%2),     %%mm0\n"  /* load 1st mask line up right part */
      "movq 8(%2, %1, 2), %%mm2\n"  /* load 1st mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm1\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %1, %2\n"            /* move one mask line down */
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "movq (%2),     %%mm0\n"   /* load 2nd mask line up left part */
      "movq (%2, %1, 2), %%mm3\n"   /* load 2nd mask line down left part */
      "por %%mm0, %%mm3\n"       /* conservative subsample */
      "movq 8(%2),     %%mm0\n"  /* load 2nd mask line up right part */
      "movq 8(%2, %1, 2), %%mm2\n"  /* load 2nd mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm3\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %1, %2\n"            /* move one mask line down */
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "movq (%0),     %%mm0\n"   /* load 1st texture line */
      "movq (%0, %1), %%mm2\n"   /* load 2nd texture line */
      "pand %%mm1, %%mm0\n"        /* mask 1st texture line */
      "pand %%mm3, %%mm2\n"        /* mask 2nd texture line */
      "psubsb %%mm1, %%mm6\n"      /* accumulate mask */
      "psubsb %%mm3, %%mm6\n"      /* accumulate mask */
      "movq %%mm0, %%mm1\n"      /* copy 1st texture line */
      "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */
      "punpcklbw %%mm7, %%mm0\n" /* unpack texture to word for addition */
      "punpckhbw %%mm7, %%mm1\n" /* unpack texture to word for addition */
      "punpcklbw %%mm7, %%mm2\n" /* unpack texture to word for addition */
      "punpckhbw %%mm7, %%mm3\n" /* unpack texture to word for addition */
      "paddw %%mm0, %%mm5\n"       /* accumulate texture */
      "paddw %%mm1, %%mm5\n"       /* accumulate texture */
      "paddw %%mm2, %%mm5\n"       /* accumulate texture */
      "paddw %%mm3, %%mm5\n"       /* accumulate texture */
      "addl %1, %0\n"            /* move one texture line down */
      "addl %1, %0\n"            /* move one texture line down */
      /* 3rd pass */
      "movq (%2),     %%mm0\n"   /* load 1st mask line up left part */
      "movq (%2, %1, 2), %%mm1\n"   /* load 1st mask line down left part */
      "por %%mm0, %%mm1\n"       /* conservative subsample */
      "movq 8(%2),     %%mm0\n"  /* load 1st mask line up right part */
      "movq 8(%2, %1, 2), %%mm2\n"  /* load 1st mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm1\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "movq %%mm2, %%mm1\n"
      "addl %1, %2\n"            /* move one mask line down */
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "movq (%2),     %%mm0\n"   /* load 2nd mask line up left part */
      "movq (%2, %1, 2), %%mm3\n"   /* load 2nd mask line down left part */
      "por %%mm0, %%mm3\n"       /* conservative subsample */
      "movq 8(%2),     %%mm0\n"  /* load 2nd mask line up right part */
      "movq 8(%2, %1, 2), %%mm2\n"  /* load 2nd mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm3\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %1, %2\n"            /* move one mask line down */
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "movq (%0),     %%mm0\n"   /* load 1st texture line */
      "movq (%0, %1), %%mm2\n"   /* load 2nd texture line */
      "pand %%mm1, %%mm0\n"        /* mask 1st texture line */
      "pand %%mm3, %%mm2\n"        /* mask 2nd texture line */
      "psubsb %%mm1, %%mm6\n"      /* accumulate mask */
      "psubsb %%mm3, %%mm6\n"      /* accumulate mask */
      "movq %%mm0, %%mm1\n"      /* copy 1st texture line */
      "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */
      "punpcklbw %%mm7, %%mm0\n" /* unpack texture to word for addition */
      "punpckhbw %%mm7, %%mm1\n" /* unpack texture to word for addition */
      "punpcklbw %%mm7, %%mm2\n" /* unpack texture to word for addition */
      "punpckhbw %%mm7, %%mm3\n" /* unpack texture to word for addition */
      "paddw %%mm0, %%mm5\n"       /* accumulate texture */
      "paddw %%mm1, %%mm5\n"       /* accumulate texture */
      "paddw %%mm2, %%mm5\n"       /* accumulate texture */
      "paddw %%mm3, %%mm5\n"       /* accumulate texture */
      "addl %1, %0\n"            /* move one texture line down */
      "addl %1, %0\n"            /* move one texture line down */
      /* 4th pass */
      "movq (%2),     %%mm0\n"   /* load 1st mask line up left part */
      "movq (%2, %1, 2), %%mm1\n"   /* load 1st mask line down left part */
      "por %%mm0, %%mm1\n"       /* conservative subsample */
      "movq 8(%2),     %%mm0\n"  /* load 1st mask line up right part */
      "movq 8(%2, %1, 2), %%mm2\n"  /* load 1st mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm1\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %1, %2\n"            /* move one mask line down */
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "movq (%2),     %%mm0\n"   /* load 2nd mask line up left part */
      "movq (%2, %1, 2), %%mm3\n"   /* load 2nd mask line down left part */
      "por %%mm0, %%mm3\n"       /* conservative subsample */
      "movq 8(%2),     %%mm0\n"  /* load 2nd mask line up right part */
      "movq 8(%2, %1, 2), %%mm2\n"  /* load 2nd mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm3\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %1, %2\n"            /* move one mask line down */
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "addl %1, %2\n"            
      "movq (%0),     %%mm0\n"   /* load 1st texture line */
      "movq (%0, %1), %%mm2\n"   /* load 2nd texture line */
      "pand %%mm1, %%mm0\n"        /* mask 1st texture line */
      "pand %%mm3, %%mm2\n"        /* mask 2nd texture line */
      "psubsb %%mm1, %%mm6\n"      /* accumulate mask */
      "psubsb %%mm3, %%mm6\n"      /* accumulate mask */
      "movq %%mm0, %%mm1\n"      /* copy 1st texture line */
      "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */
      "punpcklbw %%mm7, %%mm0\n" /* unpack texture to word for addition */
      "punpckhbw %%mm7, %%mm1\n" /* unpack texture to word for addition */
      "punpcklbw %%mm7, %%mm2\n" /* unpack texture to word for addition */
      "punpckhbw %%mm7, %%mm3\n" /* unpack texture to word for addition */
      "paddw %%mm0, %%mm5\n"       /* accumulate texture */
      "paddw %%mm1, %%mm5\n"       /* accumulate texture */
      "paddw %%mm2, %%mm5\n"       /* accumulate texture */
      "paddw %%mm3, %%mm5\n"       /* accumulate texture */
      "addl %1, %0\n"            /* move one texture line down */
      "addl %1, %0\n"            /* move one texture line down */
      : "=r"(dummy1), "=r"(pitch), "=r"(dummy2)
      : "0"(input), "1"(pitch), "2"(mask)
      : "memory");

  asm volatile (/* line accumulation */
      "movq %%mm6, %%mm0\n"       /* copy column pixel counts */
      "psrlq $32, %%mm6\n"        /* move high dword low */
      "paddb %%mm6, %%mm0\n"      /* sum column counts */
      "movd %%mm0, %%eax\n"       /* get 4 packed counts to register eax */
      "movl %%eax, %%ecx\n"       /* copy register eax to register ecx */
      "bswap %%ecx\n"             /* swap temporary register ecx */
      "addw %%cx, %%ax\n"         /* sum low words */
      "addb %%ah, %%al\n"         /* sum low bytes */
      "andl $0x000000ff, %%eax\n" /* mask final value */
      "movq %%mm5, %%mm1\n"       /* copy column pixel partial sums */
      "psrlq $32, %%mm5\n"        /* move high dword low */
      "paddw %%mm5, %%mm1\n"      /* sum partial sums */
      "movq %%mm1, %%mm0\n"       /* copy column pixel partial sums */
      "psrlq $16, %%mm0\n"         /* move high word low */
      "paddw %%mm0, %%mm1\n"      /* sum partial sums */
      "movd %%mm1, %%ecx\n"       /* get result in ecx */
      "andl $0x0000ffff, %%ecx\n" /* mask final value */
      : "=a" (count), "=c" (mean)
      );

  if(count) mean /= count;

  /* replicate mean for padding */
  asm volatile (
      "movd %%ecx, %%mm6\n"       /* mm6 will hold mean value */
      "punpcklbw %%mm6, %%mm6\n"  /* replicate mean to dword */
      "punpcklwd %%mm6, %%mm6\n"  /* replicate mean to dword */
      "punpckldq %%mm6, %%mm6\n"  /* replicate mean to qword */
      :
      : "c" (mean));

  /* fetch and fill empty pixels with mean value */
  asm volatile (
      /* 1st pass */
      "movq (%3),     %%mm0\n"   /* load 1st mask line up left part */
      "movq (%3, %2, 2), %%mm1\n"   /* load 1st mask line down left part */
      "por %%mm0, %%mm1\n"       /* conservative subsample */
      "movq 8(%3),     %%mm0\n"  /* load 1st mask line up right part */
      "movq 8(%3, %2, 2), %%mm2\n"  /* load 1st mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm1\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %2, %3\n"            /* move one mask line down */
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "movq (%3),     %%mm0\n"   /* load 2nd mask line up left part */
      "movq (%3, %2, 2), %%mm3\n"   /* load 2nd mask line down left part */
      "por %%mm0, %%mm3\n"       /* conservative subsample */
      "movq 8(%3),     %%mm0\n"  /* load 2nd mask line up right part */
      "movq 8(%3, %2, 2), %%mm2\n"  /* load 2nd mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm3\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %2, %3\n"            /* move one mask line down */
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "movq (%0),     %%mm0\n"   /* load 1st texture line */
      "movq (%0, %2), %%mm2\n"   /* load 2nd texture line */
      "pand %%mm1, %%mm0\n"      /* mask 1st texture line */
      "pand %%mm3, %%mm2\n"      /* mask 2nd texture line */
      "pcmpeqb %%mm7, %%mm1\n"   /* invert mask */
      "pcmpeqb %%mm7, %%mm3\n"   /* invert mask */
      "movq %%mm6, %%mm4\n"      /* load mean value */
      "pand %%mm1, %%mm4\n"      /* mask mean for 1st texture line */
      "por %%mm4, %%mm0\n"       /* join texture and mean */
      "movq %%mm6, %%mm4\n"      /* load mean value */
      "pand %%mm3, %%mm4\n"      /* mask mean for 2nd texture line */
      "por %%mm4, %%mm2\n"       /* join texture and mean */
      "movq %%mm0, %%mm1\n"      /* copy 1st texture line */
      "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */
      "punpcklbw %%mm7, %%mm0\n" /* unpack texture to word */
      "punpckhbw %%mm7, %%mm1\n" /* unpack texture to word */
      "punpcklbw %%mm7, %%mm2\n" /* unpack texture to word */
      "punpckhbw %%mm7, %%mm3\n" /* unpack texture to word */
      "movq %%mm0, 0x00(%1)\n"   /* store texture */
      "movq %%mm1, 0x08(%1)\n"   /* store texture */
      "movq %%mm2, 0x10(%1)\n"   /* store texture */
      "movq %%mm3, 0x18(%1)\n"   /* store texture */
      "addl %2, %0\n"            /* move one texture line down */
      "addl %2, %0\n"            /* move one texture line down */
      /* 2nd pass */
      "movq (%3),     %%mm0\n"   /* load 1st mask line up left part */
      "movq (%3, %2, 2), %%mm1\n"   /* load 1st mask line down left part */
      "por %%mm0, %%mm1\n"       /* conservative subsample */
      "movq 8(%3),     %%mm0\n"  /* load 1st mask line up right part */
      "movq 8(%3, %2, 2), %%mm2\n"  /* load 1st mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm1\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %2, %3\n"            /* move one mask line down */
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "movq (%3),     %%mm0\n"   /* load 2nd mask line up left part */
      "movq (%3, %2, 2), %%mm3\n"   /* load 2nd mask line down left part */
      "por %%mm0, %%mm3\n"       /* conservative subsample */
      "movq 8(%3),     %%mm0\n"  /* load 2nd mask line up right part */
      "movq 8(%3, %2, 2), %%mm2\n"  /* load 2nd mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm3\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %2, %3\n"            /* move one mask line down */
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "movq (%0),     %%mm0\n"   /* load 1st texture line */
      "movq (%0, %2), %%mm2\n"   /* load 2nd texture line */
      "pand %%mm1, %%mm0\n"      /* mask 1st texture line */
      "pand %%mm3, %%mm2\n"      /* mask 2nd texture line */
      "pcmpeqb %%mm7, %%mm1\n"   /* invert mask */
      "pcmpeqb %%mm7, %%mm3\n"   /* invert mask */
      "movq %%mm6, %%mm4\n"      /* load mean value */
      "pand %%mm1, %%mm4\n"      /* mask mean for 1st texture line */
      "por %%mm4, %%mm0\n"       /* join texture and mean */
      "movq %%mm6, %%mm4\n"      /* load mean value */
      "pand %%mm3, %%mm4\n"      /* mask mean for 2nd texture line */
      "por %%mm4, %%mm2\n"       /* join texture and mean */
      "movq %%mm0, %%mm1\n"      /* copy 1st texture line */
      "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */
      "punpcklbw %%mm7, %%mm0\n" /* unpack texture to word */
      "punpckhbw %%mm7, %%mm1\n" /* unpack texture to word */
      "punpcklbw %%mm7, %%mm2\n" /* unpack texture to word */
      "punpckhbw %%mm7, %%mm3\n" /* unpack texture to word */
      "movq %%mm0, 0x20(%1)\n"   /* store texture */
      "movq %%mm1, 0x28(%1)\n"   /* store texture */
      "movq %%mm2, 0x30(%1)\n"   /* store texture */
      "movq %%mm3, 0x38(%1)\n"   /* store texture */
      "addl %2, %0\n"            /* move one texture line down */
      "addl %2, %0\n"            /* move one texture line down */
      /* 3rd pass */
      "movq (%3),     %%mm0\n"   /* load 1st mask line up left part */
      "movq (%3, %2, 2), %%mm1\n"   /* load 1st mask line down left part */
      "por %%mm0, %%mm1\n"       /* conservative subsample */
      "movq 8(%3),     %%mm0\n"  /* load 1st mask line up right part */
      "movq 8(%3, %2, 2), %%mm2\n"  /* load 1st mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm1\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %2, %3\n"            /* move one mask line down */
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "movq (%3),     %%mm0\n"   /* load 2nd mask line up left part */
      "movq (%3, %2, 2), %%mm3\n"   /* load 2nd mask line down left part */
      "por %%mm0, %%mm3\n"       /* conservative subsample */
      "movq 8(%3),     %%mm0\n"  /* load 2nd mask line up right part */
      "movq 8(%3, %2, 2), %%mm2\n"  /* load 2nd mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm3\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %2, %3\n"            /* move one mask line down */
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "movq (%0),     %%mm0\n"   /* load 1st texture line */
      "movq (%0, %2), %%mm2\n"   /* load 2nd texture line */
      "pand %%mm1, %%mm0\n"      /* mask 1st texture line */
      "pand %%mm3, %%mm2\n"      /* mask 2nd texture line */
      "pcmpeqb %%mm7, %%mm1\n"   /* invert mask */
      "pcmpeqb %%mm7, %%mm3\n"   /* invert mask */
      "movq %%mm6, %%mm4\n"      /* load mean value */
      "pand %%mm1, %%mm4\n"      /* mask mean for 1st texture line */
      "por %%mm4, %%mm0\n"       /* join texture and mean */
      "movq %%mm6, %%mm4\n"      /* load mean value */
      "pand %%mm3, %%mm4\n"      /* mask mean for 2nd texture line */
      "por %%mm4, %%mm2\n"       /* join texture and mean */
      "movq %%mm0, %%mm1\n"      /* copy 1st texture line */
      "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */
      "punpcklbw %%mm7, %%mm0\n" /* unpack texture to word */
      "punpckhbw %%mm7, %%mm1\n" /* unpack texture to word */
      "punpcklbw %%mm7, %%mm2\n" /* unpack texture to word */
      "punpckhbw %%mm7, %%mm3\n" /* unpack texture to word */
      "movq %%mm0, 0x40(%1)\n"   /* store texture */
      "movq %%mm1, 0x48(%1)\n"   /* store texture */
      "movq %%mm2, 0x50(%1)\n"   /* store texture */
      "movq %%mm3, 0x58(%1)\n"   /* store texture */
      "addl %2, %0\n"            /* move one texture line down */
      "addl %2, %0\n"            /* move one texture line down */
      /* 4th pass */
      "movq (%3),     %%mm0\n"   /* load 1st mask line up left part */
      "movq (%3, %2, 2), %%mm1\n"   /* load 1st mask line down left part */
      "por %%mm0, %%mm1\n"       /* conservative subsample */
      "movq 8(%3),     %%mm0\n"  /* load 1st mask line up right part */
      "movq 8(%3, %2, 2), %%mm2\n"  /* load 1st mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm1\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm1\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %2, %3\n"            /* move one mask line down */
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "movq (%3),     %%mm0\n"   /* load 2nd mask line up left part */
      "movq (%3, %2, 2), %%mm3\n"   /* load 2nd mask line down left part */
      "por %%mm0, %%mm3\n"       /* conservative subsample */
      "movq 8(%3),     %%mm0\n"  /* load 2nd mask line up right part */
      "movq 8(%3, %2, 2), %%mm2\n"  /* load 2nd mask line down right part */
      "por %%mm0, %%mm2\n"       /* conservative subsample */
      "packsswb %%mm2, %%mm3\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */
      "pcmpeqb %%mm7, %%mm3\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */
      "addl %2, %3\n"            /* move one mask line down */
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "addl %2, %3\n"            
      "movq (%0),     %%mm0\n"   /* load 1st texture line */
      "movq (%0, %2), %%mm2\n"   /* load 2nd texture line */
      "pand %%mm1, %%mm0\n"      /* mask 1st texture line */
      "pand %%mm3, %%mm2\n"      /* mask 2nd texture line */
      "pcmpeqb %%mm7, %%mm1\n"   /* invert mask */
      "pcmpeqb %%mm7, %%mm3\n"   /* invert mask */
      "movq %%mm6, %%mm4\n"      /* load mean value */
      "pand %%mm1, %%mm4\n"      /* mask mean for 1st texture line */
      "por %%mm4, %%mm0\n"       /* join texture and mean */
      "movq %%mm6, %%mm4\n"      /* load mean value */
      "pand %%mm3, %%mm4\n"      /* mask mean for 2nd texture line */
      "por %%mm4, %%mm2\n"       /* join texture and mean */
      "movq %%mm0, %%mm1\n"      /* copy 1st texture line */
      "movq %%mm2, %%mm3\n"      /* copy 2nd texture line */
      "punpcklbw %%mm7, %%mm0\n" /* unpack texture to word */
      "punpckhbw %%mm7, %%mm1\n" /* unpack texture to word */
      "punpcklbw %%mm7, %%mm2\n" /* unpack texture to word */
      "punpckhbw %%mm7, %%mm3\n" /* unpack texture to word */
      "movq %%mm0, 0x60(%1)\n"   /* store texture */
      "movq %%mm1, 0x68(%1)\n"   /* store texture */
      "movq %%mm2, 0x70(%1)\n"   /* store texture */
      "movq %%mm3, 0x78(%1)\n"   /* store texture */
      "addl %2, %0\n"            /* move one texture line down */
      "addl %2, %0\n"            /* move one texture line down */
      : "=r"(dummy1), "=r"(output), "=r"(pitch), "=r"(dummy2)
      : "0"(input), "1"(output), "2"(pitch), "3"(mask)
      : "memory");
  /* TODO: bilinear filtering */
}

/*
 * diff - store the 8x8 difference (input - ref) as 16-bit words.
 *
 * For each of 8 rows, 8 bytes are loaded from `input` and 8 bytes from
 * `ref`, each byte is zero-extended to a 16-bit word, and the reference
 * words are subtracted from the input words (psubsw).  Because both
 * operands are in 0..255 the result is in -255..255 and the saturation
 * of psubsw never triggers.
 *
 *   input  - first row of the current block
 *   ref    - first row of the reference block
 *   output - destination; 8 rows of 16 bytes (128 bytes) are written
 *            (assumes dct_t is a 16-bit type -- TODO confirm)
 *   ipitch - byte distance between consecutive rows of `input`
 *   rpitch - byte distance between consecutive rows of `ref`
 *
 * The routine uses mm0-mm3 and mm7 and does not issue emms.
 */
static void inline diff(unsigned char *input,
		        unsigned char *ref,
			dct_t *output,
			int ipitch,
			int rpitch)
{
  /* The asm advances both row pointers, so it works on local copies
     declared as read-write operands instead of int dummies tied to
     pointer inputs. */
  const unsigned char *src = input;
  const unsigned char *rsrc = ref;
  /* Strides are widened so that the suffix-less `add` below sees two
     registers of the same width; long is pointer-sized on ILP32 and
     LP64 ABIs (an LLP64 target would need another type). */
  long src_step = ipitch;
  long ref_step = rpitch;

  /* The MMX register names are only known to the compiler when MMX code
     generation is enabled; when it is disabled the compiler cannot keep
     values in them, so no clobber is needed. */
#ifdef __MMX__
#define DIFF_MMX_CLOBBERS , "mm0", "mm1", "mm2", "mm3", "mm7"
#else
#define DIFF_MMX_CLOBBERS
#endif

  /* One row: %0 = input row, %1 = reference row, %2 = output base,
     %3 = input stride, %4 = reference stride, mm7 = 0. */
#define DIFF_STEP(x)				\
    "movq (%0), %%mm0\n"       /* load 8 input bytes */		\
    "movq (%1), %%mm2\n"       /* load 8 reference bytes */	\
    "movq %%mm0, %%mm1\n"					\
    "movq %%mm2, %%mm3\n"					\
    "punpcklbw %%mm7, %%mm0\n" /* input bytes 0-3 -> words */	\
    "punpckhbw %%mm7, %%mm1\n" /* input bytes 4-7 -> words */	\
    "punpcklbw %%mm7, %%mm2\n" /* ref bytes 0-3 -> words */	\
    "punpckhbw %%mm7, %%mm3\n" /* ref bytes 4-7 -> words */	\
    "psubsw %%mm2, %%mm0\n"    /* input - ref, low half */	\
    "psubsw %%mm3, %%mm1\n"    /* input - ref, high half */	\
    "movq %%mm0, 0x" #x "0(%2)\n" /* row x, words 0-3 */	\
    "movq %%mm1, 0x" #x "8(%2)\n" /* row x, words 4-7 */	\
    "add %3, %0\n"             /* next input row */		\
    "add %4, %1\n"             /* next reference row */

  __asm__ __volatile__ ("pxor %%mm7, %%mm7\n" /* zero for unpacking */
		DIFF_STEP(0)
		DIFF_STEP(1)
		DIFF_STEP(2)
		DIFF_STEP(3)
		DIFF_STEP(4)
		DIFF_STEP(5)
		DIFF_STEP(6)
		DIFF_STEP(7)
		: "+&r"(src), "+&r"(rsrc)
		: "r"(output), "r"(src_step), "r"(ref_step)
		: "memory" DIFF_MMX_CLOBBERS);

#undef DIFF_STEP
#undef DIFF_MMX_CLOBBERS
}

