http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
http://hermes.terminal.at/intel2gas/
ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
http://rufus.w3.org/linux/RPM/binutils.html
http://www.debian.org/Packages/stable/devel/binutils.html
ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
ftp://ftp.gnu.org/pub/gnu/binutils/
http://gcc.gnu.org/fom_serv/cache/23.html
http://sourceforge.net/bugs/
http://pobox.com/~newt/code/gpr-latest.zip
#define PNG_INTERNAL
#include "png.h"
#if defined(PNG_USE_PNGGCCRD)
int PNGAPI png_mmx_support(void);
#ifdef PNG_USE_LOCAL_ARRAYS
static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
#endif
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__) || \
    defined(__OS2__)
#  define _mmx_supported  mmx_supported
#  define _const4         const4
#  define _const6         const6
#  define _mask8_0        mask8_0
#  define _mask16_1       mask16_1
#  define _mask16_0       mask16_0
#  define _mask24_2       mask24_2
#  define _mask24_1       mask24_1
#  define _mask24_0       mask24_0
#  define _mask32_3       mask32_3
#  define _mask32_2       mask32_2
#  define _mask32_1       mask32_1
#  define _mask32_0       mask32_0
#  define _mask48_5       mask48_5
#  define _mask48_4       mask48_4
#  define _mask48_3       mask48_3
#  define _mask48_2       mask48_2
#  define _mask48_1       mask48_1
#  define _mask48_0       mask48_0
#  define _LBCarryMask    LBCarryMask
#  define _HBClearMask    HBClearMask
#  define _ActiveMask     ActiveMask
#  define _ActiveMask2    ActiveMask2
#  define _ActiveMaskEnd  ActiveMaskEnd
#  define _ShiftBpp       ShiftBpp
#  define _ShiftRem       ShiftRem
#ifdef PNG_THREAD_UNSAFE_OK
#  define _unmask         unmask
#  define _FullLength     FullLength
#  define _MMXLength      MMXLength
#  define _dif            dif
#  define _patemp         patemp
#  define _pbtemp         pbtemp
#  define _pctemp         pctemp
#endif
#endif
#ifdef PNG_THREAD_UNSAFE_OK
static int _unmask;
#endif
static unsigned long long _mask8_0  = 0x0102040810204080LL;
static unsigned long long _mask16_1 = 0x0101020204040808LL;
static unsigned long long _mask16_0 = 0x1010202040408080LL;
static unsigned long long _mask24_2 = 0x0101010202020404LL;
static unsigned long long _mask24_1 = 0x0408080810101020LL;
static unsigned long long _mask24_0 = 0x2020404040808080LL;
static unsigned long long _mask32_3 = 0x0101010102020202LL;
static unsigned long long _mask32_2 = 0x0404040408080808LL;
static unsigned long long _mask32_1 = 0x1010101020202020LL;
static unsigned long long _mask32_0 = 0x4040404080808080LL;
static unsigned long long _mask48_5 = 0x0101010101010202LL;
static unsigned long long _mask48_4 = 0x0202020204040404LL;
static unsigned long long _mask48_3 = 0x0404080808080808LL;
static unsigned long long _mask48_2 = 0x1010101010102020LL;
static unsigned long long _mask48_1 = 0x2020202040404040LL;
static unsigned long long _mask48_0 = 0x4040808080808080LL;
static unsigned long long _const4   = 0x0000000000FFFFFFLL;
static unsigned long long _const6   = 0x00000000000000FFLL;
#ifdef PNG_THREAD_UNSAFE_OK
static png_uint_32  _FullLength;
static png_uint_32  _MMXLength;
static int          _dif;
static int          _patemp; 
static int          _pbtemp;
static int          _pctemp;
#endif
void 
png_squelch_warnings(void)
{
#ifdef PNG_THREAD_UNSAFE_OK
   _dif = _dif;
   _patemp = _patemp;
   _pbtemp = _pbtemp;
   _pctemp = _pctemp;
   _MMXLength = _MMXLength;
#endif
   _const4  = _const4;
   _const6  = _const6;
   _mask8_0  = _mask8_0;
   _mask16_1 = _mask16_1;
   _mask16_0 = _mask16_0;
   _mask24_2 = _mask24_2;
   _mask24_1 = _mask24_1;
   _mask24_0 = _mask24_0;
   _mask32_3 = _mask32_3;
   _mask32_2 = _mask32_2;
   _mask32_1 = _mask32_1;
   _mask32_0 = _mask32_0;
   _mask48_5 = _mask48_5;
   _mask48_4 = _mask48_4;
   _mask48_3 = _mask48_3;
   _mask48_2 = _mask48_2;
   _mask48_1 = _mask48_1;
   _mask48_0 = _mask48_0;
}
#endif 
static int _mmx_supported = 2;
#if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
#define BPP2  2
#define BPP3  3 
#define BPP4  4
#define BPP6  6 
#define BPP8  8
void 
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
   png_debug(1, "in png_combine_row (pnggccrd.c)\n");
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
   if (_mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }
#endif
   if (mask == 0xff)
   {
      png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
      png_memcpy(row, png_ptr->row_buf + 1,
       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
   }
   else   
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 1:        
         {
            png_bytep sp;
            png_bytep dp;
            int s_inc, s_start, s_end;
            int m;
            int shift;
            png_uint_32 i;
            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
                s_start = 0;
                s_end = 7;
                s_inc = 1;
            }
            else
#endif
            {
                s_start = 7;
                s_end = 0;
                s_inc = -1;
            }
            shift = s_start;
            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;
                  value = (*sp >> shift) & 0x1;
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }
               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
         case 2:        
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;
            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            shift = s_start;
            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }
               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
         case 4:        
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;
            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            shift = s_start;
            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }
               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;
               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }
         case 8:        
         {
            png_bytep srcptr;
            png_bytep dstptr;
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                 )
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;
               int dummy_value_a;   
               int dummy_value_d;
               int dummy_value_c;
               int dummy_value_S;
               int dummy_value_D;
               _unmask = ~mask;            
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len  = png_ptr->width &~7;  
               diff = (int) (png_ptr->width & 7);  
               __asm__ __volatile__ (
                  "movd      _unmask, %%mm7  \n\t" 
                  "psubb     %%mm6, %%mm6    \n\t" 
                  "punpcklbw %%mm7, %%mm7    \n\t"
                  "punpcklwd %%mm7, %%mm7    \n\t"
                  "punpckldq %%mm7, %%mm7    \n\t" 
                  "movq      _mask8_0, %%mm0 \n\t"
                  "pand      %%mm7, %%mm0    \n\t" 
                  "pcmpeqb   %%mm6, %%mm0    \n\t" 
                  "cmpl      $0, %%ecx       \n\t" 
                  "je        mainloop8end    \n\t"
                "mainloop8:                  \n\t"
                  "movq      (%%esi), %%mm4  \n\t" 
                  "pand      %%mm0, %%mm4    \n\t"
                  "movq      %%mm0, %%mm6    \n\t"
                  "pandn     (%%edi), %%mm6  \n\t" 
                  "por       %%mm6, %%mm4    \n\t"
                  "movq      %%mm4, (%%edi)  \n\t"
                  "addl      $8, %%esi       \n\t" 
                  "addl      $8, %%edi       \n\t"
                  "subl      $8, %%ecx       \n\t" 
                  "ja        mainloop8       \n\t"
                "mainloop8end:               \n\t"
                  "movl      %%eax, %%ecx    \n\t"
                  "cmpl      $0, %%ecx       \n\t"
                  "jz        end8            \n\t"
                  "sall      $24, %%edx      \n\t" 
                "secondloop8:                \n\t"
                  "sall      %%edx           \n\t" 
                  "jnc       skip8           \n\t" 
                  "movb      (%%esi), %%al   \n\t"
                  "movb      %%al, (%%edi)   \n\t"
                "skip8:                      \n\t"
                  "incl      %%esi           \n\t"
                  "incl      %%edi           \n\t"
                  "decl      %%ecx           \n\t"
                  "jnz       secondloop8     \n\t"
                "end8:                       \n\t"
                  "EMMS                      \n\t"  
                  : "=a" (dummy_value_a),           
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)
                  : "3" (srcptr),      
                    "4" (dstptr),      
                    "0" (diff),        
                    "2" (len),         
                    "1" (mask)         
#if 0  
                  : "%mm0", "%mm4", "%mm6", "%mm7"  
#endif
               );
            }
            else 
#endif 
            {
               register png_uint_32 i;
               png_uint_32 initial_val = png_pass_start[png_ptr->pass];
                 
               register int stride = png_pass_inc[png_ptr->pass];
                 
               register int rep_bytes = png_pass_width[png_ptr->pass];
                 
               png_uint_32 len = png_ptr->width &~7;  
               int diff = (int) (png_ptr->width & 7); 
               register png_uint_32 final_val = len;  
               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;
               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff)  
               {
                  final_val+=diff  ;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } 
            break;
         }       
         case 16:       
         {
            png_bytep srcptr;
            png_bytep dstptr;
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                 )
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;
               int dummy_value_a;   
               int dummy_value_d;
               int dummy_value_c;
               int dummy_value_S;
               int dummy_value_D;
               _unmask = ~mask;            
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len  = png_ptr->width &~7;  
               diff = (int) (png_ptr->width & 7); 
               __asm__ __volatile__ (
                  "movd      _unmask, %%mm7   \n\t" 
                  "psubb     %%mm6, %%mm6     \n\t" 
                  "punpcklbw %%mm7, %%mm7     \n\t"
                  "punpcklwd %%mm7, %%mm7     \n\t"
                  "punpckldq %%mm7, %%mm7     \n\t" 
                  "movq      _mask16_0, %%mm0 \n\t"
                  "movq      _mask16_1, %%mm1 \n\t"
                  "pand      %%mm7, %%mm0     \n\t"
                  "pand      %%mm7, %%mm1     \n\t"
                  "pcmpeqb   %%mm6, %%mm0     \n\t"
                  "pcmpeqb   %%mm6, %%mm1     \n\t"
                  "cmpl      $0, %%ecx        \n\t"
                  "jz        mainloop16end    \n\t"
                "mainloop16:                  \n\t"
                  "movq      (%%esi), %%mm4   \n\t"
                  "pand      %%mm0, %%mm4     \n\t"
                  "movq      %%mm0, %%mm6     \n\t"
                  "movq      (%%edi), %%mm7   \n\t"
                  "pandn     %%mm7, %%mm6     \n\t"
                  "por       %%mm6, %%mm4     \n\t"
                  "movq      %%mm4, (%%edi)   \n\t"
                  "movq      8(%%esi), %%mm5  \n\t"
                  "pand      %%mm1, %%mm5     \n\t"
                  "movq      %%mm1, %%mm7     \n\t"
                  "movq      8(%%edi), %%mm6  \n\t"
                  "pandn     %%mm6, %%mm7     \n\t"
                  "por       %%mm7, %%mm5     \n\t"
                  "movq      %%mm5, 8(%%edi)  \n\t"
                  "addl      $16, %%esi       \n\t" 
                  "addl      $16, %%edi       \n\t"
                  "subl      $8, %%ecx        \n\t" 
                  "ja        mainloop16       \n\t"
                "mainloop16end:               \n\t"
                  "movl      %%eax, %%ecx     \n\t"
                  "cmpl      $0, %%ecx        \n\t"
                  "jz        end16            \n\t"
                  "sall      $24, %%edx       \n\t" 
                "secondloop16:                \n\t"
                  "sall      %%edx            \n\t" 
                  "jnc       skip16           \n\t" 
                  "movw      (%%esi), %%ax    \n\t"
                  "movw      %%ax, (%%edi)    \n\t"
                "skip16:                      \n\t"
                  "addl      $2, %%esi        \n\t"
                  "addl      $2, %%edi        \n\t"
                  "decl      %%ecx            \n\t"
                  "jnz       secondloop16     \n\t"
                "end16:                       \n\t"
                  "EMMS                       \n\t" 
                  : "=a" (dummy_value_a),           
                    "=c" (dummy_value_c),
                    "=d" (dummy_value_d),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)
                  : "0" (diff),        
                    "1" (len),         
                    "2" (mask),        
                    "3" (srcptr),      
                    "4" (dstptr)       
#if 0  
                  : "%mm0", "%mm1", "%mm4"          
                  , "%mm5", "%mm6", "%mm7"
#endif
               );
            }
            else 
#endif 
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
                 
               register int stride = BPP2 * png_pass_inc[png_ptr->pass];
                 
               register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
                 
               png_uint_32 len = png_ptr->width &~7;  
               int diff = (int) (png_ptr->width & 7); 
               register png_uint_32 final_val = BPP2 * len;   
               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;
               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff)  
               {
                  final_val+=diff*BPP2;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } 
            break;
         }       
         case 24:       
         {
            png_bytep srcptr;
            png_bytep dstptr;
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                 )
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;
               int dummy_value_a;   
               int dummy_value_d;
               int dummy_value_c;
               int dummy_value_S;
               int dummy_value_D;
               _unmask = ~mask;            
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len  = png_ptr->width &~7;  
               diff = (int) (png_ptr->width & 7); 
               __asm__ __volatile__ (
                  "movd      _unmask, %%mm7   \n\t" 
                  "psubb     %%mm6, %%mm6     \n\t" 
                  "punpcklbw %%mm7, %%mm7     \n\t"
                  "punpcklwd %%mm7, %%mm7     \n\t"
                  "punpckldq %%mm7, %%mm7     \n\t" 
                  "movq      _mask24_0, %%mm0 \n\t"
                  "movq      _mask24_1, %%mm1 \n\t"
                  "movq      _mask24_2, %%mm2 \n\t"
                  "pand      %%mm7, %%mm0     \n\t"
                  "pand      %%mm7, %%mm1     \n\t"
                  "pand      %%mm7, %%mm2     \n\t"
                  "pcmpeqb   %%mm6, %%mm0     \n\t"
                  "pcmpeqb   %%mm6, %%mm1     \n\t"
                  "pcmpeqb   %%mm6, %%mm2     \n\t"
                  "cmpl      $0, %%ecx        \n\t"
                  "jz        mainloop24end    \n\t"
                "mainloop24:                  \n\t"
                  "movq      (%%esi), %%mm4   \n\t"
                  "pand      %%mm0, %%mm4     \n\t"
                  "movq      %%mm0, %%mm6     \n\t"
                  "movq      (%%edi), %%mm7   \n\t"
                  "pandn     %%mm7, %%mm6     \n\t"
                  "por       %%mm6, %%mm4     \n\t"
                  "movq      %%mm4, (%%edi)   \n\t"
                  "movq      8(%%esi), %%mm5  \n\t"
                  "pand      %%mm1, %%mm5     \n\t"
                  "movq      %%mm1, %%mm7     \n\t"
                  "movq      8(%%edi), %%mm6  \n\t"
                  "pandn     %%mm6, %%mm7     \n\t"
                  "por       %%mm7, %%mm5     \n\t"
                  "movq      %%mm5, 8(%%edi)  \n\t"
                  "movq      16(%%esi), %%mm6 \n\t"
                  "pand      %%mm2, %%mm6     \n\t"
                  "movq      %%mm2, %%mm4     \n\t"
                  "movq      16(%%edi), %%mm7 \n\t"
                  "pandn     %%mm7, %%mm4     \n\t"
                  "por       %%mm4, %%mm6     \n\t"
                  "movq      %%mm6, 16(%%edi) \n\t"
                  "addl      $24, %%esi       \n\t" 
                  "addl      $24, %%edi       \n\t"
                  "subl      $8, %%ecx        \n\t" 
                  "ja        mainloop24       \n\t"
                "mainloop24end:               \n\t"
                  "movl      %%eax, %%ecx     \n\t"
                  "cmpl      $0, %%ecx        \n\t"
                  "jz        end24            \n\t"
                  "sall      $24, %%edx       \n\t" 
                "secondloop24:                \n\t"
                  "sall      %%edx            \n\t" 
                  "jnc       skip24           \n\t" 
                  "movw      (%%esi), %%ax    \n\t"
                  "movw      %%ax, (%%edi)    \n\t"
                  "xorl      %%eax, %%eax     \n\t"
                  "movb      2(%%esi), %%al   \n\t"
                  "movb      %%al, 2(%%edi)   \n\t"
                "skip24:                      \n\t"
                  "addl      $3, %%esi        \n\t"
                  "addl      $3, %%edi        \n\t"
                  "decl      %%ecx            \n\t"
                  "jnz       secondloop24     \n\t"
                "end24:                       \n\t"
                  "EMMS                       \n\t" 
                  : "=a" (dummy_value_a),           
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)
                  : "3" (srcptr),      
                    "4" (dstptr),      
                    "0" (diff),        
                    "2" (len),         
                    "1" (mask)         
#if 0  
                  : "%mm0", "%mm1", "%mm2"          
                  , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
               );
            }
            else 
#endif 
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
                 
               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
                 
               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
                 
               png_uint_32 len = png_ptr->width &~7;  
               int diff = (int) (png_ptr->width & 7); 
               register png_uint_32 final_val = BPP3 * len;   
               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;
               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff)  
               {
                  final_val+=diff*BPP3;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } 
            break;
         }       
         case 32:       
         {
            png_bytep srcptr;
            png_bytep dstptr;
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                 )
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;
               int dummy_value_a;   
               int dummy_value_d;
               int dummy_value_c;
               int dummy_value_S;
               int dummy_value_D;
               _unmask = ~mask;            
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len  = png_ptr->width &~7;  
               diff = (int) (png_ptr->width & 7); 
               __asm__ __volatile__ (
                  "movd      _unmask, %%mm7   \n\t" 
                  "psubb     %%mm6, %%mm6     \n\t" 
                  "punpcklbw %%mm7, %%mm7     \n\t"
                  "punpcklwd %%mm7, %%mm7     \n\t"
                  "punpckldq %%mm7, %%mm7     \n\t" 
                  "movq      _mask32_0, %%mm0 \n\t"
                  "movq      _mask32_1, %%mm1 \n\t"
                  "movq      _mask32_2, %%mm2 \n\t"
                  "movq      _mask32_3, %%mm3 \n\t"
                  "pand      %%mm7, %%mm0     \n\t"
                  "pand      %%mm7, %%mm1     \n\t"
                  "pand      %%mm7, %%mm2     \n\t"
                  "pand      %%mm7, %%mm3     \n\t"
                  "pcmpeqb   %%mm6, %%mm0     \n\t"
                  "pcmpeqb   %%mm6, %%mm1     \n\t"
                  "pcmpeqb   %%mm6, %%mm2     \n\t"
                  "pcmpeqb   %%mm6, %%mm3     \n\t"
                  "cmpl      $0, %%ecx        \n\t" 
                  "jz        mainloop32end    \n\t"
                "mainloop32:                  \n\t"
                  "movq      (%%esi), %%mm4   \n\t"
                  "pand      %%mm0, %%mm4     \n\t"
                  "movq      %%mm0, %%mm6     \n\t"
                  "movq      (%%edi), %%mm7   \n\t"
                  "pandn     %%mm7, %%mm6     \n\t"
                  "por       %%mm6, %%mm4     \n\t"
                  "movq      %%mm4, (%%edi)   \n\t"
                  "movq      8(%%esi), %%mm5  \n\t"
                  "pand      %%mm1, %%mm5     \n\t"
                  "movq      %%mm1, %%mm7     \n\t"
                  "movq      8(%%edi), %%mm6  \n\t"
                  "pandn     %%mm6, %%mm7     \n\t"
                  "por       %%mm7, %%mm5     \n\t"
                  "movq      %%mm5, 8(%%edi)  \n\t"
                  "movq      16(%%esi), %%mm6 \n\t"
                  "pand      %%mm2, %%mm6     \n\t"
                  "movq      %%mm2, %%mm4     \n\t"
                  "movq      16(%%edi), %%mm7 \n\t"
                  "pandn     %%mm7, %%mm4     \n\t"
                  "por       %%mm4, %%mm6     \n\t"
                  "movq      %%mm6, 16(%%edi) \n\t"
                  "movq      24(%%esi), %%mm7 \n\t"
                  "pand      %%mm3, %%mm7     \n\t"
                  "movq      %%mm3, %%mm5     \n\t"
                  "movq      24(%%edi), %%mm4 \n\t"
                  "pandn     %%mm4, %%mm5     \n\t"
                  "por       %%mm5, %%mm7     \n\t"
                  "movq      %%mm7, 24(%%edi) \n\t"
                  "addl      $32, %%esi       \n\t" 
                  "addl      $32, %%edi       \n\t"
                  "subl      $8, %%ecx        \n\t" 
                  "ja        mainloop32       \n\t"
                "mainloop32end:               \n\t"
                  "movl      %%eax, %%ecx     \n\t"
                  "cmpl      $0, %%ecx        \n\t"
                  "jz        end32            \n\t"
                  "sall      $24, %%edx       \n\t" 
                "secondloop32:                \n\t"
                  "sall      %%edx            \n\t" 
                  "jnc       skip32           \n\t" 
                  "movl      (%%esi), %%eax   \n\t"
                  "movl      %%eax, (%%edi)   \n\t"
                "skip32:                      \n\t"
                  "addl      $4, %%esi        \n\t"
                  "addl      $4, %%edi        \n\t"
                  "decl      %%ecx            \n\t"
                  "jnz       secondloop32     \n\t"
                "end32:                       \n\t"
                  "EMMS                       \n\t" 
                  : "=a" (dummy_value_a),           
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)
                  : "3" (srcptr),      
                    "4" (dstptr),      
                    "0" (diff),        
                    "2" (len),         
                    "1" (mask)         
#if 0  
                  : "%mm0", "%mm1", "%mm2", "%mm3"  
                  , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
               );
            }
            else 
#endif 
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
                 
               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
                 
               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
                 
               png_uint_32 len = png_ptr->width &~7;  
               int diff = (int) (png_ptr->width & 7); 
               register png_uint_32 final_val = BPP4 * len;   
               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;
               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff)  
               {
                  final_val+=diff*BPP4;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } 
            break;
         }       
         case 48:       
         {
            png_bytep srcptr;
            png_bytep dstptr;
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                 )
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;
               int dummy_value_a;   
               int dummy_value_d;
               int dummy_value_c;
               int dummy_value_S;
               int dummy_value_D;
               _unmask = ~mask;            
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len  = png_ptr->width &~7;  
               diff = (int) (png_ptr->width & 7); 
               __asm__ __volatile__ (
                  "movd      _unmask, %%mm7   \n\t" 
                  "psubb     %%mm6, %%mm6     \n\t" 
                  "punpcklbw %%mm7, %%mm7     \n\t"
                  "punpcklwd %%mm7, %%mm7     \n\t"
                  "punpckldq %%mm7, %%mm7     \n\t" 
                  "movq      _mask48_0, %%mm0 \n\t"
                  "movq      _mask48_1, %%mm1 \n\t"
                  "movq      _mask48_2, %%mm2 \n\t"
                  "movq      _mask48_3, %%mm3 \n\t"
                  "movq      _mask48_4, %%mm4 \n\t"
                  "movq      _mask48_5, %%mm5 \n\t"
                  "pand      %%mm7, %%mm0     \n\t"
                  "pand      %%mm7, %%mm1     \n\t"
                  "pand      %%mm7, %%mm2     \n\t"
                  "pand      %%mm7, %%mm3     \n\t"
                  "pand      %%mm7, %%mm4     \n\t"
                  "pand      %%mm7, %%mm5     \n\t"
                  "pcmpeqb   %%mm6, %%mm0     \n\t"
                  "pcmpeqb   %%mm6, %%mm1     \n\t"
                  "pcmpeqb   %%mm6, %%mm2     \n\t"
                  "pcmpeqb   %%mm6, %%mm3     \n\t"
                  "pcmpeqb   %%mm6, %%mm4     \n\t"
                  "pcmpeqb   %%mm6, %%mm5     \n\t"
                  "cmpl      $0, %%ecx        \n\t"
                  "jz        mainloop48end    \n\t"
                "mainloop48:                  \n\t"
                  "movq      (%%esi), %%mm7   \n\t"
                  "pand      %%mm0, %%mm7     \n\t"
                  "movq      %%mm0, %%mm6     \n\t"
                  "pandn     (%%edi), %%mm6   \n\t"
                  "por       %%mm6, %%mm7     \n\t"
                  "movq      %%mm7, (%%edi)   \n\t"
                  "movq      8(%%esi), %%mm6  \n\t"
                  "pand      %%mm1, %%mm6     \n\t"
                  "movq      %%mm1, %%mm7     \n\t"
                  "pandn     8(%%edi), %%mm7  \n\t"
                  "por       %%mm7, %%mm6     \n\t"
                  "movq      %%mm6, 8(%%edi)  \n\t"
                  "movq      16(%%esi), %%mm6 \n\t"
                  "pand      %%mm2, %%mm6     \n\t"
                  "movq      %%mm2, %%mm7     \n\t"
                  "pandn     16(%%edi), %%mm7 \n\t"
                  "por       %%mm7, %%mm6     \n\t"
                  "movq      %%mm6, 16(%%edi) \n\t"
                  "movq      24(%%esi), %%mm7 \n\t"
                  "pand      %%mm3, %%mm7     \n\t"
                  "movq      %%mm3, %%mm6     \n\t"
                  "pandn     24(%%edi), %%mm6 \n\t"
                  "por       %%mm6, %%mm7     \n\t"
                  "movq      %%mm7, 24(%%edi) \n\t"
                  "movq      32(%%esi), %%mm6 \n\t"
                  "pand      %%mm4, %%mm6     \n\t"
                  "movq      %%mm4, %%mm7     \n\t"
                  "pandn     32(%%edi), %%mm7 \n\t"
                  "por       %%mm7, %%mm6     \n\t"
                  "movq      %%mm6, 32(%%edi) \n\t"
                  "movq      40(%%esi), %%mm7 \n\t"
                  "pand      %%mm5, %%mm7     \n\t"
                  "movq      %%mm5, %%mm6     \n\t"
                  "pandn     40(%%edi), %%mm6 \n\t"
                  "por       %%mm6, %%mm7     \n\t"
                  "movq      %%mm7, 40(%%edi) \n\t"
                  "addl      $48, %%esi       \n\t" 
                  "addl      $48, %%edi       \n\t"
                  "subl      $8, %%ecx        \n\t" 
                  "ja        mainloop48       \n\t"
                "mainloop48end:               \n\t"
                  "movl      %%eax, %%ecx     \n\t"
                  "cmpl      $0, %%ecx        \n\t"
                  "jz        end48            \n\t"
                  "sall      $24, %%edx       \n\t" 
                "secondloop48:                \n\t"
                  "sall      %%edx            \n\t" 
                  "jnc       skip48           \n\t" 
                  "movl      (%%esi), %%eax   \n\t"
                  "movl      %%eax, (%%edi)   \n\t"
                "skip48:                      \n\t"
                  "addl      $4, %%esi        \n\t"
                  "addl      $4, %%edi        \n\t"
                  "decl      %%ecx            \n\t"
                  "jnz       secondloop48     \n\t"
                "end48:                       \n\t"
                  "EMMS                       \n\t" 
                  : "=a" (dummy_value_a),           
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)
                  : "3" (srcptr),      
                    "4" (dstptr),      
                    "0" (diff),        
                    "2" (len),         
                    "1" (mask)         
#if 0  
                  : "%mm0", "%mm1", "%mm2", "%mm3"  
                  , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
               );
            }
            else 
#endif 
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
                 
               register int stride = BPP6 * png_pass_inc[png_ptr->pass];
                 
               register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
                 
               png_uint_32 len = png_ptr->width &~7;  
               int diff = (int) (png_ptr->width & 7); 
               register png_uint_32 final_val = BPP6 * len;   
               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;
               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff)  
               {
                  final_val+=diff*BPP6;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } 
            break;
         }       
         case 64:       
         {
            png_bytep srcptr;
            png_bytep dstptr;
            register png_uint_32 i;
            png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
              
            register int stride = BPP8 * png_pass_inc[png_ptr->pass];
              
            register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
              
            png_uint_32 len = png_ptr->width &~7;  
            int diff = (int) (png_ptr->width & 7); 
            register png_uint_32 final_val = BPP8 * len;   
            srcptr = png_ptr->row_buf + 1 + initial_val;
            dstptr = row + initial_val;
            for (i = initial_val; i < final_val; i += stride)
            {
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
            if (diff)  
            {
               final_val+=diff*BPP8;
               for (; i < final_val; i += stride)
               {
                  if (rep_bytes > (int)(final_val-i))
                     rep_bytes = (int)(final_val-i);
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
            }
            break;
         }       
         default: 
         {
            
            png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
            break;
         }
      } 
   } 
} 
#endif 
#if defined(PNG_READ_INTERLACING_SUPPORTED)
#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
void 
png_do_read_interlace(png_structp png_ptr)
{
   png_row_infop row_info = &(png_ptr->row_info);
   png_bytep row = png_ptr->row_buf + 1;
   int pass = png_ptr->pass;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
   png_uint_32 transformations = png_ptr->transformations;
#endif
   png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
   if (_mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }
#endif
   if (row != NULL && row_info != NULL)
   {
      png_uint_32 final_width;
      final_width = row_info->width * png_pass_inc[pass];
      switch (row_info->pixel_depth)
      {
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;
            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }
            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;
            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;
               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;
            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;
               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }
       
         default: 
         {
#if 0
#endif
            png_bytep sptr, dp;
            png_uint_32 i;
            png_size_t pixel_bytes;
            int width = (int)row_info->width;
            pixel_bytes = (row_info->pixel_depth >> 3);
            
            sptr = row + (width - 1) * pixel_bytes;
            
            dp = row + (final_width - 1) * pixel_bytes;
            
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
                 )
#else
            if (_mmx_supported)
#endif
            {
               
               if (pixel_bytes == 3)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int dummy_value_c;   
                     int dummy_value_S;
                     int dummy_value_D;
                     int dummy_value_a;
                     __asm__ __volatile__ (
                        "subl $21, %%edi         \n\t"
                                     
                     ".loop3_pass0:              \n\t"
                        "movd (%%esi), %%mm0     \n\t" 
                        "pand (%3), %%mm0        \n\t" 
                        "movq %%mm0, %%mm1       \n\t" 
                        "psllq $16, %%mm0        \n\t" 
                        "movq %%mm0, %%mm2       \n\t" 
                        "psllq $24, %%mm0        \n\t" 
                        "psrlq $8, %%mm1         \n\t" 
                        "por %%mm2, %%mm0        \n\t" 
                        "por %%mm1, %%mm0        \n\t" 
                        "movq %%mm0, %%mm3       \n\t" 
                        "psllq $16, %%mm0        \n\t" 
                        "movq %%mm3, %%mm4       \n\t" 
                        "punpckhdq %%mm0, %%mm3  \n\t" 
                        "movq %%mm4, 16(%%edi)   \n\t"
                        "psrlq $32, %%mm0        \n\t" 
                        "movq %%mm3, 8(%%edi)    \n\t"
                        "punpckldq %%mm4, %%mm0  \n\t" 
                        "subl $3, %%esi          \n\t"
                        "movq %%mm0, (%%edi)     \n\t"
                        "subl $24, %%edi         \n\t"
                        "decl %%ecx              \n\t"
                        "jnz .loop3_pass0        \n\t"
                        "EMMS                    \n\t" 
                        : "=c" (dummy_value_c),        
                          "=S" (dummy_value_S),
                          "=D" (dummy_value_D),
                          "=a" (dummy_value_a)
                        : "1" (sptr),      
                          "2" (dp),        
                          "0" (width),     
                          "3" (&_const4)  
#if 0  
                        : "%mm0", "%mm1", "%mm2"       
                        , "%mm3", "%mm4"
#endif
                     );
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int dummy_value_c;   
                     int dummy_value_S;
                     int dummy_value_D;
                     int dummy_value_a;
                     __asm__ __volatile__ (
                        "subl $9, %%edi          \n\t"
                                     
                     ".loop3_pass2:              \n\t"
                        "movd (%%esi), %%mm0     \n\t" 
                        "pand (%3), %%mm0     \n\t" 
                        "movq %%mm0, %%mm1       \n\t" 
                        "psllq $16, %%mm0        \n\t" 
                        "movq %%mm0, %%mm2       \n\t" 
                        "psllq $24, %%mm0        \n\t" 
                        "psrlq $8, %%mm1         \n\t" 
                        "por %%mm2, %%mm0        \n\t" 
                        "por %%mm1, %%mm0        \n\t" 
                        "movq %%mm0, 4(%%edi)    \n\t"
                        "psrlq $16, %%mm0        \n\t" 
                        "subl $3, %%esi          \n\t"
                        "movd %%mm0, (%%edi)     \n\t"
                        "subl $12, %%edi         \n\t"
                        "decl %%ecx              \n\t"
                        "jnz .loop3_pass2        \n\t"
                        "EMMS                    \n\t" 
                        : "=c" (dummy_value_c),        
                          "=S" (dummy_value_S),
                          "=D" (dummy_value_D),
                          "=a" (dummy_value_a)
                        : "1" (sptr),      
                          "2" (dp),        
                          "0" (width),     
                          "3" (&_const4)  
#if 0  
                        : "%mm0", "%mm1", "%mm2"       
#endif
                     );
                  }
                  else if (width) 
                  {
                     int width_mmx = ((width >> 1) << 1) - 8;   
                     if (width_mmx < 0)
                         width_mmx = 0;
                     width -= width_mmx;        
                     if (width_mmx)
                     {
                        
                        
                        
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        int dummy_value_a;
                        int dummy_value_d;
                        __asm__ __volatile__ (
                           "subl $3, %%esi          \n\t"
                           "subl $9, %%edi          \n\t"
                                        
                        ".loop3_pass4:              \n\t"
                           "movq (%%esi), %%mm0     \n\t" 
                           "movq %%mm0, %%mm1       \n\t" 
                           "movq %%mm0, %%mm2       \n\t" 
                           "psllq $24, %%mm0        \n\t" 
                           "pand (%3), %%mm1          \n\t" 
                           "psrlq $24, %%mm2        \n\t" 
                           "por %%mm1, %%mm0        \n\t" 
                           "movq %%mm2, %%mm3       \n\t" 
                           "psllq $8, %%mm2         \n\t" 
                           "movq %%mm0, (%%edi)     \n\t"
                           "psrlq $16, %%mm3        \n\t" 
                           "pand (%4), %%mm3     \n\t" 
                           "por %%mm3, %%mm2        \n\t" 
                           "subl $6, %%esi          \n\t"
                           "movd %%mm2, 8(%%edi)    \n\t"
                           "subl $12, %%edi         \n\t"
                           "subl $2, %%ecx          \n\t"
                           "jnz .loop3_pass4        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D),
                             "=a" (dummy_value_a),
                             "=d" (dummy_value_d)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width_mmx), 
                             "3" (&_const4), 
                             "4" (&_const6)  
#if 0  
                           : "%mm0", "%mm1"               
                           , "%mm2", "%mm3"
#endif
                        );
                     }
                     sptr -= width_mmx*3;
                     dp -= width_mmx*6;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        png_memcpy(v, sptr, 3);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, 3);
                           dp -= 3;
                        }
                        sptr -= 3;
                     }
                  }
               } 
               
               else if (pixel_bytes == 1)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;        
                     if (width_mmx)
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $3, %%esi          \n\t"
                           "subl $31, %%edi         \n\t"
                        ".loop1_pass0:              \n\t"
                           "movd (%%esi), %%mm0     \n\t" 
                           "movq %%mm0, %%mm1       \n\t" 
                           "punpcklbw %%mm0, %%mm0  \n\t" 
                           "movq %%mm0, %%mm2       \n\t" 
                           "punpcklwd %%mm0, %%mm0  \n\t" 
                           "movq %%mm0, %%mm3       \n\t" 
                           "punpckldq %%mm0, %%mm0  \n\t" 
                           "punpckhdq %%mm3, %%mm3  \n\t" 
                           "movq %%mm0, (%%edi)     \n\t"
                           "punpckhwd %%mm2, %%mm2  \n\t" 
                           "movq %%mm3, 8(%%edi)    \n\t"
                           "movq %%mm2, %%mm4       \n\t" 
                           "punpckldq %%mm2, %%mm2  \n\t" 
                           "punpckhdq %%mm4, %%mm4  \n\t" 
                           "movq %%mm2, 16(%%edi)   \n\t"
                           "subl $4, %%esi          \n\t"
                           "movq %%mm4, 24(%%edi)   \n\t"
                           "subl $32, %%edi         \n\t"
                           "subl $4, %%ecx          \n\t"
                           "jnz .loop1_pass0        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width_mmx)  
#if 0  
                           : "%mm0", "%mm1", "%mm2"       
                           , "%mm3", "%mm4"
#endif
                        );
                     }
                     sptr -= width_mmx;
                     dp -= width_mmx*8;
                     for (i = width; i; i--)
                     {
                        int j;
                       
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        --sptr;
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;        
                     if (width_mmx)
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $3, %%esi          \n\t"
                           "subl $15, %%edi         \n\t"
                        ".loop1_pass2:              \n\t"
                           "movd (%%esi), %%mm0     \n\t" 
                           "punpcklbw %%mm0, %%mm0  \n\t" 
                           "movq %%mm0, %%mm1       \n\t" 
                           "punpcklwd %%mm0, %%mm0  \n\t" 
                           "punpckhwd %%mm1, %%mm1  \n\t" 
                           "movq %%mm0, (%%edi)     \n\t"
                           "subl $4, %%esi          \n\t"
                           "movq %%mm1, 8(%%edi)    \n\t"
                           "subl $16, %%edi         \n\t"
                           "subl $4, %%ecx          \n\t"
                           "jnz .loop1_pass2        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width_mmx)  
#if 0  
                           : "%mm0", "%mm1"               
#endif
                        );
                     }
                     sptr -= width_mmx;
                     dp -= width_mmx*4;
                     for (i = width; i; i--)
                     {
                        int j;
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        --sptr;
                     }
                  }
                  else if (width)  
                  {
                     int width_mmx = ((width >> 3) << 3);
                     width -= width_mmx;        
                     if (width_mmx)
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $7, %%esi          \n\t"
                           "subl $15, %%edi         \n\t"
                        ".loop1_pass4:              \n\t"
                           "movq (%%esi), %%mm0     \n\t" 
                           "movq %%mm0, %%mm1       \n\t" 
                           "punpcklbw %%mm0, %%mm0  \n\t" 
                           "punpckhbw %%mm1, %%mm1  \n\t" 
                           "movq %%mm1, 8(%%edi)    \n\t"
                           "subl $8, %%esi          \n\t"
                           "movq %%mm0, (%%edi)     \n\t"
                           "subl $16, %%edi         \n\t"
                           "subl $8, %%ecx          \n\t"
                           "jnz .loop1_pass4        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width_mmx)  
#if 0  
                           : "%mm0", "%mm1"               
#endif
                        );
                     }
                     sptr -= width_mmx;
                     dp -= width_mmx*2;
                     for (i = width; i; i--)
                     {
                        int j;
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        --sptr;
                     }
                  }
               } 
               
               else if (pixel_bytes == 2)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;        
                     if (width_mmx)
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $2, %%esi          \n\t"
                           "subl $30, %%edi         \n\t"
                        ".loop2_pass0:              \n\t"
                           "movd (%%esi), %%mm0     \n\t" 
                           "punpcklwd %%mm0, %%mm0  \n\t" 
                           "movq %%mm0, %%mm1       \n\t" 
                           "punpckldq %%mm0, %%mm0  \n\t" 
                           "punpckhdq %%mm1, %%mm1  \n\t" 
                           "movq %%mm0, (%%edi)     \n\t"
                           "movq %%mm0, 8(%%edi)    \n\t"
                           "movq %%mm1, 16(%%edi)   \n\t"
                           "subl $4, %%esi          \n\t"
                           "movq %%mm1, 24(%%edi)   \n\t"
                           "subl $32, %%edi         \n\t"
                           "subl $2, %%ecx          \n\t"
                           "jnz .loop2_pass0        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width_mmx)  
#if 0  
                           : "%mm0", "%mm1"               
#endif
                        );
                     }
                     sptr -= (width_mmx*2 - 2); 
                     dp -= (width_mmx*16 - 2);  
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1) ;
                     width -= width_mmx;        
                     if (width_mmx)
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $2, %%esi          \n\t"
                           "subl $14, %%edi         \n\t"
                        ".loop2_pass2:              \n\t"
                           "movd (%%esi), %%mm0     \n\t" 
                           "punpcklwd %%mm0, %%mm0  \n\t" 
                           "movq %%mm0, %%mm1       \n\t" 
                           "punpckldq %%mm0, %%mm0  \n\t" 
                           "punpckhdq %%mm1, %%mm1  \n\t" 
                           "movq %%mm0, (%%edi)     \n\t"
                           "subl $4, %%esi          \n\t"
                           "movq %%mm1, 8(%%edi)    \n\t"
                           "subl $16, %%edi         \n\t"
                           "subl $2, %%ecx          \n\t"
                           "jnz .loop2_pass2        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width_mmx)  
#if 0  
                           : "%mm0", "%mm1"               
#endif
                        );
                     }
                     sptr -= (width_mmx*2 - 2); 
                     dp -= (width_mmx*8 - 2);   
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (width)  
                  {
                     int width_mmx = ((width >> 1) << 1) ;
                     width -= width_mmx;        
                     if (width_mmx)
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $2, %%esi          \n\t"
                           "subl $6, %%edi          \n\t"
                        ".loop2_pass4:              \n\t"
                           "movd (%%esi), %%mm0     \n\t" 
                           "punpcklwd %%mm0, %%mm0  \n\t" 
                           "subl $4, %%esi          \n\t"
                           "movq %%mm0, (%%edi)     \n\t"
                           "subl $8, %%edi          \n\t"
                           "subl $2, %%ecx          \n\t"
                           "jnz .loop2_pass4        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width_mmx)  
#if 0  
                           : "%mm0"                       
#endif
                        );
                     }
                     sptr -= (width_mmx*2 - 2); 
                     dp -= (width_mmx*4 - 2);   
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
               } 
               
               else if (pixel_bytes == 4)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;        
                     if (width_mmx)
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $4, %%esi          \n\t"
                           "subl $60, %%edi         \n\t"
                        ".loop4_pass0:              \n\t"
                           "movq (%%esi), %%mm0     \n\t" 
                           "movq %%mm0, %%mm1       \n\t" 
                           "punpckldq %%mm0, %%mm0  \n\t" 
                           "punpckhdq %%mm1, %%mm1  \n\t" 
                           "movq %%mm0, (%%edi)     \n\t"
                           "movq %%mm0, 8(%%edi)    \n\t"
                           "movq %%mm0, 16(%%edi)   \n\t"
                           "movq %%mm0, 24(%%edi)   \n\t"
                           "movq %%mm1, 32(%%edi)   \n\t"
                           "movq %%mm1, 40(%%edi)   \n\t"
                           "movq %%mm1, 48(%%edi)   \n\t"
                           "subl $8, %%esi          \n\t"
                           "movq %%mm1, 56(%%edi)   \n\t"
                           "subl $64, %%edi         \n\t"
                           "subl $2, %%ecx          \n\t"
                           "jnz .loop4_pass0        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width_mmx)  
#if 0  
                           : "%mm0", "%mm1"               
#endif
                        );
                     }
                     sptr -= (width_mmx*4 - 4); 
                     dp -= (width_mmx*32 - 4);  
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;        
                     if (width_mmx)
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $4, %%esi          \n\t"
                           "subl $28, %%edi         \n\t"
                        ".loop4_pass2:              \n\t"
                           "movq (%%esi), %%mm0     \n\t" 
                           "movq %%mm0, %%mm1       \n\t" 
                           "punpckldq %%mm0, %%mm0  \n\t" 
                           "punpckhdq %%mm1, %%mm1  \n\t" 
                           "movq %%mm0, (%%edi)     \n\t"
                           "movq %%mm0, 8(%%edi)    \n\t"
                           "movq %%mm1, 16(%%edi)   \n\t"
                           "movq %%mm1, 24(%%edi)   \n\t"
                           "subl $8, %%esi          \n\t"
                           "subl $32, %%edi         \n\t"
                           "subl $2, %%ecx          \n\t"
                           "jnz .loop4_pass2        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width_mmx)  
#if 0  
                           : "%mm0", "%mm1"               
#endif
                        );
                     }
                     sptr -= (width_mmx*4 - 4); 
                     dp -= (width_mmx*16 - 4);  
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (width)  
                  {
                     int width_mmx = ((width >> 1) << 1) ;
                     width -= width_mmx;        
                     if (width_mmx)
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $4, %%esi          \n\t"
                           "subl $12, %%edi         \n\t"
                        ".loop4_pass4:              \n\t"
                           "movq (%%esi), %%mm0     \n\t" 
                           "movq %%mm0, %%mm1       \n\t" 
                           "punpckldq %%mm0, %%mm0  \n\t" 
                           "punpckhdq %%mm1, %%mm1  \n\t" 
                           "movq %%mm0, (%%edi)     \n\t"
                           "subl $8, %%esi          \n\t"
                           "movq %%mm1, 8(%%edi)    \n\t"
                           "subl $16, %%edi         \n\t"
                           "subl $2, %%ecx          \n\t"
                           "jnz .loop4_pass4        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width_mmx)  
#if 0  
                           : "%mm0", "%mm1"               
#endif
                        );
                     }
                     sptr -= (width_mmx*4 - 4); 
                     dp -= (width_mmx*8 - 4);   
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
               } 
               
               else if (pixel_bytes == 8)
               {
                  
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int dummy_value_c;  
                     int dummy_value_S;
                     int dummy_value_D;
                     
                     
                     __asm__ __volatile__ (
                        "subl $56, %%edi         \n\t" 
                     ".loop8_pass0:              \n\t"
                        "movq (%%esi), %%mm0     \n\t" 
                        "movq %%mm0, (%%edi)     \n\t"
                        "movq %%mm0, 8(%%edi)    \n\t"
                        "movq %%mm0, 16(%%edi)   \n\t"
                        "movq %%mm0, 24(%%edi)   \n\t"
                        "movq %%mm0, 32(%%edi)   \n\t"
                        "movq %%mm0, 40(%%edi)   \n\t"
                        "movq %%mm0, 48(%%edi)   \n\t"
                        "subl $8, %%esi          \n\t"
                        "movq %%mm0, 56(%%edi)   \n\t"
                        "subl $64, %%edi         \n\t"
                        "decl %%ecx              \n\t"
                        "jnz .loop8_pass0        \n\t"
                        "EMMS                    \n\t" 
                        : "=c" (dummy_value_c),        
                          "=S" (dummy_value_S),
                          "=D" (dummy_value_D)
                        : "1" (sptr),      
                          "2" (dp),        
                          "0" (width)      
#if 0  
                        : "%mm0"                       
#endif
                     );
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     
                     
                     
                     
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $24, %%edi         \n\t" 
                        ".loop8_pass2:              \n\t"
                           "movq (%%esi), %%mm0     \n\t" 
                           "movq %%mm0, (%%edi)     \n\t"
                           "movq %%mm0, 8(%%edi)    \n\t"
                           "movq %%mm0, 16(%%edi)   \n\t"
                           "subl $8, %%esi          \n\t"
                           "movq %%mm0, 24(%%edi)   \n\t"
                           "subl $32, %%edi         \n\t"
                           "decl %%ecx              \n\t"
                           "jnz .loop8_pass2        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width)      
#if 0  
                           : "%mm0"                       
#endif
                        );
                     }
                  }
                  else if (width)  
                  {
                     
                     
                     {
                        int dummy_value_c;  
                        int dummy_value_S;
                        int dummy_value_D;
                        __asm__ __volatile__ (
                           "subl $8, %%edi          \n\t" 
                        ".loop8_pass4:              \n\t"
                           "movq (%%esi), %%mm0     \n\t" 
                           "movq %%mm0, (%%edi)     \n\t"
                           "subl $8, %%esi          \n\t"
                           "movq %%mm0, 8(%%edi)    \n\t"
                           "subl $16, %%edi         \n\t"
                           "decl %%ecx              \n\t"
                           "jnz .loop8_pass4        \n\t"
                           "EMMS                    \n\t" 
                           : "=c" (dummy_value_c),        
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)
                           : "1" (sptr),      
                             "2" (dp),        
                             "0" (width)      
#if 0  
                           : "%mm0"                       
#endif
                        );
                     }
                  }
               } 
               
               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 6);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 6);
                        dp -= 6;
                     }
                     sptr -= 6;
                  }
               } 
               
               else
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr-= pixel_bytes;
                  }
               }
            } 
            else 
                 
                 
#endif 
            {
               if (pixel_bytes == 1)
               {
                  for (i = width; i; i--)
                  {
                     int j;
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        *dp-- = *sptr;
                     }
                     --sptr;
                  }
               }
               else if (pixel_bytes == 3)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 3);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 3);
                        dp -= 3;
                     }
                     sptr -= 3;
                  }
               }
               else if (pixel_bytes == 2)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 2);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 2);
                        dp -= 2;
                     }
                     sptr -= 2;
                  }
               }
               else if (pixel_bytes == 4)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 4);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
#ifdef PNG_DEBUG
                        if (dp < row || dp+3 > row+png_ptr->row_buf_size)
                        {
                           printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
                             row, dp, row+png_ptr->row_buf_size);
                           printf("row_buf=%d\n",png_ptr->row_buf_size);
                        }
#endif
                        png_memcpy(dp, v, 4);
                        dp -= 4;
                     }
                     sptr -= 4;
                  }
               }
               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 6);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 6);
                        dp -= 6;
                     }
                     sptr -= 6;
                  }
               }
               else if (pixel_bytes == 8)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 8);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 8);
                        dp -= 8;
                     }
                     sptr -= 8;
                  }
               }
               else     
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }
            } 
            break;
         }
      } 
      row_info->width = final_width;
      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
   }
} 
#endif 
#endif 
#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
union uAll {
   long long use;
   double  align;
} _LBCarryMask = {0x0101010101010101LL},
  _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
  _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
#ifdef PNG_THREAD_UNSAFE_OK
static void 
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
                            png_bytep prev_row)
{
   int bpp;
   int dummy_value_c;   
   int dummy_value_S;
   int dummy_value_D;
   bpp = (row_info->pixel_depth + 7) >> 3;  
   _FullLength  = row_info->rowbytes;       
   __asm__ __volatile__ (
      
#ifdef __PIC__
      "pushl %%ebx                 \n\t" 
#endif
      "xorl %%ebx, %%ebx           \n\t" 
      "movl %%edi, %%edx           \n\t"
      "subl %%ecx, %%edx           \n\t" 
      "xorl %%eax,%%eax            \n\t"
      
      
   "avg_rlp:                       \n\t"
      "movb (%%esi,%%ebx,),%%al    \n\t" 
      "incl %%ebx                  \n\t"
      "shrb %%al                   \n\t" 
      "addb -1(%%edi,%%ebx,),%%al  \n\t" 
      "cmpl %%ecx, %%ebx           \n\t"
      "movb %%al,-1(%%edi,%%ebx,)  \n\t" 
      "jb avg_rlp                  \n\t" 
      
      "movl %%edi, _dif            \n\t" 
      "addl %%ebx, _dif            \n\t" 
      "addl $0xf, _dif             \n\t" 
      "andl $0xfffffff8, _dif      \n\t" 
      "subl %%edi, _dif            \n\t" 
      "jz avg_go                   \n\t" 
      
      
      
      "xorl %%ecx, %%ecx           \n\t"
   "avg_lp1:                       \n\t"
      "xorl %%eax, %%eax           \n\t"
      "movb (%%esi,%%ebx,), %%cl   \n\t" 
      "movb (%%edx,%%ebx,), %%al   \n\t" 
      "addw %%cx, %%ax             \n\t"
      "incl %%ebx                  \n\t"
      "shrw %%ax                   \n\t" 
      "addb -1(%%edi,%%ebx,), %%al \n\t" 
      "cmpl _dif, %%ebx            \n\t" 
      "movb %%al, -1(%%edi,%%ebx,) \n\t" 
      "jb avg_lp1                  \n\t" 
   "avg_go:                        \n\t"
      "movl _FullLength, %%eax     \n\t"
      "movl %%eax, %%ecx           \n\t"
      "subl %%ebx, %%eax           \n\t" 
      "andl $0x00000007, %%eax     \n\t" 
      "subl %%eax, %%ecx           \n\t" 
      "movl %%ecx, _MMXLength      \n\t"
#ifdef __PIC__
      "popl %%ebx                  \n\t" 
#endif
      : "=c" (dummy_value_c),            
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)
      : "0" (bpp),       
        "1" (prev_row),  
        "2" (row)        
      : "%eax", "%edx"                   
#ifndef __PIC__
      , "%ebx"
#endif
      
      
   );
   
   switch (bpp)
   {
      case 3:
      {
         _ActiveMask.use  = 0x0000000000ffffffLL;
         _ShiftBpp.use = 24;    
         _ShiftRem.use = 40;    
         __asm__ __volatile__ (
            
            "movq _ActiveMask, %%mm7      \n\t"
            "movl _dif, %%ecx             \n\t" 
            "movq _LBCarryMask, %%mm5     \n\t" 
            "movq _HBClearMask, %%mm4     \n\t"
            
            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" 
                                                
         "avg_3lp:                        \n\t"
            "movq (%%edi,%%ecx,), %%mm0   \n\t" 
            "movq %%mm5, %%mm3            \n\t"
            "psrlq _ShiftRem, %%mm2       \n\t" 
                                                
            "movq (%%esi,%%ecx,), %%mm1   \n\t" 
            "movq %%mm7, %%mm6            \n\t"
            "pand %%mm1, %%mm3            \n\t" 
            "psrlq $1, %%mm1              \n\t" 
            "pand  %%mm4, %%mm1           \n\t" 
                                                
            "paddb %%mm1, %%mm0           \n\t" 
                                                
            
            "movq %%mm3, %%mm1            \n\t" 
                                                
            "pand %%mm2, %%mm1            \n\t" 
                                                
                               
            "psrlq $1, %%mm2              \n\t" 
            "pand  %%mm4, %%mm2           \n\t" 
                                                
            "paddb %%mm1, %%mm2           \n\t" 
                                                
            "pand %%mm6, %%mm2            \n\t" 
                                                
            "paddb %%mm2, %%mm0           \n\t" 
                                                
                               
            
            "psllq _ShiftBpp, %%mm6       \n\t" 
                                                
            "movq %%mm0, %%mm2            \n\t" 
            "psllq _ShiftBpp, %%mm2       \n\t" 
            "movq %%mm3, %%mm1            \n\t" 
                                                
            "pand %%mm2, %%mm1            \n\t" 
                                                
                               
            "psrlq $1, %%mm2              \n\t" 
            "pand  %%mm4, %%mm2           \n\t" 
                                                
            "paddb %%mm1, %%mm2           \n\t" 
                                                
            "pand %%mm6, %%mm2            \n\t" 
                                                
            "paddb %%mm2, %%mm0           \n\t" 
                                                
                               
            
            "psllq _ShiftBpp, %%mm6       \n\t" 
                                                
                                 
            "movq %%mm0, %%mm2            \n\t" 
            "psllq _ShiftBpp, %%mm2       \n\t" 
                              
                              
            "movq %%mm3, %%mm1            \n\t" 
                                                
            "pand %%mm2, %%mm1            \n\t" 
                                                
                              
            "psrlq $1, %%mm2              \n\t" 
            "pand  %%mm4, %%mm2           \n\t" 
                                                
            "paddb %%mm1, %%mm2           \n\t" 
                                                
            "pand %%mm6, %%mm2            \n\t" 
                                                
            "addl $8, %%ecx               \n\t"
            "paddb %%mm2, %%mm0           \n\t" 
                                                
                                                
            
            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
            
            "cmpl _MMXLength, %%ecx       \n\t"
            "movq %%mm0, %%mm2            \n\t" 
            "jb avg_3lp                   \n\t"
            : "=S" (dummy_value_S),             
              "=D" (dummy_value_D)
            : "0" (prev_row),  
              "1" (row)        
            : "%ecx"                            
#if 0  
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;  
      case 6:
      case 4:
      
      
      {
         _ActiveMask.use  = 0xffffffffffffffffLL; 
                                                  
         _ShiftBpp.use = bpp << 3;
         _ShiftRem.use = 64 - _ShiftBpp.use;
         __asm__ __volatile__ (
            "movq _HBClearMask, %%mm4    \n\t"
            
            "movl _dif, %%ecx            \n\t" 
                                               
            
            "movq _ActiveMask, %%mm7     \n\t"
            "psrlq _ShiftRem, %%mm7      \n\t"
            "movq %%mm7, %%mm6           \n\t"
            "movq _LBCarryMask, %%mm5    \n\t"
            "psllq _ShiftBpp, %%mm6      \n\t" 
                                               
            
            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" 
                                          
         "avg_4lp:                       \n\t"
            "movq (%%edi,%%ecx,), %%mm0  \n\t"
            "psrlq _ShiftRem, %%mm2      \n\t" 
            "movq (%%esi,%%ecx,), %%mm1  \n\t"
            
            "movq %%mm5, %%mm3           \n\t"
            "pand %%mm1, %%mm3           \n\t" 
            "psrlq $1, %%mm1             \n\t" 
            "pand  %%mm4, %%mm1          \n\t" 
                                               
            "paddb %%mm1, %%mm0          \n\t" 
                                               
            
            "movq %%mm3, %%mm1           \n\t" 
                                               
            "pand %%mm2, %%mm1           \n\t" 
                                               
                              
            "psrlq $1, %%mm2             \n\t" 
            "pand  %%mm4, %%mm2          \n\t" 
                                               
            "paddb %%mm1, %%mm2          \n\t" 
                                               
            "pand %%mm7, %%mm2           \n\t" 
                                               
            "paddb %%mm2, %%mm0          \n\t" 
                                               
                              
            
            "movq %%mm0, %%mm2           \n\t" 
            "psllq _ShiftBpp, %%mm2      \n\t" 
            "addl $8, %%ecx              \n\t"
            "movq %%mm3, %%mm1           \n\t" 
                                               
            "pand %%mm2, %%mm1           \n\t" 
                                               
                              
            "psrlq $1, %%mm2             \n\t" 
            "pand  %%mm4, %%mm2          \n\t" 
                                               
            "paddb %%mm1, %%mm2          \n\t" 
                                               
            "pand %%mm6, %%mm2           \n\t" 
                                               
            "paddb %%mm2, %%mm0          \n\t" 
                                               
                              
            "cmpl _MMXLength, %%ecx      \n\t"
            
            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
            
            "movq %%mm0, %%mm2           \n\t" 
            "jb avg_4lp                  \n\t"
            : "=S" (dummy_value_S),            
              "=D" (dummy_value_D)
            : "0" (prev_row),  
              "1" (row)        
            : "%ecx"                           
#if 0  
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;  
      case 2:
      {
         _ActiveMask.use  = 0x000000000000ffffLL;
         _ShiftBpp.use = 16;   
         _ShiftRem.use = 48;   
         __asm__ __volatile__ (
            
            "movq _ActiveMask, %%mm7     \n\t"
            
            "movl _dif, %%ecx            \n\t" 
                                               
            "movq _LBCarryMask, %%mm5    \n\t"
            "movq _HBClearMask, %%mm4    \n\t"
            
            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" 
                              
         "avg_2lp:                       \n\t"
            "movq (%%edi,%%ecx,), %%mm0  \n\t"
            "psrlq _ShiftRem, %%mm2      \n\t" 
            "movq (%%esi,%%ecx,), %%mm1  \n\t" 
            
            "movq %%mm5, %%mm3           \n\t"
            "pand %%mm1, %%mm3           \n\t" 
            "psrlq $1, %%mm1             \n\t" 
            "pand  %%mm4, %%mm1          \n\t" 
                                               
            "movq %%mm7, %%mm6           \n\t"
            "paddb %%mm1, %%mm0          \n\t" 
                                               
            
            "movq %%mm3, %%mm1           \n\t" 
                                               
            "pand %%mm2, %%mm1           \n\t" 
                                               
                                               
                                               
            "psrlq $1, %%mm2             \n\t" 
            "pand  %%mm4, %%mm2          \n\t" 
                                               
            "paddb %%mm1, %%mm2          \n\t" 
                                               
            "pand %%mm6, %%mm2           \n\t" 
                                               
            "paddb %%mm2, %%mm0          \n\t" 
                                               
            
            "psllq _ShiftBpp, %%mm6      \n\t" 
                                               
            "movq %%mm0, %%mm2           \n\t" 
            "psllq _ShiftBpp, %%mm2      \n\t" 
            "movq %%mm3, %%mm1           \n\t" 
                                               
            "pand %%mm2, %%mm1           \n\t" 
                                               
                                               
                                               
            "psrlq $1, %%mm2             \n\t" 
            "pand  %%mm4, %%mm2          \n\t" 
                                               
            "paddb %%mm1, %%mm2          \n\t" 
                                               
            "pand %%mm6, %%mm2           \n\t" 
                                               
            "paddb %%mm2, %%mm0          \n\t" 
                                               
            
            "psllq _ShiftBpp, %%mm6      \n\t" 
                                               
            "movq %%mm0, %%mm2           \n\t" 
            "psllq _ShiftBpp, %%mm2      \n\t" 
            "movq %%mm3, %%mm1           \n\t" 
                                               
            "pand %%mm2, %%mm1           \n\t" 
                                               
                                               
            "psrlq $1, %%mm2             \n\t" 
            "pand  %%mm4, %%mm2          \n\t" 
                                               
            "paddb %%mm1, %%mm2          \n\t" 
                                               
            "pand %%mm6, %%mm2           \n\t" 
                                               
            "paddb %%mm2, %%mm0          \n\t" 
                                               
            
            "psllq _ShiftBpp, %%mm6      \n\t" 
                                               
            "movq %%mm0, %%mm2           \n\t" 
            "psllq _ShiftBpp, %%mm2      \n\t" 
            "addl $8, %%ecx              \n\t"
            "movq %%mm3, %%mm1           \n\t" 
                                               
            "pand %%mm2, %%mm1           \n\t" 
                                               
                                               
                                               
            "psrlq $1, %%mm2             \n\t" 
            "pand  %%mm4, %%mm2          \n\t" 
                                               
            "paddb %%mm1, %%mm2          \n\t" 
                                               
            "pand %%mm6, %%mm2           \n\t" 
                                               
            "paddb %%mm2, %%mm0          \n\t" 
                                               
            "cmpl _MMXLength, %%ecx      \n\t"
            
            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
            
            "movq %%mm0, %%mm2           \n\t" 
            "jb avg_2lp                  \n\t"
            : "=S" (dummy_value_S),            
              "=D" (dummy_value_D)
            : "0" (prev_row),  
              "1" (row)        
            : "%ecx"                           
#if 0  
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;  
      case 1:
      {
         __asm__ __volatile__ (
            
#ifdef __PIC__
            "pushl %%ebx                 \n\t" 
#endif
            "movl _dif, %%ebx            \n\t" 
                                               
            "cmpl _FullLength, %%ebx     \n\t" 
            "jnb avg_1end                \n\t"
            
            "movl %%edi, %%edx           \n\t"
            "subl %%ecx, %%edx           \n\t" 
            "xorl %%ecx, %%ecx           \n\t" 
                                               
         "avg_1lp:                       \n\t"
            
            "xorl %%eax, %%eax           \n\t"
            "movb (%%esi,%%ebx,), %%cl   \n\t" 
            "movb (%%edx,%%ebx,), %%al   \n\t" 
            "addw %%cx, %%ax             \n\t"
            "incl %%ebx                  \n\t"
            "shrw %%ax                   \n\t" 
            "addb -1(%%edi,%%ebx,), %%al \n\t" 
                                               
            "cmpl _FullLength, %%ebx     \n\t" 
            "movb %%al, -1(%%edi,%%ebx,) \n\t" 
                         
            "jb avg_1lp                  \n\t"
         "avg_1end:                      \n\t"
#ifdef __PIC__
            "popl %%ebx                  \n\t" 
#endif
            : "=c" (dummy_value_c),            
              "=S" (dummy_value_S),
              "=D" (dummy_value_D)
            : "0" (bpp),       
              "1" (prev_row),  
              "2" (row)        
            : "%eax", "%edx"                   
#ifndef __PIC__
            , "%ebx"
#endif
         );
      }
      return;  
      case 8:
      {
         __asm__ __volatile__ (
            
            "movl _dif, %%ecx            \n\t" 
            "movq _LBCarryMask, %%mm5    \n\t" 
            "movq _HBClearMask, %%mm4    \n\t"
            
            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" 
                                      
         "avg_8lp:                       \n\t"
            "movq (%%edi,%%ecx,), %%mm0  \n\t"
            "movq %%mm5, %%mm3           \n\t"
            "movq (%%esi,%%ecx,), %%mm1  \n\t"
            "addl $8, %%ecx              \n\t"
            "pand %%mm1, %%mm3           \n\t" 
            "psrlq $1, %%mm1             \n\t" 
            "pand %%mm2, %%mm3           \n\t" 
                                               
            "psrlq $1, %%mm2             \n\t" 
            "pand  %%mm4, %%mm1          \n\t" 
            "paddb %%mm3, %%mm0          \n\t" 
            "pand  %%mm4, %%mm2          \n\t" 
            "paddb %%mm1, %%mm0          \n\t" 
            "paddb %%mm2, %%mm0          \n\t" 
            "cmpl _MMXLength, %%ecx      \n\t"
            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
            "movq %%mm0, %%mm2           \n\t" 
            "jb avg_8lp                  \n\t"
            : "=S" (dummy_value_S),            
              "=D" (dummy_value_D)
            : "0" (prev_row),  
              "1" (row)        
            : "%ecx"                           
#if 0  
            , "%mm0", "%mm1", "%mm2"
            , "%mm3", "%mm4", "%mm5"
#endif
         );
      }
      break;  
      default:                  
      {
#ifdef PNG_DEBUG
         
        png_debug(1,
        "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
#endif
#if 0
        __asm__ __volatile__ (
            "movq _LBCarryMask, %%mm5    \n\t"
            
            "movl _dif, %%ebx            \n\t" 
                                               
            "movl row, %%edi             \n\t" 
            "movq _HBClearMask, %%mm4    \n\t"
            "movl %%edi, %%edx           \n\t"
            "movl prev_row, %%esi        \n\t" 
            "subl bpp, %%edx             \n\t" 
         "avg_Alp:                       \n\t"
            "movq (%%edi,%%ebx,), %%mm0  \n\t"
            "movq %%mm5, %%mm3           \n\t"
            "movq (%%esi,%%ebx,), %%mm1  \n\t"
            "pand %%mm1, %%mm3           \n\t" 
            "movq (%%edx,%%ebx,), %%mm2  \n\t"
            "psrlq $1, %%mm1             \n\t" 
            "pand %%mm2, %%mm3           \n\t" 
                                               
            "psrlq $1, %%mm2             \n\t" 
            "pand  %%mm4, %%mm1          \n\t" 
                                               
            "paddb %%mm3, %%mm0          \n\t" 
                                               
            "pand  %%mm4, %%mm2          \n\t" 
                                               
            "paddb %%mm1, %%mm0          \n\t" 
                                               
            "addl $8, %%ebx              \n\t"
            "paddb %%mm2, %%mm0          \n\t" 
                                               
            "cmpl _MMXLength, %%ebx      \n\t"
            "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
            "jb avg_Alp                  \n\t"
            : 
            : 
            : "%ebx", "%edx", "%edi", "%esi" 
         );
#endif 
      }
      break;
   } 
   __asm__ __volatile__ (
      
      
#ifdef __PIC__
      "pushl %%ebx                 \n\t" 
#endif
      "movl _MMXLength, %%ebx      \n\t" 
      "cmpl _FullLength, %%ebx     \n\t" 
      "jnb avg_end                 \n\t"
      
      "movl %%edi, %%edx           \n\t"
      "subl %%ecx, %%edx           \n\t" 
      "xorl %%ecx, %%ecx           \n\t" 
   "avg_lp2:                       \n\t"
      
      "xorl %%eax, %%eax           \n\t"
      "movb (%%esi,%%ebx,), %%cl   \n\t" 
      "movb (%%edx,%%ebx,), %%al   \n\t" 
      "addw %%cx, %%ax             \n\t"
      "incl %%ebx                  \n\t"
      "shrw %%ax                   \n\t" 
      "addb -1(%%edi,%%ebx,), %%al \n\t" 
      "cmpl _FullLength, %%ebx     \n\t" 
      "movb %%al, -1(%%edi,%%ebx,) \n\t" 
      "jb avg_lp2                  \n\t" 
   "avg_end:                       \n\t"
      "EMMS                        \n\t" 
#ifdef __PIC__
      "popl %%ebx                  \n\t" 
#endif
      : "=c" (dummy_value_c),            
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)
      : "0" (bpp),       
        "1" (prev_row),  
        "2" (row)        
      : "%eax", "%edx"                   
#ifndef __PIC__
      , "%ebx"
#endif
   );
} 
#endif
#ifdef PNG_THREAD_UNSAFE_OK
static void 
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
                              png_bytep prev_row)
{
   int bpp;
   int dummy_value_c;   
   int dummy_value_S;
   int dummy_value_D;
   bpp = (row_info->pixel_depth + 7) >> 3; 
   _FullLength  = row_info->rowbytes; 
   __asm__ __volatile__ (
#ifdef __PIC__
      "pushl %%ebx                 \n\t" 
#endif
      "xorl %%ebx, %%ebx           \n\t" 
      "xorl %%edx, %%edx           \n\t" 
      "xorl %%eax, %%eax           \n\t"
      
      
      
   "paeth_rlp:                     \n\t"
      "movb (%%edi,%%ebx,), %%al   \n\t"
      "addb (%%esi,%%ebx,), %%al   \n\t"
      "incl %%ebx                  \n\t"
      "cmpl %%ecx, %%ebx           \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t"
      "jb paeth_rlp                \n\t"
      
      "movl %%edi, _dif            \n\t" 
      "addl %%ebx, _dif            \n\t" 
      "xorl %%ecx, %%ecx           \n\t"
      "addl $0xf, _dif             \n\t" 
                                         
      "andl $0xfffffff8, _dif      \n\t" 
      "subl %%edi, _dif            \n\t" 
                                         
      "jz paeth_go                 \n\t"
      
   "paeth_lp1:                     \n\t"
      "xorl %%eax, %%eax           \n\t"
      
      "movb (%%esi,%%ebx,), %%al   \n\t" 
      "movb (%%esi,%%edx,), %%cl   \n\t" 
      "subl %%ecx, %%eax           \n\t" 
      "movl %%eax, _patemp         \n\t" 
      "xorl %%eax, %%eax           \n\t"
      
      "movb (%%edi,%%edx,), %%al   \n\t" 
      "subl %%ecx, %%eax           \n\t" 
      "movl %%eax, %%ecx           \n\t"
      
      "addl _patemp, %%eax         \n\t" 
      
      "testl $0x80000000, %%eax    \n\t"
      "jz paeth_pca                \n\t"
      "negl %%eax                  \n\t" 
   "paeth_pca:                     \n\t"
      "movl %%eax, _pctemp         \n\t" 
      
      "testl $0x80000000, %%ecx    \n\t"
      "jz paeth_pba                \n\t"
      "negl %%ecx                  \n\t" 
   "paeth_pba:                     \n\t"
      "movl %%ecx, _pbtemp         \n\t" 
      
      "movl _patemp, %%eax         \n\t"
      "testl $0x80000000, %%eax    \n\t"
      "jz paeth_paa                \n\t"
      "negl %%eax                  \n\t" 
   "paeth_paa:                     \n\t"
      "movl %%eax, _patemp         \n\t" 
      
      "cmpl %%ecx, %%eax           \n\t"
      "jna paeth_abb               \n\t"
      
      "cmpl _pctemp, %%ecx         \n\t"
      "jna paeth_bbc               \n\t"
      
      "movb (%%esi,%%edx,), %%cl   \n\t" 
      "jmp paeth_paeth             \n\t"
   "paeth_bbc:                     \n\t"
      
      "movb (%%esi,%%ebx,), %%cl   \n\t" 
      "jmp paeth_paeth             \n\t"
   "paeth_abb:                     \n\t"
      
      "cmpl _pctemp, %%eax         \n\t"
      "jna paeth_abc               \n\t"
      
      "movb (%%esi,%%edx,), %%cl   \n\t" 
      "jmp paeth_paeth             \n\t"
   "paeth_abc:                     \n\t"
      
      "movb (%%edi,%%edx,), %%cl   \n\t" 
   "paeth_paeth:                   \n\t"
      "incl %%ebx                  \n\t"
      "incl %%edx                  \n\t"
      
      "addb %%cl, -1(%%edi,%%ebx,) \n\t"
      "cmpl _dif, %%ebx            \n\t"
      "jb paeth_lp1                \n\t"
   "paeth_go:                      \n\t"
      "movl _FullLength, %%ecx     \n\t"
      "movl %%ecx, %%eax           \n\t"
      "subl %%ebx, %%eax           \n\t" 
      "andl $0x00000007, %%eax     \n\t" 
      "subl %%eax, %%ecx           \n\t" 
      "movl %%ecx, _MMXLength      \n\t"
#ifdef __PIC__
      "popl %%ebx                  \n\t" 
#endif
      : "=c" (dummy_value_c),            
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)
      : "0" (bpp),       
        "1" (prev_row),  
        "2" (row)        
      : "%eax", "%edx"                   
#ifndef __PIC__
      , "%ebx"
#endif
   );
   
   switch (bpp)
   {
      case 3:
      {
         _ActiveMask.use = 0x0000000000ffffffLL;
         _ActiveMaskEnd.use = 0xffff000000000000LL;
         _ShiftBpp.use = 24;    
         _ShiftRem.use = 40;    
         __asm__ __volatile__ (
            "movl _dif, %%ecx            \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
         "paeth_3lp:                     \n\t"
            "psrlq _ShiftRem, %%mm1      \n\t" 
                                               
            "movq (%%esi,%%ecx,), %%mm2  \n\t" 
            "punpcklbw %%mm0, %%mm1      \n\t" 
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" 
            "punpcklbw %%mm0, %%mm2      \n\t" 
            "psrlq _ShiftRem, %%mm3      \n\t" 
                                               
            
            "movq %%mm2, %%mm4           \n\t"
            "punpcklbw %%mm0, %%mm3      \n\t" 
            
            "movq %%mm1, %%mm5           \n\t"
            "psubw %%mm3, %%mm4          \n\t"
            "pxor %%mm7, %%mm7           \n\t"
            
            "movq %%mm4, %%mm6           \n\t"
            "psubw %%mm3, %%mm5          \n\t"
            
            
            
            "pcmpgtw %%mm4, %%mm0        \n\t" 
            "paddw %%mm5, %%mm6          \n\t"
            "pand %%mm4, %%mm0           \n\t" 
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "pand %%mm5, %%mm7           \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "psubw %%mm7, %%mm5          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "pcmpgtw %%mm6, %%mm0        \n\t" 
            "pand %%mm6, %%mm0           \n\t" 
            "psubw %%mm7, %%mm5          \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            
            "movq %%mm4, %%mm7           \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "movq %%mm7, %%mm0           \n\t"
            
            "pand %%mm7, %%mm5           \n\t"
            
            "pand %%mm0, %%mm2           \n\t"
            "pandn %%mm4, %%mm7          \n\t"
            "pandn %%mm1, %%mm0          \n\t"
            "paddw %%mm5, %%mm7          \n\t"
            "paddw %%mm2, %%mm0          \n\t"
            
            "pcmpgtw %%mm6, %%mm7        \n\t" 
            "pxor %%mm1, %%mm1           \n\t"
            "pand %%mm7, %%mm3           \n\t"
            "pandn %%mm0, %%mm7          \n\t"
            "paddw %%mm3, %%mm7          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "packuswb %%mm1, %%mm7       \n\t"
            "movq (%%esi,%%ecx,), %%mm3  \n\t" 
            "pand _ActiveMask, %%mm7     \n\t"
            "movq %%mm3, %%mm2           \n\t" 
            "paddb (%%edi,%%ecx,), %%mm7 \n\t" 
            "punpcklbw %%mm0, %%mm3      \n\t" 
            "movq %%mm7, (%%edi,%%ecx,)  \n\t" 
            "movq %%mm7, %%mm1           \n\t" 
                                               
            
            "psrlq _ShiftBpp, %%mm2      \n\t" 
            "punpcklbw %%mm0, %%mm1      \n\t" 
            "pxor %%mm7, %%mm7           \n\t"
            "punpcklbw %%mm0, %%mm2      \n\t" 
            
            "movq %%mm1, %%mm5           \n\t"
            
            "movq %%mm2, %%mm4           \n\t"
            "psubw %%mm3, %%mm5          \n\t"
            "psubw %%mm3, %%mm4          \n\t"
            
            
            "movq %%mm5, %%mm6           \n\t"
            "paddw %%mm4, %%mm6          \n\t"
            
            
            
            "pcmpgtw %%mm5, %%mm0        \n\t" 
            "pcmpgtw %%mm4, %%mm7        \n\t" 
            "pand %%mm5, %%mm0           \n\t" 
            "pand %%mm4, %%mm7           \n\t" 
            "psubw %%mm0, %%mm5          \n\t"
            "psubw %%mm7, %%mm4          \n\t"
            "psubw %%mm0, %%mm5          \n\t"
            "psubw %%mm7, %%mm4          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "pcmpgtw %%mm6, %%mm0        \n\t" 
            "pand %%mm6, %%mm0           \n\t" 
            "psubw %%mm0, %%mm6          \n\t"
            
            "movq %%mm4, %%mm7           \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "movq %%mm7, %%mm0           \n\t"
            
            "pand %%mm7, %%mm5           \n\t"
            
            "pand %%mm0, %%mm2           \n\t"
            "pandn %%mm4, %%mm7          \n\t"
            "pandn %%mm1, %%mm0          \n\t"
            "paddw %%mm5, %%mm7          \n\t"
            "paddw %%mm2, %%mm0          \n\t"
            
            "pcmpgtw %%mm6, %%mm7        \n\t" 
            "movq (%%esi,%%ecx,), %%mm2  \n\t" 
            "pand %%mm7, %%mm3           \n\t"
            "pandn %%mm0, %%mm7          \n\t"
            "pxor %%mm1, %%mm1           \n\t"
            "paddw %%mm3, %%mm7          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "packuswb %%mm1, %%mm7       \n\t"
            "movq %%mm2, %%mm3           \n\t" 
            "pand _ActiveMask, %%mm7     \n\t"
            "punpckhbw %%mm0, %%mm2      \n\t" 
            "psllq _ShiftBpp, %%mm7      \n\t" 
                                               
             
            "movq %%mm2, %%mm4           \n\t"
            "paddb (%%edi,%%ecx,), %%mm7 \n\t" 
            "psllq _ShiftBpp, %%mm3      \n\t" 
            "movq %%mm7, (%%edi,%%ecx,)  \n\t" 
            "movq %%mm7, %%mm1           \n\t"
            "punpckhbw %%mm0, %%mm3      \n\t" 
            "psllq _ShiftBpp, %%mm1      \n\t" 
                                    
            
            "pxor %%mm7, %%mm7           \n\t"
            "punpckhbw %%mm0, %%mm1      \n\t" 
            "psubw %%mm3, %%mm4          \n\t"
            
            "movq %%mm1, %%mm5           \n\t"
            
            "movq %%mm4, %%mm6           \n\t"
            "psubw %%mm3, %%mm5          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "paddw %%mm5, %%mm6          \n\t"
            
            
            
            "pcmpgtw %%mm4, %%mm0        \n\t" 
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "pand %%mm4, %%mm0           \n\t" 
            "pand %%mm5, %%mm7           \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "psubw %%mm7, %%mm5          \n\t"
            "psubw %%mm0, %%mm4          \n\t"
            "psubw %%mm7, %%mm5          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "pcmpgtw %%mm6, %%mm0        \n\t" 
            "pand %%mm6, %%mm0           \n\t" 
            "psubw %%mm0, %%mm6          \n\t"
            
            "movq %%mm4, %%mm7           \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "movq %%mm7, %%mm0           \n\t"
            
            "pand %%mm0, %%mm2           \n\t"
            
            "pand %%mm7, %%mm5           \n\t"
            "pandn %%mm1, %%mm0          \n\t"
            "pandn %%mm4, %%mm7          \n\t"
            "paddw %%mm2, %%mm0          \n\t"
            "paddw %%mm5, %%mm7          \n\t"
            
            "pcmpgtw %%mm6, %%mm7        \n\t" 
            "pand %%mm7, %%mm3           \n\t"
            "pandn %%mm0, %%mm7          \n\t"
            "paddw %%mm3, %%mm7          \n\t"
            "pxor %%mm1, %%mm1           \n\t"
            "packuswb %%mm7, %%mm1       \n\t"
            
            "addl $8, %%ecx              \n\t"
            "pand _ActiveMaskEnd, %%mm1  \n\t"
            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" 
                                                 
            "cmpl _MMXLength, %%ecx      \n\t"
            "pxor %%mm0, %%mm0           \n\t" 
            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" 
                                 
                           
            "jb paeth_3lp                \n\t"
            : "=S" (dummy_value_S),             
              "=D" (dummy_value_D)
            : "0" (prev_row),  
              "1" (row)        
            : "%ecx"                            
#if 0  
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;  
      case 6:
      
      
      {
         _ActiveMask.use  = 0x00000000ffffffffLL;
         _ActiveMask2.use = 0xffffffff00000000LL;
         _ShiftBpp.use = bpp << 3;    
         _ShiftRem.use = 64 - _ShiftBpp.use;
         __asm__ __volatile__ (
            "movl _dif, %%ecx            \n\t"
            
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
            "pxor %%mm0, %%mm0           \n\t"
         "paeth_6lp:                     \n\t"
            
            "psrlq _ShiftRem, %%mm1      \n\t"
            
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" 
            "punpcklbw %%mm0, %%mm1      \n\t" 
            "movq (%%esi,%%ecx,), %%mm2  \n\t" 
            "punpcklbw %%mm0, %%mm2      \n\t" 
            
            "psrlq _ShiftRem, %%mm3      \n\t"
            
            "movq %%mm2, %%mm4           \n\t"
            "punpcklbw %%mm0, %%mm3      \n\t" 
            
            "movq %%mm1, %%mm5           \n\t"
            "psubw %%mm3, %%mm4          \n\t"
            "pxor %%mm7, %%mm7           \n\t"
            
            "movq %%mm4, %%mm6           \n\t"
            "psubw %%mm3, %%mm5          \n\t"
            
            
            
            "pcmpgtw %%mm4, %%mm0        \n\t" 
            "paddw %%mm5, %%mm6          \n\t"
            "pand %%mm4, %%mm0           \n\t" 
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "pand %%mm5, %%mm7           \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "psubw %%mm7, %%mm5          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "pcmpgtw %%mm6, %%mm0        \n\t" 
            "pand %%mm6, %%mm0           \n\t" 
            "psubw %%mm7, %%mm5          \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            
            "movq %%mm4, %%mm7           \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "movq %%mm7, %%mm0           \n\t"
            
            "pand %%mm7, %%mm5           \n\t"
            
            "pand %%mm0, %%mm2           \n\t"
            "pandn %%mm4, %%mm7          \n\t"
            "pandn %%mm1, %%mm0          \n\t"
            "paddw %%mm5, %%mm7          \n\t"
            "paddw %%mm2, %%mm0          \n\t"
            
            "pcmpgtw %%mm6, %%mm7        \n\t" 
            "pxor %%mm1, %%mm1           \n\t"
            "pand %%mm7, %%mm3           \n\t"
            "pandn %%mm0, %%mm7          \n\t"
            "paddw %%mm3, %%mm7          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "packuswb %%mm1, %%mm7       \n\t"
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" 
            "pand _ActiveMask, %%mm7     \n\t"
            "psrlq _ShiftRem, %%mm3      \n\t"
            "movq (%%esi,%%ecx,), %%mm2  \n\t" 
            "paddb (%%edi,%%ecx,), %%mm7 \n\t" 
            "movq %%mm2, %%mm6           \n\t"
            "movq %%mm7, (%%edi,%%ecx,)  \n\t" 
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
            "psllq _ShiftBpp, %%mm6      \n\t"
            "movq %%mm7, %%mm5           \n\t"
            "psrlq _ShiftRem, %%mm1      \n\t"
            "por %%mm6, %%mm3            \n\t"
            "psllq _ShiftBpp, %%mm5      \n\t"
            "punpckhbw %%mm0, %%mm3      \n\t" 
            "por %%mm5, %%mm1            \n\t"
            
            "punpckhbw %%mm0, %%mm2      \n\t" 
            "punpckhbw %%mm0, %%mm1      \n\t" 
            
            "movq %%mm2, %%mm4           \n\t"
            
            "movq %%mm1, %%mm5           \n\t"
            "psubw %%mm3, %%mm4          \n\t"
            "pxor %%mm7, %%mm7           \n\t"
            
            "movq %%mm4, %%mm6           \n\t"
            "psubw %%mm3, %%mm5          \n\t"
            
            
            
            "pcmpgtw %%mm4, %%mm0        \n\t" 
            "paddw %%mm5, %%mm6          \n\t"
            "pand %%mm4, %%mm0           \n\t" 
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "pand %%mm5, %%mm7           \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "psubw %%mm7, %%mm5          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "pcmpgtw %%mm6, %%mm0        \n\t" 
            "pand %%mm6, %%mm0           \n\t" 
            "psubw %%mm7, %%mm5          \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            
            "movq %%mm4, %%mm7           \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "movq %%mm7, %%mm0           \n\t"
            
            "pand %%mm7, %%mm5           \n\t"
            
            "pand %%mm0, %%mm2           \n\t"
            "pandn %%mm4, %%mm7          \n\t"
            "pandn %%mm1, %%mm0          \n\t"
            "paddw %%mm5, %%mm7          \n\t"
            "paddw %%mm2, %%mm0          \n\t"
            
            "pcmpgtw %%mm6, %%mm7        \n\t" 
            "pxor %%mm1, %%mm1           \n\t"
            "pand %%mm7, %%mm3           \n\t"
            "pandn %%mm0, %%mm7          \n\t"
            "pxor %%mm1, %%mm1           \n\t"
            "paddw %%mm3, %%mm7          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            
            "addl $8, %%ecx              \n\t"
            "packuswb %%mm7, %%mm1       \n\t"
            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" 
            "cmpl _MMXLength, %%ecx      \n\t"
            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" 
                                
            "jb paeth_6lp                \n\t"
            : "=S" (dummy_value_S),             
              "=D" (dummy_value_D)
            : "0" (prev_row),  
              "1" (row)        
            : "%ecx"                            
#if 0  
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;  
      case 4:
      {
         _ActiveMask.use  = 0x00000000ffffffffLL;
         __asm__ __volatile__ (
            "movl _dif, %%ecx            \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" 
                                     
         "paeth_4lp:                     \n\t"
            
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" 
            "punpckhbw %%mm0, %%mm1      \n\t" 
            "movq (%%esi,%%ecx,), %%mm2  \n\t" 
            "punpcklbw %%mm0, %%mm2      \n\t" 
            
            "movq %%mm2, %%mm4           \n\t"
            "punpckhbw %%mm0, %%mm3      \n\t" 
            
            "movq %%mm1, %%mm5           \n\t"
            "psubw %%mm3, %%mm4          \n\t"
            "pxor %%mm7, %%mm7           \n\t"
            
            "movq %%mm4, %%mm6           \n\t"
            "psubw %%mm3, %%mm5          \n\t"
            
            
            
            "pcmpgtw %%mm4, %%mm0        \n\t" 
            "paddw %%mm5, %%mm6          \n\t"
            "pand %%mm4, %%mm0           \n\t" 
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "pand %%mm5, %%mm7           \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "psubw %%mm7, %%mm5          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "pcmpgtw %%mm6, %%mm0        \n\t" 
            "pand %%mm6, %%mm0           \n\t" 
            "psubw %%mm7, %%mm5          \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            
            "movq %%mm4, %%mm7           \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "movq %%mm7, %%mm0           \n\t"
            
            "pand %%mm7, %%mm5           \n\t"
            
            "pand %%mm0, %%mm2           \n\t"
            "pandn %%mm4, %%mm7          \n\t"
            "pandn %%mm1, %%mm0          \n\t"
            "paddw %%mm5, %%mm7          \n\t"
            "paddw %%mm2, %%mm0          \n\t"
            
            "pcmpgtw %%mm6, %%mm7        \n\t" 
            "pxor %%mm1, %%mm1           \n\t"
            "pand %%mm7, %%mm3           \n\t"
            "pandn %%mm0, %%mm7          \n\t"
            "paddw %%mm3, %%mm7          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "packuswb %%mm1, %%mm7       \n\t"
            "movq (%%esi,%%ecx,), %%mm3  \n\t" 
            "pand _ActiveMask, %%mm7     \n\t"
            "movq %%mm3, %%mm2           \n\t" 
            "paddb (%%edi,%%ecx,), %%mm7 \n\t" 
            "punpcklbw %%mm0, %%mm3      \n\t" 
            "movq %%mm7, (%%edi,%%ecx,)  \n\t" 
            "movq %%mm7, %%mm1           \n\t" 
            
            "punpckhbw %%mm0, %%mm2      \n\t" 
            "punpcklbw %%mm0, %%mm1      \n\t" 
            
            "movq %%mm2, %%mm4           \n\t"
            
            "movq %%mm1, %%mm5           \n\t"
            "psubw %%mm3, %%mm4          \n\t"
            "pxor %%mm7, %%mm7           \n\t"
            
            "movq %%mm4, %%mm6           \n\t"
            "psubw %%mm3, %%mm5          \n\t"
            
            
            
            "pcmpgtw %%mm4, %%mm0        \n\t" 
            "paddw %%mm5, %%mm6          \n\t"
            "pand %%mm4, %%mm0           \n\t" 
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "pand %%mm5, %%mm7           \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "psubw %%mm7, %%mm5          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "pcmpgtw %%mm6, %%mm0        \n\t" 
            "pand %%mm6, %%mm0           \n\t" 
            "psubw %%mm7, %%mm5          \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            
            "movq %%mm4, %%mm7           \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "movq %%mm7, %%mm0           \n\t"
            
            "pand %%mm7, %%mm5           \n\t"
            
            "pand %%mm0, %%mm2           \n\t"
            "pandn %%mm4, %%mm7          \n\t"
            "pandn %%mm1, %%mm0          \n\t"
            "paddw %%mm5, %%mm7          \n\t"
            "paddw %%mm2, %%mm0          \n\t"
            
            "pcmpgtw %%mm6, %%mm7        \n\t" 
            "pxor %%mm1, %%mm1           \n\t"
            "pand %%mm7, %%mm3           \n\t"
            "pandn %%mm0, %%mm7          \n\t"
            "pxor %%mm1, %%mm1           \n\t"
            "paddw %%mm3, %%mm7          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            
            "addl $8, %%ecx              \n\t"
            "packuswb %%mm7, %%mm1       \n\t"
            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" 
            "cmpl _MMXLength, %%ecx      \n\t"
            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" 
                                
            "jb paeth_4lp                \n\t"
            : "=S" (dummy_value_S),             
              "=D" (dummy_value_D)
            : "0" (prev_row),  
              "1" (row)        
            : "%ecx"                            
#if 0  
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;  
      case 8:                          
      {
         _ActiveMask.use  = 0x00000000ffffffffLL;
         __asm__ __volatile__ (
            "movl _dif, %%ecx            \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" 
                                       
         "paeth_8lp:                     \n\t"
            
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" 
            "punpcklbw %%mm0, %%mm1      \n\t" 
            "movq (%%esi,%%ecx,), %%mm2  \n\t" 
            "punpcklbw %%mm0, %%mm2      \n\t" 
            
            "movq %%mm2, %%mm4           \n\t"
            "punpcklbw %%mm0, %%mm3      \n\t" 
            
            "movq %%mm1, %%mm5           \n\t"
            "psubw %%mm3, %%mm4          \n\t"
            "pxor %%mm7, %%mm7           \n\t"
            
            "movq %%mm4, %%mm6           \n\t"
            "psubw %%mm3, %%mm5          \n\t"
            
            
            
            "pcmpgtw %%mm4, %%mm0        \n\t" 
            "paddw %%mm5, %%mm6          \n\t"
            "pand %%mm4, %%mm0           \n\t" 
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "pand %%mm5, %%mm7           \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "psubw %%mm7, %%mm5          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "pcmpgtw %%mm6, %%mm0        \n\t" 
            "pand %%mm6, %%mm0           \n\t" 
            "psubw %%mm7, %%mm5          \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            
            "movq %%mm4, %%mm7           \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "movq %%mm7, %%mm0           \n\t"
            
            "pand %%mm7, %%mm5           \n\t"
            
            "pand %%mm0, %%mm2           \n\t"
            "pandn %%mm4, %%mm7          \n\t"
            "pandn %%mm1, %%mm0          \n\t"
            "paddw %%mm5, %%mm7          \n\t"
            "paddw %%mm2, %%mm0          \n\t"
            
            "pcmpgtw %%mm6, %%mm7        \n\t" 
            "pxor %%mm1, %%mm1           \n\t"
            "pand %%mm7, %%mm3           \n\t"
            "pandn %%mm0, %%mm7          \n\t"
            "paddw %%mm3, %%mm7          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "packuswb %%mm1, %%mm7       \n\t"
            "movq -8(%%esi,%%ecx,), %%mm3 \n\t" 
            "pand _ActiveMask, %%mm7     \n\t"
            "movq (%%esi,%%ecx,), %%mm2  \n\t" 
            "paddb (%%edi,%%ecx,), %%mm7 \n\t" 
            "punpckhbw %%mm0, %%mm3      \n\t" 
            "movq %%mm7, (%%edi,%%ecx,)  \n\t" 
            "movq -8(%%edi,%%ecx,), %%mm1 \n\t" 
            
            "punpckhbw %%mm0, %%mm2      \n\t" 
            "punpckhbw %%mm0, %%mm1      \n\t" 
            
            "movq %%mm2, %%mm4           \n\t"
            
            "movq %%mm1, %%mm5           \n\t"
            "psubw %%mm3, %%mm4          \n\t"
            "pxor %%mm7, %%mm7           \n\t"
            
            "movq %%mm4, %%mm6           \n\t"
            "psubw %%mm3, %%mm5          \n\t"
            
            
            
            "pcmpgtw %%mm4, %%mm0        \n\t" 
            "paddw %%mm5, %%mm6          \n\t"
            "pand %%mm4, %%mm0           \n\t" 
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "pand %%mm5, %%mm7           \n\t" 
            "psubw %%mm0, %%mm4          \n\t"
            "psubw %%mm7, %%mm5          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            "pcmpgtw %%mm6, %%mm0        \n\t" 
            "pand %%mm6, %%mm0           \n\t" 
            "psubw %%mm7, %%mm5          \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            
            "movq %%mm4, %%mm7           \n\t"
            "psubw %%mm0, %%mm6          \n\t"
            "pcmpgtw %%mm5, %%mm7        \n\t" 
            "movq %%mm7, %%mm0           \n\t"
            
            "pand %%mm7, %%mm5           \n\t"
            
            "pand %%mm0, %%mm2           \n\t"
            "pandn %%mm4, %%mm7          \n\t"
            "pandn %%mm1, %%mm0          \n\t"
            "paddw %%mm5, %%mm7          \n\t"
            "paddw %%mm2, %%mm0          \n\t"
            
            "pcmpgtw %%mm6, %%mm7        \n\t" 
            "pxor %%mm1, %%mm1           \n\t"
            "pand %%mm7, %%mm3           \n\t"
            "pandn %%mm0, %%mm7          \n\t"
            "pxor %%mm1, %%mm1           \n\t"
            "paddw %%mm3, %%mm7          \n\t"
            "pxor %%mm0, %%mm0           \n\t"
            
            "addl $8, %%ecx              \n\t"
            "packuswb %%mm7, %%mm1       \n\t"
            "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" 
            "cmpl _MMXLength, %%ecx      \n\t"
            "movq %%mm1, -8(%%edi,%%ecx,) \n\t" 
                            
            "jb paeth_8lp                \n\t"
            : "=S" (dummy_value_S),             
              "=D" (dummy_value_D)
            : "0" (prev_row),  
              "1" (row)        
            : "%ecx"                            
#if 0  
            , "%mm0", "%mm1", "%mm2", "%mm3"
            , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;  
      case 1:                
      case 2:                
      default:               
      {
         __asm__ __volatile__ (
#ifdef __PIC__
            "pushl %%ebx                 \n\t" 
#endif
            "movl _dif, %%ebx            \n\t"
            "cmpl _FullLength, %%ebx     \n\t"
            "jnb paeth_dend              \n\t"
            
            "movl %%ebx, %%edx           \n\t"
            "subl %%ecx, %%edx           \n\t" 
            "xorl %%ecx, %%ecx           \n\t" 
         "paeth_dlp:                     \n\t"
            "xorl %%eax, %%eax           \n\t"
            
            "movb (%%esi,%%ebx,), %%al   \n\t" 
            "movb (%%esi,%%edx,), %%cl   \n\t" 
            "subl %%ecx, %%eax           \n\t" 
            "movl %%eax, _patemp         \n\t" 
            "xorl %%eax, %%eax           \n\t"
            
            "movb (%%edi,%%edx,), %%al   \n\t" 
            "subl %%ecx, %%eax           \n\t" 
            "movl %%eax, %%ecx           \n\t"
            
            "addl _patemp, %%eax         \n\t" 
            
            "testl $0x80000000, %%eax    \n\t"
            "jz paeth_dpca               \n\t"
            "negl %%eax                  \n\t" 
         "paeth_dpca:                    \n\t"
            "movl %%eax, _pctemp         \n\t" 
            
            "testl $0x80000000, %%ecx    \n\t"
            "jz paeth_dpba               \n\t"
            "negl %%ecx                  \n\t" 
         "paeth_dpba:                    \n\t"
            "movl %%ecx, _pbtemp         \n\t" 
            
            "movl _patemp, %%eax         \n\t"
            "testl $0x80000000, %%eax    \n\t"
            "jz paeth_dpaa               \n\t"
            "negl %%eax                  \n\t" 
         "paeth_dpaa:                    \n\t"
            "movl %%eax, _patemp         \n\t" 
            
            "cmpl %%ecx, %%eax           \n\t"
            "jna paeth_dabb              \n\t"
            
            "cmpl _pctemp, %%ecx         \n\t"
            "jna paeth_dbbc              \n\t"
            
            "movb (%%esi,%%edx,), %%cl   \n\t" 
            "jmp paeth_dpaeth            \n\t"
         "paeth_dbbc:                    \n\t"
            
            "movb (%%esi,%%ebx,), %%cl   \n\t" 
            "jmp paeth_dpaeth            \n\t"
         "paeth_dabb:                    \n\t"
            
            "cmpl _pctemp, %%eax         \n\t"
            "jna paeth_dabc              \n\t"
            
            "movb (%%esi,%%edx,), %%cl   \n\t" 
            "jmp paeth_dpaeth            \n\t"
         "paeth_dabc:                    \n\t"
            
            "movb (%%edi,%%edx,), %%cl   \n\t" 
         "paeth_dpaeth:                  \n\t"
            "incl %%ebx                  \n\t"
            "incl %%edx                  \n\t"
            
            "addb %%cl, -1(%%edi,%%ebx,) \n\t"
            "cmpl _FullLength, %%ebx     \n\t"
            "jb paeth_dlp                \n\t"
         "paeth_dend:                    \n\t"
#ifdef __PIC__
            "popl %%ebx                  \n\t" 
#endif
            : "=c" (dummy_value_c),            
              "=S" (dummy_value_S),
              "=D" (dummy_value_D)
            : "0" (bpp),       
              "1" (prev_row),  
              "2" (row)        
            : "%eax", "%edx"                   
#ifndef __PIC__
            , "%ebx"
#endif
         );
      }
      return;                   
   } 
   __asm__ __volatile__ (
      
      
#ifdef __PIC__
      "pushl %%ebx                 \n\t" 
#endif
      "movl _MMXLength, %%ebx      \n\t"
      "cmpl _FullLength, %%ebx     \n\t"
      "jnb paeth_end               \n\t"
      
      "movl %%ebx, %%edx           \n\t"
      "subl %%ecx, %%edx           \n\t" 
      "xorl %%ecx, %%ecx           \n\t" 
   "paeth_lp2:                     \n\t"
      "xorl %%eax, %%eax           \n\t"
      
      "movb (%%esi,%%ebx,), %%al   \n\t" 
      "movb (%%esi,%%edx,), %%cl   \n\t" 
      "subl %%ecx, %%eax           \n\t" 
      "movl %%eax, _patemp         \n\t" 
      "xorl %%eax, %%eax           \n\t"
      
      "movb (%%edi,%%edx,), %%al   \n\t" 
      "subl %%ecx, %%eax           \n\t" 
      "movl %%eax, %%ecx           \n\t"
      
      "addl _patemp, %%eax         \n\t" 
      
      "testl $0x80000000, %%eax    \n\t"
      "jz paeth_pca2               \n\t"
      "negl %%eax                  \n\t" 
   "paeth_pca2:                    \n\t"
      "movl %%eax, _pctemp         \n\t" 
      
      "testl $0x80000000, %%ecx    \n\t"
      "jz paeth_pba2               \n\t"
      "negl %%ecx                  \n\t" 
   "paeth_pba2:                    \n\t"
      "movl %%ecx, _pbtemp         \n\t" 
      
      "movl _patemp, %%eax         \n\t"
      "testl $0x80000000, %%eax    \n\t"
      "jz paeth_paa2               \n\t"
      "negl %%eax                  \n\t" 
   "paeth_paa2:                    \n\t"
      "movl %%eax, _patemp         \n\t" 
      
      "cmpl %%ecx, %%eax           \n\t"
      "jna paeth_abb2              \n\t"
      
      "cmpl _pctemp, %%ecx         \n\t"
      "jna paeth_bbc2              \n\t"
      
      "movb (%%esi,%%edx,), %%cl   \n\t" 
      "jmp paeth_paeth2            \n\t"
   "paeth_bbc2:                    \n\t"
      
      "movb (%%esi,%%ebx,), %%cl   \n\t" 
      "jmp paeth_paeth2            \n\t"
   "paeth_abb2:                    \n\t"
      
      "cmpl _pctemp, %%eax         \n\t"
      "jna paeth_abc2              \n\t"
      
      "movb (%%esi,%%edx,), %%cl   \n\t" 
      "jmp paeth_paeth2            \n\t"
   "paeth_abc2:                    \n\t"
      
      "movb (%%edi,%%edx,), %%cl   \n\t" 
   "paeth_paeth2:                  \n\t"
      "incl %%ebx                  \n\t"
      "incl %%edx                  \n\t"
      
      "addb %%cl, -1(%%edi,%%ebx,) \n\t"
      "cmpl _FullLength, %%ebx     \n\t"
      "jb paeth_lp2                \n\t"
   "paeth_end:                     \n\t"
      "EMMS                        \n\t" 
#ifdef __PIC__
      "popl %%ebx                  \n\t" 
#endif
      : "=c" (dummy_value_c),            
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)
      : "0" (bpp),       
        "1" (prev_row),  
        "2" (row)        
      : "%eax", "%edx"                   
#ifndef __PIC__
      , "%ebx"
#endif
   );
} 
#endif
#ifdef PNG_THREAD_UNSAFE_OK
static void 
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
   int bpp;
   int dummy_value_a;
   int dummy_value_D;
   bpp = (row_info->pixel_depth + 7) >> 3;   
   _FullLength = row_info->rowbytes - bpp;   
   __asm__ __volatile__ (
      "movl %%edi, %%esi           \n\t" 
      "addl %%eax, %%edi           \n\t" 
      
      "movl %%edi, _dif            \n\t" 
      "addl $0xf, _dif             \n\t" 
                                         
      "xorl %%ecx, %%ecx           \n\t"
      "andl $0xfffffff8, _dif      \n\t" 
      "subl %%edi, _dif            \n\t" 
      "jz sub_go                   \n\t" 
   "sub_lp1:                       \n\t" 
      "movb (%%esi,%%ecx,), %%al   \n\t"
      "addb %%al, (%%edi,%%ecx,)   \n\t"
      "incl %%ecx                  \n\t"
      "cmpl _dif, %%ecx            \n\t"
      "jb sub_lp1                  \n\t"
   "sub_go:                        \n\t"
      "movl _FullLength, %%eax     \n\t"
      "movl %%eax, %%edx           \n\t"
      "subl %%ecx, %%edx           \n\t" 
      "andl $0x00000007, %%edx     \n\t" 
      "subl %%edx, %%eax           \n\t" 
      "movl %%eax, _MMXLength      \n\t"
      : "=a" (dummy_value_a),   
        "=D" (dummy_value_D)    
      : "0" (bpp),              
        "1" (row)               
      : "%esi", "%ecx", "%edx"            
#if 0  
      , "%mm0", "%mm1", "%mm2", "%mm3"
      , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
   );
   
   switch (bpp)
   {
      case 3:
      {
         _ActiveMask.use  = 0x0000ffffff000000LL;
         _ShiftBpp.use = 24;       
         _ShiftRem.use  = 40;      
         __asm__ __volatile__ (
            "movq _ActiveMask, %%mm7       \n\t" 
                                                
            "movl %%edi, %%esi            \n\t" 
            "addl %%eax, %%edi            \n\t" 
            "movq %%mm7, %%mm6            \n\t"
            "movl _dif, %%edx             \n\t"
            "psllq _ShiftBpp, %%mm6       \n\t" 
                                                
            
            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
         "sub_3lp:                        \n\t" 
            "psrlq _ShiftRem, %%mm1       \n\t" 
                                                
            
            "movq (%%edi,%%edx,), %%mm0   \n\t"
            "paddb %%mm1, %%mm0           \n\t"
            
            "movq %%mm0, %%mm1            \n\t" 
            "psllq _ShiftBpp, %%mm1       \n\t" 
            "pand %%mm7, %%mm1            \n\t" 
            "paddb %%mm1, %%mm0           \n\t"
            
            "movq %%mm0, %%mm1            \n\t" 
            "psllq _ShiftBpp, %%mm1       \n\t" 
            "pand %%mm6, %%mm1            \n\t" 
            "addl $8, %%edx               \n\t"
            "paddb %%mm1, %%mm0           \n\t"
            "cmpl _MMXLength, %%edx       \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" 
            "movq %%mm0, %%mm1            \n\t" 
            "jb sub_3lp                   \n\t"
            : "=a" (dummy_value_a),   
              "=D" (dummy_value_D)    
            : "0" (bpp),              
              "1" (row)               
            : "%edx", "%esi"                    
#if 0  
            , "%mm0", "%mm1", "%mm6", "%mm7"
#endif
         );
      }
      break;
      case 1:
      {
         __asm__ __volatile__ (
            "movl _dif, %%edx            \n\t"
            "cmpl _FullLength, %%edx     \n\t"
            "jnb sub_1end                \n\t"
            "movl %%edi, %%esi           \n\t" 
            "xorl %%eax, %%eax           \n\t"
            "addl %%eax, %%edi           \n\t" 
         "sub_1lp:                       \n\t"
            "movb (%%esi,%%edx,), %%al   \n\t"
            "addb %%al, (%%edi,%%edx,)   \n\t"
            "incl %%edx                  \n\t"
            "cmpl _FullLength, %%edx     \n\t"
            "jb sub_1lp                  \n\t"
         "sub_1end:                      \n\t"
            : "=a" (dummy_value_a),   
              "=D" (dummy_value_D)    
            : "0" (bpp),              
              "1" (row)               
            : "%edx", "%esi"                    
         );
      }
      return;
      case 6:
      case 4:
      
      
      {
         _ShiftBpp.use = bpp << 3;
         _ShiftRem.use = 64 - _ShiftBpp.use;
         __asm__ __volatile__ (
            "movl _dif, %%edx             \n\t"
            "movl %%edi, %%esi            \n\t" 
            "addl %%eax, %%edi            \n\t" 
            
            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
         "sub_4lp:                        \n\t" 
            "psrlq _ShiftRem, %%mm1       \n\t" 
                                                
            "movq (%%edi,%%edx,), %%mm0   \n\t"
            "paddb %%mm1, %%mm0           \n\t"
            
            "movq %%mm0, %%mm1            \n\t" 
            "psllq _ShiftBpp, %%mm1       \n\t" 
            "addl $8, %%edx               \n\t"
            "paddb %%mm1, %%mm0           \n\t"
            "cmpl _MMXLength, %%edx       \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t"
            "movq %%mm0, %%mm1            \n\t" 
            "jb sub_4lp                   \n\t"
            : "=a" (dummy_value_a),   
              "=D" (dummy_value_D)    
            : "0" (bpp),              
              "1" (row)               
            : "%edx", "%esi"                    
#if 0  
            , "%mm0", "%mm1"
#endif
         );
      }
      break;
      case 2:
      {
         _ActiveMask.use = 0x00000000ffff0000LL;
         _ShiftBpp.use = 16;       
         _ShiftRem.use = 48;       
         __asm__ __volatile__ (
            "movq _ActiveMask, %%mm7      \n\t" 
                                                
            "movl _dif, %%edx             \n\t"
            "movq %%mm7, %%mm6            \n\t"
            "psllq _ShiftBpp, %%mm6       \n\t" 
                                                
            "movl %%edi, %%esi            \n\t" 
            "movq %%mm6, %%mm5            \n\t"
            "addl %%eax, %%edi            \n\t" 
            "psllq _ShiftBpp, %%mm5       \n\t" 
                                                
            
            "movq -8(%%edi,%%edx,), %%mm1 \n\t"
         "sub_2lp:                        \n\t" 
            "psrlq _ShiftRem, %%mm1       \n\t" 
                                                
            
            "movq (%%edi,%%edx,), %%mm0   \n\t"
            "paddb %%mm1, %%mm0           \n\t"
            
            "movq %%mm0, %%mm1            \n\t" 
            "psllq _ShiftBpp, %%mm1       \n\t" 
            "pand %%mm7, %%mm1            \n\t" 
            "paddb %%mm1, %%mm0           \n\t"
            
            "movq %%mm0, %%mm1            \n\t" 
            "psllq _ShiftBpp, %%mm1       \n\t" 
            "pand %%mm6, %%mm1            \n\t" 
            "paddb %%mm1, %%mm0           \n\t"
            
            "movq %%mm0, %%mm1            \n\t" 
            "psllq _ShiftBpp, %%mm1       \n\t" 
            "pand %%mm5, %%mm1            \n\t" 
            "addl $8, %%edx               \n\t"
            "paddb %%mm1, %%mm0           \n\t"
            "cmpl _MMXLength, %%edx       \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" 
            "movq %%mm0, %%mm1            \n\t" 
            "jb sub_2lp                   \n\t"
            : "=a" (dummy_value_a),   
              "=D" (dummy_value_D)    
            : "0" (bpp),              
              "1" (row)               
            : "%edx", "%esi"                    
#if 0  
            , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;
      case 8:
      {
         __asm__ __volatile__ (
            "movl _dif, %%edx             \n\t"
            "movl %%edi, %%esi            \n\t" 
            "addl %%eax, %%edi            \n\t" 
            "movl _MMXLength, %%ecx       \n\t"
            
            "movq -8(%%edi,%%edx,), %%mm7 \n\t"
            "andl $0x0000003f, %%ecx      \n\t" 
         "sub_8lp:                        \n\t"
            "movq (%%edi,%%edx,), %%mm0   \n\t" 
            "paddb %%mm7, %%mm0           \n\t"
            "movq 8(%%edi,%%edx,), %%mm1  \n\t" 
            "movq %%mm0, (%%edi,%%edx,)   \n\t" 
            
            
            
            
            "paddb %%mm0, %%mm1           \n\t"
            "movq 16(%%edi,%%edx,), %%mm2 \n\t" 
            "movq %%mm1, 8(%%edi,%%edx,)  \n\t" 
            "paddb %%mm1, %%mm2           \n\t"
            "movq 24(%%edi,%%edx,), %%mm3 \n\t" 
            "movq %%mm2, 16(%%edi,%%edx,) \n\t" 
            "paddb %%mm2, %%mm3           \n\t"
            "movq 32(%%edi,%%edx,), %%mm4 \n\t" 
            "movq %%mm3, 24(%%edi,%%edx,) \n\t" 
            "paddb %%mm3, %%mm4           \n\t"
            "movq 40(%%edi,%%edx,), %%mm5 \n\t" 
            "movq %%mm4, 32(%%edi,%%edx,) \n\t" 
            "paddb %%mm4, %%mm5           \n\t"
            "movq 48(%%edi,%%edx,), %%mm6 \n\t" 
            "movq %%mm5, 40(%%edi,%%edx,) \n\t" 
            "paddb %%mm5, %%mm6           \n\t"
            "movq 56(%%edi,%%edx,), %%mm7 \n\t" 
            "movq %%mm6, 48(%%edi,%%edx,) \n\t" 
            "addl $64, %%edx              \n\t"
            "paddb %%mm6, %%mm7           \n\t"
            "cmpl %%ecx, %%edx            \n\t"
            "movq %%mm7, -8(%%edi,%%edx,) \n\t" 
            "jb sub_8lp                   \n\t"
            "cmpl _MMXLength, %%edx       \n\t"
            "jnb sub_8lt8                 \n\t"
         "sub_8lpA:                       \n\t"
            "movq (%%edi,%%edx,), %%mm0   \n\t"
            "addl $8, %%edx               \n\t"
            "paddb %%mm7, %%mm0           \n\t"
            "cmpl _MMXLength, %%edx       \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" 
            "movq %%mm0, %%mm7            \n\t" 
                                                
                                                
            "jb sub_8lpA                  \n\t"
         "sub_8lt8:                       \n\t"
            : "=a" (dummy_value_a),   
              "=D" (dummy_value_D)    
            : "0" (bpp),              
              "1" (row)               
            : "%ecx", "%edx", "%esi"            
#if 0  
            , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;
      default:                
      {
         __asm__ __volatile__ (
            "movl _dif, %%edx             \n\t"
            "movl %%edi, %%esi            \n\t" 
            "addl %%eax, %%edi            \n\t" 
         "sub_Alp:                        \n\t"
            "movq (%%edi,%%edx,), %%mm0   \n\t"
            "movq (%%esi,%%edx,), %%mm1   \n\t"
            "addl $8, %%edx               \n\t"
            "paddb %%mm1, %%mm0           \n\t"
            "cmpl _MMXLength, %%edx       \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" 
                                                
            "jb sub_Alp                   \n\t"
            : "=a" (dummy_value_a),   
              "=D" (dummy_value_D)    
            : "0" (bpp),              
              "1" (row)               
            : "%edx", "%esi"                    
#if 0  
            , "%mm0", "%mm1"
#endif
         );
      }
      break;
   } 
   __asm__ __volatile__ (
      "movl _MMXLength, %%edx       \n\t"
      "cmpl _FullLength, %%edx      \n\t"
      "jnb sub_end                  \n\t"
      "movl %%edi, %%esi            \n\t" 
      "addl %%eax, %%edi            \n\t" 
      "xorl %%eax, %%eax            \n\t"
   "sub_lp2:                        \n\t"
      "movb (%%esi,%%edx,), %%al    \n\t"
      "addb %%al, (%%edi,%%edx,)    \n\t"
      "incl %%edx                   \n\t"
      "cmpl _FullLength, %%edx      \n\t"
      "jb sub_lp2                   \n\t"
   "sub_end:                        \n\t"
      "EMMS                         \n\t" 
      : "=a" (dummy_value_a),   
        "=D" (dummy_value_D)    
      : "0" (bpp),              
        "1" (row)               
      : "%edx", "%esi"                    
   );
} 
#endif
static void 
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
                           png_bytep prev_row)
{
   png_uint_32 len;
   int dummy_value_d;   
   int dummy_value_S;
   int dummy_value_D;
   len = row_info->rowbytes;              
   __asm__ __volatile__ (
      
#ifdef __PIC__
      "pushl %%ebx                  \n\t"
#endif
      "movl %%edi, %%ecx            \n\t"
      "xorl %%ebx, %%ebx            \n\t"
      "addl $0x7, %%ecx             \n\t"
      "xorl %%eax, %%eax            \n\t"
      "andl $0xfffffff8, %%ecx      \n\t"
      "subl %%edi, %%ecx            \n\t"
      "jz up_go                     \n\t"
   "up_lp1:                         \n\t" 
      "movb (%%edi,%%ebx,), %%al    \n\t"
      "addb (%%esi,%%ebx,), %%al    \n\t"
      "incl %%ebx                   \n\t"
      "cmpl %%ecx, %%ebx            \n\t"
      "movb %%al, -1(%%edi,%%ebx,)  \n\t" 
      "jb up_lp1                    \n\t" 
   "up_go:                          \n\t"
      "movl %%edx, %%ecx            \n\t"
      "subl %%ebx, %%edx            \n\t" 
      "andl $0x0000003f, %%edx      \n\t" 
      "subl %%edx, %%ecx            \n\t" 
      
      
   "up_loop:                        \n\t"
      "movq (%%esi,%%ebx,), %%mm1   \n\t"
      "movq (%%edi,%%ebx,), %%mm0   \n\t"
      "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
      "paddb %%mm1, %%mm0           \n\t"
      "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
      "movq %%mm0, (%%edi,%%ebx,)   \n\t"
      "paddb %%mm3, %%mm2           \n\t"
      "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
      "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4           \n\t"
      "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
      "paddb %%mm7, %%mm6           \n\t"
      "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
      "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
      "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
      "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0           \n\t"
      "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
      "paddb %%mm3, %%mm2           \n\t"
      "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
      "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4           \n\t"
      "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
      "addl $64, %%ebx              \n\t"
      "paddb %%mm7, %%mm6           \n\t"
      "cmpl %%ecx, %%ebx            \n\t"
      "movq %%mm6, -8(%%edi,%%ebx,) \n\t" 
      "jb up_loop                   \n\t" 
      "cmpl $0, %%edx               \n\t" 
      "jz up_end                    \n\t"
      "cmpl $8, %%edx               \n\t" 
      "jb up_lt8                    \n\t" 
      "addl %%edx, %%ecx            \n\t"
      "andl $0x00000007, %%edx      \n\t" 
      "subl %%edx, %%ecx            \n\t" 
      "jz up_lt8                    \n\t"
   "up_lpA:                         \n\t" 
      "movq (%%esi,%%ebx,), %%mm1   \n\t"
      "movq (%%edi,%%ebx,), %%mm0   \n\t"
      "addl $8, %%ebx               \n\t"
      "paddb %%mm1, %%mm0           \n\t"
      "cmpl %%ecx, %%ebx            \n\t"
      "movq %%mm0, -8(%%edi,%%ebx,) \n\t" 
      "jb up_lpA                    \n\t" 
      "cmpl $0, %%edx               \n\t" 
      "jz up_end                    \n\t"
   "up_lt8:                         \n\t"
      "xorl %%eax, %%eax            \n\t"
      "addl %%edx, %%ecx            \n\t" 
   "up_lp2:                         \n\t" 
      "movb (%%edi,%%ebx,), %%al    \n\t"
      "addb (%%esi,%%ebx,), %%al    \n\t"
      "incl %%ebx                   \n\t"
      "cmpl %%ecx, %%ebx            \n\t"
      "movb %%al, -1(%%edi,%%ebx,)  \n\t" 
      "jb up_lp2                    \n\t" 
   "up_end:                         \n\t"
      "EMMS                         \n\t" 
#ifdef __PIC__
      "popl %%ebx                   \n\t"
#endif
      : "=d" (dummy_value_d),   
        "=S" (dummy_value_S),   
        "=D" (dummy_value_D)    
      : "0" (len),              
        "1" (prev_row),         
        "2" (row)               
      : "%eax", "%ecx"            
#ifndef __PIC__
      , "%ebx"
#endif
#if 0  
      , "%mm0", "%mm1", "%mm2", "%mm3"
      , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
   );
} 
#endif 
void 
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];
#endif
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
#define UseMMX_sub    1   
#define UseMMX_up     1   
#define UseMMX_avg    1   
#define UseMMX_paeth  1   
   if (_mmx_supported == 2) {
       
#if !defined(PNG_1_0_X)
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }
#endif 
#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
   switch (filter)
   {
      case 0: sprintf(filnm, "none");
         break;
      case 1: sprintf(filnm, "sub-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
#endif
#endif
"x86");
         break;
      case 2: sprintf(filnm, "up-%s",
#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
#endif
#endif
 "x86");
         break;
      case 3: sprintf(filnm, "avg-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
#endif
#endif
 "x86");
         break;
      case 4: sprintf(filnm, "Paeth-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
#endif
#endif
"x86");
         break;
      default: sprintf(filnm, "unknw");
         break;
   }
   png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
#endif BUG
   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;
      case PNG_FILTER_VALUE_SUB:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
#endif 
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;
            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         }  
         break;
      case PNG_FILTER_VALUE_UP:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
          else
#endif 
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         }  
         break;
      case PNG_FILTER_VALUE_AVG:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
#endif 
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;
            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }
            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         }  
         break;
      case PNG_FILTER_VALUE_PAETH:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
#endif 
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;
            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
            for (i = 0; i < istop; i++)   
            {
               int a, b, c, pa, pb, pc, p;
               a = *lp++;
               b = *pp++;
               c = *cp++;
               p = b - c;
               pc = a - c;
#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif
               
               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         }  
         break;
      default:
         png_warning(png_ptr, "Ignoring bad row-filter type");
         *row=0;
         break;
   }
}
#endif 
int PNGAPI
png_mmx_support(void)
{
#if defined(PNG_MMX_CODE_SUPPORTED)
    int result;
    __asm__ __volatile__ (
        "pushl %%ebx          \n\t"  
        "pushl %%ecx          \n\t"  
        "pushl %%edx          \n\t"  
        "pushfl               \n\t"  
        "popl %%eax           \n\t"  
        "movl %%eax, %%ecx    \n\t"  
        "xorl $0x200000, %%eax \n\t" 
        "pushl %%eax          \n\t"  
        "popfl                \n\t"  
        "pushfl               \n\t"  
        "popl %%eax           \n\t"  
        "pushl %%ecx          \n\t"  
        "popfl                \n\t"  
        "xorl %%ecx, %%eax    \n\t"  
        "jz 0f                \n\t"  
        "xorl %%eax, %%eax    \n\t"  
        "cpuid                \n\t"  
        "cmpl $1, %%eax       \n\t"  
        "jl 0f                \n\t"  
        "xorl %%eax, %%eax    \n\t"  
        "incl %%eax           \n\t"  
                                     
        "cpuid                \n\t"  
        "andl $0x800000, %%edx \n\t" 
        "cmpl $0, %%edx       \n\t"  
        "jz 0f                \n\t"  
        "movl $1, %%eax       \n\t"  
        "jmp  1f              \n\t"  
    "0:                       \n\t"  
        "movl $0, %%eax       \n\t"  
    "1:                       \n\t"  
        "popl %%edx           \n\t"  
        "popl %%ecx           \n\t"  
        "popl %%ebx           \n\t"  
                                     
        : "=a" (result)              
        :                            
                                     
    );
    _mmx_supported = result;
#else
    _mmx_supported = 0;
#endif 
    return _mmx_supported;
}
#endif