/* source/common/ppc/intrapred_altivec.cpp */

/*****************************************************************************
 * Copyright (C) 2013-2017 MulticoreWare, Inc
 *
 * Authors: Roger Moussalli <rmoussal@us.ibm.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include <iostream>
#include <vector>
#include <assert.h>
#include <cmath>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>
#include <string.h>

#include "common.h"
#include "primitives.h"
#include "x265.h"
#include "ppccommon.h"

namespace X265_NS {

/* INTRA Prediction - altivec implementation */
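
/* The loads below assume x265's neighbour-sample layout: srcPix0[0] holds the
 * top-left pixel, srcPix0[1 .. 2*width] the above row plus above-right, and
 * srcPix0[2*width + 1 .. 4*width] the left column plus below-left.  The
 * horizontal modes (2..17) predict from the left column, so their reference
 * pointer is ref = srcPix0 + 2*width + 1.
 *
 * The primary template below is never called directly; every (width, dirMode)
 * pair the encoder uses has an explicit specialization. */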
template<int width, int dirMode>
void intra_pred(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter) {}

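/* Mode 2 is the pure bottom-left diagonal: with ref = srcPix0 + 2*width + 1,
 * dst[y * dstStride + x] = ref[x + y + 1], so each output row is the previous
 * row's reference window shifted by one pixel and whole blocks reduce to
 * permutes (or plain unaligned re-loads) of the reference samples. */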
template<>
void intra_pred<4, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    if(dstStride == 4) {
        const vec_u8_t srcV = vec_xl(10, srcPix0); /* offset = (width << 1) + 2 = 10 */
        const vec_u8_t mask = {0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03,0x04, 0x02, 0x03,0x04,0x05, 0x03,0x04,0x05, 0x06}; 
        vec_u8_t vout = vec_perm(srcV, srcV, mask);
        vec_xst(vout, 0, dst); 
    }
    else if(dstStride%16 == 0){
        vec_u8_t v0 = vec_xl(10, srcPix0);
        vec_ste((vec_u32_t)v0, 0, (unsigned int*)dst);
        vec_u8_t v1 = vec_xl(11, srcPix0);
        vec_ste((vec_u32_t)v1, 0, (unsigned int*)(dst+dstStride));
        vec_u8_t v2 = vec_xl(12, srcPix0);
        vec_ste((vec_u32_t)v2, 0, (unsigned int*)(dst+dstStride*2));
        vec_u8_t v3 = vec_xl(13, srcPix0);
        vec_ste((vec_u32_t)v3, 0, (unsigned int*)(dst+dstStride*3));
    }
    else{
        const vec_u8_t srcV = vec_xl(10, srcPix0); /* offset = (width << 1) + 2 = 10 */
        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
        const vec_u8_t mask_1 = {0x01, 0x02, 0x03, 0x04, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
        const vec_u8_t mask_2 = {0x02, 0x03, 0x04, 0x05, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
        const vec_u8_t mask_3 = {0x03, 0x04, 0x05, 0x06, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
        vec_u8_t v0 = vec_perm(srcV, vec_xl(0, dst), mask_0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(srcV, vec_xl(dstStride, dst), mask_1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(srcV, vec_xl(dstStride*2, dst), mask_2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(srcV,  vec_xl(dstStride*3, dst), mask_3);
        vec_xst(v3, dstStride*3, dst);
    }
#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    if(dstStride == 8) {
        const vec_u8_t srcV1 = vec_xl(18, srcPix0); /* offset = (width << 1) + 2 = 18 */
        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03,0x04, 0x05, 0x06, 0x07, 0x08};
        const vec_u8_t mask_1 = {0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
        const vec_u8_t mask_2 = {0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c};
        const vec_u8_t mask_3 = {0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e};
        vec_u8_t v0 = vec_perm(srcV1, srcV1, mask_0);
        vec_u8_t v1 = vec_perm(srcV1, srcV1, mask_1);
        vec_u8_t v2 = vec_perm(srcV1, srcV1, mask_2);
        vec_u8_t v3 = vec_perm(srcV1, srcV1, mask_3);
        vec_xst(v0, 0, dst);
        vec_xst(v1, 16, dst); 
        vec_xst(v2, 32, dst); 
        vec_xst(v3, 48, dst); 
    }
    else{
        const vec_u8_t srcV1 = vec_xl(18, srcPix0); /* offset = (width << 1) + 2 = 18 */
        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_1 = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_2 = {0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_3 = {0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_4 = {0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_5 = {0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_6 = {0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_7 = {0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(srcV1, vec_xl(0, dst), mask_0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(srcV1, vec_xl(dstStride, dst), mask_1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(srcV1, vec_xl(dstStride*2, dst), mask_2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(srcV1,  vec_xl(dstStride*3, dst), mask_3);
        vec_xst(v3, dstStride*3, dst);
        vec_u8_t v4 = vec_perm(srcV1,  vec_xl(dstStride*4, dst), mask_4);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(srcV1,  vec_xl(dstStride*5, dst), mask_5);
        vec_xst(v5, dstStride*5, dst);
        vec_u8_t v6 = vec_perm(srcV1,  vec_xl(dstStride*6, dst), mask_6);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(srcV1,  vec_xl(dstStride*7, dst), mask_7);
        vec_xst(v7, dstStride*7, dst);
    }
        
#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    for (int i = 0; i < 16; i++)
        vec_xst(vec_xl(34 + i, srcPix0), i * dstStride, dst); /* first offset = (width << 1) + 2 = 34 */
#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x <16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 2>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    for (int i = 0; i < 32; i++) {
        int off = i * dstStride;
        vec_xst(vec_xl(66 + i, srcPix0), off, dst);      /* first offset = (width << 1) + 2 = 66 */
        vec_xst(vec_xl(82 + i, srcPix0), off + 16, dst); /* right half of the 32-pixel row */
    }
#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x <32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

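/* one_line: compute 16 output pixels of a fractional angular prediction,
 * vout = (vf32 * s0 + vf * s1 + 16) >> 5 per byte lane, where s0/s1 hold the
 * two neighbouring reference samples for each lane and vf32/vf their
 * (32 - fraction) / fraction weights.  vec_mule/vec_mulo widen the even/odd
 * byte products to 16 bits; the final merge + pack restores lane order.  The
 * macro expects the locals vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo
 * and the constants u16_16, u16_5 to be in scope at the expansion site. */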
#define one_line(s0, s1, vf32, vf, vout) {\
vmle0 = vec_mule(s0, vf32);\
vmlo0 = vec_mulo(s0, vf32);\
vmle1 = vec_mule(s1, vf);\
vmlo1 = vec_mulo(s1, vf);\
vsume = vec_add(vec_add(vmle0, vmle1), u16_16);\
ve = vec_sra(vsume, u16_5);\
vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16);\
vo = vec_sra(vsumo, u16_5);\
vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));\
}

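/* A scalar sketch of what every fractional specialization below computes,
 * kept under #if 0 as documentation only (the function name and signature are
 * illustrative, not part of x265's primitive API).  For these horizontal
 * modes offset[] and fraction[] are indexed by the column x, and the
 * reference index advances with the row y: */
#if 0
static void intra_pred_ang_scalar_ref(pixel* dst, intptr_t dstStride, const pixel* srcPix0,
                                      int width, const int* offset, const int* fraction)
{
    const pixel* ref = srcPix0 + 2 * width + 1; /* left reference column */
    for (int y = 0; y < width; y++)
        for (int x = 0; x < width; x++)
            dst[y * dstStride + x] = (pixel)(((32 - fraction[x]) * ref[offset[x] + y]
                                            + fraction[x] * ref[offset[x] + y + 1] + 16) >> 5);
}
#endif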
template<>
void intra_pred<4, 3>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    // mode 3 shares its angle (26), and hence these offset/fraction tables, with mode 33:
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
    vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x1, 0x2, 0x3, 0x4, 0x2, 0x3, 0x4, 0x5, 0x3, 0x4, 0x5, 0x6};
    vec_u8_t mask1={0x1, 0x2, 0x3, 0x4, 0x2, 0x3, 0x4, 0x5, 0x3, 0x4, 0x5, 0x6, 0x4, 0x5, 0x6, 0x7};

    vec_u8_t vfrac4 = (vec_u8_t){26, 20, 14, 8, 26, 20, 14, 8, 26, 20, 14, 8, 26, 20, 14, 8};
    vec_u8_t vfrac4_32 = (vec_u8_t){6, 12, 18, 24, 6, 12, 18, 24, 6, 12, 18, 24, 6, 12, 18, 24};
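
    /* Mode 3 fractions for columns 0-3 are {26, 20, 14, 8}, repeated once per
     * row in vfrac4; vfrac4_32 holds the complements 32 - fraction =
     * {6, 12, 18, 24}. */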

        
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* ref = srcPix0 + (width << 1) + 1 = srcPix0 + 9 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* rows y = 0..3 of ref[offset[x] + y]; per-column offsets baked into the mask */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    /* dst[y * dstStride + x] = (pixel)(((32 - fraction[x]) * ref[offset[x] + y] + fraction[x] * ref[offset[x] + y + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* even lanes of (32 - fraction[x]) * ref[offset[x] + y] */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* odd lanes */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4);    /* even lanes of fraction[x] * ref[offset[x] + y + 1] */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4);    /* odd lanes */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
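        /* note: rows land at byte offsets 0/16/32/48, so this branch
         * effectively assumes dstStride == 16 */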
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 3>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    // mode 3 shares its angle (26), and hence these offset/fraction tables, with mode 33:
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
    vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, 0x1, 0x2, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7};
    vec_u8_t mask1={0x1, 0x2, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x2, 0x3, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8};
    vec_u8_t mask2={0x2, 0x3, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x3, 0x4, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9};
    vec_u8_t mask3={0x3, 0x4, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x4, 0x5, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa};
    vec_u8_t mask4={0x4, 0x5, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0x5, 0x6, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb};
    vec_u8_t mask5={0x5, 0x6, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0x6, 0x7, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc};
    vec_u8_t mask6={0x6, 0x7, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0x7, 0x8, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd};
    vec_u8_t mask7={0x7, 0x8, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0x8, 0x9, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe};
    //vec_u8_t mask8={0x8, 0x9, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0x9, 0xa, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf};


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(17, srcPix0); /* ref = srcPix0 + (width << 1) + 1 = srcPix0 + 17 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 4 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 4, 5 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 5, 6 */
    vec_u8_t srv7 = vec_perm(srv, srv, mask7); /* 6, 7 */

    vec_u8_t vfrac8 = (vec_u8_t){26, 20, 14, 8, 2, 28, 22, 16, 26, 20, 14, 8, 2, 28, 22, 16};
    vec_u8_t vfrac8_32 = (vec_u8_t){6, 12, 18, 24, 30, 4, 10, 16, 6, 12, 18, 24, 30, 4, 10, 16};
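
    /* Two rows per vector: vfrac8 carries fraction[x] for columns 0-7,
     * {26, 20, 14, 8, 2, 28, 22, 16}, twice; vfrac8_32 the complements. */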

    /* dst[y * dstStride + x] = (pixel)(((32 - fraction[x]) * ref[offset[x] + y] + fraction[x] * ref[offset[x] + y + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32); /* even lanes of (32 - fraction[x]) * ref[offset[x] + y] */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8);    /* even lanes of fraction[x] * ref[offset[x] + y + 1] */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32); 
    vmlo0 = vec_mulo(srv2, vfrac8_32); 
    vmle1 = vec_mule(srv3, vfrac8); 
    vmlo1 = vec_mulo(srv3, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv4, vfrac8_32); 
    vmlo0 = vec_mulo(srv4, vfrac8_32); 
    vmle1 = vec_mule(srv5, vfrac8); 
    vmlo1 = vec_mulo(srv5, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv6, vfrac8_32); 
    vmlo0 = vec_mulo(srv6, vfrac8_32);
    vmle1 = vec_mule(srv7, vfrac8); 
    vmlo1 = vec_mulo(srv7, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 3>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    // mode 3 shares its angle (26), and hence these offset/fraction tables, with mode 33:
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd};
vec_u8_t mask1={0x1, 0x2, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe};
vec_u8_t mask2={0x2, 0x3, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf};
vec_u8_t mask3={0x3, 0x4, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10};
vec_u8_t mask4={0x4, 0x5, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11};
vec_u8_t mask5={0x5, 0x6, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12};
vec_u8_t mask6={0x6, 0x7, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13};
vec_u8_t mask7={0x7, 0x8, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14};
vec_u8_t mask8={0x8, 0x9, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
vec_u8_t mask9={0x9, 0xa, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
vec_u8_t mask10={0xa, 0xb, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
vec_u8_t mask11={0xb, 0xc, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
vec_u8_t mask12={0xc, 0xd, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x13, 0x14, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
vec_u8_t mask13={0xd, 0xe, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x14, 0x15, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
vec_u8_t mask14={0xe, 0xf, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x15, 0x16, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
vec_u8_t mask15={0xf, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x17, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};

    vec_u8_t sv0 = vec_xl(33, srcPix0); /* ref = srcPix0 + (width << 1) + 1 = srcPix0 + 33 */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* next 16 reference samples */
    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); /* row y: ref[offset[x] + y]; per-column offsets baked into the masks */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);
    vec_u8_t srv00 = vec_perm(sv1, sv1, mask0);
        
vec_u8_t vfrac16 = (vec_u8_t){26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
vec_u8_t vfrac16_32 = (vec_u8_t){6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 32};
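
/* One row per vector: vfrac16 is fraction[x] for columns 0-15; note that
 * fraction[15] = 0, so column 15 degenerates to a straight copy
 * (complement 32, weight 0). */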

    /* dst[y * dstStride + x] = (pixel)(((32 - fraction[x]) * ref[offset[x] + y] + fraction[x] * ref[offset[x] + y + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv2, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv3, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv4, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv5, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv6, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv7, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv8, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv9, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srva, vfrac16_32, vfrac16, vout_9);
    one_line(srva, srvb, vfrac16_32, vfrac16, vout_10);
    one_line(srvb, srvc, vfrac16_32, vfrac16, vout_11);
    one_line(srvc, srvd, vfrac16_32, vfrac16, vout_12);
    one_line(srvd, srve, vfrac16_32, vfrac16, vout_13);
    one_line(srve, srvf, vfrac16_32, vfrac16, vout_14);
    one_line(srvf, srv00, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 3>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    // mode 3 shares its angle (26), and hence these offset/fraction tables, with mode 33:
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};

vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, };
vec_u8_t mask16_0={0xd, 0xe, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x14, 0x15, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask1={0x1, 0x2, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, };
vec_u8_t mask16_1={0xe, 0xf, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x15, 0x16, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask2={0x2, 0x3, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask16_2={0xf, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x17, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
vec_u8_t mask3={0x3, 0x4, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask16_3={0x0, 0x1, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, };
vec_u8_t mask4={0x4, 0x5, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask16_4={0x1, 0x2, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, };
vec_u8_t mask5={0x5, 0x6, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask16_5={0x2, 0x3, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask6={0x6, 0x7, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask16_6={0x3, 0x4, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask7={0x7, 0x8, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask16_7={0x4, 0x5, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask8={0x8, 0x9, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask16_8={0x5, 0x6, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask9={0x9, 0xa, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask16_9={0x6, 0x7, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask10={0xa, 0xb, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask16_10={0x7, 0x8, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask11={0xb, 0xc, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask16_11={0x8, 0x9, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask12={0xc, 0xd, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x13, 0x14, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask16_12={0x9, 0xa, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask13={0xd, 0xe, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x14, 0x15, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask16_13={0xa, 0xb, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask14={0xe, 0xf, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x15, 0x16, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask16_14={0xb, 0xc, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask15={0xf, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x17, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
vec_u8_t mask16_15={0xc, 0xd, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x13, 0x14, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };

/* mask16..mask31 and mask16_16..mask16_31 would be byte-for-byte identical to
 * mask0..mask15 and mask16_0..mask16_15, so rows 16-31 below reuse the first
 * sixteen masks with the source vectors advanced one step (sv1/sv2/sv3
 * instead of sv0/sv1/sv2). */

vec_u8_t maskadd1_31={0x0, 0x1, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, };
vec_u8_t maskadd1_16_31={0xd, 0xe, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x14, 0x15, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0); /* ref = srcPix0 + (width << 1) + 1 = srcPix0 + 65 */
    vec_u8_t sv1 = vec_xl(81, srcPix0);
    vec_u8_t sv2 = vec_xl(97, srcPix0);
    vec_u8_t sv3 = vec_xl(113, srcPix0); /* the four vectors cover ref[0..63] */

    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); 
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srv10 = vec_perm(sv0, sv1, mask10);
    vec_u8_t srv11 = vec_perm(sv0, sv1, mask11);
    vec_u8_t srv12 = vec_perm(sv0, sv1, mask12);
    vec_u8_t srv13 = vec_perm(sv0, sv1, mask13);
    vec_u8_t srv14 = vec_perm(sv0, sv1, mask14);
    vec_u8_t srv15 = vec_perm(sv0, sv1, mask15);
        
    vec_u8_t srv16_0 = vec_perm(sv0, sv1, mask16_0);
    vec_u8_t srv16_1 = vec_perm(sv0, sv1,mask16_1);
    vec_u8_t srv16_2 = vec_perm(sv0, sv1, mask16_2);
    vec_u8_t srv16_3 = vec_perm(sv1, sv2, mask16_3);
    vec_u8_t srv16_4 = vec_perm(sv1, sv2, mask16_4);
    vec_u8_t srv16_5 = vec_perm(sv1, sv2, mask16_5);
    vec_u8_t srv16_6 = vec_perm(sv1, sv2, mask16_6);
    vec_u8_t srv16_7 = vec_perm(sv1, sv2, mask16_7);
    vec_u8_t srv16_8 = vec_perm(sv1, sv2, mask16_8);
    vec_u8_t srv16_9 = vec_perm(sv1, sv2, mask16_9);
    vec_u8_t srv16_10 = vec_perm(sv1, sv2, mask16_10);
    vec_u8_t srv16_11 = vec_perm(sv1, sv2, mask16_11);
    vec_u8_t srv16_12 = vec_perm(sv1, sv2, mask16_12);
    vec_u8_t srv16_13 = vec_perm(sv1, sv2, mask16_13);
    vec_u8_t srv16_14 = vec_perm(sv1, sv2, mask16_14);
    vec_u8_t srv16_15 = vec_perm(sv1, sv2, mask16_15);

    vec_u8_t  srv16 = vec_perm(sv1, sv2, mask0);  /* mask16 == mask0 */
    vec_u8_t  srv17 = vec_perm(sv1, sv2, mask1);
    vec_u8_t  srv18 = vec_perm(sv1, sv2, mask2);
    vec_u8_t  srv19 = vec_perm(sv1, sv2, mask3);
    vec_u8_t  srv20 = vec_perm(sv1, sv2, mask4);
    vec_u8_t  srv21 = vec_perm(sv1, sv2, mask5);
    vec_u8_t  srv22 = vec_perm(sv1, sv2, mask6);
    vec_u8_t  srv23 = vec_perm(sv1, sv2, mask7);
    vec_u8_t  srv24 = vec_perm(sv1, sv2, mask8);
    vec_u8_t  srv25 = vec_perm(sv1, sv2, mask9);
    vec_u8_t  srv26 = vec_perm(sv1, sv2, mask10);
    vec_u8_t  srv27 = vec_perm(sv1, sv2, mask11);
    vec_u8_t  srv28 = vec_perm(sv1, sv2, mask12);
    vec_u8_t  srv29 = vec_perm(sv1, sv2, mask13);
    vec_u8_t  srv30 = vec_perm(sv1, sv2, mask14);
    vec_u8_t  srv31 = vec_perm(sv1, sv2, mask15);
    vec_u8_t  srv32 = vec_perm(sv2, sv3, maskadd1_31);


    vec_u8_t srv16_16= vec_perm(sv1, sv2, mask16_0); /* mask16_16 == mask16_0 */
    vec_u8_t srv16_17= vec_perm(sv1, sv2, mask16_1);
    vec_u8_t srv16_18 = vec_perm(sv1, sv2, mask16_2);
    vec_u8_t srv16_19 = vec_perm(sv2, sv3, mask16_3);
    vec_u8_t srv16_20 = vec_perm(sv2, sv3, mask16_4);
    vec_u8_t srv16_21 = vec_perm(sv2, sv3, mask16_5);
    vec_u8_t srv16_22 = vec_perm(sv2, sv3, mask16_6);
    vec_u8_t srv16_23 = vec_perm(sv2, sv3, mask16_7);
    vec_u8_t srv16_24 = vec_perm(sv2, sv3, mask16_8);
    vec_u8_t srv16_25 = vec_perm(sv2, sv3, mask16_9);
    vec_u8_t srv16_26 = vec_perm(sv2, sv3, mask16_10);
    vec_u8_t srv16_27 = vec_perm(sv2, sv3, mask16_11);
    vec_u8_t srv16_28 = vec_perm(sv2, sv3, mask16_12);
    vec_u8_t srv16_29 = vec_perm(sv2, sv3, mask16_13);
    vec_u8_t srv16_30 = vec_perm(sv2, sv3, mask16_14);
    vec_u8_t srv16_31 = vec_perm(sv2, sv3, mask16_15);
    vec_u8_t srv16_32 = vec_perm(sv2, sv3, maskadd1_16_31);
        

vec_u8_t vfrac32_0 = (vec_u8_t){26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
vec_u8_t vfrac32_1 = (vec_u8_t){26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
vec_u8_t vfrac32_32_0 = (vec_u8_t){6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 32};
vec_u8_t vfrac32_32_1 = (vec_u8_t){6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 32};
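
/* fraction[] repeats after 16 columns for this angle (see the table above),
 * so vfrac32_0/vfrac32_1 and their _32_ complements are identical; the two
 * names just mirror the left/right 16-column halves of each row. */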

    /* dst[y * dstStride + x] = (pixel)(((32 - fraction[x]) * ref[offset[x] + y] + fraction[x] * ref[offset[x] + y + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;


    one_line(srv0, srv1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16_1, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv2, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16_2,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv3, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16_3,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv4, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16_4,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv5, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16_5,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv6, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16_6,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv7, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16_7,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv8, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16_8,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv9, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16_9,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv10, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16_10,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv11, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16_11,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv12, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16_12,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv13, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16_13,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv14, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16_14,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv15, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16_15,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv16, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16_16,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv17, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16_17, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv18, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16_18,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv19, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16_19,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv20, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16_20,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv21, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16_21,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv22, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16_22,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv23, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16_23,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv24, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16_24,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv25, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16_25,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv26, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16_26,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv27, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16_27,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv28, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16_28,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv29, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16_29,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv30, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16_30,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv31, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16_31,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv32, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16_32,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 4>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4, 0x3, 0x4, 0x4, 0x5};
vec_u8_t mask1={0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4, 0x3, 0x4, 0x4, 0x5, 0x4, 0x5, 0x5, 0x6};

vec_u8_t vfrac4 = (vec_u8_t){21, 10, 31, 20, 21, 10, 31, 20, 21, 10, 31, 20, 21, 10, 31, 20};
vec_u8_t vfrac4_32 = (vec_u8_t){11, 22, 1, 12, 11, 22, 1, 12, 11, 22, 1, 12, 11, 22, 1, 12};
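
/* Mode 4 fractions for columns 0-3 are {21, 10, 31, 20} (the full mode 4 /
 * mode 32 tables are quoted in the 8x8 specialization below); vfrac4_32 holds
 * the complements 32 - fraction = {11, 22, 1, 12}. */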


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* ref = srcPix0 + (width << 1) + 1 = srcPix0 + 9 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* rows y = 0..3 of ref[offset[x] + y]; per-column offsets baked into the mask */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    /* dst[y * dstStride + x] = (pixel)(((32 - fraction[x]) * ref[offset[x] + y] + fraction[x] * ref[offset[x] + y + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* even lanes of (32 - fraction[x]) * ref[offset[x] + y] */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* odd lanes */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4);    /* even lanes of fraction[x] * ref[offset[x] + y + 1] */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4);    /* odd lanes */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
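        /* note: rows land at byte offsets 0/16/32/48, so this branch
         * effectively assumes dstStride == 16 */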
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 4>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x1, 0x1, 0x2, 0x3, 0x3, 0x4, 0x5, 0x1, 0x2, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, };
vec_u8_t mask1={0x1, 0x2, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, 0x2, 0x3, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, };
vec_u8_t mask2={0x2, 0x3, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x3, 0x4, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, };
vec_u8_t mask3={0x3, 0x4, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, };
vec_u8_t mask4={0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, };
vec_u8_t mask5={0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, };
vec_u8_t mask6={0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, };
vec_u8_t mask7={0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, };
//vec_u8_t mask8={0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(17, srcPix0); /* ref = srcPix0 + (width << 1) + 1 = srcPix0 + 17 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 4 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 4, 5 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 5, 6 */
    vec_u8_t srv7 = vec_perm(srv, srv, mask7); /* 6, 7 */

    // mode 4 shares its angle (21), and hence these offset/fraction tables, with mode 32:
    //int offset_4[32] = {0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21};
    //int fraction_4[32] = {21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0};

vec_u8_t vfrac8 = (vec_u8_t){21, 10, 31, 20, 9, 30, 19, 8, 21, 10, 31, 20, 9, 30, 19, 8, };
vec_u8_t vfrac8_32 = (vec_u8_t){11, 22, 1, 12, 23, 2, 13, 24, 11, 22, 1, 12, 23, 2, 13, 24, };

    /* dst[y * dstStride + x] = (pixel)(((32 - fraction[x]) * ref[offset[x] + y] + fraction[x] * ref[offset[x] + y + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32); /* even lanes of (32 - fraction[x]) * ref[offset[x] + y] */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8);    /* even lanes of fraction[x] * ref[offset[x] + y + 1] */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32); 
    vmlo0 = vec_mulo(srv2, vfrac8_32); 
    vmle1 = vec_mule(srv3, vfrac8); 
    vmlo1 = vec_mulo(srv3, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv4, vfrac8_32); 
    vmlo0 = vec_mulo(srv4, vfrac8_32); 
    vmle1 = vec_mule(srv5, vfrac8); 
    vmlo1 = vec_mulo(srv5, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv6, vfrac8_32); 
    vmlo0 = vec_mulo(srv6, vfrac8_32);
    vmle1 = vec_mule(srv7, vfrac8); 
    vmlo1 = vec_mulo(srv7, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 4>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

vec_u8_t mask0={0x0, 0x1, 0x1, 0x2, 0x3, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, };
vec_u8_t mask1={0x1, 0x2, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, };
vec_u8_t mask2={0x2, 0x3, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, };
vec_u8_t mask3={0x3, 0x4, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, };
vec_u8_t mask4={0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, };
vec_u8_t mask5={0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, };
vec_u8_t mask6={0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, };
vec_u8_t mask7={0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, };
vec_u8_t mask8={0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, };
vec_u8_t mask9={0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, };
vec_u8_t mask10={0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x13, 0x14, };
vec_u8_t mask11={0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x14, 0x15, };
vec_u8_t mask12={0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x15, 0x16, };
vec_u8_t mask13={0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x14, 0x15, 0x16, 0x16, 0x17, };
vec_u8_t mask14={0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x15, 0x16, 0x17, 0x17, 0x18, };
vec_u8_t mask15={0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x14, 0x15, 0x16, 0x16, 0x17, 0x18, 0x18, 0x19, };

    vec_u8_t sv0 = vec_xl(33, srcPix0); /* ref = srcPix0 + (width << 1) + 1 = srcPix0 + 33 */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* next 16 reference samples */
    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); /* row y: ref[offset[x] + y]; per-column offsets baked into the masks */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);
    vec_u8_t srv00 = vec_perm(sv1, sv1, mask0);
        
vec_u8_t vfrac16 = (vec_u8_t){21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, };
vec_u8_t vfrac16_32 = (vec_u8_t){11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, };
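/* Assuming the standard HEVC derivation for mode 4 (intraPredAngle = 21):
   vfrac16[k] = ((k + 1) * 21) & 31, and vfrac16_32[k] = 32 - vfrac16[k]. */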

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
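    /* one_line(lo, hi, f32, f, out) is assumed to expand to the sequence written out
       inline in intra_pred<4, 5> below:
           out[i] = (pixel)((f32[i] * lo[i] + f[i] * hi[i] + 16) >> 5)
       using vec_mule/vec_mulo 16-bit products, merged and narrowed with vec_pack. */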

    one_line(srv0, srv1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv2, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv3, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv4, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv5, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv6, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv7, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv8, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv9, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srva, vfrac16_32, vfrac16, vout_9);
    one_line(srva, srvb, vfrac16_32, vfrac16, vout_10);
    one_line(srvb, srvc, vfrac16_32, vfrac16, vout_11);
    one_line(srvc, srvd, vfrac16_32, vfrac16, vout_12);
    one_line(srvd, srve, vfrac16_32, vfrac16, vout_13);
    one_line(srve, srvf, vfrac16_32, vfrac16, vout_14);
    one_line(srvf, srv00, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 4>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        Scalar reference for the horizontal angular modes: offset[] and fraction[]
        are indexed by column x, and ref[] is the left reference array.

        for (int y = 0; y < width; y++)
            for (int x = 0; x < width; x++)
                dst[y * dstStride + x] = (pixel)(((32 - fraction[x]) * ref[offset[x] + y]
                                                  + fraction[x] * ref[offset[x] + y + 1] + 16) >> 5);
    */
    //mode 4 (intraPredAngle = 21):
    //int offset[32] = {0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21};
    //int fraction[32] = {21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0};
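    /* A minimal sketch of how offset[] and fraction[] are assumed to be derived
       from the intra angle (intraPredAngle = 21 for mode 4):

           for (int k = 0; k < 32; k++)
           {
               offset[k]   = ((k + 1) * 21) >> 5;  // whole-pixel step into the reference
               fraction[k] = ((k + 1) * 21) & 31;  // 1/32-pel interpolation weight
           }

       The vfrac32_* vectors below pack fraction[0..15] and fraction[16..31] together
       with their (32 - fraction) complements. */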

vec_u8_t mask0={0x0, 0x1, 0x1, 0x2, 0x3, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, };
vec_u8_t mask16_0={0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, };
vec_u8_t mask1={0x1, 0x2, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, };
vec_u8_t mask16_1={0xc, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x14, 0x15, 0x16, };
vec_u8_t mask2={0x2, 0x3, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, };
vec_u8_t mask16_2={0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x15, 0x16, 0x17, };
vec_u8_t mask3={0x3, 0x4, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, };
vec_u8_t mask16_3={0xe, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x14, 0x15, 0x16, 0x16, 0x17, 0x18, };
vec_u8_t mask4={0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, };
vec_u8_t mask16_4={0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x15, 0x16, 0x17, 0x17, 0x18, 0x19, };
vec_u8_t mask5={0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, };
vec_u8_t mask16_5={0x0, 0x0, 0x1, 0x2, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, };
vec_u8_t mask6={0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, };
vec_u8_t mask16_6={0x1, 0x1, 0x2, 0x3, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, };
vec_u8_t mask7={0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, };
vec_u8_t mask16_7={0x2, 0x2, 0x3, 0x4, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, };
vec_u8_t mask8={0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, };
vec_u8_t mask16_8={0x3, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, };
vec_u8_t mask9={0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, };
vec_u8_t mask16_9={0x4, 0x4, 0x5, 0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, };
vec_u8_t mask10={0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x13, 0x14, };
vec_u8_t mask16_10={0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, };
vec_u8_t mask11={0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x14, 0x15, };
vec_u8_t mask16_11={0x6, 0x6, 0x7, 0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, };
vec_u8_t mask12={0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x15, 0x16, };
vec_u8_t mask16_12={0x7, 0x7, 0x8, 0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, };
vec_u8_t mask13={0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x14, 0x15, 0x16, 0x16, 0x17, };
vec_u8_t mask16_13={0x8, 0x8, 0x9, 0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, };
vec_u8_t mask14={0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x15, 0x16, 0x17, 0x17, 0x18, };
vec_u8_t mask16_14={0x9, 0x9, 0xa, 0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, };
vec_u8_t mask15={0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, 0x14, 0x15, 0x16, 0x16, 0x17, 0x18, 0x18, 0x19, };
vec_u8_t mask16_15={0xa, 0xa, 0xb, 0xc, 0xc, 0xd, 0xe, 0xe, 0xf, 0x10, 0x10, 0x11, 0x12, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_31={0x0, 0x1, 0x1, 0x2, 0x3, 0x3, 0x4, 0x5, 0x5, 0x6, 0x7, 0x7, 0x8, 0x9, 0x9, 0xa, };
vec_u8_t maskadd1_16_31={0xb, 0xb, 0xc, 0xd, 0xd, 0xe, 0xf, 0xf, 0x10, 0x11, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0);  /* left reference array: ref = srcPix0 + 2 * width + 1 = srcPix0 + 65 */
    vec_u8_t sv1 = vec_xl(81, srcPix0);
    vec_u8_t sv2 = vec_xl(97, srcPix0);
    vec_u8_t sv3 = vec_xl(113, srcPix0); /* sv0..sv3 cover ref[0..63] */

    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); 
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srv10 = vec_perm(sv0, sv1, mask10);
    vec_u8_t srv11 = vec_perm(sv0, sv1, mask11);
    vec_u8_t srv12 = vec_perm(sv0, sv1, mask12);
    vec_u8_t srv13 = vec_perm(sv0, sv1, mask13);
    vec_u8_t srv14 = vec_perm(sv0, sv1, mask14);
    vec_u8_t srv15 = vec_perm(sv0, sv1, mask15);
        
    vec_u8_t srv16_0 = vec_perm(sv0, sv1, mask16_0);
    vec_u8_t srv16_1 = vec_perm(sv0, sv1, mask16_1);
    vec_u8_t srv16_2 = vec_perm(sv0, sv1, mask16_2);
    vec_u8_t srv16_3 = vec_perm(sv0, sv1, mask16_3);
    vec_u8_t srv16_4 = vec_perm(sv0, sv1, mask16_4);
    vec_u8_t srv16_5 = vec_perm(sv1, sv2, mask16_5);
    vec_u8_t srv16_6 = vec_perm(sv1, sv2, mask16_6);
    vec_u8_t srv16_7 = vec_perm(sv1, sv2, mask16_7);
    vec_u8_t srv16_8 = vec_perm(sv1, sv2, mask16_8);
    vec_u8_t srv16_9 = vec_perm(sv1, sv2, mask16_9);
    vec_u8_t srv16_10 = vec_perm(sv1, sv2, mask16_10);
    vec_u8_t srv16_11 = vec_perm(sv1, sv2, mask16_11);
    vec_u8_t srv16_12 = vec_perm(sv1, sv2, mask16_12);
    vec_u8_t srv16_13 = vec_perm(sv1, sv2, mask16_13);
    vec_u8_t srv16_14 = vec_perm(sv1, sv2, mask16_14);
    vec_u8_t srv16_15 = vec_perm(sv1, sv2, mask16_15);

    vec_u8_t  srv16 = vec_perm(sv1, sv2, mask0);  /* mask16 == mask0 */
    vec_u8_t  srv17 = vec_perm(sv1, sv2, mask1);
    vec_u8_t  srv18 = vec_perm(sv1, sv2, mask2);
    vec_u8_t  srv19 = vec_perm(sv1, sv2, mask3);
    vec_u8_t  srv20 = vec_perm(sv1, sv2, mask4);
    vec_u8_t  srv21 = vec_perm(sv1, sv2, mask5);
    vec_u8_t  srv22 = vec_perm(sv1, sv2, mask6);
    vec_u8_t  srv23 = vec_perm(sv1, sv2, mask7);
    vec_u8_t  srv24 = vec_perm(sv1, sv2, mask8);
    vec_u8_t  srv25 = vec_perm(sv1, sv2, mask9);
    vec_u8_t  srv26 = vec_perm(sv1, sv2, mask10);
    vec_u8_t  srv27 = vec_perm(sv1, sv2, mask11);
    vec_u8_t  srv28 = vec_perm(sv1, sv2, mask12);
    vec_u8_t  srv29 = vec_perm(sv1, sv2, mask13);
    vec_u8_t  srv30 = vec_perm(sv1, sv2, mask14);
    vec_u8_t  srv31 = vec_perm(sv1, sv2, mask15);
    vec_u8_t  srv32 = vec_perm(sv2, sv3, maskadd1_31);


    vec_u8_t srv16_16= vec_perm(sv1, sv2, mask16_0); /* mask16_16 == mask16_0 */
    vec_u8_t srv16_17= vec_perm(sv1, sv2, mask16_1);
    vec_u8_t srv16_18 = vec_perm(sv1, sv2, mask16_2);
    vec_u8_t srv16_19 = vec_perm(sv1, sv2, mask16_3);
    vec_u8_t srv16_20 = vec_perm(sv1, sv2, mask16_4);
    vec_u8_t srv16_21 = vec_perm(sv2, sv3, mask16_5);
    vec_u8_t srv16_22 = vec_perm(sv2, sv3, mask16_6);
    vec_u8_t srv16_23 = vec_perm(sv2, sv3, mask16_7);
    vec_u8_t srv16_24 = vec_perm(sv2, sv3, mask16_8);
    vec_u8_t srv16_25 = vec_perm(sv2, sv3, mask16_9);
    vec_u8_t srv16_26 = vec_perm(sv2, sv3, mask16_10);
    vec_u8_t srv16_27 = vec_perm(sv2, sv3, mask16_11);
    vec_u8_t srv16_28 = vec_perm(sv2, sv3, mask16_12);
    vec_u8_t srv16_29 = vec_perm(sv2, sv3, mask16_13);
    vec_u8_t srv16_30 = vec_perm(sv2, sv3, mask16_14);
    vec_u8_t srv16_31 = vec_perm(sv2, sv3, mask16_15);
    vec_u8_t srv16_32 = vec_perm(sv2, sv3, maskadd1_16_31);
        

vec_u8_t vfrac32_0 = (vec_u8_t){21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, };
vec_u8_t vfrac32_1 = (vec_u8_t){5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 32, };
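/* vfrac32_0 holds fraction[0..15] and vfrac32_1 fraction[16..31] from the tables above;
   the vfrac32_32_* vectors are the matching (32 - fraction) weights. */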

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;


    one_line(srv0, srv1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16_1, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv2, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16_2,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv3, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16_3,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv4, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16_4,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv5, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16_5,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv6, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16_6,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv7, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16_7,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv8, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16_8,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv9, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16_9,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv10, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16_10,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv11, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16_11,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv12, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16_12,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv13, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16_13,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv14, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16_14,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv15, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16_15,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv16, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16_16,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv17, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16_17, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv18, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16_18,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv19, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16_19,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv20, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16_20,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv21, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16_21,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv22, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16_22,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv23, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16_23,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv24, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16_24,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv25, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16_25,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv26, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16_26,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv27, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16_27,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv28, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16_28,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv29, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16_29,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv30, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16_30,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv31, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16_31,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv32, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16_32,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 5>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4, 0x3, 0x4, 0x4, 0x5, };
vec_u8_t mask1={0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4, 0x3, 0x4, 0x4, 0x5, 0x4, 0x5, 0x5, 0x6, };

vec_u8_t vfrac4 = (vec_u8_t){17, 2, 19, 4, 17, 2, 19, 4, 17, 2, 19, 4, 17, 2, 19, 4, };
vec_u8_t vfrac4_32 = (vec_u8_t){15, 30, 13, 28, 15, 30, 13, 28, 15, 30, 13, 28, 15, 30, 13, 28, };
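/* Mode 5 uses intraPredAngle = 17: fraction[k] = ((k + 1) * 17) & 31 gives {17, 2, 19, 4}
   for k = 0..3, repeated for each of the four rows; vfrac4_32 holds 32 - fraction[k]. */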



    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* left reference array: ref = srcPix0 + 2 * width + 1 = srcPix0 + 9 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* first tap ref[offset[x] + y] for all four rows */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* second tap ref[offset[x] + y + 1] */

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
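    /* The 8-bit taps are widened to 16 bits in two interleaved halves (vec_mule/vec_mulo),
       summed with the rounding constant 16, shifted right by 5, then re-interleaved and
       narrowed back to bytes with vec_mergeh/vec_mergel + vec_pack. */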

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 5>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, };
vec_u8_t mask1={0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, };
vec_u8_t mask2={0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, };
vec_u8_t mask3={0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, };
vec_u8_t mask4={0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, };
vec_u8_t mask5={0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, };
vec_u8_t mask6={0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, };
vec_u8_t mask7={0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, };
//vec_u8_t mask8={0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(17, srcPix0); /* left reference array: ref = srcPix0 + 2 * width + 1 = srcPix0 + 17 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* tap bases 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 5 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 5, 6 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 6, 7 */
    vec_u8_t srv7 = vec_perm(srv, srv, mask7); /* 7, 8 */

vec_u8_t vfrac8 = (vec_u8_t){17, 2, 19, 4, 21, 6, 23, 8, 17, 2, 19, 4, 21, 6, 23, 8, };
vec_u8_t vfrac8_32 = (vec_u8_t){15, 30, 13, 28, 11, 26, 9, 24, 15, 30, 13, 28, 11, 26, 9, 24, };
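/* fraction[k] = ((k + 1) * 17) & 31 for k = 0..7, repeated because each vector packs
   two 8-pixel rows; vfrac8_32 is the (32 - fraction) complement. */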

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32); 
    vmlo0 = vec_mulo(srv2, vfrac8_32); 
    vmle1 = vec_mule(srv3, vfrac8); 
    vmlo1 = vec_mulo(srv3, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv4, vfrac8_32); 
    vmlo0 = vec_mulo(srv4, vfrac8_32); 
    vmle1 = vec_mule(srv5, vfrac8); 
    vmlo1 = vec_mulo(srv5, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv6, vfrac8_32); 
    vmlo0 = vec_mulo(srv6, vfrac8_32);
    vmle1 = vec_mule(srv7, vfrac8); 
    vmlo1 = vec_mulo(srv7, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 5>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

vec_u8_t mask0={0x0, 0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, };
vec_u8_t mask1={0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, };
vec_u8_t mask2={0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, };
vec_u8_t mask3={0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, };
vec_u8_t mask4={0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, };
vec_u8_t mask5={0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, };
vec_u8_t mask6={0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, };
vec_u8_t mask7={0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, };
vec_u8_t mask8={0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, };
vec_u8_t mask9={0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, };
vec_u8_t mask10={0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, };
vec_u8_t mask11={0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, };
vec_u8_t mask12={0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, };
vec_u8_t mask13={0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, };
vec_u8_t mask14={0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, };
vec_u8_t mask15={0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16, 0x17, };
//vec_u8_t mask16={0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16, 0x17, 0x17, 0x18, };

    vec_u8_t sv0 = vec_xl(33, srcPix0); /* left reference array: ref = srcPix0 + 2 * width + 1 = srcPix0 + 33 */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* next 16 bytes of the left reference */
    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); /* row y takes taps ref[offset[x] + y]; srv0..srvf cover rows 0..15 */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);
    vec_u8_t srv00 = vec_perm(sv1, sv1, mask0);
        
vec_u8_t vfrac16 = (vec_u8_t){17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, 16, };
vec_u8_t vfrac16_32 = (vec_u8_t){15, 30, 13, 28, 11, 26, 9, 24, 7, 22, 5, 20, 3, 18, 1, 16, };
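/* fraction[k] = ((k + 1) * 17) & 31 for k = 0..15; vfrac16_32[k] = 32 - vfrac16[k]. */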

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv2, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv3, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv4, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv5, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv6, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv7, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv8, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv9, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srva, vfrac16_32, vfrac16, vout_9);
    one_line(srva, srvb, vfrac16_32, vfrac16, vout_10);
    one_line(srvb, srvc, vfrac16_32, vfrac16, vout_11);
    one_line(srvc, srvd, vfrac16_32, vfrac16, vout_12);
    one_line(srvd, srve, vfrac16_32, vfrac16, vout_13);
    one_line(srve, srvf, vfrac16_32, vfrac16, vout_14);
    one_line(srvf, srv00, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 5>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /* Same scalar reference as sketched in intra_pred<32, 4> above; mode 5 differs
       only in its offset[]/fraction[] tables (intraPredAngle = 17 instead of 21). */
vec_u8_t mask0={0x0, 0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, };
vec_u8_t mask16_0={0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x11, };
vec_u8_t mask1={0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, };
vec_u8_t mask16_1={0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x12, };
vec_u8_t mask2={0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, };
vec_u8_t mask16_2={0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x13, };
vec_u8_t mask3={0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, };
vec_u8_t mask16_3={0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x14, };
vec_u8_t mask4={0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, };
vec_u8_t mask16_4={0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, };
vec_u8_t mask5={0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, };
vec_u8_t mask16_5={0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, 0x16, };
vec_u8_t mask6={0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, };
vec_u8_t mask16_6={0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x17, };
vec_u8_t mask7={0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, };
vec_u8_t mask16_7={0x0, 0x0, 0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x8, };
vec_u8_t mask8={0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, };
vec_u8_t mask16_8={0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x9, };
vec_u8_t mask9={0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, };
vec_u8_t mask16_9={0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0xa, };
vec_u8_t mask10={0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, };
vec_u8_t mask16_10={0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xb, };
vec_u8_t mask11={0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, };
vec_u8_t mask16_11={0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xc, };
vec_u8_t mask12={0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, };
vec_u8_t mask16_12={0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xd, };
vec_u8_t mask13={0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, };
vec_u8_t mask16_13={0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xe, };
vec_u8_t mask14={0xe, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, };
vec_u8_t mask16_14={0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xf, };
vec_u8_t mask15={0xf, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, 0x16, 0x17, };
vec_u8_t mask16_15={0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_31={0x0, 0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, };
vec_u8_t maskadd1_16_31={0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, 0x11, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0);  /* left reference array: ref = srcPix0 + 2 * width + 1 = srcPix0 + 65 */
    vec_u8_t sv1 = vec_xl(81, srcPix0);
    vec_u8_t sv2 = vec_xl(97, srcPix0);
    vec_u8_t sv3 = vec_xl(113, srcPix0); /* sv0..sv3 cover ref[0..63] */

    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); 
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srv10 = vec_perm(sv0, sv1, mask10);
    vec_u8_t srv11 = vec_perm(sv0, sv1, mask11);
    vec_u8_t srv12 = vec_perm(sv0, sv1, mask12);
    vec_u8_t srv13 = vec_perm(sv0, sv1, mask13);
    vec_u8_t srv14 = vec_perm(sv0, sv1, mask14);
    vec_u8_t srv15 = vec_perm(sv0, sv1, mask15);
        
    vec_u8_t srv16_0 = vec_perm(sv0, sv1, mask16_0);
    vec_u8_t srv16_1 = vec_perm(sv0, sv1, mask16_1);
    vec_u8_t srv16_2 = vec_perm(sv0, sv1, mask16_2);
    vec_u8_t srv16_3 = vec_perm(sv0, sv1, mask16_3);
    vec_u8_t srv16_4 = vec_perm(sv0, sv1, mask16_4);
    vec_u8_t srv16_5 = vec_perm(sv0, sv1, mask16_5);
    vec_u8_t srv16_6 = vec_perm(sv0, sv1, mask16_6);
    vec_u8_t srv16_7 = vec_perm(sv1, sv2, mask16_7);
    vec_u8_t srv16_8 = vec_perm(sv1, sv2, mask16_8);
    vec_u8_t srv16_9 = vec_perm(sv1, sv2, mask16_9);
    vec_u8_t srv16_10 = vec_perm(sv1, sv2, mask16_10);
    vec_u8_t srv16_11 = vec_perm(sv1, sv2, mask16_11);
    vec_u8_t srv16_12 = vec_perm(sv1, sv2, mask16_12);
    vec_u8_t srv16_13 = vec_perm(sv1, sv2, mask16_13);
    vec_u8_t srv16_14 = vec_perm(sv1, sv2, mask16_14);
    vec_u8_t srv16_15 = vec_perm(sv1, sv2, mask16_15);

    vec_u8_t  srv16 = vec_perm(sv1, sv2, mask0);  /* mask16 == mask0 */
    vec_u8_t  srv17 = vec_perm(sv1, sv2, mask1);
    vec_u8_t  srv18 = vec_perm(sv1, sv2, mask2);
    vec_u8_t  srv19 = vec_perm(sv1, sv2, mask3);
    vec_u8_t  srv20 = vec_perm(sv1, sv2, mask4);
    vec_u8_t  srv21 = vec_perm(sv1, sv2, mask5);
    vec_u8_t  srv22 = vec_perm(sv1, sv2, mask6);
    vec_u8_t  srv23 = vec_perm(sv1, sv2, mask7);
    vec_u8_t  srv24 = vec_perm(sv1, sv2, mask8);
    vec_u8_t  srv25 = vec_perm(sv1, sv2, mask9);
    vec_u8_t  srv26 = vec_perm(sv1, sv2, mask10);
    vec_u8_t  srv27 = vec_perm(sv1, sv2, mask11);
    vec_u8_t  srv28 = vec_perm(sv1, sv2, mask12);
    vec_u8_t  srv29 = vec_perm(sv1, sv2, mask13);
    vec_u8_t  srv30 = vec_perm(sv1, sv2, mask14);
    vec_u8_t  srv31 = vec_perm(sv1, sv2, mask15);
    vec_u8_t  srv32 = vec_perm(sv2, sv3, maskadd1_31);


    vec_u8_t srv16_16= vec_perm(sv1, sv2, mask16_0); /* mask16_16 == mask16_0 */
    vec_u8_t srv16_17= vec_perm(sv1, sv2, mask16_1);
    vec_u8_t srv16_18 = vec_perm(sv1, sv2, mask16_2);
    vec_u8_t srv16_19 = vec_perm(sv1, sv2, mask16_3);
    vec_u8_t srv16_20 = vec_perm(sv1, sv2, mask16_4);
    vec_u8_t srv16_21 = vec_perm(sv1, sv2, mask16_5);
    vec_u8_t srv16_22 = vec_perm(sv1, sv2, mask16_6);
    vec_u8_t srv16_23 = vec_perm(sv2, sv3, mask16_7);
    vec_u8_t srv16_24 = vec_perm(sv2, sv3, mask16_8);
    vec_u8_t srv16_25 = vec_perm(sv2, sv3, mask16_9);
    vec_u8_t srv16_26 = vec_perm(sv2, sv3, mask16_10);
    vec_u8_t srv16_27 = vec_perm(sv2, sv3, mask16_11);
    vec_u8_t srv16_28 = vec_perm(sv2, sv3, mask16_12);
    vec_u8_t srv16_29 = vec_perm(sv2, sv3, mask16_13);
    vec_u8_t srv16_30 = vec_perm(sv2, sv3, mask16_14);
    vec_u8_t srv16_31 = vec_perm(sv2, sv3, mask16_15);
    vec_u8_t srv16_32 = vec_perm(sv2, sv3, maskadd1_16_31);
        

vec_u8_t vfrac32_0 = (vec_u8_t){17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, 16, };
vec_u8_t vfrac32_1 = (vec_u8_t){1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){15, 30, 13, 28, 11, 26, 9, 24, 7, 22, 5, 20, 3, 18, 1, 16, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){31, 14, 29, 12, 27, 10, 25, 8, 23, 6, 21, 4, 19, 2, 17, 32, };
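/* fraction[0..15] in vfrac32_0 and fraction[16..31] in vfrac32_1, with the matching
   (32 - fraction) weights in vfrac32_32_0/vfrac32_32_1; fraction[k] = ((k + 1) * 17) & 31. */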

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;


    one_line(srv0, srv1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16_1, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv2, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16_2,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv3, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16_3,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv4, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16_4,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv5, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16_5,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv6, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16_6,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv7, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16_7,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv8, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16_8,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv9, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16_9,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv10, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16_10,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv11, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16_11,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv12, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16_12,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv13, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16_13,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv14, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16_14,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv15, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16_15,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv16, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16_16,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv17, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16_17, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv18, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16_18,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv19, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16_19,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv20, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16_20,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv21, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16_21,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv22, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16_22,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv23, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16_23,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv24, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16_24,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv25, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16_25,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv26, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16_26,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv27, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16_27,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv28, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16_28,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv29, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16_29,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv30, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16_30,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv31, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16_31,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv32, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16_32,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void intra_pred<4, 6>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, };
vec_u8_t mask1={0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, };


vec_u8_t vfrac4 = (vec_u8_t){13, 26, 7, 20, 13, 26, 7, 20, 13, 26, 7, 20, 13, 26, 7, 20, };

vec_u8_t vfrac4_32 = (vec_u8_t){19, 6, 25, 12, 19, 6, 25, 12, 19, 6, 25, 12, 19, 6, 25, 12, };
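/* Per the HEVC angular tables, mode 6 uses intraPredAngle = 13, so
   fraction[y] = ((y + 1) * 13) & 31 and offset[y] = ((y + 1) * 13) >> 5.
   For y = 0..3 that is 13, 26, 39, 52 -> fractions {13, 26, 7, 20} (the
   vfrac4 values above) and offsets {0, 0, 1, 1} (the pattern baked into
   mask0/mask1); vfrac4_32 holds the complementary 32 - fraction weights. */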

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv =vec_xl(9, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], even-indexed elements */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], odd-indexed elements */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], even-indexed elements */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], odd-indexed elements */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
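    /* ve/vo hold the even- and odd-indexed lanes of the result; merging them
       and packing back to bytes restores the original sample order. */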
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
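#if 0
/* Reference-only sketch (not compiled in): the scalar loop that the mode 6
   kernels above and below vectorize. 'off' and 'f' stand for the per-line
   offset/fraction tables derived from the prediction angle; the names are
   illustrative, not identifiers used elsewhere in this file. */
static void intra_pred_ang_scalar_ref(pixel* dst, intptr_t dstStride,
                                      const pixel* ref, int width,
                                      const int* off, const int* f)
{
    for (int y = 0; y < width; y++)
        for (int x = 0; x < width; x++)
            dst[y * dstStride + x] =
                (pixel)(((32 - f[y]) * ref[off[y] + x] + f[y] * ref[off[y] + x + 1] + 16) >> 5);
}
#endif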

template<>
void intra_pred<8, 6>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x0, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, };
vec_u8_t mask1={0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, };
vec_u8_t mask2={0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, };
vec_u8_t mask3={0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, };
vec_u8_t mask4={0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, };
vec_u8_t mask5={0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, };
vec_u8_t mask6={0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, };
vec_u8_t mask7={0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, };
//vec_u8_t mask8={0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv =vec_xl(17, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-7] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 5 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 5, 6 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 6, 7 */
    vec_u8_t srv7 = vec_perm(srv, srv, mask7); /* 7, 8 */

vec_u8_t vfrac8 = (vec_u8_t){13, 26, 7, 20, 1, 14, 27, 8, 13, 26, 7, 20, 1, 14, 27, 8, };
vec_u8_t vfrac8_32 = (vec_u8_t){19, 6, 25, 12, 31, 18, 5, 24, 19, 6, 25, 12, 31, 18, 5, 24, };
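/* Each 16-byte vector covers two 8-pixel output lines; both halves use the
   same eight interpolation fractions, so the pattern simply repeats in
   vfrac8/vfrac8_32. */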

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32); 
    vmlo0 = vec_mulo(srv2, vfrac8_32); 
    vmle1 = vec_mule(srv3, vfrac8); 
    vmlo1 = vec_mulo(srv3, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv4, vfrac8_32); 
    vmlo0 = vec_mulo(srv4, vfrac8_32); 
    vmle1 = vec_mule(srv5, vfrac8); 
    vmlo1 = vec_mulo(srv5, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    //int offset[8] = {0, 0, 1, 1, 2, 2, 2, 3};  /* per-line offsets for mode 6, angle 13 */
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv6, vfrac8_32); 
    vmlo0 = vec_mulo(srv6, vfrac8_32);
    vmle1 = vec_mule(srv7, vfrac8); 
    vmlo1 = vec_mulo(srv7, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 6>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

vec_u8_t mask0={0x0, 0x0, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, };
vec_u8_t mask1={0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, };
vec_u8_t mask2={0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, };
vec_u8_t mask3={0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, };
vec_u8_t mask4={0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, };
vec_u8_t mask5={0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, };
vec_u8_t mask6={0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, };
vec_u8_t mask7={0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, };
vec_u8_t mask8={0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, };
vec_u8_t mask9={0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, };
vec_u8_t mask10={0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, };
vec_u8_t mask11={0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, };
vec_u8_t mask12={0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, };
vec_u8_t mask13={0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, };
vec_u8_t mask14={0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, };
vec_u8_t mask15={0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, };

    vec_u8_t sv0 =vec_xl(33, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-14] = 0, off[15] = 1 */
    vec_u8_t sv1 =vec_xl(49, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-14] = 0, off[15] = 1 */
    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);
    vec_u8_t srv00 = vec_perm(sv1, sv1, mask0);
        
vec_u8_t vfrac16 = (vec_u8_t){13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, };
vec_u8_t vfrac16_32 = (vec_u8_t){19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, };
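/* One vector per 16-pixel line: element i carries fraction ((i + 1) * 13) & 31
   (13, 26, 39 & 31 = 7, ...), with vfrac16_32 holding the 32 - f complements. */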

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv2, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv3, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv4, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv5, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv6, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv7, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv8, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv9, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srva, vfrac16_32, vfrac16, vout_9);
    one_line(srva, srvb, vfrac16_32, vfrac16, vout_10);
    one_line(srvb, srvc, vfrac16_32, vfrac16, vout_11);
    one_line(srvc, srvd, vfrac16_32, vfrac16, vout_12);
    one_line(srvd, srve, vfrac16_32, vfrac16, vout_13);
    one_line(srve, srvf, vfrac16_32, vfrac16, vout_14);
    one_line(srvf, srv00, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 6>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        Scalar form being vectorized: for each column x, every output line y
        uses its own offset[y] and fraction f[y] (with f32[y] = 32 - f[y]):

            x=0:
            dst[0 * dstStride + 0] = (pixel)((f32[0] * ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[1 * dstStride + 0] = (pixel)((f32[1] * ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[2 * dstStride + 0] = (pixel)((f32[2] * ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            ...
            dst[16 * dstStride + 0] = (pixel)((f32[16] * ref[off16 + 0] + f[16] * ref[off16 + 1] + 16) >> 5);
            ...
            dst[31 * dstStride + 0] = (pixel)((f32[31] * ref[off31 + 0] + f[31] * ref[off31 + 1] + 16) >> 5);

            x=1:
            dst[0 * dstStride + 1] = (pixel)((f32[0] * ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[1 * dstStride + 1] = (pixel)((f32[1] * ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[2 * dstStride + 1] = (pixel)((f32[2] * ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[3 * dstStride + 1] = (pixel)((f32[3] * ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);

            x=2:
            dst[0 * dstStride + 2] = (pixel)((f32[0] * ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[1 * dstStride + 2] = (pixel)((f32[1] * ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[2 * dstStride + 2] = (pixel)((f32[2] * ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[3 * dstStride + 2] = (pixel)((f32[3] * ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);

            ...
            x=16:
            dst[0 * dstStride + 16] = (pixel)((f32[0] * ref[off0 + 16] + f[0] * ref[off0 + 17] + 16) >> 5);
            dst[1 * dstStride + 16] = (pixel)((f32[1] * ref[off1 + 16] + f[1] * ref[off1 + 17] + 16) >> 5);
            ...
            dst[31 * dstStride + 16] = (pixel)((f32[31] * ref[off31 + 16] + f[31] * ref[off31 + 17] + 16) >> 5);

            ...
            x=31:
            dst[0 * dstStride + 31] = (pixel)((f32[0] * ref[off0 + 31] + f[0] * ref[off0 + 32] + 16) >> 5);
            dst[1 * dstStride + 31] = (pixel)((f32[1] * ref[off1 + 31] + f[1] * ref[off1 + 32] + 16) >> 5);
            ...
            dst[31 * dstStride + 31] = (pixel)((f32[31] * ref[off31 + 31] + f[31] * ref[off31 + 32] + 16) >> 5);
    */
vec_u8_t mask0={0x0, 0x0, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, };
vec_u8_t mask16_0={0x6, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, };
vec_u8_t mask1={0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, };
vec_u8_t mask16_1={0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xe, };
vec_u8_t mask2={0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, };
vec_u8_t mask16_2={0x8, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xf, };
vec_u8_t mask3={0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, };
vec_u8_t mask16_3={0x9, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0x10, };
vec_u8_t mask4={0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, };
vec_u8_t mask16_4={0xa, 0xb, 0xb, 0xc, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x11, };
vec_u8_t mask5={0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, };
vec_u8_t mask16_5={0xb, 0xc, 0xc, 0xd, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x12, };
vec_u8_t mask6={0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, };
vec_u8_t mask16_6={0xc, 0xd, 0xd, 0xe, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x13, };
vec_u8_t mask7={0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, };
vec_u8_t mask16_7={0xd, 0xe, 0xe, 0xf, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, };
vec_u8_t mask8={0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, };
vec_u8_t mask16_8={0xe, 0xf, 0xf, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, 0x15, };
vec_u8_t mask9={0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, };
vec_u8_t mask16_9={0xf, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, 0x16, };
vec_u8_t mask10={0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, };
vec_u8_t mask16_10={0x0, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x7, };
vec_u8_t mask11={0xb, 0xb, 0xc, 0xc, 0xd, 0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, };
vec_u8_t mask16_11={0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x8, };
vec_u8_t mask12={0xc, 0xc, 0xd, 0xd, 0xe, 0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, };
vec_u8_t mask16_12={0x2, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x9, };
vec_u8_t mask13={0xd, 0xd, 0xe, 0xe, 0xf, 0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, };
vec_u8_t mask16_13={0x3, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0xa, };
vec_u8_t mask14={0xe, 0xe, 0xf, 0xf, 0x10, 0x10, 0x10, 0x11, 0x11, 0x12, 0x12, 0x12, 0x13, 0x13, 0x14, 0x14, };
vec_u8_t mask16_14={0x4, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xb, };
vec_u8_t mask15={0xf, 0xf, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x13, 0x13, 0x13, 0x14, 0x14, 0x15, 0x15, };
vec_u8_t mask16_15={0x5, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, 0xa, 0xb, 0xb, 0xc, };
vec_u8_t maskadd1_31={0x0, 0x0, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x6, 0x6, };
vec_u8_t maskadd1_16_31={0x6, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0xb, 0xc, 0xc, 0xd, };


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 =vec_xl(65, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-14] = 0, off[15] = 1 */     
    vec_u8_t sv1 =vec_xl(81, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-14] = 0, off[15] = 1 */
    vec_u8_t sv2 =vec_xl(97, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-14] = 0, off[15] = 1 */
    vec_u8_t sv3 =vec_xl(113, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-14] = 0, off[15] = 1 */
    //vec_u8_t sv4 =vec_xl(129, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-14] = 0, off[15] = 1 */

/*
    printf("source:\n");
    for(int i=0; i<32; i++){
        printf("%d ", srcPix0[i+65]);
    }
    printf("\n");
    for(int i=0; i<32; i++){
        printf("%d ", srcPix0[i+97]);
    }
    printf("\n\n");
*/    
    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); 
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srv10 = vec_perm(sv0, sv1, mask10);
    vec_u8_t srv11 = vec_perm(sv0, sv1, mask11);
    vec_u8_t srv12 = vec_perm(sv0, sv1, mask12);
    vec_u8_t srv13 = vec_perm(sv0, sv1, mask13);
    vec_u8_t srv14 = vec_perm(sv0, sv1, mask14);
    vec_u8_t srv15 = vec_perm(sv0, sv1, mask15);
        
    vec_u8_t srv16_0 = vec_perm(sv0, sv1, mask16_0);
    vec_u8_t srv16_1 = vec_perm(sv0, sv1, mask16_1);
    vec_u8_t srv16_2 = vec_perm(sv0, sv1, mask16_2);
    vec_u8_t srv16_3 = vec_perm(sv0, sv1, mask16_3);
    vec_u8_t srv16_4 = vec_perm(sv0, sv1, mask16_4);
    vec_u8_t srv16_5 = vec_perm(sv0, sv1, mask16_5);
    vec_u8_t srv16_6 = vec_perm(sv0, sv1, mask16_6);
    vec_u8_t srv16_7 = vec_perm(sv0, sv1, mask16_7);
    vec_u8_t srv16_8 = vec_perm(sv0, sv1, mask16_8);
    vec_u8_t srv16_9 = vec_perm(sv0, sv1, mask16_9);
    vec_u8_t srv16_10 = vec_perm(sv1, sv2, mask16_10);
    vec_u8_t srv16_11 = vec_perm(sv1, sv2, mask16_11);
    vec_u8_t srv16_12 = vec_perm(sv1, sv2, mask16_12);
    vec_u8_t srv16_13 = vec_perm(sv1, sv2, mask16_13);
    vec_u8_t srv16_14 = vec_perm(sv1, sv2, mask16_14);
    vec_u8_t srv16_15 = vec_perm(sv1, sv2, mask16_15);

    vec_u8_t  srv16 = vec_perm(sv1, sv2, mask0);  /* mask16 == mask0 */
    vec_u8_t  srv17 = vec_perm(sv1, sv2, mask1);
    vec_u8_t  srv18 = vec_perm(sv1, sv2, mask2);
    vec_u8_t  srv19 = vec_perm(sv1, sv2, mask3);
    vec_u8_t  srv20 = vec_perm(sv1, sv2, mask4);
    vec_u8_t  srv21 = vec_perm(sv1, sv2, mask5);
    vec_u8_t  srv22 = vec_perm(sv1, sv2, mask6);
    vec_u8_t  srv23 = vec_perm(sv1, sv2, mask7);
    vec_u8_t  srv24 = vec_perm(sv1, sv2, mask8);
    vec_u8_t  srv25 = vec_perm(sv1, sv2, mask9);
    vec_u8_t  srv26 = vec_perm(sv1, sv2, mask10);
    vec_u8_t  srv27 = vec_perm(sv1, sv2, mask11);
    vec_u8_t  srv28 = vec_perm(sv1, sv2, mask12);
    vec_u8_t  srv29 = vec_perm(sv1, sv2, mask13);
    vec_u8_t  srv30 = vec_perm(sv1, sv2, mask14);
    vec_u8_t  srv31 = vec_perm(sv1, sv2, mask15);
    vec_u8_t  srv32 = vec_perm(sv2, sv3, maskadd1_31);


    vec_u8_t srv16_16= vec_perm(sv1, sv2, mask16_0); /* mask16_16 == mask16_0 */
    vec_u8_t srv16_17= vec_perm(sv1, sv2, mask16_1);
    vec_u8_t srv16_18 = vec_perm(sv1, sv2, mask16_2);
    vec_u8_t srv16_19 = vec_perm(sv1, sv2, mask16_3);
    vec_u8_t srv16_20 = vec_perm(sv1, sv2, mask16_4);
    vec_u8_t srv16_21 = vec_perm(sv1, sv2, mask16_5);
    vec_u8_t srv16_22 = vec_perm(sv1, sv2, mask16_6);
    vec_u8_t srv16_23 = vec_perm(sv1, sv2, mask16_7);
    vec_u8_t srv16_24 = vec_perm(sv1, sv2, mask16_8);
    vec_u8_t srv16_25 = vec_perm(sv1, sv2, mask16_9);
    vec_u8_t srv16_26 = vec_perm(sv2, sv3, mask16_10);
    vec_u8_t srv16_27 = vec_perm(sv2, sv3, mask16_11);
    vec_u8_t srv16_28 = vec_perm(sv2, sv3, mask16_12);
    vec_u8_t srv16_29 = vec_perm(sv2, sv3, mask16_13);
    vec_u8_t srv16_30 = vec_perm(sv2, sv3, mask16_14);
    vec_u8_t srv16_31 = vec_perm(sv2, sv3, mask16_15);
    vec_u8_t srv16_32 = vec_perm(sv2, sv3, maskadd1_16_31);
        
vec_u8_t vfrac32_0 = (vec_u8_t){13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, };
vec_u8_t vfrac32_1 = (vec_u8_t){29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 32, };
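/* vfrac32_0/vfrac32_32_0 carry the fractions ((i + 1) * 13) & 31 for
   positions 0-15 and vfrac32_1/vfrac32_32_1 those for positions 16-31
   (e.g. (17 * 13) & 31 = 29, the first element of vfrac32_1). */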

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;


    one_line(srv0, srv1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16_1, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv2, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16_2,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv3, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16_3,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv4, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16_4,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv5, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16_5,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv6, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16_6,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv7, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16_7,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv8, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16_8,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv9, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16_9,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv10, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16_10,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv11, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16_11,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv12, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16_12,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv13, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16_13,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv14, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16_14,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv15, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16_15,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv16, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16_16,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv17, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16_17, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv18, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16_18,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv19, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16_19,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv20, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16_20,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv21, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16_21,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv22, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16_22,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv23, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16_23,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv24, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16_24,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv25, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16_25,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv26, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16_26,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv27, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16_27,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv28, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16_28,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv29, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16_29,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv30, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16_30,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv31, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16_31,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv32, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16_32,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 7>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x5, };

vec_u8_t vfrac4 = (vec_u8_t){9, 18, 27, 4, 9, 18, 27, 4, 9, 18, 27, 4, 9, 18, 27, 4, };

vec_u8_t vfrac4_32 = (vec_u8_t){23, 14, 5, 28, 23, 14, 5, 28, 23, 14, 5, 28, 23, 14, 5, 28, };
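/* Mode 7 uses intraPredAngle = 9, so fraction[y] = ((y + 1) * 9) & 31:
   9, 18, 27, 36 & 31 = 4 for y = 0..3, matching vfrac4/vfrac4_32 above. */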

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv =vec_xl(9, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], even-indexed elements */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], odd-indexed elements */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], even-indexed elements */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], odd-indexed elements */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
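#if 0
/* Illustrative sketch (not compiled in) of how the fraction/offset constants
   baked into the masks and vfrac vectors can be regenerated for any of these
   positive-angle modes; 'make_ang_tables' is a hypothetical helper, not part
   of this file. */
static void make_ang_tables(int intraPredAngle, int width, int* offset, int* fraction)
{
    for (int y = 0; y < width; y++)
    {
        int pos = (y + 1) * intraPredAngle; /* accumulated 1/32-pel position */
        offset[y]   = pos >> 5;             /* whole-sample step into ref[]   */
        fraction[y] = pos & 31;             /* interpolation weight f[y]      */
    }
}
#endif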

template<>
void intra_pred<8, 7>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x5, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x5, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x6, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x6, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x7, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x7, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x8, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x8, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x9, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x9, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0xa, };
//vec_u8_t mask8={0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0xa, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, 0xb, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv =vec_xl(17, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-7] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 5 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 5, 6 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 6, 7 */
    vec_u8_t srv7 = vec_perm(srv, srv, mask7); /* 7, 8 */

vec_u8_t vfrac8 = (vec_u8_t){9, 18, 27, 4, 13, 22, 31, 8, 9, 18, 27, 4, 13, 22, 31, 8, };
vec_u8_t vfrac8_32 = (vec_u8_t){23, 14, 5, 28, 19, 10, 1, 24, 23, 14, 5, 28, 19, 10, 1, 24, };

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32); 
    vmlo0 = vec_mulo(srv2, vfrac8_32); 
    vmle1 = vec_mule(srv3, vfrac8); 
    vmlo1 = vec_mulo(srv3, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv4, vfrac8_32); 
    vmlo0 = vec_mulo(srv4, vfrac8_32); 
    vmle1 = vec_mule(srv5, vfrac8); 
    vmlo1 = vec_mulo(srv5, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    //int offset[8] = {0, 0, 0, 1, 1, 1, 1, 2};  /* per-line offsets for mode 7, angle 9 */
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv6, vfrac8_32); 
    vmlo0 = vec_mulo(srv6, vfrac8_32);
    vmle1 = vec_mule(srv7, vfrac8); 
    vmlo1 = vec_mulo(srv7, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 7>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

vec_u8_t mask0={0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, };
vec_u8_t mask8={0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, };
vec_u8_t mask9={0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, };
vec_u8_t mask10={0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, };
vec_u8_t mask11={0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, };
vec_u8_t mask12={0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, };
vec_u8_t mask13={0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, 0x11, 0x11, };
vec_u8_t mask14={0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x11, 0x12, 0x12, };
vec_u8_t mask15={0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x12, 0x12, 0x13, 0x13, };

    vec_u8_t sv0 =vec_xl(33, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-14] = 0, off[15] = 1 */
    vec_u8_t sv1 =vec_xl(49, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-14] = 0, off[15] = 1 */
    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);
    vec_u8_t srv00 = vec_perm(sv1, sv1, mask0);
        
vec_u8_t vfrac16 = (vec_u8_t){9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, };
vec_u8_t vfrac16_32 = (vec_u8_t){23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, };

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv2, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv3, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv4, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv5, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv6, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv7, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv8, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv9, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srva, vfrac16_32, vfrac16, vout_9);
    one_line(srva, srvb, vfrac16_32, vfrac16, vout_10);
    one_line(srvb, srvc, vfrac16_32, vfrac16, vout_11);
    one_line(srvc, srvd, vfrac16_32, vfrac16, vout_12);
    one_line(srvd, srve, vfrac16_32, vfrac16, vout_13);
    one_line(srve, srvf, vfrac16_32, vfrac16, vout_14);
    one_line(srvf, srv00, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 7>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        Scalar form being vectorized: for each column x, every output line y
        uses its own offset[y] and fraction f[y] (with f32[y] = 32 - f[y]):

            x=0:
            dst[0 * dstStride + 0] = (pixel)((f32[0] * ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[1 * dstStride + 0] = (pixel)((f32[1] * ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[2 * dstStride + 0] = (pixel)((f32[2] * ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            ...
            dst[16 * dstStride + 0] = (pixel)((f32[16] * ref[off16 + 0] + f[16] * ref[off16 + 1] + 16) >> 5);
            ...
            dst[31 * dstStride + 0] = (pixel)((f32[31] * ref[off31 + 0] + f[31] * ref[off31 + 1] + 16) >> 5);

            x=1:
            dst[0 * dstStride + 1] = (pixel)((f32[0] * ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[1 * dstStride + 1] = (pixel)((f32[1] * ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[2 * dstStride + 1] = (pixel)((f32[2] * ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[3 * dstStride + 1] = (pixel)((f32[3] * ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);

            x=2:
            dst[0 * dstStride + 2] = (pixel)((f32[0] * ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[1 * dstStride + 2] = (pixel)((f32[1] * ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[2 * dstStride + 2] = (pixel)((f32[2] * ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[3 * dstStride + 2] = (pixel)((f32[3] * ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);

            ...
            x=16:
            dst[0 * dstStride + 16] = (pixel)((f32[0] * ref[off0 + 16] + f[0] * ref[off0 + 17] + 16) >> 5);
            dst[1 * dstStride + 16] = (pixel)((f32[1] * ref[off1 + 16] + f[1] * ref[off1 + 17] + 16) >> 5);
            ...
            dst[31 * dstStride + 16] = (pixel)((f32[31] * ref[off31 + 16] + f[31] * ref[off31 + 17] + 16) >> 5);

            ...
            x=31:
            dst[0 * dstStride + 31] = (pixel)((f32[0] * ref[off0 + 31] + f[0] * ref[off0 + 32] + 16) >> 5);
            dst[1 * dstStride + 31] = (pixel)((f32[1] * ref[off1 + 31] + f[1] * ref[off1 + 32] + 16) >> 5);
            ...
            dst[31 * dstStride + 31] = (pixel)((f32[31] * ref[off31 + 31] + f[31] * ref[off31 + 32] + 16) >> 5);
    */
vec_u8_t mask0={0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, };
vec_u8_t mask16_0={0x4, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, };
vec_u8_t mask16_1={0x5, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, };
vec_u8_t mask16_2={0x6, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, };
vec_u8_t mask16_3={0x7, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, };
vec_u8_t mask16_4={0x8, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xd, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, };
vec_u8_t mask16_5={0x9, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xe, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, };
vec_u8_t mask16_6={0xa, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xf, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, };
vec_u8_t mask16_7={0xb, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0x10, };
vec_u8_t mask8={0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, };
vec_u8_t mask16_8={0xc, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x11, };
vec_u8_t mask9={0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, };
vec_u8_t mask16_9={0xd, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, };
vec_u8_t mask10={0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, };
vec_u8_t mask16_10={0xe, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x11, 0x12, 0x12, 0x12, 0x13, };
vec_u8_t mask11={0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, };
vec_u8_t mask16_11={0xf, 0x10, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x12, 0x12, 0x13, 0x13, 0x13, 0x14, };
vec_u8_t mask12={0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, };
vec_u8_t mask16_12={0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, };
vec_u8_t mask13={0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, 0x11, 0x11, };
vec_u8_t mask16_13={0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, };
vec_u8_t mask14={0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x11, 0x12, 0x12, };
vec_u8_t mask16_14={0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, };
vec_u8_t mask15={0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x12, 0x12, 0x12, 0x12, 0x13, 0x13, };
vec_u8_t mask16_15={0x3, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x8, };
vec_u8_t maskadd1_31={0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, };
vec_u8_t maskadd1_16_31={0x4, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x9, };
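
/* Note on the masks above: assuming this specialization is mode 7 (HEVC
 * intraPredAngle = 9, consistent with the fraction tables further down),
 * the horizontal angular modes are produced in transposed form, so offset
 * and fraction vary per column x:
 *
 *     offset[x]   = ((x + 1) * 9) >> 5;
 *     fraction[x] = ((x + 1) * 9) & 31;
 *
 * offset[0..15] = 0,0,0,1,1,1,1,2,2,2,3,3,3,3,4,4 -- exactly the byte
 * indices in mask0.  maskN adds N (the row) to every lane, and the
 * mask16_N variants cover columns 16..31.
 */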

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0);  /* srcPix0[65..80]:   left reference samples 0..15 */
    vec_u8_t sv1 = vec_xl(81, srcPix0);  /* srcPix0[81..96]:   left reference samples 16..31 */
    vec_u8_t sv2 = vec_xl(97, srcPix0);  /* srcPix0[97..112]:  left reference samples 32..47 */
    vec_u8_t sv3 = vec_xl(113, srcPix0); /* srcPix0[113..128]: left reference samples 48..63 */
    //vec_u8_t sv4 = vec_xl(129, srcPix0);

    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); 
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srv10 = vec_perm(sv0, sv1, mask10);
    vec_u8_t srv11 = vec_perm(sv0, sv1, mask11);
    vec_u8_t srv12 = vec_perm(sv0, sv1, mask12);
    vec_u8_t srv13 = vec_perm(sv0, sv1, mask13);
    vec_u8_t srv14 = vec_perm(sv0, sv1, mask14);
    vec_u8_t srv15 = vec_perm(sv0, sv1, mask15);
        
    vec_u8_t srv16_0 = vec_perm(sv0, sv1, mask16_0);
    vec_u8_t srv16_1 = vec_perm(sv0, sv1, mask16_1);
    vec_u8_t srv16_2 = vec_perm(sv0, sv1, mask16_2);
    vec_u8_t srv16_3 = vec_perm(sv0, sv1, mask16_3);
    vec_u8_t srv16_4 = vec_perm(sv0, sv1, mask16_4);
    vec_u8_t srv16_5 = vec_perm(sv0, sv1, mask16_5);
    vec_u8_t srv16_6 = vec_perm(sv0, sv1, mask16_6);
    vec_u8_t srv16_7 = vec_perm(sv0, sv1, mask16_7);
    vec_u8_t srv16_8 = vec_perm(sv0, sv1, mask16_8);
    vec_u8_t srv16_9 = vec_perm(sv0, sv1, mask16_9);
    vec_u8_t srv16_10 = vec_perm(sv0, sv1, mask16_10);
    vec_u8_t srv16_11 = vec_perm(sv0, sv1, mask16_11);
    vec_u8_t srv16_12 = vec_perm(sv1, sv2, mask16_12);
    vec_u8_t srv16_13 = vec_perm(sv1, sv2, mask16_13);
    vec_u8_t srv16_14 = vec_perm(sv1, sv2, mask16_14);
    vec_u8_t srv16_15 = vec_perm(sv1, sv2, mask16_15);

    vec_u8_t  srv16 = vec_perm(sv1, sv2, mask0);  /* mask16 == mask0 */
    vec_u8_t  srv17 = vec_perm(sv1, sv2, mask1);
    vec_u8_t  srv18 = vec_perm(sv1, sv2, mask2);
    vec_u8_t  srv19 = vec_perm(sv1, sv2, mask3);
    vec_u8_t  srv20 = vec_perm(sv1, sv2, mask4);
    vec_u8_t  srv21 = vec_perm(sv1, sv2, mask5);
    vec_u8_t  srv22 = vec_perm(sv1, sv2, mask6);
    vec_u8_t  srv23 = vec_perm(sv1, sv2, mask7);
    vec_u8_t  srv24 = vec_perm(sv1, sv2, mask8);
    vec_u8_t  srv25 = vec_perm(sv1, sv2, mask9);
    vec_u8_t  srv26 = vec_perm(sv1, sv2, mask10);
    vec_u8_t  srv27 = vec_perm(sv1, sv2, mask11);
    vec_u8_t  srv28 = vec_perm(sv1, sv2, mask12);
    vec_u8_t  srv29 = vec_perm(sv1, sv2, mask13);
    vec_u8_t  srv30 = vec_perm(sv1, sv2, mask14);
    vec_u8_t  srv31 = vec_perm(sv1, sv2, mask15);
    vec_u8_t  srv32 = vec_perm(sv2, sv3, maskadd1_31);


    vec_u8_t srv16_16= vec_perm(sv1, sv2, mask16_0); /* mask16_16 == mask16_0 */
    vec_u8_t srv16_17= vec_perm(sv1, sv2, mask16_1);
    vec_u8_t srv16_18 = vec_perm(sv1, sv2, mask16_2);
    vec_u8_t srv16_19 = vec_perm(sv1, sv2, mask16_3);
    vec_u8_t srv16_20 = vec_perm(sv1, sv2, mask16_4);
    vec_u8_t srv16_21 = vec_perm(sv1, sv2, mask16_5);
    vec_u8_t srv16_22 = vec_perm(sv1, sv2, mask16_6);
    vec_u8_t srv16_23 = vec_perm(sv1, sv2, mask16_7);
    vec_u8_t srv16_24 = vec_perm(sv1, sv2, mask16_8);
    vec_u8_t srv16_25 = vec_perm(sv1, sv2, mask16_9);
    vec_u8_t srv16_26 = vec_perm(sv1, sv2, mask16_10);
    vec_u8_t srv16_27 = vec_perm(sv1, sv2, mask16_11);
    vec_u8_t srv16_28 = vec_perm(sv2, sv3, mask16_12);
    vec_u8_t srv16_29 = vec_perm(sv2, sv3, mask16_13);
    vec_u8_t srv16_30 = vec_perm(sv2, sv3, mask16_14);
    vec_u8_t srv16_31 = vec_perm(sv2, sv3, mask16_15);
    vec_u8_t srv16_32 = vec_perm(sv2, sv3, maskadd1_16_31);
        

vec_u8_t vfrac32_0 = (vec_u8_t){9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, };
vec_u8_t vfrac32_1 = (vec_u8_t){25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 32, };

    /*  dst[y * dstStride + x] = (pixel)((f32[y]* ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5 */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;
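
    /* one_line() is defined earlier in this file; judging from the expanded
     * 8x8 specializations below, it presumably evaluates the rounded
     * weighted average ((32 - f) * a + f * b + 16) >> 5 on all 16 byte
     * lanes, using 16-bit intermediates to avoid overflow:
     *
     *     vmle0 = vec_mule(a, f32);  vmlo0 = vec_mulo(a, f32);
     *     vmle1 = vec_mule(b, f);    vmlo1 = vec_mulo(b, f);
     *     ve = vec_sra(vec_add(vec_add(vmle0, vmle1), u16_16), u16_5);
     *     vo = vec_sra(vec_add(vec_add(vmlo0, vmlo1), u16_16), u16_5);
     *     vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
     */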


    one_line(srv0, srv1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16_1, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv2, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16_2,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv3, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16_3,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv4, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16_4,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv5, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16_5,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv6, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16_6,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv7, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16_7,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv8, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16_8,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv9, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16_9,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv10, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16_10,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv11, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16_11,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv12, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16_12,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv13, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16_13,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv14, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16_14,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv15, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16_15,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv16, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16_16,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv17, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16_17, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv18, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16_18,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv19, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16_19,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv20, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16_20,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv21, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16_21,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv22, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16_22,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv23, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16_23,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv24, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16_24,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv25, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16_25,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv26, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16_26,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv27, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16_27,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv28, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16_28,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv29, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16_29,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv30, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16_30,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv31, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16_31,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv32, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16_32,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 8>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, };

vec_u8_t vfrac4 = (vec_u8_t){5, 10, 15, 20, 5, 10, 15, 20, 5, 10, 15, 20, 5, 10, 15, 20, };
vec_u8_t vfrac4_32 = (vec_u8_t){27, 22, 17, 12, 27, 22, 17, 12, 27, 22, 17, 12, 27, 22, 17, 12, };
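
/* Fraction derivation (assuming mode 8, HEVC intraPredAngle = 5):
 * fraction[x] = ((x + 1) * 5) & 31 = 5, 10, 15, 20 for x = 0..3, repeated
 * once per row since a single 16-byte vector holds all four 4-pixel rows;
 * offset[x] = ((x + 1) * 5) >> 5 is 0 for every column, so one load plus
 * mask0/mask1 suffices. */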

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* srcPix0[9..24]: left reference samples; offset[0-3] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    /*  dst[y * dstStride + x] = (pixel)((f32[y]* ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5 */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
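
    /* Lane bookkeeping: vec_mule/vec_mulo widen the even/odd byte lanes to
     * eight u16 products each, so after the add-and-shift the results sit
     * split across ve (even lanes) and vo (odd lanes); vec_mergeh/vec_mergel
     * re-interleave them and vec_pack narrows back to 16 bytes in the
     * original lane order. */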

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 8>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, };
//vec_u8_t mask8={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(17, srcPix0); /* srcPix0[17..32]: left reference samples; per-column offsets are folded into mask0..mask7 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 5 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 5, 6 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 6, 7 */
    vec_u8_t srv7 = vec_perm(srv, srv, mask7); /* 7, 8 */

vec_u8_t vfrac8 = (vec_u8_t){5, 10, 15, 20, 25, 30, 3, 8, 5, 10, 15, 20, 25, 30, 3, 8, };
vec_u8_t vfrac8_32 = (vec_u8_t){27, 22, 17, 12, 7, 2, 29, 24, 27, 22, 17, 12, 7, 2, 29, 24, };
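
/* Each 16-byte vector here packs two 8-pixel rows, so vfrac8 repeats the
 * per-column fractions ((x + 1) * 5) & 31 for x = 0..7:
 * 5, 10, 15, 20, 25, 30, 3 (= 35 & 31), 8 (= 40 & 31).  Columns 6 and 7
 * wrap past a full 32-step, i.e. offset[x] becomes 1 there, which is why
 * mask0..mask7 bump those two lanes by one byte. */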

    /*  dst[y * dstStride + x] = (pixel)((f32[y]* ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5 */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32); 
    vmlo0 = vec_mulo(srv2, vfrac8_32); 
    vmle1 = vec_mule(srv3, vfrac8); 
    vmlo1 = vec_mulo(srv3, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv4, vfrac8_32); 
    vmlo0 = vec_mulo(srv4, vfrac8_32); 
    vmle1 = vec_mule(srv5, vfrac8); 
    vmlo1 = vec_mulo(srv5, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv6, vfrac8_32); 
    vmlo0 = vec_mulo(srv6, vfrac8_32);
    vmle1 = vec_mule(srv7, vfrac8); 
    vmlo1 = vec_mulo(srv7, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 8>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, };
vec_u8_t mask8={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, };
vec_u8_t mask9={0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, };
vec_u8_t mask10={0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, };
vec_u8_t mask11={0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, };
vec_u8_t mask12={0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, };
vec_u8_t mask13={0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, };
vec_u8_t mask14={0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, };
vec_u8_t mask15={0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x11, };

    vec_u8_t sv0 = vec_xl(33, srcPix0); /* srcPix0[33..48]: left reference samples 0..15 */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* srcPix0[49..64]: left reference samples 16..31 */
    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);
    vec_u8_t srv00 = vec_perm(sv1, sv1, mask0); /* ref row 16, used as the y+1 sample for srvf */
        
vec_u8_t vfrac16 = (vec_u8_t){5, 10, 15, 20, 25, 30, 3, 8, 13, 18, 23, 28, 1, 6, 11, 16, };
vec_u8_t vfrac16_32 = (vec_u8_t){27, 22, 17, 12, 7, 2, 29, 24, 19, 14, 9, 4, 31, 26, 21, 16, };
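
/* Same derivation as the 8x8 case, now one full row per vector:
 * fraction[x] = ((x + 1) * 5) & 31 for x = 0..15, with the per-column
 * offsets 0/1/2 implied by ((x + 1) * 5) >> 5 baked into mask0..mask15. */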

    /*  dst[y * dstStride + x] = (pixel)((f32[y]* ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5 */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv2, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv3, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv4, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv5, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv6, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv7, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv8, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv9, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srva, vfrac16_32, vfrac16, vout_9);
    one_line(srva, srvb, vfrac16_32, vfrac16, vout_10);
    one_line(srvb, srvc, vfrac16_32, vfrac16, vout_11);
    one_line(srvc, srvd, vfrac16_32, vfrac16, vout_12);
    one_line(srvd, srve, vfrac16_32, vfrac16, vout_13);
    one_line(srve, srvf, vfrac16_32, vfrac16, vout_14);
    one_line(srvf, srv00, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 8>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-3;
            dst[0 * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[1 * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[2 * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            ...
            dst[16 * dstStride + 0] = (pixel)((f32[16]* ref[off16 + 0] + f[16] * ref[off16 + 1] + 16) >> 5);
            ...
            dst[31 * dstStride + 0] = (pixel)((f32[31]* ref[off31 + 0] + f[31] * ref[off31 + 1] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-3;
            dst[0 * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[1 * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[2 * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[3 * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-3;
            dst[0 * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[1 * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[2 * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[3 * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);

            ....
            y=16;  off16 = offset[16]; x=0-3;
            dst[0 * dstStride + 16] = (pixel)((f32[0]* ref[off0 + 16] + f[0] * ref[off0 + 17] + 16) >> 5);
            dst[1 * dstStride + 16] = (pixel)((f32[1]* ref[off1 + 16] + f[1] * ref[off1 + 17] + 16) >> 5);
            dst[2 * dstStride + 16] = (pixel)((f32[2]* ref[off2 + 16] + f[2] * ref[off2 + 17] + 16) >> 5);
            ...
            dst[16 * dstStride + 16] = (pixel)((f32[16]* ref[off16 + 16] + f[16] * ref[off16 + 17] + 16) >> 5);
            ...
            dst[31 * dstStride + 16] = (pixel)((f32[31]* ref[off31 + 16] + f[31] * ref[off31 + 17] + 16) >> 5);

            ....
            y=31;  off31 = offset[31]; x=0-3;
            dst[0 * dstStride + 31] = (pixel)((f32[0]* ref[off0 + 31] + f[0] * ref[off0 + 32] + 16) >> 5);
            dst[1 * dstStride + 31] = (pixel)((f32[1]* ref[off1 + 31] + f[1] * ref[off1 + 32] + 16) >> 5);
            dst[2 * dstStride + 31] = (pixel)((f32[2]* ref[off2 + 31] + f[2] * ref[off2 + 32] + 16) >> 5);
            ...
            dst[31 * dstStride + 31] = (pixel)((f32[31]* ref[off31 + 31] + f[31] * ref[off31 + 32] + 16) >> 5);
        }
    */
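/* Despite the row-indexed f[y]/off[y] notation in the sketch above, the
 * vector code below varies offset and fraction per *column* -- the usual
 * transposed formulation of the horizontal angular modes:
 *
 *     dst[y * dstStride + x] = (pixel)(((32 - f[x]) * ref[off[x] + y]
 *                                     + f[x] * ref[off[x] + y + 1] + 16) >> 5);
 *
 * where ref is the left neighbour array and, for mode 8 (intraPredAngle 5),
 * off[x] = ((x + 1) * 5) >> 5 and f[x] = ((x + 1) * 5) & 31. */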
vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask16_0={0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask16_1={0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask16_2={0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask16_3={0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask16_4={0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask16_5={0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0xa, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, };
vec_u8_t mask16_6={0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xb, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, };
vec_u8_t mask16_7={0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xc, };
vec_u8_t mask8={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, };
vec_u8_t mask16_8={0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xd, };
vec_u8_t mask9={0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, };
vec_u8_t mask16_9={0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xe, };
vec_u8_t mask10={0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, };
vec_u8_t mask16_10={0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xf, };
vec_u8_t mask11={0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, };
vec_u8_t mask16_11={0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x10, };
vec_u8_t mask12={0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, };
vec_u8_t mask16_12={0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x11, };
vec_u8_t mask13={0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, };
vec_u8_t mask16_13={0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x12, };
vec_u8_t mask14={0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, };
vec_u8_t mask16_14={0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, };
vec_u8_t mask15={0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x11, 0x11, 0x11, 0x11, };
vec_u8_t mask16_15={0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, };
vec_u8_t maskadd1_31={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t maskadd1_16_31={0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0);  /* srcPix0[65..80]:   left reference samples 0..15 */
    vec_u8_t sv1 = vec_xl(81, srcPix0);  /* srcPix0[81..96]:   left reference samples 16..31 */
    vec_u8_t sv2 = vec_xl(97, srcPix0);  /* srcPix0[97..112]:  left reference samples 32..47 */
    vec_u8_t sv3 = vec_xl(113, srcPix0); /* srcPix0[113..128]: left reference samples 48..63 */
    //vec_u8_t sv4 = vec_xl(129, srcPix0);

    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); 
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srv10 = vec_perm(sv0, sv1, mask10);
    vec_u8_t srv11 = vec_perm(sv0, sv1, mask11);
    vec_u8_t srv12 = vec_perm(sv0, sv1, mask12);
    vec_u8_t srv13 = vec_perm(sv0, sv1, mask13);
    vec_u8_t srv14 = vec_perm(sv0, sv1, mask14);
    vec_u8_t srv15 = vec_perm(sv0, sv1, mask15);
        
    vec_u8_t srv16_0 = vec_perm(sv0, sv1, mask16_0);
    vec_u8_t srv16_1 = vec_perm(sv0, sv1, mask16_1);
    vec_u8_t srv16_2 = vec_perm(sv0, sv1, mask16_2);
    vec_u8_t srv16_3 = vec_perm(sv0, sv1, mask16_3);
    vec_u8_t srv16_4 = vec_perm(sv0, sv1, mask16_4);
    vec_u8_t srv16_5 = vec_perm(sv0, sv1, mask16_5);
    vec_u8_t srv16_6 = vec_perm(sv0, sv1, mask16_6);
    vec_u8_t srv16_7 = vec_perm(sv0, sv1, mask16_7);
    vec_u8_t srv16_8 = vec_perm(sv0, sv1, mask16_8);
    vec_u8_t srv16_9 = vec_perm(sv0, sv1, mask16_9);
    vec_u8_t srv16_10 = vec_perm(sv0, sv1, mask16_10);
    vec_u8_t srv16_11 = vec_perm(sv0, sv1, mask16_11);
    vec_u8_t srv16_12 = vec_perm(sv0, sv1, mask16_12);
    vec_u8_t srv16_13 = vec_perm(sv0, sv1, mask16_13);
    vec_u8_t srv16_14 = vec_perm(sv1, sv2, mask16_14);
    vec_u8_t srv16_15 = vec_perm(sv1, sv2, mask16_15);

    vec_u8_t  srv16 = vec_perm(sv1, sv2, mask0);  /* mask16 == mask0 */
    vec_u8_t  srv17 = vec_perm(sv1, sv2, mask1);
    vec_u8_t  srv18 = vec_perm(sv1, sv2, mask2);
    vec_u8_t  srv19 = vec_perm(sv1, sv2, mask3);
    vec_u8_t  srv20 = vec_perm(sv1, sv2, mask4);
    vec_u8_t  srv21 = vec_perm(sv1, sv2, mask5);
    vec_u8_t  srv22 = vec_perm(sv1, sv2, mask6);
    vec_u8_t  srv23 = vec_perm(sv1, sv2, mask7);
    vec_u8_t  srv24 = vec_perm(sv1, sv2, mask8);
    vec_u8_t  srv25 = vec_perm(sv1, sv2, mask9);
    vec_u8_t  srv26 = vec_perm(sv1, sv2, mask10);
    vec_u8_t  srv27 = vec_perm(sv1, sv2, mask11);
    vec_u8_t  srv28 = vec_perm(sv1, sv2, mask12);
    vec_u8_t  srv29 = vec_perm(sv1, sv2, mask13);
    vec_u8_t  srv30 = vec_perm(sv1, sv2, mask14);
    vec_u8_t  srv31 = vec_perm(sv1, sv2, mask15);
    vec_u8_t  srv32 = vec_perm(sv2, sv3, maskadd1_31);


    vec_u8_t srv16_16= vec_perm(sv1, sv2, mask16_0); /* mask16_16 == mask16_0 */
    vec_u8_t srv16_17= vec_perm(sv1, sv2, mask16_1);
    vec_u8_t srv16_18 = vec_perm(sv1, sv2, mask16_2);
    vec_u8_t srv16_19 = vec_perm(sv1, sv2, mask16_3);
    vec_u8_t srv16_20 = vec_perm(sv1, sv2, mask16_4);
    vec_u8_t srv16_21 = vec_perm(sv1, sv2, mask16_5);
    vec_u8_t srv16_22 = vec_perm(sv1, sv2, mask16_6);
    vec_u8_t srv16_23 = vec_perm(sv1, sv2, mask16_7);
    vec_u8_t srv16_24 = vec_perm(sv1, sv2, mask16_8);
    vec_u8_t srv16_25 = vec_perm(sv1, sv2, mask16_9);
    vec_u8_t srv16_26 = vec_perm(sv1, sv2, mask16_10);
    vec_u8_t srv16_27 = vec_perm(sv1, sv2, mask16_11);
    vec_u8_t srv16_28 = vec_perm(sv1, sv2, mask16_12);
    vec_u8_t srv16_29 = vec_perm(sv1, sv2, mask16_13);
    vec_u8_t srv16_30 = vec_perm(sv2, sv3, mask16_14);
    vec_u8_t srv16_31 = vec_perm(sv2, sv3, mask16_15);
    vec_u8_t srv16_32 = vec_perm(sv2, sv3, maskadd1_16_31);
        
vec_u8_t vfrac32_0 = (vec_u8_t){5, 10, 15, 20, 25, 30, 3, 8, 13, 18, 23, 28, 1, 6, 11, 16, };
vec_u8_t vfrac32_1 = (vec_u8_t){21, 26, 31, 4, 9, 14, 19, 24, 29, 2, 7, 12, 17, 22, 27, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){27, 22, 17, 12, 7, 2, 29, 24, 19, 14, 9, 4, 31, 26, 21, 16, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){11, 6, 1, 28, 23, 18, 13, 8, 3, 30, 25, 20, 15, 10, 5, 32, };
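
/* vfrac32_0/vfrac32_32_0 carry f[x] and 32 - f[x] for columns 0..15,
 * vfrac32_1/vfrac32_32_1 for columns 16..31; the trailing 0/32 pair is
 * column 31, whose fraction ((32 * 5) & 31) is exactly 0. */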

    /*  dst[y * dstStride + x] = (pixel)((f32[y]* ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5 */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;


    one_line(srv0, srv1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16_1, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv2, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16_2,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv3, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16_3,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv4, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16_4,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv5, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16_5,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv6, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16_6,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv7, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16_7,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv8, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16_8,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv9, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16_9,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv10, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16_10,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv11, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16_11,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv12, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16_12,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv13, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16_13,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv14, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16_14,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv15, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16_15,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv16, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16_16,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv17, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16_17, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv18, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16_18,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv19, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16_19,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv20, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16_20,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv21, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16_21,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv22, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16_22,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv23, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16_23,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv24, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16_24,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv25, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16_25,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv26, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16_26,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv27, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16_27,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv28, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16_28,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv29, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16_29,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv30, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16_30,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv31, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16_31,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv32, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16_32,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 9>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, };

vec_u8_t vfrac4 = (vec_u8_t){2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, };

vec_u8_t vfrac4_32 = (vec_u8_t){30, 28, 26, 24, 30, 28, 26, 24, 30, 28, 26, 24, 30, 28, 26, 24, };
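
/* Mode 9 has intraPredAngle = 2, so fraction[x] = ((x + 1) * 2) & 31 =
 * 2, 4, 6, 8 for x = 0..3 and every offset is 0 -- hence the same
 * two-mask scheme as mode 8 above. */
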
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* srcPix0[9..24]: left reference samples; offset[0-3] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    /*  dst[y * dstStride + x] = (pixel)((f32[y]* ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5 */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 9>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, };
//vec_u8_t mask8={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(17, srcPix0); /* srcPix0[17..32]: left reference samples; offset[0-7] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 5 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 5, 6 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 6, 7 */
    vec_u8_t srv7 = vec_perm(srv, srv, mask7); /* 7, 8 */

vec_u8_t vfrac8 = (vec_u8_t){2, 4, 6, 8, 10, 12, 14, 16, 2, 4, 6, 8, 10, 12, 14, 16, };
vec_u8_t vfrac8_32 = (vec_u8_t){30, 28, 26, 24, 22, 20, 18, 16, 30, 28, 26, 24, 22, 20, 18, 16, };
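
/* With intraPredAngle = 2 the per-column offsets stay 0 across all eight
 * columns (((x + 1) * 2) >> 5 == 0 for x < 15), so mask0..mask7 simply
 * step the start byte row by row. */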

    /*  dst[y * dstStride + x] = (pixel)((f32[y]* ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5 */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32); 
    vmlo0 = vec_mulo(srv2, vfrac8_32); 
    vmle1 = vec_mule(srv3, vfrac8); 
    vmlo1 = vec_mulo(srv3, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv4, vfrac8_32); 
    vmlo0 = vec_mulo(srv4, vfrac8_32); 
    vmle1 = vec_mule(srv5, vfrac8); 
    vmlo1 = vec_mulo(srv5, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv6, vfrac8_32); 
    vmlo0 = vec_mulo(srv6, vfrac8_32);
    vmle1 = vec_mule(srv7, vfrac8); 
    vmlo1 = vec_mulo(srv7, vfrac8); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
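
/* For reference, a minimal scalar sketch of what these mode-9 specializations
   compute, reconstructed from the vector code above (the names angle, off and
   f below are illustrative, not identifiers used elsewhere in this file;
   ref is the left reference at srcPix0 + width*2 + 1, angle = 2 for mode 9):

   for (int x = 0; x < width; x++)
   {
       int sum = (x + 1) * angle;
       int off = sum >> 5;   // reference offset for column x
       int f   = sum & 31;   // interpolation fraction for column x
       for (int y = 0; y < width; y++)
           dst[y * dstStride + x] = (pixel)(((32 - f) * ref[off + y]
                                           + f * ref[off + y + 1] + 16) >> 5);
   }
*/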

template<>
void intra_pred<16, 9>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, };
vec_u8_t mask8={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, };
vec_u8_t mask9={0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0xa, };
vec_u8_t mask10={0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xb, };
vec_u8_t mask11={0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xc, };
vec_u8_t mask12={0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xd, };
vec_u8_t mask13={0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xe, };
vec_u8_t mask14={0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xf, };
vec_u8_t mask15={0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x10, };

    vec_u8_t sv0 = vec_xl(33, srcPix0); /* left reference, bytes 0-15 (srcPix0 + width*2 + 1 = 33); offset[0-14] = 0, offset[15] = 1 */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* left reference, bytes 16-31 */
    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); /* srvN feeds row N: ref[N] in columns 0-14, ref[N+1] in column 15 */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);
    vec_u8_t srv00 = vec_perm(sv1, sv1, mask0);
        
vec_u8_t vfrac16 = (vec_u8_t){2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, };
vec_u8_t vfrac16_32 = (vec_u8_t){30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 32, };
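    /* column 15's angular sum is (15 + 1) * 2 = 32: its fraction wraps to 0
       (last lanes of vfrac16/vfrac16_32) and its offset becomes 1, which is
       why every maskN points one byte further in its final lane and row 15
       pairs (srvf, srv00). */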

    /*  dst[y * dstStride + x] = (pixel)((f32[x] * ref[off[x] + y] + f[x] * ref[off[x] + y + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv2, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv3, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv4, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv5, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv6, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv7, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv8, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv9, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srva, vfrac16_32, vfrac16, vout_9);
    one_line(srva, srvb, vfrac16_32, vfrac16, vout_10);
    one_line(srvb, srvc, vfrac16_32, vfrac16, vout_11);
    one_line(srvc, srvd, vfrac16_32, vfrac16, vout_12);
    one_line(srvd, srve, vfrac16_32, vfrac16, vout_13);
    one_line(srve, srvf, vfrac16_32, vfrac16, vout_14);
    one_line(srvf, srv00, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 9>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        Scalar equivalent.  For this horizontal mode the interpolation
        fraction and the reference offset depend on the column x, while the
        sampled position advances with the row y:

        for (int x = 0; x < 32; x++)
        {
            int off = offset[x];        // ((x + 1) * angle) >> 5
            int f   = fraction[x];      // ((x + 1) * angle) & 31
            for (int y = 0; y < 32; y++)
                dst[y * dstStride + x] =
                    (pixel)(((32 - f) * ref[off + y] + f * ref[off + y + 1] + 16) >> 5);
        }
    */

vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, };
vec_u8_t mask8={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, };
vec_u8_t mask9={0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0xa, };
vec_u8_t mask10={0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xb, };
vec_u8_t mask11={0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xc, };
vec_u8_t mask12={0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xd, };
vec_u8_t mask13={0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xe, };
vec_u8_t mask14={0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xf, };
vec_u8_t mask15={0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x10, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0); /* left reference, bytes 0-15 (srcPix0 + width*2 + 1 = 65) */
    vec_u8_t sv1 = vec_xl(81, srcPix0); /* left reference, bytes 16-31 */
    vec_u8_t sv2 = vec_xl(97, srcPix0); /* left reference, bytes 32-47 */
    vec_u8_t sv3 = vec_xl(113, srcPix0); /* left reference, bytes 48-63 */

    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); 
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srv10 = vec_perm(sv0, sv1, mask10);
    vec_u8_t srv11 = vec_perm(sv0, sv1, mask11);
    vec_u8_t srv12 = vec_perm(sv0, sv1, mask12);
    vec_u8_t srv13 = vec_perm(sv0, sv1, mask13);
    vec_u8_t srv14 = vec_perm(sv0, sv1, mask14);
    vec_u8_t srv15 = vec_perm(sv0, sv1, mask15);

    vec_u8_t  srv16 = vec_perm(sv1, sv2, mask0);  /* mask16 == mask0 */
    vec_u8_t  srv17 = vec_perm(sv1, sv2, mask1);
    vec_u8_t  srv18 = vec_perm(sv1, sv2, mask2);
    vec_u8_t  srv19 = vec_perm(sv1, sv2, mask3);
    vec_u8_t  srv20 = vec_perm(sv1, sv2, mask4);
    vec_u8_t  srv21 = vec_perm(sv1, sv2, mask5);
    vec_u8_t  srv22 = vec_perm(sv1, sv2, mask6);
    vec_u8_t  srv23 = vec_perm(sv1, sv2, mask7);
    vec_u8_t  srv24 = vec_perm(sv1, sv2, mask8);
    vec_u8_t  srv25 = vec_perm(sv1, sv2, mask9);
    vec_u8_t  srv26 = vec_perm(sv1, sv2, mask10);
    vec_u8_t  srv27 = vec_perm(sv1, sv2, mask11);
    vec_u8_t  srv28 = vec_perm(sv1, sv2, mask12);
    vec_u8_t  srv29 = vec_perm(sv1, sv2, mask13);
    vec_u8_t  srv30 = vec_perm(sv1, sv2, mask14);
    vec_u8_t  srv31 = vec_perm(sv1, sv2, mask15);
    vec_u8_t  srv32 = vec_perm(sv2, sv3, mask0);
    vec_u8_t  srv33 = vec_perm(sv2, sv3, mask1);

vec_u8_t vfrac32_0 = (vec_u8_t){2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, };
vec_u8_t vfrac32_1 = (vec_u8_t){2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 32, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 32, };
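    /* the _0 and _1 vectors are identical for this angle: ((x + 17) * 2) & 31
       equals ((x + 1) * 2) & 31, so the fraction pattern repeats every 16
       columns while the offset advances by one, hence each row pairs
       (srvN, srvN+1) for its left and right 16-pixel halves. */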

    /*  dst[y * dstStride + x] = (pixel)((f32[x] * ref[off[x] + y] + f[x] * ref[off[x] + y + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv1, srv2, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv2, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv2, srv3,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv3, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv3, srv4,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv4, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv4, srv5,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv5, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv5, srv6,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv6, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv6, srv7,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv7, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv7, srv8,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv8, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv8, srv9,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv9, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv9, srv10,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv10, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv10, srv11,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv11, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv11, srv12,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv12, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv12, srv13,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv13, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv13, srv14,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv14, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv14, srv15,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv15, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv15, srv16,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv16, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16, srv17,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv17, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv17, srv18, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv18, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv18, srv19,  vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv19, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv19, srv20,  vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv20, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv20, srv21,  vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv21, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv21, srv22,  vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv22, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv22, srv23,  vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv23, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv23, srv24,  vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv24, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv24, srv25,  vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv25, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv25, srv26,  vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv26, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv26, srv27,  vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv27, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv27, srv28,  vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv28, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv28, srv29,  vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv29, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv29, srv30,   vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv30, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv30, srv31,  vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv31, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv31, srv32,  vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv32, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv32, srv33,  vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

#ifdef WORDS_BIGENDIAN
   vec_u8_t u8_to_s16_w4x4_mask1 = {0x00, 0x11, 0x00, 0x12, 0x00, 0x13, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
   vec_u8_t u8_to_s16_w4x4_mask9 = {0x00, 0x19, 0x00, 0x1a, 0x00, 0x1b, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
   vec_u8_t u8_to_s16_w8x8_mask1 = {0x00, 0x11, 0x00, 0x12, 0x00, 0x13, 0x00, 0x14, 0x00, 0x15, 0x00, 0x16, 0x00, 0x17, 0x00, 0x18};
   vec_u8_t u8_to_s16_w8x8_maskh = {0x00, 0x10, 0x00, 0x11, 0x00, 0x12, 0x00, 0x13, 0x00, 0x14, 0x00, 0x15, 0x00, 0x16, 0x00, 0x17};
   vec_u8_t u8_to_s16_w8x8_maskl = {0x00, 0x18, 0x00, 0x19, 0x00, 0x1a, 0x00, 0x1b, 0x00, 0x1c, 0x00, 0x1d, 0x00, 0x1e, 0x00, 0x1f};
   vec_u8_t u8_to_s16_b0_mask = {0x00, 0x10, 0x00, 0x10, 0x00, 0x10, 0x00, 0x10, 0x00, 0x10, 0x00, 0x10, 0x00, 0x10, 0x00, 0x10};
   vec_u8_t u8_to_s16_b1_mask = {0x00, 0x11, 0x00, 0x11, 0x00, 0x11, 0x00, 0x11, 0x00, 0x11, 0x00, 0x11, 0x00, 0x11, 0x00, 0x11};
   vec_u8_t u8_to_s16_b9_mask = {0x09, 0x10, 0x09, 0x10, 0x09, 0x10, 0x09, 0x10, 0x09, 0x10, 0x09, 0x10, 0x09, 0x10, 0x09, 0x10};
#else
   vec_u8_t u8_to_s16_w4x4_mask1 = {0x11, 0x00, 0x12, 0x00, 0x13, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
   vec_u8_t u8_to_s16_w4x4_mask9 = {0x19, 0x00, 0x1a, 0x00, 0x1b, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
   vec_u8_t u8_to_s16_w8x8_mask1 = {0x11, 0x00, 0x12, 0x00, 0x13, 0x00, 0x14, 0x00, 0x15, 0x00, 0x16, 0x00, 0x17, 0x00, 0x18, 0x00};
   vec_u8_t u8_to_s16_w8x8_maskh = {0x10, 0x00, 0x11, 0x00, 0x12, 0x00, 0x13, 0x00, 0x14, 0x00, 0x15, 0x00, 0x16, 0x00, 0x17, 0x00};
   vec_u8_t u8_to_s16_w8x8_maskl = {0x18, 0x00, 0x19, 0x00, 0x1a, 0x00, 0x1b, 0x00, 0x1c, 0x00, 0x1d, 0x00, 0x1e, 0x00, 0x1f, 0x00};
   vec_u8_t u8_to_s16_b0_mask = {0x10, 0x00, 0x10, 0x00, 0x10, 0x00, 0x10, 0x00, 0x10, 0x00, 0x10, 0x00, 0x10, 0x00, 0x10, 0x00};
   vec_u8_t u8_to_s16_b1_mask = {0x11, 0x00, 0x11, 0x00, 0x11, 0x00, 0x11, 0x00, 0x11, 0x00, 0x11, 0x00, 0x11, 0x00, 0x11, 0x00};
   vec_u8_t u8_to_s16_b9_mask = {0x10, 0x09, 0x10, 0x09, 0x10, 0x09, 0x10, 0x09, 0x10, 0x09, 0x10, 0x09, 0x10, 0x09, 0x10, 0x09};
#endif
vec_s16_t min_s16v = (vec_s16_t){255, 255, 255, 255, 255, 255, 255, 255}; 
vec_u16_t one_u16v = (vec_u16_t)vec_splat_u16(1);
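/* shared by the bFilter paths below: min_s16v holds the 8-bit clamp ceiling
   (255, applied with vec_min) and one_u16v the shift count for the ">> 1" in
   clip(left[0] + ((top[x] - topLeft) >> 1)). */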

template<>
void intra_pred<4, 10>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv =vec_xl(9, srcPix0); /* offset = width2+1 = width<<1 + 1 */
    vec_u8_t v_filter_u8, v_mask0, v_mask; 
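    /* mode 10 is pure horizontal: row y is a splat of left[y].  When bFilter
       is set, row 0 is additionally smoothed toward the above row:
       clip(left[0] + ((top[x] - topLeft) >> 1)), computed in 16-bit lanes
       below and packed back to bytes. */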
    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(0, srcPix0);
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_s16_t v0_s16 = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_w4x4_mask1));
        vec_s16_t v1_s16 =  (vec_s16_t)vec_sra( vec_sub(v0_s16, c0_s16v), one_u16v );
        vec_s16_t v_sum = vec_add(c1_s16v, v1_s16);
        vec_u16_t v_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v_sum));
        v_filter_u8 = vec_pack(v_filter_u16, zero_u16v); 
         v_mask0 = (vec_u8_t){0x10, 0x11, 0x12, 0x13, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03};
         v_mask = (vec_u8_t){0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
    }
    else{
         v_mask0 = (vec_u8_t){0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03};
         v_mask = (vec_u8_t){0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        v_filter_u8 = srv; 
    }
                
                
    if(dstStride == 4) {
         vec_u8_t v0 = vec_perm(srv, v_filter_u8, v_mask0);
         vec_xst(v0, 0, dst);
    }
    else if(dstStride%16 == 0){ 
         vec_u8_t v0 = vec_perm(srv, v_filter_u8, v_mask0);
        vec_ste((vec_u32_t)v0, 0, (unsigned int*)dst);
        vec_u8_t v1 = vec_sld(v0, v0, 12);
        vec_ste((vec_u32_t)v1, 0, (unsigned int*)(dst+dstStride));
        vec_u8_t v2 = vec_sld(v0, v0, 8);
        vec_ste((vec_u32_t)v2, 0, (unsigned int*)(dst+dstStride*2));
        vec_u8_t v3 = vec_sld(v0, v0, 4);
        vec_ste((vec_u32_t)v3, 0, (unsigned int*)(dst+dstStride*3));
    }
    else{
         vec_u8_t v_mask1 = {0x01, 0x01, 0x01, 0x01, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x02, 0x02, 0x02, 0x02, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x03, 0x03, 0x03, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(v_filter_u8, vec_xl(0, dst), v_mask);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(srv, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(srv, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(srv,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 10>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv =vec_xl(17, srcPix0); /* offset = width2+1 = width<<1 + 1 */
        
    if(dstStride == 8) {
         vec_u8_t v_mask0 = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01};
         vec_u8_t v_mask1 = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03};
         vec_u8_t v_mask2 = {0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05};
         vec_u8_t v_mask3 = {0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07};
         vec_u8_t v0 = vec_perm(srv, srv, v_mask0);
         vec_xst(v0, 0, dst);
         vec_u8_t v1 = vec_perm(srv, srv, v_mask1);
         vec_xst(v1, 16, dst);
         vec_u8_t v2 = vec_perm(srv, srv, v_mask2);
         vec_xst(v2, 32, dst);
         vec_u8_t v3 = vec_perm(srv, srv, v_mask3);
         vec_xst(v3, 48, dst);
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask4 = {0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask5 = {0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask6 = {0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask7 = {0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(srv, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(srv, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(srv, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(srv,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
        vec_u8_t v4 = vec_perm(srv,  vec_xl(dstStride*4, dst), v_mask4);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(srv,  vec_xl(dstStride*5, dst), v_mask5);
        vec_xst(v5, dstStride*5, dst);
        vec_u8_t v6 = vec_perm(srv,  vec_xl(dstStride*6, dst), v_mask6);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(srv,  vec_xl(dstStride*7, dst), v_mask7);
        vec_xst(v7, dstStride*7, dst);
    }

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(0, srcPix0);
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_s16_t v0_s16 = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_w8x8_mask1));
        vec_s16_t v1_s16 =  (vec_s16_t)vec_sra( vec_sub(v0_s16, c0_s16v), one_u16v );
        vec_s16_t v_sum = vec_add(c1_s16v, v1_s16);
        vec_u16_t v_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v_sum));
        vec_u8_t v_filter_u8 = vec_pack(v_filter_u16, zero_u16v); 
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_xst( vec_perm(v_filter_u8, vec_xl(0, dst), v_mask0), 0, dst );
    }
        
#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 10>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv =vec_xl(33, srcPix0); /* offset = width2+1 = width<<1 + 1 */
        
    if(dstStride == 16) {
         vec_xst(vec_splat(srv, 0), 0, dst);
         vec_xst(vec_splat(srv, 1), 16, dst);
         vec_xst(vec_splat(srv, 2), 32, dst);
         vec_xst(vec_splat(srv, 3), 48, dst);
         vec_xst(vec_splat(srv, 4), 64, dst);
         vec_xst(vec_splat(srv, 5), 80, dst);
         vec_xst(vec_splat(srv, 6), 96, dst);
         vec_xst(vec_splat(srv, 7), 112, dst);
         vec_xst(vec_splat(srv, 8), 128, dst);
         vec_xst(vec_splat(srv, 9), 144, dst);
         vec_xst(vec_splat(srv, 10), 160, dst);
         vec_xst(vec_splat(srv, 11), 176, dst);
         vec_xst(vec_splat(srv, 12), 192, dst);
         vec_xst(vec_splat(srv, 13), 208, dst);
         vec_xst(vec_splat(srv, 14), 224, dst);
         vec_xst(vec_splat(srv, 15), 240, dst);
    }
    else{
         vec_xst(vec_splat(srv, 0), 0, dst);
         vec_xst(vec_splat(srv, 1), 1*dstStride, dst);
         vec_xst(vec_splat(srv, 2), 2*dstStride, dst);
         vec_xst(vec_splat(srv, 3), 3*dstStride, dst);
         vec_xst(vec_splat(srv, 4), 4*dstStride, dst);
         vec_xst(vec_splat(srv, 5), 5*dstStride, dst);
         vec_xst(vec_splat(srv, 6), 6*dstStride, dst);
         vec_xst(vec_splat(srv, 7), 7*dstStride, dst);
         vec_xst(vec_splat(srv, 8), 8*dstStride, dst);
         vec_xst(vec_splat(srv, 9), 9*dstStride, dst);
         vec_xst(vec_splat(srv, 10), 10*dstStride, dst);
         vec_xst(vec_splat(srv, 11), 11*dstStride, dst);
         vec_xst(vec_splat(srv, 12), 12*dstStride, dst);
         vec_xst(vec_splat(srv, 13), 13*dstStride, dst);
         vec_xst(vec_splat(srv, 14), 14*dstStride, dst);
         vec_xst(vec_splat(srv, 15), 15*dstStride, dst);
    }

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(0, srcPix0);
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_u8_t  srcv1 = vec_xl(1, srcPix0);           
        vec_s16_t v0h_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskh));
        vec_s16_t v0l_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskl));
        vec_s16_t v1h_s16 =  (vec_s16_t)vec_sra( vec_sub(v0h_s16, c0_s16v), one_u16v );
        vec_s16_t v1l_s16 =  (vec_s16_t)vec_sra( vec_sub(v0l_s16, c0_s16v), one_u16v );
        vec_s16_t vh_sum = vec_add(c1_s16v, v1h_s16);
        vec_s16_t vl_sum = vec_add(c1_s16v, v1l_s16);
        vec_u16_t vh_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vh_sum));
        vec_u16_t vl_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vl_sum));
        vec_u8_t v_filter_u8 = vec_pack(vh_filter_u16, vl_filter_u16); 
        vec_xst( v_filter_u8, 0, dst );
    }
#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void intra_pred<32, 10>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv =vec_xl(65, srcPix0); /* offset = width2+1 = width<<1 + 1 */
    vec_u8_t srv1 =vec_xl(81, srcPix0); 
    vec_u8_t vout;      
    int offset = 0;

    #define v_pred32(vi, vo, i){\
        vo = vec_splat(vi, i);\
        vec_xst(vo, offset, dst);\
        vec_xst(vo, 16+offset, dst);\
        offset += dstStride;\
    }
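
    /* each invocation splats byte i of vi across one full 32-pixel row
       (two 16-byte stores) and advances offset by a row */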
        
    v_pred32(srv, vout, 0);     
    v_pred32(srv, vout, 1);     
    v_pred32(srv, vout, 2);     
    v_pred32(srv, vout, 3);     
    v_pred32(srv, vout, 4);     
    v_pred32(srv, vout, 5);     
    v_pred32(srv, vout, 6);     
    v_pred32(srv, vout, 7);     
    v_pred32(srv, vout, 8);     
    v_pred32(srv, vout, 9);     
    v_pred32(srv, vout, 10);    
    v_pred32(srv, vout, 11);    
    v_pred32(srv, vout, 12);    
    v_pred32(srv, vout, 13);    
    v_pred32(srv, vout, 14);    
    v_pred32(srv, vout, 15);    

    v_pred32(srv1, vout, 0);    
    v_pred32(srv1, vout, 1);    
    v_pred32(srv1, vout, 2);    
    v_pred32(srv1, vout, 3);    
    v_pred32(srv1, vout, 4);    
    v_pred32(srv1, vout, 5);    
    v_pred32(srv1, vout, 6);    
    v_pred32(srv1, vout, 7);    
    v_pred32(srv1, vout, 8);    
    v_pred32(srv1, vout, 9);    
    v_pred32(srv1, vout, 10);   
    v_pred32(srv1, vout, 11);   
    v_pred32(srv1, vout, 12);   
    v_pred32(srv1, vout, 13);   
    v_pred32(srv1, vout, 14);   
    v_pred32(srv1, vout, 15);   

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(0, srcPix0);
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_u8_t  srcv1 = vec_xl(1, srcPix0);           
        vec_s16_t v0h_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskh));
        vec_s16_t v0l_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskl));
        vec_s16_t v1h_s16 =  (vec_s16_t)vec_sra( vec_sub(v0h_s16, c0_s16v), one_u16v );
        vec_s16_t v1l_s16 =  (vec_s16_t)vec_sra( vec_sub(v0l_s16, c0_s16v), one_u16v );
        vec_s16_t vh_sum = vec_add(c1_s16v, v1h_s16);
        vec_s16_t vl_sum = vec_add(c1_s16v, v1l_s16);
        vec_u16_t vh_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vh_sum));
        vec_u16_t vl_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vl_sum));
        vec_u8_t v_filter_u8 = vec_pack(vh_filter_u16, vl_filter_u16); 
        vec_xst( v_filter_u8, 0, dst );

        vec_u8_t  srcv2 = vec_xl(17, srcPix0);          
        vec_s16_t v2h_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv2, u8_to_s16_w8x8_maskh));
        vec_s16_t v2l_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv2, u8_to_s16_w8x8_maskl));
        vec_s16_t v3h_s16 =  (vec_s16_t)vec_sra( vec_sub(v2h_s16, c0_s16v), one_u16v );
        vec_s16_t v3l_s16 =  (vec_s16_t)vec_sra( vec_sub(v2l_s16, c0_s16v), one_u16v );
        vec_s16_t v2h_sum = vec_add(c1_s16v, v3h_s16);
        vec_s16_t v2l_sum = vec_add(c1_s16v, v3l_s16);
        vec_u16_t v2h_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v2h_sum));
        vec_u16_t v2l_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v2l_sum));
        vec_u8_t v2_filter_u8 = vec_pack(v2h_filter_u16, v2l_filter_u16); 
        vec_xst( v2_filter_u8, 16, dst );

    }
#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 11>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, };
    vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, };
        
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){30, 28, 26, 24, 30, 28, 26, 24, 30, 28, 26, 24, 30, 28, 26, 24, };
    vec_u8_t vfrac4_32 = (vec_u8_t){2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, };
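    /* mode 11 uses angle -2: fraction[x] = ((x + 1) * -2) & 31 = {30, 28, 26, 24}
       and offset[x] = ((x + 1) * -2) >> 5 = -1 for every column, which is why
       refmask_4 above prepends the top-left sample (srcPix0[0]) to the left
       reference. */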

    /*  dst[y * dstStride + x] = (pixel)((f32[x] * ref[off[x] + y] + f[x] * ref[off[x] + y + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        /* address each row through dstStride so strides other than 16 stay correct */
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 0, (unsigned int*)(dst+dstStride));
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 0, (unsigned int*)(dst+dstStride*2));
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 0, (unsigned int*)(dst+dstStride*3));
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 11>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, };
//vec_u8_t mask8={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, };
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(17, srcPix0); 
    vec_u8_t refmask_8={0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);
vec_u8_t vfrac8 = (vec_u8_t){30, 28, 26, 24, 22, 20, 18, 16, 30, 28, 26, 24, 22, 20, 18, 16, };
vec_u8_t vfrac8_32 = (vec_u8_t){2, 4, 6, 8, 10, 12, 14, 16, 2, 4, 6, 8, 10, 12, 14, 16, };

    one_line(srv0, srv1, vfrac8_32, vfrac8, vout_0);
    one_line(srv2, srv3, vfrac8_32, vfrac8, vout_1);
    one_line(srv4, srv5, vfrac8_32, vfrac8, vout_2);
    one_line(srv6, srv7, vfrac8_32, vfrac8, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 11>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, };
vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, };
vec_u8_t mask2={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask3={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask4={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask5={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask6={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask7={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask8={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, };
vec_u8_t mask9={0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, };
vec_u8_t mask10={0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, };
vec_u8_t mask11={0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, };
vec_u8_t mask12={0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, };
vec_u8_t mask13={0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, };
vec_u8_t mask14={0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, };
vec_u8_t mask15={0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, };
vec_u8_t maskadd1_15={0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(0, srcPix0); /* byte 0 is the top-left sample; refmask_16 keeps only that byte */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* left reference (srcPix0 + width*2 + 1 = 33) */
    vec_u8_t refmask_16={0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};

    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(48, srcPix0);  

    /*  dst[y * dstStride + x] = (pixel)((f32[x] * ref[off[x] + y] + f[x] * ref[off[x] + y + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12 = vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = srv1; 
    vec_u8_t srv1_add1 = srv2;
    vec_u8_t srv2_add1 = srv3;
    vec_u8_t srv3_add1 = srv4;
    vec_u8_t srv4_add1 = srv5; 
    vec_u8_t srv5_add1 = srv6; 
    vec_u8_t srv6_add1 = srv7;
    vec_u8_t srv7_add1 = srv8; 
    vec_u8_t srv8_add1 = srv9;
    vec_u8_t srv9_add1 = srv10;
    vec_u8_t srv10_add1 = srv11;
    vec_u8_t srv11_add1 = srv12;
    vec_u8_t srv12_add1 = srv13; 
    vec_u8_t srv13_add1 = srv14;
    vec_u8_t srv14_add1 = srv15; 
    vec_u8_t srv15_add1 = vec_perm(s0, s1, maskadd1_15);

vec_u8_t vfrac16 = (vec_u8_t){30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, };
vec_u8_t vfrac16_32 = (vec_u8_t){2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, };
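    /* per-column fractions for angle -2: 30, 28, ..., 2, 0; at x = 15 the
       fraction is 0 and vfrac16_32 carries the full weight of 32, so that
       column reduces to a plain copy of its reference sample. */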

    one_line(srv0, srv0_add1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32, vfrac16, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32, vfrac16, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32, vfrac16, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32, vfrac16, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32, vfrac16, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32, vfrac16, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                


#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 11>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, };
vec_u8_t mask1={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask2={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask3={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask4={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask5={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask6={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask7={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, };
vec_u8_t mask8={0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, };
vec_u8_t mask9={0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, };
vec_u8_t mask10={0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, };
vec_u8_t mask11={0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, };
vec_u8_t mask12={0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, };
vec_u8_t mask13={0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, };
vec_u8_t mask14={0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, };
vec_u8_t mask15={0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, };

vec_u8_t mask16_0={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, };
/*vec_u8_t mask16_1={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, };
vec_u8_t mask16_2={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask16_3={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask16_4={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask16_5={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask16_6={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask16_7={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask16_8={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, };
vec_u8_t mask16_9={0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, };
vec_u8_t mask16_10={0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, };
vec_u8_t mask16_11={0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, };
vec_u8_t mask16_12={0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, };
vec_u8_t mask16_13={0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, };
vec_u8_t mask16_14={0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, };
vec_u8_t mask16_15={0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, };
*/
vec_u8_t maskadd1_31={0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, };
vec_u8_t maskadd1_16_31={0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    

    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 

vec_u8_t refmask_32_0={0x10, 0x00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
vec_u8_t refmask_32_1={0x0, 0x1, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};

    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );    
    vec_u8_t s1 = vec_xl(79, srcPix0);  
    vec_u8_t s2 = vec_xl(95, srcPix0);  

    vec_u8_t srv0 = vec_perm(s0, s0, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s0, mask1);
    vec_u8_t srv2 = vec_perm(s0, s0, mask2);
    vec_u8_t srv3 = vec_perm(s0, s0, mask3);
    vec_u8_t srv4 = vec_perm(s0, s0, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s0, mask5);
    vec_u8_t srv6 = vec_perm(s0, s0, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s0, mask7);
    vec_u8_t srv8 = vec_perm(s0, s0, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s0, mask9);
    vec_u8_t srv10 = vec_perm(s0, s0, mask10);
    vec_u8_t srv11 = vec_perm(s0, s0, mask11);
    vec_u8_t srv12 = vec_perm(s0, s0, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s0, mask13);
    vec_u8_t srv14 = vec_perm(s0, s0, mask14); 
    vec_u8_t srv15 = vec_perm(s1, s1, mask15);

    vec_u8_t srv16_0 = vec_perm(s0, s0, mask16_0); 
    vec_u8_t srv16_1 = srv0;
    vec_u8_t srv16_2 = srv1;
    vec_u8_t srv16_3 = srv2;
    vec_u8_t srv16_4 = srv3; 
    vec_u8_t srv16_5 = srv4;
    vec_u8_t srv16_6 = srv5; 
    vec_u8_t srv16_7 = srv6;
    vec_u8_t srv16_8 = srv7; 
    vec_u8_t srv16_9 = srv8;
    vec_u8_t srv16_10 = srv9;
    vec_u8_t srv16_11 = srv10;
    vec_u8_t srv16_12 = srv11; 
    vec_u8_t srv16_13 = srv12;
    vec_u8_t srv16_14 = srv13; 
    vec_u8_t srv16_15 = srv14;

    vec_u8_t  srv16 = vec_perm(s1, s1, mask0);  
    vec_u8_t  srv17 = vec_perm(s1, s1, mask1);
    vec_u8_t  srv18 = vec_perm(s1, s1, mask2);
    vec_u8_t  srv19 = vec_perm(s1, s1, mask3);
    vec_u8_t  srv20 = vec_perm(s1, s1, mask4);
    vec_u8_t  srv21 = vec_perm(s1, s1, mask5);
    vec_u8_t  srv22 = vec_perm(s1, s1, mask6);
    vec_u8_t  srv23 = vec_perm(s1, s1, mask7);
    vec_u8_t  srv24 = vec_perm(s1, s1, mask8);
    vec_u8_t  srv25 = vec_perm(s1, s1, mask9);
    vec_u8_t  srv26 = vec_perm(s1, s1, mask10);
    vec_u8_t  srv27 = vec_perm(s1, s1, mask11);
    vec_u8_t  srv28 = vec_perm(s1, s1, mask12);
    vec_u8_t  srv29 = vec_perm(s1, s1, mask13);
    vec_u8_t  srv30 = vec_perm(s1, s1, mask14);
    vec_u8_t  srv31 = vec_perm(s2, s2, mask15);

    vec_u8_t  srv16_16 = vec_perm(s1, s1, mask16_0);  
    vec_u8_t  srv16_17 = srv16;
    vec_u8_t  srv16_18 = srv17;
    vec_u8_t  srv16_19 = srv18;
    vec_u8_t  srv16_20 = srv19;
    vec_u8_t  srv16_21 = srv20;
    vec_u8_t  srv16_22 = srv21;
    vec_u8_t  srv16_23 = srv22;
    vec_u8_t  srv16_24 = srv23;
    vec_u8_t  srv16_25 = srv24;
    vec_u8_t  srv16_26 = srv25;
    vec_u8_t  srv16_27 = srv26;
    vec_u8_t  srv16_28 = srv27;
    vec_u8_t  srv16_29 = srv28;
    vec_u8_t  srv16_30 = srv29;
    vec_u8_t  srv16_31 = srv30;

    vec_u8_t srv0add1 = srv1;
    vec_u8_t srv1add1 = srv2;
    vec_u8_t srv2add1 = srv3;
    vec_u8_t srv3add1 = srv4;
    vec_u8_t srv4add1 = srv5; 
    vec_u8_t srv5add1 = srv6; 
    vec_u8_t srv6add1 = srv7;
    vec_u8_t srv7add1 = srv8; 
    vec_u8_t srv8add1 = srv9;
    vec_u8_t srv9add1 = srv10;
    vec_u8_t srv10add1 = srv11;
    vec_u8_t srv11add1 = srv12;
    vec_u8_t srv12add1 = srv13; 
    vec_u8_t srv13add1 = srv14;
    vec_u8_t srv14add1 = srv15; 
    vec_u8_t srv15add1 = srv16;

    vec_u8_t srv16add1_0 = srv16_1;
    vec_u8_t srv16add1_1 = srv16_2;
    vec_u8_t srv16add1_2 = srv16_3;
    vec_u8_t srv16add1_3 = srv16_4;
    vec_u8_t srv16add1_4 = srv16_5; 
    vec_u8_t srv16add1_5 = srv16_6;
    vec_u8_t srv16add1_6 = srv16_7; 
    vec_u8_t srv16add1_7 = srv16_8;
    vec_u8_t srv16add1_8 = srv16_9; 
    vec_u8_t srv16add1_9 = srv16_10;
    vec_u8_t srv16add1_10 = srv16_11;
    vec_u8_t srv16add1_11 = srv16_12;
    vec_u8_t srv16add1_12 = srv16_13; 
    vec_u8_t srv16add1_13 = srv16_14;
    vec_u8_t srv16add1_14 = srv16_15; 
    vec_u8_t srv16add1_15 = srv16_16;

    vec_u8_t  srv16add1 =  srv17;  
    vec_u8_t  srv17add1 = srv18;
    vec_u8_t  srv18add1 = srv19;
    vec_u8_t  srv19add1 = srv20;
    vec_u8_t  srv20add1 = srv21;
    vec_u8_t  srv21add1 = srv22;
    vec_u8_t  srv22add1 = srv23;
    vec_u8_t  srv23add1 = srv24;
    vec_u8_t  srv24add1 = srv25;
    vec_u8_t  srv25add1 = srv26;
    vec_u8_t  srv26add1 = srv27;
    vec_u8_t  srv27add1 = srv28;
    vec_u8_t  srv28add1 = srv29;
    vec_u8_t  srv29add1 = srv30;
    vec_u8_t  srv30add1 = srv31;
    vec_u8_t  srv31add1 = vec_perm(s2, s2, maskadd1_31);

    vec_u8_t  srv16add1_16 = srv16_17;   
    vec_u8_t  srv16add1_17 = srv16_18;
    vec_u8_t  srv16add1_18 = srv16_19;
    vec_u8_t  srv16add1_19 = srv16_20;
    vec_u8_t  srv16add1_20 = srv16_21;
    vec_u8_t  srv16add1_21 = srv16_22;
    vec_u8_t  srv16add1_22 = srv16_23;
    vec_u8_t  srv16add1_23 = srv16_24;
    vec_u8_t  srv16add1_24 = srv16_25;
    vec_u8_t  srv16add1_25 = srv16_26;
    vec_u8_t  srv16add1_26 = srv16_27;
    vec_u8_t  srv16add1_27 = srv16_28;
    vec_u8_t  srv16add1_28 = srv16_29;
    vec_u8_t  srv16add1_29 = srv16_30;
    vec_u8_t  srv16add1_30 = srv16_31;
    vec_u8_t  srv16add1_31 = vec_perm(s2, s2, maskadd1_16_31);

/* The interpolation fraction depends only on the column; with an angle of -2
   the fractions for columns 16-31 repeat those of columns 0-15
   (16 * 2 == 32), so both halves share one table. */
vec_u8_t vfrac32_0 = (vec_u8_t){30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0, };
vec_u8_t vfrac32_1 = vfrac32_0;
vec_u8_t vfrac32_32_0 = (vec_u8_t){2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, };
vec_u8_t vfrac32_32_1 = vfrac32_32_0;
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv1add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv2add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv3add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv4add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv5add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv6add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv7add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv8add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv9add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv10add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv11add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv12add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv13add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv14add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv15add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv17add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv18add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv19add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv20add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv21add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv22add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv23add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv24add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv25add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv26add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv27add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv28add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv29add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv30add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv31add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
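
/* For reference only, a minimal scalar sketch (not part of the original file)
 * of what the 32-wide specialization above computes.  The fraction table
 * {30, 28, ..., 0} and the projected sample src[16] are consistent with
 * intraPredAngle = -2 (mode 11); `ref` is an illustrative name for the
 * extended left reference assembled by the vec_perm steps, with ref[-1] the
 * top-left sample:
 *
 *     for (int y = 0; y < 32; y++)
 *         for (int x = 0; x < 32; x++)
 *         {
 *             int frac = ((x + 1) * -2) & 31;   // per-column fraction
 *             int off  = ((x + 1) * -2) >> 5;   // -1 or -2
 *             dst[y * dstStride + x] = (pixel)(((32 - frac) * ref[y + off] +
 *                                      frac * ref[y + off + 1] + 16) >> 5);
 *         }
 *
 * Note that for these horizontal modes the fraction varies along x while the
 * reference index advances with y, i.e. the transpose of the generic formula
 * quoted in the comments below. */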


template<>
void intra_pred<4, 12>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, };
    vec_u8_t mask1={0x1, 0x1, 0x1, 0x1, 0x2, 0x2, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3, 0x4, 0x4, 0x4, 0x4, };
        
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){27, 22, 17, 12, 27, 22, 17, 12, 27, 22, 17, 12, 27, 22, 17, 12, };
    vec_u8_t vfrac4_32 = (vec_u8_t){5, 10, 15, 20, 5, 10, 15, 20, 5, 10, 15, 20, 5, 10, 15, 20, };

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
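    /* vec_mule/vec_mulo widen the even/odd byte lanes into 16-bit products, so
       the weighted sum and the >> 5 below are computed at 16-bit precision;
       vec_mergeh/vec_mergel plus vec_pack then re-interleave the even and odd
       halves back into the original byte order. */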
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
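
/* Derivation sketch for the constants above (illustrative, not from the
 * original file): mode 12 uses intraPredAngle = -5, so per column
 * frac[x] = ((x + 1) * -5) & 31 = {27, 22, 17, 12} for x = 0..3, and
 * vfrac4_32 holds the complementary weights 32 - frac[x] = {5, 10, 15, 20}:
 *
 *     for (int x = 0; x < 4; x++)
 *     {
 *         int f = ((x + 1) * -5) & 31;   // 27, 22, 17, 12
 *         assert(f == vfrac4[x] && 32 - f == vfrac4_32[x]);
 *     }
 */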

template<>
void intra_pred<8, 12>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, };
vec_u8_t mask1={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, };
vec_u8_t mask2={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, };
vec_u8_t mask3={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, };
vec_u8_t mask4={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, };
vec_u8_t mask5={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, };
vec_u8_t mask6={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, };
vec_u8_t mask7={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(17, srcPix0); 
    vec_u8_t refmask_8={0x6, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };

    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);

vec_u8_t vfrac8 = (vec_u8_t){27, 22, 17, 12, 7, 2, 29, 24, 27, 22, 17, 12, 7, 2, 29, 24, };
vec_u8_t vfrac8_32 = (vec_u8_t){5, 10, 15, 20, 25, 30, 3, 8, 5, 10, 15, 20, 25, 30, 3, 8, };
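
    /* one_line(lo, hi, vf32, vf, out), used throughout this file, expands to
       the same vec_mule/vec_mulo + add + (>> 5) + vec_pack sequence written
       out explicitly in intra_pred<4, 12> above: per byte lane,
       out[i] = (vf32[i] * lo[i] + vf[i] * hi[i] + 16) >> 5.  It relies on the
       u16_16/u16_5 constants and the vmle0/vmlo0/vsume/ve (etc.) temporaries
       declared above. */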

    one_line(srv0, srv1, vfrac8_32, vfrac8, vout_0);
    one_line(srv2, srv3, vfrac8_32, vfrac8, vout_1);
    one_line(srv4, srv5, vfrac8_32, vfrac8, vout_2);
    one_line(srv6, srv7, vfrac8_32, vfrac8, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 12>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, };
vec_u8_t mask1={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, };
vec_u8_t mask2={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask3={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask4={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask5={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask6={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask7={0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask8={0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, };
vec_u8_t mask9={0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, };
vec_u8_t mask10={0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, };
vec_u8_t mask11={0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, };
vec_u8_t mask12={0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, };
vec_u8_t mask13={0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, };
vec_u8_t mask14={0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, };
vec_u8_t mask15={0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, };

vec_u8_t maskadd1_15={0x12, 0x12, 0x12, 0x12, 0x12, 0x12, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0x10, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(33, srcPix0); 
    vec_u8_t refmask_16={0xd, 0x6, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(46, srcPix0);
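
    /* refmask_16 builds the extended reference in s0: lanes 0xd and 0x6 pick
       the above-row samples projected with the inverse angle (for the mode-12
       tables below, intraPredAngle = -5 and invAngle = 1638, so
       (k * 1638 + 128) >> 8 gives 6 and 13 for k = 1, 2), followed by the
       top-left sample (lane 0x00) and the left column from offset 33 (lanes
       0x10 and up select from srv_right). */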
        
    vec_u8_t srv0 = vec_perm(s0, s0, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s0, mask1);
    vec_u8_t srv2 = vec_perm(s0, s0, mask2);
    vec_u8_t srv3 = vec_perm(s0, s0, mask3);
    vec_u8_t srv4 = vec_perm(s0, s0, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s0, mask5);
    vec_u8_t srv6 = vec_perm(s0, s0, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s0, mask7);
    vec_u8_t srv8 = vec_perm(s0, s0, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s0, mask9);
    vec_u8_t srv10 = vec_perm(s0, s0, mask10);
    vec_u8_t srv11 = vec_perm(s0, s0, mask11);
    vec_u8_t srv12 = vec_perm(s0, s0, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s0, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = srv1; 
    vec_u8_t srv1_add1 = srv2;
    vec_u8_t srv2_add1 = srv3;
    vec_u8_t srv3_add1 = srv4;
    vec_u8_t srv4_add1 = srv5; 
    vec_u8_t srv5_add1 = srv6; 
    vec_u8_t srv6_add1 = srv7;
    vec_u8_t srv7_add1 = srv8; 
    vec_u8_t srv8_add1 = srv9;
    vec_u8_t srv9_add1 = srv10;
    vec_u8_t srv10_add1 = srv11;
    vec_u8_t srv11_add1 = srv12;
    vec_u8_t srv12_add1 = srv13; 
    vec_u8_t srv13_add1 = srv14;
    vec_u8_t srv14_add1 = srv15; 
    vec_u8_t srv15_add1 = vec_perm(s1, s1, maskadd1_15);

vec_u8_t vfrac16 = (vec_u8_t){27, 22, 17, 12, 7, 2, 29, 24, 19, 14, 9, 4, 31, 26, 21, 16, };
vec_u8_t vfrac16_32 = (vec_u8_t){5, 10, 15, 20, 25, 30, 3, 8, 13, 18, 23, 28, 1, 6, 11, 16, };

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32, vfrac16, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32, vfrac16, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32, vfrac16, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32, vfrac16, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32, vfrac16, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32, vfrac16, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void intra_pred<32, 12>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask1={0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask2={0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask3={0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask4={0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask5={0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask6={0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, };
vec_u8_t mask7={0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, };
vec_u8_t mask8={0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, };
vec_u8_t mask9={0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, };
vec_u8_t mask10={0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, };
vec_u8_t mask11={0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, };
vec_u8_t mask12={0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, };
vec_u8_t mask13={0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, };
vec_u8_t mask14={0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, };
vec_u8_t mask15={0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, };

vec_u8_t mask16_0={0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, };
vec_u8_t mask16_1={0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, };
vec_u8_t mask16_2={0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask16_3={0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask16_4={0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask16_5={0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask16_6={0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask16_7={0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask16_8={0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, };
vec_u8_t mask16_9={0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, 0x9, };
vec_u8_t mask16_10={0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, };
vec_u8_t mask16_11={0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, 0xb, };
vec_u8_t mask16_12={0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, 0xc, };
vec_u8_t mask16_13={0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, 0xd, };
vec_u8_t mask16_14={0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, 0xe, };
vec_u8_t mask16_15={0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, };

vec_u8_t maskadd1_31={0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t maskadd1_16_31={0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
    vec_u8_t refmask_32_0={0x1a, 0x13, 0xd, 0x6, 0x00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    vec_u8_t refmask_32_1={0x0, 0x1, 0x2, 0x3, 0x4, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );    
    vec_u8_t s1 = vec_xl(76, srcPix0);  
    vec_u8_t s2 = vec_xl(92, srcPix0);  
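
    /* Same reference assembly as the 16-wide version, but with the four
       projected above-row samples a 32-wide mode-12 block needs: invAngle =
       1638 places them at offsets 6, 13, 19 and 26 (lanes 0x6, 0xd, 0x13,
       0x1a of refmask_32_0), followed by the top-left sample and the left
       column from offset 65; s1 and s2 continue the left reference at
       offsets 76 and 92. */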

    vec_u8_t srv0 = vec_perm(s0, s0, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s0, mask1);
    vec_u8_t srv2 = vec_perm(s0, s0, mask2);
    vec_u8_t srv3 = vec_perm(s0, s0, mask3);
    vec_u8_t srv4 = vec_perm(s0, s0, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s0, mask5);
    vec_u8_t srv6 = vec_perm(s0, s0, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s0, mask7);
    vec_u8_t srv8 = vec_perm(s0, s0, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s0, mask9);
    vec_u8_t srv10 = vec_perm(s0, s0, mask10);
    vec_u8_t srv11 = vec_perm(s0, s0, mask11);
    vec_u8_t srv12 = vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s1, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s1, s1, mask15);

    vec_u8_t srv16_0 = vec_perm(s0, s0, mask16_0); 
    vec_u8_t srv16_1 = vec_perm(s0, s0, mask16_1);
    vec_u8_t srv16_2 = vec_perm(s0, s0, mask16_2);
    vec_u8_t srv16_3 = vec_perm(s0, s0, mask16_3);
    vec_u8_t srv16_4 = vec_perm(s0, s0, mask16_4); 
    vec_u8_t srv16_5 = vec_perm(s0, s0, mask16_5);
    vec_u8_t srv16_6 = vec_perm(s0, s0, mask16_6); 
    vec_u8_t srv16_7 = vec_perm(s0, s0, mask16_7);
    vec_u8_t srv16_8 = vec_perm(s0, s0, mask16_8); 
    vec_u8_t srv16_9 = vec_perm(s0, s0, mask16_9);
    vec_u8_t srv16_10 = vec_perm(s0, s0, mask16_10);
    vec_u8_t srv16_11 = vec_perm(s0, s0, mask16_11);
    vec_u8_t srv16_12 = vec_perm(s0, s0, mask16_12); 
    vec_u8_t srv16_13 = vec_perm(s0, s0, mask16_13);
    vec_u8_t srv16_14 = vec_perm(s0, s1, mask16_14); 
    vec_u8_t srv16_15 = vec_perm(s0, s1, mask16_15);

    vec_u8_t  srv16 = vec_perm(s1, s1, mask0);  
    vec_u8_t  srv17 = vec_perm(s1, s1, mask1);
    vec_u8_t  srv18 = vec_perm(s1, s1, mask2);
    vec_u8_t  srv19 = vec_perm(s1, s1, mask3);
    vec_u8_t  srv20 = vec_perm(s1, s1, mask4);
    vec_u8_t  srv21 = vec_perm(s1, s1, mask5);
    vec_u8_t  srv22 = vec_perm(s1, s1, mask6);
    vec_u8_t  srv23 = vec_perm(s1, s1, mask7);
    vec_u8_t  srv24 = vec_perm(s1, s1, mask8);
    vec_u8_t  srv25 = vec_perm(s1, s1, mask9);
    vec_u8_t  srv26 = vec_perm(s1, s1, mask10);
    vec_u8_t  srv27 = vec_perm(s1, s1, mask11);
    vec_u8_t  srv28 = vec_perm(s1, s2, mask12);
    vec_u8_t  srv29 = vec_perm(s1, s2, mask13);
    vec_u8_t  srv30 = vec_perm(s2, s2, mask14);
    vec_u8_t  srv31 = vec_perm(s2, s2, mask15);

    vec_u8_t  srv16_16 = vec_perm(s1, s1, mask16_0);  
    vec_u8_t  srv16_17 = vec_perm(s1, s1, mask16_1);
    vec_u8_t  srv16_18 = vec_perm(s1, s1, mask16_2);
    vec_u8_t  srv16_19 = vec_perm(s1, s1, mask16_3);
    vec_u8_t  srv16_20 = vec_perm(s1, s1, mask16_4);
    vec_u8_t  srv16_21 = vec_perm(s1, s1, mask16_5);
    vec_u8_t  srv16_22 = vec_perm(s1, s1, mask16_6);
    vec_u8_t  srv16_23 = vec_perm(s1, s1, mask16_7);
    vec_u8_t  srv16_24 = vec_perm(s1, s1, mask16_8);
    vec_u8_t  srv16_25 = vec_perm(s1, s1, mask16_9);
    vec_u8_t  srv16_26 = vec_perm(s1, s1, mask16_10);
    vec_u8_t  srv16_27 = vec_perm(s1, s1, mask16_11);
    vec_u8_t  srv16_28 = vec_perm(s1, s1, mask16_12);
    vec_u8_t  srv16_29 = vec_perm(s1, s1, mask16_13);
    vec_u8_t  srv16_30 = vec_perm(s1, s2, mask16_14);
    vec_u8_t  srv16_31 = vec_perm(s1, s2, mask16_15);
        
    vec_u8_t srv0add1 = srv1;
    vec_u8_t srv1add1 = srv2;
    vec_u8_t srv2add1 = srv3;
    vec_u8_t srv3add1 = srv4;
    vec_u8_t srv4add1 = srv5; 
    vec_u8_t srv5add1 = srv6; 
    vec_u8_t srv6add1 = srv7;
    vec_u8_t srv7add1 = srv8; 
    vec_u8_t srv8add1 = srv9;
    vec_u8_t srv9add1 = srv10;
    vec_u8_t srv10add1 = srv11;
    vec_u8_t srv11add1 = srv12;
    vec_u8_t srv12add1 = srv13; 
    vec_u8_t srv13add1 = srv14;
    vec_u8_t srv14add1 = srv15; 
    vec_u8_t srv15add1 = srv16;

    vec_u8_t srv16add1_0 = srv16_1;
    vec_u8_t srv16add1_1 = srv16_2;
    vec_u8_t srv16add1_2 = srv16_3;
    vec_u8_t srv16add1_3 = srv16_4;
    vec_u8_t srv16add1_4 = srv16_5; 
    vec_u8_t srv16add1_5 = srv16_6;
    vec_u8_t srv16add1_6 = srv16_7; 
    vec_u8_t srv16add1_7 = srv16_8;
    vec_u8_t srv16add1_8 = srv16_9; 
    vec_u8_t srv16add1_9 = srv16_10;
    vec_u8_t srv16add1_10 = srv16_11;
    vec_u8_t srv16add1_11 = srv16_12;
    vec_u8_t srv16add1_12 = srv16_13; 
    vec_u8_t srv16add1_13 = srv16_14;
    vec_u8_t srv16add1_14 = srv16_15; 
    vec_u8_t srv16add1_15 = srv16_16;

    vec_u8_t  srv16add1 =  srv17;  
    vec_u8_t  srv17add1 = srv18;
    vec_u8_t  srv18add1 = srv19;
    vec_u8_t  srv19add1 = srv20;
    vec_u8_t  srv20add1 = srv21;
    vec_u8_t  srv21add1 = srv22;
    vec_u8_t  srv22add1 = srv23;
    vec_u8_t  srv23add1 = srv24;
    vec_u8_t  srv24add1 = srv25;
    vec_u8_t  srv25add1 = srv26;
    vec_u8_t  srv26add1 = srv27;
    vec_u8_t  srv27add1 = srv28;
    vec_u8_t  srv28add1 = srv29;
    vec_u8_t  srv29add1 = srv30;
    vec_u8_t  srv30add1 = srv31;
    vec_u8_t  srv31add1 = vec_perm(s2, s2, maskadd1_31);

    vec_u8_t  srv16add1_16 = srv16_17;   
    vec_u8_t  srv16add1_17 = srv16_18;
    vec_u8_t  srv16add1_18 = srv16_19;
    vec_u8_t  srv16add1_19 = srv16_20;
    vec_u8_t  srv16add1_20 = srv16_21;
    vec_u8_t  srv16add1_21 = srv16_22;
    vec_u8_t  srv16add1_22 = srv16_23;
    vec_u8_t  srv16add1_23 = srv16_24;
    vec_u8_t  srv16add1_24 = srv16_25;
    vec_u8_t  srv16add1_25 = srv16_26;
    vec_u8_t  srv16add1_26 = srv16_27;
    vec_u8_t  srv16add1_27 = srv16_28;
    vec_u8_t  srv16add1_28 = srv16_29;
    vec_u8_t  srv16add1_29 = srv16_30;
    vec_u8_t  srv16add1_30 = srv16_31;
    vec_u8_t  srv16add1_31 = vec_perm(s2, s2, maskadd1_16_31);

vec_u8_t vfrac32_0 = (vec_u8_t){27, 22, 17, 12, 7, 2, 29, 24, 19, 14, 9, 4, 31, 26, 21, 16, };
vec_u8_t vfrac32_1 = (vec_u8_t){11, 6, 1, 28, 23, 18, 13, 8, 3, 30, 25, 20, 15, 10, 5, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){5, 10, 15, 20, 25, 30, 3, 8, 13, 18, 23, 28, 1, 6, 11, 16, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){21, 26, 31, 4, 9, 14, 19, 24, 29, 2, 7, 12, 17, 22, 27, 32, };

    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv1add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv2add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv3add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv4add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv5add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv6add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv7add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv8add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv9add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv10add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv11add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv12add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv13add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv14add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv15add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv17add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv18add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv19add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv20add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv21add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv22add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv23add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv24add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv25add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv26add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv27add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv28add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv29add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv30add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv31add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
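
/* Illustrative cross-check (not from the original source): the two fraction
 * tables of the 32-wide path are simply the two halves of the same sequence
 * frac[x] = ((x + 1) * -5) & 31, columns 0-15 in vfrac32_0 and columns 16-31
 * in vfrac32_1, with the companion tables holding 32 - frac (a full weight of
 * 32 when frac == 0):
 *
 *     for (int x = 0; x < 32; x++)
 *     {
 *         int f = ((x + 1) * -5) & 31;
 *         assert(f == (x < 16 ? vfrac32_0[x] : vfrac32_1[x - 16]));
 *         assert(32 - f == (x < 16 ? vfrac32_32_0[x] : vfrac32_32_1[x - 16]));
 *     }
 */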


template<>
void intra_pred<4, 13>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x1, 0x1, 0x1, 0x0, 0x2, 0x2, 0x2, 0x1, 0x3, 0x3, 0x3, 0x2, 0x4, 0x4, 0x4, 0x3, };
    vec_u8_t mask1={0x2, 0x2, 0x2, 0x1, 0x3, 0x3, 0x3, 0x2, 0x4, 0x4, 0x4, 0x3, 0x5, 0x5, 0x5, 0x4, };
        
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x4, 0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

vec_u8_t vfrac4 = (vec_u8_t){23, 14, 5, 28, 23, 14, 5, 28, 23, 14, 5, 28, 23, 14, 5, 28, };
vec_u8_t vfrac4_32 = (vec_u8_t){9, 18, 27, 4, 9, 18, 27, 4, 9, 18, 27, 4, 9, 18, 27, 4, };

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
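
/* As with mode 12, a quick derivation sketch (illustrative only): mode 13
 * uses intraPredAngle = -9, so frac[x] = ((x + 1) * -9) & 31 = {23, 14, 5, 28}
 * for x = 0..3 (exactly the vfrac4 lanes above), and vfrac4_32 = 32 - frac:
 *
 *     for (int x = 0; x < 4; x++)
 *     {
 *         int f = ((x + 1) * -9) & 31;   // 23, 14, 5, 28
 *         assert(f == vfrac4[x] && 32 - f == vfrac4_32[x]);
 *     }
 */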

template<>
void intra_pred<8, 13>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, 0x0, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, 0x1, };
    vec_u8_t mask1={0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, 0x1, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x2, };
    vec_u8_t mask2={0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x2, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x3, };
    vec_u8_t mask3={0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x3, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x4, };
    vec_u8_t mask4={0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x4, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x5, };
    vec_u8_t mask5={0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x6, };
    vec_u8_t mask6={0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x6, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x7, };
    vec_u8_t mask7={0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x7, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, 0x8, };
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(17, srcPix0); 
    vec_u8_t refmask_8={0x7, 0x4, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
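
    /* For mode 13 (intraPredAngle = -9) the inverse angle is 910, so the two
       projected above-row samples sit at offsets (1 * 910 + 128) >> 8 = 4 and
       (2 * 910 + 128) >> 8 = 7; hence the 0x7, 0x4 lanes at the front of
       refmask_8, ahead of the top-left sample and the left column. */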
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);

    vec_u8_t vfrac8 = (vec_u8_t){23, 14, 5, 28, 19, 10, 1, 24, 23, 14, 5, 28, 19, 10, 1, 24, };
    vec_u8_t vfrac8_32 = (vec_u8_t){9, 18, 27, 4, 13, 22, 31, 8, 9, 18, 27, 4, 13, 22, 31, 8, };

    one_line(srv0, srv1, vfrac8_32, vfrac8, vout_0);
    one_line(srv2, srv3, vfrac8_32, vfrac8, vout_1);
    one_line(srv4, srv5, vfrac8_32, vfrac8, vout_2);
    one_line(srv6, srv7, vfrac8_32, vfrac8, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
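
/* Note on the strided store paths used by the specializations above: when
 * dstStride exceeds the block width, each result vector is merged with the
 * bytes already at the destination (vec_xl of the dst row, then vec_perm with
 * a v_mask that keeps lanes 0x14-0x1f, or 0x18-0x1f for width 8, from the
 * loaded data) before vec_xst writes it back, so the 16-byte store never
 * clobbers pixels beyond the block.  The dstStride % 16 == 0 fast path in the
 * 4-wide versions instead uses vec_ste to store a single aligned 32-bit
 * element per row. */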

template<>
void intra_pred<16, 13>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, };
vec_u8_t mask1={0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, };
vec_u8_t mask2={0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, };
vec_u8_t mask3={0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, };
vec_u8_t mask4={0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, };
vec_u8_t mask5={0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, };
vec_u8_t mask6={0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, };
vec_u8_t mask7={0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, };
vec_u8_t mask8={0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, };
vec_u8_t mask9={0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, };
vec_u8_t mask10={0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, };
vec_u8_t mask11={0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, };
vec_u8_t mask12={0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, };
vec_u8_t mask13={0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, };
vec_u8_t mask14={0x12, 0x12, 0x12, 0x11, 0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, };
vec_u8_t mask15={0x13, 0x13, 0x13, 0x12, 0x12, 0x12, 0x12, 0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, };
vec_u8_t maskadd1_15={0x14, 0x14, 0x14, 0x13, 0x13, 0x13, 0x13, 0x12, 0x12, 0x12, 0x11, 0x11, 0x11, 0x11, 0x10, 0x10, };


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(0, srcPix0); /* top-left + above-row samples, srcPix0[0..15] */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* left-column reference, srcPix0[2*16+1 ..] */
    vec_u8_t refmask_16={0xe, 0xb, 0x7, 0x4, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(44, srcPix0);  
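    /* s0 = { projected above-row samples srcPix0[14, 11, 7, 4] (mode 13,
       invAngle 910), top-left srcPix0[0], left-column samples srcPix0[33..43] };
       s1 continues at srcPix0[44], so s0:s1 acts as one contiguous reference
       array for the vec_perm gathers below. */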
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12 = vec_perm(s0, s1, mask12);
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = srv1; 
    vec_u8_t srv1_add1 = srv2;
    vec_u8_t srv2_add1 = srv3;
    vec_u8_t srv3_add1 = srv4;
    vec_u8_t srv4_add1 = srv5; 
    vec_u8_t srv5_add1 = srv6; 
    vec_u8_t srv6_add1 = srv7;
    vec_u8_t srv7_add1 = srv8; 
    vec_u8_t srv8_add1 = srv9;
    vec_u8_t srv9_add1 = srv10;
    vec_u8_t srv10_add1 = srv11;
    vec_u8_t srv11_add1 = srv12;
    vec_u8_t srv12_add1 = srv13;
    vec_u8_t srv13_add1 = srv14;
    vec_u8_t srv14_add1 = srv15; 
    vec_u8_t srv15_add1 = vec_perm(s0, s1, maskadd1_15);
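    /* mask(N+1) is maskN advanced by one reference sample, so row N's
       ref[offset + x + 1] operand is simply srv(N+1); only row 15 needs the
       extra maskadd1_15 shuffle. */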

vec_u8_t vfrac16 = (vec_u8_t){23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, };
vec_u8_t vfrac16_32 = (vec_u8_t){9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, };
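/* Per-column interpolation weights for intra angle -9 (mode 13):
   vfrac16[x] = ((x + 1) * -9) & 31 and vfrac16_32[x] = 32 - vfrac16[x]. */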

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
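    /* one_line() is the macro form of the widening multiply / round / shift /
       pack sequence written out inline in intra_pred<4, 14> further below;
       per byte lane it computes (scalar sketch, names illustrative):
           out[x] = (pixel)((w32[x] * ref[x] + w[x] * refp1[x] + 16) >> 5);  */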

    one_line(srv0, srv0_add1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32, vfrac16, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32, vfrac16, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32, vfrac16, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32, vfrac16, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32, vfrac16, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32, vfrac16, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 13>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, };
vec_u8_t mask1={0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, };
vec_u8_t mask2={0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, };
vec_u8_t mask3={0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, };
vec_u8_t mask4={0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, };
vec_u8_t mask5={0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, };
vec_u8_t mask6={0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, };
vec_u8_t mask7={0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, };
vec_u8_t mask8={0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, };
vec_u8_t mask9={0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, };
vec_u8_t mask10={0x12, 0x12, 0x12, 0x11, 0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, };
vec_u8_t mask11={0x13, 0x13, 0x13, 0x12, 0x12, 0x12, 0x12, 0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, };
vec_u8_t mask12={0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, };
vec_u8_t mask13={0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, };
vec_u8_t mask14={0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, };
vec_u8_t mask15={0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, };

vec_u8_t mask16_0={0x4, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, };
vec_u8_t mask16_1={0x5, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, };
vec_u8_t mask16_2={0x6, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x2, };
vec_u8_t mask16_3={0x7, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, 0x3, 0x3, 0x3, 0x3, };
vec_u8_t mask16_4={0x8, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, 0x4, 0x4, };
vec_u8_t mask16_5={0x9, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, };
vec_u8_t mask16_6={0xa, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x6, };
vec_u8_t mask16_7={0xb, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, };
vec_u8_t mask16_8={0xc, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, 0x8, 0x8, 0x8, 0x8, };
vec_u8_t mask16_9={0xd, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, 0x9, 0x9, 0x9, 0x9, };
vec_u8_t mask16_10={0xe, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, 0xa, 0xa, 0xa, 0xa, };
vec_u8_t mask16_11={0xf, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, 0xb, 0xb, 0xb, 0xb, };
vec_u8_t mask16_12={0x10, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, 0xc, 0xc, 0xc, 0xc, };
vec_u8_t mask16_13={0x11, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, 0xd, 0xd, 0xd, 0xd, };
vec_u8_t mask16_14={0x12, 0x11, 0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, 0xe, 0xe, 0xe, 0xe, };
vec_u8_t mask16_15={0x13, 0x12, 0x12, 0x12, 0x12, 0x11, 0x11, 0x11, 0x10, 0x10, 0x10, 0x10, 0xf, 0xf, 0xf, 0xf, };

vec_u8_t maskadd1_31={0x8, 0x8, 0x8, 0x7, 0x7, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x5, 0x5, 0x5, 0x4, 0x4, };
vec_u8_t maskadd1_16_31={0x4, 0x3, 0x3, 0x3, 0x3, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, };
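/* A 32-wide row is produced as two 16-byte halves: srvN with the vfrac32_*_0
   weights covers columns 0-15, srv16_N with the vfrac32_*_1 weights covers
   columns 16-31.  Rows 16-31 reuse the same masks shifted one source vector
   along (s1/s2 instead of s0/s1). */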

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
    vec_u8_t refmask_32_0={0x1c, 0x19, 0x15, 0x12, 0xe, 0xb, 0x7, 0x4, 0x00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    vec_u8_t refmask_32_1={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );    
    vec_u8_t s1 = vec_xl(72, srcPix0);  
    vec_u8_t s2 = vec_xl(88, srcPix0);  
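    /* s0 = { projected above-row samples srcPix0[28, 25, 21, 18, 14, 11, 7, 4]
       (invAngle 910), top-left srcPix0[0], left-column samples srcPix0[65..71] };
       s1 and s2 continue the left column at srcPix0[72] and srcPix0[88], keeping
       the reference contiguous across the three vectors. */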

    vec_u8_t srv0 = vec_perm(s0, s0, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s0, mask1);
    vec_u8_t srv2 = vec_perm(s0, s0, mask2);
    vec_u8_t srv3 = vec_perm(s0, s0, mask3);
    vec_u8_t srv4 = vec_perm(s0, s0, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s0, mask5);
    vec_u8_t srv6 = vec_perm(s0, s0, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s0, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12 = vec_perm(s1, s1, mask12);
    vec_u8_t srv13 = vec_perm(s1, s1, mask13);
    vec_u8_t srv14 = vec_perm(s1, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s1, s1, mask15);

    vec_u8_t srv16_0 = vec_perm(s0, s0, mask16_0); 
    vec_u8_t srv16_1 = vec_perm(s0, s0, mask16_1);
    vec_u8_t srv16_2 = vec_perm(s0, s0, mask16_2);
    vec_u8_t srv16_3 = vec_perm(s0, s0, mask16_3);
    vec_u8_t srv16_4 = vec_perm(s0, s0, mask16_4); 
    vec_u8_t srv16_5 = vec_perm(s0, s0, mask16_5);
    vec_u8_t srv16_6 = vec_perm(s0, s0, mask16_6); 
    vec_u8_t srv16_7 = vec_perm(s0, s0, mask16_7);
    vec_u8_t srv16_8 = vec_perm(s0, s0, mask16_8); 
    vec_u8_t srv16_9 = vec_perm(s0, s0, mask16_9);
    vec_u8_t srv16_10 = vec_perm(s0, s0, mask16_10);
    vec_u8_t srv16_11 = vec_perm(s0, s0, mask16_11);
    vec_u8_t srv16_12 = vec_perm(s0, s1, mask16_12);
    vec_u8_t srv16_13 = vec_perm(s0, s1, mask16_13);
    vec_u8_t srv16_14 = vec_perm(s0, s1, mask16_14); 
    vec_u8_t srv16_15 = vec_perm(s0, s1, mask16_15);

    vec_u8_t  srv16 = vec_perm(s1, s1, mask0);  
    vec_u8_t  srv17 = vec_perm(s1, s1, mask1);
    vec_u8_t  srv18 = vec_perm(s1, s1, mask2);
    vec_u8_t  srv19 = vec_perm(s1, s1, mask3);
    vec_u8_t  srv20 = vec_perm(s1, s1, mask4);
    vec_u8_t  srv21 = vec_perm(s1, s1, mask5);
    vec_u8_t  srv22 = vec_perm(s1, s1, mask6);
    vec_u8_t  srv23 = vec_perm(s1, s1, mask7);
    vec_u8_t  srv24 = vec_perm(s1, s2, mask8);
    vec_u8_t  srv25 = vec_perm(s1, s2, mask9);
    vec_u8_t  srv26 = vec_perm(s1, s2, mask10);
    vec_u8_t  srv27 = vec_perm(s1, s2, mask11);
    vec_u8_t  srv28 = vec_perm(s2, s2, mask12);
    vec_u8_t  srv29 = vec_perm(s2, s2, mask13);
    vec_u8_t  srv30 = vec_perm(s2, s2, mask14);
    vec_u8_t  srv31 = vec_perm(s2, s2, mask15);

    vec_u8_t  srv16_16 = vec_perm(s1, s1, mask16_0);  
    vec_u8_t  srv16_17 = vec_perm(s1, s1, mask16_1);
    vec_u8_t  srv16_18 = vec_perm(s1, s1, mask16_2);
    vec_u8_t  srv16_19 = vec_perm(s1, s1, mask16_3);
    vec_u8_t  srv16_20 = vec_perm(s1, s1, mask16_4);
    vec_u8_t  srv16_21 = vec_perm(s1, s1, mask16_5);
    vec_u8_t  srv16_22 = vec_perm(s1, s1, mask16_6);
    vec_u8_t  srv16_23 = vec_perm(s1, s1, mask16_7);
    vec_u8_t  srv16_24 = vec_perm(s1, s1, mask16_8);
    vec_u8_t  srv16_25 = vec_perm(s1, s1, mask16_9);
    vec_u8_t  srv16_26 = vec_perm(s1, s1, mask16_10);
    vec_u8_t  srv16_27 = vec_perm(s1, s1, mask16_11);
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask16_12);
    vec_u8_t  srv16_29 = vec_perm(s1, s2, mask16_13);
    vec_u8_t  srv16_30 = vec_perm(s1, s2, mask16_14);
    vec_u8_t  srv16_31 = vec_perm(s1, s2, mask16_15);
        
    vec_u8_t srv0add1 = srv1;
    vec_u8_t srv1add1 = srv2;
    vec_u8_t srv2add1 = srv3;
    vec_u8_t srv3add1 = srv4;
    vec_u8_t srv4add1 = srv5; 
    vec_u8_t srv5add1 = srv6; 
    vec_u8_t srv6add1 = srv7;
    vec_u8_t srv7add1 = srv8; 
    vec_u8_t srv8add1 = srv9;
    vec_u8_t srv9add1 = srv10;
    vec_u8_t srv10add1 = srv11;
    vec_u8_t srv11add1 = srv12;
    vec_u8_t srv12add1 = srv13;
    vec_u8_t srv13add1 = srv14;
    vec_u8_t srv14add1 = srv15; 
    vec_u8_t srv15add1 = srv16;

    vec_u8_t srv16add1_0 = srv16_1;
    vec_u8_t srv16add1_1 = srv16_2;
    vec_u8_t srv16add1_2 = srv16_3;
    vec_u8_t srv16add1_3 = srv16_4;
    vec_u8_t srv16add1_4 = srv16_5; 
    vec_u8_t srv16add1_5 = srv16_6;
    vec_u8_t srv16add1_6 = srv16_7; 
    vec_u8_t srv16add1_7 = srv16_8;
    vec_u8_t srv16add1_8 = srv16_9; 
    vec_u8_t srv16add1_9 = srv16_10;
    vec_u8_t srv16add1_10 = srv16_11;
    vec_u8_t srv16add1_11 = srv16_12;
    vec_u8_t srv16add1_12 = srv16_13;
    vec_u8_t srv16add1_13 = srv16_14;
    vec_u8_t srv16add1_14 = srv16_15; 
    vec_u8_t srv16add1_15 = srv16_16;

    vec_u8_t  srv16add1 =  srv17;  
    vec_u8_t  srv17add1 = srv18;
    vec_u8_t  srv18add1 = srv19;
    vec_u8_t  srv19add1 = srv20;
    vec_u8_t  srv20add1 = srv21;
    vec_u8_t  srv21add1 = srv22;
    vec_u8_t  srv22add1 = srv23;
    vec_u8_t  srv23add1 = srv24;
    vec_u8_t  srv24add1 = srv25;
    vec_u8_t  srv25add1 = srv26;
    vec_u8_t  srv26add1 = srv27;
    vec_u8_t  srv27add1 = srv28;
    vec_u8_t  srv28add1 = srv29;
    vec_u8_t  srv29add1 = srv30;
    vec_u8_t  srv30add1 = srv31;
    vec_u8_t  srv31add1 = vec_perm(s2, s2, maskadd1_31);

    vec_u8_t  srv16add1_16 = srv16_17;   
    vec_u8_t  srv16add1_17 = srv16_18;
    vec_u8_t  srv16add1_18 = srv16_19;
    vec_u8_t  srv16add1_19 = srv16_20;
    vec_u8_t  srv16add1_20 = srv16_21;
    vec_u8_t  srv16add1_21 = srv16_22;
    vec_u8_t  srv16add1_22 = srv16_23;
    vec_u8_t  srv16add1_23 = srv16_24;
    vec_u8_t  srv16add1_24 = srv16_25;
    vec_u8_t  srv16add1_25 = srv16_26;
    vec_u8_t  srv16add1_26 = srv16_27;
    vec_u8_t  srv16add1_27 = srv16_28;
    vec_u8_t  srv16add1_28 = srv16_29;
    vec_u8_t  srv16add1_29 = srv16_30;
    vec_u8_t  srv16add1_30 = srv16_31;
    vec_u8_t  srv16add1_31 = vec_perm(s2, s2, maskadd1_16_31);

vec_u8_t vfrac32_0 = (vec_u8_t){23, 14, 5, 28, 19, 10, 1, 24, 15, 6, 29, 20, 11, 2, 25, 16, };
vec_u8_t vfrac32_1 = (vec_u8_t){7, 30, 21, 12, 3, 26, 17, 8, 31, 22, 13, 4, 27, 18, 9, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 32, };
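/* vfrac32_*_0 holds the angle -9 fractions for columns 0-15, vfrac32_*_1 for
   columns 16-31; column 31 lands exactly on a reference sample, hence the
   32/0 weight pair in the last lane. */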

    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv1add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv2add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv3add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv4add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv5add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv6add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv7add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv8add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv9add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv10add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv11add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv12add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv13add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv14add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv15add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv17add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv18add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv19add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv20add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv21add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv22add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv23add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv24add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv25add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv26add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv27add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv28add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv29add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv30add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv31add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}



template<>
void intra_pred<4, 14>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x1, 0x1, 0x0, 0x0, 0x2, 0x2, 0x1, 0x1, 0x3, 0x3, 0x2, 0x2, 0x4, 0x4, 0x3, 0x3, };
    vec_u8_t mask1={0x2, 0x2, 0x1, 0x1, 0x3, 0x3, 0x2, 0x2, 0x4, 0x4, 0x3, 0x3, 0x5, 0x5, 0x4, 0x4, };

    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);
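    /* Merged reference for mode 14 (angle -13, invAngle 630): one projected
       above-row sample (srcPix0[2]), top-left srcPix0[0], then the four
       left-column samples srcPix0[9..12]; the remaining lanes are unused. */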

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){19, 6, 25, 12, 19, 6, 25, 12, 19, 6, 25, 12, 19, 6, 25, 12, };
    vec_u8_t vfrac4_32 = (vec_u8_t){13, 26, 7, 20, 13, 26, 7, 20, 13, 26, 7, 20, 13, 26, 7, 20, };
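    /* Column fractions for angle -13, repeated for each of the four 4-pixel
       rows packed into one vector. */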

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
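    /* vec_mule/vec_mulo widen the even and odd byte lanes separately, so the
       mergeh/mergel + pack above restores the original lane order.  Three
       store paths follow: one full-vector store when the four 4-byte rows are
       contiguous (dstStride == 4), single-element stores when the stride is a
       multiple of 16, and a read-modify-write vec_perm merge for arbitrary
       strides so the bytes beyond each 4-pixel row are preserved. */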

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        /* row offsets follow dstStride: the guard only guarantees a multiple
           of 16, not dstStride == 16 */
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), dstStride, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), dstStride*2, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), dstStride*3, (unsigned int*)dst);
    }
    else{
        vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout, vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 14>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x3, 0x3, 0x2, 0x2, 0x1, 0x1, 0x1, 0x0, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x2, 0x1, };
vec_u8_t mask1={0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x2, 0x1, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x3, 0x2, };
vec_u8_t mask2={0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x3, 0x2, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, };
vec_u8_t mask3={0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, };
vec_u8_t mask4={0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, };
vec_u8_t mask5={0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, };
vec_u8_t mask6={0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, };
vec_u8_t mask7={0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, };
//vec_u8_t mask8={0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(17, srcPix0); 
    vec_u8_t refmask_8={0x7, 0x5, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
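    /* Merged reference: projected above-row samples srcPix0[7, 5, 2] (angle
       -13, invAngle 630), top-left srcPix0[0], then the eight left-column
       samples srcPix0[17..24]; the last four lanes are unused. */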

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);

    vec_u8_t vfrac8 = (vec_u8_t){19, 6, 25, 12, 31, 18, 5, 24, 19, 6, 25, 12, 31, 18, 5, 24, };
    vec_u8_t vfrac8_32 = (vec_u8_t){13, 26, 7, 20, 1, 14, 27, 8, 13, 26, 7, 20, 1, 14, 27, 8, };
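    /* Each output vector packs two 8-pixel rows, so the eight column
       fractions for angle -13 repeat in both halves, and srv(2k+1) doubles as
       the ref[x + 1] operand of srv(2k) in the one_line() calls below. */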

    one_line(srv0, srv1, vfrac8_32, vfrac8, vout_0);
    one_line(srv2, srv3, vfrac8_32, vfrac8, vout_1);
    one_line(srv4, srv5, vfrac8_32, vfrac8, vout_2);
    one_line(srv6, srv7, vfrac8_32, vfrac8, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
        vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 14>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x2, 0x1, 0x1, 0x0, 0x0, };
vec_u8_t mask1={0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, };
vec_u8_t mask2={0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, };
vec_u8_t mask3={0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, };
vec_u8_t mask4={0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, };
vec_u8_t mask5={0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, };
vec_u8_t mask6={0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, };
vec_u8_t mask7={0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, };
vec_u8_t mask8={0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, };
vec_u8_t mask9={0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, };
vec_u8_t mask10={0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, };
vec_u8_t mask11={0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, };
vec_u8_t mask12={0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, };
vec_u8_t mask13={0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, };
vec_u8_t mask14={0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, };
vec_u8_t mask15={0x15, 0x15, 0x14, 0x14, 0x13, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, };
vec_u8_t maskadd1_15={0x16, 0x16, 0x15, 0x15, 0x14, 0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(0, srcPix0); /* top-left + above-row samples, srcPix0[0..15] */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* left-column reference, srcPix0[2*16+1 ..] */
    vec_u8_t refmask_16={0xf, 0xc, 0xa, 0x7, 0x5, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(42, srcPix0);  
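    /* s0 = { projected above-row samples srcPix0[15, 12, 10, 7, 5, 2]
       (invAngle 630), top-left srcPix0[0], left-column samples srcPix0[33..41] };
       s1 resumes at srcPix0[42], keeping the reference contiguous. */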
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12 = vec_perm(s0, s1, mask12);
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = srv1; 
    vec_u8_t srv1_add1 = srv2;
    vec_u8_t srv2_add1 = srv3;
    vec_u8_t srv3_add1 = srv4;
    vec_u8_t srv4_add1 = srv5; 
    vec_u8_t srv5_add1 = srv6; 
    vec_u8_t srv6_add1 = srv7;
    vec_u8_t srv7_add1 = srv8; 
    vec_u8_t srv8_add1 = srv9;
    vec_u8_t srv9_add1 = srv10;
    vec_u8_t srv10_add1 = srv11;
    vec_u8_t srv11_add1 = srv12;
    vec_u8_t srv12_add1 = srv13;
    vec_u8_t srv13_add1 = srv14;
    vec_u8_t srv14_add1 = srv15; 
    vec_u8_t srv15_add1 = vec_perm(s0, s1, maskadd1_15);

vec_u8_t vfrac16 = (vec_u8_t){19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, };
vec_u8_t vfrac16_32 = (vec_u8_t){13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, };
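/* Column fractions for angle -13 (mode 14) and their 32-complements. */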

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32, vfrac16, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32, vfrac16, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32, vfrac16, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32, vfrac16, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32, vfrac16, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32, vfrac16, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 14>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, };
vec_u8_t mask1={0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, };
vec_u8_t mask2={0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, };
vec_u8_t mask3={0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, };
vec_u8_t mask4={0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, };
vec_u8_t mask5={0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, };
vec_u8_t mask6={0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, };
vec_u8_t mask7={0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, };
vec_u8_t mask8={0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, };
vec_u8_t mask9={0x15, 0x15, 0x14, 0x14, 0x13, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, };
vec_u8_t mask10={0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x2, 0x1, 0x1, 0x0, 0x0, };
vec_u8_t mask11={0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, };
vec_u8_t mask12={0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, };
vec_u8_t mask13={0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, };
vec_u8_t mask14={0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, };
vec_u8_t mask15={0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, };

vec_u8_t mask16_0={0x6, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x2, 0x1, 0x1, 0x0, 0x0, 0x0, };
vec_u8_t mask16_1={0x7, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, 0x1, };
vec_u8_t mask16_2={0x8, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x2, };
vec_u8_t mask16_3={0x9, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x3, };
vec_u8_t mask16_4={0xa, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x4, };
vec_u8_t mask16_5={0xb, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x5, };
vec_u8_t mask16_6={0xc, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x6, };
vec_u8_t mask16_7={0xd, 0xc, 0xc, 0xb, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x7, };
vec_u8_t mask16_8={0xe, 0xd, 0xd, 0xc, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x8, };
vec_u8_t mask16_9={0xf, 0xe, 0xe, 0xd, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x9, };
vec_u8_t mask16_10={0x10, 0xf, 0xf, 0xe, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0xa, };
vec_u8_t mask16_11={0x11, 0x10, 0x10, 0xf, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xb, };
vec_u8_t mask16_12={0x12, 0x11, 0x11, 0x10, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xc, };
vec_u8_t mask16_13={0x13, 0x12, 0x12, 0x11, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xd, };
vec_u8_t mask16_14={0x14, 0x13, 0x13, 0x12, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xe, };
vec_u8_t mask16_15={0x15, 0x14, 0x14, 0x13, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xf, };

vec_u8_t maskadd1_31={0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, };
vec_u8_t maskadd1_16_31={0x6, 0x5, 0x5, 0x4, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x2, 0x1, 0x1, 0x0, 0x0, 0x0, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    

    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
    vec_u8_t refmask_32_0={0x1e, 0x1b, 0x19, 0x16, 0x14, 0x11, 0xf, 0xc, 0xa, 0x7, 0x5, 0x2, 0x00, 0x0, 0x0, 0x0};
    vec_u8_t refmask_32_1={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x10, 0x11, 0x12};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );    
    vec_u8_t s1 = vec_xl(68, srcPix0);  
    vec_u8_t s2 = vec_xl(84, srcPix0);  
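    /* s0 = { projected above-row samples srcPix0[30, 27, 25, 22, 20, 17, 15,
       12, 10, 7, 5, 2] (invAngle 630), top-left srcPix0[0], left-column
       samples srcPix0[65..67] }; s1 and s2 continue the left column at
       srcPix0[68] and srcPix0[84]. */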

    vec_u8_t srv0 = vec_perm(s0, s0, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s0, mask1);
    vec_u8_t srv2 = vec_perm(s0, s0, mask2);
    vec_u8_t srv3 = vec_perm(s0, s0, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s1, s1, mask10);
    vec_u8_t srv11 = vec_perm(s1, s1, mask11);
    vec_u8_t srv12 = vec_perm(s1, s1, mask12);
    vec_u8_t srv13 = vec_perm(s1, s1, mask13);
    vec_u8_t srv14 = vec_perm(s1, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s1, s1, mask15);

    vec_u8_t srv16_0 = vec_perm(s0, s0, mask16_0); 
    vec_u8_t srv16_1 = vec_perm(s0, s0, mask16_1);
    vec_u8_t srv16_2 = vec_perm(s0, s0, mask16_2);
    vec_u8_t srv16_3 = vec_perm(s0, s0, mask16_3);
    vec_u8_t srv16_4 = vec_perm(s0, s0, mask16_4); 
    vec_u8_t srv16_5 = vec_perm(s0, s0, mask16_5);
    vec_u8_t srv16_6 = vec_perm(s0, s0, mask16_6); 
    vec_u8_t srv16_7 = vec_perm(s0, s0, mask16_7);
    vec_u8_t srv16_8 = vec_perm(s0, s0, mask16_8); 
    vec_u8_t srv16_9 = vec_perm(s0, s0, mask16_9);
    vec_u8_t srv16_10 = vec_perm(s0, s1, mask16_10);
    vec_u8_t srv16_11 = vec_perm(s0, s1, mask16_11);
    vec_u8_t srv16_12 = vec_perm(s0, s1, mask16_12);
    vec_u8_t srv16_13 = vec_perm(s0, s1, mask16_13);
    vec_u8_t srv16_14 = vec_perm(s0, s1, mask16_14); 
    vec_u8_t srv16_15 = vec_perm(s0, s1, mask16_15);

    vec_u8_t  srv16 = vec_perm(s1, s1, mask0);  
    vec_u8_t  srv17 = vec_perm(s1, s1, mask1);
    vec_u8_t  srv18 = vec_perm(s1, s1, mask2);
    vec_u8_t  srv19 = vec_perm(s1, s1, mask3);
    vec_u8_t  srv20 = vec_perm(s1, s2, mask4);
    vec_u8_t  srv21 = vec_perm(s1, s2, mask5);
    vec_u8_t  srv22 = vec_perm(s1, s2, mask6);
    vec_u8_t  srv23 = vec_perm(s1, s2, mask7);
    vec_u8_t  srv24 = vec_perm(s1, s2, mask8);
    vec_u8_t  srv25 = vec_perm(s1, s2, mask9);
    vec_u8_t  srv26 = vec_perm(s2, s2, mask10);
    vec_u8_t  srv27 = vec_perm(s2, s2, mask11);
    vec_u8_t  srv28 = vec_perm(s2, s2, mask12);
    vec_u8_t  srv29 = vec_perm(s2, s2, mask13);
    vec_u8_t  srv30 = vec_perm(s2, s2, mask14);
    vec_u8_t  srv31 = vec_perm(s2, s2, mask15);

    vec_u8_t  srv16_16 = vec_perm(s1, s1, mask16_0);  
    vec_u8_t  srv16_17 = vec_perm(s1, s1, mask16_1);
    vec_u8_t  srv16_18 = vec_perm(s1, s1, mask16_2);
    vec_u8_t  srv16_19 = vec_perm(s1, s1, mask16_3);
    vec_u8_t  srv16_20 = vec_perm(s1, s1, mask16_4);
    vec_u8_t  srv16_21 = vec_perm(s1, s1, mask16_5);
    vec_u8_t  srv16_22 = vec_perm(s1, s1, mask16_6);
    vec_u8_t  srv16_23 = vec_perm(s1, s1, mask16_7);
    vec_u8_t  srv16_24 = vec_perm(s1, s1, mask16_8);
    vec_u8_t  srv16_25 = vec_perm(s1, s1, mask16_9);
    vec_u8_t  srv16_26 = vec_perm(s1, s2, mask16_10);
    vec_u8_t  srv16_27 = vec_perm(s1, s2, mask16_11);
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask16_12);
    vec_u8_t  srv16_29 = vec_perm(s1, s2, mask16_13);
    vec_u8_t  srv16_30 = vec_perm(s1, s2, mask16_14);
    vec_u8_t  srv16_31 = vec_perm(s1, s2, mask16_15);
        
    vec_u8_t srv0add1 = srv1;
    vec_u8_t srv1add1 = srv2;
    vec_u8_t srv2add1 = srv3;
    vec_u8_t srv3add1 = srv4;
    vec_u8_t srv4add1 = srv5; 
    vec_u8_t srv5add1 = srv6; 
    vec_u8_t srv6add1 = srv7;
    vec_u8_t srv7add1 = srv8; 
    vec_u8_t srv8add1 = srv9;
    vec_u8_t srv9add1 = srv10;
    vec_u8_t srv10add1 = srv11;
    vec_u8_t srv11add1 = srv12;
    vec_u8_t srv12add1 = srv13;
    vec_u8_t srv13add1 = srv14;
    vec_u8_t srv14add1 = srv15; 
    vec_u8_t srv15add1 = srv16;

    vec_u8_t srv16add1_0 = srv16_1;
    vec_u8_t srv16add1_1 = srv16_2;
    vec_u8_t srv16add1_2 = srv16_3;
    vec_u8_t srv16add1_3 = srv16_4;
    vec_u8_t srv16add1_4 = srv16_5; 
    vec_u8_t srv16add1_5 = srv16_6;
    vec_u8_t srv16add1_6 = srv16_7; 
    vec_u8_t srv16add1_7 = srv16_8;
    vec_u8_t srv16add1_8 = srv16_9; 
    vec_u8_t srv16add1_9 = srv16_10;
    vec_u8_t srv16add1_10 = srv16_11;
    vec_u8_t srv16add1_11 = srv16_12;
    vec_u8_t srv16add1_12 = srv16_13;
    vec_u8_t srv16add1_13 = srv16_14;
    vec_u8_t srv16add1_14 = srv16_15; 
    vec_u8_t srv16add1_15 = srv16_16;

    vec_u8_t  srv16add1 =  srv17;  
    vec_u8_t  srv17add1 = srv18;
    vec_u8_t  srv18add1 = srv19;
    vec_u8_t  srv19add1 = srv20;
    vec_u8_t  srv20add1 = srv21;
    vec_u8_t  srv21add1 = srv22;
    vec_u8_t  srv22add1 = srv23;
    vec_u8_t  srv23add1 = srv24;
    vec_u8_t  srv24add1 = srv25;
    vec_u8_t  srv25add1 = srv26;
    vec_u8_t  srv26add1 = srv27;
    vec_u8_t  srv27add1 = srv28;
    vec_u8_t  srv28add1 = srv29;
    vec_u8_t  srv29add1 = srv30;
    vec_u8_t  srv30add1 = srv31;
    vec_u8_t  srv31add1 = vec_perm(s2, s2, maskadd1_31);

    vec_u8_t  srv16add1_16 = srv16_17;   
    vec_u8_t  srv16add1_17 = srv16_18;
    vec_u8_t  srv16add1_18 = srv16_19;
    vec_u8_t  srv16add1_19 = srv16_20;
    vec_u8_t  srv16add1_20 = srv16_21;
    vec_u8_t  srv16add1_21 = srv16_22;
    vec_u8_t  srv16add1_22 = srv16_23;
    vec_u8_t  srv16add1_23 = srv16_24;
    vec_u8_t  srv16add1_24 = srv16_25;
    vec_u8_t  srv16add1_25 = srv16_26;
    vec_u8_t  srv16add1_26 = srv16_27;
    vec_u8_t  srv16add1_27 = srv16_28;
    vec_u8_t  srv16add1_28 = srv16_29;
    vec_u8_t  srv16add1_29 = srv16_30;
    vec_u8_t  srv16add1_30 = srv16_31;
    vec_u8_t  srv16add1_31 = vec_perm(s2, s2, maskadd1_16_31);

vec_u8_t vfrac32_0 = (vec_u8_t){19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16, };
vec_u8_t vfrac32_1 = (vec_u8_t){3, 22, 9, 28, 15, 2, 21, 8, 27, 14, 1, 20, 7, 26, 13, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 32, };
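/* Angle -13 fractions split across the two 16-column halves; column 31 again
   needs no interpolation (0/32 pair in the last lane). */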

    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv1add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv2add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv3add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv4add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv5add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv6add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv7add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv8add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv9add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv10add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv11add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv12add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv13add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv14add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv15add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv17add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv18add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv19add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv20add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv21add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv22add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv23add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv24add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv25add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv26add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv27add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv28add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv29add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv30add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv31add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}



template<>
void intra_pred<4, 15>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x2, 0x1, 0x1, 0x0, 0x3, 0x2, 0x2, 0x1, 0x4, 0x3, 0x3, 0x2, 0x5, 0x4, 0x4, 0x3, };
    vec_u8_t mask1={0x3, 0x2, 0x2, 0x1, 0x4, 0x3, 0x3, 0x2, 0x5, 0x4, 0x4, 0x3, 0x6, 0x5, 0x5, 0x4, };
        
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x4, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);
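    /* Merged reference for mode 15 (angle -17, invAngle 482): projected
       above-row samples srcPix0[4, 2], top-left srcPix0[0], then the four
       left-column samples srcPix0[9..12]. */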
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){15, 30, 13, 28, 15, 30, 13, 28, 15, 30, 13, 28, 15, 30, 13, 28, };
    vec_u8_t vfrac4_32 = (vec_u8_t){17, 2, 19, 4, 17, 2, 19, 4, 17, 2, 19, 4, 17, 2, 19, 4, };

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
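
/* How the vfrac* constants above can be reproduced (a sketch under a
 * hypothetical guard, not part of the build): mode 15 has
 * intraPredAngle = -17 in the HEVC angle table, so the fraction for row y is
 * ((y + 1) * angle) & 31 and its companion weight is 32 - fraction.  The
 * byte order inside each constant then follows the lane layout the block
 * size needs for vec_mule/vec_mulo. */
#ifdef INTRA_ANGULAR_REF_SKETCH
static void build_frac_tables(int angle, int rows, uint8_t* frac, uint8_t* frac32)
{
    for (int y = 0; y < rows; y++)
    {
        frac[y] = (uint8_t)(((y + 1) * angle) & 31); /* 15, 30, 13, 28, ... for angle = -17 */
        frac32[y] = (uint8_t)(32 - frac[y]);         /* 17, 2, 19, 4, ... */
    }
}
#endif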

template<>
void intra_pred<8, 15>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x4, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, 0x0, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, };
vec_u8_t mask1={0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, };
vec_u8_t mask2={0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, };
vec_u8_t mask3={0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, };
vec_u8_t mask4={0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, };
vec_u8_t mask5={0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, };
vec_u8_t mask6={0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, };
vec_u8_t mask7={0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, };
//vec_u8_t mask8={0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(17, srcPix0); 
    vec_u8_t refmask_8={0x8, 0x6, 0x4, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);

    vec_u8_t vfrac8 = (vec_u8_t){15, 30, 13, 28, 11, 26, 9, 24, 15, 30, 13, 28, 11, 26, 9, 24, };
    vec_u8_t vfrac8_32 = (vec_u8_t){17, 2, 19, 4, 21, 6, 23, 8, 17, 2, 19, 4, 21, 6, 23, 8, };

    one_line(srv0, srv1, vfrac8_32, vfrac8, vout_0);
    one_line(srv2, srv3, vfrac8_32, vfrac8, vout_1);
    one_line(srv4, srv5, vfrac8_32, vfrac8, vout_2);
    one_line(srv6, srv7, vfrac8_32, vfrac8, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
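
/* The formula quoted in the comments above, as a plain scalar loop (a sketch
 * under a hypothetical guard, not x265's reference C path): off is the
 * integer step along the assembled reference and frac the 5-bit fractional
 * position.  The perm masks above fold the equivalent indexing (including
 * the transpose these horizontal modes need) into byte shuffles. */
#ifdef INTRA_ANGULAR_REF_SKETCH
static void intra_ang_scalar(pixel* dst, intptr_t dstStride, const pixel* refMain,
                             int width, int angle)
{
    for (int y = 0; y < width; y++)
    {
        int pos = (y + 1) * angle;
        int off = pos >> 5;  /* arithmetic shift == floor division by 32 */
        int frac = pos & 31;
        for (int x = 0; x < width; x++)
            dst[y * dstStride + x] = (pixel)(((32 - frac) * refMain[off + x]
                                    + frac * refMain[off + x + 1] + 16) >> 5);
    }
}
#endif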

template<>
void intra_pred<16, 15>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, 0x0, };
vec_u8_t mask1={0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, };
vec_u8_t mask2={0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, };
vec_u8_t mask3={0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, };
vec_u8_t mask4={0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, };
vec_u8_t mask5={0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, };
vec_u8_t mask6={0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, };
vec_u8_t mask7={0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, };
vec_u8_t mask8={0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, };
vec_u8_t mask9={0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, };
vec_u8_t mask10={0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, };
vec_u8_t mask11={0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, };
vec_u8_t mask12={0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, };
vec_u8_t mask13={0x15, 0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, };
vec_u8_t mask14={0x16, 0x15, 0x15, 0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, };
vec_u8_t mask15={0x17, 0x16, 0x16, 0x15, 0x15, 0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, };
vec_u8_t maskadd1_15={0x18, 0x17, 0x17, 0x16, 0x16, 0x15, 0x15, 0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(0, srcPix0); /* top-left corner + above-row samples, srcPix0[0..15] */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* left-column samples, starting at srcPix0[2*width+1] = srcPix0[33] */
    vec_u8_t refmask_16={0xf, 0xd, 0xb, 0x9, 0x8, 0x6, 0x4, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(40, srcPix0);  
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12 = vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = srv1; 
    vec_u8_t srv1_add1 = srv2;
    vec_u8_t srv2_add1 = srv3;
    vec_u8_t srv3_add1 = srv4;
    vec_u8_t srv4_add1 = srv5; 
    vec_u8_t srv5_add1 = srv6; 
    vec_u8_t srv6_add1 = srv7;
    vec_u8_t srv7_add1 = srv8; 
    vec_u8_t srv8_add1 = srv9;
    vec_u8_t srv9_add1 = srv10;
    vec_u8_t srv10_add1 = srv11;
    vec_u8_t srv11_add1 = srv12;
    vec_u8_t srv12_add1 = srv13; 
    vec_u8_t srv13_add1 = srv14;
    vec_u8_t srv14_add1 = srv15; 
    vec_u8_t srv15_add1 = vec_perm(s0, s1, maskadd1_15);

vec_u8_t vfrac16 = (vec_u8_t){15, 30, 13, 28, 11, 26, 9, 24, 7, 22, 5, 20, 3, 18, 1, 16, };
vec_u8_t vfrac16_32 = (vec_u8_t){17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, 16, };

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32, vfrac16, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32, vfrac16, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32, vfrac16, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32, vfrac16, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32, vfrac16, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32, vfrac16, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
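
/* What each one_line() invocation computes for a single 16-byte line, spelled
 * out with the same intrinsics the 4x15 specialization uses explicitly (a
 * sketch under a hypothetical guard): vec_mule/vec_mulo widen the even/odd
 * byte lanes to 16 bits, the rounded weighted sum is shifted right by 5, and
 * merge + pack restore the original byte order. */
#ifdef INTRA_ANGULAR_REF_SKETCH
static vec_u8_t one_line_ref(vec_u8_t srv, vec_u8_t srv_add1,
                             vec_u8_t vfrac32, vec_u8_t vfrac)
{
    const vec_u16_t c16 = vec_splats((unsigned short)16);
    const vec_u16_t c5 = vec_splats((unsigned short)5);
    vec_u16_t e = vec_sra(vec_add(vec_add(vec_mule(srv, vfrac32),
                                          vec_mule(srv_add1, vfrac)), c16), c5);
    vec_u16_t o = vec_sra(vec_add(vec_add(vec_mulo(srv, vfrac32),
                                          vec_mulo(srv_add1, vfrac)), c16), c5);
    return vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
}
#endif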

template<>
void intra_pred<32, 15>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, };
vec_u8_t mask1={0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, };
vec_u8_t mask2={0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, };
vec_u8_t mask3={0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, };
vec_u8_t mask4={0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, };
vec_u8_t mask5={0x15, 0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, };
vec_u8_t mask6={0x16, 0x15, 0x15, 0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, };
vec_u8_t mask7={0x17, 0x16, 0x16, 0x15, 0x15, 0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, };
vec_u8_t mask8={0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, 0x0, };
vec_u8_t mask9={0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, };
vec_u8_t mask10={0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, };
vec_u8_t mask11={0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, };
vec_u8_t mask12={0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, };
vec_u8_t mask13={0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, };
vec_u8_t mask14={0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, };
vec_u8_t mask15={0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, };

vec_u8_t mask16_0={0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, 0x0, 0x0, };
vec_u8_t mask16_1={0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, };
vec_u8_t mask16_2={0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, };
vec_u8_t mask16_3={0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, };
vec_u8_t mask16_4={0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, };
vec_u8_t mask16_5={0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, 0x5, 0x5, };
vec_u8_t mask16_6={0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, 0x6, 0x6, };
vec_u8_t mask16_7={0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, 0x7, 0x7, };
vec_u8_t mask16_8={0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, 0x8, };
vec_u8_t mask16_9={0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, };
vec_u8_t mask16_10={0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, };
vec_u8_t mask16_11={0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, };
vec_u8_t mask16_12={0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, };
vec_u8_t mask16_13={0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, };
vec_u8_t mask16_14={0x15, 0x15, 0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, 0xe, 0xe, };
vec_u8_t mask16_15={0x16, 0x16, 0x15, 0x15, 0x14, 0x14, 0x13, 0x13, 0x12, 0x12, 0x11, 0x11, 0x10, 0x10, 0xf, 0xf, };

vec_u8_t maskadd1_31={0x10, 0xf, 0xf, 0xe, 0xe, 0xd, 0xd, 0xc, 0xc, 0xb, 0xb, 0xa, 0xa, 0x9, 0x9, 0x8, };
vec_u8_t maskadd1_16_31={0x7, 0x7, 0x6, 0x6, 0x5, 0x5, 0x4, 0x4, 0x3, 0x3, 0x2, 0x2, 0x1, 0x1, 0x0, 0x0, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    

    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
    vec_u8_t refmask_32_0={0x1e, 0x1c, 0x1a, 0x18, 0x17, 0x15, 0x13, 0x11, 0xf, 0xd, 0xb, 0x9, 0x8, 0x6, 0x4, 0x2};
    vec_u8_t refmask_32_1={0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(80, srcPix0);  
    vec_u8_t s3 = vec_xl(96, srcPix0);  

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s1, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s1, s1, mask9);
    vec_u8_t srv10 = vec_perm(s1, s1, mask10);
    vec_u8_t srv11 = vec_perm(s1, s1, mask11);
    vec_u8_t srv12 = vec_perm(s1, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s1, s1, mask13);
    vec_u8_t srv14 = vec_perm(s1, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s1, s1, mask15);

    vec_u8_t srv16_0 = vec_perm(s0, s0, mask16_0); 
    vec_u8_t srv16_1 = vec_perm(s0, s0, mask16_1);
    vec_u8_t srv16_2 = vec_perm(s0, s0, mask16_2);
    vec_u8_t srv16_3 = vec_perm(s0, s0, mask16_3);
    vec_u8_t srv16_4 = vec_perm(s0, s0, mask16_4); 
    vec_u8_t srv16_5 = vec_perm(s0, s0, mask16_5);
    vec_u8_t srv16_6 = vec_perm(s0, s0, mask16_6); 
    vec_u8_t srv16_7 = vec_perm(s0, s0, mask16_7);
    vec_u8_t srv16_8 = vec_perm(s0, s0, mask16_8); 
    vec_u8_t srv16_9 = vec_perm(s0, s1, mask16_9);
    vec_u8_t srv16_10 = vec_perm(s0, s1, mask16_10);
    vec_u8_t srv16_11 = vec_perm(s0, s1, mask16_11);
    vec_u8_t srv16_12 = vec_perm(s0, s1, mask16_12); 
    vec_u8_t srv16_13 = vec_perm(s0, s1, mask16_13);
    vec_u8_t srv16_14 = vec_perm(s0, s1, mask16_14); 
    vec_u8_t srv16_15 = vec_perm(s0, s1, mask16_15);

    vec_u8_t  srv16 = vec_perm(s1, s2, mask0);  
    vec_u8_t  srv17 = vec_perm(s1, s2, mask1);
    vec_u8_t  srv18 = vec_perm(s1, s2, mask2);
    vec_u8_t  srv19 = vec_perm(s1, s2, mask3);
    vec_u8_t  srv20 = vec_perm(s1, s2, mask4);
    vec_u8_t  srv21 = vec_perm(s1, s2, mask5);
    vec_u8_t  srv22 = vec_perm(s1, s2, mask6);
    vec_u8_t  srv23 = vec_perm(s1, s2, mask7);
    vec_u8_t  srv24 = vec_perm(s2, s2, mask8);
    vec_u8_t  srv25 = vec_perm(s2, s2, mask9);
    vec_u8_t  srv26 = vec_perm(s2, s2, mask10);
    vec_u8_t  srv27 = vec_perm(s2, s2, mask11);
    vec_u8_t  srv28 = vec_perm(s2, s2, mask12);
    vec_u8_t  srv29 = vec_perm(s2, s2, mask13);
    vec_u8_t  srv30 = vec_perm(s2, s2, mask14);
    vec_u8_t  srv31 = vec_perm(s2, s2, mask15);

    vec_u8_t  srv16_16 = vec_perm(s1, s1, mask16_0);  
    vec_u8_t  srv16_17 = vec_perm(s1, s1, mask16_1);
    vec_u8_t  srv16_18 = vec_perm(s1, s1, mask16_2);
    vec_u8_t  srv16_19 = vec_perm(s1, s1, mask16_3);
    vec_u8_t  srv16_20 = vec_perm(s1, s1, mask16_4);
    vec_u8_t  srv16_21 = vec_perm(s1, s1, mask16_5);
    vec_u8_t  srv16_22 = vec_perm(s1, s1, mask16_6);
    vec_u8_t  srv16_23 = vec_perm(s1, s1, mask16_7);
    vec_u8_t  srv16_24 = vec_perm(s1, s1, mask16_8);
    vec_u8_t  srv16_25 = vec_perm(s1, s2, mask16_9);
    vec_u8_t  srv16_26 = vec_perm(s1, s2, mask16_10);
    vec_u8_t  srv16_27 = vec_perm(s1, s2, mask16_11);
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask16_12);
    vec_u8_t  srv16_29 = vec_perm(s1, s2, mask16_13);
    vec_u8_t  srv16_30 = vec_perm(s1, s2, mask16_14);
    vec_u8_t  srv16_31 = vec_perm(s1, s2, mask16_15);
        
    vec_u8_t srv0add1 = srv1;
    vec_u8_t srv1add1 = srv2;
    vec_u8_t srv2add1 = srv3;
    vec_u8_t srv3add1 = srv4;
    vec_u8_t srv4add1 = srv5; 
    vec_u8_t srv5add1 = srv6; 
    vec_u8_t srv6add1 = srv7;
    vec_u8_t srv7add1 = srv8; 
    vec_u8_t srv8add1 = srv9;
    vec_u8_t srv9add1 = srv10;
    vec_u8_t srv10add1 = srv11;
    vec_u8_t srv11add1 = srv12;
    vec_u8_t srv12add1 = srv13; 
    vec_u8_t srv13add1 = srv14;
    vec_u8_t srv14add1 = srv15; 
    vec_u8_t srv15add1 = srv16;

    vec_u8_t srv16add1_0 = srv16_1;
    vec_u8_t srv16add1_1 = srv16_2;
    vec_u8_t srv16add1_2 = srv16_3;
    vec_u8_t srv16add1_3 = srv16_4;
    vec_u8_t srv16add1_4 = srv16_5; 
    vec_u8_t srv16add1_5 = srv16_6;
    vec_u8_t srv16add1_6 = srv16_7; 
    vec_u8_t srv16add1_7 = srv16_8;
    vec_u8_t srv16add1_8 = srv16_9; 
    vec_u8_t srv16add1_9 = srv16_10;
    vec_u8_t srv16add1_10 = srv16_11;
    vec_u8_t srv16add1_11 = srv16_12;
    vec_u8_t srv16add1_12 = srv16_13; 
    vec_u8_t srv16add1_13 = srv16_14;
    vec_u8_t srv16add1_14 = srv16_15; 
    vec_u8_t srv16add1_15 = srv16_16;

    vec_u8_t  srv16add1 =  srv17;  
    vec_u8_t  srv17add1 = srv18;
    vec_u8_t  srv18add1 = srv19;
    vec_u8_t  srv19add1 = srv20;
    vec_u8_t  srv20add1 = srv21;
    vec_u8_t  srv21add1 = srv22;
    vec_u8_t  srv22add1 = srv23;
    vec_u8_t  srv23add1 = srv24;
    vec_u8_t  srv24add1 = srv25;
    vec_u8_t  srv25add1 = srv26;
    vec_u8_t  srv26add1 = srv27;
    vec_u8_t  srv27add1 = srv28;
    vec_u8_t  srv28add1 = srv29;
    vec_u8_t  srv29add1 = srv30;
    vec_u8_t  srv30add1 = srv31;
    vec_u8_t  srv31add1 = vec_perm(s2, s3, maskadd1_31);

    vec_u8_t  srv16add1_16 = srv16_17;   
    vec_u8_t  srv16add1_17 = srv16_18;
    vec_u8_t  srv16add1_18 = srv16_19;
    vec_u8_t  srv16add1_19 = srv16_20;
    vec_u8_t  srv16add1_20 = srv16_21;
    vec_u8_t  srv16add1_21 = srv16_22;
    vec_u8_t  srv16add1_22 = srv16_23;
    vec_u8_t  srv16add1_23 = srv16_24;
    vec_u8_t  srv16add1_24 = srv16_25;
    vec_u8_t  srv16add1_25 = srv16_26;
    vec_u8_t  srv16add1_26 = srv16_27;
    vec_u8_t  srv16add1_27 = srv16_28;
    vec_u8_t  srv16add1_28 = srv16_29;
    vec_u8_t  srv16add1_29 = srv16_30;
    vec_u8_t  srv16add1_30 = srv16_31;
    vec_u8_t  srv16add1_31 = vec_perm(s2, s2, maskadd1_16_31);

vec_u8_t vfrac32_0 = (vec_u8_t){15, 30, 13, 28, 11, 26, 9, 24, 7, 22, 5, 20, 3, 18, 1, 16, };
vec_u8_t vfrac32_1 = (vec_u8_t){31, 14, 29, 12, 27, 10, 25, 8, 23, 6, 21, 4, 19, 2, 17, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, 16, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15, 32, };

    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv1add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv2add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv3add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv4add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv5add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv6add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv7add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv8add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv9add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv10add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv11add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv12add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv13add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv14add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv15add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

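    /* Rows 16-31 below reuse vout_0..vout_31, so only one batch of results
     * stays live between the two groups of stores. */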
    one_line(srv16, srv16add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv17add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv18add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv19add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv20add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv21add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv22add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv23add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv24add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv25add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv26add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv27add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv28add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv29add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv30add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv31add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
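
/* How the refmask_32_* gathers above are derived (a sketch under a
 * hypothetical guard): with a negative prediction angle the reference array
 * extends below ref[0], and HEVC fills those entries by projecting the side
 * reference through the inverse angle (invAngle = -482 for mode 15).  The
 * byte indices baked into the refmask constants are these projected
 * positions. */
#ifdef INTRA_ANGULAR_REF_SKETCH
static void project_neg_refs(pixel* refMain, const pixel* refSide,
                             int nbProjected, int invAngle)
{
    for (int x = -nbProjected; x <= -1; x++)
        refMain[x] = refSide[(x * invAngle + 128) >> 8]; /* x = -1 -> refSide[2] */
}
#endif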


template<>
void intra_pred<4, 16>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x2, 0x1, 0x1, 0x0, 0x3, 0x2, 0x2, 0x1, 0x4, 0x3, 0x3, 0x2, 0x5, 0x4, 0x4, 0x3, };
    vec_u8_t mask1={0x3, 0x2, 0x2, 0x1, 0x4, 0x3, 0x3, 0x2, 0x5, 0x4, 0x4, 0x3, 0x6, 0x5, 0x5, 0x4, };
        
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x3, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){11, 22, 1, 12, 11, 22, 1, 12, 11, 22, 1, 12, 11, 22, 1, 12, };
    vec_u8_t vfrac4_32 = (vec_u8_t){21, 10, 31, 20, 21, 10, 31, 20, 21, 10, 31, 20, 21, 10, 31, 20, };

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
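
/* The fallback branch above never writes past the narrow row: it reloads the
 * destination, splices the computed bytes in front of the destination's own
 * trailing bytes with vec_perm, and stores a full vector back.  A sketch of
 * that read-modify-write idiom (hypothetical guard and helper name): */
#ifdef INTRA_ANGULAR_REF_SKETCH
static void store_row_rmw(pixel* row, vec_u8_t computed, vec_u8_t splice_mask)
{
    /* splice_mask keeps the computed bytes first and selects bytes
     * 0x14..0x1f from the reloaded destination, like the v_mask* constants */
    vec_u8_t merged = vec_perm(computed, vec_xl(0, row), splice_mask);
    vec_xst(merged, 0, row);
}
#endif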

template<>
void intra_pred<8, 16>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x5, 0x4, 0x4, 0x3, 0x2, 0x2, 0x1, 0x0, 0x6, 0x5, 0x5, 0x4, 0x3, 0x3, 0x2, 0x1, };
vec_u8_t mask1={0x6, 0x5, 0x5, 0x4, 0x3, 0x3, 0x2, 0x1, 0x7, 0x6, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, };
vec_u8_t mask2={0x7, 0x6, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, };
vec_u8_t mask3={0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, };
vec_u8_t mask4={0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, };
vec_u8_t mask5={0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, };
vec_u8_t mask6={0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, };
vec_u8_t mask7={0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, };
//vec_u8_t mask8={0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;


    vec_u8_t srv_left=vec_xl(0, srcPix0); /* top-left corner + above-row samples, srcPix0[0..15] */
    vec_u8_t srv_right=vec_xl(17, srcPix0); /* left-column samples, starting at srcPix0[2*width+1] = srcPix0[17] */
    vec_u8_t refmask_8={0x8, 0x6, 0x5, 0x3, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);

    vec_u8_t vfrac8 = (vec_u8_t){11, 22, 1, 12, 23, 2, 13, 24, 11, 22, 1, 12, 23, 2, 13, 24, };
    vec_u8_t vfrac8_32 = (vec_u8_t){21, 10, 31, 20, 9, 30, 19, 8, 21, 10, 31, 20, 9, 30, 19, 8, };

    one_line(srv0, srv1, vfrac8_32, vfrac8, vout_0);
    one_line(srv2, srv3, vfrac8_32, vfrac8, vout_1);
    one_line(srv4, srv5, vfrac8_32, vfrac8, vout_2);
    one_line(srv6, srv7, vfrac8_32, vfrac8, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 16>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x3, 0x2, 0x1, 0x1, 0x0, };
vec_u8_t mask1={0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, 0x2, 0x1, };
vec_u8_t mask2={0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x3, 0x2, };
vec_u8_t mask3={0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0x4, 0x3, };
vec_u8_t mask4={0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, };
vec_u8_t mask5={0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, };
vec_u8_t mask6={0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, };
vec_u8_t mask7={0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, };
vec_u8_t mask8={0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, };
vec_u8_t mask9={0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, };
vec_u8_t mask10={0x14, 0x13, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, };
vec_u8_t mask11={0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, };
vec_u8_t mask12={0x16, 0x15, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, };
vec_u8_t mask13={0x17, 0x16, 0x16, 0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, };
vec_u8_t mask14={0x18, 0x17, 0x17, 0x16, 0x15, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, };
vec_u8_t mask15={0x19, 0x18, 0x18, 0x17, 0x16, 0x16, 0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, };
vec_u8_t maskadd1_15={0x1a, 0x19, 0x19, 0x18, 0x17, 0x17, 0x16, 0x15, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x11, 0x10, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(0, srcPix0); /* top-left corner + above-row samples, srcPix0[0..15] */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* left-column samples, starting at srcPix0[2*width+1] = srcPix0[33] */
    vec_u8_t refmask_16={0xf, 0xe, 0xc, 0xb, 0x9, 0x8, 0x6, 0x5, 0x3, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(38, srcPix0);
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12 = vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = srv1; 
    vec_u8_t srv1_add1 = srv2;
    vec_u8_t srv2_add1 = srv3;
    vec_u8_t srv3_add1 = srv4;
    vec_u8_t srv4_add1 = srv5; 
    vec_u8_t srv5_add1 = srv6; 
    vec_u8_t srv6_add1 = srv7;
    vec_u8_t srv7_add1 = srv8; 
    vec_u8_t srv8_add1 = srv9;
    vec_u8_t srv9_add1 = srv10;
    vec_u8_t srv10_add1 = srv11;
    vec_u8_t srv11_add1 = srv12;
    vec_u8_t srv12_add1 = srv13; 
    vec_u8_t srv13_add1 = srv14;
    vec_u8_t srv14_add1 = srv15; 
    vec_u8_t srv15_add1 = vec_perm(s0, s1, maskadd1_15);

    vec_u8_t vfrac16 = (vec_u8_t){11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, };
    vec_u8_t vfrac16_32 = (vec_u8_t){21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, };

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32, vfrac16, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32, vfrac16, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32, vfrac16, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32, vfrac16, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32, vfrac16, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32, vfrac16, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 16>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x14, 0x13, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, };
vec_u8_t mask1={0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, };
vec_u8_t mask2={0x16, 0x15, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, };
vec_u8_t mask3={0x17, 0x16, 0x16, 0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, };
vec_u8_t mask4={0x18, 0x17, 0x17, 0x16, 0x15, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, };
vec_u8_t mask5={0x19, 0x18, 0x18, 0x17, 0x16, 0x16, 0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, };
vec_u8_t mask6={0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x3, 0x2, 0x1, 0x1, 0x0, };
vec_u8_t mask7={0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, 0x2, 0x1, };
vec_u8_t mask8={0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x3, 0x2, };
vec_u8_t mask9={0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0x4, 0x3, };
vec_u8_t mask10={0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, };
vec_u8_t mask11={0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, };
vec_u8_t mask12={0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, };
vec_u8_t mask13={0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, };
vec_u8_t mask14={0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, };
vec_u8_t mask15={0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, };

vec_u8_t mask16_0={0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x3, 0x2, 0x1, 0x1, 0x0, 0x0, };
vec_u8_t mask16_1={0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, 0x2, 0x1, 0x1, };
vec_u8_t mask16_2={0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x3, 0x2, 0x2, };
vec_u8_t mask16_3={0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0x4, 0x3, 0x3, };
vec_u8_t mask16_4={0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, 0x4, };
vec_u8_t mask16_5={0xe, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x6, 0x5, 0x5, };
vec_u8_t mask16_6={0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x6, };
vec_u8_t mask16_7={0x10, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x8, 0x7, 0x7, };
vec_u8_t mask16_8={0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x9, 0x8, 0x8, };
vec_u8_t mask16_9={0x12, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0xa, 0x9, 0x9, };
vec_u8_t mask16_10={0x13, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, 0xa, };
vec_u8_t mask16_11={0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xc, 0xb, 0xb, };
vec_u8_t mask16_12={0x15, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xc, };
vec_u8_t mask16_13={0x16, 0x16, 0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xe, 0xd, 0xd, };
vec_u8_t mask16_14={0x17, 0x17, 0x16, 0x15, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xe, };
vec_u8_t mask16_15={0x18, 0x18, 0x17, 0x16, 0x16, 0x15, 0x14, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0x10, 0xf, 0xf, };

vec_u8_t maskadd1_31={0x14, 0x13, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xb, 0xa, };
vec_u8_t maskadd1_16_31={0x9, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x3, 0x2, 0x1, 0x1, 0x0, 0x0, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
    vec_u8_t refmask_32_0={0x1e, 0x1d, 0x1b, 0x1a, 0x18, 0x17, 0x15, 0x14, 0x12, 0x11, 0xf, 0xe, 0xc, 0xb, 0x9, 0x8};
    vec_u8_t refmask_32_1={0x6, 0x5, 0x3, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(76, srcPix0);  
    vec_u8_t s3 = vec_xl(92, srcPix0);  
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = vec_perm(s1, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s1, s1, mask7);
    vec_u8_t srv8 = vec_perm(s1, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s1, s1, mask9);
    vec_u8_t srv10 = vec_perm(s1, s1, mask10);
    vec_u8_t srv11 = vec_perm(s1, s1, mask11);
    vec_u8_t srv12 = vec_perm(s1, s2, mask12); 
    vec_u8_t srv13 = vec_perm(s1, s2, mask13);
    vec_u8_t srv14 = vec_perm(s1, s2, mask14); 
    vec_u8_t srv15 = vec_perm(s1, s2, mask15);

    vec_u8_t srv16_0 = vec_perm(s0, s1, mask16_0); 
    vec_u8_t srv16_1 = vec_perm(s0, s1, mask16_1);
    vec_u8_t srv16_2 = vec_perm(s0, s1, mask16_2);
    vec_u8_t srv16_3 = vec_perm(s0, s1, mask16_3);
    vec_u8_t srv16_4 = vec_perm(s0, s1, mask16_4); 
    vec_u8_t srv16_5 = vec_perm(s0, s1, mask16_5);
    vec_u8_t srv16_6 = vec_perm(s0, s1, mask16_6); 
    vec_u8_t srv16_7 = vec_perm(s0, s1, mask16_7);
    vec_u8_t srv16_8 = vec_perm(s0, s1, mask16_8); 
    vec_u8_t srv16_9 = vec_perm(s0, s1, mask16_9);
    vec_u8_t srv16_10 = vec_perm(s0, s1, mask16_10);
    vec_u8_t srv16_11 = vec_perm(s0, s1, mask16_11);
    vec_u8_t srv16_12 = vec_perm(s0, s1, mask16_12); 
    vec_u8_t srv16_13 = vec_perm(s0, s1, mask16_13);
    vec_u8_t srv16_14 = vec_perm(s0, s1, mask16_14); 
    vec_u8_t srv16_15 = vec_perm(s0, s1, mask16_15);

    vec_u8_t  srv16 = vec_perm(s1, s2, mask0);  
    vec_u8_t  srv17 = vec_perm(s1, s2, mask1);
    vec_u8_t  srv18 = vec_perm(s1, s2, mask2);
    vec_u8_t  srv19 = vec_perm(s1, s2, mask3);
    vec_u8_t  srv20 = vec_perm(s1, s2, mask4);
    vec_u8_t  srv21 = vec_perm(s1, s2, mask5);
    vec_u8_t  srv22 = vec_perm(s2, s2, mask6);
    vec_u8_t  srv23 = vec_perm(s2, s2, mask7);
    vec_u8_t  srv24 = vec_perm(s2, s2, mask8);
    vec_u8_t  srv25 = vec_perm(s2, s2, mask9);
    vec_u8_t  srv26 = vec_perm(s2, s2, mask10);
    vec_u8_t  srv27 = vec_perm(s2, s2, mask11);
    vec_u8_t  srv28 = vec_perm(s2, s3, mask12);
    vec_u8_t  srv29 = vec_perm(s2, s3, mask13);
    vec_u8_t  srv30 = vec_perm(s2, s3, mask14);
    vec_u8_t  srv31 = vec_perm(s2, s3, mask15);

    vec_u8_t  srv16_16 = vec_perm(s1, s2, mask16_0);  
    vec_u8_t  srv16_17 = vec_perm(s1, s2, mask16_1);
    vec_u8_t  srv16_18 = vec_perm(s1, s2, mask16_2);
    vec_u8_t  srv16_19 = vec_perm(s1, s2, mask16_3);
    vec_u8_t  srv16_20 = vec_perm(s1, s2, mask16_4);
    vec_u8_t  srv16_21 = vec_perm(s1, s2, mask16_5);
    vec_u8_t  srv16_22 = vec_perm(s1, s2, mask16_6);
    vec_u8_t  srv16_23 = vec_perm(s1, s2, mask16_7);
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask16_8);
    vec_u8_t  srv16_25 = vec_perm(s1, s2, mask16_9);
    vec_u8_t  srv16_26 = vec_perm(s1, s2, mask16_10);
    vec_u8_t  srv16_27 = vec_perm(s1, s2, mask16_11);
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask16_12);
    vec_u8_t  srv16_29 = vec_perm(s1, s2, mask16_13);
    vec_u8_t  srv16_30 = vec_perm(s1, s2, mask16_14);
    vec_u8_t  srv16_31 = vec_perm(s1, s2, mask16_15);
        
    vec_u8_t srv0add1 = srv1;
    vec_u8_t srv1add1 = srv2;
    vec_u8_t srv2add1 = srv3;
    vec_u8_t srv3add1 = srv4;
    vec_u8_t srv4add1 = srv5; 
    vec_u8_t srv5add1 = srv6; 
    vec_u8_t srv6add1 = srv7;
    vec_u8_t srv7add1 = srv8; 
    vec_u8_t srv8add1 = srv9;
    vec_u8_t srv9add1 = srv10;
    vec_u8_t srv10add1 = srv11;
    vec_u8_t srv11add1 = srv12;
    vec_u8_t srv12add1 = srv13; 
    vec_u8_t srv13add1 = srv14;
    vec_u8_t srv14add1 = srv15; 
    vec_u8_t srv15add1 = srv16;

    vec_u8_t srv16add1_0 = srv16_1;
    vec_u8_t srv16add1_1 = srv16_2;
    vec_u8_t srv16add1_2 = srv16_3;
    vec_u8_t srv16add1_3 = srv16_4;
    vec_u8_t srv16add1_4 = srv16_5; 
    vec_u8_t srv16add1_5 = srv16_6;
    vec_u8_t srv16add1_6 = srv16_7; 
    vec_u8_t srv16add1_7 = srv16_8;
    vec_u8_t srv16add1_8 = srv16_9; 
    vec_u8_t srv16add1_9 = srv16_10;
    vec_u8_t srv16add1_10 = srv16_11;
    vec_u8_t srv16add1_11 = srv16_12;
    vec_u8_t srv16add1_12 = srv16_13; 
    vec_u8_t srv16add1_13 = srv16_14;
    vec_u8_t srv16add1_14 = srv16_15; 
    vec_u8_t srv16add1_15 = srv16_16;

    vec_u8_t  srv16add1 =  srv17;  
    vec_u8_t  srv17add1 = srv18;
    vec_u8_t  srv18add1 = srv19;
    vec_u8_t  srv19add1 = srv20;
    vec_u8_t  srv20add1 = srv21;
    vec_u8_t  srv21add1 = srv22;
    vec_u8_t  srv22add1 = srv23;
    vec_u8_t  srv23add1 = srv24;
    vec_u8_t  srv24add1 = srv25;
    vec_u8_t  srv25add1 = srv26;
    vec_u8_t  srv26add1 = srv27;
    vec_u8_t  srv27add1 = srv28;
    vec_u8_t  srv28add1 = srv29;
    vec_u8_t  srv29add1 = srv30;
    vec_u8_t  srv30add1 = srv31;
    vec_u8_t  srv31add1 = vec_perm(s2, s3, maskadd1_31);

    vec_u8_t  srv16add1_16 = srv16_17;   
    vec_u8_t  srv16add1_17 = srv16_18;
    vec_u8_t  srv16add1_18 = srv16_19;
    vec_u8_t  srv16add1_19 = srv16_20;
    vec_u8_t  srv16add1_20 = srv16_21;
    vec_u8_t  srv16add1_21 = srv16_22;
    vec_u8_t  srv16add1_22 = srv16_23;
    vec_u8_t  srv16add1_23 = srv16_24;
    vec_u8_t  srv16add1_24 = srv16_25;
    vec_u8_t  srv16add1_25 = srv16_26;
    vec_u8_t  srv16add1_26 = srv16_27;
    vec_u8_t  srv16add1_27 = srv16_28;
    vec_u8_t  srv16add1_28 = srv16_29;
    vec_u8_t  srv16add1_29 = srv16_30;
    vec_u8_t  srv16add1_30 = srv16_31;
    vec_u8_t  srv16add1_31 = vec_perm(s2, s2, maskadd1_16_31);

vec_u8_t vfrac32_0 = (vec_u8_t){11, 22, 1, 12, 23, 2, 13, 24, 3, 14, 25, 4, 15, 26, 5, 16, };
vec_u8_t vfrac32_1 = (vec_u8_t){27, 6, 17, 28, 7, 18, 29, 8, 19, 30, 9, 20, 31, 10, 21, 0, };
vec_u8_t vfrac32_32_0 = (vec_u8_t){21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, };
vec_u8_t vfrac32_32_1 = (vec_u8_t){5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 32, };

    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv1add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv2add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv3add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv4add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv5add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv6add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv7add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv8add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv9add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv10add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv11add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv12add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv13add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv14add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv15add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv17add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv18add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv19add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv20add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv21add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv22add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv23add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv24add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv25add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv26add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv27add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv28add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv29add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv30add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv31add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
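
/* A minimal sketch (not the shipped code) of the store pattern used by the
 * 32-wide specializations in this file: each 32-pixel row is emitted as two
 * 16-byte vectors at byte offsets dstStride*y and dstStride*y + 16, and the
 * vout_* registers are then reused for rows 16..31.  Kept under #if 0 so the
 * file's behavior is unchanged; assumes 8-bit pixels, as in the vector path.
 */
#if 0
static inline void store_row32_sketch(pixel* dst, intptr_t dstStride, int y,
                                      vec_u8_t lo, vec_u8_t hi)
{
    vec_xst(lo, dstStride * y, dst);       /* columns 0..15 of row y  */
    vec_xst(hi, dstStride * y + 16, dst);  /* columns 16..31 of row y */
}
#endif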

template<>
void intra_pred<4, 17>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    //vec_u8_t mask0={0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, };
    //vec_u8_t mask1={0x4, 0x5, 0x6, 0x7, 0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, };
    vec_u8_t mask0={0x3, 0x2, 0x1, 0x0, 0x4, 0x3, 0x2, 0x1, 0x5, 0x4, 0x3, 0x2, 0x6, 0x5, 0x4, 0x3};
    vec_u8_t mask1={0x4, 0x3, 0x2, 0x1, 0x5, 0x4, 0x3, 0x2, 0x6, 0x5, 0x4, 0x3, 0x7, 0x6, 0x5, 0x4};
        
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x4, 0x2, 0x1, 0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    //vec_u8_t vfrac4 = (vec_u8_t){6, 6, 6, 6, 12, 12, 12, 12, 18, 18, 18, 18, 24, 24, 24, 24};
    //vec_u8_t vfrac4_32 = (vec_u8_t){26, 26, 26, 26, 20, 20, 20, 20, 14, 14, 14, 14, 8, 8, 8, 8};
    vec_u8_t vfrac4 = (vec_u8_t){6, 12, 18, 24, 6, 12, 18, 24, 6, 12, 18, 24, 6, 12, 18, 24, };
    vec_u8_t vfrac4_32 = (vec_u8_t){26, 20, 14, 8, 26, 20, 14, 8, 26, 20, 14, 8, 26, 20, 14, 8, };


    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
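
    /* Lane mechanics of the sequence above: vec_mule/vec_mulo widen the
     * even/odd byte lanes to 16-bit products, the +16 and >> 5 perform the
     * rounded blend, and vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo))
     * re-interleaves the even/odd results back into the original lane
     * order. */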

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
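
/* End-to-end scalar sketch of the 4x4 mode-17 path above (illustrative only,
 * derived from mask0/mask1 and vfrac4): `s` stands for the 8-byte reference
 * assembled by refmask_4 (projected-left samples followed by the top row).
 * Note the fraction varies with x, not y -- horizontal modes are computed
 * transposed.  Kept under #if 0; the name is hypothetical.
 */
#if 0
static void intra_pred_4x4_m17_sketch(pixel* dst, intptr_t dstStride,
                                      const pixel* s /* assembled reference */)
{
    static const int f[4] = { 6, 12, 18, 24 };  /* vfrac4; 32 - f = vfrac4_32 */
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            dst[y * dstStride + x] = (pixel)(((32 - f[x]) * s[3 + y - x]
                                      + f[x] * s[4 + y - x] + 16) >> 5);
}
#endif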

template<>
void intra_pred<8, 17>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x6, 0x5, 0x4, 0x3, 0x2, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 0x3, 0x3, 0x2, 0x1, };
    vec_u8_t mask1={0x7, 0x6, 0x5, 0x4, 0x3, 0x3, 0x2, 0x1, 0x8, 0x7, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, };
    vec_u8_t mask2={0x8, 0x7, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, 0x9, 0x8, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, };
    vec_u8_t mask3={0x9, 0x8, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0xa, 0x9, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, };
    vec_u8_t mask4={0xa, 0x9, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0xb, 0xa, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, };
    vec_u8_t mask5={0xb, 0xa, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0xc, 0xb, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, };
    vec_u8_t mask6={0xc, 0xb, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0xd, 0xc, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, };
    vec_u8_t mask7={0xd, 0xc, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0xe, 0xd, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, };
    //vec_u8_t mask8={0xe, 0xd, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0xf, 0xe, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

    vec_u8_t srv_left=vec_xl(0, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t srv_right=vec_xl(17, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t refmask_8={0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00};

    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);

    vec_u8_t vfrac8 = (vec_u8_t){6, 12, 18, 24, 30, 4, 10, 16, 6, 12, 18, 24, 30, 4, 10, 16, };
    vec_u8_t vfrac8_32 = (vec_u8_t){26, 20, 14, 8, 2, 28, 22, 16, 26, 20, 14, 8, 2, 28, 22, 16, };

    one_line(srv0, srv1, vfrac8_32, vfrac8, vout_0);
    one_line(srv2, srv3, vfrac8_32, vfrac8, vout_1);
    one_line(srv4, srv5, vfrac8_32, vfrac8, vout_2);
    one_line(srv6, srv7, vfrac8_32, vfrac8, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
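
/* `one_line(ref0, ref1, frac32, frac, out)` is a macro defined before this
 * point; judging from the written-out sequence in intra_pred<4, 17>, it
 * computes out = pack((frac32 * ref0 + frac * ref1 + 16) >> 5) over the
 * even/odd lanes, i.e. one interleaved 16-pixel row (or row pair) per
 * invocation. */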

template<>
void intra_pred<16, 17>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0xc, 0xb, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, 0x1, 0x0, 0x0, };
vec_u8_t mask1={0xd, 0xc, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x2, 0x1, 0x1, };
vec_u8_t mask2={0xe, 0xd, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0x3, 0x2, 0x2, };
vec_u8_t mask3={0xf, 0xe, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x4, 0x3, 0x3, };
vec_u8_t mask4={0x10, 0xf, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x5, 0x4, 0x4, };
vec_u8_t mask5={0x11, 0x10, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x6, 0x5, 0x5, };
vec_u8_t mask6={0x12, 0x11, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x7, 0x6, 0x6, };
vec_u8_t mask7={0x13, 0x12, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x8, 0x7, 0x7, };
vec_u8_t mask8={0x14, 0x13, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0x9, 0x8, 0x8, };
vec_u8_t mask9={0x15, 0x14, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xa, 0x9, 0x9, };
vec_u8_t mask10={0x16, 0x15, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xb, 0xa, 0xa, };
vec_u8_t mask11={0x17, 0x16, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xc, 0xb, 0xb, };
vec_u8_t mask12={0x18, 0x17, 0x16, 0x15, 0x14, 0x14, 0x13, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xd, 0xc, 0xc, };
vec_u8_t mask13={0x19, 0x18, 0x17, 0x16, 0x15, 0x15, 0x14, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xe, 0xd, 0xd, };
vec_u8_t mask14={0x1a, 0x19, 0x18, 0x17, 0x16, 0x16, 0x15, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0xf, 0xe, 0xe, };
vec_u8_t mask15={0x1b, 0x1a, 0x19, 0x18, 0x17, 0x17, 0x16, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x10, 0xf, 0xf, };
vec_u8_t maskadd1_15={0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x14, 0x13, 0x12, 0x11, 0x10, 0x10, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(0, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t refmask_16={0xf, 0xe, 0xc, 0xb, 0xa, 0x9, 0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x00, 0x10, 0x11, 0x12};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(36, srcPix0);  
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 =vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12= vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = srv1; 
    vec_u8_t srv1_add1 = srv2;
    vec_u8_t srv2_add1 = srv3;
    vec_u8_t srv3_add1 = srv4;
    vec_u8_t srv4_add1 = srv5; 
    vec_u8_t srv5_add1 = srv6; 
    vec_u8_t srv6_add1 = srv7;
    vec_u8_t srv7_add1 = srv8; 
    vec_u8_t srv8_add1 = srv9;
    vec_u8_t srv9_add1 = srv10;
    vec_u8_t srv10_add1 = srv11;
    vec_u8_t srv11_add1 = srv12;
    vec_u8_t srv12_add1= srv13; 
    vec_u8_t srv13_add1 = srv14;
    vec_u8_t srv14_add1 = srv15; 
    vec_u8_t srv15_add1 = vec_perm(s0, s1, maskadd1_15);
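
    /* The "+1" operands alias the next row's window: each maskN advances the
     * reference window by one byte, so ref[offset + x + 1] for row N is
     * exactly srv(N+1).  Only the last row needs a dedicated vec_perm with
     * maskadd1_15. */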

    vec_u8_t vfrac16 = (vec_u8_t){6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, };
    vec_u8_t vfrac16_32 = (vec_u8_t){26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 32, };

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32, vfrac16, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32, vfrac16, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32, vfrac16, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32, vfrac16, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32, vfrac16, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32, vfrac16, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32, vfrac16, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32, vfrac16, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32, vfrac16, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32, vfrac16, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32, vfrac16, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32, vfrac16, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32, vfrac16, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32, vfrac16, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32, vfrac16, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32, vfrac16, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 17>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x19, 0x18, 0x17, 0x16, 0x15, 0x15, 0x14, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xe, 0xd, 0xd, };
vec_u8_t mask1={0x1a, 0x19, 0x18, 0x17, 0x16, 0x16, 0x15, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0xf, 0xe, 0xe, };
vec_u8_t mask2={0x1b, 0x1a, 0x19, 0x18, 0x17, 0x17, 0x16, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x10, 0xf, 0xf, };
vec_u8_t mask3={0xc, 0xb, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, 0x1, 0x0, 0x0, };
vec_u8_t mask4={0xd, 0xc, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x2, 0x1, 0x1, };
vec_u8_t mask5={0xe, 0xd, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0x3, 0x2, 0x2, };
vec_u8_t mask6={0xf, 0xe, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x4, 0x3, 0x3, };
vec_u8_t mask7={0x10, 0xf, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x5, 0x4, 0x4, };
vec_u8_t mask8={0x11, 0x10, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x6, 0x5, 0x5, };
vec_u8_t mask9={0x12, 0x11, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x7, 0x6, 0x6, };
vec_u8_t mask10={0x13, 0x12, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x8, 0x7, 0x7, };
vec_u8_t mask11={0x14, 0x13, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0x9, 0x8, 0x8, };
vec_u8_t mask12={0x15, 0x14, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xa, 0x9, 0x9, };
vec_u8_t mask13={0x16, 0x15, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xb, 0xa, 0xa, };
vec_u8_t mask14={0x17, 0x16, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xc, 0xb, 0xb, };
vec_u8_t mask15={0x18, 0x17, 0x16, 0x15, 0x14, 0x14, 0x13, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xd, 0xc, 0xc, };

vec_u8_t mask16_0={0xc, 0xb, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, 0x1, 0x0, 0x0, };
vec_u8_t mask16_1={0xd, 0xc, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x6, 0x5, 0x5, 0x4, 0x3, 0x2, 0x1, 0x1, };
vec_u8_t mask16_2={0xe, 0xd, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x7, 0x6, 0x6, 0x5, 0x4, 0x3, 0x2, 0x2, };
vec_u8_t mask16_3={0xf, 0xe, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x8, 0x7, 0x7, 0x6, 0x5, 0x4, 0x3, 0x3, };
vec_u8_t mask16_4={0x10, 0xf, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x5, 0x4, 0x4, };
vec_u8_t mask16_5={0x11, 0x10, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xa, 0x9, 0x9, 0x8, 0x7, 0x6, 0x5, 0x5, };
vec_u8_t mask16_6={0x12, 0x11, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xb, 0xa, 0xa, 0x9, 0x8, 0x7, 0x6, 0x6, };
vec_u8_t mask16_7={0x13, 0x12, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xc, 0xb, 0xb, 0xa, 0x9, 0x8, 0x7, 0x7, };
vec_u8_t mask16_8={0x14, 0x13, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xd, 0xc, 0xc, 0xb, 0xa, 0x9, 0x8, 0x8, };
vec_u8_t mask16_9={0x15, 0x14, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xe, 0xd, 0xd, 0xc, 0xb, 0xa, 0x9, 0x9, };
vec_u8_t mask16_10={0x16, 0x15, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0xf, 0xe, 0xe, 0xd, 0xc, 0xb, 0xa, 0xa, };
vec_u8_t mask16_11={0x17, 0x16, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x10, 0xf, 0xf, 0xe, 0xd, 0xc, 0xb, 0xb, };
vec_u8_t mask16_12={0x18, 0x17, 0x16, 0x15, 0x14, 0x14, 0x13, 0x12, 0x11, 0x10, 0x10, 0xf, 0xe, 0xd, 0xc, 0xc, };
vec_u8_t mask16_13={0x19, 0x18, 0x17, 0x16, 0x15, 0x15, 0x14, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xe, 0xd, 0xd, };
vec_u8_t mask16_14={0x1a, 0x19, 0x18, 0x17, 0x16, 0x16, 0x15, 0x14, 0x13, 0x12, 0x12, 0x11, 0x10, 0xf, 0xe, 0xe, };
vec_u8_t mask16_15={0x1b, 0x1a, 0x19, 0x18, 0x17, 0x17, 0x16, 0x15, 0x14, 0x13, 0x13, 0x12, 0x11, 0x10, 0xf, 0xf, };

vec_u8_t maskadd1_31={0x19, 0x18, 0x17, 0x16, 0x15, 0x15, 0x14, 0x13, 0x12, 0x11, 0x11, 0x10, 0xf, 0xe, 0xd, 0xd, };
vec_u8_t maskadd1_16_31={0xc, 0xb, 0xa, 0x9, 0x8, 0x8, 0x7, 0x6, 0x5, 0x4, 0x4, 0x3, 0x2, 0x1, 0x0, 0x0, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    

    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
    vec_u8_t refmask_32_0={0x1f, 0x1e, 0x1c, 0x1b, 0x1a, 0x19, 0x17, 0x16, 0x15, 0x14, 0x12, 0x11, 0x10, 0xf, 0xe, 0xc };
    vec_u8_t refmask_32_1={0xb, 0xa, 0x9, 0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};

    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(71, srcPix0);  
    vec_u8_t s3 = vec_xl(87, srcPix0);  

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s1, s1, mask3);
    vec_u8_t srv4 = vec_perm(s1, s1, mask4); 
    vec_u8_t srv5 = vec_perm(s1, s1, mask5);
    vec_u8_t srv6 = vec_perm(s1, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv8 = vec_perm(s1, s2, mask8); 
    vec_u8_t srv9 = vec_perm(s1, s2, mask9);
    vec_u8_t srv10 = vec_perm(s1, s2, mask10);
    vec_u8_t srv11 = vec_perm(s1, s2, mask11);
    vec_u8_t srv12= vec_perm(s1, s2, mask12); 
    vec_u8_t srv13 = vec_perm(s1, s2, mask13);
    vec_u8_t srv14 = vec_perm(s1, s2, mask14); 
    vec_u8_t srv15 = vec_perm(s1, s2, mask15);

    vec_u8_t srv16_0 = vec_perm(s0, s1, mask16_0); 
    vec_u8_t srv16_1 = vec_perm(s0, s1, mask16_1);
    vec_u8_t srv16_2 = vec_perm(s0, s1, mask16_2);
    vec_u8_t srv16_3 = vec_perm(s0, s1, mask16_3);
    vec_u8_t srv16_4 = vec_perm(s0, s1, mask16_4); 
    vec_u8_t srv16_5 = vec_perm(s0, s1, mask16_5);
    vec_u8_t srv16_6 = vec_perm(s0, s1, mask16_6); 
    vec_u8_t srv16_7 = vec_perm(s0, s1, mask16_7);
    vec_u8_t srv16_8 = vec_perm(s0, s1, mask16_8); 
    vec_u8_t srv16_9 = vec_perm(s0, s1, mask16_9);
    vec_u8_t srv16_10 = vec_perm(s0, s1, mask16_10);
    vec_u8_t srv16_11 = vec_perm(s0, s1, mask16_11);
    vec_u8_t srv16_12= vec_perm(s0, s1, mask16_12); 
    vec_u8_t srv16_13 = vec_perm(s0, s1, mask16_13);
    vec_u8_t srv16_14 = vec_perm(s0, s1, mask16_14); 
    vec_u8_t srv16_15 = vec_perm(s0, s1, mask16_15);

    vec_u8_t  srv16 = vec_perm(s1, s2, mask0);  
    vec_u8_t  srv17 = vec_perm(s1, s2, mask1);
    vec_u8_t  srv18 = vec_perm(s1, s2, mask2);
    vec_u8_t  srv19 = vec_perm(s2, s2, mask3);
    vec_u8_t  srv20 = vec_perm(s2, s2, mask4);
    vec_u8_t  srv21 = vec_perm(s2, s2, mask5);
    vec_u8_t  srv22 = vec_perm(s2, s2, mask6);
    vec_u8_t  srv23 = vec_perm(s2, s3, mask7);
    vec_u8_t  srv24 = vec_perm(s2, s3, mask8);
    vec_u8_t  srv25 = vec_perm(s2, s3, mask9);
    vec_u8_t  srv26 = vec_perm(s2, s3, mask10);
    vec_u8_t  srv27 = vec_perm(s2, s3, mask11);
    vec_u8_t  srv28 = vec_perm(s2, s3, mask12);
    vec_u8_t  srv29 = vec_perm(s2, s3, mask13);
    vec_u8_t  srv30 = vec_perm(s2, s3, mask14);
    vec_u8_t  srv31 = vec_perm(s2, s3, mask15);

    vec_u8_t  srv16_16 = vec_perm(s1, s2, mask16_0);  
    vec_u8_t  srv16_17 = vec_perm(s1, s2, mask16_1);
    vec_u8_t  srv16_18 = vec_perm(s1, s2, mask16_2);
    vec_u8_t  srv16_19 = vec_perm(s1, s2, mask16_3);
    vec_u8_t  srv16_20 = vec_perm(s1, s2, mask16_4);
    vec_u8_t  srv16_21 = vec_perm(s1, s2, mask16_5);
    vec_u8_t  srv16_22 = vec_perm(s1, s2, mask16_6);
    vec_u8_t  srv16_23 = vec_perm(s1, s2, mask16_7);
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask16_8);
    vec_u8_t  srv16_25 = vec_perm(s1, s2, mask16_9);
    vec_u8_t  srv16_26 = vec_perm(s1, s2, mask16_10);
    vec_u8_t  srv16_27 = vec_perm(s1, s2, mask16_11);
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask16_12);
    vec_u8_t  srv16_29 = vec_perm(s1, s2, mask16_13);
    vec_u8_t  srv16_30 = vec_perm(s1, s2, mask16_14);
    vec_u8_t  srv16_31 = vec_perm(s1, s2, mask16_15);
        
    vec_u8_t srv0add1 = srv1;
    vec_u8_t srv1add1 = srv2;
    vec_u8_t srv2add1 = srv3;
    vec_u8_t srv3add1 = srv4;
    vec_u8_t srv4add1 = srv5; 
    vec_u8_t srv5add1 = srv6; 
    vec_u8_t srv6add1 = srv7;
    vec_u8_t srv7add1 = srv8; 
    vec_u8_t srv8add1 = srv9;
    vec_u8_t srv9add1 = srv10;
    vec_u8_t srv10add1 = srv11;
    vec_u8_t srv11add1 = srv12;
    vec_u8_t srv12add1= srv13; 
    vec_u8_t srv13add1 = srv14;
    vec_u8_t srv14add1 = srv15; 
    vec_u8_t srv15add1 = srv16;

    vec_u8_t srv16add1_0 = srv16_1;
    vec_u8_t srv16add1_1 = srv16_2;
    vec_u8_t srv16add1_2 = srv16_3;
    vec_u8_t srv16add1_3 = srv16_4;
    vec_u8_t srv16add1_4 = srv16_5; 
    vec_u8_t srv16add1_5 = srv16_6;
    vec_u8_t srv16add1_6 = srv16_7; 
    vec_u8_t srv16add1_7 = srv16_8;
    vec_u8_t srv16add1_8 = srv16_9; 
    vec_u8_t srv16add1_9 = srv16_10;
    vec_u8_t srv16add1_10 = srv16_11;
    vec_u8_t srv16add1_11 = srv16_12;
    vec_u8_t srv16add1_12= srv16_13; 
    vec_u8_t srv16add1_13 = srv16_14;
    vec_u8_t srv16add1_14 = srv16_15; 
    vec_u8_t srv16add1_15 = srv16_16;

    vec_u8_t  srv16add1 =  srv17;  
    vec_u8_t  srv17add1 = srv18;
    vec_u8_t  srv18add1 = srv19;
    vec_u8_t  srv19add1 = srv20;
    vec_u8_t  srv20add1 = srv21;
    vec_u8_t  srv21add1 = srv22;
    vec_u8_t  srv22add1 = srv23;
    vec_u8_t  srv23add1 = srv24;
    vec_u8_t  srv24add1 = srv25;
    vec_u8_t  srv25add1 = srv26;
    vec_u8_t  srv26add1 = srv27;
    vec_u8_t  srv27add1 = srv28;
    vec_u8_t  srv28add1 = srv29;
    vec_u8_t  srv29add1 = srv30;
    vec_u8_t  srv30add1 = srv31;
    vec_u8_t  srv31add1 = vec_perm(s2, s3, maskadd1_31);

    vec_u8_t  srv16add1_16 = srv16_17;   
    vec_u8_t  srv16add1_17 = srv16_18;
    vec_u8_t  srv16add1_18 = srv16_19;
    vec_u8_t  srv16add1_19 = srv16_20;
    vec_u8_t  srv16add1_20 = srv16_21;
    vec_u8_t  srv16add1_21 = srv16_22;
    vec_u8_t  srv16add1_22 = srv16_23;
    vec_u8_t  srv16add1_23 = srv16_24;
    vec_u8_t  srv16add1_24 = srv16_25;
    vec_u8_t  srv16add1_25 = srv16_26;
    vec_u8_t  srv16add1_26 = srv16_27;
    vec_u8_t  srv16add1_27 = srv16_28;
    vec_u8_t  srv16add1_28 = srv16_29;
    vec_u8_t  srv16add1_29 = srv16_30;
    vec_u8_t  srv16add1_30 = srv16_31;
    vec_u8_t  srv16add1_31 = vec_perm(s2, s2, maskadd1_16_31);

    vec_u8_t vfrac32_0 = (vec_u8_t){6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, };
    vec_u8_t vfrac32_1 = (vec_u8_t){6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, };
    vec_u8_t vfrac32_32_0 = (vec_u8_t){26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 32, };
    vec_u8_t vfrac32_32_1 = (vec_u8_t){26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 32, };
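
    /* Mode 17's fraction sequence has period 16, so the tables for rows
     * 0..15 and 16..31 come out identical; presumably they are kept as two
     * pairs only to mirror the layout used by the other 32-wide modes. */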

    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv1, srv1add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv2, srv2add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv3, srv3add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv4, srv4add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv5, srv5add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv6, srv6add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv7, srv7add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv8, srv8add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv9, srv9add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv10, srv10add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv11, srv11add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv12, srv12add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv13, srv13add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv14, srv14add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv15, srv15add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac32_32_0, vfrac32_0, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac32_32_1, vfrac32_1, vout_1);

    one_line(srv17, srv17add1, vfrac32_32_0, vfrac32_0, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac32_32_1, vfrac32_1, vout_3);

    one_line(srv18, srv18add1, vfrac32_32_0, vfrac32_0, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac32_32_1, vfrac32_1, vout_5);

    one_line(srv19, srv19add1, vfrac32_32_0, vfrac32_0, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac32_32_1, vfrac32_1, vout_7);

    one_line(srv20, srv20add1, vfrac32_32_0, vfrac32_0, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac32_32_1, vfrac32_1, vout_9);

    one_line(srv21, srv21add1, vfrac32_32_0, vfrac32_0, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac32_32_1, vfrac32_1, vout_11);

    one_line(srv22, srv22add1, vfrac32_32_0, vfrac32_0, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac32_32_1, vfrac32_1, vout_13);

    one_line(srv23, srv23add1, vfrac32_32_0, vfrac32_0, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac32_32_1, vfrac32_1, vout_15);

    one_line(srv24, srv24add1, vfrac32_32_0, vfrac32_0, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac32_32_1, vfrac32_1, vout_17);

    one_line(srv25, srv25add1, vfrac32_32_0, vfrac32_0, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac32_32_1, vfrac32_1, vout_19);

    one_line(srv26, srv26add1, vfrac32_32_0, vfrac32_0, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac32_32_1, vfrac32_1, vout_21);

    one_line(srv27, srv27add1, vfrac32_32_0, vfrac32_0, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac32_32_1, vfrac32_1, vout_23);

    one_line(srv28, srv28add1, vfrac32_32_0, vfrac32_0, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac32_32_1, vfrac32_1, vout_25);

    one_line(srv29, srv29add1, vfrac32_32_0, vfrac32_0, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac32_32_1, vfrac32_1, vout_27);

    one_line(srv30, srv30add1, vfrac32_32_0, vfrac32_0, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac32_32_1, vfrac32_1, vout_29);

    one_line(srv31, srv31add1, vfrac32_32_0, vfrac32_0, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac32_32_1, vfrac32_1, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void intra_pred<4, 18>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    //vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, };
    //vec_u8_t mask1={0x4, 0x5, 0x6, 0x7, 0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, };


    vec_u8_t srv_left = vec_xl(8, srcPix0);  /* picks up the left reference samples */
    vec_u8_t srv_right = vec_xl(0, srcPix0); /* picks up the top reference samples */
    vec_u8_t refmask_4={0x3, 0x2, 0x1, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);    
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* row y reads the assembled reference at offset[y]; the shuffle mask must be updated per mode */
    //vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    //vec_u8_t vfrac4 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    //vec_u8_t vfrac4_32 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    if(dstStride==4){
        vec_xst(srv0, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)srv0, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(srv0, srv0, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(srv0, srv0, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(srv0, srv0, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(srv0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(srv0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(srv0, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(srv0,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
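
/* Mode 18 is the exact 45-degree diagonal: its fractions are all zero (see
 * the commented-out vfrac4 of 0s above), so no blending is needed and each
 * row is a one-byte shift of the assembled reference.  From mask0, the 4x4
 * case reduces to the scalar form
 *
 *     dst[y * dstStride + x] = s[3 + x - y];
 *
 * where `s` is the reversed-left + top reference built by refmask_4. */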

template<>
void intra_pred<8, 18>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, };
//vec_u8_t mask1={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, };
vec_u8_t mask2={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
//vec_u8_t mask3={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, };
vec_u8_t mask4={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
//vec_u8_t mask5={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
//vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };


    //vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    //vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    //vec_u8_t vout_0, vout_1, vout_2, vout_3;  
    //vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
        
    vec_u8_t srv_left = vec_xl(16, srcPix0); /* picks up the left reference samples */
    vec_u8_t srv_right = vec_xl(0, srcPix0); /* picks up the top reference samples */
    vec_u8_t refmask_8={0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    //vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    //vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    //vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    //vec_u8_t srv7 = vec_perm(srv, srv, mask7);

    if(dstStride==8){
        vec_xst(srv0, 0, dst);          
        vec_xst(srv2, 16, dst);         
        vec_xst(srv4, 32, dst);         
        vec_xst(srv6, 48, dst);         
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(srv0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(srv0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(srv2, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(srv2, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(srv4, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(srv4, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(srv6, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(srv6, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 18>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
vec_u8_t mask1={0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, };
vec_u8_t mask2={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
vec_u8_t mask3={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask4={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask5={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask6={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask7={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask8={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask9={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask10={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask11={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask12={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

    vec_u8_t srv_left = vec_xl(32, srcPix0); /* picks up the left reference samples */
    vec_u8_t srv_right = vec_xl(0, srcPix0); /* only the corner sample srcPix0[0] is used below */
    vec_u8_t refmask_16={0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x10};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(1, srcPix0);   
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12= vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = s0;
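
    /* All 16 rows come from just two registers: s0 holds the reversed left
     * reference ending in the corner sample, s1 the top row starting at
     * srcPix0 + 1; the sliding masks extract row y as bytes
     * [15 - y .. 30 - y] of s0:s1, bottoming out at row 15 = s0 itself. */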


    vec_xst(srv0, 0, dst);              
    vec_xst(srv1, dstStride, dst);              
    vec_xst(srv2, dstStride*2, dst);            
    vec_xst(srv3, dstStride*3, dst);            
    vec_xst(srv4, dstStride*4, dst);            
    vec_xst(srv5, dstStride*5, dst);            
    vec_xst(srv6, dstStride*6, dst);            
    vec_xst(srv7, dstStride*7, dst);            
    vec_xst(srv8, dstStride*8, dst);            
    vec_xst(srv9, dstStride*9, dst);            
    vec_xst(srv10, dstStride*10, dst);          
    vec_xst(srv11, dstStride*11, dst);          
    vec_xst(srv12, dstStride*12, dst);          
    vec_xst(srv13, dstStride*13, dst);          
    vec_xst(srv14, dstStride*14, dst);          
    vec_xst(srv15, dstStride*15, dst);          

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 18>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
vec_u8_t mask1={0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, };
vec_u8_t mask2={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
vec_u8_t mask3={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask4={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask5={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask6={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask7={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask8={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask9={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask10={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask11={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask12={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };

    //vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    //vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t refmask_32_0 = {0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10};
    vec_u8_t refmask_32_1 = {0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8, 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x10};

    vec_u8_t srv_left0=vec_xl(64, srcPix0); 
    vec_u8_t srv_left1=vec_xl(80, srcPix0);
    vec_u8_t srv_right=vec_xl(0, srcPix0);
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(1, srcPix0);   
    vec_u8_t s3 = vec_xl(17, srcPix0);  

    vec_u8_t srv0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv1 = vec_perm(s1, s2, mask1);
    vec_u8_t srv2 = vec_perm(s1, s2, mask2);
    vec_u8_t srv3 = vec_perm(s1, s2, mask3);
    vec_u8_t srv4 = vec_perm(s1, s2, mask4); 
    vec_u8_t srv5 = vec_perm(s1, s2, mask5);
    vec_u8_t srv6 = vec_perm(s1, s2, mask6); 
    vec_u8_t srv7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv8 = vec_perm(s1, s2, mask8);
    vec_u8_t srv9 = vec_perm(s1, s2, mask9);
    vec_u8_t srv10 = vec_perm(s1, s2, mask10);
    vec_u8_t srv11 = vec_perm(s1, s2, mask11);
    vec_u8_t srv12= vec_perm(s1, s2, mask12); 
    vec_u8_t srv13 = vec_perm(s1, s2, mask13);
    vec_u8_t srv14 = vec_perm(s1, s2, mask14); 
    vec_u8_t srv15 = s1;

    vec_u8_t srv16_0 = vec_perm(s2, s3, mask0); 
    vec_u8_t srv16_1 = vec_perm(s2, s3, mask1);
    vec_u8_t srv16_2 = vec_perm(s2, s3, mask2);
    vec_u8_t srv16_3 = vec_perm(s2, s3, mask3);
    vec_u8_t srv16_4 = vec_perm(s2, s3, mask4); 
    vec_u8_t srv16_5 = vec_perm(s2, s3, mask5);
    vec_u8_t srv16_6 = vec_perm(s2, s3, mask6); 
    vec_u8_t srv16_7 = vec_perm(s2, s3, mask7);
    vec_u8_t srv16_8 = vec_perm(s2, s3, mask8); 
    vec_u8_t srv16_9 = vec_perm(s2, s3, mask9);
    vec_u8_t srv16_10 = vec_perm(s2, s3, mask10);
    vec_u8_t srv16_11 = vec_perm(s2, s3, mask11);
    vec_u8_t srv16_12= vec_perm(s2, s3, mask12); 
    vec_u8_t srv16_13 = vec_perm(s2, s3, mask13);
    vec_u8_t srv16_14 = vec_perm(s2, s3, mask14); 
    vec_u8_t srv16_15 = s2;

    //0(1,2),1,1,3,4,4,6(1),7(0,1),7,9,10,10,12,13,13,15,16,16,18,19,19,21,22,22,24,25,25,27,28,28,30,30

    vec_u8_t  srv16 = vec_perm(s0, s1, mask0);  
    vec_u8_t  srv17 = vec_perm(s0, s1, mask1);
    vec_u8_t  srv18 = vec_perm(s0, s1, mask2);
    vec_u8_t  srv19 = vec_perm(s0, s1, mask3);
    vec_u8_t  srv20 = vec_perm(s0, s1, mask4);
    vec_u8_t  srv21 = vec_perm(s0, s1, mask5);
    vec_u8_t  srv22 = vec_perm(s0, s1, mask6);
    vec_u8_t  srv23 = vec_perm(s0, s1, mask7);
    vec_u8_t  srv24 = vec_perm(s0, s1, mask8);
    vec_u8_t  srv25 = vec_perm(s0, s1, mask9);
    vec_u8_t  srv26 = vec_perm(s0, s1, mask10);
    vec_u8_t  srv27 = vec_perm(s0, s1, mask11);
    vec_u8_t  srv28 = vec_perm(s0, s1, mask12);
    vec_u8_t  srv29 = vec_perm(s0, s1, mask13);
    vec_u8_t  srv30 = vec_perm(s0, s1, mask14);
    vec_u8_t  srv31 = s0;

    vec_xst(srv0, 0, dst);              
    vec_xst(srv16_0, 16, dst);          
    vec_xst(srv1, dstStride, dst);              
    vec_xst(srv16_1, dstStride+16, dst);                
    vec_xst(srv2, dstStride*2, dst);            
    vec_xst(srv16_2, dstStride*2+16, dst);              
    vec_xst(srv3, dstStride*3, dst);            
    vec_xst(srv16_3, dstStride*3+16, dst);              
    vec_xst(srv4, dstStride*4, dst);            
    vec_xst(srv16_4, dstStride*4+16, dst);              
    vec_xst(srv5, dstStride*5, dst);            
    vec_xst(srv16_5, dstStride*5+16, dst);              
    vec_xst(srv6, dstStride*6, dst);            
    vec_xst(srv16_6, dstStride*6+16, dst);              
    vec_xst(srv7, dstStride*7, dst);            
    vec_xst(srv16_7, dstStride*7+16, dst);              
    vec_xst(srv8, dstStride*8, dst);            
    vec_xst(srv16_8, dstStride*8+16, dst);              
    vec_xst(srv9, dstStride*9, dst);            
    vec_xst(srv16_9, dstStride*9+16, dst);              
    vec_xst(srv10, dstStride*10, dst);          
    vec_xst(srv16_10, dstStride*10+16, dst);            
    vec_xst(srv11, dstStride*11, dst);          
    vec_xst(srv16_11, dstStride*11+16, dst);            
    vec_xst(srv12, dstStride*12, dst);          
    vec_xst(srv16_12, dstStride*12+16, dst);            
    vec_xst(srv13, dstStride*13, dst);          
    vec_xst(srv16_13, dstStride*13+16, dst);            
    vec_xst(srv14, dstStride*14, dst);          
    vec_xst(srv16_14, dstStride*14+16, dst);            
    vec_xst(srv15, dstStride*15, dst);          
    vec_xst(srv16_15, dstStride*15+16, dst);            

    vec_xst(srv16, dstStride*16, dst);          
    vec_xst(srv0, dstStride*16+16, dst);                
    vec_xst(srv17, dstStride*17, dst);          
    vec_xst(srv1, dstStride*17+16, dst);                
    vec_xst(srv18, dstStride*18, dst);          
    vec_xst(srv2, dstStride*18+16, dst);                
    vec_xst(srv19, dstStride*19, dst);          
    vec_xst(srv3, dstStride*19+16, dst);                
    vec_xst(srv20, dstStride*20, dst);          
    vec_xst(srv4, dstStride*20+16, dst);                
    vec_xst(srv21, dstStride*21, dst);          
    vec_xst(srv5, dstStride*21+16, dst);                
    vec_xst(srv22, dstStride*22, dst);          
    vec_xst(srv6, dstStride*22+16, dst);                
    vec_xst(srv23, dstStride*23, dst);          
    vec_xst(srv7, dstStride*23+16, dst);                
    vec_xst(srv24, dstStride*24, dst);          
    vec_xst(srv8, dstStride*24+16, dst);                
    vec_xst(srv25, dstStride*25, dst);          
    vec_xst(srv9, dstStride*25+16, dst);                
    vec_xst(srv26, dstStride*26, dst);          
    vec_xst(srv10, dstStride*26+16, dst);               
    vec_xst(srv27, dstStride*27, dst);          
    vec_xst(srv11, dstStride*27+16, dst);               
    vec_xst(srv28, dstStride*28, dst);          
    vec_xst(srv12, dstStride*28+16, dst);               
    vec_xst(srv29, dstStride*29, dst);          
    vec_xst(srv13, dstStride*29+16, dst);               
    vec_xst(srv30, dstStride*30, dst);          
    vec_xst(srv14, dstStride*30+16, dst);               
    vec_xst(srv31, dstStride*31, dst);          
    vec_xst(srv15, dstStride*31+16, dst);               


#ifdef DEBUG
    for (int y = 0; y < 32; y++)
    {
        for (int x = 0; x < 32; x++)
        {
            printf("%d ", dst[y * dstStride + x]);
        }
        printf("\n");
    }
    printf("\n\n");
#endif
}

template<>
void intra_pred<4, 19>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t mask0={0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, };
vec_u8_t mask1={0x4, 0x5, 0x6, 0x7, 0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, };


    // mode 19 (angle -26, invAngle 315):
    // offset[32]   = {-1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26}
    // fraction[32] = {6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0}
    // projected left references (invAngleSum >> 8), width 32: 1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23, 25, 26, 27, 28, 30, 31
    // width 4 needs only {1, 2, 4}
    vec_u8_t srv_left = vec_xl(8, srcPix0);  /* left-neighbour samples, to be projected onto the main reference */
    vec_u8_t srv_right = vec_xl(0, srcPix0); /* top-left corner + above samples */
    vec_u8_t refmask_4={0x4, 0x2, 0x1, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};

    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* mask0 gathers ref[offset[y] + x] for all four rows; mask1 the x+1 neighbours */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

vec_u8_t vfrac4 = (vec_u8_t){6, 6, 6, 6, 12, 12, 12, 12, 18, 18, 18, 18, 24, 24, 24, 24};
vec_u8_t vfrac4_32 = (vec_u8_t){26, 26, 26, 26, 20, 20, 20, 20, 14, 14, 14, 14, 8, 8, 8, 8};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
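    /* vec_mule/vec_mulo widen the u8 products into u16 even/odd lanes, so the
       rounding add and >>5 happen at 16-bit precision; the mergeh/mergel + pack
       above re-interleave the two halves back into pixel order. */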

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
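        /* note: the fixed byte offsets below (16/32/48) only land on row starts
           when dstStride == 16, which this branch effectively assumes */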
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout, vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
    for (int y = 0; y < 4; y++)
    {
        for (int x = 0; x < 4; x++)
        {
            printf("%d ", dst[y * dstStride + x]);
        }
        printf("\n");
    }
    printf("\n\n");
#endif
}
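
/* For reference, a scalar model of the angular interpolation that the
 * specialisations in this file vectorise. This is a hypothetical sketch (the
 * helper below is not part of x265) and is kept compiled out; offset[] and
 * fraction[] stand for the per-mode tables quoted in the comments above. */
#if 0
static void intraPredAngRef(pixel* dst, intptr_t dstStride, const pixel* ref,
                            const int* offset, const int* fraction, int width)
{
    for (int y = 0; y < width; y++)
    {
        for (int x = 0; x < width; x++)
        {
            /* linear blend of two neighbouring reference pixels, 5-bit fixed point */
            dst[y * dstStride + x] = (pixel)(((32 - fraction[y]) * ref[offset[y] + x]
                                   + fraction[y] * ref[offset[y] + x + 1] + 16) >> 5);
        }
    }
}
#endif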

template<>
void intra_pred<8, 19>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, };
vec_u8_t mask1={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, };
vec_u8_t mask2={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask3={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
vec_u8_t mask4={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask5={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
        
    vec_u8_t srv_left = vec_xl(16, srcPix0); /* left-neighbour samples, to be projected onto the main reference */
    vec_u8_t srv_right = vec_xl(0, srcPix0); /* top-left corner + above samples */
    vec_u8_t refmask_8={0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);


    /* fraction[0-7] */
vec_u8_t vfrac8_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac8_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac8_2 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac8_3 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 16, 16, 16, 16, 16, 16, 16, 16};

    /* 32 - fraction[0-7] */
vec_u8_t vfrac8_32_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac8_32_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac8_32_2 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac8_32_3 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 16, 16, 16, 16, 16, 16, 16, 16};

one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);
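
/* one_line (a macro defined earlier in this file) expands to the same
   mule/mulo + add + shift-right + pack sequence written out in
   intra_pred<4, 19> above; each call here produces one 16-byte pair of rows. */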

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
    for (int y = 0; y < 8; y++)
    {
        for (int x = 0; x < 8; x++)
        {
            printf("%d ", dst[y * dstStride + x]);
        }
        printf("\n");
    }
    printf("\n\n");
#endif
}

template<>
void intra_pred<16, 19>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask1={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask2={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask3={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask4={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
//vec_u8_t mask5={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask6={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask7={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask8={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask9={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask10={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask11={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask14={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left = vec_xl(32, srcPix0); /* left-neighbour samples, to be projected onto the main reference */
    vec_u8_t srv_right = vec_xl(0, srcPix0); /* top-left corner + above samples */
    vec_u8_t refmask_16 ={0xf, 0xe, 0xc, 0xb, 0xa, 0x9, 0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(4, srcPix0);   
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12 = vec_perm(s0, s1, mask12);
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0;
    vec_u8_t srv2_add1 = srv1;
    vec_u8_t srv3_add1 = srv2;
    vec_u8_t srv4_add1 = srv3; 
    vec_u8_t srv5_add1 = srv3; 
    vec_u8_t srv6_add1 = srv4;
    vec_u8_t srv7_add1 = srv6; 
    vec_u8_t srv8_add1 = srv7;
    vec_u8_t srv9_add1 = srv8;
    vec_u8_t srv10_add1 = srv8;
    vec_u8_t srv11_add1 = srv9;
    vec_u8_t srv12_add1 = srv11;
    vec_u8_t srv13_add1 = srv12;
    vec_u8_t srv14_add1 = srv13; 
    vec_u8_t srv15_add1 = srv13;
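
    /* offset[y] repeats for some rows (e.g. offset[4] == offset[5] == -5), so
       the corresponding shifted reference vectors are reused instead of
       re-permuted. */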


    /* fraction[0-15] */
vec_u8_t vfrac16_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_1 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_2 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_4 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_5 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_6 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_8 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_9 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_10 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_12 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_13 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_14 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

        /* 32- fraction[0-15] */
vec_u8_t vfrac16_32_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_1 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_2 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_4 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_5 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_6 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_8 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_9 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_10 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_12 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_13 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_14 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
    for (int y = 0; y < 16; y++)
    {
        for (int x = 0; x < 16; x++)
        {
            printf("%d ", dst[y * dstStride + x]);
        }
        printf("\n");
    }
    printf("\n\n");
#endif
}

template<>
void intra_pred<32, 19>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask1={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask2={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask3={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask4={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask5={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask6={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask7={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask8={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask9={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask10={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask11={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask12={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
vec_u8_t mask13={0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, };
vec_u8_t mask14={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
//vec_u8_t mask15={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
vec_u8_t mask16={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };

vec_u8_t mask17={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask18={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask19={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask20={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
//vec_u8_t mask21={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask22={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask23={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask24={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask25={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask26={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask27={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask28={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask29={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t refmask_32_0 ={0x1f, 0x1e, 0x1c, 0x1b, 0x1a, 0x19, 0x17, 0x16, 0x15, 0x14, 0x12, 0x11, 0x10, 0xf, 0xe, 0xc};
    vec_u8_t refmask_32_1 = {0xb, 0xa, 0x9, 0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};

    vec_u8_t srv_left0=vec_xl(64, srcPix0); 
    vec_u8_t srv_left1=vec_xl(80, srcPix0);
    vec_u8_t srv_right=vec_xl(0, srcPix0);
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(7, srcPix0);   
    vec_u8_t s3 = vec_xl(16+7, srcPix0);        
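    /* s0/s1 gather the left samples projected onto the main reference (negative
       offsets), s2/s3 load the above reference; each row below is then a
       permute of two adjacent vectors. */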
        
    vec_u8_t srv0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv1 = vec_perm(s1, s2, mask1);
    vec_u8_t srv2 = vec_perm(s1, s2, mask2);
    vec_u8_t srv3 = vec_perm(s1, s2, mask3);
    vec_u8_t srv4 = vec_perm(s1, s2, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = vec_perm(s1, s2, mask6); 
    vec_u8_t srv7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv8 = vec_perm(s1, s2, mask8); 
    vec_u8_t srv9 = vec_perm(s1, s2, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = s1;
    vec_u8_t srv12 = vec_perm(s0, s1, mask12);
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;

    vec_u8_t srv16_0 = vec_perm(s2, s3, mask0); 
    vec_u8_t srv16_1 = vec_perm(s2, s3, mask1);
    vec_u8_t srv16_2 = vec_perm(s2, s3, mask2);
    vec_u8_t srv16_3 = vec_perm(s2, s3, mask3);
    vec_u8_t srv16_4 = vec_perm(s2, s3, mask4); 
    vec_u8_t srv16_5 = srv16_4;
    vec_u8_t srv16_6 = vec_perm(s2, s3, mask6); 
    vec_u8_t srv16_7 = vec_perm(s2, s3, mask7);
    vec_u8_t srv16_8 = vec_perm(s2, s3, mask8); 
    vec_u8_t srv16_9 = vec_perm(s2, s3, mask9);
    vec_u8_t srv16_10 = srv16_9;
    vec_u8_t srv16_11 = s2;
    vec_u8_t srv16_12 = vec_perm(s1, s2, mask12);
    vec_u8_t srv16_13 = vec_perm(s1, s2, mask13);
    vec_u8_t srv16_14 = vec_perm(s1, s2, mask14); 
    vec_u8_t srv16_15 = srv16_14;
    // per-row offset indices (parenthesised: extra source vectors): 0, 1, 2, 3, 4, 4, 6, 7, 8, 9, 9(1,2), 11(1), 12(0,1), 13, 14, 14, 15, 16, 17, 18, 19, 20, 20, 22, 23, 24, 25, 25, 27, 28, 29, 30(0), 30

    vec_u8_t  srv16 = vec_perm(s0, s1, mask16);  
    vec_u8_t  srv17 = vec_perm(s0, s1, mask17);
    vec_u8_t  srv18 = vec_perm(s0, s1, mask18);
    vec_u8_t  srv19 = vec_perm(s0, s1, mask19);
    vec_u8_t  srv20 = vec_perm(s0, s1, mask20);
    vec_u8_t  srv21 = srv20;
    vec_u8_t  srv22 = vec_perm(s0, s1, mask22);
    vec_u8_t  srv23 = vec_perm(s0, s1, mask23);
    vec_u8_t  srv24 = vec_perm(s0, s1, mask24);
    vec_u8_t  srv25 = vec_perm(s0, s1, mask25);
    vec_u8_t  srv26 = srv25;
    vec_u8_t  srv27 = vec_perm(s0, s1, mask27);
    vec_u8_t  srv28 = vec_perm(s0, s1, mask28);
    vec_u8_t  srv29 = vec_perm(s0, s1, mask29);
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = vec_perm(s1, s2, mask16);  
    vec_u8_t  srv16_17 = vec_perm(s1, s2, mask17);
    vec_u8_t  srv16_18 = vec_perm(s1, s2, mask18);
    vec_u8_t  srv16_19 = vec_perm(s1, s2, mask19);
    vec_u8_t  srv16_20 = vec_perm(s1, s2, mask20);
    vec_u8_t  srv16_21 = srv16_20;
    vec_u8_t  srv16_22 = vec_perm(s1, s2, mask22);
    vec_u8_t  srv16_23 = vec_perm(s1, s2, mask23);
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask24);
    vec_u8_t  srv16_25 = vec_perm(s1, s2, mask25);
    vec_u8_t  srv16_26 = srv16_25;
    vec_u8_t  srv16_27 = vec_perm(s1, s2, mask27);
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask28);
    vec_u8_t  srv16_29 = vec_perm(s1, s2, mask29);
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;
        
    vec_u8_t srv0add1 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv1add1 = srv0;
    vec_u8_t srv2add1 = srv1;
    vec_u8_t srv3add1 = srv2;
    vec_u8_t srv4add1 = srv3; 
    vec_u8_t srv5add1 = srv3; 
    vec_u8_t srv6add1 = srv4;
    vec_u8_t srv7add1 = srv6; 
    vec_u8_t srv8add1 = srv7;
    vec_u8_t srv9add1 = srv8;
    vec_u8_t srv10add1 = srv8;
    vec_u8_t srv11add1 = srv9;
    vec_u8_t srv12add1 = srv11;
    vec_u8_t srv13add1 = srv12;
    vec_u8_t srv14add1 = srv13; 
    vec_u8_t srv15add1 = srv13;

    // per-row add1 indices: 0(1,2), 1, 2, 3, 3, 4, 6, 7, 8, 8, 9, 11(1), 12(0,1), 13, 13, 14, 16, 17, 18, 19, 19, 20, 22, 26, 24, 24, 25, 27, 28, 29, 29

    vec_u8_t srv16add1_0 = vec_perm(s2, s3, maskadd1_0);
    vec_u8_t srv16add1_1 = srv16_0;
    vec_u8_t srv16add1_2 = srv16_1;
    vec_u8_t srv16add1_3 = srv16_2;
    vec_u8_t srv16add1_4 = srv16_3; 
    vec_u8_t srv16add1_5 = srv16_3;
    vec_u8_t srv16add1_6 = srv16_4; 
    vec_u8_t srv16add1_7 = srv16_6;
    vec_u8_t srv16add1_8 = srv16_7; 
    vec_u8_t srv16add1_9 = srv16_8;
    vec_u8_t srv16add1_10 = srv16_8;
    vec_u8_t srv16add1_11 = srv16_9;
    vec_u8_t srv16add1_12 = srv16_11;
    vec_u8_t srv16add1_13 = srv16_12;
    vec_u8_t srv16add1_14 = srv16_13; 
    vec_u8_t srv16add1_15 = srv16_13;

    vec_u8_t  srv16add1 =  srv14;  
    vec_u8_t  srv17add1 = srv16;
    vec_u8_t  srv18add1 = srv17;
    vec_u8_t  srv19add1 = srv18;
    vec_u8_t  srv20add1 = srv19;
    vec_u8_t  srv21add1 = srv19;
    vec_u8_t  srv22add1 = srv20;
    vec_u8_t  srv23add1 = srv22;
    vec_u8_t  srv24add1 = srv23;
    vec_u8_t  srv25add1 = srv24;
    vec_u8_t  srv26add1 = srv24;
    vec_u8_t  srv27add1 = srv25;
    vec_u8_t  srv28add1 = srv27;
    vec_u8_t  srv29add1 = srv28;
    vec_u8_t  srv30add1 = srv29;
    vec_u8_t  srv31add1 = srv29;

    vec_u8_t  srv16add1_16 = srv16_14;   
    vec_u8_t  srv16add1_17 = srv16_16;
    vec_u8_t  srv16add1_18 = srv16_17;
    vec_u8_t  srv16add1_19 = srv16_18;
    vec_u8_t  srv16add1_20 = srv16_19;
    vec_u8_t  srv16add1_21 = srv16_19;
    vec_u8_t  srv16add1_22 = srv16_20;
    vec_u8_t  srv16add1_23 = srv16_22;
    vec_u8_t  srv16add1_24 = srv16_23;
    vec_u8_t  srv16add1_25 = srv16_24;
    vec_u8_t  srv16add1_26 = srv16_24;
    vec_u8_t  srv16add1_27 = srv16_25;
    vec_u8_t  srv16add1_28 = srv16_27;
    vec_u8_t  srv16add1_29 = srv16_28;
    vec_u8_t  srv16add1_30 = srv16_29;
    vec_u8_t  srv16add1_31 = srv16_29;

vec_u8_t vfrac16_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_1 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_2 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_4 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_5 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_6 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_8 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_9 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_10 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_12 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_13 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_14 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_1 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_2 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_4 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_5 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_6 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_8 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_9 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_10 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_12 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_13 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_14 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
    for (int y = 0; y < 32; y++)
    {
        for (int x = 0; x < 32; x++)
        {
            printf("%d ", dst[y * dstStride + x]);
        }
        printf("\n");
    }
    printf("\n\n");
#endif
}
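
/* The refmask_* permutes above implement the negative-angle reference
 * projection. A scalar model of that step (a hypothetical sketch, not the x265
 * helper; `left` stands for the left-neighbour column within the srcPix0
 * layout), kept compiled out:
 */
#if 0
static void projectLeftRef(pixel* ref, const pixel* left, int invAngle, int nbProjected)
{
    int invAngleSum = 128;
    for (int i = -1; i >= -nbProjected; i--)
    {
        invAngleSum += invAngle;            /* 315 for mode 19, 390 for mode 20 */
        ref[i] = left[(invAngleSum >> 8) - 1];
    }
}
#endif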

template<>
void intra_pred<4, 20>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t mask0={0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, };
vec_u8_t mask1={0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, };


    // mode 20 (angle -21, invAngle 390):
    // offset[y]   = ((y + 1) * -21) >> 5  -> {-1, -2, -2, -3, ...}
    // fraction[y] = ((y + 1) * -21) & 31  -> {11, 22, 1, 12, ...}
    // projected left references (invAngleSum >> 8), width 4: {2, 3}
    vec_u8_t srv_left = vec_xl(8, srcPix0);  /* left-neighbour samples, to be projected onto the main reference */
    vec_u8_t srv_right = vec_xl(0, srcPix0); /* top-left corner + above samples */
    vec_u8_t refmask_4={0x3, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* mask0 gathers ref[offset[y] + x] for all four rows; mask1 the x+1 neighbours */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

vec_u8_t vfrac4 = (vec_u8_t){11, 11, 11, 11, 22, 22, 22, 22, 1, 1, 1, 1, 12, 12, 12, 12};
vec_u8_t vfrac4_32 = (vec_u8_t){21, 21, 21, 21, 10, 10, 10, 10, 31, 31, 31, 31, 20, 20, 20, 20};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
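        /* same assumption as in intra_pred<4, 19>: the fixed offsets 16/32/48
           imply dstStride == 16 */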
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout, vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
    for (int y = 0; y < 4; y++)
    {
        for (int x = 0; x < 4; x++)
        {
            printf("%d ", dst[y * dstStride + x]);
        }
        printf("\n");
    }
    printf("\n\n");
#endif
}

template<>
void intra_pred<8, 20>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
vec_u8_t mask1={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, };
vec_u8_t mask2={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask3={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
vec_u8_t mask4={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask5={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
        
    vec_u8_t srv_left = vec_xl(16, srcPix0); /* left-neighbour samples, to be projected onto the main reference */
    vec_u8_t srv_right = vec_xl(0, srcPix0); /* top-left corner + above samples */
    vec_u8_t refmask_8={0x8, 0x6, 0x5, 0x3, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);


vec_u8_t vfrac8_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac8_1 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac8_2 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac8_3 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24};

vec_u8_t vfrac8_32_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac8_32_1 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac8_32_2 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac8_32_3 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 8, 8, 8, 8, 8, 8, 8, 8};

one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
    for (int y = 0; y < 8; y++)
    {
        for (int x = 0; x < 8; x++)
        {
            printf("%d ", dst[y * dstStride + x]);
        }
        printf("\n");
    }
    printf("\n\n");
#endif
}

template<>
void intra_pred<16, 20>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask1={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
//vec_u8_t mask2={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask3={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask4={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask5={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask6={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask7={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask8={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask9={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask10={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask11={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t maskadd1_0={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
/*vec_u8_t maskadd1_1={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t maskadd1_2={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t maskadd1_3={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t maskadd1_4={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t maskadd1_5={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t maskadd1_6={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t maskadd1_7={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_8={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_9={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_10={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_11={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_12={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_14={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_15={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };*/

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left = vec_xl(32, srcPix0); /* left-neighbour samples, to be projected onto the main reference */
    vec_u8_t srv_right = vec_xl(0, srcPix0); /* top-left corner + above samples */
    vec_u8_t refmask_16={0xf, 0xe, 0xc, 0xb, 0x9, 0x8, 0x6, 0x5, 0x3, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(6, srcPix0);   
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = srv1;
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7; 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = srv10;
    vec_u8_t srv12 = vec_perm(s0, s1, mask12);
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = srv13; 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0;
    vec_u8_t srv2_add1 = srv0;
    vec_u8_t srv3_add1 = srv1;
    vec_u8_t srv4_add1 = srv3; 
    vec_u8_t srv5_add1 = srv3; 
    vec_u8_t srv6_add1 = srv4;
    vec_u8_t srv7_add1 = srv6; 
    vec_u8_t srv8_add1 = srv6;
    vec_u8_t srv9_add1 = srv7;
    vec_u8_t srv10_add1 = srv9;
    vec_u8_t srv11_add1 = srv9;
    vec_u8_t srv12_add1= srv10; 
    vec_u8_t srv13_add1 = srv12;
    vec_u8_t srv14_add1 = srv12; 
    vec_u8_t srv15_add1 = srv13;
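    /* Note: rows whose integer reference offset coincides reuse earlier permute
       results, so each srvN_add1 (the ref[off + x + 1] operand) is simply the
       srv vector of the nearest row whose offset is one larger; no additional
       vec_perm is needed for the +1 taps. */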
vec_u8_t vfrac16_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_6 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_10 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_14 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

vec_u8_t vfrac16_32_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_6 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_10 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_14 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
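
/* For reference: a scalar sketch of what the one_line() expansions in these
   specializations compute, following the dst[] formula quoted in the comments.
   f[y], f32[y] == 32 - f[y] and off[y] are hypothetical per-row tables standing
   in for the vfrac constants and the mask/srv permutes; the block is compiled
   out and is not part of the build. */
#if 0
for (int y = 0; y < 16; y++)
    for (int x = 0; x < 16; x++)
        dst[y * dstStride + x] =
            (pixel)((f32[y] * ref[off[y] + x] + f[y] * ref[off[y] + x + 1] + 16) >> 5);
#endif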

template<>
void intra_pred<32, 20>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask1={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask2={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask3={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask4={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask5={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask6={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask7={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
//vec_u8_t mask8={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
vec_u8_t mask9={0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, };
vec_u8_t mask10={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
//vec_u8_t mask11={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
vec_u8_t mask12={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask13={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
//vec_u8_t mask14={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask15={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };

vec_u8_t mask16={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
//vec_u8_t mask17={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask18={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask19={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask20={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask21={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask22={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask23={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask24={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask25={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask26={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask27={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask28={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask29={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t refmask_32_0 = {0x1e, 0x1d, 0x1b, 0x1a, 0x18, 0x17, 0x15, 0x14, 0x12, 0x11, 0xf, 0xe, 0xc, 0xb, 0x9, 0x8, };
    vec_u8_t refmask_32_1 = {0x6, 0x5, 0x3, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};

    vec_u8_t srv_left0=vec_xl(64, srcPix0); 
    vec_u8_t srv_left1=vec_xl(80, srcPix0);
    vec_u8_t srv_right=vec_xl(0, srcPix0);
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(12, srcPix0);  
    vec_u8_t s3 = vec_xl(16+12, srcPix0);       
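
    /* s0/s1 gather the projected left-neighbour samples: the descending indices
       in refmask_32_0/refmask_32_1 reverse them ahead of the top row, building
       the extended reference array that this negative-angle mode reads from. */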

    vec_u8_t srv0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv1 = vec_perm(s1, s2, mask1);
    vec_u8_t srv2 = srv1;
    vec_u8_t srv3 = vec_perm(s1, s2, mask3);
    vec_u8_t srv4 = vec_perm(s1, s2, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = s1; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7;
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = srv10;
    vec_u8_t srv12= vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = srv13; 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);

    vec_u8_t srv16_0 = vec_perm(s2, s3, mask0); 
    vec_u8_t srv16_1 = vec_perm(s2, s3, mask1);
    vec_u8_t srv16_2 = srv16_1;
    vec_u8_t srv16_3 = vec_perm(s2, s3, mask3);
    vec_u8_t srv16_4 = vec_perm(s2, s3, mask4); 
    vec_u8_t srv16_5 = srv16_4;
    vec_u8_t srv16_6 = s2; 
    vec_u8_t srv16_7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv16_8 = srv16_7; 
    vec_u8_t srv16_9 = vec_perm(s1, s2, mask9);
    vec_u8_t srv16_10 = vec_perm(s1, s2, mask10);
    vec_u8_t srv16_11 = srv16_10;
    vec_u8_t srv16_12= vec_perm(s1, s2, mask12); 
    vec_u8_t srv16_13 = vec_perm(s1, s2, mask13);
    vec_u8_t srv16_14 = srv16_13; 
    vec_u8_t srv16_15 = vec_perm(s1, s2, mask15);

    //mask index per row y=0..31; parenthesized entries note the vec_perm sources ((s1,s2) up to row 6, which uses s1 itself, then (s0,s1)): 0(1,2),1,1,3,4,4,6(1),7(0,1),7,9,10,10,12,13,13,15,16,16,18,19,19,21,22,22,24,25,25,27,28,28,30,30

    vec_u8_t  srv16 = vec_perm(s0, s1, mask16);  
    vec_u8_t  srv17 = srv16;
    vec_u8_t  srv18 = vec_perm(s0, s1, mask18);
    vec_u8_t  srv19 = vec_perm(s0, s1, mask19);
    vec_u8_t  srv20 = srv19;
    vec_u8_t  srv21 = vec_perm(s0, s1, mask21);
    vec_u8_t  srv22 = vec_perm(s0, s1, mask22);
    vec_u8_t  srv23 = srv22;
    vec_u8_t  srv24 = vec_perm(s0, s1, mask24);
    vec_u8_t  srv25 = vec_perm(s0, s1, mask25);
    vec_u8_t  srv26 = srv25;
    vec_u8_t  srv27 = vec_perm(s0, s1, mask27);
    vec_u8_t  srv28 = vec_perm(s0, s1, mask28);
    vec_u8_t  srv29 = srv28;
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = vec_perm(s1, s2, mask16);  
    vec_u8_t  srv16_17 = srv16_16;
    vec_u8_t  srv16_18 = vec_perm(s1, s2, mask18);
    vec_u8_t  srv16_19 = vec_perm(s1, s2, mask19);
    vec_u8_t  srv16_20 = srv16_19;
    vec_u8_t  srv16_21 = vec_perm(s1, s2, mask21);
    vec_u8_t  srv16_22 = vec_perm(s1, s2, mask22);
    vec_u8_t  srv16_23 = srv16_22;
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask24);
    vec_u8_t  srv16_25 = vec_perm(s1, s2, mask25);
    vec_u8_t  srv16_26 = srv16_25;
    vec_u8_t  srv16_27 = vec_perm(s1, s2, mask27);
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask28);
    vec_u8_t  srv16_29 = srv16_28;
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;

    vec_u8_t srv0add1 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv1add1 = srv0;
    vec_u8_t srv2add1 = srv0;
    vec_u8_t srv3add1 = srv1;
    vec_u8_t srv4add1 = srv3; 
    vec_u8_t srv5add1 = srv3; 
    vec_u8_t srv6add1 = srv4;
    vec_u8_t srv7add1 = s1; 
    vec_u8_t srv8add1 = s1;
    vec_u8_t srv9add1 = srv7;
    vec_u8_t srv10add1 = srv9;
    vec_u8_t srv11add1 = srv9;
    vec_u8_t srv12add1= srv10; 
    vec_u8_t srv13add1 = srv12;
    vec_u8_t srv14add1 = srv12; 
    vec_u8_t srv15add1 = srv13;

    vec_u8_t srv16add1_0 = vec_perm(s2, s3, maskadd1_0);
    vec_u8_t srv16add1_1 = srv16_0;
    vec_u8_t srv16add1_2 = srv16_0;
    vec_u8_t srv16add1_3 = srv16_1;
    vec_u8_t srv16add1_4 = srv16_3; 
    vec_u8_t srv16add1_5 = srv16_3;
    vec_u8_t srv16add1_6 = srv16_4; 
    vec_u8_t srv16add1_7 = s2;
    vec_u8_t srv16add1_8 = s2; 
    vec_u8_t srv16add1_9 = srv16_7;
    vec_u8_t srv16add1_10 = srv16_9;
    vec_u8_t srv16add1_11 = srv16_9;
    vec_u8_t srv16add1_12= srv16_10; 
    vec_u8_t srv16add1_13 = srv16_12;
    vec_u8_t srv16add1_14 = srv16_12; 
    vec_u8_t srv16add1_15 = srv16_13;

    //srvNadd1 source per row (rows 1-31, shorthand): 0,0,1,3,3,4,6(0),6,7,9,9,10,12,12,13,15,15,16,18,18,19,21,21,22,24,24,25,27,27,28,28

    vec_u8_t  srv16add1 = srv15;  
    vec_u8_t  srv17add1 = srv15;
    vec_u8_t  srv18add1 = srv16;
    vec_u8_t  srv19add1 = srv18;
    vec_u8_t  srv20add1 = srv18;
    vec_u8_t  srv21add1 = srv19;
    vec_u8_t  srv22add1 = srv21;
    vec_u8_t  srv23add1 = srv21;
    vec_u8_t  srv24add1 = srv22;
    vec_u8_t  srv25add1 = srv24;
    vec_u8_t  srv26add1 = srv24;
    vec_u8_t  srv27add1 = srv25;
    vec_u8_t  srv28add1 = srv27;
    vec_u8_t  srv29add1 = srv27;
    vec_u8_t  srv30add1 = srv28;
    vec_u8_t  srv31add1 = srv28;

    vec_u8_t  srv16add1_16 = srv16_15;   
    vec_u8_t  srv16add1_17 = srv16_15;
    vec_u8_t  srv16add1_18 = srv16_16;
    vec_u8_t  srv16add1_19 = srv16_18;
    vec_u8_t  srv16add1_20 = srv16_18;
    vec_u8_t  srv16add1_21 = srv16_19;
    vec_u8_t  srv16add1_22 = srv16_21;
    vec_u8_t  srv16add1_23 = srv16_21;
    vec_u8_t  srv16add1_24 = srv16_22;
    vec_u8_t  srv16add1_25 = srv16_24;
    vec_u8_t  srv16add1_26 = srv16_24;
    vec_u8_t  srv16add1_27 = srv16_25;
    vec_u8_t  srv16add1_28 = srv16_27;
    vec_u8_t  srv16add1_29 = srv16_27;
    vec_u8_t  srv16add1_30 = srv16_28;
    vec_u8_t  srv16add1_31 = srv16_28;

vec_u8_t vfrac16_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_6 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_10 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_14 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_17 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_18 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_20 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_21 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_22 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_24 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_25 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_26 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_28 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_29 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_30 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_6 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_10 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_14 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_17 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_18 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_20 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_21 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_22 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_24 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_25 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_26 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_28 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_29 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_30 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv16_16, srv16add1_16,  vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             
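
    /* Rows 16-31 reuse the vout_* registers from the first pass; each row of
       the 32-wide block is written as two 16-byte stores, columns 0-15 at
       y * dstStride and columns 16-31 at y * dstStride + 16. */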


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 21>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t mask0={0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, };
vec_u8_t mask1={0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, };



    //mode 19:
    //int offset[32] = {-1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26};
    //int fraction[32] = {6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0};
    //mode=19 width=32 nbProjected=25
    //(invAngleSum >> 8) per projected sample: 1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23, 25, 26, 27, 28, 30, 31

    //mode19 invAS[32]= {1, 2, 4, };
    //vec_u8_t mask_left={0x1, 0x02, 0x04, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,0x0, 0x0};
    vec_u8_t srv_left=vec_xl(8, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    //vec_u8_t srv_left=vec_perm(srv_left, srv_left, mask_left); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t srv_right=vec_xl(0, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t refmask_4={0x4, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
vec_u8_t vfrac4 = (vec_u8_t){15, 15, 15, 15, 30, 30, 30, 30, 13, 13, 13, 13, 28, 28, 28, 28};
vec_u8_t vfrac4_32 = (vec_u8_t){17, 17, 17, 17, 2, 2, 2, 2, 19, 19, 19, 19, 4, 4, 4, 4};
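
    /* Each group of four lanes holds one row's weights: vfrac4 is the fraction
       f and vfrac4_32 is 32 - f (17 = 32 - 15, 2 = 32 - 30, ...), the two
       factors of the weighted average computed below. */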

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
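
    /* vec_mule/vec_mulo widen the u8 x u8 products to u16 in even/odd lanes so
       the +16 rounding and >> 5 can be done without overflow (the sum is at
       most 32 * 255 + 16); vec_mergeh/vec_mergel plus vec_pack then restore the
       original byte order.  One lane of row 0 in scalar form:
       (17 * ref[o] + 15 * ref[o + 1] + 16) >> 5. */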

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }
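
    /* Three store paths: one contiguous 16-byte store when dstStride == 4,
       single 32-bit element stores (vec_ste) per row when the stride is a
       multiple of 16, and otherwise a read-modify-write via vec_perm so the
       bytes beyond each 4-pixel row are preserved. */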

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 21>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask1={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
vec_u8_t mask2={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask3={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask4={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask5={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
        
    vec_u8_t srv_left=vec_xl(16, srcPix0);
    vec_u8_t srv_right=vec_xl(0, srcPix0);
    vec_u8_t refmask_8={0x8, 0x6, 0x4, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);


vec_u8_t vfrac8_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac8_1 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac8_2 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac8_3 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac8_32_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac8_32_1 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac8_32_2 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac8_32_3 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 8, 8, 8, 8, 8, 8, 8, 8};

one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);
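
    /* Each 16-byte vector carries two 8-pixel rows, so four one_line() calls
       cover all eight: vfrac8_0 holds the fractions for rows 0 and 1 (15 and
       30) in its low and high halves, and so on for the remaining pairs. */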

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 21>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t mask0={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask1={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask2={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask3={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
//vec_u8_t mask4={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask5={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask6={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask7={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask8={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask9={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask10={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask11={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
/*vec_u8_t maskadd1_1={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t maskadd1_2={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t maskadd1_3={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t maskadd1_4={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t maskadd1_5={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_6={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_7={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_8={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_9={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_10={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_11={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_12={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_14={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_15={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };*/
    
    vec_u8_t srv_left=vec_xl(32, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t srv_right=vec_xl(0, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t refmask_16={0xf, 0xd, 0xb, 0x9, 0x8, 0x6, 0x4, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(8, srcPix0);   
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = srv1;
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = srv3; 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = srv5; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7; 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12= srv11; 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = srv13; 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0;
    vec_u8_t srv2_add1 = srv0;
    vec_u8_t srv3_add1 = srv1;
    vec_u8_t srv4_add1 = srv1; 
    vec_u8_t srv5_add1 = srv3; 
    vec_u8_t srv6_add1 = srv3;
    vec_u8_t srv7_add1 = srv5; 
    vec_u8_t srv8_add1 = srv5;
    vec_u8_t srv9_add1 = srv7;
    vec_u8_t srv10_add1 = srv7;
    vec_u8_t srv11_add1 = srv9;
    vec_u8_t srv12_add1= srv9; 
    vec_u8_t srv13_add1 = srv11;
    vec_u8_t srv14_add1 = srv11; 
    vec_u8_t srv15_add1 = srv13;
        
vec_u8_t vfrac16_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_1 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_4 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_5 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_6 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_9 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_10 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_12 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_13 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_14 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

vec_u8_t vfrac16_32_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_1 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_4 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_5 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_6 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_9 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_10 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_12 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_13 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_14 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 21>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
//vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask1={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
//vec_u8_t mask2={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
vec_u8_t mask3={0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, };
//vec_u8_t mask4={0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, };
vec_u8_t mask5={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
//vec_u8_t mask6={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
vec_u8_t mask7={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
//vec_u8_t mask8={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask9={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
//vec_u8_t mask10={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask11={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
//vec_u8_t mask12={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask13={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
//vec_u8_t mask14={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask15={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };

vec_u8_t mask16={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask17={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask18={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
//vec_u8_t mask19={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask20={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask21={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask22={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask23={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask24={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask25={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask26={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask27={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask28={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask29={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    //vec_u8_t srv_left0=vec_xl(64, srcPix0); 
    //vec_u8_t srv_left1=vec_xl(80, srcPix0);
    //vec_u8_t srv_right=vec_xl(0, srcPix0);
    //vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0);       
    //vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1);       
    //vec_u8_t s2 = vec_xl(12, srcPix0);        
    //vec_u8_t s3 = vec_xl(16+12, srcPix0);     

    vec_u8_t srv_left0=vec_xl(64, srcPix0); 
    vec_u8_t srv_left1=vec_xl(80, srcPix0); 
    vec_u8_t refmask_32 = {0x1e, 0x1c, 0x1a, 0x18, 0x17, 0x15, 0x13, 0x11, 0xf, 0xd, 0xb, 0x9, 0x8, 0x6, 0x4, 0x2};
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32);   
    vec_u8_t s1 = vec_xl(0, srcPix0);
    vec_u8_t s2 = vec_xl(16, srcPix0);  
    vec_u8_t s3 = vec_xl(32, srcPix0);  
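
    /* Mode 21 needs fewer projected left-neighbour samples than mode 20:
       refmask_32 gathers 16 of them (note the descending indices) into s0,
       while s1-s3 are plain loads of the top reference row. */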
    

    vec_u8_t srv0 = s1; 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = srv1;
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = srv3; 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = srv5; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7;
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12= srv11; 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = srv13; 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);

    vec_u8_t srv16_0 = s2; 
    vec_u8_t srv16_1 = vec_perm(s1, s2, mask1);
    vec_u8_t srv16_2 = srv16_1;
    vec_u8_t srv16_3 = vec_perm(s1, s2, mask3);
    vec_u8_t srv16_4 = srv16_3; 
    vec_u8_t srv16_5 = vec_perm(s1, s2, mask5);
    vec_u8_t srv16_6 = srv16_5; 
    vec_u8_t srv16_7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv16_8 = srv16_7; 
    vec_u8_t srv16_9 = vec_perm(s1, s2, mask9);
    vec_u8_t srv16_10 = srv16_9;
    vec_u8_t srv16_11 = vec_perm(s1, s2, mask11);
    vec_u8_t srv16_12= srv16_11; 
    vec_u8_t srv16_13 = vec_perm(s1, s2, mask13);
    vec_u8_t srv16_14 = srv16_13; 
    vec_u8_t srv16_15 = vec_perm(s1, s2, mask15);

    //srv source per row y=0..31 (shorthand): s1, 1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,16,16,18,18,20,20,22,22,24,24,26,26,28,28,s0,s0

    vec_u8_t  srv16 = vec_perm(s0, s1, mask16);  
    vec_u8_t  srv17 = srv16;
    vec_u8_t  srv18 = vec_perm(s0, s1, mask18);
    vec_u8_t  srv19 = srv18;
    vec_u8_t  srv20 = vec_perm(s0, s1, mask20);
    vec_u8_t  srv21 = srv20;
    vec_u8_t  srv22 = vec_perm(s0, s1, mask22);
    vec_u8_t  srv23 = srv22;
    vec_u8_t  srv24 = vec_perm(s0, s1, mask24);
    vec_u8_t  srv25 = srv24;
    vec_u8_t  srv26 = vec_perm(s0, s1, mask26);
    vec_u8_t  srv27 = srv26;
    vec_u8_t  srv28 = vec_perm(s0, s1, mask28);
    vec_u8_t  srv29 = srv28;
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = vec_perm(s1, s2, mask16);  
    vec_u8_t  srv16_17 = srv16_16;
    vec_u8_t  srv16_18 = vec_perm(s1, s2, mask18);
    vec_u8_t  srv16_19 = srv16_18;
    vec_u8_t  srv16_20 = vec_perm(s1, s2, mask20);
    vec_u8_t  srv16_21 = srv16_20;
    vec_u8_t  srv16_22 = vec_perm(s1, s2, mask22);
    vec_u8_t  srv16_23 = srv16_22;
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask24);
    vec_u8_t  srv16_25 = srv16_24;
    vec_u8_t  srv16_26 = vec_perm(s1, s2, mask26);
    vec_u8_t  srv16_27 = srv16_26;
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask28);
    vec_u8_t  srv16_29 = srv16_28;
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;

    vec_u8_t srv0add1 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv1add1 = s1;
    vec_u8_t srv2add1 = s1;
    vec_u8_t srv3add1 = srv1;
    vec_u8_t srv4add1 = srv1; 
    vec_u8_t srv5add1 = srv3; 
    vec_u8_t srv6add1 = srv3;
    vec_u8_t srv7add1 = srv6; 
    vec_u8_t srv8add1 = srv6;
    vec_u8_t srv9add1 = srv7;
    vec_u8_t srv10add1 = srv7;
    vec_u8_t srv11add1 = srv9;
    vec_u8_t srv12add1= srv9; 
    vec_u8_t srv13add1 = srv11;
    vec_u8_t srv14add1 = srv11; 
    vec_u8_t srv15add1 = srv14;

    vec_u8_t srv16add1_0 = vec_perm(s2, s3, maskadd1_0);
    vec_u8_t srv16add1_1 = s2;
    vec_u8_t srv16add1_2 = s2;
    vec_u8_t srv16add1_3 = srv16_1;
    vec_u8_t srv16add1_4 = srv16_1; 
    vec_u8_t srv16add1_5 = srv16_3;
    vec_u8_t srv16add1_6 = srv16_3; 
    vec_u8_t srv16add1_7 = srv16_6;
    vec_u8_t srv16add1_8 = srv16_6; 
    vec_u8_t srv16add1_9 = srv16_7;
    vec_u8_t srv16add1_10 = srv16_7;
    vec_u8_t srv16add1_11 = srv16_9;
    vec_u8_t srv16add1_12= srv16_9; 
    vec_u8_t srv16add1_13 = srv16_11;
    vec_u8_t srv16add1_14 = srv16_11; 
    vec_u8_t srv16add1_15 = srv16_14;

    //srvNadd1 source mapping (rows 1..31; row 0 uses maskadd1_0): s1,s1, 1,1,3,3,6,6,7,7,9,9,11,11,14,15,15,16,16,18,18,20,20,22,22,24,24,26,26,28,28

    vec_u8_t  srv16add1 = srv15;  
    vec_u8_t  srv17add1 = srv15;
    vec_u8_t  srv18add1 = srv16;
    vec_u8_t  srv19add1 = srv16;
    vec_u8_t  srv20add1 = srv18;
    vec_u8_t  srv21add1 = srv18;
    vec_u8_t  srv22add1 = srv20;
    vec_u8_t  srv23add1 = srv20;
    vec_u8_t  srv24add1 = srv22;
    vec_u8_t  srv25add1 = srv22;
    vec_u8_t  srv26add1 = srv24;
    vec_u8_t  srv27add1 = srv24;
    vec_u8_t  srv28add1 = srv26;
    vec_u8_t  srv29add1 = srv26;
    vec_u8_t  srv30add1 = srv28;
    vec_u8_t  srv31add1 = srv28;

    vec_u8_t  srv16add1_16 = srv16_15;   
    vec_u8_t  srv16add1_17 = srv16_15;
    vec_u8_t  srv16add1_18 = srv16_16;
    vec_u8_t  srv16add1_19 = srv16_16;
    vec_u8_t  srv16add1_20 = srv16_18;
    vec_u8_t  srv16add1_21 = srv16_18;
    vec_u8_t  srv16add1_22 = srv16_20;
    vec_u8_t  srv16add1_23 = srv16_20;
    vec_u8_t  srv16add1_24 = srv16_22;
    vec_u8_t  srv16add1_25 = srv16_22;
    vec_u8_t  srv16add1_26 = srv16_24;
    vec_u8_t  srv16add1_27 = srv16_24;
    vec_u8_t  srv16add1_28 = srv16_26;
    vec_u8_t  srv16add1_29 = srv16_26;
    vec_u8_t  srv16add1_30 = srv16_28;
    vec_u8_t  srv16add1_31 = srv16_28;

vec_u8_t vfrac16_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_1 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_4 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_5 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_6 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_9 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_10 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_12 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_13 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_14 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_17 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_18 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_19 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_20 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_21 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_22 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_24 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_25 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_26 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_27 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_28 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_29 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_30 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_1 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_4 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_5 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_6 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_9 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_10 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_12 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_13 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_14 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_17 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_18 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_19 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_20 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_21 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_22 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_24 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_25 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_26 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_27 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_28 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_29 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_30 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
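
/* The per-row weights above follow frac[y] = ((y + 1) * angle) & 31 and
   32 - frac[y]; the values are consistent with angle = -17 (row 31 lands
   exactly on a whole sample: frac 0, complementary weight 32). */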


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
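    /* one_line(lo, hi, w32, w, out) -- the helper macro invoked below -- packs the
       same per-lane computation: widen with vec_mule/vec_mulo, form
       (w32 * lo + w * hi + 16) >> 5 in 16-bit lanes, then re-pack to bytes.
       The expanded sequence can be seen written out in the 4-wide kernels below. */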
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv16_16, srv16add1_16,  vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
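
/* A minimal scalar sketch of the interpolation these kernels vectorize,
 * assuming the accumulated-angle form of the formula quoted above; the name,
 * signature, and loop structure here are illustrative only (the real kernels
 * precompute per-row perm masks and fraction vectors instead of looping).
 * Excluded from the build.
 */
#if 0
static void intraPredAngScalarSketch(pixel* dst, intptr_t dstStride,
                                     const pixel* ref, int width, int angle)
{
    int angleSum = 0;
    for (int y = 0; y < width; y++)
    {
        angleSum += angle;
        int offset   = angleSum >> 5;   /* whole-sample step along the main reference */
        int fraction = angleSum & 31;   /* 5-bit sub-sample position */
        for (int x = 0; x < width; x++)
            dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x]
                                            + fraction * ref[offset + x + 1] + 16) >> 5);
    }
}
#endif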


template<>
void intra_pred<4, 22>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t mask0={0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, };
vec_u8_t mask1={0x2, 0x3, 0x4, 0x5, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, };



    //mode 22, width 4 (angle -13, invAngle 630):
    //per-row offset[4] = {-1, -1, -2, -2}; fraction[4] = {19, 6, 25, 12}
    //one left sample is projected onto the main reference: ref[-1] = side[(630 + 128) >> 8] = side[2]

    vec_u8_t srv_left=vec_xl(8, srcPix0); /* left reference column, side[k] = srcPix0[8 + k] */
    vec_u8_t srv_right=vec_xl(0, srcPix0); /* main (top) reference row, starting at the corner */
    vec_u8_t refmask_4={0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4); /* ref[-1], corner, top row... */
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* per-row reference windows: row y starts at ref[offset[y]] */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
vec_u8_t vfrac4 = (vec_u8_t){19, 19, 19, 19, 6, 6, 6, 6, 25, 25, 25, 25, 12, 12, 12, 12};
vec_u8_t vfrac4_32 = (vec_u8_t){13, 13, 13, 13, 26, 26, 26, 26, 7, 7, 7, 7, 20, 20, 20, 20};
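
/* frac[y] = ((y + 1) * -13) & 31 = {19, 6, 25, 12} for mode 22; vfrac4 packs all
   four rows (4 pixels each) into one vector, and vfrac4_32 holds the complementary
   weights 32 - frac[y] = {13, 26, 7, 20}. */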

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
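    /* vec_mule/vec_mulo produce the even/odd 16-bit products; after the rounding
       add and the shift by 5, vec_mergeh/vec_mergel re-interleave the even and odd
       lanes so that vec_pack restores the original byte order. */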

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 22>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask1={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
vec_u8_t mask2={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask3={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask4={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask5={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
        
    vec_u8_t srv_left=vec_xl(16, srcPix0); /* left reference column, side[k] = srcPix0[16 + k] */
    vec_u8_t srv_right=vec_xl(0, srcPix0); /* main (top) reference row, starting at the corner */
    vec_u8_t refmask_8={0x7, 0x5, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);
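    /* The perm above prepends the projected left samples: for this negative angle,
       ref[-k] = side[(k * invAngle + 128) >> 8] with invAngle = 630, which selects
       side[2], side[5], side[7] for k = 1..3, so srv reads
       ref[-3], ref[-2], ref[-1], corner, top row... */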
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);


vec_u8_t vfrac8_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac8_1 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac8_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac8_3 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac8_32_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac8_32_1 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac8_32_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac8_32_3 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 8, 8, 8, 8, 8, 8, 8, 8};

one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 22>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
//vec_u8_t mask1={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask2={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask3={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask4={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask5={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask6={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask7={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask8={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask9={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask10={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask11={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask12={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask14={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
/*vec_u8_t maskadd1_1={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t maskadd1_2={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_3={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_4={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_5={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_6={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_7={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_8={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_9={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_10={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_11={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_15={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };*/

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(32, srcPix0); /* left reference column, side[k] = srcPix0[32 + k] */
    vec_u8_t srv_right=vec_xl(0, srcPix0); /* main (top) reference row, starting at the corner */
    vec_u8_t refmask_16={0xf, 0xc, 0xa, 0x7, 0x5, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};

    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(10, srcPix0);  
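    /* Same projection as the 8-wide kernel, now for k = 1..6: the descending mask
       indices {15, 12, 10, 7, 5, 2} are (k * 630 + 128) >> 8, so s0 holds
       ref[-6..-1] followed by the corner and ref[1..9]; s1 continues the top row
       from srcPix0[10]. */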
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = srv2;
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = srv4; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7; 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = srv9;
    vec_u8_t srv12= vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = srv12;
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0_add1;
    vec_u8_t srv2_add1 = srv0;
    vec_u8_t srv3_add1 = srv0;
    vec_u8_t srv4_add1 = srv2; 
    vec_u8_t srv5_add1 = srv2; 
    vec_u8_t srv6_add1 = srv2;
    vec_u8_t srv7_add1 = srv4; 
    vec_u8_t srv8_add1 = srv4;
    vec_u8_t srv9_add1 = srv7;
    vec_u8_t srv10_add1 = srv7;
    vec_u8_t srv11_add1 = srv7;
    vec_u8_t srv12_add1= srv9; 
    vec_u8_t srv13_add1 = srv9;
    vec_u8_t srv14_add1 = srv12; 
    vec_u8_t srv15_add1 = srv12;
vec_u8_t vfrac16_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_5 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_6 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_9 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_10 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_13 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_14 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

vec_u8_t vfrac16_32_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_5 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_6 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_9 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_10 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_13 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_14 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
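
/* Row fractions as in the 4-wide kernel: frac[y] = ((y + 1) * -13) & 31 gives
   19, 6, 25, 12, 31, 18, 5, 24, 11, 30, 17, 4, 23, 10, 29, 16 for rows 0..15,
   with vfrac16_32_y = 32 - frac[y]. */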
    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 22>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
//vec_u8_t mask1={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask2={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
//vec_u8_t mask3={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask4={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
//vec_u8_t mask5={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
//vec_u8_t mask6={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask7={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
//vec_u8_t mask8={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask9={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
//vec_u8_t mask10={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
//vec_u8_t mask11={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask12={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask13={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask14={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
//vec_u8_t mask15={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };

//vec_u8_t mask16={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask17={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask18={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask19={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask20={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask21={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask22={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask23={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask24={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask25={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask26={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask27={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask28={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask29={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    //vec_u8_t srv_left0=vec_xl(64, srcPix0); 
    //vec_u8_t srv_left1=vec_xl(80, srcPix0);
    //vec_u8_t srv_right=vec_xl(0, srcPix0);
    //vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0);       
    //vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1);       
    //vec_u8_t s2 = vec_xl(12, srcPix0);        
    //vec_u8_t s3 = vec_xl(16+12, srcPix0);     

    vec_u8_t srv_left0 = vec_xl(64, srcPix0); 
    vec_u8_t srv_left1 = vec_xl(80, srcPix0); 
    vec_u8_t srv_right = vec_xl(0, srcPix0);
    vec_u8_t refmask_32_0 ={0x1e, 0x1b, 0x19, 0x16, 0x14, 0x11, 0xf, 0xc, 0xa, 0x7, 0x5, 0x2, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t refmask_32_1 ={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );
    vec_u8_t s1 = vec_xl(4, srcPix0);
    vec_u8_t s2 = vec_xl(20, srcPix0);  
    //vec_u8_t s3 = vec_xl(36, srcPix0);        
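    /* The double perm above assembles ref[-12..3] in s0: refmask_32_0 gathers the
       twelve projected left samples (k = 1..12 -> side[2], side[5], ..., side[30]),
       and refmask_32_1 appends the corner and ref[1..3] from the top row; s1 and
       s2 continue that row as ref[4..19] and ref[20..35]. */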
 
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = srv2;
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = srv4; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7;
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = srv9;
    vec_u8_t srv12= vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = srv12;
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;

    vec_u8_t srv16_0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv16_1 = srv16_0;
    vec_u8_t srv16_2 = vec_perm(s1, s2, mask2);
    vec_u8_t srv16_3 = srv16_2;
    vec_u8_t srv16_4 = vec_perm(s1, s2, mask4); 
    vec_u8_t srv16_5 = srv16_4;
    vec_u8_t srv16_6 = srv16_4; 
    vec_u8_t srv16_7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv16_8 = srv16_7; 
    vec_u8_t srv16_9 = vec_perm(s1, s2, mask9);
    vec_u8_t srv16_10 = srv16_9;
    vec_u8_t srv16_11 = srv16_9;
    vec_u8_t srv16_12=  vec_perm(s1, s2, mask12); 
    vec_u8_t srv16_13 = srv16_12;
    vec_u8_t srv16_14 = vec_perm(s1, s2, mask14); 
    vec_u8_t srv16_15 = srv16_14;
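
    /* Each 32-pixel output row is two 16-byte halves: srvN covers columns 0..15
       (a window into s0/s1) and srv16_N covers columns 16..31, the same window
       shifted one source vector later (into s1/s2). */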

    //0(0,1),0,2,2,4,4,4,7,7,9,9,9,12,12,14,14,14,17,17,19,19,19,22,22,24,24,24,27,27,s0,s0,s0

    vec_u8_t  srv16 = srv14;  
    vec_u8_t  srv17 = vec_perm(s0, s1, mask17);
    vec_u8_t  srv18 = srv17;
    vec_u8_t  srv19 = vec_perm(s0, s1, mask19);
    vec_u8_t  srv20 = srv19;
    vec_u8_t  srv21 = srv19;
    vec_u8_t  srv22 = vec_perm(s0, s1, mask22);
    vec_u8_t  srv23 = srv22;
    vec_u8_t  srv24 = vec_perm(s0, s1, mask24);
    vec_u8_t  srv25 = srv24;
    vec_u8_t  srv26 = srv24;
    vec_u8_t  srv27 = vec_perm(s0, s1, mask27);
    vec_u8_t  srv28 = srv27;
    vec_u8_t  srv29 = s0;
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = srv16_14;  
    vec_u8_t  srv16_17 = vec_perm(s1, s2, mask17);
    vec_u8_t  srv16_18 = srv16_17;
    vec_u8_t  srv16_19 = vec_perm(s1, s2, mask19);
    vec_u8_t  srv16_20 = srv16_19;
    vec_u8_t  srv16_21 = srv16_19;
    vec_u8_t  srv16_22 = vec_perm(s1, s2, mask22);
    vec_u8_t  srv16_23 = srv16_22;
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask24);
    vec_u8_t  srv16_25 = srv16_24;
    vec_u8_t  srv16_26 = srv16_24;
    vec_u8_t  srv16_27 = vec_perm(s1, s2, mask27);
    vec_u8_t  srv16_28 = srv16_27;
    vec_u8_t  srv16_29 = s1;
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;

    vec_u8_t srv0add1 = vec_perm(s0, s1, maskadd1_0);
    vec_u8_t srv1add1 = srv0add1;
    vec_u8_t srv2add1 = srv0;
    vec_u8_t srv3add1 = srv0;
    vec_u8_t srv4add1 = srv2; 
    vec_u8_t srv5add1 = srv2; 
    vec_u8_t srv6add1 = srv2;
    vec_u8_t srv7add1 = srv4; 
    vec_u8_t srv8add1 = srv4;
    vec_u8_t srv9add1 = srv7;
    vec_u8_t srv10add1 = srv7;
    vec_u8_t srv11add1 = srv7;
    vec_u8_t srv12add1= srv9; 
    vec_u8_t srv13add1 = srv9;
    vec_u8_t srv14add1 = srv12; 
    vec_u8_t srv15add1 = srv12;

    vec_u8_t srv16add1_0 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv16add1_1 = srv16add1_0;
    vec_u8_t srv16add1_2 = srv16_0;
    vec_u8_t srv16add1_3 = srv16_0;
    vec_u8_t srv16add1_4 = srv16_2; 
    vec_u8_t srv16add1_5 = srv16_2;
    vec_u8_t srv16add1_6 = srv16_2; 
    vec_u8_t srv16add1_7 = srv16_4;
    vec_u8_t srv16add1_8 = srv16_4; 
    vec_u8_t srv16add1_9 = srv16_7;
    vec_u8_t srv16add1_10 = srv16_7;
    vec_u8_t srv16add1_11 = srv16_7;
    vec_u8_t srv16add1_12= srv16_9; 
    vec_u8_t srv16add1_13 = srv16_9;
    vec_u8_t srv16add1_14 = srv16_12; 
    vec_u8_t srv16add1_15 = srv16_12;

    //srvNadd1 source mapping (rows 2..31; rows 0-1 use maskadd1_0): 0,0,2,2,2,4,4,7,7,7,9,9,12,12,12,14,14,17,17,17,19,19,22,22,22,24,24,27,27,27

    vec_u8_t  srv16add1 = srv12;  
    vec_u8_t  srv17add1 = srv14;
    vec_u8_t  srv18add1 = srv14;
    vec_u8_t  srv19add1 = srv17;
    vec_u8_t  srv20add1 = srv17;
    vec_u8_t  srv21add1 = srv17;
    vec_u8_t  srv22add1 = srv19;
    vec_u8_t  srv23add1 = srv19;
    vec_u8_t  srv24add1 = srv22;
    vec_u8_t  srv25add1 = srv22;
    vec_u8_t  srv26add1 = srv22;
    vec_u8_t  srv27add1 = srv24;
    vec_u8_t  srv28add1 = srv24;
    vec_u8_t  srv29add1 = srv27;
    vec_u8_t  srv30add1 = srv27;
    vec_u8_t  srv31add1 = srv27;

    vec_u8_t  srv16add1_16 = srv16_12;   
    vec_u8_t  srv16add1_17 = srv16_14;
    vec_u8_t  srv16add1_18 = srv16_14;
    vec_u8_t  srv16add1_19 = srv16_17;
    vec_u8_t  srv16add1_20 = srv16_17;
    vec_u8_t  srv16add1_21 = srv16_17;
    vec_u8_t  srv16add1_22 = srv16_19;
    vec_u8_t  srv16add1_23 = srv16_19;
    vec_u8_t  srv16add1_24 = srv16_22;
    vec_u8_t  srv16add1_25 = srv16_22;
    vec_u8_t  srv16add1_26 = srv16_22;
    vec_u8_t  srv16add1_27 = srv16_24;
    vec_u8_t  srv16add1_28 = srv16_24;
    vec_u8_t  srv16add1_29 = srv16_27;
    vec_u8_t  srv16add1_30 = srv16_27;
    vec_u8_t  srv16add1_31 = srv16_27;

vec_u8_t vfrac16_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_5 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_6 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_9 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_10 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_13 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_14 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_17 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_18 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_20 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_21 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_22 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_24 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_25 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_26 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_28 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_29 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_30 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_5 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_6 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_9 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_10 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_13 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_14 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_17 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_18 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_20 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_21 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_22 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_24 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_25 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_26 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_28 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_29 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_30 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv16_16, srv16add1_16,  vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
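
/* All of the intra_pred<width, mode> specializations in this file vectorize
 * the same scalar recurrence.  A minimal reference sketch follows, assuming
 * the usual HEVC angular-prediction definitions and that ref already points
 * at the (possibly projected) reference row; it is kept under #if 0 so it
 * does not affect the build, and intra_pred_ang_ref is an illustrative name,
 * not part of the x265 API. */
#if 0
static void intra_pred_ang_ref(pixel* dst, intptr_t dstStride,
                               const pixel* ref, int width, int intraPredAngle)
{
    for (int y = 0; y < width; y++)
    {
        int angleSum = (y + 1) * intraPredAngle;
        int offset   = angleSum >> 5;  /* whole-pixel step along the reference row */
        int fraction = angleSum & 31;  /* 1/32-pel weight between two neighbours */

        for (int x = 0; x < width; x++)
            dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x]
                                            + fraction * ref[offset + x + 1] + 16) >> 5);
    }
}
#endif
/* The vector code splats (32 - fraction) into the vfrac*_32_* constants and
 * fraction into the vfrac*_* constants, while the mask/vec_perm pairs
 * materialize ref[offset + x] and ref[offset + x + 1] for a whole vector of
 * x positions at once. */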

template<>
void intra_pred<4, 23>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, };
    vec_u8_t mask1={0x2, 0x3, 0x4, 0x5, 0x2, 0x3, 0x4, 0x5, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, };

    /* mode 23 (intraPredAngle = -9), width 4:
     * offset[0-3]   = {-1, -1, -1, -2}   ( ((y + 1) * -9) >> 5 )
     * fraction[0-3] = {23, 14,  5, 28}   ( ((y + 1) * -9) & 31 )
     * One left-column pixel has to be projected onto the reference row;
     * refmask_4 below prepends it (srcPix0[12]) ahead of the corner and
     * top-row pixels. */
    vec_u8_t srv_left=vec_xl(8, srcPix0);  /* left-column pixels */
    vec_u8_t srv_right=vec_xl(0, srcPix0); /* top-left corner and top row */
    vec_u8_t refmask_4={0x4, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};

    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* ref[offset[y] + x], four rows packed per vector */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t vfrac4 = (vec_u8_t){23, 23, 23, 23, 14, 14, 14, 14, 5, 5, 5, 5, 28, 28, 28, 28};
    vec_u8_t vfrac4_32 = (vec_u8_t){9, 9, 9, 9, 18, 18, 18, 18, 27, 27, 27, 27, 4, 4, 4, 4};

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
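
/* Why the mule/mulo + mergeh/mergel + pack sequence above is correct:
 * vec_mule/vec_mulo widen the even/odd byte lanes into 16-bit products, so
 * after the rounding add and shift each half holds every second pixel;
 * vec_mergeh/vec_mergel re-interleave them back into source order before
 * vec_pack narrows to bytes.  A minimal sketch with a hypothetical helper
 * name (kept under #if 0, not part of this file's API):
 */
#if 0
static inline vec_u8_t angular_interp_16pix(vec_u8_t refLo, vec_u8_t refHi,
                                            vec_u8_t f32, vec_u8_t f)
{
    const vec_u16_t c16 = vec_splats((unsigned short)16); /* rounding bias */
    const vec_u16_t c5  = vec_splats((unsigned short)5);  /* >> 5 */

    /* even lanes: (f32 * refLo + f * refHi + 16) >> 5 */
    vec_u16_t e = vec_sra(vec_add(vec_add(vec_mule(refLo, f32),
                                          vec_mule(refHi, f)), c16), c5);
    /* odd lanes, same formula */
    vec_u16_t o = vec_sra(vec_add(vec_add(vec_mulo(refLo, f32),
                                          vec_mulo(refHi, f)), c16), c5);

    /* mergeh/mergel restore the e0,o0,e1,o1,... ordering; pack narrows to u8 */
    return vec_pack(vec_mergeh(e, o), vec_mergel(e, o));
}
#endif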

template<>
void intra_pred<8, 23>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
    vec_u8_t mask1={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
    vec_u8_t mask2={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
    vec_u8_t mask3={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
    vec_u8_t mask4={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
    vec_u8_t mask5={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
    vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
    vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
        
    vec_u8_t srv_left=vec_xl(16, srcPix0); /* left-column pixels */
    vec_u8_t srv_right=vec_xl(0, srcPix0); /* top-left corner and top row */
    /* mode 23, width 8: refmask_8 prepends the two projected left-column
     * pixels (srcPix0[23], srcPix0[20]) ahead of the top row. */
    vec_u8_t refmask_8={0x7, 0x4, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, };

    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);


    vec_u8_t vfrac8_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac8_1 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac8_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac8_3 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac8_32_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac8_32_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac8_32_3 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 8, 8, 8, 8, 8, 8, 8, 8};

    one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
    one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
    one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
    one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
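
/* Note on the width-8 path above: each 16-byte vector packs two 8-pixel
 * rows, which is why every vfrac8_* constant carries two per-row weights
 * (e.g. {23 x8, 14 x8} covers rows 0 and 1) and only four one_line calls
 * produce all eight rows.  The unaligned-store branch merges each 8-byte
 * half into the existing destination line with a read/vec_perm/write
 * sequence instead of a partial store. */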

template<>
void intra_pred<16, 23>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
    vec_u8_t mask3={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
    vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
    vec_u8_t mask10={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
    vec_u8_t mask14={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
    /* masks 1-2, 4-6, 8-9, 11-13 and 15 equal mask0, mask3, mask7, mask10 and
     * mask14 respectively (consecutive rows share the same offset), so only
     * the distinct masks are materialized. */

    vec_u8_t maskadd1_0={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
    /* the remaining "+1" masks equal the base masks one group up
     * (maskadd1_1..2 == maskadd1_0, maskadd1_3..6 == mask0, maskadd1_7..9 ==
     * mask3, maskadd1_10..13 == mask7, maskadd1_14..15 == mask10), matching
     * the srvN_add1 aliasing below. */

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(32, srcPix0); /* left-column pixels */
    vec_u8_t srv_right=vec_xl(0, srcPix0); /* top-left corner and top row */
    /* mode 23, width 16: refmask_16 prepends the four projected left-column
     * pixels (srcPix0[46], srcPix0[43], srcPix0[39], srcPix0[36]) ahead of
     * the top row. */
    vec_u8_t refmask_16={0xe, 0xb, 0x7, 0x4, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(12, srcPix0);  
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = srv0;
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = srv3; 
    vec_u8_t srv5 = srv3;
    vec_u8_t srv6 = srv3; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7; 
    vec_u8_t srv9 = srv7;
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = srv10;
    vec_u8_t srv12= srv10; 
    vec_u8_t srv13 = srv10;
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0_add1;
    vec_u8_t srv2_add1 = srv0_add1;
    vec_u8_t srv3_add1 = srv0;
    vec_u8_t srv4_add1 = srv0; 
    vec_u8_t srv5_add1 = srv0; 
    vec_u8_t srv6_add1 = srv0;
    vec_u8_t srv7_add1 = srv3; 
    vec_u8_t srv8_add1 = srv3;
    vec_u8_t srv9_add1 = srv3;
    vec_u8_t srv10_add1 = srv7;
    vec_u8_t srv11_add1 = srv7;
    vec_u8_t srv12_add1= srv7; 
    vec_u8_t srv13_add1 = srv7;
    vec_u8_t srv14_add1 = srv10; 
    vec_u8_t srv15_add1 = srv10;
    vec_u8_t vfrac16_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
    vec_u8_t vfrac16_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_2 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vfrac16_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_4 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
    vec_u8_t vfrac16_5 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_6 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_8 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
    vec_u8_t vfrac16_9 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_10 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
    vec_u8_t vfrac16_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_12 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
    vec_u8_t vfrac16_13 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_14 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_32_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_2 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
    vec_u8_t vfrac16_32_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_4 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
    vec_u8_t vfrac16_32_5 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
    vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_8 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
    vec_u8_t vfrac16_32_9 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    vec_u8_t vfrac16_32_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
    vec_u8_t vfrac16_32_13 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_14 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
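
/* Note on the srvN aliasing in intra_pred<16, 23> above: the per-row offset
 * ((y + 1) * -9) >> 5 walks {-1,-1,-1,-2,-2,-2,-2,-3,-3,-3,-4,-4,-4,-4,-5,-5}
 * for y = 0..15, so a new vec_perm is needed only where the offset steps
 * (rows 0, 3, 7, 10 and 14); the rows in between reuse the previous result
 * with a different splatted fraction. */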

template<>
void intra_pred<32, 23>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
    vec_u8_t mask3={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
    vec_u8_t mask7={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
    vec_u8_t mask10={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
    vec_u8_t mask14={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
    vec_u8_t mask17={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
    vec_u8_t mask21={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
    vec_u8_t mask24={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
    /* masks 1-2, 4-6, 8-9, 11-13, 15-16, 18-20, 22-23 and 25-27 equal the
     * preceding distinct mask (consecutive rows share the same offset), and
     * masks 28-31 are the identity permutation {0x0..0xf}, so rows 28-31 use
     * s0/s1 directly. */

    vec_u8_t maskadd1_0={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv_left0 = vec_xl(64, srcPix0); /* left-column pixels */
    vec_u8_t srv_left1 = vec_xl(80, srcPix0); 
    vec_u8_t srv_right = vec_xl(0, srcPix0);  /* top-left corner and top row */
    vec_u8_t refmask_32_0 ={0x1c, 0x19, 0x15, 0x12, 0xe, 0xb, 0x7, 0x4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t refmask_32_1 ={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );
    vec_u8_t s1 = vec_xl(8, srcPix0);
    vec_u8_t s2 = vec_xl(24, srcPix0);  
    //vec_u8_t s3 = vec_xl(40, srcPix0);        
 
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = srv0;
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = srv3; 
    vec_u8_t srv5 = srv3;
    vec_u8_t srv6 = srv3; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7;
    vec_u8_t srv9 = srv7;
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = srv10;
    vec_u8_t srv12= srv10; 
    vec_u8_t srv13 = srv10;
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;

    /* row -> mask schedule: rows 0-2 mask0, 3-6 mask3, 7-9 mask7, 10-13 mask10,
     * 14-16 mask14, 17-20 mask17, 21-23 mask21, 24-27 mask24, 28-31 s0/s1 */

    vec_u8_t srv16_0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv16_1 = srv16_0;
    vec_u8_t srv16_2 = srv16_0;
    vec_u8_t srv16_3 = vec_perm(s1, s2, mask3);
    vec_u8_t srv16_4 = srv16_3; 
    vec_u8_t srv16_5 = srv16_3;
    vec_u8_t srv16_6 = srv16_3; 
    vec_u8_t srv16_7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv16_8 = srv16_7; 
    vec_u8_t srv16_9 = srv16_7;
    vec_u8_t srv16_10 = vec_perm(s1, s2, mask10);
    vec_u8_t srv16_11 = srv16_10;
    vec_u8_t srv16_12=  srv16_10; 
    vec_u8_t srv16_13 = srv16_10;
    vec_u8_t srv16_14 = vec_perm(s1, s2, mask14); 
    vec_u8_t srv16_15 = srv16_14;

    vec_u8_t  srv16 = srv14;  
    vec_u8_t  srv17 = vec_perm(s0, s1, mask17);
    vec_u8_t  srv18 = srv17;
    vec_u8_t  srv19 = srv17;
    vec_u8_t  srv20 = srv17;
    vec_u8_t  srv21 = vec_perm(s0, s1, mask21);
    vec_u8_t  srv22 = srv21;
    vec_u8_t  srv23 = srv21;
    vec_u8_t  srv24 = vec_perm(s0, s1, mask24);
    vec_u8_t  srv25 = srv24;
    vec_u8_t  srv26 = srv24;
    vec_u8_t  srv27 = srv24;
    vec_u8_t  srv28 = s0;
    vec_u8_t  srv29 = s0;
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = srv16_14;  
    vec_u8_t  srv16_17 = vec_perm(s1, s2, mask17);
    vec_u8_t  srv16_18 = srv16_17;
    vec_u8_t  srv16_19 = srv16_17;
    vec_u8_t  srv16_20 = srv16_17;
    vec_u8_t  srv16_21 = vec_perm(s1, s2, mask21);
    vec_u8_t  srv16_22 = srv16_21;
    vec_u8_t  srv16_23 = srv16_21;
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask24);
    vec_u8_t  srv16_25 = srv16_24;
    vec_u8_t  srv16_26 = srv16_24;
    vec_u8_t  srv16_27 = srv16_24;
    vec_u8_t  srv16_28 = s1;
    vec_u8_t  srv16_29 = s1;
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;

    vec_u8_t srv0add1 = vec_perm(s0, s1, maskadd1_0);
    vec_u8_t srv1add1 = srv0add1;
    vec_u8_t srv2add1 = srv0add1;
    vec_u8_t srv3add1 = srv0;
    vec_u8_t srv4add1 = srv0; 
    vec_u8_t srv5add1 = srv0; 
    vec_u8_t srv6add1 = srv0;
    vec_u8_t srv7add1 = srv3; 
    vec_u8_t srv8add1 = srv3;
    vec_u8_t srv9add1 = srv3;
    vec_u8_t srv10add1 = srv7;
    vec_u8_t srv11add1 = srv7;
    vec_u8_t srv12add1= srv7; 
    vec_u8_t srv13add1 = srv7;
    vec_u8_t srv14add1 = srv10; 
    vec_u8_t srv15add1 = srv10;
    /* the "+1" (ref[offset + x + 1]) vectors below reuse the base vector of
     * the previous offset group, one group earlier than the srvN schedule */
    vec_u8_t srv16add1_0 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv16add1_1 = srv16add1_0;
    vec_u8_t srv16add1_2 = srv16add1_0;
    vec_u8_t srv16add1_3 = srv16_0;
    vec_u8_t srv16add1_4 = srv16_0; 
    vec_u8_t srv16add1_5 = srv16_0;
    vec_u8_t srv16add1_6 = srv16_0; 
    vec_u8_t srv16add1_7 = srv16_3;
    vec_u8_t srv16add1_8 = srv16_3; 
    vec_u8_t srv16add1_9 = srv16_3;
    vec_u8_t srv16add1_10 = srv16_7;
    vec_u8_t srv16add1_11 = srv16_7;
    vec_u8_t srv16add1_12= srv16_7; 
    vec_u8_t srv16add1_13 = srv16_7;
    vec_u8_t srv16add1_14 = srv16_10; 
    vec_u8_t srv16add1_15 = srv16_10;

    vec_u8_t  srv16add1 = srv10;  
    vec_u8_t  srv17add1 = srv14;
    vec_u8_t  srv18add1 = srv14;
    vec_u8_t  srv19add1 = srv14;
    vec_u8_t  srv20add1 = srv14;
    vec_u8_t  srv21add1 = srv17;
    vec_u8_t  srv22add1 = srv17;
    vec_u8_t  srv23add1 = srv17;
    vec_u8_t  srv24add1 = srv21;
    vec_u8_t  srv25add1 = srv21;
    vec_u8_t  srv26add1 = srv21;
    vec_u8_t  srv27add1 = srv21;
    vec_u8_t  srv28add1 = srv24;
    vec_u8_t  srv29add1 = srv24;
    vec_u8_t  srv30add1 = srv24;
    vec_u8_t  srv31add1 = srv24;

    vec_u8_t  srv16add1_16 = srv16_10;   
    vec_u8_t  srv16add1_17 = srv16_14;
    vec_u8_t  srv16add1_18 = srv16_14;
    vec_u8_t  srv16add1_19 = srv16_14;
    vec_u8_t  srv16add1_20 = srv16_14;
    vec_u8_t  srv16add1_21 = srv16_17;
    vec_u8_t  srv16add1_22 = srv16_17;
    vec_u8_t  srv16add1_23 = srv16_17;
    vec_u8_t  srv16add1_24 = srv16_21;
    vec_u8_t  srv16add1_25 = srv16_21;
    vec_u8_t  srv16add1_26 = srv16_21;
    vec_u8_t  srv16add1_27 = srv16_21;
    vec_u8_t  srv16add1_28 = srv16_24;
    vec_u8_t  srv16add1_29 = srv16_24;
    vec_u8_t  srv16add1_30 = srv16_24;
    vec_u8_t  srv16add1_31 = srv16_24;

    vec_u8_t vfrac16_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
    vec_u8_t vfrac16_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_2 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vfrac16_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_4 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
    vec_u8_t vfrac16_5 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_6 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_8 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
    vec_u8_t vfrac16_9 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_10 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
    vec_u8_t vfrac16_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_12 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
    vec_u8_t vfrac16_13 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_14 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_16 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
    vec_u8_t vfrac16_17 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_18 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
    vec_u8_t vfrac16_19 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_20 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    vec_u8_t vfrac16_21 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_22 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
    vec_u8_t vfrac16_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_24 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
    vec_u8_t vfrac16_25 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_26 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
    vec_u8_t vfrac16_27 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_28 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
    vec_u8_t vfrac16_29 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_30 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
    vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    vec_u8_t vfrac16_32_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_2 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
    vec_u8_t vfrac16_32_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_4 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
    vec_u8_t vfrac16_32_5 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
    vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_8 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
    vec_u8_t vfrac16_32_9 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    vec_u8_t vfrac16_32_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
    vec_u8_t vfrac16_32_13 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_14 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_32_16 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
    vec_u8_t vfrac16_32_17 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_18 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
    vec_u8_t vfrac16_32_19 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_20 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
    vec_u8_t vfrac16_32_21 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_22 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
    vec_u8_t vfrac16_32_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_32_24 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    vec_u8_t vfrac16_32_25 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_26 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
    vec_u8_t vfrac16_32_27 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_32_28 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vfrac16_32_29 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_30 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
    vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv16, srv16add1, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv16_16, srv16add1_16,  vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
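
/* Note on the 32x32 path above: each output row is two 16-byte stores
 * (columns 0-15 from the srv* vectors, columns 16-31 from the srv16_*
 * vectors), and the 32 vout_* temporaries are recycled once the first
 * sixteen rows have been flushed to dst, presumably to keep register
 * pressure bounded while still giving the compiler long mule/mulo chains
 * to schedule. */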


template<>
void intra_pred<4, 24>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, };
    vec_u8_t mask1={0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, };


    /* mode 24 (intraPredAngle = -5), width 4:
     * offset[0-3]   = {-1, -1, -1, -1}   ( ((y + 1) * -5) >> 5 )
     * fraction[0-3] = {27, 22, 17, 12}   ( ((y + 1) * -5) & 31 )
     * offset is -1 for every row, so ref[-1 + x] = srcPix0[x]: the plain load
     * below needs no projected left-column pixels. */
    vec_u8_t srv = vec_xl(0, srcPix0);  
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* ref[offset[y] + x], four rows packed per vector */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t vfrac4 = (vec_u8_t){27, 27, 27, 27, 22, 22, 22, 22, 17, 17, 17, 17, 12, 12, 12, 12};
    vec_u8_t vfrac4_32 = (vec_u8_t){5, 5, 5, 5, 10, 10, 10, 10, 15, 15, 15, 15, 20, 20, 20, 20};

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
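
/* Note on the dstStride % 16 == 0 branch above: vec_ste writes a single
 * aligned 32-bit element, so each 4-pixel row is first rotated by vec_sld
 * into the element that the target address selects and then stored on its
 * own, leaving the other twelve bytes of each destination line untouched.
 * The fixed byte offsets 16/32/48 encode the dstStride == 16 case this
 * branch appears to be written for. */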

template<>
void intra_pred<8, 24>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
    vec_u8_t mask1={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
    vec_u8_t mask2={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
    vec_u8_t mask3={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
    vec_u8_t mask4={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
    vec_u8_t mask5={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
    vec_u8_t mask6={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
    vec_u8_t mask7={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
        
    vec_u8_t srv_left=vec_xl(16, srcPix0); /* left-column pixels */
    vec_u8_t srv_right=vec_xl(0, srcPix0); /* top-left corner and top row */
    /* mode 24, width 8: refmask_8 prepends the single projected left-column
     * pixel (srcPix0[22]) ahead of the top row. */
    vec_u8_t refmask_8={0x6, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);

    vec_u8_t vfrac8_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac8_1 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac8_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac8_3 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac8_32_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac8_32_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac8_32_3 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8};

    one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
    one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
    one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
    one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 24>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask12={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t maskadd1_0={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
/*vec_u8_t mask1={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask2={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask3={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask4={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask5={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask7={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask8={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask9={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask10={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask11={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask13={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask14={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_1={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_2={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_3={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_4={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_5={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_6={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_8={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_9={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_10={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_11={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_12={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_15={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };*/

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(32, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t srv_right=vec_xl(0, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t refmask_16={0xd, 0x6, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(14, srcPix0);  
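    /* s0 packs the two projected left-column samples (srcPix0[45] and
     * srcPix0[38]) ahead of the above-row samples srcPix0[0..13]; s1
     * continues the above row from srcPix0[14]. */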
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = srv0;
    vec_u8_t srv3 = srv0;
    vec_u8_t srv4 = srv0; 
    vec_u8_t srv5 = srv0;
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = srv6;
    vec_u8_t srv8 = srv6; 
    vec_u8_t srv9 = srv6;
    vec_u8_t srv10 = srv6;
    vec_u8_t srv11 = srv6;
    vec_u8_t srv12= vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = srv12;
    vec_u8_t srv14 = srv12; 
    vec_u8_t srv15 = srv12;
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0_add1;
    vec_u8_t srv2_add1 = srv0_add1;
    vec_u8_t srv3_add1 = srv0_add1;
    vec_u8_t srv4_add1 = srv0_add1; 
    vec_u8_t srv5_add1 = srv0_add1; 
    vec_u8_t srv6_add1 = srv0;
    vec_u8_t srv7_add1 = srv0; 
    vec_u8_t srv8_add1 = srv0;
    vec_u8_t srv9_add1 = srv0;
    vec_u8_t srv10_add1 = srv0;
    vec_u8_t srv11_add1 = srv0;
    vec_u8_t srv12_add1= srv6; 
    vec_u8_t srv13_add1 = srv6;
    vec_u8_t srv14_add1 = srv6; 
    vec_u8_t srv15_add1 = srv6;
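    /* Rows sharing an integer reference offset reuse one shuffled vector,
     * and each group's +1 neighbours coincide with the previous group's base
     * vectors, so no additional permutes are required. */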
vec_u8_t vfrac16_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_2 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_6 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_10 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_14 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_2 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_6 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_10 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_14 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
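    /* Scalar sketch of the loop this kernel vectorizes (f32[y] = 32 - f[y];
     * off[y] and f[y] are the per-row mode-24 offset/fraction already baked
     * into the srv and vfrac vectors above -- names here are illustrative
     * only):
     *
     *   for (int y = 0; y < 16; y++)
     *       for (int x = 0; x < 16; x++)
     *           dst[y * dstStride + x] =
     *               (pixel)((f32[y] * ref[off[y] + x] +
     *                        f[y] * ref[off[y] + x + 1] + 16) >> 5);
     */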
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 24>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
/*vec_u8_t mask1={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask2={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask3={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask4={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask5={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };*/
vec_u8_t mask6={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask7={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask8={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask9={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask10={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask11={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
/*vec_u8_t mask13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask14={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask15={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };

vec_u8_t mask16={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask17={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask18={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };*/
vec_u8_t mask19={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
/*vec_u8_t mask20={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask21={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask22={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask23={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask24={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask25={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask26={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask27={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask28={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask29={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };*/

vec_u8_t maskadd1_0={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv_left0 = vec_xl(64, srcPix0); 
    vec_u8_t srv_left1 = vec_xl(80, srcPix0); 
    vec_u8_t srv_right = vec_xl(0, srcPix0);
    vec_u8_t refmask_32_0 ={0x1a, 0x13, 0xd, 0x6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t refmask_32_1 ={0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );
    vec_u8_t s1 = vec_xl(12, srcPix0);
    vec_u8_t s2 = vec_xl(28, srcPix0);  
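    /* s0 packs the four projected left-column samples (srcPix0[90],
     * srcPix0[83], srcPix0[77], srcPix0[70]) ahead of the above row
     * srcPix0[0..11]; s1 and s2 continue the above row from srcPix0[12] and
     * srcPix0[28]. */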
    //vec_u8_t s3 = vec_xl(44, srcPix0);        

    // Row groups as (first row, row count): (0,6) (6,6) (12,7) (19,6); rows 25-31 use s0 unshuffled.

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = srv0;
    vec_u8_t srv3 = srv0;
    vec_u8_t srv4 = srv0; 
    vec_u8_t srv5 = srv0;
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = srv6;
    vec_u8_t srv8 = srv6;
    vec_u8_t srv9 = srv6;
    vec_u8_t srv10 = srv6;
    vec_u8_t srv11 = srv6;
    vec_u8_t srv12= vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = srv12;
    vec_u8_t srv14 = srv12; 
    vec_u8_t srv15 = srv12;

    // Leftover development note -- per-row offset sketch: 0,0,0,3,3,3,3,7,7,7,10,10,10,10,14,14,14,17,17,17,17,21,21,21,24,24,24,24,s0,s0,s0,s0

    vec_u8_t srv16_0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv16_1 = srv16_0;
    vec_u8_t srv16_2 = srv16_0;
    vec_u8_t srv16_3 = srv16_0;
    vec_u8_t srv16_4 = srv16_0; 
    vec_u8_t srv16_5 = srv16_0;
    vec_u8_t srv16_6 = vec_perm(s1, s2, mask6); 
    vec_u8_t srv16_7 = srv16_6;
    vec_u8_t srv16_8 = srv16_6; 
    vec_u8_t srv16_9 = srv16_6;
    vec_u8_t srv16_10 = srv16_6;
    vec_u8_t srv16_11 = srv16_6;
    vec_u8_t srv16_12=  vec_perm(s1, s2, mask12); 
    vec_u8_t srv16_13 = srv16_12;
    vec_u8_t srv16_14 = srv16_12; 
    vec_u8_t srv16_15 = srv16_12;

    vec_u8_t  srv16 = srv12;  
    vec_u8_t  srv17 = srv12;
    vec_u8_t  srv18 = srv12;
    vec_u8_t  srv19 = vec_perm(s0, s1, mask19);
    vec_u8_t  srv20 = srv19;
    vec_u8_t  srv21 = srv19;
    vec_u8_t  srv22 = srv19;
    vec_u8_t  srv23 = srv19;
    vec_u8_t  srv24 = srv19;
    vec_u8_t  srv25 = s0;
    vec_u8_t  srv26 = s0;
    vec_u8_t  srv27 = s0;
    vec_u8_t  srv28 = s0;
    vec_u8_t  srv29 = s0;
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = srv16_12;  
    vec_u8_t  srv16_17 = srv16_12;
    vec_u8_t  srv16_18 = srv16_12;
    vec_u8_t  srv16_19 = vec_perm(s1, s2, mask19);
    vec_u8_t  srv16_20 = srv16_19;
    vec_u8_t  srv16_21 = srv16_19;
    vec_u8_t  srv16_22 = srv16_19;
    vec_u8_t  srv16_23 = srv16_19;
    vec_u8_t  srv16_24 = srv16_19;
    vec_u8_t  srv16_25 = s1;
    vec_u8_t  srv16_26 = s1;
    vec_u8_t  srv16_27 = s1;
    vec_u8_t  srv16_28 = s1;
    vec_u8_t  srv16_29 = s1;
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;

    vec_u8_t srv0add1 = vec_perm(s0, s1, maskadd1_0);
    vec_u8_t srv1add1 = srv0add1;
    vec_u8_t srv2add1 = srv0add1;
    vec_u8_t srv3add1 = srv0add1;
    vec_u8_t srv4add1 = srv0add1; 
    vec_u8_t srv5add1 = srv0add1; 
    vec_u8_t srv6add1 = srv0;
    vec_u8_t srv7add1 = srv0; 
    vec_u8_t srv8add1 = srv0;
    vec_u8_t srv9add1 = srv0;
    vec_u8_t srv10add1 = srv0;
    vec_u8_t srv11add1 = srv0;
    vec_u8_t srv12add1= srv6; 
    vec_u8_t srv13add1 = srv6;
    vec_u8_t srv14add1 = srv6; 
    vec_u8_t srv15add1 = srv6;

    vec_u8_t srv16add1_0 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv16add1_1 = srv16add1_0;
    vec_u8_t srv16add1_2 = srv16add1_0;
    vec_u8_t srv16add1_3 = srv16add1_0;
    vec_u8_t srv16add1_4 = srv16add1_0; 
    vec_u8_t srv16add1_5 = srv16add1_0;
    vec_u8_t srv16add1_6 = srv16_0; 
    vec_u8_t srv16add1_7 = srv16_0;
    vec_u8_t srv16add1_8 = srv16_0; 
    vec_u8_t srv16add1_9 = srv16_0;
    vec_u8_t srv16add1_10 = srv16_0;
    vec_u8_t srv16add1_11 = srv16_0;
    vec_u8_t srv16add1_12= srv16_6; 
    vec_u8_t srv16add1_13 = srv16_6;
    vec_u8_t srv16add1_14 = srv16_6; 
    vec_u8_t srv16add1_15 = srv16_6;

    vec_u8_t  srv16add1 = srv6;  
    vec_u8_t  srv17add1 = srv6;
    vec_u8_t  srv18add1 = srv6;
    vec_u8_t  srv19add1 = srv12;
    vec_u8_t  srv20add1 = srv12;
    vec_u8_t  srv21add1 = srv12;
    vec_u8_t  srv22add1 = srv12;
    vec_u8_t  srv23add1 = srv12;
    vec_u8_t  srv24add1 = srv12;
    vec_u8_t  srv25add1 = srv19;
    vec_u8_t  srv26add1 = srv19;
    vec_u8_t  srv27add1 = srv19;
    vec_u8_t  srv28add1 = srv19;
    vec_u8_t  srv29add1 = srv19;
    vec_u8_t  srv30add1 = srv19;
    vec_u8_t  srv31add1 = srv19;

    vec_u8_t  srv16add1_16 = srv16_6;   
    vec_u8_t  srv16add1_17 = srv16_6;
    vec_u8_t  srv16add1_18 = srv16_6;
    vec_u8_t  srv16add1_19 = srv16_12;
    vec_u8_t  srv16add1_20 = srv16_12;
    vec_u8_t  srv16add1_21 = srv16_12;
    vec_u8_t  srv16add1_22 = srv16_12;
    vec_u8_t  srv16add1_23 = srv16_12;
    vec_u8_t  srv16add1_24 = srv16_12;
    vec_u8_t  srv16add1_25 = srv16_19;
    vec_u8_t  srv16add1_26 = srv16_19;
    vec_u8_t  srv16add1_27 = srv16_19;
    vec_u8_t  srv16add1_28 = srv16_19;
    vec_u8_t  srv16add1_29 = srv16_19;
    vec_u8_t  srv16add1_30 = srv16_19;
    vec_u8_t  srv16add1_31 = srv16_19;

vec_u8_t vfrac16_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_2 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_6 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_10 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_14 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_17 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_18 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_20 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_21 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_22 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_24 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_25 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_26 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_28 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_29 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_30 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_2 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_6 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_10 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_14 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_17 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_18 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_20 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_21 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_22 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_24 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_25 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_26 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_28 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_29 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_30 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;
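    /* A 32-pixel row spans two 16-byte vectors: the srv* vectors produce the
     * left half of each row and the matching srv16_* vectors the right half. */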

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             
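    /* Rows 16-31 below reuse the vout_* temporaries for the second half of
     * the block. */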

    one_line(srv16, srv16add1, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv16_16, srv16add1_16,  vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void intra_pred<4, 25>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, };
vec_u8_t mask1={0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, };


    // Leftover development notes from mode 19 (kept for reference; not used
    // by this mode-25 kernel):
    //int offset[32] = {-1, -2, -3, -4, -5, -5, -6, -7, -8, -9, -9, -10, -11, -12, -13, -13, -14, -15, -16, -17, -18, -18, -19, -20, -21, -22, -22, -23, -24, -25, -26, -26};
    //int fraction[32] = {6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0, 6, 12, 18, 24, 30, 4, 10, 16, 22, 28, 2, 8, 14, 20, 26, 0};
    //mode=19 width=32 nbProjected=25; (invAngleSum >> 8) sequence: 1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23, 25, 26, 27, 28, 30, 31

    //mode19 invAS[32]= {1, 2, 4, };
    //vec_u8_t mask_left={0x1, 0x02, 0x04, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,0x0, 0x0};
    //vec_u8_t srv_left=vec_xl(8, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    //vec_u8_t srv_right=vec_xl(0, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    //vec_u8_t refmask_4={0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, };
    //vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);  

    vec_u8_t srv=vec_xl(0, srcPix0); 
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
vec_u8_t vfrac4 = (vec_u8_t){30, 30, 30, 30, 28, 28, 28, 28, 26, 26, 26, 26, 24, 24, 24, 24};
vec_u8_t vfrac4_32 = (vec_u8_t){2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8};
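    /* All four 4-pixel rows fit in a single 16-byte vector, so the per-row
     * mode-25 fractions (30, 28, 26, 24) are packed four-wide and the whole
     * block is interpolated with one mule/mulo/pack sequence. */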

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
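    /* vec_mule/vec_mulo widen alternating bytes to 16 bits; merging the even
     * and odd halves before vec_pack restores the original pixel order. */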

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 25>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask1={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask2={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask3={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask4={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask5={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask6={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask7={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
        
    //vec_u8_t srv_left=vec_xl(16, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    //vec_u8_t srv_right=vec_xl(0, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    //vec_u8_t refmask_8={0x7, 0x4, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, };
    //vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);  
    vec_u8_t srv = vec_xl(0, srcPix0);  
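    /* Mode 25's integer offset is -1 for every row at this size, so the
     * reference is simply srcPix0[0..8] and no left-column projection is
     * needed (the commented lines above show the general projected form). */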
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);

vec_u8_t vfrac8_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac8_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac8_2 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac8_3 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac8_32_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac8_32_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac8_32_2 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac8_32_3 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 16, 16, 16, 16};

    one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
    one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
    one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
    one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);

    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 25>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
/*vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask1={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask2={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask3={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask4={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask5={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask6={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask7={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask8={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask9={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask10={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask11={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask12={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask13={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask14={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t maskadd1_0={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_1={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_2={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_3={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_4={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_5={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_7={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_8={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_9={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_10={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_11={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_12={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_15={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };*/
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    //vec_u8_t srv_left=vec_xl(32, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    //vec_u8_t srv_right=vec_xl(0, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    //vec_u8_t refmask_16={0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, };
    //vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);  
    //vec_u8_t s1 = vec_xl(12, srcPix0);        

    vec_u8_t srv0 = vec_xl(0, srcPix0); 
    vec_u8_t srv1 = vec_xl(1, srcPix0);
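    /* The integer offset is -1 for all 16 rows here, so every row
     * interpolates between srcPix0[x] and srcPix0[x + 1]; only the per-row
     * fraction changes. */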

vec_u8_t vfrac16_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_1 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_2 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_4 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_5 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_6 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_8 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_9 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_10 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_12 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_13 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_14 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_1 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_2 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_4 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_5 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_6 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_8 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_9 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_10 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_12 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_13 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_14 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv0, srv1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv0, srv1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv0, srv1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv0, srv1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv0, srv1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv0, srv1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv0, srv1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv0, srv1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv0, srv1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv0, srv1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv0, srv1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv0, srv1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv0, srv1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 25>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_0={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv_left = vec_xl(80, srcPix0); 
    vec_u8_t srv_right = vec_xl(0, srcPix0);
    vec_u8_t refmask_32 ={0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_32);
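    /* s0 places the single projected left-column sample srcPix0[80] ahead of
     * the above-row samples srcPix0[0..14]; the rows whose integer offset
     * reaches -2 consume it. */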
    vec_u8_t s1 = vec_xl(15, srcPix0);
    vec_u8_t s2 = vec_xl(31, srcPix0);  

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv16_0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv0add1 = vec_perm(s0, s1, maskadd1_0);
    vec_u8_t srv16add1_0 = vec_perm(s1, s2, maskadd1_0);

vec_u8_t vfrac16_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_1 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_2 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_4 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_5 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_6 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_8 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_9 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_10 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_12 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_13 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_14 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    vec_u8_t vfrac16_32_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_2 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_4 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_5 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_32_8 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_9 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_13 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_32_14 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv0, srv0add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_0, srv16add1_0, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv0, srv0add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_0, srv16add1_0, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv0, srv0add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_0, srv16add1_0, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv0, srv0add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_0, srv16add1_0, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv0, srv0add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_0, srv16add1_0, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv0, srv0add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_0, srv16add1_0, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv0, srv0add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_0, srv16add1_0, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv0, srv0add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_0, srv16add1_0, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv0, srv0add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_0, srv16add1_0, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv0, srv0add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_0, srv16add1_0, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv0, srv0add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_0, srv16add1_0, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv0, srv0add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_0, srv16add1_0, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv0, srv0add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_0, srv16add1_0, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv0, srv0add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_0, srv16add1_0, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv0, srv0add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_0, srv16add1_0, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(s0, srv0, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(s1, srv16_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(s0, srv0, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(s1, srv16_0, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(s0, srv0, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(s1, srv16_0, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(s0, srv0, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(s1, srv16_0, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(s0, srv0, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(s1, srv16_0, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(s0, srv0, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(s1, srv16_0, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(s0, srv0, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(s1, srv16_0, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(s0, srv0, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(s1, srv16_0, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(s0, srv0, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(s1, srv16_0, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(s0, srv0, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(s1, srv16_0, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(s0, srv0, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(s1, srv16_0, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(s0, srv0, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(s1, srv16_0, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(s0, srv0, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(s1, srv16_0, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(s0, srv0, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(s1, srv16_0, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(s0, srv0, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(s1, srv16_0, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(s0, srv0, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(s1, srv16_0, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
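
/* Reference-only sketch (added for exposition, compiled only with DEBUG; the
 * function name is illustrative): a scalar model of the mode-25 arithmetic the
 * specialisations above vectorise.  It assumes the usual x265 neighbour layout
 * (srcPix[0] = top-left corner, srcPix[1..2w] = top row, srcPix[2w+1..4w] =
 * left column) and that ref[-1] has already been projected from the left
 * column, exactly as the refmask_32 splice does in the 32x32 case. */
#ifdef DEBUG
static void intra_pred_ang25_scalar(pixel* dst, intptr_t dstStride, const pixel* ref, int width)
{
    for (int y = 0; y < width; y++)
    {
        int angSum = (y + 1) * (-2);   /* intraPredAngle = -2 for mode 25 */
        int off    = angSum >> 5;      /* arithmetic shift == floor division */
        int frac   = angSum & 31;      /* 30, 28, ..., 0, then wraps */
        for (int x = 0; x < width; x++)
            dst[y * dstStride + x] = (pixel)(((32 - frac) * ref[off + x + 1]
                                            + frac * ref[off + x + 2] + 16) >> 5);
    }
}
#endif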

template<>
void intra_pred<4, 26>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv = vec_xl(0, srcPix0); /* srcPix0[0] = corner, [1..4] = top row, left column from width*2+1 = 9 */

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_sld(srv, srv, 15);         
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t v0_s16 = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_w4x4_mask9));
        vec_s16_t v1_s16 =  (vec_s16_t)vec_sra( vec_sub(v0_s16, c0_s16v), one_u16v );
        vec_s16_t v_sum = vec_add(c1_s16v, v1_s16);
        vec_u16_t v_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v_sum));
        vec_u8_t v_filter_u8 = vec_pack(v_filter_u16, zero_u16v); 
         vec_u8_t v_mask = {0x10, 0x02, 0x03, 0x04, 0x11, 0x02, 0x03, 0x04, 0x12, 0x02, 0x03, 0x04, 0x13, 0x02, 0x03, 0x04};
         vec_u8_t vout = vec_perm(srv, v_filter_u8, v_mask);
        if(dstStride == 4) {
             vec_xst(vout, 0, dst);
        }
        else if(dstStride%16 == 0){     
            vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
            vec_u8_t v1 = vec_sld(vout, vout, 12);
            vec_ste((vec_u32_t)v1, 0, (unsigned int*)(dst+dstStride));
            vec_u8_t v2 = vec_sld(vout, vout, 8);
            vec_ste((vec_u32_t)v2, 0, (unsigned int*)(dst+dstStride*2));
            vec_u8_t v3 = vec_sld(vout, vout, 4);
            vec_ste((vec_u32_t)v3, 0, (unsigned int*)(dst+dstStride*3));
        }
        else{
             vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
             vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
             vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
             vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
            vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
            vec_xst(v0, 0, dst);
            vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
            vec_xst(v1, dstStride, dst);
            vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
            vec_xst(v2, dstStride*2, dst);
            vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
            vec_xst(v3, dstStride*3, dst);
        }
    }
    else{

        if(dstStride == 4) {
             vec_u8_t v_mask0 = {0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04};
             vec_u8_t v0 = vec_perm(srv, srv, v_mask0);
             vec_xst(v0, 0, dst);
        }
        else if(dstStride%16 == 0){     
             vec_u8_t v_mask0 = {0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04};
             vec_u8_t v0 = vec_perm(srv, srv, v_mask0);
            vec_ste((vec_u32_t)v0, 0, (unsigned int*)dst);
            vec_u8_t v1 = vec_sld(v0, v0, 12);
            vec_ste((vec_u32_t)v1, 0, (unsigned int*)(dst+dstStride));
            vec_u8_t v2 = vec_sld(v0, v0, 8);
            vec_ste((vec_u32_t)v2, 0, (unsigned int*)(dst+dstStride*2));
            vec_u8_t v3 = vec_sld(v0, v0, 4);
            vec_ste((vec_u32_t)v3, 0, (unsigned int*)(dst+dstStride*3));
        }
        else{
            vec_u8_t v_mask = {0x01, 0x02, 0x03, 0x04, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
            vec_u8_t v0 = vec_perm(srv, vec_xl(0, dst), v_mask);
            vec_xst(v0, 0, dst);
            vec_u8_t v1 = vec_perm(srv, vec_xl(dstStride, dst), v_mask);
            vec_xst(v1, dstStride, dst);
            vec_u8_t v2 = vec_perm(srv, vec_xl(dstStride*2, dst), v_mask);
            vec_xst(v2, dstStride*2, dst);
            vec_u8_t v3 = vec_perm(srv, vec_xl(dstStride*3, dst), v_mask);
            vec_xst(v3, dstStride*3, dst);
        }
    }
        
#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
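
/* Scalar model (exposition only, DEBUG builds; illustrative name) of the
 * mode-26 path above: vertical prediction copies the top row, and when
 * bFilter is set column 0 is smoothed toward the left neighbours.  Layout
 * assumption as elsewhere in this file: srcPix[0] = corner, srcPix[1..2w] =
 * top, srcPix[2w+1..4w] = left. */
#ifdef DEBUG
static void intra_pred_ver_scalar(pixel* dst, intptr_t dstStride, const pixel* srcPix, int width, int bFilter)
{
    for (int y = 0; y < width; y++)
    {
        for (int x = 0; x < width; x++)
            dst[y * dstStride + x] = srcPix[1 + x];   /* copy top row */
        if (bFilter)
        {
            int v = srcPix[1] + ((srcPix[width * 2 + 1 + y] - srcPix[0]) >> 1);
            dst[y * dstStride] = (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* 8-bit clip, cf. min_s16v */
        }
    }
}
#endif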


template<>
void intra_pred<8, 26>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv = vec_xl(0, srcPix0); /* srcPix0[0] = corner, [1..8] = top row; left column loaded separately below */

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(17, srcPix0);           
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b1_mask));
        vec_s16_t v0_s16 = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_w8x8_maskh));
        vec_s16_t v1_s16 =  (vec_s16_t)vec_sra( vec_sub(v0_s16, c0_s16v), one_u16v );
        vec_s16_t v_sum = vec_add(c1_s16v, v1_s16);
        vec_u16_t v_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v_sum));
        vec_u8_t v_filter_u8 = vec_pack(v_filter_u16, zero_u16v); 
        vec_u8_t v_mask0 = {0x00, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x01, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
        vec_u8_t v_mask1 = {0x02, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x03, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
        vec_u8_t v_mask2 = {0x04, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x05, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
        vec_u8_t v_mask3 = {0x06, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x07, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
        vec_u8_t v0 = vec_perm(v_filter_u8, srv, v_mask0);
        vec_u8_t v1 = vec_perm(v_filter_u8, srv, v_mask1);
        vec_u8_t v2 = vec_perm(v_filter_u8, srv, v_mask2);
        vec_u8_t v3 = vec_perm(v_filter_u8, srv, v_mask3);
        if(dstStride == 8) {
            vec_xst(v0, 0, dst);
            vec_xst(v1, 16, dst);
            vec_xst(v2, 32, dst);
            vec_xst(v3, 48, dst);
        }
        else{
             vec_u8_t v_maskh = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
             vec_u8_t v_maskl = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
            vec_xst(vec_perm(v0, vec_xl(0, dst), v_maskh), 0, dst);
            vec_xst(vec_perm(v0, vec_xl(dstStride, dst), v_maskl), dstStride, dst);
            vec_xst(vec_perm(v1, vec_xl(dstStride*2, dst), v_maskh), dstStride*2, dst);
            vec_xst(vec_perm(v1, vec_xl(dstStride*3, dst), v_maskl), dstStride*3, dst);
            vec_xst(vec_perm(v2, vec_xl(dstStride*4, dst), v_maskh), dstStride*4, dst);
            vec_xst(vec_perm(v2, vec_xl(dstStride*5, dst), v_maskl), dstStride*5, dst);
            vec_xst(vec_perm(v3, vec_xl(dstStride*6, dst), v_maskh), dstStride*6, dst);
            vec_xst(vec_perm(v3, vec_xl(dstStride*7, dst), v_maskl), dstStride*7, dst);
        }
    }
    else{
        if(dstStride == 8) {
            vec_u8_t v_mask = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
            vec_u8_t v0 = vec_perm(srv, srv, v_mask);
            vec_xst(v0, 0, dst);
            vec_xst(v0, 16, dst);
            vec_xst(v0, 32, dst);
            vec_xst(v0, 48, dst);
        }
        else{
             vec_u8_t v_mask = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
            vec_xst(vec_perm(srv, vec_xl(0, dst), v_mask), 0, dst);
            vec_xst(vec_perm(srv, vec_xl(dstStride, dst), v_mask), dstStride, dst);
            vec_xst(vec_perm(srv, vec_xl(dstStride*2, dst), v_mask), dstStride*2, dst);
            vec_xst(vec_perm(srv, vec_xl(dstStride*3, dst), v_mask), dstStride*3, dst);
            vec_xst(vec_perm(srv, vec_xl(dstStride*4, dst), v_mask), dstStride*4, dst);
            vec_xst(vec_perm(srv, vec_xl(dstStride*5, dst), v_mask), dstStride*5, dst);
            vec_xst(vec_perm(srv, vec_xl(dstStride*6, dst), v_mask), dstStride*6, dst);
            vec_xst(vec_perm(srv, vec_xl(dstStride*7, dst), v_mask), dstStride*7, dst);
        }
    }
        
#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
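
/* Note on the partial-row stores used above when dstStride != width: there is
 * no 8-byte vector store here, so the code loads 16 bytes of dst, vec_perm()s
 * the new row into the low half (mask entries 0x18..0x1f keep the old dst
 * bytes), and writes all 16 bytes back -- a read-modify-write that leaves the
 * pixels beyond the 8-pixel row untouched. */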

template<>
void intra_pred<16, 26>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv =vec_xl(0, srcPix0); 
    vec_u8_t srv1 =vec_xl(1, srcPix0); 

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(33, srcPix0);   /* offset = width2+1 = width<<1 + 1 */  
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b1_mask));
        vec_s16_t v0h_s16 = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_w8x8_maskh));
        vec_s16_t v0l_s16 = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_w8x8_maskl));
        vec_s16_t v1h_s16 =  (vec_s16_t)vec_sra( vec_sub(v0h_s16, c0_s16v), one_u16v );
        vec_s16_t v1l_s16 =  (vec_s16_t)vec_sra( vec_sub(v0l_s16, c0_s16v), one_u16v );
        vec_s16_t vh_sum = vec_add(c1_s16v, v1h_s16);
        vec_s16_t vl_sum = vec_add(c1_s16v, v1l_s16);
        vec_u16_t vh_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vh_sum));
        vec_u16_t vl_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vl_sum));
        vec_u8_t v_filter_u8 = vec_pack(vh_filter_u16, vl_filter_u16); 
        vec_u8_t mask0 = {0x00, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask1 = {0x01, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask2 = {0x02, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask3 = {0x03, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask4 = {0x04, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask5 = {0x05, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask6 = {0x06, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask7 = {0x07, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask8 = {0x08, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask9 = {0x09, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask10 = {0xa, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask11 = {0xb, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask12 = {0xc, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask13 = {0xd, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask14 = {0xe, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask15 = {0xf, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};


        if(dstStride == 16) {
             vec_xst(vec_perm(v_filter_u8, srv1, mask0), 0, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask1), 16, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask2), 32, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask3), 48, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask4), 64, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask5), 80, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask6), 96, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask7), 112, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask8), 128, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask9), 144, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask10), 160, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask11), 176, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask12), 192, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask13), 208, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask14), 224, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask15), 240, dst);
        }
        else{
             vec_xst(vec_perm(v_filter_u8, srv1, mask0), 0, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask1), dstStride, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask2), dstStride*2, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask3), dstStride*3, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask4), dstStride*4, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask5), dstStride*5, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask6), dstStride*6, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask7), dstStride*7, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask8), dstStride*8, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask9), dstStride*9, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask10), dstStride*10, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask11), dstStride*11, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask12), dstStride*12, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask13), dstStride*13, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask14), dstStride*14, dst);
             vec_xst(vec_perm(v_filter_u8, srv1, mask15), dstStride*15, dst);
           }
    }
    else{
        if(dstStride == 16) {
             vec_xst(srv1, 0, dst);
             vec_xst(srv1, 16, dst);
             vec_xst(srv1, 32, dst);
             vec_xst(srv1, 48, dst);
             vec_xst(srv1, 64, dst);
             vec_xst(srv1, 80, dst);
             vec_xst(srv1, 96, dst);
             vec_xst(srv1, 112, dst);
             vec_xst(srv1, 128, dst);
             vec_xst(srv1, 144, dst);
             vec_xst(srv1, 160, dst);
             vec_xst(srv1, 176, dst);
             vec_xst(srv1, 192, dst);
             vec_xst(srv1, 208, dst);
             vec_xst(srv1, 224, dst);
             vec_xst(srv1, 240, dst);
        }
        else{
             vec_xst(srv1, 0, dst);
             vec_xst(srv1, dstStride, dst);
             vec_xst(srv1, dstStride*2, dst);
             vec_xst(srv1, dstStride*3, dst);
             vec_xst(srv1, dstStride*4, dst);
             vec_xst(srv1, dstStride*5, dst);
             vec_xst(srv1, dstStride*6, dst);
             vec_xst(srv1, dstStride*7, dst);
             vec_xst(srv1, dstStride*8, dst);
             vec_xst(srv1, dstStride*9, dst);
             vec_xst(srv1, dstStride*10, dst);
             vec_xst(srv1, dstStride*11, dst);
             vec_xst(srv1, dstStride*12, dst);
             vec_xst(srv1, dstStride*13, dst);
             vec_xst(srv1, dstStride*14, dst);
             vec_xst(srv1, dstStride*15, dst);
        }
    }
        
#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
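
/* The 16-wide filter path above runs the clip arithmetic in two s16 halves
 * (vh_ and vl_) because the (left[y] - srcPix0[0]) >> 1 sums need 16-bit
 * headroom; vec_pack() folds both halves back into one 16-byte column of
 * filtered pixels, and the per-row masks splice one of those pixels into
 * column 0 of each output row. */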

template<>
void intra_pred<32, 26>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv = vec_xl(1, srcPix0);  /* top neighbours: srcPix0[1..16] */
    vec_u8_t srv1 = vec_xl(17, srcPix0); /* srcPix0[17..32] */
        
    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(0, srcPix0);
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_u8_t  srcv1 = vec_xl(65, srcPix0);          
        vec_s16_t v0h_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskh));
        vec_s16_t v0l_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskl));
        vec_s16_t v1h_s16 =  (vec_s16_t)vec_sra( vec_sub(v0h_s16, c0_s16v), one_u16v );
        vec_s16_t v1l_s16 =  (vec_s16_t)vec_sra( vec_sub(v0l_s16, c0_s16v), one_u16v );

        vec_s16_t vh_sum = vec_add(c1_s16v, v1h_s16);
        vec_s16_t vl_sum = vec_add(c1_s16v, v1l_s16);
        vec_u16_t vh_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vh_sum));
        vec_u16_t vl_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vl_sum));
        vec_u8_t v_filter_u8 = vec_pack(vh_filter_u16, vl_filter_u16); 

        vec_u8_t mask0 = {0x00, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask1 = {0x01, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask2 = {0x02, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask3 = {0x03, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask4 = {0x04, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask5 = {0x05, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask6 = {0x06, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask7 = {0x07, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask8 = {0x08, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask9 = {0x09, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask10 = {0xa, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask11 = {0xb, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask12 = {0xc, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask13 = {0xd, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask14 = {0xe, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask15 = {0xf, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_xst(vec_perm(v_filter_u8, srv, mask0), 0, dst);
         vec_xst(srv1, 16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask1), dstStride, dst);
         vec_xst(srv1, dstStride+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask2), dstStride*2, dst);
         vec_xst(srv1, dstStride*2+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask3), dstStride*3, dst);
         vec_xst(srv1, dstStride*3+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask4), dstStride*4, dst);
         vec_xst(srv1, dstStride*4+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask5), dstStride*5, dst);
         vec_xst(srv1, dstStride*5+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask6), dstStride*6, dst);
         vec_xst(srv1, dstStride*6+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask7), dstStride*7, dst);
         vec_xst(srv1, dstStride*7+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask8), dstStride*8, dst);
         vec_xst(srv1, dstStride*8+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask9), dstStride*9, dst);
         vec_xst(srv1, dstStride*9+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask10), dstStride*10, dst);
         vec_xst(srv1, dstStride*10+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask11), dstStride*11, dst);
         vec_xst(srv1, dstStride*11+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask12), dstStride*12, dst);
         vec_xst(srv1, dstStride*12+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask13), dstStride*13, dst);
         vec_xst(srv1, dstStride*13+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask14), dstStride*14, dst);
         vec_xst(srv1, dstStride*14+16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask15), dstStride*15, dst);
         vec_xst(srv1, dstStride*15+16, dst);

        vec_u8_t  srcv2 = vec_xl(81, srcPix0);          
        vec_s16_t v2h_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv2, u8_to_s16_w8x8_maskh));
        vec_s16_t v2l_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv2, u8_to_s16_w8x8_maskl));
        vec_s16_t v3h_s16 =  (vec_s16_t)vec_sra( vec_sub(v2h_s16, c0_s16v), one_u16v );
        vec_s16_t v3l_s16 =  (vec_s16_t)vec_sra( vec_sub(v2l_s16, c0_s16v), one_u16v );
        vec_s16_t v2h_sum = vec_add(c1_s16v, v3h_s16);
        vec_s16_t v2l_sum = vec_add(c1_s16v, v3l_s16);
        vec_u16_t v2h_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v2h_sum));
        vec_u16_t v2l_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v2l_sum));
        vec_u8_t v2_filter_u8 = vec_pack(v2h_filter_u16, v2l_filter_u16); 

         vec_xst(vec_perm(v2_filter_u8, srv, mask0), dstStride*16, dst);
         vec_xst(srv1, dstStride*16+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask1), dstStride*17, dst);
         vec_xst(srv1, dstStride*17+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask2), dstStride*18, dst);
         vec_xst(srv1, dstStride*18+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask3), dstStride*19, dst);
         vec_xst(srv1, dstStride*19+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask4), dstStride*20, dst);
         vec_xst(srv1, dstStride*20+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask5), dstStride*21, dst);
         vec_xst(srv1, dstStride*21+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask6), dstStride*22, dst);
         vec_xst(srv1, dstStride*22+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask7), dstStride*23, dst);
         vec_xst(srv1, dstStride*23+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask8), dstStride*24, dst);
         vec_xst(srv1, dstStride*24+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask9), dstStride*25, dst);
         vec_xst(srv1, dstStride*25+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask10), dstStride*26, dst);
         vec_xst(srv1, dstStride*26+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask11), dstStride*27, dst);
         vec_xst(srv1, dstStride*27+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask12), dstStride*28, dst);
         vec_xst(srv1, dstStride*28+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask13), dstStride*29, dst);
         vec_xst(srv1, dstStride*29+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask14), dstStride*30, dst);
         vec_xst(srv1, dstStride*30+16, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask15), dstStride*31, dst);
         vec_xst(srv1, dstStride*31+16, dst);

    }
    else{
        int offset = 0;

        for(int i=0; i<32; i++){
            vec_xst(srv, offset, dst);
            vec_xst(srv1, 16+offset, dst);
            offset += dstStride;
        }
    }
#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
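
/* In the 32-wide filter above the left column comes from two loads: srcv1 at
 * byte 65 covers left[0..15] (rows 0-15) and srcv2 at byte 81 covers
 * left[16..31] (rows 16-31).  Each batch is clipped and spliced into column 0
 * while the remaining 31 pixels of every row are plain copies of the top
 * neighbours. */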

template<>
void intra_pred<4, 27>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv =vec_xl(1, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-3] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8}; /* fraction[0-3] */
    vec_u8_t vfrac4_32 = (vec_u8_t){30, 30, 30, 30, 28, 28, 28, 28, 26, 26, 26, 26, 24, 24, 24, 24}; /* 32 - fraction[0-3] */

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        /* dstStride is only known to be a multiple of 16, so index by stride */
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 0, (unsigned int*)(dst+dstStride));
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 0, (unsigned int*)(dst+dstStride*2));
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 0, (unsigned int*)(dst+dstStride*3));
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
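
/* Scalar model (exposition only, DEBUG builds; illustrative name) of the
 * mode-27 arithmetic used by this and the wider specialisations that follow:
 * intraPredAngle = +2, so the integer reference offset stays 0 until row 15
 * and the fraction walks 2, 4, ..., 30, 0. */
#ifdef DEBUG
static void intra_pred_ang27_scalar(pixel* dst, intptr_t dstStride, const pixel* srcPix, int width)
{
    const pixel* ref = srcPix + 1;     /* top neighbours */
    for (int y = 0; y < width; y++)
    {
        int angSum = (y + 1) * 2;      /* intraPredAngle = +2 for mode 27 */
        int off    = angSum >> 5;      /* 0 for y < 15, 1 at y = 15, ... */
        int frac   = angSum & 31;
        for (int x = 0; x < width; x++)
            dst[y * dstStride + x] = (pixel)(((32 - frac) * ref[off + x]
                                            + frac * ref[off + x + 1] + 16) >> 5);
    }
}
#endif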

template<>
void intra_pred<8, 27>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv =vec_xl(1, srcPix0); /* ref[offset + x], ref=srcPix0+1;  offset[0-7] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    /* fraction[0-7] */
    vec_u8_t vfrac8_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac8_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac8_2 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac8_3 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* 32 - fraction[0-7] */
    vec_u8_t vfrac8_32_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac8_32_2 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac8_32_3 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));


    /* y2, y3 */        
    vmle0 = vec_mule(srv0, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv0, vfrac8_32_1); 
    vmle1 = vec_mule(srv1, vfrac8_1); 
    vmlo1 = vec_mulo(srv1, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));


    /* y4, y5 */        
    vmle0 = vec_mule(srv0, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv0, vfrac8_32_2); 
    vmle1 = vec_mule(srv1, vfrac8_2); 
    vmlo1 = vec_mulo(srv1, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y6, y7 */        
    vmle0 = vec_mule(srv0, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv0, vfrac8_32_3);
    vmle1 = vec_mule(srv1, vfrac8_3); 
    vmlo1 = vec_mulo(srv1, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 27>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0);  /* ref[0..15], ref = srcPix0 + 1; offset[0-14] = 0, off[15] = 1 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[16..31] */
    vec_u8_t srv0 = sv0; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_1 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_2 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

#if 0
    #define one_line(s0, s1, vf32, vf, vout) {\
        vmle0 = vec_mule(s0, vf32);\
        vmlo0 = vec_mulo(s0, vf32);\
        vmle1 = vec_mule(s1, vf);\
        vmlo1 = vec_mulo(s1, vf);\
        vsume = vec_add(vec_add(vmle0, vmle1), u16_16);\
        ve = vec_sra(vsume, u16_5);\
        vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16);\
        vo = vec_sra(vsumo, u16_5);\
        vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));\
    }
#endif
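
    /* one_line() is the same even/odd idiom written out long-hand in the 4x4
     * and 8x8 specialisations: vec_mule/vec_mulo produce the sixteen u8*u8
     * products as two u16 vectors, the rounded >> 5 happens at 16-bit
     * precision, and vec_mergeh/vec_mergel + vec_pack restore byte order. */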

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv0, srv1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv0, srv1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv0, srv1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv0, srv1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv0, srv1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv0, srv1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv0, srv1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv0, srv1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv0, srv1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv0, srv1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv0, srv1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv0, srv1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv1, srv2, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 27>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0);  /* ref[0..15], ref = srcPix0 + 1; offset[0-14] = 0, off[15] = 1 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[16..31] */
    vec_u8_t sv2 = vec_xl(33, srcPix0); /* ref[32..47] */
    vec_u8_t srv0 = sv0; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2); /* from y= 15, use srv1, srv2 */
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3); /* y=31, use srv2, srv3 */

    vec_u8_t srv4 = sv1;
    vec_u8_t srv5 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv6 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv7 = vec_perm(sv1, sv2, mask3);
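
    /* Each 32-pixel row is two 16-byte halves: srv0..srv3 feed the left half
     * and srv4..srv7 (the same window advanced 16 bytes) the right half.
     * Rows 16-31 step to the srv1/srv2 and srv5/srv6 pairs because the
     * integer reference offset has advanced by one. */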

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_1 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_2 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv4, srv5, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv4, srv5, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv4, srv5, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv0, srv1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv4, srv5, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv0, srv1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv4, srv5, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv0, srv1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv4, srv5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv0, srv1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv4, srv5, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv0, srv1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv4, srv5, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv0, srv1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv4, srv5, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv0, srv1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv4, srv5, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv0, srv1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv4, srv5, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv0, srv1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv4, srv5, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv0, srv1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv4, srv5, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv0, srv1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv4, srv5, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv0, srv1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv4, srv5, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv1, srv2, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv5, srv6, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             


    one_line(srv1, srv2, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv5, srv6, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv5, srv6, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv5, srv6, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv1, srv2, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv5, srv6, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv1, srv2, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv5, srv6, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv1, srv2, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv5, srv6, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv1, srv2, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv5, srv6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv1, srv2, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv5, srv6, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv1, srv2, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv5, srv6, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv1, srv2, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv5, srv6, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv1, srv2, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv5, srv6, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv1, srv2, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv5, srv6, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv1, srv2, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv5, srv6, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv1, srv2, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv5, srv6, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv1, srv2, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv5, srv6, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv2, srv3, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv6, srv7, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
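
/* For reference, a scalar sketch of the interpolation that each one_line()
 * invocation vectorizes sixteen pixels at a time.  Illustrative only and
 * kept out of the build: `angle` stands for the HEVC intraPredAngle of the
 * mode (e.g. 5 for mode 28, 9 for mode 29), from which the offset[] and
 * fraction[] tables quoted in the comments below are derived. */
#if 0
static void angular_ref_sketch(pixel* dst, intptr_t dstStride, const pixel* ref,
                               int width, int angle)
{
    for (int y = 0; y < width; y++)
    {
        int pos      = (y + 1) * angle;
        int offset   = pos >> 5;   /* whole-pel step into the reference row */
        int fraction = pos & 31;   /* 1/32-pel blend between two neighbours */
        for (int x = 0; x < width; x++)
            dst[y * dstStride + x] = (pixel)(((32 - fraction) * ref[offset + x]
                                   + fraction * ref[offset + x + 1] + 16) >> 5);
    }
}
#endif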

template<>
void intra_pred<4, 28>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //mode 28
    //int offset[32] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5};
    //int fraction[32] = {5, 10, 15, 20, 25, 30, 3, 8, 13, 18, 23, 28, 1, 6, 11, 16, 21, 26, 31, 4, 9, 14, 19, 24, 29, 2, 7, 12, 17, 22, 27, 0};
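    /* Both tables follow from mode 28's intraPredAngle of 5:
     * offset[y] = ((y + 1) * 5) >> 5 and fraction[y] = ((y + 1) * 5) & 31,
     * e.g. y = 6 gives 35 >> 5 = 1 and 35 & 31 = 3. */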

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(1, srcPix0); /* ref[x], ref = srcPix0 + 1; offset[0-3] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* ref[offset[y] + x] for rows y = 0..3 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* ref[offset[y] + x + 1] */

    vec_u8_t vfrac4 = (vec_u8_t){5, 5, 5, 5, 10, 10, 10, 10, 15, 15, 15, 15, 20, 20, 20, 20}; /* fraction[0-3] */
    vec_u8_t vfrac4_32 = (vec_u8_t){27, 27, 27, 27, 22, 22, 22, 22, 17, 17, 17, 17, 12, 12, 12, 12}; /* 32 - fraction[0-3] */


    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
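    /* vec_mule/vec_mulo split the u8 x u8 products into even/odd u16 lanes
     * (a full product needs 16 bits); after rounding and shifting, the
     * mergeh/mergel + pack step below re-interleaves the lanes into pixel order. */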
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 0, (unsigned int*)(dst+dstStride));
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 0, (unsigned int*)(dst+dstStride*2));
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 0, (unsigned int*)(dst+dstStride*3));
    }
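    /* Store paths: when dstStride is a multiple of 16 every row keeps the
     * same word alignment, so vec_sld (little-endian byte order) rotates row
     * r into element 0 and a single vec_ste scatters it.  The generic path
     * below instead merges each 4-byte row into the loaded destination with
     * vec_perm, leaving the other 12 bytes of the line untouched. */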
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 28>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //mode 28
    //int offset[32] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5};
    //int fraction[32] = {5, 10, 15, 20, 25, 30, 3, 8, 13, 18, 23, 28, 1, 6, 11, 16, 21, 26, 31, 4, 9, 14, 19, 24, 29, 2, 7, 12, 17, 22, 27, 0};
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(1, srcPix0); /* ref[x], ref = srcPix0 + 1; offset[0-5] = 0, offset[6-7] = 1 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* ref[0 + x] packed for two rows */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* ref[1 + x] */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* ref[2 + x] */

    /* fraction[0-7] */
    vec_u8_t vfrac8_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac8_1 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac8_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac8_3 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8}; 

    /* 32 - fraction[0-7] */
    vec_u8_t vfrac8_32_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac8_32_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac8_32_3 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 24, 24, 24, 24, 24, 24, 24, 24}; 
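    /* Each 16-byte vector packs two 8-pixel rows, so every vfrac8 constant
     * carries two per-row fractions; e.g. vfrac8_3 = {3.., 8..} serves rows
     * 6-7, whose offset is 1 -- hence the srv1/srv2 operands in that step. */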

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv0, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv0, vfrac8_32_1); 
    vmle1 = vec_mule(srv1, vfrac8_1); 
    vmlo1 = vec_mulo(srv1, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));


    /* y4, y5 */        
    vmle0 = vec_mule(srv0, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv0, vfrac8_32_2); 
    vmle1 = vec_mule(srv1, vfrac8_2); 
    vmlo1 = vec_mulo(srv1, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y6, y7 */        
    vmle0 = vec_mule(srv1, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv1, vfrac8_32_3);
    vmle1 = vec_mule(srv2, vfrac8_3); 
    vmlo1 = vec_mulo(srv2, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 28>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0);  /* ref[x],      ref = srcPix0 + 1 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[x + 16] */
    vec_u8_t srv0 = sv0;                /* srvN = ref[N + x] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);

    //mode 28
    //int offset[32] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5};
    //int fraction[32] = {5, 10, 15, 20, 25, 30, 3, 8, 13, 18, 23, 28, 1, 6, 11, 16, 21, 26, 31, 4, 9, 14, 19, 24, 29, 2, 7, 12, 17, 22, 27, 0};

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vfrac16_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_2 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
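    /* The srv pairing below mirrors offset[]: rows 0-5 blend srv0/srv1
     * (offset 0), rows 6-11 blend srv1/srv2 (offset 1), and rows 12-15
     * blend srv2/srv3 (offset 2). */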

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv0, srv1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv0, srv1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv0, srv1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv1, srv2, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv1, srv2, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv1, srv2, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv1, srv2, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv1, srv2, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv1, srv2, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv2, srv3, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv2, srv3, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv2, srv3, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv2, srv3, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 28>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0);  /* ref[x],      ref = srcPix0 + 1 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[x + 16] */
    vec_u8_t sv2 = vec_xl(33, srcPix0); /* ref[x + 32] */
    /* left half (x = 0..15): each srv holds ref[offset + x] for one offset */
    vec_u8_t srv0 = sv0;                        /* ref[0 + x] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);  /* ref[1 + x] */
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);  /* ref[2 + x] */
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);  /* ref[3 + x] */
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask4);  /* ref[4 + x] */
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask5);  /* ref[5 + x] */
    vec_u8_t srv12 = vec_perm(sv0, sv1, mask6); /* ref[6 + x] */

    /* right half (x = 16..31), same offsets */
    vec_u8_t srv4 = sv1;
    vec_u8_t srv5 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv6 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv7 = vec_perm(sv1, sv2, mask3);
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask4);
    vec_u8_t srv11 = vec_perm(sv1, sv2, mask5);
    vec_u8_t srv13 = vec_perm(sv1, sv2, mask6);

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vfrac16_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_2 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    vec_u8_t vfrac16_16 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_17 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_18 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_20 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_21 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_22 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_24 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_25 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_26 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_28 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_29 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_30 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    vec_u8_t vfrac16_32_16 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_32_17 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_32_18 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_32_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_20 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_32_21 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_32_22 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_32_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_32_24 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_32_25 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_32_26 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_32_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_32_28 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_32_29 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_32_30 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; 
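    /* Rows 16-31 continue the sequence; fraction[31] = 0, so vfrac16_31 is
     * all zeros and vfrac16_32_31 is all 32 -- the final row is a plain copy
     * of ref[offset + x]. */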

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv4, srv5, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv4, srv5, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv4, srv5, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv0, srv1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv4, srv5, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv0, srv1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv4, srv5, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv0, srv1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv4, srv5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv1, srv2, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv5, srv6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv1, srv2, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv5, srv6, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv1, srv2, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv5, srv6, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv1, srv2, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv5, srv6, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv1, srv2, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv5, srv6, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv1, srv2, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv5, srv6, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv2, srv3, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv6, srv7, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv2, srv3, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv6, srv7, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv2, srv3, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv6, srv7, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv2, srv3, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv6, srv7, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv2, srv3, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv6, srv7, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv2, srv3, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv6, srv7, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv2, srv3, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv6, srv7, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv3, srv8, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv7, srv10, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv3, srv8, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv7, srv10, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv3, srv8, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv7, srv10, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv3, srv8, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv7, srv10, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv3, srv8, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv7, srv10, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv3, srv8, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv7, srv10, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv8, srv9, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv10, srv11, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv8, srv9, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv10, srv11, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv8, srv9, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv10, srv11, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv8, srv9, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv10, srv11, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv8, srv9, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv10, srv11, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv8, srv9, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv10, srv11, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv9, srv12, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv11, srv13, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 29>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //mode 29:
    //int offset[32] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9};
    //int fraction[32] = {9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0};
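    /* As with mode 28, both tables follow from the mode's intraPredAngle,
     * here 9: offset[y] = ((y + 1) * 9) >> 5, fraction[y] = ((y + 1) * 9) & 31. */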

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(1, srcPix0); /* ref[x], ref = srcPix0 + 1; offset[0-2] = 0, offset[3] = 1 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* ref[offset[y] + x]; mask0 folds in offset[3] = 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* ref[offset[y] + x + 1] */

    vec_u8_t vfrac4 = (vec_u8_t){9, 9, 9, 9, 18, 18, 18, 18, 27, 27, 27, 27, 4, 4, 4, 4}; /* fraction[0-3] */
    vec_u8_t vfrac4_32 = (vec_u8_t){23, 23, 23, 23, 14, 14, 14, 14, 5, 5, 5, 5, 28, 28, 28, 28}; /* 32 - fraction[0-3] */


    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 0, (unsigned int*)(dst+dstStride));
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 0, (unsigned int*)(dst+dstStride*2));
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 0, (unsigned int*)(dst+dstStride*3));
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 29>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //mode 29:
    //int offset[32] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9};
    //int fraction[32] = {9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0};
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask2={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask3={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask4={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask5={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(1, srcPix0); /* ref[x], ref = srcPix0 + 1; offset[0-2] = 0, offset[3-6] = 1, offset[7] = 2 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* packed-row offsets (0, 0) */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* (1, 1) */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* (0, 1) */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* (1, 2) */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* (2, 2) */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* (2, 3) */

    /* fraction[0-7] */
    vec_u8_t vfrac8_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac8_1 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac8_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac8_3 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 8, 8, 8, 8, 8, 8, 8, 8}; 

    /* 32 - fraction[0-7] */
    vec_u8_t vfrac8_32_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac8_32_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac8_32_3 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 24, 24, 24, 24, 24, 24, 24, 24}; 
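    /* Two rows per vector with per-row offsets: y0-1 use (0,0) -> srv0/srv1,
     * y2-3 use (0,1) -> srv2/srv3, y4-5 use (1,1) -> srv1/srv4, and y6-7
     * use (1,2) -> srv3/srv5. */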

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));


    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv2, vfrac8_32_1); 
    vmle1 = vec_mule(srv3, vfrac8_1); 
    vmlo1 = vec_mulo(srv3, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv1, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv1, vfrac8_32_2); 
    vmle1 = vec_mule(srv4, vfrac8_2); 
    vmlo1 = vec_mulo(srv4, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv3, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv3, vfrac8_32_3);
    vmle1 = vec_mule(srv5, vfrac8_3); 
    vmlo1 = vec_mulo(srv5, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 29>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //mode 29:
    //int offset[32] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9};
    //int fraction[32] = {9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0};
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0);  /* ref[x],      ref = srcPix0 + 1 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[x + 16] */
    vec_u8_t srv0 = sv0;                /* srvN = ref[N + x] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
    vec_u8_t vfrac16_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_2 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
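    /* offset[] advances faster at angle 9: rows 0-2 blend srv0/srv1,
     * rows 3-6 srv1/srv2, rows 7-9 srv2/srv3, rows 10-13 srv3/srv4 and
     * rows 14-15 srv4/srv5, matching the one_line() calls below. */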

    /* dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    /* one_line() is the macro defined earlier in this file; it performs the
     * same even/odd multiply, round, shift and pack sequence that is written
     * out in full in the 8x8 specializations above. */
    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv1, srv2, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv1, srv2, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv1, srv2, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv1, srv2, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv2, srv3, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv2, srv3, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv2, srv3, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv3, srv4, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv3, srv4, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv3, srv4, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv3, srv4, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv4, srv5, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv4, srv5, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 29>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //mode 29:
    //int offset[32] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9};
    //int fraction[32] = {9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0};
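    /* For reference: the scalar form this specialization vectorizes (an
     * illustrative sketch using the offset[]/fraction[] tables above, not
     * compiled here):
     *
     *     const pixel* ref = srcPix0 + 1;
     *     for (int y = 0; y < 32; y++)
     *         for (int x = 0; x < 32; x++)
     *             dst[y * dstStride + x] = (pixel)(((32 - fraction[y]) * ref[offset[y] + x]
     *                                              + fraction[y] * ref[offset[y] + x + 1] + 16) >> 5);
     */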

    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 =vec_xl(1, srcPix0); /* sv0 = ref[0..15], where ref = srcPix0 + 1 */
    vec_u8_t sv1 =vec_xl(17, srcPix0); /* sv1 = ref[16..31] */
    vec_u8_t sv2 =vec_xl(33, srcPix0); /* sv2 = ref[32..47] */
    vec_u8_t srv0 = sv0; /* srvN = ref[N + x]; row y reads the pair selected by offset[y] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2); 
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3); 
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4); 
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5); 
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6); 
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7); 
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9); 
    vec_u8_t srva = vec_perm(sv0, sv1, mask10); 

    vec_u8_t srv00 = sv1; /* srvN0 = ref[16 + N + x] for the right 16 columns */
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv20 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv30 = vec_perm(sv1, sv2, mask3); 
    vec_u8_t srv40 = vec_perm(sv1, sv2, mask4); 
    vec_u8_t srv50 = vec_perm(sv1, sv2, mask5); 
    vec_u8_t srv60 = vec_perm(sv1, sv2, mask6); 
    vec_u8_t srv70 = vec_perm(sv1, sv2, mask7); 
    vec_u8_t srv80 = vec_perm(sv1, sv2, mask8);
    vec_u8_t srv90 = vec_perm(sv1, sv2, mask9); 
    vec_u8_t srva0 = vec_perm(sv1, sv2, mask10); 


    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
    vec_u8_t vfrac16_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_2 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* fraction[16-31] */
    vec_u8_t vfrac16_16 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
    vec_u8_t vfrac16_17 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }; 
    vec_u8_t vfrac16_18 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_19 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_20 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_21 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_22 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_24 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_25 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_26 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_27 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_28 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_29 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_30 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 


    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* 32 - fraction[16-31] */
    vec_u8_t vfrac16_32_16 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
    vec_u8_t vfrac16_32_17 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_32_18 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_32_19 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_32_20 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_32_21 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_22 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_32_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_32_24 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_32_25 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_26 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_32_27 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_32_28 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_32_29 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_32_30 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; 
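    /* Each vfrac16_32_N above is 32 - vfrac16_N.  For the last row,
     * fraction[31] == 0, so the weight pair degenerates to 32/0 and row 31
     * reduces to a straight copy of ref[offset[31] + x]. */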

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv00, srv10, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv00, srv10, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv00, srv10, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv1, srv2, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv10, srv20, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv1, srv2, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv10, srv20, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv1, srv2, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv10, srv20, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv1, srv2, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv10, srv20, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv2, srv3, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv20, srv30, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv2, srv3, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv20, srv30, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv2, srv3, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv20, srv30, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv3, srv4, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv30, srv40, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv3, srv4, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv30, srv40, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv3, srv4, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv30, srv40,  vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv3, srv4, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv30, srv40, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv4, srv5, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv40, srv50, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv4, srv5, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv40, srv50, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv4, srv5, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv40, srv50, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv5, srv6, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv50, srv60, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv5, srv6, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv50, srv60, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv5, srv6, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv50, srv60, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv5, srv6, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv50, srv60, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv6, srv7, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv60, srv70, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv6, srv7, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv60, srv70, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv6, srv7, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv60, srv70, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv7, srv8, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv70, srv80, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv7, srv8, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv70, srv80, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv7, srv8, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv70, srv80, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv7, srv8, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv70, srv80, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv8, srv9, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv80, srv90, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv8, srv9, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv80, srv90, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv8, srv9, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv80, srv90, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv9, srva, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv90, srva0, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 30>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //mode 30:
    //int offset[32] = {0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13};
    //int fraction[32] = {13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0};

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv =vec_xl(1, srcPix0); /* srv = ref[0..15], ref = srcPix0 + 1; offset[0-3] = {0, 0, 1, 1} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* rows 0-3 of ref[offset[y] + x], four pixels per row */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){13, 13, 13, 13, 26, 26, 26, 26, 7, 7, 7, 7, 20, 20, 20, 20}; /* fraction[0-3] */
    vec_u8_t vfrac4_32 = (vec_u8_t){19, 19, 19, 19, 6, 6, 6, 6, 25, 25, 25, 25, 12, 12, 12, 12}; /* 32 - fraction[0-3] */
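    /* All four 4-pixel rows are processed in one 16-byte vector: lanes 0-3
     * hold row 0, lanes 4-7 row 1, and so on.  Each row's fraction[y]
     * (13, 26, 7, 20) is therefore replicated four times in vfrac4, with
     * vfrac4_32 holding the matching 32 - fraction[y] weights (19, 6, 25, 12). */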


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){ /* note: the fixed 16/32/48 byte offsets below address rows 1-3 only when dstStride == 16 */
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 30>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //mode 30:
    //int offset[32] = {0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13};
    //int fraction[32] = {13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0};
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask4={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask5={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv =vec_xl(1, srcPix0); /* srv = ref[0..15], ref = srcPix0 + 1; offset[0-7] = {0, 0, 1, 1, 2, 2, 2, 3} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 0 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 1 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 2 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 3 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 2, 3 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 3, 4 */
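    /* Rows 6 and 7 use different offsets (2 and 3), so mask4/mask5 combine
     * two shift amounts within a single vector, unlike srv0-srv3 where both
     * packed rows share one offset. */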

    /* fraction[0-7] */
    vec_u8_t vfrac8_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac8_1 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac8_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14, 14, 14, 14, 14 }; 
    vec_u8_t vfrac8_3 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 8, 8, 8, 8, 8, 8, 8, 8}; 

    /* 32 - fraction[0-7] */
    vec_u8_t vfrac8_32_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac8_32_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac8_32_3 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 24, 24, 24, 24, 24, 24, 24, 24}; 
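    /* Each 16-byte vector packs two 8-pixel rows (lanes 0-7 = row 2k,
     * lanes 8-15 = row 2k+1), so every vfrac8_N holds the fraction weights of
     * two consecutive rows and vfrac8_32_N the matching 32 - fraction values. */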

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));


    /* y2, y3 */        
    vmle0 = vec_mule(srv1, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv1, vfrac8_32_1); 
    vmle1 = vec_mule(srv2, vfrac8_1); 
    vmlo1 = vec_mulo(srv2, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv2, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv2, vfrac8_32_2); 
    vmle1 = vec_mule(srv3, vfrac8_2); 
    vmlo1 = vec_mulo(srv3, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv4, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv4, vfrac8_32_3);
    vmle1 = vec_mule(srv5, vfrac8_3); 
    vmlo1 = vec_mulo(srv5, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 30>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //mode 30:
    //int offset[32] = {0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13};
    //int fraction[32] = {13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0};

    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    //vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    //vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    //vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    //vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    //vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    //vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    //vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 =vec_xl(1, srcPix0); /* sv0 = ref[0..15], where ref = srcPix0 + 1 */
    vec_u8_t sv1 =vec_xl(17, srcPix0); /* sv1 = ref[16..31] */
    vec_u8_t srv0 = sv0; /* srvN = ref[N + x]; row y reads the pair selected by offset[y] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    //vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    //vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    //vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    //vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    //vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    //vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    //vec_u8_t srve = vec_perm(sv0, sv1, mask14);

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
    vec_u8_t vfrac16_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
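    /* With 16-pixel rows a single vector covers exactly one row, so each
     * vfrac16_N / vfrac16_32_N constant is one weight splatted across all
     * 16 lanes, unlike the 4x4 and 8x8 cases where rows share a vector. */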

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv1, srv2, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv2, srv3, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv2, srv3, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv2, srv3, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv3, srv4, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv3, srv4, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv4, srv5, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv4, srv5, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv4, srv5, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv5, srv6, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv5, srv6, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv6, srv7, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv6, srv7, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 30>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    //mode 30:
    //int offset[32] = {0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 12, 12, 13};
    //int fraction[32] = {13, 26, 7, 20, 1, 14, 27, 8, 21, 2, 15, 28, 9, 22, 3, 16, 29, 10, 23, 4, 17, 30, 11, 24, 5, 18, 31, 12, 25, 6, 19, 0};

    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 =vec_xl(1, srcPix0); /* sv0 = ref[0..15], where ref = srcPix0 + 1 */
    vec_u8_t sv1 =vec_xl(17, srcPix0); /* sv1 = ref[16..31] */
    vec_u8_t sv2 =vec_xl(33, srcPix0); /* sv2 = ref[32..47] */

    vec_u8_t srv0 = sv0; /* srvN = ref[N + x] for the left 16 columns; row y reads the pair selected by offset[y] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);

    vec_u8_t srv00 = sv1; /* srvN0 = ref[16 + N + x] for the right 16 columns */
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv20 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv30 = vec_perm(sv1, sv2, mask3);
    vec_u8_t srv40 = vec_perm(sv1, sv2, mask4);
    vec_u8_t srv50 = vec_perm(sv1, sv2, mask5);
    vec_u8_t srv60 = vec_perm(sv1, sv2, mask6);
    vec_u8_t srv70 = vec_perm(sv1, sv2, mask7);
    vec_u8_t srv80 = vec_perm(sv1, sv2, mask8);
    vec_u8_t srv90 = vec_perm(sv1, sv2, mask9);
    vec_u8_t srva0 = vec_perm(sv1, sv2, mask10);
    vec_u8_t srvb0 = vec_perm(sv1, sv2, mask11);
    vec_u8_t srvc0 = vec_perm(sv1, sv2, mask12);
    vec_u8_t srvd0 = vec_perm(sv1, sv2, mask13);
    vec_u8_t srve0 = vec_perm(sv1, sv2, mask14);


    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
    vec_u8_t vfrac16_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* fraction[16-31] */
    vec_u8_t vfrac16_16 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
    vec_u8_t vfrac16_17 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_18 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_20 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_21 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_22 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_24 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_25 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_26 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_28 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_29 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_30 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 


    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* 32 - fraction[16-31] */
    vec_u8_t vfrac16_32_16 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    vec_u8_t vfrac16_32_17 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_18 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_32_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_20 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_32_21 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_32_22 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_32_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_32_24 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_32_25 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_32_26 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_32_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_32_28 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_32_29 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_30 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv00, srv10, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv00, srv10, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv10, srv20, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv1, srv2, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv10, srv20, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv2, srv3, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv20, srv30, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv2, srv3, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv20, srv30, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv2, srv3, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv20, srv30, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv3, srv4, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv30, srv40, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv3, srv4, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv30, srv40, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv4, srv5, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv40, srv50, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv4, srv5, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv40, srv50, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv4, srv5, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv40, srv50, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv5, srv6, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv50, srv60,  vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv5, srv6, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv50, srv60, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv6, srv7, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv60, srv70, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv6, srv7, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv60, srv70, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv6, srv7, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv60, srv70, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv7, srv8, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv70, srv80, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv7, srv8, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv70, srv80, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv8, srv9, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv80, srv90, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv8, srv9, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv80, srv90, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv8, srv9, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv80, srv90, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv9, srva, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv90, srva0, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv9, srva, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv90, srva0, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srva, srvb, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srva0, srvb0, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srva, srvb, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srva0, srvb0, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srva, srvb, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srva0, srvb0, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srvb, srvc, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srvb0, srvc0, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srvb, srvc, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srvb0, srvc0, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srvc, srvd, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srvc0, srvd0, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srvc, srvd, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srvc0, srvd0, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srvd, srve, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srvd0, srve0, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 31>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
        }
    */
    //mode 31:
    //int offset[32] = {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17};
    //int fraction[32] = {17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15, 0};
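    /* A scalar sketch of where these tables come from, assuming the HEVC angle
       step for mode 31 (intraPredAngle = 17):

           for (int y = 0; y < 32; y++)
           {
               int pos = (y + 1) * 17;      // accumulated angle
               offset[y]   = pos >> 5;      // whole-pel step into ref[]
               fraction[y] = pos & 31;      // 1/32-pel interpolation weight
           }

       mask0/mask1 below pack the four 4-pixel rows (offsets 0, 1, 1, 2) into
       single vectors so all of y=0-3 is interpolated at once. */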

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(1, srcPix0); /* ref[offset + x], ref = srcPix0 + 1; offsets for y=0-3 are {0,1,1,2} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){17, 17, 17, 17, 2, 2, 2, 2, 19, 19, 19, 19, 4, 4, 4, 4};
    vec_u8_t vfrac4_32 = (vec_u8_t){15, 15, 15, 15, 30, 30, 30, 30, 13, 13, 13, 13, 28, 28, 28, 28};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[off + x] + f[y] * ref[off + x + 1] + 16) >> 5); */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
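    /* vec_mule/vec_mulo widen the even/odd u8 lanes to u16 products; after the
       rounded shift, vec_mergeh/vec_mergel re-interleave the even and odd halves
       so vec_pack restores the original 16-lane order. */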

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
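        /* scatter path: each vec_ste stores one 32-bit element (one 4-pixel row);
           the fixed byte offsets 16/32/48 appear to assume dstStride == 16 */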
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 31>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[0]* ref[off0 + 7] + f[0] * ref[off0 + 8] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[1]* ref[off1 + 7] + f[1] * ref[off1 + 8] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[2]* ref[off2 + 7] + f[2] * ref[off2 + 8] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[3]* ref[off3 + 7] + f[3] * ref[off3 + 8] + 16) >> 5);

            ...

            y=7;  off7 = offset[7]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[7]* ref[off7 + 0] + f[7] * ref[off7 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[7]* ref[off7 + 1] + f[7] * ref[off7 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[7]* ref[off7 + 2] + f[7] * ref[off7 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[7]* ref[off7 + 3] + f[7] * ref[off7 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[7]* ref[off7 + 7] + f[7] * ref[off7 + 8] + 16) >> 5);
        }
    */
    //mode 31:
    //int offset[32] = {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17};
    //int fraction[32] = {17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15, 0};

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(1, srcPix0); /* ref[offset + x], ref = srcPix0 + 1; offsets for y=0-7 are {0,1,1,2,2,3,3,4} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* offsets 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 5 */

    /* fraction[0-7] */
    vec_u8_t vfrac8_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac8_1 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac8_2 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac8_3 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 8, 8, 8, 8, 8, 8, 8, 8};

    /* 32 - fraction[0-7] */
    vec_u8_t vfrac8_32_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac8_32_2 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac8_32_3 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 24, 24, 24, 24, 24, 24, 24, 24};
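    /* each 16-byte vector holds two 8-pixel rows, so every vfrac8_k / vfrac8_32_k
       pair carries fraction[2k] in its low eight lanes and fraction[2k+1] in its
       high eight lanes, matching the fraction[] table above */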

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[off + x] + f[y] * ref[off + x + 1] + 16) >> 5); */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));


    /* y2, y3 */        
    vmle0 = vec_mule(srv1, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv1, vfrac8_32_1); 
    vmle1 = vec_mule(srv2, vfrac8_1); 
    vmlo1 = vec_mulo(srv2, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv2, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv2, vfrac8_32_2); 
    vmle1 = vec_mule(srv3, vfrac8_2); 
    vmlo1 = vec_mulo(srv3, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv3, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv3, vfrac8_32_3);
    vmle1 = vec_mule(srv4, vfrac8_3); 
    vmlo1 = vec_mulo(srv4, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 31>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[0]* ref[off0 + 15] + f[0] * ref[off0 + 16] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[1]* ref[off1 + 15] + f[1] * ref[off1 + 16] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[2]* ref[off2 + 15] + f[2] * ref[off2 + 16] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[3]* ref[off3 + 15] + f[3] * ref[off3 + 16] + 16) >> 5);

            ...

            y=15;  off15 = offset[15]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[15]* ref[off15 + 0] + f[15] * ref[off15 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[15]* ref[off15 + 1] + f[15] * ref[off15 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[15]* ref[off15 + 2] + f[15] * ref[off15 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[15]* ref[off15 + 3] + f[15] * ref[off15 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[15]* ref[off15 + 15] + f[15] * ref[off15 + 16] + 16) >> 5);
        }
    */
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0);  /* ref[0..15], ref = srcPix0 + 1; offsets for y=0-15 run 0..8 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[16..31] */
    vec_u8_t srv0 = sv0; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
    vec_u8_t vfrac16_1 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
    vec_u8_t vfrac16_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_4 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
    vec_u8_t vfrac16_5 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_6 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_8 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
    vec_u8_t vfrac16_9 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_10 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
    vec_u8_t vfrac16_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_12 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
    vec_u8_t vfrac16_13 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_14 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
    vec_u8_t vfrac16_32_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_32_4 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
    vec_u8_t vfrac16_32_5 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_32_8 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
    vec_u8_t vfrac16_32_9 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vfrac16_32_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    vec_u8_t vfrac16_32_13 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_14 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
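    /* for 16-wide rows each vector covers one full row, so the weight vectors are
       plain splats of fraction[y] and 32 - fraction[y] */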

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[off + x] + f[y] * ref[off + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
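    /* one_line(s0, s1, vf32, vf, vout), defined earlier in this file, is the
       macro form of the mule/mulo sequence written out in the 4x4 and 8x8
       specializations: presumably vout = pack((s0 * vf32 + s1 * vf + 16) >> 5)
       over the even/odd lanes, using the vmle/vmlo/vsum temporaries above. */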

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv2, srv3, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv2, srv3, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv3, srv4, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv3, srv4, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv4, srv5, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv4, srv5, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv5, srv6, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv5, srv6, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv6, srv7, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv6, srv7, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv7, srv8, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv7, srv8, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv8, srv9, vfrac16_32_15, vfrac16_15, vout_15);
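    /* offset[] advances every other row ({0, 1, 1, 2, 2, ...}), which is why each
       srv pair feeds two consecutive one_line calls after the first row */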

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 31>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[0]* ref[off0 + 31] + f[0] * ref[off0 + 32] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[1]* ref[off1 + 31] + f[1] * ref[off1 + 32] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[2]* ref[off2 + 31] + f[2] * ref[off2 + 32] + 16) >> 5);

            ...
            
            y=15;  off15 = offset[15]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[15]* ref[off15 + 0] + f[15] * ref[off15 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[15]* ref[off15 + 1] + f[15] * ref[off15 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[15]* ref[off15 + 2] + f[15] * ref[off15 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[15]* ref[off15 + 3] + f[15] * ref[off15 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[15]* ref[off15 + 31] + f[15] * ref[off15 + 32] + 16) >> 5);
 
            ...

            y=31;  off31 = offset[31]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[31]* ref[off31 + 0] + f[31] * ref[off31 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[31]* ref[off31 + 1] + f[31] * ref[off31 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[31]* ref[off31 + 2] + f[31] * ref[off31 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[31]* ref[off31 + 3] + f[31] * ref[off31 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[31]* ref[off31 + 31] + f[31] * ref[off31 + 32] + 16) >> 5);
        }
    */
    //mode 31:
    //int offset[32] = {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17};
    //int fraction[32] = {17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15, 0};

    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u8_t mask15={0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0);  /* ref[0..15], ref = srcPix0 + 1; offsets for y=0-31 run 0..17 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[16..31] */
    vec_u8_t sv2 = vec_xl(33, srcPix0); /* ref[32..47] */
    vec_u8_t sv3 = vec_xl(49, srcPix0); /* ref[48..63] */

    vec_u8_t srv0 = sv0; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);

    vec_u8_t srv00 = sv1; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv20 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv30 = vec_perm(sv1, sv2, mask3);
    vec_u8_t srv40 = vec_perm(sv1, sv2, mask4);
    vec_u8_t srv50 = vec_perm(sv1, sv2, mask5);
    vec_u8_t srv60 = vec_perm(sv1, sv2, mask6);
    vec_u8_t srv70 = vec_perm(sv1, sv2, mask7);
    vec_u8_t srv80 = vec_perm(sv1, sv2, mask8);
    vec_u8_t srv90 = vec_perm(sv1, sv2, mask9);
    vec_u8_t srva0 = vec_perm(sv1, sv2, mask10);
    vec_u8_t srvb0 = vec_perm(sv1, sv2, mask11);
    vec_u8_t srvc0 = vec_perm(sv1, sv2, mask12);
    vec_u8_t srvd0 = vec_perm(sv1, sv2, mask13);
    vec_u8_t srve0 = vec_perm(sv1, sv2, mask14);
    vec_u8_t srvf0 = vec_perm(sv1, sv2, mask15);

    vec_u8_t srv000 = sv2;
    vec_u8_t srv100 = vec_perm(sv2, sv3, mask1);
    vec_u8_t srv200 = vec_perm(sv2, sv3, mask2);
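    /* rows 28-31 read past sv2 for their right-hand 16 columns: with offsets up
       to 17 (see offset[] above), x=16-31 can index ref[] as far as 17 + 31 + 1 = 49,
       hence sv3 and the extra vectors srv000/srv100/srv200 */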


    /* fraction[0-31] */
    vec_u8_t vfrac16_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
    vec_u8_t vfrac16_1 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
    vec_u8_t vfrac16_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_4 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
    vec_u8_t vfrac16_5 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_6 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_8 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
    vec_u8_t vfrac16_9 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_10 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
    vec_u8_t vfrac16_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_12 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
    vec_u8_t vfrac16_13 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_14 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_16 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    vec_u8_t vfrac16_17 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_18 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    vec_u8_t vfrac16_19 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_20 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vfrac16_21 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_22 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
    vec_u8_t vfrac16_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_24 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
    vec_u8_t vfrac16_25 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_26 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
    vec_u8_t vfrac16_27 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_28 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
    vec_u8_t vfrac16_29 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_30 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
    vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    /* 32 - fraction[0-31] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
    vec_u8_t vfrac16_32_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_32_4 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
    vec_u8_t vfrac16_32_5 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_32_8 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
    vec_u8_t vfrac16_32_9 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vfrac16_32_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
    vec_u8_t vfrac16_32_13 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_14 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_32_16 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
    vec_u8_t vfrac16_32_17 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_18 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
    vec_u8_t vfrac16_32_19 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_20 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
    vec_u8_t vfrac16_32_21 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_22 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
    vec_u8_t vfrac16_32_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_24 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
    vec_u8_t vfrac16_32_25 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_26 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
    vec_u8_t vfrac16_32_27 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_28 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
    vec_u8_t vfrac16_32_29 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_30 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
    vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
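    /* fraction[31] = 0, so vfrac16_31 is all zeros and vfrac16_32_31 is all 32s:
       the last row reduces to a straight copy of ref[offset + x] */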

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[off + x] + f[y] * ref[off + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;
    //int offset[32] = {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17};

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv00, srv10, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv10, srv20, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv10, srv20, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv2, srv3, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv20, srv30, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv2, srv3, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv20, srv30, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv3, srv4, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv30, srv40, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv3, srv4, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv30, srv40, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv4, srv5, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv40, srv50, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv4, srv5, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv40, srv50, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv5, srv6, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv50, srv60, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv5, srv6, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv50, srv60, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv6, srv7, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv60, srv70, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv6, srv7, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv60, srv70,  vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv7, srv8, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv70, srv80, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv7, srv8, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv70, srv80, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv8, srv9, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv80, srv90, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srv9, srva, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv90, srva0, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv9, srva, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv90, srva0, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srva, srvb, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srva0, srvb0, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srva, srvb, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srva0, srvb0, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srvb, srvc, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srvb0, srvc0, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srvb, srvc, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srvb0, srvc0, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srvc, srvd, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srvc0, srvd0, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srvc, srvd, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srvc0, srvd0, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srvd, srve, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srvd0, srve0, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srvd, srve, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srvd0, srve0, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srve, srvf, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srve0, srvf0, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srve, srvf, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srve0, srvf0, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srvf, srv00, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srvf0, srv000, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srvf, srv00, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srvf0, srv000, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv00, srv10, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv000, srv100, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv10, srv20, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv100, srv200, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void intra_pred<4, 32>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
        }
    */
    //mode 32:
    //int offset[32] = {0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21};
    //int fraction[32] = {21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0};
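    /* same derivation as mode 31 above, with intraPredAngle = 21 (consistent with
       these tables): offset[y] = ((y + 1) * 21) >> 5, fraction[y] = ((y + 1) * 21) & 31 */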

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(1, srcPix0); /* ref[offset + x], ref = srcPix0 + 1; offsets for y=0-3 are {0,1,1,2} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){21, 21, 21, 21, 10, 10, 10, 10, 31, 31, 31, 31, 20, 20, 20, 20};
    vec_u8_t vfrac4_32 = (vec_u8_t){11, 11, 11, 11, 22, 22, 22, 22, 1, 1, 1, 1, 12, 12, 12, 12};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[off + x] + f[y] * ref[off + x + 1] + 16) >> 5); */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
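        /* same scatter path as in mode 31: the fixed byte offsets 16/32/48 appear
           to assume dstStride == 16 */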
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 32>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[0]* ref[off0 + 7] + f[0] * ref[off0 + 8] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[1]* ref[off1 + 7] + f[1] * ref[off1 + 8] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[2]* ref[off2 + 7] + f[2] * ref[off2 + 8] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[3]* ref[off3 + 7] + f[3] * ref[off3 + 8] + 16) >> 5);

            ...

            y=7;  off7 = offset[7]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[7]* ref[off7 + 0] + f[7] * ref[off7 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[7]* ref[off7 + 1] + f[7] * ref[off7 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[7]* ref[off7 + 2] + f[7] * ref[off7 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[7]* ref[off7 + 3] + f[7] * ref[off7 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[7]* ref[off7 + 7] + f[7] * ref[off7 + 8] + 16) >> 5);
        }
    */
    //mode 32:
    //int offset[32] = {0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21};
    //int fraction[32] = {21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0};
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b};
    vec_u8_t mask5={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c};
    vec_u8_t mask6={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(1, srcPix0); /* ref[offset + x], ref = srcPix0 + 1; offsets for y=0-7 are {0,1,1,2,3,3,4,5} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 3 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 4 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 4, 5 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 5, 6 */

vec_u8_t vfrac8_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac8_1 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac8_2 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac8_3 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 8, 8, 8, 8, 8, 8, 8, 8};

vec_u8_t vfrac8_32_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac8_32_1 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac8_32_2 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac8_32_3 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5); */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
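    /* Note on the pattern above: vec_mule/vec_mulo widen the u8 products to u16
       in the even/odd lanes, so the per-lane sum cannot overflow
       ((32 - f) * 255 + f * 255 + 16 = 8176 < 65536). After rounding (+16) and
       the >> 5 shift, vec_mergeh/vec_mergel re-interleave the even and odd
       halves and vec_pack narrows them back to u8 in the original byte order. */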


    /* y2, y3 */        
    vmle0 = vec_mule(srv1, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv1, vfrac8_32_1); 
    vmle1 = vec_mule(srv2, vfrac8_1); 
    vmlo1 = vec_mulo(srv2, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv3, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv3, vfrac8_32_2); 
    vmle1 = vec_mule(srv4, vfrac8_2); 
    vmlo1 = vec_mulo(srv4, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv5, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv5, vfrac8_32_3);
    vmle1 = vec_mule(srv6, vfrac8_3); 
    vmlo1 = vec_mulo(srv6, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }
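    /* For any other stride, each row is stored read-modify-write: vec_perm
       merges the 8 freshly computed bytes with the 8 bytes already present
       past the row end (read back via vec_xl), so the 16-byte vec_xst never
       clobbers the following row. */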

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 32>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[0]* ref[off0 + 15] + f[0] * ref[off0 + 16] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[1]* ref[off1 + 15] + f[1] * ref[off1 + 16] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[2]* ref[off2 + 15] + f[2] * ref[off2 + 16] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[3]* ref[off3 + 15] + f[3] * ref[off3 + 16] + 16) >> 5);

            ...

            y=15;  off15 = offset[15]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[15]* ref[off15 + 0] + f[15] * ref[off15 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[15]* ref[off15 + 1] + f[15] * ref[off15 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[15]* ref[off15 + 2] + f[15] * ref[off15 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[15]* ref[off15 + 3] + f[15] * ref[off15 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[15]* ref[off15 + 15] + f[15] * ref[off15 + 16] + 16) >> 5);
        }
    */
    //mode 32:
    //int offset[32] = {0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21};
    //int fraction[32] = {21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0};
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0); /* ref[0..15], ref = srcPix0 + 1 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[16..31]; offset[15] + 16 = 26 is the largest index needed */
    vec_u8_t srv0 = sv0; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);

vec_u8_t vfrac16_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_4 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_6 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_8 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_10 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_12 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_14 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

vec_u8_t vfrac16_32_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_4 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_6 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_8 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_10 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_12 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_14 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv2, srv3, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv3, srv4, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv3, srv4, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv4, srv5, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv5, srv6, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv5, srv6, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv6, srv7, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv7, srv8, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv7, srv8, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv8, srv9, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv9, srva, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv9, srva, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srva, srvb, vfrac16_32_15, vfrac16_15, vout_15);
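    /* one_line() is assumed to be the helper macro defined earlier in this
       file; presumably it expands to the same even/odd sequence written
       long-hand in intra_pred<8, 32> above, roughly:
         vmle0 = vec_mule(refN, f32);  vmlo0 = vec_mulo(refN, f32);
         vmle1 = vec_mule(refN1, f);   vmlo1 = vec_mulo(refN1, f);
         ve = vec_sra(vec_add(vec_add(vmle0, vmle1), u16_16), u16_5);
         vo = vec_sra(vec_add(vec_add(vmlo0, vmlo1), u16_16), u16_5);
         vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));  */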

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 32>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[0]* ref[off0 + 31] + f[0] * ref[off0 + 32] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[1]* ref[off1 + 31] + f[1] * ref[off1 + 32] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[2]* ref[off2 + 31] + f[2] * ref[off2 + 32] + 16) >> 5);

            ...
            
            y=15;  off15 = offset[15]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[15]* ref[off15 + 0] + f[15] * ref[off15 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[15]* ref[off15 + 1] + f[15] * ref[off15 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[15]* ref[off15 + 2] + f[15] * ref[off15 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[15]* ref[off15 + 3] + f[15] * ref[off15 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[15]* ref[off15 + 31] + f[15] * ref[off15 + 32] + 16) >> 5);
 
            ...

            y=31;  off31 = offset[31]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[31]* ref[off31 + 0] + f[31] * ref[off31 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[31]* ref[off31 + 1] + f[31] * ref[off31 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[31]* ref[off31 + 2] + f[31] * ref[off31 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[31]* ref[off31 + 3] + f[31] * ref[off31 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[31]* ref[off31 + 31] + f[31] * ref[off31 + 32] + 16) >> 5);
        }
    */
    //mode 32:
    //int offset[32] = {0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21};
    //int fraction[32] = {21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0};
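    /* 32x32 strategy: every output row is produced as two independent 16-lane
       halves (e.g. vout_0/vout_1 for row 0 below), each permuted from a pair
       of adjacent 16-byte source loads. The largest reference index needed is
       offset[31] + 32 = 21 + 32 = 53, so the four loads sv0..sv3 cover the
       whole reference run. */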
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u8_t mask15={0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0); /* ref[0..15], ref = srcPix0 + 1 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[16..31] */
    vec_u8_t sv2 = vec_xl(33, srcPix0); /* ref[32..47] */
    vec_u8_t sv3 = vec_xl(49, srcPix0); /* ref[48..63]; offset[31] + 32 = 53 is the largest index needed */

    vec_u8_t srv0 = sv0; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);

    vec_u8_t srv00 = sv1; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv20 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv30 = vec_perm(sv1, sv2, mask3);
    vec_u8_t srv40 = vec_perm(sv1, sv2, mask4);
    vec_u8_t srv50 = vec_perm(sv1, sv2, mask5);
    vec_u8_t srv60 = vec_perm(sv1, sv2, mask6);
    vec_u8_t srv70 = vec_perm(sv1, sv2, mask7);
    vec_u8_t srv80 = vec_perm(sv1, sv2, mask8);
    vec_u8_t srv90 = vec_perm(sv1, sv2, mask9);
    vec_u8_t srva0 = vec_perm(sv1, sv2, mask10);
    vec_u8_t srvb0 = vec_perm(sv1, sv2, mask11);
    vec_u8_t srvc0 = vec_perm(sv1, sv2, mask12);
    vec_u8_t srvd0 = vec_perm(sv1, sv2, mask13);
    vec_u8_t srve0 = vec_perm(sv1, sv2, mask14);
    vec_u8_t srvf0 = vec_perm(sv1, sv2, mask15);

    vec_u8_t srv000 = sv2;
    vec_u8_t srv100 = vec_perm(sv2, sv3, mask1);
    vec_u8_t srv200 = vec_perm(sv2, sv3, mask2);
    vec_u8_t srv300 = vec_perm(sv2, sv3, mask3);
    vec_u8_t srv400 = vec_perm(sv2, sv3, mask4);
    vec_u8_t srv500 = vec_perm(sv2, sv3, mask5);
    vec_u8_t srv600 = vec_perm(sv2, sv3, mask6);

vec_u8_t vfrac16_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_4 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_6 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_8 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_10 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_12 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_14 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_17 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_18 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_20 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_21 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_22 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_24 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_25 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_26 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_28 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_29 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_30 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

vec_u8_t vfrac16_32_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_4 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_6 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_8 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_10 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_12 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_14 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_17 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_18 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_20 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_21 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_22 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_24 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_25 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_26 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_28 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_29 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_30 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv00, srv10, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv10, srv20, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv10, srv20, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv2, srv3, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv20, srv30, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv3, srv4, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv30, srv40, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv3, srv4, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv30, srv40, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv4, srv5, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv40, srv50, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv5, srv6, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv50, srv60, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv5, srv6, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv50, srv60, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv6, srv7, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv60, srv70, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv7, srv8, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv70, srv80, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv7, srv8, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv70, srv80, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv8, srv9, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv80, srv90,  vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv9, srva, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv90, srva0, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv9, srva, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv90, srva0, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srva, srvb, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srva0, srvb0, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srvb, srvc, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srvb0, srvc0, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srvb, srvc, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srvb0, srvc0, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srvc, srvd, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srvc0, srvd0, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srvd, srve, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srvd0, srve0, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srvd, srve, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srvd0, srve0, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srve, srvf, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srve0, srvf0, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srvf, srv00, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srvf0, srv000, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srvf, srv00, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srvf0, srv000, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv00, srv10, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv000, srv100, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv10, srv20, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv100, srv200, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv10, srv20, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv100, srv200, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv20, srv30, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv200, srv300, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv30, srv40, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv300, srv400, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv30, srv40, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv300, srv400, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv40, srv50, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv400, srv500, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv50, srv60, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv500, srv600, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 33>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
        }
    */
    //mode 33:
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(1, srcPix0); /* ref[0..15], ref = srcPix0 + 1; row offsets 0..3 are baked into mask0/mask1 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){26, 26, 26, 26, 20, 20, 20, 20, 14, 14, 14, 14, 8, 8, 8, 8};
    vec_u8_t vfrac4_32 = (vec_u8_t){6, 6, 6, 6, 12, 12, 12, 12, 18, 18, 18, 18, 24, 24, 24, 24};
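    /* All four 4-pixel rows fit in one 16-byte vector: mask0 gathers
       ref[offset[y] + x] and mask1 gathers ref[offset[y] + x + 1] for
       y = 0..3 (offsets 0,1,2,3), while vfrac4 replicates fraction[y] four
       times per row and vfrac4_32 the matching 32 - fraction[y] weights
       (6 = 32 - 26, 12 = 32 - 20, ...). A single multiply/add/shift pass then
       produces the whole 4x4 block. */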


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5); */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    if(dstStride==4){
        vec_xst(vout, 0, dst);          
    }
    else if(dstStride%16 == 0){
        vec_ste((vec_u32_t)vout, 0, (unsigned int*)dst);
        vec_ste((vec_u32_t)vec_sld(vout, vout, 12), 16, (unsigned int*)dst);            
        vec_ste((vec_u32_t)vec_sld(vout, vout, 8), 32, (unsigned int*)dst);             
        vec_ste((vec_u32_t)vec_sld(vout, vout, 4), 48, (unsigned int*)dst);             
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask2 = {0x08, 0x09, 0x0a, 0x0b, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(vout, vec_xl(dstStride*2, dst), v_mask2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout,  vec_xl(dstStride*3, dst), v_mask3);
        vec_xst(v3, dstStride*3, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 33>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[0]* ref[off0 + 7] + f[0] * ref[off0 + 8] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[1]* ref[off1 + 7] + f[1] * ref[off1 + 8] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[2]* ref[off2 + 7] + f[2] * ref[off2 + 8] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[3]* ref[off3 + 7] + f[3] * ref[off3 + 8] + 16) >> 5);

            ...

            y=7;  off7 = offset[7]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[7]* ref[off7 + 0] + f[7] * ref[off7 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[7]* ref[off7 + 1] + f[7] * ref[off7 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[7]* ref[off7 + 2] + f[7] * ref[off7 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[7]* ref[off7 + 3] + f[7] * ref[off7 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[7]* ref[off7 + 7] + f[7] * ref[off7 + 8] + 16) >> 5);
        }
    */
    //mode 33:
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c};
    vec_u8_t mask6={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d};
    vec_u8_t mask7={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(1, srcPix0); /* ref[0..15], ref = srcPix0 + 1; the per-row offsets (0,1,2,3,4,4,5,6) are baked into mask0..mask7 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 4 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 5, 5 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 5, 6 */
    vec_u8_t srv7 = vec_perm(srv, srv, mask7); /* 6, 7 */

vec_u8_t vfrac8_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac8_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac8_2 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac8_3 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 16, 16, 16, 16, 16, 16, 16, 16};

vec_u8_t vfrac8_32_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac8_32_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac8_32_2 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac8_32_3 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 16, 16, 16, 16, 16, 16, 16, 16};
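    /* Each 16-byte vector packs two 8-pixel rows here: vfrac8_N carries
       fraction[2N] in its low eight lanes and fraction[2N+1] in its high eight
       (26,20 / 14,8 / 2,28 / 22,16), and vfrac8_32_N the matching
       32 - fraction weights, so four multiply/add/shift passes cover all
       eight rows. */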

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5); */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv2, vfrac8_32_1); 
    vmle1 = vec_mule(srv3, vfrac8_1); 
    vmlo1 = vec_mulo(srv3, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv4, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv4, vfrac8_32_2); 
    vmle1 = vec_mule(srv5, vfrac8_2); 
    vmlo1 = vec_mulo(srv5, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv6, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv6, vfrac8_32_3);
    vmle1 = vec_mule(srv7, vfrac8_3); 
    vmlo1 = vec_mulo(srv7, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    if(dstStride==8){
        vec_xst(vout_0, 0, dst);                
        vec_xst(vout_1, 16, dst);               
        vec_xst(vout_2, 32, dst);               
        vec_xst(vout_3, 48, dst);               
    }
    else{
         vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_u8_t v_mask1 = {0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(vout_0, vec_xl(0, dst), v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(vout_0, vec_xl(dstStride, dst), v_mask1);
        vec_xst(v1, dstStride, dst);

        vec_u8_t v2 = vec_perm(vout_1, vec_xl(dstStride*2, dst), v_mask0);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(vout_1, vec_xl(dstStride*3, dst), v_mask1);
        vec_xst(v3, dstStride*3, dst);

        vec_u8_t v4 = vec_perm(vout_2, vec_xl(dstStride*4, dst), v_mask0);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(vout_2, vec_xl(dstStride*5, dst), v_mask1);
        vec_xst(v5, dstStride*5, dst);

        vec_u8_t v6 = vec_perm(vout_3, vec_xl(dstStride*6, dst), v_mask0);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(vout_3, vec_xl(dstStride*7, dst), v_mask1);
        vec_xst(v7, dstStride*7, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 33>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[0]* ref[off0 + 15] + f[0] * ref[off0 + 16] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[1]* ref[off1 + 15] + f[1] * ref[off1 + 16] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[2]* ref[off2 + 15] + f[2] * ref[off2 + 16] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[3]* ref[off3 + 15] + f[3] * ref[off3 + 16] + 16) >> 5);

            ...

            y=15;  off15 = offset[15]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[15]* ref[off15 + 0] + f[15] * ref[off15 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[15]* ref[off15 + 1] + f[15] * ref[off15 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[15]* ref[off15 + 2] + f[15] * ref[off15 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[15]* ref[off15 + 3] + f[15] * ref[off15 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[15]* ref[off15 + 15] + f[15] * ref[off15 + 16] + 16) >> 5);
        }
    */
    //mode 33:
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
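    /* Scalar reference for this specialization (a sketch, using the offset[]
       and fraction[] tables above; ref = srcPix0 + 1, f32[y] = 32 - fraction[y]):

           for (int y = 0; y < 16; y++)
               for (int x = 0; x < 16; x++)
                   dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x]
                                                     + fraction[y] * ref[offset[y] + x + 1] + 16) >> 5);
    */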
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0); /* ref[0..15], ref = srcPix0 + 1 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[16..31] */
    vec_u8_t srv0 = sv0; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);

    vec_u8_t vfrac16_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_1 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_2 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_4 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_5 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_6 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_8 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_9 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_10 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_12 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_13 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_14 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    vec_u8_t vfrac16_32_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_2 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_32_4 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_5 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_32_8 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_9 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_13 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_14 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
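    /* one_line(refLo, refHi, f32, f, out) presumably expands to the same
       even/odd widening multiply-accumulate that is written out explicitly
       in one_ang_pred_altivec<4, 6> further below:

           vmle0 = vec_mule(refLo, f32);  // (32 - fraction) * ref[off + x], even byte lanes
           vmlo0 = vec_mulo(refLo, f32);  // ... odd byte lanes
           vmle1 = vec_mule(refHi, f);    // fraction * ref[off + x + 1]
           vmlo1 = vec_mulo(refHi, f);
           ve    = vec_sra(vec_add(vec_add(vmle0, vmle1), u16_16), u16_5);
           vo    = vec_sra(vec_add(vec_add(vmlo0, vmlo1), u16_16), u16_5);
           out   = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    */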

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv3, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv4, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv5, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv4, srv5, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv5, srv6, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv6, srv7, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv7, srv8, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv8, srv9, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv8, srv9, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv9, srva, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srva, srvb, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srvb, srvc, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srvc, srvd, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srvd, srve, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, dstStride, dst);            
    vec_xst(vout_2, dstStride*2, dst);          
    vec_xst(vout_3, dstStride*3, dst);          
    vec_xst(vout_4, dstStride*4, dst);          
    vec_xst(vout_5, dstStride*5, dst);          
    vec_xst(vout_6, dstStride*6, dst);          
    vec_xst(vout_7, dstStride*7, dst);          
    vec_xst(vout_8, dstStride*8, dst);          
    vec_xst(vout_9, dstStride*9, dst);          
    vec_xst(vout_10, dstStride*10, dst);                
    vec_xst(vout_11, dstStride*11, dst);                
    vec_xst(vout_12, dstStride*12, dst);                
    vec_xst(vout_13, dstStride*13, dst);                
    vec_xst(vout_14, dstStride*14, dst);                
    vec_xst(vout_15, dstStride*15, dst);                

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 33>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[0]* ref[off0 + 31] + f[0] * ref[off0 + 32] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[1]* ref[off1 + 31] + f[1] * ref[off1 + 32] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[2]* ref[off2 + 31] + f[2] * ref[off2 + 32] + 16) >> 5);

            ...
            
            y=15;  off15 = offset[15]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[15]* ref[off15 + 0] + f[15] * ref[off15 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[15]* ref[off15 + 1] + f[15] * ref[off15 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[15]* ref[off15 + 2] + f[15] * ref[off15 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[15]* ref[off15 + 3] + f[15] * ref[off15 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[15]* ref[off15 + 31] + f[15] * ref[off15 + 32] + 16) >> 5);
 
            ...

            y=31;  off31 = offset[31]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[31]* ref[off31 + 0] + f[31] * ref[off31 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[31]* ref[off31 + 1] + f[31] * ref[off31 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[31]* ref[off31 + 2] + f[31] * ref[off31 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[31]* ref[off31 + 3] + f[31] * ref[off31 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[31]* ref[off31 + 31] + f[31] * ref[off31 + 32] + 16) >> 5);
        }
    */
    //mode 33:
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
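    /* Scalar reference (a sketch): same recurrence as the 16x16 case above,
       extended to x = 0..31; each row is emitted below as two 16-byte vector
       stores covering columns 0-15 and 16-31:

           for (int y = 0; y < 32; y++)
               for (int x = 0; x < 32; x++)
                   dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x]
                                                     + fraction[y] * ref[offset[y] + x + 1] + 16) >> 5);
    */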
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u8_t mask15={0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(1, srcPix0); /* ref[0..15], ref = srcPix0 + 1 */
    vec_u8_t sv1 = vec_xl(17, srcPix0); /* ref[16..31] */
    vec_u8_t sv2 = vec_xl(33, srcPix0); /* ref[32..47] */
    vec_u8_t sv3 = vec_xl(49, srcPix0); /* ref[48..63] */

    vec_u8_t srv0 = sv0; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);

    vec_u8_t srv00 = sv1; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv20 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv30 = vec_perm(sv1, sv2, mask3);
    vec_u8_t srv40 = vec_perm(sv1, sv2, mask4);
    vec_u8_t srv50 = vec_perm(sv1, sv2, mask5);
    vec_u8_t srv60 = vec_perm(sv1, sv2, mask6);
    vec_u8_t srv70 = vec_perm(sv1, sv2, mask7);
    vec_u8_t srv80 = vec_perm(sv1, sv2, mask8);
    vec_u8_t srv90 = vec_perm(sv1, sv2, mask9);
    vec_u8_t srva0 = vec_perm(sv1, sv2, mask10);
    vec_u8_t srvb0 = vec_perm(sv1, sv2, mask11);
    vec_u8_t srvc0 = vec_perm(sv1, sv2, mask12);
    vec_u8_t srvd0 = vec_perm(sv1, sv2, mask13);
    vec_u8_t srve0 = vec_perm(sv1, sv2, mask14);
    vec_u8_t srvf0 = vec_perm(sv1, sv2, mask15);

    vec_u8_t srv000 = sv2;
    vec_u8_t srv100 = vec_perm(sv2, sv3, mask1);
    vec_u8_t srv200 = vec_perm(sv2, sv3, mask2);
    vec_u8_t srv300 = vec_perm(sv2, sv3, mask3);
    vec_u8_t srv400 = vec_perm(sv2, sv3, mask4);
    vec_u8_t srv500 = vec_perm(sv2, sv3, mask5);
    vec_u8_t srv600 = vec_perm(sv2, sv3, mask6);
    vec_u8_t srv700 = vec_perm(sv2, sv3, mask7);
    vec_u8_t srv800 = vec_perm(sv2, sv3, mask8);
    vec_u8_t srv900 = vec_perm(sv2, sv3, mask9);
    vec_u8_t srva00 = vec_perm(sv2, sv3, mask10);
    vec_u8_t srvb00 = vec_perm(sv2, sv3, mask11);

    vec_u8_t vfrac16_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_1 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_2 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_4 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_5 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_6 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_8 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_9 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_10 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_12 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_13 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_14 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    vec_u8_t vfrac16_16 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_17 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_18 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_19 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_20 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_21 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_22 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_23 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_24 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_25 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_26 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_27 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_28 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_29 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_30 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    vec_u8_t vfrac16_32_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_2 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_32_4 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_5 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_32_8 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_9 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_13 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_14 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
    vec_u8_t vfrac16_32_16 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_17 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_18 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_19 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_32_20 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_21 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_22 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_23 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_32_24 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_25 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_32_26 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_27 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_28 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_29 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_30 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;


    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv00, srv10, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv10, srv20, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv3, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv20, srv30, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv4, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv30, srv40, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv5, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv40, srv50, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv4, srv5, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv40, srv50, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv5, srv6, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv50, srv60, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv6, srv7, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv60, srv70, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv7, srv8, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv70, srv80, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv8, srv9, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv80, srv90, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv8, srv9, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv80, srv90, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv9, srva, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv90, srva0, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srva, srvb, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srva0, srvb0,  vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srvb, srvc, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srvb0, srvc0, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srvc, srvd, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srvc0, srvd0, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srvd, srve, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srvd0, srve0, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, dstStride, dst);            
    vec_xst(vout_3, dstStride+16, dst);         
    vec_xst(vout_4, dstStride*2, dst);          
    vec_xst(vout_5, dstStride*2+16, dst);               
    vec_xst(vout_6, dstStride*3, dst);          
    vec_xst(vout_7, dstStride*3+16, dst);               
    vec_xst(vout_8, dstStride*4, dst);          
    vec_xst(vout_9, dstStride*4+16, dst);               
    vec_xst(vout_10, dstStride*5, dst);         
    vec_xst(vout_11, dstStride*5+16, dst);              
    vec_xst(vout_12, dstStride*6, dst);         
    vec_xst(vout_13, dstStride*6+16, dst);              
    vec_xst(vout_14, dstStride*7, dst);         
    vec_xst(vout_15, dstStride*7+16, dst);              
    vec_xst(vout_16, dstStride*8, dst);         
    vec_xst(vout_17, dstStride*8+16, dst);              
    vec_xst(vout_18, dstStride*9, dst);         
    vec_xst(vout_19, dstStride*9+16, dst);              
    vec_xst(vout_20, dstStride*10, dst);                
    vec_xst(vout_21, dstStride*10+16, dst);             
    vec_xst(vout_22, dstStride*11, dst);                
    vec_xst(vout_23, dstStride*11+16, dst);             
    vec_xst(vout_24, dstStride*12, dst);                
    vec_xst(vout_25, dstStride*12+16, dst);             
    vec_xst(vout_26, dstStride*13, dst);                
    vec_xst(vout_27, dstStride*13+16, dst);             
    vec_xst(vout_28, dstStride*14, dst);                
    vec_xst(vout_29, dstStride*14+16, dst);             
    vec_xst(vout_30, dstStride*15, dst);                
    vec_xst(vout_31, dstStride*15+16, dst);             

    one_line(srvd, srve, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srvd0, srve0, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srve, srvf, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srve0, srvf0, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srvf, srv00, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srvf0, srv000, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv00, srv10, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv000, srv100, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv10, srv20, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv100, srv200, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv10, srv20, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv100, srv200, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv20, srv30, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv200, srv300, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv30, srv40, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv300, srv400, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv40, srv50, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv400, srv500, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv50, srv60, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv500, srv600, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv50, srv60, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv500, srv600, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv60, srv70, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv600, srv700, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv70, srv80, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv700, srv800, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv80, srv90, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv800, srv900, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv90, srva0, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv900, srva00, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srva0, srvb0, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srva00, srvb00, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, dstStride*16, dst);         
    vec_xst(vout_1, dstStride*16+16, dst);              
    vec_xst(vout_2, dstStride*17, dst);         
    vec_xst(vout_3, dstStride*17+16, dst);              
    vec_xst(vout_4, dstStride*18, dst);         
    vec_xst(vout_5, dstStride*18+16, dst);              
    vec_xst(vout_6, dstStride*19, dst);         
    vec_xst(vout_7, dstStride*19+16, dst);              
    vec_xst(vout_8, dstStride*20, dst);         
    vec_xst(vout_9, dstStride*20+16, dst);              
    vec_xst(vout_10, dstStride*21, dst);                
    vec_xst(vout_11, dstStride*21+16, dst);             
    vec_xst(vout_12, dstStride*22, dst);                
    vec_xst(vout_13, dstStride*22+16, dst);             
    vec_xst(vout_14, dstStride*23, dst);                
    vec_xst(vout_15, dstStride*23+16, dst);             
    vec_xst(vout_16, dstStride*24, dst);                
    vec_xst(vout_17, dstStride*24+16, dst);             
    vec_xst(vout_18, dstStride*25, dst);                
    vec_xst(vout_19, dstStride*25+16, dst);             
    vec_xst(vout_20, dstStride*26, dst);                
    vec_xst(vout_21, dstStride*26+16, dst);             
    vec_xst(vout_22, dstStride*27, dst);                
    vec_xst(vout_23, dstStride*27+16, dst);             
    vec_xst(vout_24, dstStride*28, dst);                
    vec_xst(vout_25, dstStride*28+16, dst);             
    vec_xst(vout_26, dstStride*29, dst);                
    vec_xst(vout_27, dstStride*29+16, dst);             
    vec_xst(vout_28, dstStride*30, dst);                
    vec_xst(vout_29, dstStride*30+16, dst);             
    vec_xst(vout_30, dstStride*31, dst);                
    vec_xst(vout_31, dstStride*31+16, dst);             


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<4, 34>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    if(dstStride == 4) {        
        const vec_u8_t srcV = vec_xl(2, srcPix0); 
        const vec_u8_t mask = {0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03,0x04, 0x02, 0x03,0x04,0x05, 0x03,0x04,0x05, 0x06}; 
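        /* the mask packs all four rows into one vector: row y occupies lanes
           4y..4y+3 and reads srcPix0[2+y .. 5+y] */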
        vec_u8_t vout = vec_perm(srcV, srcV, mask);
        vec_xst(vout, 0, dst); 
    }
    else if(dstStride%16 == 0){
        vec_u8_t v0 = vec_xl(2, srcPix0);
        vec_ste((vec_u32_t)v0, 0, (unsigned int*)dst);
        vec_u8_t v1 = vec_xl(3, srcPix0);
        vec_ste((vec_u32_t)v1, 0, (unsigned int*)(dst+dstStride));
        vec_u8_t v2 = vec_xl(4, srcPix0);
        vec_ste((vec_u32_t)v2, 0, (unsigned int*)(dst+dstStride*2));
        vec_u8_t v3 = vec_xl(5, srcPix0);
        vec_ste((vec_u32_t)v3, 0, (unsigned int*)(dst+dstStride*3));
    }
    else{
        const vec_u8_t srcV = vec_xl(2, srcPix0); /* mode 34 reads the reference row starting at srcPix0 + 2 */
        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
        const vec_u8_t mask_1 = {0x01, 0x02, 0x03, 0x04, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
        const vec_u8_t mask_2 = {0x02, 0x03, 0x04, 0x05, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
        const vec_u8_t mask_3 = {0x03, 0x04, 0x05, 0x06, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; 
        vec_u8_t v0 = vec_perm(srcV, vec_xl(0, dst), mask_0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(srcV, vec_xl(dstStride, dst), mask_1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(srcV, vec_xl(dstStride*2, dst), mask_2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(srcV,  vec_xl(dstStride*3, dst), mask_3);
        vec_xst(v3, dstStride*3, dst);
    }
#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<8, 34>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    if(dstStride == 8) {        
        const vec_u8_t srcV1 = vec_xl(2, srcPix0); /* mode 34 reads the reference row starting at srcPix0 + 2 */
        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03,0x04, 0x05, 0x06, 0x07, 0x08};
        const vec_u8_t mask_1 = {0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
        const vec_u8_t mask_2 = {0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c};
        const vec_u8_t mask_3 = {0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e};
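        /* each mask packs two 8-pixel rows into one 16-byte vector:
           mask_0 -> rows 0-1, mask_1 -> rows 2-3, mask_2 -> rows 4-5,
           mask_3 -> rows 6-7, each row shifted one sample further along
           the reference */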
        vec_u8_t v0 = vec_perm(srcV1, srcV1, mask_0);
        vec_u8_t v1 = vec_perm(srcV1, srcV1, mask_1);
        vec_u8_t v2 = vec_perm(srcV1, srcV1, mask_2);
        vec_u8_t v3 = vec_perm(srcV1, srcV1, mask_3);
        vec_xst(v0, 0, dst);
        vec_xst(v1, 16, dst); 
        vec_xst(v2, 32, dst); 
        vec_xst(v3, 48, dst); 
    }
    else{
        const vec_u8_t srcV1 = vec_xl(2, srcPix0); /* mode 34 reads the reference row starting at srcPix0 + 2 */
        const vec_u8_t mask_0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_1 = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_2 = {0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_3 = {0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_4 = {0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_5 = {0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_6 = {0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        const vec_u8_t mask_7 = {0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t v0 = vec_perm(srcV1, vec_xl(0, dst), mask_0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(srcV1, vec_xl(dstStride, dst), mask_1);
        vec_xst(v1, dstStride, dst);
        vec_u8_t v2 = vec_perm(srcV1, vec_xl(dstStride*2, dst), mask_2);
        vec_xst(v2, dstStride*2, dst);
        vec_u8_t v3 = vec_perm(srcV1,  vec_xl(dstStride*3, dst), mask_3);
        vec_xst(v3, dstStride*3, dst);
        vec_u8_t v4 = vec_perm(srcV1,  vec_xl(dstStride*4, dst), mask_4);
        vec_xst(v4, dstStride*4, dst);
        vec_u8_t v5 = vec_perm(srcV1,  vec_xl(dstStride*5, dst), mask_5);
        vec_xst(v5, dstStride*5, dst);
        vec_u8_t v6 = vec_perm(srcV1,  vec_xl(dstStride*6, dst), mask_6);
        vec_xst(v6, dstStride*6, dst);
        vec_u8_t v7 = vec_perm(srcV1,  vec_xl(dstStride*7, dst), mask_7);
        vec_xst(v7, dstStride*7, dst);
    }
        
#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<16, 34>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    int i;
    for(i=0; i<16; i++){
        vec_xst(vec_xl(2+i, srcPix0), i*dstStride, dst); /* row i = srcPix0[2+i .. 17+i] */
    }
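    /* Equivalent scalar form (a sketch): dst[y * dstStride + x] = srcPix0[2 + y + x];
       mode 34 has fraction == 0 on every row, so it is a pure diagonal copy. */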

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x <16; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void intra_pred<32, 34>(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int bFilter)
{
    int i;
    int off;
    for(i=0; i<32; i++){
        off = i*dstStride;              
        vec_xst(vec_xl(2+i, srcPix0), off, dst); /* columns 0-15 of row i, from srcPix0 + 2 + i */
        vec_xst(vec_xl(18+i, srcPix0), off+16, dst); /* columns 16-31 of row i */
    }
#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x <32; x++)
            {
                printf("%d ",dst[y * dstStride + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<int width>
void intra_pred_ang_altivec(pixel* dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
{
    const int size = width;
    switch(dirMode){
    case 2:             
        intra_pred<size, 2>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 3:             
        intra_pred<size, 3>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 4:             
        intra_pred<size, 4>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 5:             
        intra_pred<size, 5>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 6:             
        intra_pred<size, 6>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 7:             
        intra_pred<size, 7>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 8:             
        intra_pred<size, 8>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 9:             
        intra_pred<size, 9>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 10:            
        intra_pred<size, 10>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 11:            
        intra_pred<size, 11>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 12:            
        intra_pred<size, 12>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 13:            
        intra_pred<size, 13>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 14:            
        intra_pred<size, 14>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 15:            
        intra_pred<size, 15>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 16:            
        intra_pred<size, 16>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 17:            
        intra_pred<size, 17>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 18:            
        intra_pred<size, 18>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 19:            
        intra_pred<size, 19>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 20:            
        intra_pred<size, 20>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 21:            
        intra_pred<size, 21>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 22:            
        intra_pred<size, 22>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 23:            
        intra_pred<size, 23>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 24:            
        intra_pred<size, 24>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 25:            
        intra_pred<size, 25>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 26:            
        intra_pred<size, 26>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 27:            
        intra_pred<size, 27>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 28:            
        intra_pred<size, 28>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 29:            
        intra_pred<size, 29>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 30:            
        intra_pred<size, 30>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 31:            
        intra_pred<size, 31>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 32:            
        intra_pred<size, 32>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 33:            
        intra_pred<size, 33>(dst, dstStride, srcPix0, bFilter);
        return;         
    case 34:            
        intra_pred<size, 34>(dst, dstStride, srcPix0, bFilter);
        return;
    default:
        printf("No supported intra prediction mode\n");
        exit(1);                
    }
}
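/* Typical invocation (a sketch; the function-pointer wiring is done in
   setupIntraPrimitives_altivec at the end of this file, and refPix is a
   hypothetical buffer of reference samples laid out as the specializations
   above expect):

       pixel dst[32 * 32];
       intra_pred_ang_altivec<32>(dst, 32, refPix, 33, 0);
*/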

template<int dstStride, int dirMode>
void one_ang_pred_altivec(pixel* dst, const pixel *srcPix0, int bFilter){};

template<>
void one_ang_pred_altivec<4, 2>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 2>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 2>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 2>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 2>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 2>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 2>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 2>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 18>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 18>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 18>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 18>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 18>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 18>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 18>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 18>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 19>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 19>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 19>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 19>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 19>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 19>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 19>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 19>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 20>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 20>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 20>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 20>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 20>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 20>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 20>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 20>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 21>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 21>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 21>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 21>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 21>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 21>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 21>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 21>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 22>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 22>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 22>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 22>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 22>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 22>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 22>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 22>(dst, 32, srcPix0, bFilter);
    return;      
}


template<>
void one_ang_pred_altivec<4, 23>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 23>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 23>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 23>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 23>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 23>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 23>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 23>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 24>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 24>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 24>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 24>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 24>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 24>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 24>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 24>(dst, 32, srcPix0, bFilter);
    return;      
}


template<>
void one_ang_pred_altivec<4, 25>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 25>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 25>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 25>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 25>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 25>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 25>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 25>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 27>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 27>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 27>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 27>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 27>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 27>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 27>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 27>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 28>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 28>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 28>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 28>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 28>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 28>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 28>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 28>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 29>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 29>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 29>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 29>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 29>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 29>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 29>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 29>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 30>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 30>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 30>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 30>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 30>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 30>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 30>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 30>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 31>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 31>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 31>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 31>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 31>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 31>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 31>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 31>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 32>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 32>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 32>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 32>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 32>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 32>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 32>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 32>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 33>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 33>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 33>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 33>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 33>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 33>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 33>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 33>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 34>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<4, 34>(dst, 4, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<8, 34>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<8, 34>(dst, 8, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<16, 34>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<16, 34>(dst, 16, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<32, 34>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    intra_pred<32, 34>(dst, 32, srcPix0, bFilter);
    return;      
}

template<>
void one_ang_pred_altivec<4, 6>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* ref[offset + x], ref = srcPix0 + 9 (left reference);  offset[0-3] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){13, 13, 13, 13, 26, 26, 26, 26, 7, 7, 7, 7, 20, 20, 20, 20}; /* fraction[0-3] */
    vec_u8_t vfrac4_32 = (vec_u8_t){19, 19, 19, 19, 6, 6, 6, 6, 25, 25, 25, 25, 12, 12, 12, 12}; /* 32 - fraction[0-3] */
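    /* all four 4-pixel rows are computed in a single 16-byte vector (row y
       occupies lanes 4y..4y+3), which is why every per-row fraction above is
       repeated four times */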


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
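    /* vec_mule/vec_mulo leave results split across even/odd byte lanes; the
       mergeh/mergel + pack sequence re-interleaves them into source order */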

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 6>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask4={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask5={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(17, srcPix0); /* ref[offset + x], ref = srcPix0 + 17 (left reference);  offset[0-7] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 0 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 1 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 2 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 3 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 2, 3 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 3, 4 */

    /* fraction[0-7] */
    vec_u8_t vfrac8_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac8_1 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac8_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14, 14, 14, 14, 14 }; 
    vec_u8_t vfrac8_3 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 8, 8, 8, 8, 8, 8, 8, 8}; 

    /* 32 - fraction[0-7] */
    vec_u8_t vfrac8_32_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac8_32_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac8_32_3 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 24, 24, 24, 24, 24, 24, 24, 24}; 
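
    /* Each 16-byte fraction constant packs two rows (8 lanes per row), so every
       mule/mulo block below produces two 8-pixel rows at once. */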

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));


    /* y2, y3 */        
    vmle0 = vec_mule(srv1, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv1, vfrac8_32_1); 
    vmle1 = vec_mule(srv2, vfrac8_1); 
    vmlo1 = vec_mulo(srv2, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv2, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv2, vfrac8_32_2); 
    vmle1 = vec_mule(srv3, vfrac8_2); 
    vmlo1 = vec_mulo(srv3, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv4, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv4, vfrac8_32_3);
    vmle1 = vec_mule(srv5, vfrac8_3); 
    vmlo1 = vec_mulo(srv5, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 6>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    /*vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};*/
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(33, srcPix0); /* ref[0..15]; ref = srcPix0 + width*2 + 1 (left neighbours) */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* ref[16..31] */
    vec_u8_t srv0 = sv0; /* srvN holds ref[N + x]; row y uses srv[offset[y]] and srv[offset[y] + 1] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    //vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    //vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    //vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    //vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    //vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    //vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    //vec_u8_t srve = vec_perm(sv0, sv1, mask14);

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
    vec_u8_t vfrac16_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
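
    /* The one_line() helper macro (defined earlier in this file) expands to the
       same mule/mulo + add + shift + pack sequence written out long-hand in the
       4x4 and 8x8 paths above, using the temporaries declared here. */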

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv1, srv2, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv2, srv3, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv2, srv3, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv2, srv3, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv3, srv4, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv3, srv4, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv4, srv5, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv4, srv5, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv4, srv5, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv5, srv6, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv5, srv6, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv6, srv7, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv6, srv7, vfrac16_32_15, vfrac16_15, vout_15);
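
    /* one_ang_pred_altivec takes no stride: dst is a packed width x width
       block, so row y starts at byte offset 16 * y here. */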

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 6>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0); /* ref[0..15]; ref = srcPix0 + width*2 + 1 (left neighbours) */
    vec_u8_t sv1 = vec_xl(81, srcPix0); /* ref[16..31] */
    vec_u8_t sv2 = vec_xl(97, srcPix0); /* ref[32..47] */

    vec_u8_t srv0 = sv0; /* srvN holds ref[N + x] for columns 0-15 */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);

    vec_u8_t srv00 = sv1; /* srvN0 holds ref[16 + N + x] for columns 16-31 */
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv20 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv30 = vec_perm(sv1, sv2, mask3);
    vec_u8_t srv40 = vec_perm(sv1, sv2, mask4);
    vec_u8_t srv50 = vec_perm(sv1, sv2, mask5);
    vec_u8_t srv60 = vec_perm(sv1, sv2, mask6);
    vec_u8_t srv70 = vec_perm(sv1, sv2, mask7);
    vec_u8_t srv80 = vec_perm(sv1, sv2, mask8);
    vec_u8_t srv90 = vec_perm(sv1, sv2, mask9);
    vec_u8_t srva0 = vec_perm(sv1, sv2, mask10);
    vec_u8_t srvb0 = vec_perm(sv1, sv2, mask11);
    vec_u8_t srvc0 = vec_perm(sv1, sv2, mask12);
    vec_u8_t srvd0 = vec_perm(sv1, sv2, mask13);
    vec_u8_t srve0 = vec_perm(sv1, sv2, mask14);
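
    /* A 32-wide row is handled as two 16-byte halves: srvN covers columns 0-15
       (ref[N + x]) and srvN0 covers the matching columns 16-31 (ref[16 + N + x]). */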


    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
    vec_u8_t vfrac16_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    vec_u8_t vfrac16_16 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_17 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_18 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_20 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_21 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_22 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_24 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_25 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_26 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_28 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_29 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_30 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 


    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    vec_u8_t vfrac16_32_16 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_32_17 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_18 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_32_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_20 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_32_21 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_32_22 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_32_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_32_24 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_32_25 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_32_26 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_32_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_32_28 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_32_29 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_30 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; 
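
    /* Row 31 lands exactly on a reference sample (fraction 0), so its weights
       are {32, 0} and that row is a straight copy of ref[offset + x]. */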

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv00, srv10, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv00, srv10, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv10, srv20, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv1, srv2, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv10, srv20, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv2, srv3, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv20, srv30, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv2, srv3, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv20, srv30, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv2, srv3, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv20, srv30, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv3, srv4, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv30, srv40, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv3, srv4, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv30, srv40, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv4, srv5, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv40, srv50, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv4, srv5, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv40, srv50, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv4, srv5, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv40, srv50, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv5, srv6, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv50, srv60,  vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv5, srv6, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv50, srv60, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv6, srv7, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv60, srv70, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv6, srv7, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv60, srv70, vfrac16_32_15, vfrac16_15, vout_31);
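
    /* Rows 0-15 of the packed 32x32 block: row y occupies bytes 32*y .. 32*y+31,
       written as two 16-byte halves. */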

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

    one_line(srv6, srv7, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv60, srv70, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv7, srv8, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv70, srv80, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv7, srv8, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv70, srv80, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv8, srv9, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv80, srv90, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv8, srv9, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv80, srv90, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv8, srv9, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv80, srv90, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv9, srva, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv90, srva0, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv9, srva, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv90, srva0, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srva, srvb, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srva0, srvb0, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srva, srvb, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srva0, srvb0, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srva, srvb, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srva0, srvb0, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srvb, srvc, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srvb0, srvc0, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srvb, srvc, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srvb0, srvc0, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srvc, srvd, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srvc0, srvd0, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srvc, srvd, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srvc0, srvd0, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srvd, srve, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srvd0, srve0, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 7>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //mode 7 (intraPredAngle = 9; same offset/fraction tables as its mirror, mode 29):
    //int offset[32] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9};
    //int fraction[32] = {9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0};

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* ref[offset + x]; ref = srcPix0 + width*2 + 1 (left neighbours), offset[0-3] = {0, 0, 0, 1} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* per-row gather: ref[offset[y] + x] for rows 0-3 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){9, 9, 9, 9, 18, 18, 18, 18, 27, 27, 27, 27, 4, 4, 4, 4}; /* fraction[0-3] */
    vec_u8_t vfrac4_32 = (vec_u8_t){23, 23, 23, 23, 14, 14, 14, 14, 5, 5, 5, 5, 28, 28, 28, 28}; /* 32 - fraction[0-3] */
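
    /* For mode 7 the HEVC angle is 9: offset[y] = ((y + 1) * 9) >> 5 and
       fraction[y] = ((y + 1) * 9) & 31, replicated across each 4-pixel row. */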


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 7>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //mode 7 (intraPredAngle = 9; same offset/fraction tables as its mirror, mode 29):
    //int offset[32] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9};
    //int fraction[32] = {9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0};
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask2={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask3={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask4={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask5={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(17, srcPix0); /* ref[offset + x]; ref = srcPix0 + width*2 + 1 (left neighbours), offset[0-7] = {0, 0, 0, 1, 1, 1, 1, 2} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 0 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 1 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 0, 1 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 1, 2 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 2, 2 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 2, 3 */
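
    /* The "a, b" comments above name the two row offsets packed into each
       vector: e.g. srv2 holds ref[0 + x] in its low eight lanes and ref[1 + x]
       in its high eight, for a row pair whose offsets differ. */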

    /* fraction[0-7] */
    vec_u8_t vfrac8_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac8_1 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac8_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac8_3 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 8, 8, 8, 8, 8, 8, 8, 8}; 

    /* 32 - fraction[0-7] */
    vec_u8_t vfrac8_32_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac8_32_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac8_32_3 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 24, 24, 24, 24, 24, 24, 24, 24}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));


    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv2, vfrac8_32_1); 
    vmle1 = vec_mule(srv3, vfrac8_1); 
    vmlo1 = vec_mulo(srv3, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv1, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv1, vfrac8_32_2); 
    vmle1 = vec_mule(srv4, vfrac8_2); 
    vmlo1 = vec_mulo(srv4, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv3, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv3, vfrac8_32_3);
    vmle1 = vec_mule(srv5, vfrac8_3); 
    vmlo1 = vec_mulo(srv5, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 7>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //mode 7 (intraPredAngle = 9; same offset/fraction tables as its mirror, mode 29):
    //int offset[32] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9};
    //int fraction[32] = {9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0};
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(33, srcPix0); /* ref[0..15]; ref = srcPix0 + width*2 + 1 (left neighbours) */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* ref[16..31] */
    vec_u8_t srv0 = sv0; /* srvN holds ref[N + x]; row y uses srv[offset[y]] and srv[offset[y] + 1] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
    vec_u8_t vfrac16_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_2 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv1, srv2, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv1, srv2, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv1, srv2, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv1, srv2, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv2, srv3, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv2, srv3, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv2, srv3, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv3, srv4, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv3, srv4, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv3, srv4, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv3, srv4, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv4, srv5, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv4, srv5, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 7>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //mode 7 (intraPredAngle = 9; same offset/fraction tables as its mirror, mode 29):
    //int offset[32] = {0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9};
    //int fraction[32] = {9, 18, 27, 4, 13, 22, 31, 8, 17, 26, 3, 12, 21, 30, 7, 16, 25, 2, 11, 20, 29, 6, 15, 24, 1, 10, 19, 28, 5, 14, 23, 0};

    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0); /* ref[0..15]; ref = srcPix0 + width*2 + 1 (left neighbours) */
    vec_u8_t sv1 = vec_xl(81, srcPix0); /* ref[16..31] */
    vec_u8_t sv2 = vec_xl(97, srcPix0); /* ref[32..47] */
    vec_u8_t srv0 = sv0; /* srvN holds ref[N + x] for columns 0-15 */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2); 
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3); 
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4); 
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5); 
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6); 
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7); 
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9); 
    vec_u8_t srva = vec_perm(sv0, sv1, mask10); 

    vec_u8_t srv00 = sv1;
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv20 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv30 = vec_perm(sv1, sv2, mask3); 
    vec_u8_t srv40 = vec_perm(sv1, sv2, mask4); 
    vec_u8_t srv50 = vec_perm(sv1, sv2, mask5); 
    vec_u8_t srv60 = vec_perm(sv1, sv2, mask6); 
    vec_u8_t srv70 = vec_perm(sv1, sv2, mask7); 
    vec_u8_t srv80 = vec_perm(sv1, sv2, mask8);
    vec_u8_t srv90 = vec_perm(sv1, sv2, mask9); 
    vec_u8_t srva0 = vec_perm(sv1, sv2, mask10); 


    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
    vec_u8_t vfrac16_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_2 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    vec_u8_t vfrac16_16 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_17 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }; 
    vec_u8_t vfrac16_18 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_19 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_20 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_21 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_22 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_24 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_25 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_26 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_27 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_28 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_29 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_30 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 


    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    vec_u8_t vfrac16_32_16 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_32_17 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_32_18 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_32_19 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_32_20 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_32_21 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_22 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_32_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_32_24 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_32_25 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_26 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_32_27 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_32_28 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_32_29 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_32_30 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;


    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv00, srv10, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv00, srv10, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv00, srv10, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv1, srv2, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv10, srv20, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv1, srv2, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv10, srv20, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv1, srv2, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv10, srv20, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv1, srv2, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv10, srv20, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv2, srv3, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv20, srv30, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv2, srv3, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv20, srv30, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv2, srv3, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv20, srv30, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv3, srv4, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv30, srv40, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv3, srv4, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv30, srv40, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv3, srv4, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv30, srv40,  vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv3, srv4, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv30, srv40, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv4, srv5, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv40, srv50, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv4, srv5, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv40, srv50, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

    one_line(srv4, srv5, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv40, srv50, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv5, srv6, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv50, srv60, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv5, srv6, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv50, srv60, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv5, srv6, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv50, srv60, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv5, srv6, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv50, srv60, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv6, srv7, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv60, srv70, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv6, srv7, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv60, srv70, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv6, srv7, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv60, srv70, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv7, srv8, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv70, srv80, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv7, srv8, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv70, srv80, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv7, srv8, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv70, srv80, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv7, srv8, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv70, srv80, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv8, srv9, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv80, srv90, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv8, srv9, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv80, srv90, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv8, srv9, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv80, srv90, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv9, srva, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv90, srva0, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 8>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //mode 28
    //int offset[32] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5};
    //int fraction[32] = {5, 10, 15, 20, 25, 30, 3, 8, 13, 18, 23, 28, 1, 6, 11, 16, 21, 26, 31, 4, 9, 14, 19, 24, 29, 2, 7, 12, 17, 22, 27, 0};
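    //both tables follow offset[y] = ((y + 1) * 5) >> 5 and fraction[y] = ((y + 1) * 5) & 31, i.e. the intraPredAngle-5 projection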

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* ref[offset + x], ref = srcPix0 + width * 2 + 1; offset[0-3] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){5, 5, 5, 5, 10, 10, 10, 10, 15, 15, 15, 15, 20, 20, 20, 20}; /* fraction[0-3] */
    vec_u8_t vfrac4_32 = (vec_u8_t){27, 27, 27, 27, 22, 22, 22, 22, 17, 17, 17, 17, 12, 12, 12, 12}; /* 32 - fraction[0-3] */
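    /* the whole 4x4 block sits in one vector: row y occupies bytes 4y..4y+3, so each fraction value is replicated four times */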


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
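    /* merge the even/odd 16-bit results back into source order, then narrow to 16 output pixels (four rows of four) */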
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    vec_xst(vout, 0, dst);              
   
#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 8>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //mode 28
    //int offset[32] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5};
    //int fraction[32] = {5, 10, 15, 20, 25, 30, 3, 8, 13, 18, 23, 28, 1, 6, 11, 16, 21, 26, 31, 4, 9, 14, 19, 24, 29, 2, 7, 12, 17, 22, 27, 0};
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(17, srcPix0); /* ref[offset + x], ref = srcPix0 + width * 2 + 1; offset[0-5] = 0, offset[6-7] = 1 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);

    /* fraction[0-7] */
    vec_u8_t vfrac8_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac8_1 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac8_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac8_3 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8}; 

    /* 32 - fraction[0-7] */
    vec_u8_t vfrac8_32_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac8_32_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac8_32_3 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 24, 24, 24, 24, 24, 24, 24, 24}; 
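
    /* each vector covers two rows of eight pixels; rows 6-7 move to the (srv1, srv2) pair once the offset reaches 1 */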

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv0, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv0, vfrac8_32_1); 
    vmle1 = vec_mule(srv1, vfrac8_1); 
    vmlo1 = vec_mulo(srv1, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv0, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv0, vfrac8_32_2); 
    vmle1 = vec_mule(srv1, vfrac8_2); 
    vmlo1 = vec_mulo(srv1, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y6, y7 */        
    vmle0 = vec_mule(srv1, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv1, vfrac8_32_3);
    vmle1 = vec_mule(srv2, vfrac8_3); 
    vmlo1 = vec_mulo(srv2, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
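    /* dst is the compact 8x8 block (stride 8): each 16-byte store writes two rows */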
    
    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 8>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(33, srcPix0); /* ref[0..15], ref = srcPix0 + width * 2 + 1 */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* ref[16..31] */
    vec_u8_t srv0 = sv0;                       /* ref[0 + x] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1); /* ref[1 + x] */
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2); /* ref[2 + x] */
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3); /* ref[3 + x] */

    //mode 28
    //int offset[32] = {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5};
    //int fraction[32] = {5, 10, 15, 20, 25, 30, 3, 8, 13, 18, 23, 28, 1, 6, 11, 16, 21, 26, 31, 4, 9, 14, 19, 24, 29, 2, 7, 12, 17, 22, 27, 0};
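    //row y blends the pair (srv[offset[y]], srv[offset[y] + 1]) with fraction[y] splatted across the row: rows 0-5 use (srv0, srv1), rows 6-11 (srv1, srv2), rows 12-15 (srv2, srv3)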

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vfrac16_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_2 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13,13, 13, 13, 13,13, 13, 13, 13}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv0, srv1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv0, srv1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv0, srv1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv1, srv2, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv1, srv2, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv1, srv2, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv1, srv2, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv1, srv2, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv1, srv2, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv2, srv3, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv2, srv3, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv2, srv3, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv2, srv3, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 8>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0); /* ref[0..15], ref = srcPix0 + width * 2 + 1 */
    vec_u8_t sv1 = vec_xl(81, srcPix0); /* ref[16..31] */
    vec_u8_t sv2 = vec_xl(97, srcPix0); /* ref[32..47] */
    /* left half of each row: ref[offset + x], x = 0..15; the row offsets need 0..6 */
    vec_u8_t srv0 = sv0;
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv12 = vec_perm(sv0, sv1, mask6);

    /* right half of each row: ref[offset + x], x = 16..31 */
    vec_u8_t srv4 = sv1;
    vec_u8_t srv5 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv6 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv7 = vec_perm(sv1, sv2, mask3);
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask4);
    vec_u8_t srv11 = vec_perm(sv1, sv2, mask5);
    vec_u8_t srv13 = vec_perm(sv1, sv2, mask6);

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vfrac16_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_2 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13,13, 13, 13, 13,13, 13, 13, 13}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    vec_u8_t vfrac16_16 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_17 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_18 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_20 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_21 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_22 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_24 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_25 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_26 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_28 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_29 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_30 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27}; 
    vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 

    vec_u8_t vfrac16_32_16 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11}; 
    vec_u8_t vfrac16_32_17 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_32_18 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 
    vec_u8_t vfrac16_32_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_20 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23}; 
    vec_u8_t vfrac16_32_21 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_32_22 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13}; 
    vec_u8_t vfrac16_32_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_32_24 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; 
    vec_u8_t vfrac16_32_25 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_32_26 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25}; 
    vec_u8_t vfrac16_32_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_32_28 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; 
    vec_u8_t vfrac16_32_29 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_32_30 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}; 
    vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;
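
    /* the 32x32 block is produced in two batches of 16 rows, each row as a
       left/right pair of one_line calls (presumably to keep the number of live
       vout vectors bounded) */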

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv4, srv5, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv4, srv5, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv4, srv5, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv0, srv1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv4, srv5, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv0, srv1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv4, srv5, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv0, srv1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv4, srv5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv1, srv2, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv5, srv6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv1, srv2, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv5, srv6, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv1, srv2, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv5, srv6, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv1, srv2, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv5, srv6, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv1, srv2, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv5, srv6, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv1, srv2, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv5, srv6, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv2, srv3, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv6, srv7, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv2, srv3, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv6, srv7, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv2, srv3, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv6, srv7, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv2, srv3, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv6, srv7, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

    one_line(srv2, srv3, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv6, srv7, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv2, srv3, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv6, srv7, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv2, srv3, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv6, srv7, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv3, srv8, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv7, srv10, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv3, srv8, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv7, srv10, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv3, srv8, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv7, srv10, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv3, srv8, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv7, srv10, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv3, srv8, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv7, srv10, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv3, srv8, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv7, srv10, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv8, srv9, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv10, srv11, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv8, srv9, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv10, srv11, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv8, srv9, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv10, srv11, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv8, srv9, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv10, srv11, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv8, srv9, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv10, srv11, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv8, srv9, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv10, srv11, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv9, srv12, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv11, srv13, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 9>(pixel* dst, const pixel *srcPix0, int bFilter)
{
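    //mode 27
    //int offset[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2};
    //int fraction[32] = {2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 0};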

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* ref[offset + x], ref = srcPix0 + width * 2 + 1; offset[0-3] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8}; /* fraction[0-3] */
    vec_u8_t vfrac4_32 = (vec_u8_t){30, 30, 30, 30, 28, 28, 28, 28, 26, 26, 26, 26, 24, 24, 24, 24}; /* 32 - fraction[0-3] */

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 9>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(17, srcPix0); /* ref[offset + x], ref = srcPix0 + width * 2 + 1; offset[0-7] = 0 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    /* fraction[0-7] */
    vec_u8_t vfrac8_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac8_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac8_2 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac8_3 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 16, 16, 16, 16}; 

    /* 32 - fraction[0-7] */
    vec_u8_t vfrac8_32_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac8_32_2 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac8_32_3 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 16, 16, 16, 16, 16, 16, 16, 16}; 
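
    /* with intraPredAngle 2 the offset stays 0 through row 7, so every row pair blends the same (srv0, srv1) sources */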

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv0, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv0, vfrac8_32_1); 
    vmle1 = vec_mule(srv1, vfrac8_1); 
    vmlo1 = vec_mulo(srv1, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv0, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv0, vfrac8_32_2); 
    vmle1 = vec_mule(srv1, vfrac8_2); 
    vmlo1 = vec_mulo(srv1, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y6, y7 */        
    vmle0 = vec_mule(srv0, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv0, vfrac8_32_3);
    vmle1 = vec_mule(srv1, vfrac8_3); 
    vmlo1 = vec_mulo(srv1, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 9>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(33, srcPix0); /* ref[0..15], ref = srcPix0 + width * 2 + 1; offset[0-14] = 0, offset[15] = 1 */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* ref[16..31] */
    vec_u8_t srv0 = vec_perm(sv0, sv1, mask0); /* ref[0 + x]: rows 0-14 (offset 0) */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_1 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_2 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30,30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30,30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; 
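
    /* fraction[15] is 0 with offset 1, so the last row's weights (32, 0) reduce one_line to a copy of srv1 (ref[1 + x]) */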

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv0, srv1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv0, srv1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv0, srv1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv0, srv1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv0, srv1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv0, srv1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv0, srv1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv0, srv1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv0, srv1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv0, srv1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv0, srv1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv0, srv1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv1, srv2, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           
    vec_xst(vout_4, 64, dst);           
    vec_xst(vout_5, 80, dst);           
    vec_xst(vout_6, 96, dst);           
    vec_xst(vout_7, 112, dst);          
    vec_xst(vout_8, 128, dst);          
    vec_xst(vout_9, 144, dst);          
    vec_xst(vout_10, 160, dst);         
    vec_xst(vout_11, 176, dst);         
    vec_xst(vout_12, 192, dst);         
    vec_xst(vout_13, 208, dst);         
    vec_xst(vout_14, 224, dst);         
    vec_xst(vout_15, 240, dst);         

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 9>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0); /* ref[0..15], ref = srcPix0 + width * 2 + 1; offset[0-14] = 0, offset[15-30] = 1, offset[31] = 2 */
    vec_u8_t sv1 = vec_xl(81, srcPix0); /* ref[16..31] */
    vec_u8_t sv2 = vec_xl(97, srcPix0); /* ref[32..47] */
    vec_u8_t srv0 = sv0;                       /* rows 0-14 use srv0, srv1 */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1); /* from y = 15, use srv1, srv2 */
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3); /* y = 31, use srv2, srv3 */

    vec_u8_t srv4 = sv1;                       /* right halves of the same rows */
    vec_u8_t srv5 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv6 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv7 = vec_perm(sv1, sv2, mask3);

    /* fraction[0-15] */
    vec_u8_t vfrac16_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_1 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_2 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; 
    vec_u8_t vfrac16_4 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_5 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12}; 
    vec_u8_t vfrac16_6 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
    vec_u8_t vfrac16_8 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_9 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20}; 
    vec_u8_t vfrac16_10 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_12 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_13 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_14 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30,30, 30, 30, 30, 30, 30, 30, 30}; 
    vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 

    /* 32 - fraction[0-15] */
    vec_u8_t vfrac16_32_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30,30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28}; 
    vec_u8_t vfrac16_32_2 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26}; 
    vec_u8_t vfrac16_32_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24}; 
    vec_u8_t vfrac16_32_4 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22}; 
    vec_u8_t vfrac16_32_5 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18}; 
    vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; 
    vec_u8_t vfrac16_32_8 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}; 
    vec_u8_t vfrac16_32_9 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}; 
    vec_u8_t vfrac16_32_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; 
    vec_u8_t vfrac16_32_13 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; 
    vec_u8_t vfrac16_32_14 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; 
    vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32}; 

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv4, srv5, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv4, srv5, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv4, srv5, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv0, srv1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv4, srv5, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv0, srv1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv4, srv5, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv0, srv1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv4, srv5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv0, srv1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv4, srv5, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv0, srv1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv4, srv5, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv0, srv1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv4, srv5, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv0, srv1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv4, srv5, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv0, srv1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv4, srv5, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv0, srv1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv4, srv5, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv0, srv1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv4, srv5, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv0, srv1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv4, srv5, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv0, srv1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv4, srv5, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv1, srv2, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv5, srv6, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            
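
    /* rows 16-31: with intraPredAngle 2, fraction[y + 16] == fraction[y] while the
       offset grows by one, so the fraction vectors above are reused with the
       sources advanced one byte (srv1/srv2 left, srv5/srv6 right) */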


    one_line(srv1, srv2, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv5, srv6, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv5, srv6, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv5, srv6, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv1, srv2, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv5, srv6, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv1, srv2, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv5, srv6, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv1, srv2, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv5, srv6, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv1, srv2, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv5, srv6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv1, srv2, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv5, srv6, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv1, srv2, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv5, srv6, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv1, srv2, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv5, srv6, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv1, srv2, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv5, srv6, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv1, srv2, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv5, srv6, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv1, srv2, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv5, srv6, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv1, srv2, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv5, srv6, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv1, srv2, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv5, srv6, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv2, srv3, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv6, srv7, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 10>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srcV = vec_xl(9, srcPix0); /* offset = width2+1 = width<<1 + 1 */
    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(0, srcPix0);            
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srcV, u8_to_s16_b0_mask));
        vec_s16_t v0_s16 = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_w4x4_mask1));
        vec_s16_t v1_s16 =  (vec_s16_t)vec_sra( vec_sub(v0_s16, c0_s16v), one_u16v );
        vec_s16_t v_sum = vec_add(c1_s16v, v1_s16);
        vec_u16_t v_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v_sum));
        vec_u8_t v_filter_u8 = vec_pack(v_filter_u16, zero_u16v); 
        vec_u8_t mask = {0x00, 0x11, 0x12, 0x13, 0x01, 0x11, 0x12, 0x13, 0x02, 0x11, 0x12, 0x13, 0x03, 0x11, 0x12, 0x13};               
        vec_u8_t v0 = vec_perm(v_filter_u8, srcV, mask);                
        vec_xst(v0, 0, dst);
    }
    else{
        vec_u8_t v0 = (vec_u8_t)vec_splat((vec_u32_t)srcV, 0);
        vec_xst(v0, 0, dst);
    }

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
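
/* For reference, a scalar sketch of the bFilter path above -- an illustration
 * only, assuming u8_to_s16_w4x4_mask1 widens srcPix0[1..4], min_s16v holds the
 * 8-bit maximum (255), and filtered[] is a hypothetical staging buffer:
 *
 *     for (int i = 0; i < 4; i++)
 *     {
 *         int v = srcPix0[9] + ((srcPix0[1 + i] - srcPix0[0]) >> 1);
 *         filtered[i] = (pixel)x265_clip3(0, 255, v);
 *     }
 *
 * The closing vec_perm then places filtered[i] in the first lane of row i and
 * fills the remaining lanes from the unfiltered neighbours srcPix0[10..12]. */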

template<>
void one_ang_pred_altivec<8, 10>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srcV = vec_xl(17, srcPix0); /* offset = width2+1 = width<<1 + 1 */

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(0, srcPix0);
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srcV, u8_to_s16_b0_mask));
        vec_s16_t v0_s16 = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_w8x8_mask1));
        vec_s16_t v1_s16 =  (vec_s16_t)vec_sra( vec_sub(v0_s16, c0_s16v), one_u16v );
        vec_s16_t v_sum = vec_add(c1_s16v, v1_s16);
        vec_u16_t v_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v_sum));
        vec_u8_t v_filter_u8 = vec_pack(v_filter_u16, zero_u16v); 
        vec_u8_t v_mask0 = {0x00, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x01, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
        vec_u8_t v_mask1 = {0x02, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x03, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
        vec_u8_t v_mask2 = {0x04, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x05, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
        vec_u8_t v_mask3 = {0x06, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x07, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
        vec_u8_t v0 = vec_perm(v_filter_u8, srcV, v_mask0);
        vec_xst(v0, 0, dst);
        vec_u8_t v1 = vec_perm(v_filter_u8, srcV, v_mask1);
        vec_xst(v1, 16, dst);
        vec_u8_t v2 = vec_perm(v_filter_u8, srcV, v_mask2);
        vec_xst(v2, 32, dst);
        vec_u8_t v3 = vec_perm(v_filter_u8, srcV, v_mask3);
        vec_xst(v3, 48, dst);
    }
    else{
        vec_u8_t v_mask0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
        vec_u8_t v0 = vec_perm(srcV, srcV, v_mask0);
        vec_xst(v0, 0, dst);
        vec_xst(v0, 16, dst);
        vec_xst(v0, 32, dst);
        vec_xst(v0, 48, dst);
    }
        
#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          

}

template<>
void one_ang_pred_altivec<16, 10>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv = vec_xl(33, srcPix0); /* offset = width2+1 = width<<1 + 1 */

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(0, srcPix0);
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_u8_t  srcv1 = vec_xl(1, srcPix0);           
        vec_s16_t v0h_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskh));
        vec_s16_t v0l_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskl));
        vec_s16_t v1h_s16 =  (vec_s16_t)vec_sra( vec_sub(v0h_s16, c0_s16v), one_u16v );
        vec_s16_t v1l_s16 =  (vec_s16_t)vec_sra( vec_sub(v0l_s16, c0_s16v), one_u16v );
        vec_s16_t vh_sum = vec_add(c1_s16v, v1h_s16);
        vec_s16_t vl_sum = vec_add(c1_s16v, v1l_s16);
        vec_u16_t vh_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vh_sum));
        vec_u16_t vl_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vl_sum));
        vec_u8_t v_filter_u8 = vec_pack(vh_filter_u16, vl_filter_u16); 
                
        vec_u8_t mask0 = {0x00, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask1 = {0x01, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask2 = {0x02, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask3 = {0x03, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask4 = {0x04, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask5 = {0x05, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask6 = {0x06, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask7 = {0x07, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask8 = {0x08, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask9 = {0x09, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask10 = {0xa, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask11 = {0xb, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask12 = {0xc, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask13 = {0xd, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask14 = {0xe, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask15 = {0xf, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        
         vec_xst(vec_perm(v_filter_u8, srv, mask0), 0, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask1), 16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask2), 32, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask3), 48, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask4), 64, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask5), 80, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask6), 96, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask7), 112, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask8), 128, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask9), 144, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask10), 160, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask11), 176, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask12), 192, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask13), 208, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask14), 224, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask15), 240, dst);
    }
    else{
         vec_xst(srv, 0, dst);
         vec_xst(srv, 16, dst);
         vec_xst(srv, 32, dst);
         vec_xst(srv, 48, dst);
         vec_xst(srv, 64, dst);
         vec_xst(srv, 80, dst);
         vec_xst(srv, 96, dst);
         vec_xst(srv, 112, dst);
         vec_xst(srv, 128, dst);
         vec_xst(srv, 144, dst);
         vec_xst(srv, 160, dst);
         vec_xst(srv, 176, dst);
         vec_xst(srv, 192, dst);
         vec_xst(srv, 208, dst);
         vec_xst(srv, 224, dst);
         vec_xst(srv, 240, dst);
    }
#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 10>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv = vec_xl(65, srcPix0); /* offset = width2+1 = width<<1 + 1 */
    vec_u8_t srv1 = vec_xl(81, srcPix0);
        

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(0, srcPix0);
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_u8_t  srcv1 = vec_xl(1, srcPix0);           
        vec_s16_t v0h_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskh));
        vec_s16_t v0l_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskl));
        vec_s16_t v1h_s16 =  (vec_s16_t)vec_sra( vec_sub(v0h_s16, c0_s16v), one_u16v );
        vec_s16_t v1l_s16 =  (vec_s16_t)vec_sra( vec_sub(v0l_s16, c0_s16v), one_u16v );
        vec_s16_t vh_sum = vec_add(c1_s16v, v1h_s16);
        vec_s16_t vl_sum = vec_add(c1_s16v, v1l_s16);
        vec_u16_t vh_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vh_sum));
        vec_u16_t vl_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vl_sum));
        vec_u8_t v_filter_u8 = vec_pack(vh_filter_u16, vl_filter_u16); 

        vec_u8_t mask0 = {0x00, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask1 = {0x01, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask2 = {0x02, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask3 = {0x03, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask4 = {0x04, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask5 = {0x05, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask6 = {0x06, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask7 = {0x07, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask8 = {0x08, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask9 = {0x09, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask10 = {0xa, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask11 = {0xb, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask12 = {0xc, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask13 = {0xd, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask14 = {0xe, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask15 = {0xf, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_xst(vec_perm(v_filter_u8, srv, mask0), 0, dst);
         vec_xst(srv1, 16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask1), 32, dst);
         vec_xst(srv1, 48, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask2), 64, dst);
         vec_xst(srv1, 80, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask3), 96, dst);
         vec_xst(srv1, 112, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask4), 128, dst);
         vec_xst(srv1, 144, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask5), 160, dst);
         vec_xst(srv1, 176, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask6), 192, dst);
         vec_xst(srv1, 208, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask7), 224, dst);
         vec_xst(srv1, 240, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask8), 256, dst);
         vec_xst(srv1, 272, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask9), 288, dst);
         vec_xst(srv1, 304, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask10), 320, dst);
         vec_xst(srv1, 336, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask11), 352, dst);
         vec_xst(srv1, 368, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask12), 384, dst);
         vec_xst(srv1, 400, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask13), 416, dst);
         vec_xst(srv1, 432, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask14), 448, dst);
         vec_xst(srv1, 464, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask15), 480, dst);
         vec_xst(srv1, 496, dst);

        vec_u8_t  srcv2 = vec_xl(17, srcPix0);          
        vec_s16_t v2h_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv2, u8_to_s16_w8x8_maskh));
        vec_s16_t v2l_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv2, u8_to_s16_w8x8_maskl));
        vec_s16_t v3h_s16 =  (vec_s16_t)vec_sra( vec_sub(v2h_s16, c0_s16v), one_u16v );
        vec_s16_t v3l_s16 =  (vec_s16_t)vec_sra( vec_sub(v2l_s16, c0_s16v), one_u16v );
        vec_s16_t v2h_sum = vec_add(c1_s16v, v3h_s16);
        vec_s16_t v2l_sum = vec_add(c1_s16v, v3l_s16);
        vec_u16_t v2h_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v2h_sum));
        vec_u16_t v2l_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v2l_sum));
        vec_u8_t v2_filter_u8 = vec_pack(v2h_filter_u16, v2l_filter_u16); 
         vec_xst(vec_perm(v2_filter_u8, srv, mask0), 512, dst);
         vec_xst(srv1, 528, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask1), 544, dst);
         vec_xst(srv1, 560, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask2), 576, dst);
         vec_xst(srv1, 592, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask3), 608, dst);
         vec_xst(srv1, 624, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask4), 640, dst);
         vec_xst(srv1, 656, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask5), 672, dst);
         vec_xst(srv1, 688, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask6), 704, dst);
         vec_xst(srv1, 720, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask7), 736, dst);
         vec_xst(srv1, 752, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask8), 768, dst);
         vec_xst(srv1, 784, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask9), 800, dst);
         vec_xst(srv1, 816, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask10), 832, dst);
         vec_xst(srv1, 848, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask11), 864, dst);
         vec_xst(srv1, 880, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask12), 896, dst);
         vec_xst(srv1, 912, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask13), 928, dst);
         vec_xst(srv1, 944, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask14), 960, dst);
         vec_xst(srv1, 976, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask15), 992, dst);
         vec_xst(srv1, 1008, dst);
         
    }
    else{
        int offset = 0;
        for (int i = 0; i < 32; i++){
           vec_xst(srv, offset, dst);
           vec_xst(srv1, offset+16, dst);
           offset += 32;
        }
    }
#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 26>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv = vec_xl(0, srcPix0); /* top-left corner and above neighbours (offset 0) */
    vec_u8_t v0;        

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_sld(srv, srv, 15);         
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t v0_s16 = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_w4x4_mask9));
        vec_s16_t v1_s16 =  (vec_s16_t)vec_sra( vec_sub(v0_s16, c0_s16v), one_u16v );
        vec_s16_t v_sum = vec_add(c1_s16v, v1_s16);
        vec_u16_t v_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v_sum));
        vec_u8_t v_filter_u8 = vec_pack(v_filter_u16, zero_u16v); 
         vec_u8_t v_mask = {0x10, 0x02, 0x03, 0x04, 0x11, 0x02, 0x03, 0x04, 0x12, 0x02, 0x03, 0x04, 0x13, 0x02, 0x03, 0x04};
         v0 = vec_perm(srv, v_filter_u8, v_mask);
    }
    else{
        vec_u8_t v_mask = {0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04};
        v0 = vec_perm(srv, srv, v_mask);
    }
    vec_xst(v0, 0, dst);
        
#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
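
/* The mode-26 (vertical) bFilter path above mirrors the mode-10 case: the
 * clamp computes c1_s16v + ((v0_s16 - c0_s16v) >> 1), saturates it to the
 * pixel range via vec_max/vec_min, and the closing vec_perm merges the
 * filtered bytes into the first lane of each output row. */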

template<>
void one_ang_pred_altivec<8, 26>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv = vec_xl(0, srcPix0); /* top-left corner and above neighbours (offset 0) */

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(17, srcPix0);           
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b1_mask));
        vec_s16_t v0_s16 = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_w8x8_maskh));
        vec_s16_t v1_s16 =  (vec_s16_t)vec_sra( vec_sub(v0_s16, c0_s16v), one_u16v );
        vec_s16_t v_sum = vec_add(c1_s16v, v1_s16);
        vec_u16_t v_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v_sum));
        vec_u8_t v_filter_u8 = vec_pack(v_filter_u16, zero_u16v); 
        vec_u8_t v_mask0 = {0x00, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x01, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
        vec_u8_t v_mask1 = {0x02, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x03, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
        vec_u8_t v_mask2 = {0x04, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x05, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
        vec_u8_t v_mask3 = {0x06, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x07, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
        vec_u8_t v0 = vec_perm(v_filter_u8, srv, v_mask0);
        vec_u8_t v1 = vec_perm(v_filter_u8, srv, v_mask1);
        vec_u8_t v2 = vec_perm(v_filter_u8, srv, v_mask2);
        vec_u8_t v3 = vec_perm(v_filter_u8, srv, v_mask3);
        vec_xst(v0, 0, dst);
        vec_xst(v1, 16, dst);
        vec_xst(v2, 32, dst);
        vec_xst(v3, 48, dst);
    }
    else{
        vec_u8_t v_mask = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
        vec_u8_t v0 = vec_perm(srv, srv, v_mask);
        vec_xst(v0, 0, dst);
        vec_xst(v0, 16, dst);
        vec_xst(v0, 32, dst);
        vec_xst(v0, 48, dst);
    }
        
#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 26>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv = vec_xl(0, srcPix0); /* top-left corner and above neighbours */
    vec_u8_t srv1 = vec_xl(1, srcPix0); /* above neighbours */

    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(33, srcPix0);   /* offset = width2+1 = width<<1 + 1 */  
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b1_mask));
        vec_s16_t v0h_s16 = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_w8x8_maskh));
        vec_s16_t v0l_s16 = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_w8x8_maskl));
        vec_s16_t v1h_s16 =  (vec_s16_t)vec_sra( vec_sub(v0h_s16, c0_s16v), one_u16v );
        vec_s16_t v1l_s16 =  (vec_s16_t)vec_sra( vec_sub(v0l_s16, c0_s16v), one_u16v );
        vec_s16_t vh_sum = vec_add(c1_s16v, v1h_s16);
        vec_s16_t vl_sum = vec_add(c1_s16v, v1l_s16);
        vec_u16_t vh_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vh_sum));
        vec_u16_t vl_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vl_sum));
        vec_u8_t v_filter_u8 = vec_pack(vh_filter_u16, vl_filter_u16); 
        vec_u8_t mask0 = {0x00, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask1 = {0x01, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask2 = {0x02, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask3 = {0x03, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask4 = {0x04, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask5 = {0x05, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask6 = {0x06, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask7 = {0x07, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask8 = {0x08, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask9 = {0x09, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask10 = {0xa, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask11 = {0xb, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask12 = {0xc, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask13 = {0xd, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask14 = {0xe, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask15 = {0xf, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};


         vec_xst(vec_perm(v_filter_u8, srv1, mask0), 0, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask1), 16, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask2), 32, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask3), 48, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask4), 64, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask5), 80, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask6), 96, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask7), 112, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask8), 128, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask9), 144, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask10), 160, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask11), 176, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask12), 192, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask13), 208, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask14), 224, dst);
         vec_xst(vec_perm(v_filter_u8, srv1, mask15), 240, dst);
    }
    else{
         vec_xst(srv1, 0, dst);
         vec_xst(srv1, 16, dst);
         vec_xst(srv1, 32, dst);
         vec_xst(srv1, 48, dst);
         vec_xst(srv1, 64, dst);
         vec_xst(srv1, 80, dst);
         vec_xst(srv1, 96, dst);
         vec_xst(srv1, 112, dst);
         vec_xst(srv1, 128, dst);
         vec_xst(srv1, 144, dst);
         vec_xst(srv1, 160, dst);
         vec_xst(srv1, 176, dst);
         vec_xst(srv1, 192, dst);
         vec_xst(srv1, 208, dst);
         vec_xst(srv1, 224, dst);
         vec_xst(srv1, 240, dst);
    }
        
#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 26>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t srv = vec_xl(1, srcPix0); /* above neighbours, first 16 (offset 1) */
    vec_u8_t srv1 = vec_xl(17, srcPix0); /* above neighbours, second 16 */
        
    if (bFilter){
        LOAD_ZERO;
        vec_u8_t tmp_v = vec_xl(0, srcPix0);
        vec_s16_t c0_s16v = (vec_s16_t)(vec_perm(zero_u8v, tmp_v, u8_to_s16_b0_mask));
        vec_s16_t c1_s16v = (vec_s16_t)(vec_perm(zero_u8v, srv, u8_to_s16_b0_mask));
        vec_u8_t  srcv1 = vec_xl(65, srcPix0);          
        vec_s16_t v0h_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskh));
        vec_s16_t v0l_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv1, u8_to_s16_w8x8_maskl));
        vec_s16_t v1h_s16 =  (vec_s16_t)vec_sra( vec_sub(v0h_s16, c0_s16v), one_u16v );
        vec_s16_t v1l_s16 =  (vec_s16_t)vec_sra( vec_sub(v0l_s16, c0_s16v), one_u16v );

        vec_s16_t vh_sum = vec_add(c1_s16v, v1h_s16);
        vec_s16_t vl_sum = vec_add(c1_s16v, v1l_s16);
        vec_u16_t vh_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vh_sum));
        vec_u16_t vl_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, vl_sum));
        vec_u8_t v_filter_u8 = vec_pack(vh_filter_u16, vl_filter_u16); 

        vec_u8_t mask0 = {0x00, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask1 = {0x01, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask2 = {0x02, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask3 = {0x03, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask4 = {0x04, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask5 = {0x05, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask6 = {0x06, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask7 = {0x07, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask8 = {0x08, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask9 = {0x09, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask10 = {0xa, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask11 = {0xb, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask12 = {0xc, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask13 = {0xd, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask14 = {0xe, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
        vec_u8_t mask15 = {0xf, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f};
         vec_xst(vec_perm(v_filter_u8, srv, mask0), 0, dst);
         vec_xst(srv1, 16, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask1), 32, dst);
         vec_xst(srv1, 48, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask2), 64, dst);
         vec_xst(srv1, 80, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask3), 96, dst);
         vec_xst(srv1, 112, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask4), 128, dst);
         vec_xst(srv1, 144, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask5), 160, dst);
         vec_xst(srv1, 176, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask6), 192, dst);
         vec_xst(srv1, 208, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask7), 224, dst);
         vec_xst(srv1, 240, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask8), 256, dst);
         vec_xst(srv1, 272, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask9), 288, dst);
         vec_xst(srv1, 304, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask10), 320, dst);
         vec_xst(srv1, 336, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask11), 352, dst);
         vec_xst(srv1, 368, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask12), 384, dst);
         vec_xst(srv1, 400, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask13), 416, dst);
         vec_xst(srv1, 432, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask14), 448, dst);
         vec_xst(srv1, 464, dst);
         vec_xst(vec_perm(v_filter_u8, srv, mask15), 480, dst);
         vec_xst(srv1, 496, dst);

        vec_u8_t  srcv2 = vec_xl(81, srcPix0);          
        vec_s16_t v2h_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv2, u8_to_s16_w8x8_maskh));
        vec_s16_t v2l_s16 = (vec_s16_t)(vec_perm(zero_u8v, srcv2, u8_to_s16_w8x8_maskl));
        vec_s16_t v3h_s16 =  (vec_s16_t)vec_sra( vec_sub(v2h_s16, c0_s16v), one_u16v );
        vec_s16_t v3l_s16 =  (vec_s16_t)vec_sra( vec_sub(v2l_s16, c0_s16v), one_u16v );
        vec_s16_t v2h_sum = vec_add(c1_s16v, v3h_s16);
        vec_s16_t v2l_sum = vec_add(c1_s16v, v3l_s16);
        vec_u16_t v2h_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v2h_sum));
        vec_u16_t v2l_filter_u16 = (vector unsigned short)vec_min( min_s16v, vec_max(zero_s16v, v2l_sum));
        vec_u8_t v2_filter_u8 = vec_pack(v2h_filter_u16, v2l_filter_u16); 

         vec_xst(vec_perm(v2_filter_u8, srv, mask0), 512, dst);
         vec_xst(srv1, 528, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask1), 544, dst);
         vec_xst(srv1, 560, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask2), 576, dst);
         vec_xst(srv1, 592, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask3), 608, dst);
         vec_xst(srv1, 624, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask4), 640, dst);
         vec_xst(srv1, 656, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask5), 672, dst);
         vec_xst(srv1, 688, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask6), 704, dst);
         vec_xst(srv1, 720, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask7), 736, dst);
         vec_xst(srv1, 752, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask8), 768, dst);
         vec_xst(srv1, 784, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask9), 800, dst);
         vec_xst(srv1, 816, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask10), 832, dst);
         vec_xst(srv1, 848, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask11), 864, dst);
         vec_xst(srv1, 880, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask12), 896, dst);
         vec_xst(srv1, 912, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask13), 928, dst);
         vec_xst(srv1, 944, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask14), 960, dst);
         vec_xst(srv1, 976, dst);
         vec_xst(vec_perm(v2_filter_u8, srv, mask15), 992, dst);
         vec_xst(srv1, 1008, dst);

    }
    else{
        int offset = 0;
        for(int i=0; i<32; i++){
            vec_xst(srv, offset, dst);
            vec_xst(srv1, 16+offset, dst);
            offset += 32;
        }
    }
#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 3>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //mode 3 (intraPredAngle = 26, same tables as mode 33):
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
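    //These tables follow the HEVC angular rule for intraPredAngle = 26:
    //  offset[y]   = ((y + 1) * 26) >> 5;
    //  fraction[y] = ((y + 1) * 26) & 31;
    //e.g. y = 4: 130 >> 5 = 4 and 130 & 31 = 2, matching the entries above.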
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06, 0x04, 0x05, 0x06, 0x07};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* ref[offset + x], ref = srcPix0 + width2 + 1 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){26, 26, 26, 26, 20, 20, 20, 20, 14, 14, 14, 14, 8, 8, 8, 8};
    vec_u8_t vfrac4_32 = (vec_u8_t){6, 6, 6, 6, 12, 12, 12, 12, 18, 18, 18, 18, 24, 24, 24, 24};


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
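    /* ve and vo hold the even- and odd-indexed lanes (vec_mule/vec_mulo split
       them); merge the two streams back into pixel order, then pack to bytes */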
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 3>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //mode 3 (intraPredAngle = 26, same tables as mode 33):
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c};
    vec_u8_t mask6={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d};
    vec_u8_t mask7={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(17, srcPix0); /* ref[offset + x], ref = srcPix0 + width2 + 1 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 4 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 4, 5 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 5, 6 */
    vec_u8_t srv7 = vec_perm(srv, srv, mask7); /* 6, 7 */

    vec_u8_t vfrac8_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac8_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac8_2 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac8_3 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 16, 16, 16, 16, 16, 16, 16, 16};

    vec_u8_t vfrac8_32_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac8_32_2 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac8_32_3 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 16, 16, 16, 16, 16, 16, 16, 16};
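    /* Each vfrac8 vector carries the fractions for two consecutive rows
       (8 lanes per row): vfrac8_0 = rows 0-1, vfrac8_1 = rows 2-3, and so on,
       with vfrac8_32_* holding the matching (32 - fraction) weights. */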

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y2, y3 */        
    vmle0 = vec_mule(srv2, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv2, vfrac8_32_1); 
    vmle1 = vec_mule(srv3, vfrac8_1); 
    vmlo1 = vec_mulo(srv3, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv4, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv4, vfrac8_32_2); 
    vmle1 = vec_mule(srv5, vfrac8_2); 
    vmlo1 = vec_mulo(srv5, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv6, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv6, vfrac8_32_3);
    vmle1 = vec_mule(srv7, vfrac8_3); 
    vmlo1 = vec_mulo(srv7, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 3>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[0]* ref[off0 + 15] + f[0] * ref[off0 + 16] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[1]* ref[off1 + 15] + f[1] * ref[off1 + 16] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[2]* ref[off2 + 15] + f[2] * ref[off2 + 16] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[3]* ref[off3 + 15] + f[3] * ref[off3 + 16] + 16) >> 5);

            ...

            y=15;  off15 = offset[15]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[15]* ref[off15 + 0] + f[15] * ref[off15 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[15]* ref[off15 + 1] + f[15] * ref[off15 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[15]* ref[off15 + 2] + f[15] * ref[off15 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[15]* ref[off15 + 3] + f[15] * ref[off15 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[15]* ref[off15 + 15] + f[15] * ref[off15 + 16] + 16) >> 5);
        }
    */
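    //The loop above is fully unrolled below: sv0/sv1 load ref[0..31], each
    //srvN is that reference window advanced by N samples, and a single
    //one_line call produces one complete 16-pixel row.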
    //mode 3 (intraPredAngle = 26, same tables as mode 33):
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(33, srcPix0); /* ref[0..15], ref = srcPix0 + width2 + 1 */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* ref[16..31] */
    vec_u8_t srv0 = sv0; /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);

    vec_u8_t vfrac16_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_1 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_2 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_4 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_5 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_6 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_8 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_9 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_10 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_12 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_13 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_14 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    vec_u8_t vfrac16_32_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_2 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_32_4 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_5 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_32_8 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_9 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_13 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_14 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
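    /* one_line is a helper macro defined earlier in this file; judging from
       the long-hand 4x4 and 8x8 cases above, it runs the same mule/mulo
       multiply, +16, >>5, merge and pack sequence on a
       (ref, ref+1, 32-fraction, fraction) tuple and leaves the finished row
       in its last argument, using the temporaries declared above. */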

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv3, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv4, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv5, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv4, srv5, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv5, srv6, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv6, srv7, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv7, srv8, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv8, srv9, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv8, srv9, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv9, srva, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srva, srvb, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srvb, srvc, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srvc, srvd, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srvd, srve, vfrac16_32_15, vfrac16_15, vout_15);
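
    /* rows 4/5 and 9/10 reuse the same (srvN, srvN+1) pair because the
       offset[] table repeats there: the 26/32 angle step does not always
       advance a whole reference sample */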

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 3>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[0]* ref[off0 + 31] + f[0] * ref[off0 + 32] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[1]* ref[off1 + 31] + f[1] * ref[off1 + 32] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[2]* ref[off2 + 31] + f[2] * ref[off2 + 32] + 16) >> 5);

            ...
            
            y=15;  off15 = offset[15] = 13; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[15]* ref[off15 + 0] + f[15] * ref[off15 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[15]* ref[off15 + 1] + f[15] * ref[off15 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[15]* ref[off15 + 2] + f[15] * ref[off15 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[15]* ref[off15 + 3] + f[15] * ref[off15 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[15]* ref[off15 + 31] + f[15] * ref[off15 + 32] + 16) >> 5);
 
            ...

            y=31;  off31 = offset[31] = 26; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[31]* ref[off31 + 0] + f[31] * ref[off31 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[31]* ref[off31 + 1] + f[31] * ref[off31 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[31]* ref[off31 + 2] + f[31] * ref[off31 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[31]* ref[off31 + 3] + f[31] * ref[off31 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[31]* ref[off31 + 31] + f[31] * ref[off31 + 32] + 16) >> 5);
        }
    */
    //mode 3 (same offset/fraction tables as mode 33):
    //int offset[32] = {0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 17, 18, 19, 20, 21, 21, 22, 23, 24, 25, 26};
    //int fraction[32] = {26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0, 26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0};
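    /* Equivalent scalar reference (a sketch; assumes ref = srcPix0 + 65, i.e.
     * srcPix0 + width2 + 1, matching the loads below):
     *   for (int y = 0; y < 32; y++)
     *       for (int x = 0; x < 32; x++)
     *           dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x]
     *                                           + f[y] * ref[offset[y] + x + 1] + 16) >> 5);
     */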
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u8_t mask15={0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0);  /* loads ref[0-15],  ref = srcPix0 + width2 + 1 */
    vec_u8_t sv1 = vec_xl(81, srcPix0);  /* loads ref[16-31] */
    vec_u8_t sv2 = vec_xl(97, srcPix0);  /* loads ref[32-47] */
    vec_u8_t sv3 = vec_xl(113, srcPix0); /* loads ref[48-63] */

    vec_u8_t srv0 = sv0; /* srvN = ref + N; row y reads from ref + offset[y] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);

    vec_u8_t srv00 = sv1; /* srvN0 = ref + 16 + N; second 16-pixel half of each row */
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv20 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv30 = vec_perm(sv1, sv2, mask3);
    vec_u8_t srv40 = vec_perm(sv1, sv2, mask4);
    vec_u8_t srv50 = vec_perm(sv1, sv2, mask5);
    vec_u8_t srv60 = vec_perm(sv1, sv2, mask6);
    vec_u8_t srv70 = vec_perm(sv1, sv2, mask7);
    vec_u8_t srv80 = vec_perm(sv1, sv2, mask8);
    vec_u8_t srv90 = vec_perm(sv1, sv2, mask9);
    vec_u8_t srva0 = vec_perm(sv1, sv2, mask10);
    vec_u8_t srvb0 = vec_perm(sv1, sv2, mask11);
    vec_u8_t srvc0 = vec_perm(sv1, sv2, mask12);
    vec_u8_t srvd0 = vec_perm(sv1, sv2, mask13);
    vec_u8_t srve0 = vec_perm(sv1, sv2, mask14);
    vec_u8_t srvf0 = vec_perm(sv1, sv2, mask15);

    vec_u8_t srv000 = sv2;
    vec_u8_t srv100 = vec_perm(sv2, sv3, mask1);
    vec_u8_t srv200 = vec_perm(sv2, sv3, mask2);
    vec_u8_t srv300 = vec_perm(sv2, sv3, mask3);
    vec_u8_t srv400 = vec_perm(sv2, sv3, mask4);
    vec_u8_t srv500 = vec_perm(sv2, sv3, mask5);
    vec_u8_t srv600 = vec_perm(sv2, sv3, mask6);
    vec_u8_t srv700 = vec_perm(sv2, sv3, mask7);
    vec_u8_t srv800 = vec_perm(sv2, sv3, mask8);
    vec_u8_t srv900 = vec_perm(sv2, sv3, mask9);
    vec_u8_t srva00 = vec_perm(sv2, sv3, mask10);
    vec_u8_t srvb00 = vec_perm(sv2, sv3, mask11);

vec_u8_t vfrac16_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_1 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_2 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_4 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_5 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_6 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_8 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_9 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_10 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_12 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_13 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_14 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_16 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_17 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_18 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_19 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_20 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_21 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_22 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_23 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_24 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_25 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_26 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_27 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_28 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_29 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_30 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

vec_u8_t vfrac16_32_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_1 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_2 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_4 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_5 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_6 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_8 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_9 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_10 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_12 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_13 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_14 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
vec_u8_t vfrac16_32_16 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_17 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_18 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_19 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_20 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_21 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_22 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_23 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_24 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_25 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_26 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_27 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_28 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_29 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_30 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
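    /* Each 32-pixel row is computed as two 16-byte halves: srvN feeds x = 0-15
     * and srvN0 feeds x = 16-31, sharing the same per-row fraction vectors;
     * rows 0-15 use vfrac16_0..15, rows 16-31 use vfrac16_16..31. */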

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;


    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv00, srv10, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv10, srv20, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv3, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv20, srv30, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv4, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv30, srv40, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv5, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv40, srv50, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv4, srv5, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv40, srv50, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv5, srv6, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv50, srv60, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv6, srv7, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv60, srv70, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv7, srv8, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv70, srv80, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv8, srv9, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv80, srv90, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv8, srv9, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv80, srv90, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv9, srva, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv90, srva0, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srva, srvb, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srva0, srvb0,  vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srvb, srvc, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srvb0, srvc0, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srvc, srvd, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srvc0, srvd0, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srvd, srve, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srvd0, srve0, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

    one_line(srvd, srve, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srvd0, srve0, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srve, srvf, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srve0, srvf0, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srvf, srv00, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srvf0, srv000, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv00, srv10, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv000, srv100, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv10, srv20, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv100, srv200, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv10, srv20, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv100, srv200, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv20, srv30, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv200, srv300, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv30, srv40, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv300, srv400, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv40, srv50, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv400, srv500, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv50, srv60, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv500, srv600, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv50, srv60, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv500, srv600, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv60, srv70, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv600, srv700, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv70, srv80, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv700, srv800, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv80, srv90, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv800, srv900, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv90, srva0, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv900, srva00, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srva0, srvb0, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srva00, srvb00, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 4>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* loads ref[0-15], ref = srcPix0 + width2 + 1; mode 4 offset[0-3] = {0, 1, 1, 2} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* gathers ref[offset[y] + x] for rows y = 0-3 into one vector */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){21, 21, 21, 21, 10, 10, 10, 10, 31, 31, 31, 31, 20, 20, 20, 20}; /* fraction[0-3] */
    vec_u8_t vfrac4_32 = (vec_u8_t){11, 11, 11, 11, 22, 22, 22, 22, 1, 1, 1, 1, 12, 12, 12, 12}; /* 32 - fraction[0-3] */


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
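    /* vec_mule/vec_mulo widen the even/odd byte lanes to u16 products; after
     * the rounded shift, vec_mergeh/vec_mergel re-interleave the even (ve) and
     * odd (vo) results so vec_pack restores the original byte order. */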

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 4>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b};
    vec_u8_t mask5={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c};
    vec_u8_t mask6={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
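    //mode 4: int offset[8] = {0, 1, 1, 2, 3, 3, 4, 5};
    //int fraction[8] = {21, 10, 31, 20, 9, 30, 19, 8};
    /* width 8: each 16-byte vector below packs two 8-pixel rows */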

    vec_u8_t srv = vec_xl(17, srcPix0); /* loads ref[0-15], ref = srcPix0 + width2 + 1 */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* 3, 3 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* 4, 4 */
    vec_u8_t srv5 = vec_perm(srv, srv, mask5); /* 4, 5 */
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); /* 5, 6 */

    vec_u8_t vfrac8_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 10, 10, 10, 10, 10, 10, 10, 10}; /* fraction[0], fraction[1] */
    vec_u8_t vfrac8_1 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 20, 20, 20, 20, 20, 20, 20, 20}; /* fraction[2], fraction[3] */
    vec_u8_t vfrac8_2 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 30, 30, 30, 30, 30, 30, 30, 30}; /* fraction[4], fraction[5] */
    vec_u8_t vfrac8_3 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 8, 8, 8, 8, 8, 8, 8, 8}; /* fraction[6], fraction[7] */

    vec_u8_t vfrac8_32_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 22, 22, 22, 22, 22, 22, 22, 22}; /* 32 - fraction */
    vec_u8_t vfrac8_32_1 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac8_32_2 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac8_32_3 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));


    /* y2, y3 */        
    vmle0 = vec_mule(srv1, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv1, vfrac8_32_1); 
    vmle1 = vec_mule(srv2, vfrac8_1); 
    vmlo1 = vec_mulo(srv2, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv3, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv3, vfrac8_32_2); 
    vmle1 = vec_mule(srv4, vfrac8_2); 
    vmlo1 = vec_mulo(srv4, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv5, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv5, vfrac8_32_3);
    vmle1 = vec_mule(srv6, vfrac8_3); 
    vmlo1 = vec_mulo(srv6, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 4>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(33, srcPix0); /* loads ref[0-15],  ref = srcPix0 + width2 + 1 */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* loads ref[16-31] */
    vec_u8_t srv0 = sv0; /* srvN = ref + N; row y reads from ref + offset[y] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
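    //mode 4: int offset[16] = {0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10};
    //int fraction[16] = {21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16};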

vec_u8_t vfrac16_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_4 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_6 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_8 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_10 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_12 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_14 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

vec_u8_t vfrac16_32_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_4 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_6 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_8 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_10 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_12 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_14 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv2, srv3, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv3, srv4, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv3, srv4, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv4, srv5, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv5, srv6, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv5, srv6, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv6, srv7, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv7, srv8, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv7, srv8, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv8, srv9, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv9, srva, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv9, srva, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srva, srvb, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 4>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u8_t mask15={0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
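    //mode 4: int offset[32] = {0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21};
    //int fraction[32] = {21, 10, 31, 20, 9, 30, 19, 8, 29, 18, 7, 28, 17, 6, 27, 16, 5, 26, 15, 4, 25, 14, 3, 24, 13, 2, 23, 12, 1, 22, 11, 0};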
    
    vec_u8_t sv0 = vec_xl(65, srcPix0);  /* loads ref[0-15],  ref = srcPix0 + width2 + 1 */
    vec_u8_t sv1 = vec_xl(81, srcPix0);  /* loads ref[16-31] */
    vec_u8_t sv2 = vec_xl(97, srcPix0);  /* loads ref[32-47] */
    vec_u8_t sv3 = vec_xl(113, srcPix0); /* loads ref[48-63] */

    vec_u8_t srv0 = sv0; /* srvN = ref + N; row y reads from ref + offset[y] */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);

    vec_u8_t srv00 = sv1; /* srvN0 = ref + 16 + N; second 16-pixel half of each row */
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv20 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv30 = vec_perm(sv1, sv2, mask3);
    vec_u8_t srv40 = vec_perm(sv1, sv2, mask4);
    vec_u8_t srv50 = vec_perm(sv1, sv2, mask5);
    vec_u8_t srv60 = vec_perm(sv1, sv2, mask6);
    vec_u8_t srv70 = vec_perm(sv1, sv2, mask7);
    vec_u8_t srv80 = vec_perm(sv1, sv2, mask8);
    vec_u8_t srv90 = vec_perm(sv1, sv2, mask9);
    vec_u8_t srva0 = vec_perm(sv1, sv2, mask10);
    vec_u8_t srvb0 = vec_perm(sv1, sv2, mask11);
    vec_u8_t srvc0 = vec_perm(sv1, sv2, mask12);
    vec_u8_t srvd0 = vec_perm(sv1, sv2, mask13);
    vec_u8_t srve0 = vec_perm(sv1, sv2, mask14);
    vec_u8_t srvf0 = vec_perm(sv1, sv2, mask15);

    vec_u8_t srv000 = sv2;
    vec_u8_t srv100 = vec_perm(sv2, sv3, mask1);
    vec_u8_t srv200 = vec_perm(sv2, sv3, mask2);
    vec_u8_t srv300 = vec_perm(sv2, sv3, mask3);
    vec_u8_t srv400 = vec_perm(sv2, sv3, mask4);
    vec_u8_t srv500 = vec_perm(sv2, sv3, mask5);
    vec_u8_t srv600 = vec_perm(sv2, sv3, mask6);

vec_u8_t vfrac16_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_4 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_6 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_8 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_10 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_12 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_14 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_17 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_18 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_20 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_21 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_22 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_24 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_25 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_26 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_28 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_29 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_30 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

vec_u8_t vfrac16_32_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_4 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_6 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_8 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_10 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_12 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_14 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_17 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_18 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_20 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_21 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_22 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_24 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_25 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_26 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_28 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_29 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_30 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
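    /* As in the <32, 3> case: srvN covers x = 0-15 and srvN0 covers x = 16-31
     * of each row; rows 0-15 use vfrac16_0..15, rows 16-31 use vfrac16_16..31. */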

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv00, srv10, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv10, srv20, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv10, srv20, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv2, srv3, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv20, srv30, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv3, srv4, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv30, srv40, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv3, srv4, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv30, srv40, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv4, srv5, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv40, srv50, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv5, srv6, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv50, srv60, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv5, srv6, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv50, srv60, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv6, srv7, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv60, srv70, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv7, srv8, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv70, srv80, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv7, srv8, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv70, srv80, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv8, srv9, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv80, srv90,  vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv9, srva, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv90, srva0, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv9, srva, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv90, srva0, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srva, srvb, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srva0, srvb0, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

    one_line(srvb, srvc, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srvb0, srvc0, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srvb, srvc, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srvb0, srvc0, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srvc, srvd, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srvc0, srvd0, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srvd, srve, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srvd0, srve0, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srvd, srve, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srvd0, srve0, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srve, srvf, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srve0, srvf0, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srvf, srv00, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srvf0, srv000, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srvf, srv00, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srvf0, srv000, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv00, srv10, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv000, srv100, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv10, srv20, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv100, srv200, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv10, srv20, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv100, srv200, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv20, srv30, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv200, srv300, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv30, srv40, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv300, srv400, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv30, srv40, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv300, srv400, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv40, srv50, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv400, srv500, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv50, srv60, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv500, srv600, vfrac16_32_31, vfrac16_31, vout_31);
    //offset[16-31] = {11, 11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 19, 20, 21};

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 5>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-3;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
        }
    */
    //mode 5 (same offset/fraction tables as mode 31):
    //int offset[32] = {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17};
    //int fraction[32] = {17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15, 0};
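
    /* The two tables above follow from the HEVC angular prediction step; a
       minimal sketch, assuming intraPredAngle = 17 (the angle shared by modes
       5 and 31) -- 'pos' is an illustrative name, not from this file:

        for (int y = 0; y < 32; y++)
        {
            int pos = (y + 1) * 17;  // accumulate the angle once per row
            offset[y]   = pos >> 5;  // integer reference offset
            fraction[y] = pos & 31;  // 1/32-pel interpolation weight
        }
    */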

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03, 0x04, 0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x02, 0x03, 0x04, 0x05, 0x02, 0x03, 0x04, 0x05, 0x03, 0x04, 0x05, 0x06};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv = vec_xl(9, srcPix0); /* ref[offset + x], ref = srcPix0 + 9 (2*width + 1); offset[0-3] = {0, 1, 1, 2} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* ref[offset[y] + x] for rows y = 0..3 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* ref[offset[y] + x + 1] for rows y = 0..3 */

    vec_u8_t vfrac4 = (vec_u8_t){17, 17, 17, 17, 2, 2, 2, 2, 19, 19, 19, 19, 4, 4, 4, 4}; /* fraction[0-3] */
    vec_u8_t vfrac4_32 = (vec_u8_t){15, 15, 15, 15, 30, 30, 30, 30, 13, 13, 13, 13, 28, 28, 28, 28}; /* 32 - fraction[0-3] */


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5); */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
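    /* vec_mule/vec_mulo widen alternating byte lanes into 16-bit products, so
       've' and 'vo' hold the rounded results of interleaved columns;
       vec_mergeh/vec_mergel restore source order before vec_pack narrows the
       sixteen 16-bit values back to bytes. */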

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 5>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[0]* ref[off0 + 7] + f[0] * ref[off0 + 8] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[1]* ref[off1 + 7] + f[1] * ref[off1 + 8] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[2]* ref[off2 + 7] + f[2] * ref[off2 + 8] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[3]* ref[off3 + 7] + f[3] * ref[off3 + 8] + 16) >> 5);

            ...

            y=7;  off7 = offset[7]; x=0-7;
            dst[y * dstStride + 0] = (pixel)((f32[7]* ref[off7 + 0] + f[7] * ref[off7 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[7]* ref[off7 + 1] + f[7] * ref[off7 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[7]* ref[off7 + 2] + f[7] * ref[off7 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[7]* ref[off7 + 3] + f[7] * ref[off7 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 7] = (pixel)((f32[7]* ref[off7 + 7] + f[7] * ref[off7 + 8] + 16) >> 5);
        }
    */
    //mode 31 tables (mode 5 has the same intraPredAngle, 17, so the mode 31 tables apply here too):
    //int offset[32] = {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17};
    //int fraction[32] = {17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15, 0};

    vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

    vec_u8_t srv = vec_xl(17, srcPix0); /* ref[offset + x], ref = srcPix0 + 17 (2*width + 1); offset[0-7] = {0, 1, 1, 2, 2, 3, 3, 4} */
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* ref at offsets 0, 1 */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1); /* ref at offsets 1, 2 */
    vec_u8_t srv2 = vec_perm(srv, srv, mask2); /* ref at offsets 2, 3 */
    vec_u8_t srv3 = vec_perm(srv, srv, mask3); /* ref at offsets 3, 4 */
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); /* ref at offsets 4, 5 */

vec_u8_t vfrac8_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac8_1 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac8_2 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac8_3 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac8_32_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac8_32_1 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac8_32_2 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac8_32_3 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 24, 24, 24, 24, 24, 24, 24, 24};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5); */
    /* y0, y1 */        
    vec_u16_t vmle0 = vec_mule(srv0, vfrac8_32_0); /* (32 - fraction) * ref[offset + x], x=0-7 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac8_32_0); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac8_0); /* fraction * ref[offset + x + 1], x=0-7 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac8_0); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_0 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));


    /* y2, y3 */        
    vmle0 = vec_mule(srv1, vfrac8_32_1); 
    vmlo0 = vec_mulo(srv1, vfrac8_32_1); 
    vmle1 = vec_mule(srv2, vfrac8_1); 
    vmlo1 = vec_mulo(srv2, vfrac8_1); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_1 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    /* y4, y5 */        
    vmle0 = vec_mule(srv2, vfrac8_32_2); 
    vmlo0 = vec_mulo(srv2, vfrac8_32_2); 
    vmle1 = vec_mule(srv3, vfrac8_2); 
    vmlo1 = vec_mulo(srv3, vfrac8_2); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_2 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
        
    /* y6, y7 */        
    vmle0 = vec_mule(srv3, vfrac8_32_3); 
    vmlo0 = vec_mulo(srv3, vfrac8_32_3);
    vmle1 = vec_mule(srv4, vfrac8_3); 
    vmlo1 = vec_mulo(srv4, vfrac8_3); 
    vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    ve = vec_sra(vsume, u16_5);
    vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout_3 = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
    
    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           
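
    /* dst here is a compact width x width block (stride == width), so the four
       16-byte stores at offsets 0/16/32/48 lay down rows 0-7 of the 8x8 block
       back to back, two rows per store. */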

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 5>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[0]* ref[off0 + 15] + f[0] * ref[off0 + 16] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[1]* ref[off1 + 15] + f[1] * ref[off1 + 16] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[2]* ref[off2 + 15] + f[2] * ref[off2 + 16] + 16) >> 5);

            y=3;  off3 = offset[3]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[3]* ref[off3 + 0] + f[3] * ref[off3 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[3]* ref[off3 + 1] + f[3] * ref[off3 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[3]* ref[off3 + 2] + f[3] * ref[off3 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[3]* ref[off3 + 3] + f[3] * ref[off3 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[3]* ref[off3 + 15] + f[3] * ref[off3 + 16] + 16) >> 5);

            ...

            y=15;  off15 = offset[15]; x=0-15;
            dst[y * dstStride + 0] = (pixel)((f32[15]* ref[off15 + 0] + f[15] * ref[off15 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[15]* ref[off15 + 1] + f[15] * ref[off15 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[15]* ref[off15 + 2] + f[15] * ref[off15 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[15]* ref[off15 + 3] + f[15] * ref[off15 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 15] = (pixel)((f32[15]* ref[off15 + 15] + f[15] * ref[off15 + 16] + 16) >> 5);
        }
    */
    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(33, srcPix0); /* ref[0..15], ref = srcPix0 + 33 (2*width + 1) */
    vec_u8_t sv1 = vec_xl(49, srcPix0); /* ref[16..31] */
    vec_u8_t srv0 = sv0; /* ref[0 + x]; srv1..srv9 below shift the reference window by 1..9 bytes */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);

vec_u8_t vfrac16_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_1 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_4 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_5 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_6 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_8 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_9 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_10 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_12 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_13 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_14 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_1 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_4 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_5 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_6 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_8 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_9 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_10 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_12 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_13 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_14 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
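
    /* one_line() is the helper macro defined earlier in this file; judging by
       the expanded form in the <8, 5> specialization above, each call computes
       one 16-pixel row as
           vout = (pixel)(((32 - frac) * srvN + frac * srvN1 + 16) >> 5)
       via the same vec_mule/vec_mulo/vec_merge/vec_pack sequence, using the
       local u16_16 and u16_5 constants. */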

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv2, srv3, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv2, srv3, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv3, srv4, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv3, srv4, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv4, srv5, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv4, srv5, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv5, srv6, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv5, srv6, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv6, srv7, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv6, srv7, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv7, srv8, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv7, srv8, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv8, srv9, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 5>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    /*
        for (int y = 0; y < width; y++)
        {
            y=0;  off0 = offset[0]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[0]* ref[off0 + 0] + f[0] * ref[off0 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[0]* ref[off0 + 1] + f[0] * ref[off0 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[0]* ref[off0 + 2] + f[0] * ref[off0 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[0]* ref[off0 + 3] + f[0] * ref[off0 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[0]* ref[off0 + 31] + f[0] * ref[off0 + 32] + 16) >> 5);

            y=1;  off1 = offset[1]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[1]* ref[off1 + 0] + f[1] * ref[off1 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[1]* ref[off1 + 1] + f[1] * ref[off1 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[1]* ref[off1 + 2] + f[1] * ref[off1 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[1]* ref[off1 + 3] + f[1] * ref[off1 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[1]* ref[off1 + 31] + f[1] * ref[off1 + 32] + 16) >> 5);

            y=2;  off2 = offset[2]; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[2]* ref[off2 + 0] + f[2] * ref[off2 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[2]* ref[off2 + 1] + f[2] * ref[off2 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[2]* ref[off2 + 2] + f[2] * ref[off2 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[2]* ref[off2 + 3] + f[2] * ref[off2 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[2]* ref[off2 + 31] + f[2] * ref[off2 + 32] + 16) >> 5);

            ...
            
            y=15;  off15 = offset[15] = 8; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[15]* ref[off15 + 0] + f[15] * ref[off15 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[15]* ref[off15 + 1] + f[15] * ref[off15 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[15]* ref[off15 + 2] + f[15] * ref[off15 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[15]* ref[off15 + 3] + f[15] * ref[off15 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[15]* ref[off15 + 31] + f[15] * ref[off15 + 32] + 16) >> 5);
 
            ...

            y=31;  off31 = offset[31] = 17; x=0-31;
            dst[y * dstStride + 0] = (pixel)((f32[31]* ref[off31 + 0] + f[31] * ref[off31 + 1] + 16) >> 5);
            dst[y * dstStride + 1] = (pixel)((f32[31]* ref[off31 + 1] + f[31] * ref[off31 + 2] + 16) >> 5);
            dst[y * dstStride + 2] = (pixel)((f32[31]* ref[off31 + 2] + f[31] * ref[off31 + 3] + 16) >> 5);
            dst[y * dstStride + 3] = (pixel)((f32[31]* ref[off31 + 3] + f[31] * ref[off31 + 4] + 16) >> 5);
            ...
            dst[y * dstStride + 31] = (pixel)((f32[31]* ref[off31 + 31] + f[31] * ref[off31 + 32] + 16) >> 5);
        }
    */
    //mode 31 tables (mode 5 has the same intraPredAngle, 17, so the mode 31 tables apply here too):
    //int offset[32] = {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17};
    //int fraction[32] = {17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31, 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 15, 0};

    //vec_u8_t mask0={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f};
    vec_u8_t mask1={0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    vec_u8_t mask2={0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11};
    vec_u8_t mask3={0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12};
    vec_u8_t mask4={0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t mask5={0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t mask6={0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t mask7={0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t mask8={0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t mask9={0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
    vec_u8_t mask10={0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t mask11={0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t mask12={0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t mask13={0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t mask14={0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u8_t mask15={0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t sv0 = vec_xl(65, srcPix0); /* ref[0..15], ref = srcPix0 + 65 (2*width + 1) */
    vec_u8_t sv1 = vec_xl(81, srcPix0); /* ref[16..31] */
    vec_u8_t sv2 = vec_xl(97, srcPix0); /* ref[32..47] */
    vec_u8_t sv3 = vec_xl(113, srcPix0); /* ref[48..63] */

    vec_u8_t srv0 = sv0; /* ref[0 + x] for the left 16 pixels of each row; srv1..srvf shift by 1..15 bytes */
    vec_u8_t srv1 = vec_perm(sv0, sv1, mask1);
    vec_u8_t srv2 = vec_perm(sv0, sv1, mask2);
    vec_u8_t srv3 = vec_perm(sv0, sv1, mask3);
    vec_u8_t srv4 = vec_perm(sv0, sv1, mask4);
    vec_u8_t srv5 = vec_perm(sv0, sv1, mask5);
    vec_u8_t srv6 = vec_perm(sv0, sv1, mask6);
    vec_u8_t srv7 = vec_perm(sv0, sv1, mask7);
    vec_u8_t srv8 = vec_perm(sv0, sv1, mask8);
    vec_u8_t srv9 = vec_perm(sv0, sv1, mask9);
    vec_u8_t srva = vec_perm(sv0, sv1, mask10);
    vec_u8_t srvb = vec_perm(sv0, sv1, mask11);
    vec_u8_t srvc = vec_perm(sv0, sv1, mask12);
    vec_u8_t srvd = vec_perm(sv0, sv1, mask13);
    vec_u8_t srve = vec_perm(sv0, sv1, mask14);
    vec_u8_t srvf = vec_perm(sv0, sv1, mask15);

    vec_u8_t srv00 = sv1; /* ref[16 + x] for the right 16 pixels of each row */
    vec_u8_t srv10 = vec_perm(sv1, sv2, mask1);
    vec_u8_t srv20 = vec_perm(sv1, sv2, mask2);
    vec_u8_t srv30 = vec_perm(sv1, sv2, mask3);
    vec_u8_t srv40 = vec_perm(sv1, sv2, mask4);
    vec_u8_t srv50 = vec_perm(sv1, sv2, mask5);
    vec_u8_t srv60 = vec_perm(sv1, sv2, mask6);
    vec_u8_t srv70 = vec_perm(sv1, sv2, mask7);
    vec_u8_t srv80 = vec_perm(sv1, sv2, mask8);
    vec_u8_t srv90 = vec_perm(sv1, sv2, mask9);
    vec_u8_t srva0 = vec_perm(sv1, sv2, mask10);
    vec_u8_t srvb0 = vec_perm(sv1, sv2, mask11);
    vec_u8_t srvc0 = vec_perm(sv1, sv2, mask12);
    vec_u8_t srvd0 = vec_perm(sv1, sv2, mask13);
    vec_u8_t srve0 = vec_perm(sv1, sv2, mask14);
    vec_u8_t srvf0 = vec_perm(sv1, sv2, mask15);

    vec_u8_t srv000 = sv2;
    vec_u8_t srv100 = vec_perm(sv2, sv3, mask1);
    vec_u8_t srv200 = vec_perm(sv2, sv3, mask2);


vec_u8_t vfrac16_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_1 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_4 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_5 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_6 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_8 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_9 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_10 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_12 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_13 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_14 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_17 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_18 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_19 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_20 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_21 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_22 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_24 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_25 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_26 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_27 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_28 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_29 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_30 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

vec_u8_t vfrac16_32_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_1 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_4 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_5 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_6 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_8 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_9 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_10 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_12 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_13 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_14 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_17 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_18 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_19 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_20 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_21 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_22 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_24 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_25 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_26 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_27 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_28 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_29 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_30 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
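
/* fraction[31] == 0, so vfrac16_31/vfrac16_32_31 are {0}/{32}: the last row
   reduces to (32 * ref[off31 + x] + 16) >> 5, i.e. a plain copy of the
   reference samples at offset 17. */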

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;
    //int offset[32] = {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 17};

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv00, srv10, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv2, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv10, srv20, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv1, srv2, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv10, srv20, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv2, srv3, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv20, srv30, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv2, srv3, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv20, srv30, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv3, srv4, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv30, srv40, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv3, srv4, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv30, srv40, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv4, srv5, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv40, srv50, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv4, srv5, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv40, srv50, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv5, srv6, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv50, srv60, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv5, srv6, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv50, srv60, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv6, srv7, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv60, srv70, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv6, srv7, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv60, srv70,  vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv7, srv8, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv70, srv80, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv7, srv8, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv70, srv80, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv8, srv9, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv80, srv90, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

    one_line(srv9, srva, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv90, srva0, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv9, srva, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv90, srva0, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srva, srvb, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srva0, srvb0, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srva, srvb, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srva0, srvb0, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srvb, srvc, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srvb0, srvc0, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srvb, srvc, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srvb0, srvc0, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srvc, srvd, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srvc0, srvd0, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srvc, srvd, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srvc0, srvd0, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srvd, srve, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srvd0, srve0, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srvd, srve, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srvd0, srve0, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srve, srvf, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srve0, srvf0, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srve, srvf, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srve0, srvf0, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srvf, srv00, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srvf0, srv000, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srvf, srv00, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srvf0, srv000, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv00, srv10, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv000, srv100, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv10, srv20, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv100, srv200, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void one_ang_pred_altivec<4, 17>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3};
    vec_u8_t mask1={0x4, 0x5, 0x6, 0x7, 0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4};

    /*vec_u8_t srv_left=vec_xl(8, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_4={0x4, 0x2, 0x1, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);    
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);*/

    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x4, 0x2, 0x1, 0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
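
    /* Mode 17 has a negative intraPredAngle (-26), so refmask_4 prepends side
       samples in reverse order (the 0x4/0x2/0x1 indices) ahead of the main
       reference bytes (0x10..).  A sketch of the generic HEVC extension rule,
       assuming the usual invAngle table; 'invAngle', 'refMain' and 'refSide'
       are illustrative names, not from this file:

        for (int x = -1; x >= (width * intraPredAngle) >> 5; x--)
            refMain[x] = refSide[(x * invAngle + 128) >> 8];
    */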

    vec_u8_t vfrac4 = (vec_u8_t){6, 6, 6, 6, 12, 12, 12, 12, 18, 18, 18, 18, 24, 24, 24, 24};
    vec_u8_t vfrac4_32 = (vec_u8_t){26, 26, 26, 26, 20, 20, 20, 20, 14, 14, 14, 14, 8, 8, 8, 8};

    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); 
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); 
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 17>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, };
    vec_u8_t mask1={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, };
    vec_u8_t mask2={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
    vec_u8_t mask3={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
    vec_u8_t mask4={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
    vec_u8_t mask5={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
    vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
    vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

/*      
    vec_u8_t srv_left=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_8={0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(17, srcPix0); 
    vec_u8_t refmask_8={0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);
        

    /* fraction[0-7] */
vec_u8_t vfrac8_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac8_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac8_2 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac8_3 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 16, 16, 16, 16, 16, 16, 16, 16};

    /* 32 - fraction[0-7] */
vec_u8_t vfrac8_32_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac8_32_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac8_32_2 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac8_32_3 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 16, 16, 16, 16, 16, 16, 16, 16};

one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);
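
/* each one_line covers two 8-pixel rows packed into one 16-byte vector, so
   four calls with the paired fraction vectors produce the whole 8x8 block */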

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 17>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
    vec_u8_t mask1={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
    vec_u8_t mask2={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
    vec_u8_t mask3={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
    vec_u8_t mask4={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
    //vec_u8_t mask5={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
    vec_u8_t mask6={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
    vec_u8_t mask7={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
    vec_u8_t mask8={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
    vec_u8_t mask9={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
    //vec_u8_t mask10={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
    vec_u8_t mask11={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
    vec_u8_t mask12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
    vec_u8_t mask13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
    vec_u8_t mask14={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
    //vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
    vec_u8_t maskadd1_0={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*      
    vec_u8_t srv_left=vec_xl(32, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_16 ={0xf, 0xe, 0xc, 0xb, 0xa, 0x9, 0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(4, srcPix0);   
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0); /* top-left corner and above-row samples, used for the projected extension */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* left-column samples (srcPix0 + 2*width + 1), the main reference */
    vec_u8_t refmask_16={0xf, 0xe, 0xc, 0xb, 0xa, 0x9, 0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x00, 0x10, 0x11, 0x12};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(36, srcPix0);  
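    /* s1 picks up the main reference where refmask_16 stopped: the mask's last
       three indices consumed srcPix0[33..35], so the next 16 reference bytes
       start at srcPix0 + 36. */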

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = vec_perm(s0, s1, mask8); 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12= vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0;
    vec_u8_t srv2_add1 = srv1;
    vec_u8_t srv3_add1 = srv2;
    vec_u8_t srv4_add1 = srv3; 
    vec_u8_t srv5_add1 = srv3; 
    vec_u8_t srv6_add1 = srv4;
    vec_u8_t srv7_add1 = srv6; 
    vec_u8_t srv8_add1 = srv7;
    vec_u8_t srv9_add1 = srv8;
    vec_u8_t srv10_add1 = srv8;
    vec_u8_t srv11_add1 = srv9;
    vec_u8_t srv12_add1= srv11; 
    vec_u8_t srv13_add1 = srv12;
    vec_u8_t srv14_add1 = srv13; 
    vec_u8_t srv15_add1 = srv13;
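
    /* Rows whose integer offsets coincide for this angle (offset[4] ==
       offset[5], offset[9] == offset[10], offset[14] == offset[15]) reuse the
       same shuffle, which is why srv5/srv10/srv15 alias srv4/srv9/srv14 and
       the matching *_add1 values repeat. */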


    /* fraction[0-15] */
vec_u8_t vfrac16_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_1 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_2 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_4 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_5 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_6 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_8 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_9 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_10 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_12 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_13 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_14 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

        /* 32- fraction[0-15] */
vec_u8_t vfrac16_32_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_1 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_2 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_4 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_5 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_6 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_8 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_9 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_10 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_12 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_13 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_14 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[offset[y] + x] + f[y] * ref[offset[y] + x + 1] + 16) >> 5); */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 17>(pixel* dst, const pixel *srcPix0, int bFilter)
{
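    /* 32x32, mode 17 (intraPredAngle = -26). The per-row fraction constants defined
     * below follow HEVC's angular rule; a scalar sketch, with illustrative names only:
     *
     *     for (int y = 0; y < 32; y++)
     *         frac[y] = ((y + 1) * -26) & 31;   // 6, 12, 18, 24, 30, 4, ...
     *
     * The matching (32 - frac) weights live in the vfrac16_32_* vectors. */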
vec_u8_t mask0={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask1={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask2={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask3={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask4={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask5={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask6={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask7={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask8={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask9={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask10={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask11={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask12={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
vec_u8_t mask13={0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, };
vec_u8_t mask14={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
//vec_u8_t mask15={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
vec_u8_t mask16={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };

vec_u8_t mask17={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask18={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask19={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask20={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
//vec_u8_t mask21={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask22={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask23={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask24={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask25={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask26={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask27={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask28={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask29={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*    
    vec_u8_t refmask_32_0 ={0x1f, 0x1e, 0x1c, 0x1b, 0x1a, 0x19, 0x17, 0x16, 0x15, 0x14, 0x12, 0x11, 0x10, 0xf, 0xe, 0xc};
    vec_u8_t refmask_32_1 = {0xb, 0xa, 0x9, 0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t srv_left0=vec_xl(64, srcPix0); 
    vec_u8_t srv_left1=vec_xl(80, srcPix0);
    vec_u8_t srv_right=vec_xl(0, srcPix0);
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(7, srcPix0);   
    vec_u8_t s3 = vec_xl(16+7, srcPix0);        
*/
    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
    vec_u8_t refmask_32_0={0x1f, 0x1e, 0x1c, 0x1b, 0x1a, 0x19, 0x17, 0x16, 0x15, 0x14, 0x12, 0x11, 0x10, 0xf, 0xe, 0xc };
    vec_u8_t refmask_32_1={0xb, 0xa, 0x9, 0x7, 0x6, 0x5, 0x4, 0x2, 0x1, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(71, srcPix0);  
    vec_u8_t s3 = vec_xl(87, srcPix0);  
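    /* s0/s1 splice the above-row samples in reverse (the inverse-angle projection
       for this negative-angle mode drops some of them) onto the top-left sample and
       the start of the left column; s2/s3 continue down the left column, the main
       reference for this mode (srcPix0 + 65 onward). */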
        
    vec_u8_t srv0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv1 = vec_perm(s1, s2, mask1);
    vec_u8_t srv2 = vec_perm(s1, s2, mask2);
    vec_u8_t srv3 = vec_perm(s1, s2, mask3);
    vec_u8_t srv4 = vec_perm(s1, s2, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = vec_perm(s1, s2, mask6); 
    vec_u8_t srv7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv8 = vec_perm(s1, s2, mask8); 
    vec_u8_t srv9 = vec_perm(s1, s2, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = s1;
    vec_u8_t srv12 = vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;

    vec_u8_t srv16_0 = vec_perm(s2, s3, mask0); 
    vec_u8_t srv16_1 = vec_perm(s2, s3, mask1);
    vec_u8_t srv16_2 = vec_perm(s2, s3, mask2);
    vec_u8_t srv16_3 = vec_perm(s2, s3, mask3);
    vec_u8_t srv16_4 = vec_perm(s2, s3, mask4); 
    vec_u8_t srv16_5 = srv16_4;
    vec_u8_t srv16_6 = vec_perm(s2, s3, mask6); 
    vec_u8_t srv16_7 = vec_perm(s2, s3, mask7);
    vec_u8_t srv16_8 = vec_perm(s2, s3, mask8); 
    vec_u8_t srv16_9 = vec_perm(s2, s3, mask9);
    vec_u8_t srv16_10 = srv16_9;
    vec_u8_t srv16_11 = s2;
    vec_u8_t srv16_12 = vec_perm(s1, s2, mask12); 
    vec_u8_t srv16_13 = vec_perm(s1, s2, mask13);
    vec_u8_t srv16_14 = vec_perm(s1, s2, mask14); 
    vec_u8_t srv16_15 = srv16_14;
    /* mask map (row -> mask for the sliding reference window); rows 0-15: 0,1,2,3,4,4,6,7,8,9,9,s1,12,13,14,14; rows 16-31: 16,17,18,19,20,20,22,23,24,25,25,27,28,29,s0,s0 */

    vec_u8_t  srv16 = vec_perm(s0, s1, mask16);  
    vec_u8_t  srv17 = vec_perm(s0, s1, mask17);
    vec_u8_t  srv18 = vec_perm(s0, s1, mask18);
    vec_u8_t  srv19 = vec_perm(s0, s1, mask19);
    vec_u8_t  srv20 = vec_perm(s0, s1, mask20);
    vec_u8_t  srv21 = srv20;
    vec_u8_t  srv22 = vec_perm(s0, s1, mask22);
    vec_u8_t  srv23 = vec_perm(s0, s1, mask23);
    vec_u8_t  srv24 = vec_perm(s0, s1, mask24);
    vec_u8_t  srv25 = vec_perm(s0, s1, mask25);
    vec_u8_t  srv26 = srv25;
    vec_u8_t  srv27 = vec_perm(s0, s1, mask27);
    vec_u8_t  srv28 = vec_perm(s0, s1, mask28);
    vec_u8_t  srv29 = vec_perm(s0, s1, mask29);
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = vec_perm(s1, s2, mask16);  
    vec_u8_t  srv16_17 = vec_perm(s1, s2, mask17);
    vec_u8_t  srv16_18 = vec_perm(s1, s2, mask18);
    vec_u8_t  srv16_19 = vec_perm(s1, s2, mask19);
    vec_u8_t  srv16_20 = vec_perm(s1, s2, mask20);
    vec_u8_t  srv16_21 = srv16_20;
    vec_u8_t  srv16_22 = vec_perm(s1, s2, mask22);
    vec_u8_t  srv16_23 = vec_perm(s1, s2, mask23);
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask24);
    vec_u8_t  srv16_25 = vec_perm(s1, s2, mask25);
    vec_u8_t  srv16_26 = srv16_25;
    vec_u8_t  srv16_27 = vec_perm(s1, s2, mask27);
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask28);
    vec_u8_t  srv16_29 = vec_perm(s1, s2, mask29);
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;
        
    vec_u8_t srv0add1 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv1add1 = srv0;
    vec_u8_t srv2add1 = srv1;
    vec_u8_t srv3add1 = srv2;
    vec_u8_t srv4add1 = srv3; 
    vec_u8_t srv5add1 = srv3; 
    vec_u8_t srv6add1 = srv4;
    vec_u8_t srv7add1 = srv6; 
    vec_u8_t srv8add1 = srv7;
    vec_u8_t srv9add1 = srv8;
    vec_u8_t srv10add1 = srv8;
    vec_u8_t srv11add1 = srv9;
    vec_u8_t srv12add1 = srv11; 
    vec_u8_t srv13add1 = srv12;
    vec_u8_t srv14add1 = srv13; 
    vec_u8_t srv15add1 = srv13;

    /* add1 map (row -> vector reused for ref[x + 1]); row 0: perm via maskadd1_0; rows 1-15: 0,1,2,3,3,4,6,7,8,8,9,11,12,13,13; rows 16-31: 14,16,17,18,19,19,20,22,23,24,24,25,27,28,29,29 */

    vec_u8_t srv16add1_0 = vec_perm(s2, s3, maskadd1_0);
    vec_u8_t srv16add1_1 = srv16_0;
    vec_u8_t srv16add1_2 = srv16_1;
    vec_u8_t srv16add1_3 = srv16_2;
    vec_u8_t srv16add1_4 = srv16_3; 
    vec_u8_t srv16add1_5 = srv16_3;
    vec_u8_t srv16add1_6 = srv16_4; 
    vec_u8_t srv16add1_7 = srv16_6;
    vec_u8_t srv16add1_8 = srv16_7; 
    vec_u8_t srv16add1_9 = srv16_8;
    vec_u8_t srv16add1_10 = srv16_8;
    vec_u8_t srv16add1_11 = srv16_9;
    vec_u8_t srv16add1_12 = srv16_11; 
    vec_u8_t srv16add1_13 = srv16_12;
    vec_u8_t srv16add1_14 = srv16_13; 
    vec_u8_t srv16add1_15 = srv16_13;

    vec_u8_t  srv16add1 =  srv14;  
    vec_u8_t  srv17add1 = srv16;
    vec_u8_t  srv18add1 = srv17;
    vec_u8_t  srv19add1 = srv18;
    vec_u8_t  srv20add1 = srv19;
    vec_u8_t  srv21add1 = srv19;
    vec_u8_t  srv22add1 = srv20;
    vec_u8_t  srv23add1 = srv22;
    vec_u8_t  srv24add1 = srv23;
    vec_u8_t  srv25add1 = srv24;
    vec_u8_t  srv26add1 = srv24;
    vec_u8_t  srv27add1 = srv25;
    vec_u8_t  srv28add1 = srv27;
    vec_u8_t  srv29add1 = srv28;
    vec_u8_t  srv30add1 = srv29;
    vec_u8_t  srv31add1 = srv29;

    vec_u8_t  srv16add1_16 = srv16_14;   
    vec_u8_t  srv16add1_17 = srv16_16;
    vec_u8_t  srv16add1_18 = srv16_17;
    vec_u8_t  srv16add1_19 = srv16_18;
    vec_u8_t  srv16add1_20 = srv16_19;
    vec_u8_t  srv16add1_21 = srv16_19;
    vec_u8_t  srv16add1_22 = srv16_20;
    vec_u8_t  srv16add1_23 = srv16_22;
    vec_u8_t  srv16add1_24 = srv16_23;
    vec_u8_t  srv16add1_25 = srv16_24;
    vec_u8_t  srv16add1_26 = srv16_24;
    vec_u8_t  srv16add1_27 = srv16_25;
    vec_u8_t  srv16add1_28 = srv16_27;
    vec_u8_t  srv16add1_29 = srv16_28;
    vec_u8_t  srv16add1_30 = srv16_29;
    vec_u8_t  srv16add1_31 = srv16_29;

vec_u8_t vfrac16_0 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_1 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_2 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_4 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_5 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_6 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_8 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_9 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_10 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_12 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_13 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_14 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_1 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_2 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_4 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_5 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_6 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_8 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_9 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_10 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_12 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_13 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_14 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
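    /* Row 15 has fraction 0 and weight 32, so (32 * ref + 0 + 16) >> 5 reduces to a
       plain copy of the reference sample. */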
    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            
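    /* Rows 16-31: 26 * 16 is an exact multiple of 32, so the fraction sequence
       repeats with period 16 and the vfrac16_* constants above can be reused. */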

    one_line(srv16, srv16add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 16>(pixel* dst, const pixel *srcPix0, int bFilter)
{
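    /* 4x4, mode 16 (intraPredAngle = -21): a scalar sketch of the row fractions,
     * illustrative names only: frac[y] = ((y + 1) * -21) & 31 = 11, 22, 1, 12
     * for rows 0-3. The whole block fits in one vector. */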
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, };
    vec_u8_t mask1={0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, };

/*
    vec_u8_t srv_left=vec_xl(8, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_4={0x3, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);    
*/      
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x3, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* gather each row's 4-byte reference window; the masks encode the per-row offsets, which differ per mode */
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac4 = (vec_u8_t){11, 11, 11, 11, 22, 22, 22, 22, 1, 1, 1, 1, 12, 12, 12, 12};
    vec_u8_t vfrac4_32 = (vec_u8_t){21, 21, 21, 21, 10, 10, 10, 10, 31, 31, 31, 31, 20, 20, 20, 20};
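    /* vfrac4 carries row 0's fraction in lanes 0-3, row 1's in lanes 4-7, and so on,
       so a single multiply weights the whole 4x4 block. */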

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); /* even lanes: weighted sum plus rounding bias */
    vec_u16_t ve = vec_sra(vsume, u16_5);                     /* >> 5 */
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); /* odd lanes: weighted sum plus rounding bias */
    vec_u16_t vo = vec_sra(vsumo, u16_5);                     /* >> 5 */
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo)); /* re-interleave even/odd lanes and narrow back to bytes */

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 16>(pixel* dst, const pixel *srcPix0, int bFilter)
{
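    /* 8x8, mode 16: two rows per vector. vfrac8_N packs the fractions for rows 2N
     * and 2N + 1 (11/22, 1/12, 23/2, 13/24), and vfrac8_32_N holds the matching
     * 32 - fraction weights. */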
vec_u8_t mask0={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
vec_u8_t mask1={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, };
vec_u8_t mask2={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask3={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
vec_u8_t mask4={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask5={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

/*      
    vec_u8_t srv_left=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_8={0x8, 0x6, 0x5, 0x3, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0); /* top-left sample plus the above row */
    vec_u8_t srv_right=vec_xl(17, srcPix0); /* left column (starts at 2 * width + 1 = 17) */
    vec_u8_t refmask_8={0x8, 0x6, 0x5, 0x3, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);


vec_u8_t vfrac8_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac8_1 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac8_2 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac8_3 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24};

vec_u8_t vfrac8_32_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac8_32_1 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac8_32_2 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac8_32_3 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 8, 8, 8, 8, 8, 8, 8, 8};

    one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
    one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
    one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
    one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 16>(pixel* dst, const pixel *srcPix0, int bFilter)
{
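    /* 16x16, mode 16: one vector per row; frac[y] = ((y + 1) * -21) & 31 gives
     * 11, 22, 1, 12, 23, 2, ... Rows whose integer offset repeats share a
     * reference window (e.g. srv2 = srv1), which is why several of the masks
     * below are commented out. */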
vec_u8_t mask0={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask1={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
//vec_u8_t mask2={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask3={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask4={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask5={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask6={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask7={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask8={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask9={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask10={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask11={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t maskadd1_0={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
/*vec_u8_t maskadd1_1={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t maskadd1_2={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t maskadd1_3={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t maskadd1_4={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t maskadd1_5={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t maskadd1_6={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t maskadd1_7={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_8={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_9={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_10={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_11={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_12={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_14={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_15={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
*/
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*    
    vec_u8_t srv_left=vec_xl(32, srcPix0);
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_16={0xf, 0xe, 0xc, 0xb, 0x9, 0x8, 0x6, 0x5, 0x3, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(6, srcPix0);   
*/      
    vec_u8_t srv_left=vec_xl(0, srcPix0); /* top-left sample plus the above row */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* left column (starts at 2 * width + 1 = 33) */
    vec_u8_t refmask_16={0xf, 0xe, 0xc, 0xb, 0x9, 0x8, 0x6, 0x5, 0x3, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(38, srcPix0);

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = srv1;
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7; 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = srv10;
    vec_u8_t srv12 = vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = srv13; 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0;
    vec_u8_t srv2_add1 = srv0;
    vec_u8_t srv3_add1 = srv1;
    vec_u8_t srv4_add1 = srv3; 
    vec_u8_t srv5_add1 = srv3; 
    vec_u8_t srv6_add1 = srv4;
    vec_u8_t srv7_add1 = srv6; 
    vec_u8_t srv8_add1 = srv6;
    vec_u8_t srv9_add1 = srv7;
    vec_u8_t srv10_add1 = srv9;
    vec_u8_t srv11_add1 = srv9;
    vec_u8_t srv12_add1 = srv10; 
    vec_u8_t srv13_add1 = srv12;
    vec_u8_t srv14_add1 = srv12; 
    vec_u8_t srv15_add1 = srv13;
vec_u8_t vfrac16_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_6 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_10 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_14 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

vec_u8_t vfrac16_32_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_6 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_10 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_14 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 16>(pixel* dst, const pixel *srcPix0, int bFilter)
{
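    /* 32x32, mode 16: same per-row weighting across a 32-sample-wide block, so
     * each row takes two 16-byte stores (offsets 32 * y and 32 * y + 16). */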
vec_u8_t mask0={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask1={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask2={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask3={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask4={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask5={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask6={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask7={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
//vec_u8_t mask8={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
vec_u8_t mask9={0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, };
vec_u8_t mask10={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
//vec_u8_t mask11={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
vec_u8_t mask12={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask13={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
//vec_u8_t mask14={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask15={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };

vec_u8_t mask16={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
//vec_u8_t mask17={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask18={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask19={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask20={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask21={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask22={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask23={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask24={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask25={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask26={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask27={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask28={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask29={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*
    vec_u8_t refmask_32_0 = {0x1e, 0x1d, 0x1b, 0x1a, 0x18, 0x17, 0x15, 0x14, 0x12, 0x11, 0xf, 0xe, 0xc, 0xb, 0x9, 0x8, };
    vec_u8_t refmask_32_1 = {0x6, 0x5, 0x3, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t srv_left0=vec_xl(64, srcPix0); 
    vec_u8_t srv_left1=vec_xl(80, srcPix0);
    vec_u8_t srv_right=vec_xl(0, srcPix0);
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(12, srcPix0);  
    vec_u8_t s3 = vec_xl(16+12, srcPix0);       
*/
    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
    vec_u8_t refmask_32_0={0x1e, 0x1d, 0x1b, 0x1a, 0x18, 0x17, 0x15, 0x14, 0x12, 0x11, 0xf, 0xe, 0xc, 0xb, 0x9, 0x8};
    vec_u8_t refmask_32_1={0x6, 0x5, 0x3, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(76, srcPix0);  
    vec_u8_t s3 = vec_xl(92, srcPix0);  
        
    vec_u8_t srv0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv1 = vec_perm(s1, s2, mask1);
    vec_u8_t srv2 = srv1;
    vec_u8_t srv3 = vec_perm(s1, s2, mask3);
    vec_u8_t srv4 = vec_perm(s1, s2, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = s1; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7;
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = srv10;
    vec_u8_t srv12 = vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = srv13; 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);

    vec_u8_t srv16_0 = vec_perm(s2, s3, mask0); 
    vec_u8_t srv16_1 = vec_perm(s2, s3, mask1);
    vec_u8_t srv16_2 = srv16_1;
    vec_u8_t srv16_3 = vec_perm(s2, s3, mask3);
    vec_u8_t srv16_4 = vec_perm(s2, s3, mask4); 
    vec_u8_t srv16_5 = srv16_4;
    vec_u8_t srv16_6 = s2; 
    vec_u8_t srv16_7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv16_8 = srv16_7; 
    vec_u8_t srv16_9 = vec_perm(s1, s2, mask9);
    vec_u8_t srv16_10 = vec_perm(s1, s2, mask10);
    vec_u8_t srv16_11 = srv16_10;
    vec_u8_t srv16_12 = vec_perm(s1, s2, mask12); 
    vec_u8_t srv16_13 = vec_perm(s1, s2, mask13);
    vec_u8_t srv16_14 = srv16_13; 
    vec_u8_t srv16_15 = vec_perm(s1, s2, mask15);

    /* mask map (row -> mask for the sliding reference window); rows 0-15: 0,1,1,3,4,4,s1,7,7,9,10,10,12,13,13,15; rows 16-31: 16,16,18,19,19,21,22,22,24,25,25,27,28,28,s0,s0 */

    vec_u8_t  srv16 = vec_perm(s0, s1, mask16);  
    vec_u8_t  srv17 = srv16;
    vec_u8_t  srv18 = vec_perm(s0, s1, mask18);
    vec_u8_t  srv19 = vec_perm(s0, s1, mask19);
    vec_u8_t  srv20 = srv19;
    vec_u8_t  srv21 = vec_perm(s0, s1, mask21);
    vec_u8_t  srv22 = vec_perm(s0, s1, mask22);
    vec_u8_t  srv23 = srv22;
    vec_u8_t  srv24 = vec_perm(s0, s1, mask24);
    vec_u8_t  srv25 = vec_perm(s0, s1, mask25);
    vec_u8_t  srv26 = srv25;
    vec_u8_t  srv27 = vec_perm(s0, s1, mask27);
    vec_u8_t  srv28 = vec_perm(s0, s1, mask28);
    vec_u8_t  srv29 = srv28;
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = vec_perm(s1, s2, mask16);  
    vec_u8_t  srv16_17 = srv16_16;
    vec_u8_t  srv16_18 = vec_perm(s1, s2, mask18);
    vec_u8_t  srv16_19 = vec_perm(s1, s2, mask19);
    vec_u8_t  srv16_20 = srv16_19;
    vec_u8_t  srv16_21 = vec_perm(s1, s2, mask21);
    vec_u8_t  srv16_22 = vec_perm(s1, s2, mask22);
    vec_u8_t  srv16_23 = srv16_22;
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask24);
    vec_u8_t  srv16_25 = vec_perm(s1, s2, mask25);
    vec_u8_t  srv16_26 = srv16_25;
    vec_u8_t  srv16_27 = vec_perm(s1, s2, mask27);
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask28);
    vec_u8_t  srv16_29 = srv16_28;
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;

    vec_u8_t srv0add1 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv1add1 = srv0;
    vec_u8_t srv2add1 = srv0;
    vec_u8_t srv3add1 = srv1;
    vec_u8_t srv4add1 = srv3; 
    vec_u8_t srv5add1 = srv3; 
    vec_u8_t srv6add1 = srv4;
    vec_u8_t srv7add1 = s1; 
    vec_u8_t srv8add1 = s1;
    vec_u8_t srv9add1 = srv7;
    vec_u8_t srv10add1 = srv9;
    vec_u8_t srv11add1 = srv9;
    vec_u8_t srv12add1 = srv10; 
    vec_u8_t srv13add1 = srv12;
    vec_u8_t srv14add1 = srv12; 
    vec_u8_t srv15add1 = srv13;

    vec_u8_t srv16add1_0 = vec_perm(s2, s3, maskadd1_0);
    vec_u8_t srv16add1_1 = srv16_0;
    vec_u8_t srv16add1_2 = srv16_0;
    vec_u8_t srv16add1_3 = srv16_1;
    vec_u8_t srv16add1_4 = srv16_3; 
    vec_u8_t srv16add1_5 = srv16_3;
    vec_u8_t srv16add1_6 = srv16_4; 
    vec_u8_t srv16add1_7 = s2;
    vec_u8_t srv16add1_8 = s2; 
    vec_u8_t srv16add1_9 = srv16_7;
    vec_u8_t srv16add1_10 = srv16_9;
    vec_u8_t srv16add1_11 = srv16_9;
    vec_u8_t srv16add1_12 = srv16_10; 
    vec_u8_t srv16add1_13 = srv16_12;
    vec_u8_t srv16add1_14 = srv16_12; 
    vec_u8_t srv16add1_15 = srv16_13;

    /* add1 map (row -> vector reused for ref[x + 1]); row 0: perm via maskadd1_0; rows 1-15: 0,0,1,3,3,4,s1,s1,7,9,9,10,12,12,13; rows 16-31: 15,15,16,18,18,19,21,21,22,24,24,25,27,27,28,28 */

    vec_u8_t  srv16add1 = srv15;  
    vec_u8_t  srv17add1 = srv15;
    vec_u8_t  srv18add1 = srv16;
    vec_u8_t  srv19add1 = srv18;
    vec_u8_t  srv20add1 = srv18;
    vec_u8_t  srv21add1 = srv19;
    vec_u8_t  srv22add1 = srv21;
    vec_u8_t  srv23add1 = srv21;
    vec_u8_t  srv24add1 = srv22;
    vec_u8_t  srv25add1 = srv24;
    vec_u8_t  srv26add1 = srv24;
    vec_u8_t  srv27add1 = srv25;
    vec_u8_t  srv28add1 = srv27;
    vec_u8_t  srv29add1 = srv27;
    vec_u8_t  srv30add1 = srv28;
    vec_u8_t  srv31add1 = srv28;

    vec_u8_t  srv16add1_16 = srv16_15;   
    vec_u8_t  srv16add1_17 = srv16_15;
    vec_u8_t  srv16add1_18 = srv16_16;
    vec_u8_t  srv16add1_19 = srv16_18;
    vec_u8_t  srv16add1_20 = srv16_18;
    vec_u8_t  srv16add1_21 = srv16_19;
    vec_u8_t  srv16add1_22 = srv16_21;
    vec_u8_t  srv16add1_23 = srv16_21;
    vec_u8_t  srv16add1_24 = srv16_22;
    vec_u8_t  srv16add1_25 = srv16_24;
    vec_u8_t  srv16add1_26 = srv16_24;
    vec_u8_t  srv16add1_27 = srv16_25;
    vec_u8_t  srv16add1_28 = srv16_27;
    vec_u8_t  srv16add1_29 = srv16_27;
    vec_u8_t  srv16add1_30 = srv16_28;
    vec_u8_t  srv16add1_31 = srv16_28;

vec_u8_t vfrac16_0 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_6 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_10 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_14 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_17 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_18 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_20 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_21 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_22 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_24 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_25 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_26 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_28 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_29 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_30 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_6 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_10 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_14 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_17 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_18 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_20 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_21 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_22 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_24 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_25 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_26 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_28 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_29 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_30 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

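    /* rows 16-31: recompute into the same vout_* registers, then store the
       lower half of the block */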
    one_line(srv16, srv16add1, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 15>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, };
    vec_u8_t mask1={0x3, 0x4, 0x5, 0x6, 0x2, 0x3, 0x4, 0x5, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, };

/*
    vec_u8_t srv_left=vec_xl(8, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_4={0x4, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);    
*/      
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x4, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);
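    /* srv is the projected left reference: bytes 0x4 and 0x2 pick the two
       above-row samples that the negative angle projects onto the reference
       (invAngle stepping), 0x00 is the top-left, and 0x10-0x13 take the four
       left-column samples loaded from srcPix0 + 9. */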

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t vfrac4 = (vec_u8_t){15, 15, 15, 15, 30, 30, 30, 30, 13, 13, 13, 13, 28, 28, 28, 28};
    vec_u8_t vfrac4_32 = (vec_u8_t){17, 17, 17, 17, 2, 2, 2, 2, 19, 19, 19, 19, 4, 4, 4, 4};
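    /* mode 15 uses intraPredAngle -17, so frac(y) = ((y + 1) * -17) & 31 gives
       15, 30, 13, 28 for the four rows; vfrac4_32 holds the matching 32-frac. */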

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));
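    /* vec_mule/vec_mulo give the even/odd byte lanes as u16 products;
       mergeh/mergel re-interleave them so vec_pack restores pixel order.
       This is the sequence the one_line() macro wraps for the larger blocks. */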

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 15>(pixel* dst, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask1={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
vec_u8_t mask2={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask3={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask4={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask5={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

/*      
    vec_u8_t srv_left=vec_xl(16, srcPix0);
    vec_u8_t srv_right=vec_xl(0, srcPix0);
    vec_u8_t refmask_8={0x8, 0x6, 0x4, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(17, srcPix0); 
    vec_u8_t refmask_8={0x8, 0x6, 0x4, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);


vec_u8_t vfrac8_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac8_1 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac8_2 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac8_3 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac8_32_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac8_32_1 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac8_32_2 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac8_32_3 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 8, 8, 8, 8, 8, 8, 8, 8};
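/* each vector packs two 8-pixel rows, so the fractions pair consecutive rows:
   15/30, 13/28, 11/26, 9/24, with vfrac8_32_* holding the 32-frac values */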

one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 15>(pixel* dst, const pixel *srcPix0, int bFilter)
{

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t mask0={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask1={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask2={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask3={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
//vec_u8_t mask4={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask5={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask6={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask7={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask8={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask9={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask10={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask11={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
/*vec_u8_t maskadd1_1={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t maskadd1_2={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t maskadd1_3={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t maskadd1_4={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t maskadd1_5={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_6={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_7={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_8={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_9={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_10={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_11={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_12={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_14={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_15={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };*/

/*      
    vec_u8_t srv_left=vec_xl(32, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0);
    vec_u8_t refmask_16={0xf, 0xd, 0xb, 0x9, 0x8, 0x6, 0x4, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(8, srcPix0);   
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0); /* top-left + above row */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* left column, srcPix0 + 2*width + 1 */
    vec_u8_t refmask_16={0xf, 0xd, 0xb, 0x9, 0x8, 0x6, 0x4, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(40, srcPix0);  
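    /* s0 packs the projected above-row samples, the top-left, and the start of
       the left column; s1 continues the left column, so together they cover
       every window the sixteen row masks below can select. */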

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = srv1;
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = srv3; 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = srv5; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7; 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12 = srv11;
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = srv13; 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0;
    vec_u8_t srv2_add1 = srv0;
    vec_u8_t srv3_add1 = srv1;
    vec_u8_t srv4_add1 = srv1; 
    vec_u8_t srv5_add1 = srv3; 
    vec_u8_t srv6_add1 = srv3;
    vec_u8_t srv7_add1 = srv5; 
    vec_u8_t srv8_add1 = srv5;
    vec_u8_t srv9_add1 = srv7;
    vec_u8_t srv10_add1 = srv7;
    vec_u8_t srv11_add1 = srv9;
    vec_u8_t srv12_add1 = srv9;
    vec_u8_t srv13_add1 = srv11;
    vec_u8_t srv14_add1 = srv11; 
    vec_u8_t srv15_add1 = srv13;
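    /* at angle -17 the integer reference offset decreases every other row
       (-1, -2, -2, -3, -3, ...), so row pairs share a permute mask and the
       later srvN_add1 vectors (the ref[.. + 1] windows) reuse earlier rows'
       base vectors */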
        
vec_u8_t vfrac16_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_1 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_4 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_5 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_6 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_9 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_10 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_12 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_13 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_14 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

vec_u8_t vfrac16_32_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_1 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_4 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_5 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_6 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_9 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_10 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_12 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_13 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_14 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 15>(pixel* dst, const pixel *srcPix0, int bFilter)
{
//vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask1={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
//vec_u8_t mask2={0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, };
vec_u8_t mask3={0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, };
//vec_u8_t mask4={0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, };
vec_u8_t mask5={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
//vec_u8_t mask6={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };
vec_u8_t mask7={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
//vec_u8_t mask8={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask9={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
//vec_u8_t mask10={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask11={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
//vec_u8_t mask12={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask13={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
//vec_u8_t mask14={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask15={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };

vec_u8_t mask16={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask17={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask18={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
//vec_u8_t mask19={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask20={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask21={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask22={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask23={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask24={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask25={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask26={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask27={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask28={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask29={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*
    vec_u8_t srv_left0=vec_xl(64, srcPix0); 
    vec_u8_t srv_left1=vec_xl(80, srcPix0); 
    vec_u8_t refmask_32 = {0x1e, 0x1c, 0x1a, 0x18, 0x17, 0x15, 0x13, 0x11, 0xf, 0xd, 0xb, 0x9, 0x8, 0x6, 0x4, 0x2};
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32);   
    vec_u8_t s1 = vec_xl(0, srcPix0);
    vec_u8_t s2 = vec_xl(16, srcPix0);  
    vec_u8_t s3 = vec_xl(32, srcPix0);  
 */   
    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
    vec_u8_t refmask_32_0={0x1e, 0x1c, 0x1a, 0x18, 0x17, 0x15, 0x13, 0x11, 0xf, 0xd, 0xb, 0x9, 0x8, 0x6, 0x4, 0x2};
    vec_u8_t refmask_32_1={0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};
    vec_u8_t s0 = vec_perm(srv_left0, srv_left1, refmask_32_0); 
    vec_u8_t s1 = vec_perm(srv_left0, srv_right, refmask_32_1); 
    vec_u8_t s2 = vec_xl(80, srcPix0);  
    vec_u8_t s3 = vec_xl(96, srcPix0);  
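    /* s0 = sixteen projected above-row samples, s1 = top-left + left[0..14],
       s2/s3 = the rest of the left column; srvN and srv16_N below are the
       left and right 16-pixel halves of row N */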
        
    vec_u8_t srv0 = s1; 
    vec_u8_t srv1 = vec_perm(s0, s1, mask1);
    vec_u8_t srv2 = srv1;
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = srv3; 
    vec_u8_t srv5 = vec_perm(s0, s1, mask5);
    vec_u8_t srv6 = srv5; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7;
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = vec_perm(s0, s1, mask11);
    vec_u8_t srv12 = srv11;
    vec_u8_t srv13 = vec_perm(s0, s1, mask13);
    vec_u8_t srv14 = srv13; 
    vec_u8_t srv15 = vec_perm(s0, s1, mask15);

    vec_u8_t srv16_0 = s2; 
    vec_u8_t srv16_1 = vec_perm(s1, s2, mask1);
    vec_u8_t srv16_2 = srv16_1;
    vec_u8_t srv16_3 = vec_perm(s1, s2, mask3);
    vec_u8_t srv16_4 = srv16_3; 
    vec_u8_t srv16_5 = vec_perm(s1, s2, mask5);
    vec_u8_t srv16_6 = srv16_5; 
    vec_u8_t srv16_7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv16_8 = srv16_7; 
    vec_u8_t srv16_9 = vec_perm(s1, s2, mask9);
    vec_u8_t srv16_10 = srv16_9;
    vec_u8_t srv16_11 = vec_perm(s1, s2, mask11);
    vec_u8_t srv16_12 = srv16_11;
    vec_u8_t srv16_13 = vec_perm(s1, s2, mask13);
    vec_u8_t srv16_14 = srv16_13; 
    vec_u8_t srv16_15 = vec_perm(s1, s2, mask15);

    // base vector per row 0-31: s1, srv1,srv1, srv3,srv3, srv5,srv5, srv7,srv7, srv9,srv9, srv11,srv11, srv13,srv13, srv15, srv16,srv16, srv18,srv18, srv20,srv20, srv22,srv22, srv24,srv24, srv26,srv26, srv28,srv28, s0, s0

    vec_u8_t  srv16 = vec_perm(s0, s1, mask16);  
    vec_u8_t  srv17 = srv16;
    vec_u8_t  srv18 = vec_perm(s0, s1, mask18);
    vec_u8_t  srv19 = srv18;
    vec_u8_t  srv20 = vec_perm(s0, s1, mask20);
    vec_u8_t  srv21 = srv20;
    vec_u8_t  srv22 = vec_perm(s0, s1, mask22);
    vec_u8_t  srv23 = srv22;
    vec_u8_t  srv24 = vec_perm(s0, s1, mask24);
    vec_u8_t  srv25 = srv24;
    vec_u8_t  srv26 = vec_perm(s0, s1, mask26);
    vec_u8_t  srv27 = srv26;
    vec_u8_t  srv28 = vec_perm(s0, s1, mask28);
    vec_u8_t  srv29 = srv28;
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = vec_perm(s1, s2, mask16);  
    vec_u8_t  srv16_17 = srv16_16;
    vec_u8_t  srv16_18 = vec_perm(s1, s2, mask18);
    vec_u8_t  srv16_19 = srv16_18;
    vec_u8_t  srv16_20 = vec_perm(s1, s2, mask20);
    vec_u8_t  srv16_21 = srv16_20;
    vec_u8_t  srv16_22 = vec_perm(s1, s2, mask22);
    vec_u8_t  srv16_23 = srv16_22;
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask24);
    vec_u8_t  srv16_25 = srv16_24;
    vec_u8_t  srv16_26 = vec_perm(s1, s2, mask26);
    vec_u8_t  srv16_27 = srv16_26;
    vec_u8_t  srv16_28 = vec_perm(s1, s2, mask28);
    vec_u8_t  srv16_29 = srv16_28;
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;

    vec_u8_t srv0add1 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv1add1 = s1;
    vec_u8_t srv2add1 = s1;
    vec_u8_t srv3add1 = srv1;
    vec_u8_t srv4add1 = srv1; 
    vec_u8_t srv5add1 = srv3; 
    vec_u8_t srv6add1 = srv3;
    vec_u8_t srv7add1 = srv6; 
    vec_u8_t srv8add1 = srv6;
    vec_u8_t srv9add1 = srv7;
    vec_u8_t srv10add1 = srv7;
    vec_u8_t srv11add1 = srv9;
    vec_u8_t srv12add1 = srv9;
    vec_u8_t srv13add1 = srv11;
    vec_u8_t srv14add1 = srv11; 
    vec_u8_t srv15add1 = srv14;

    vec_u8_t srv16add1_0 = vec_perm(s2, s3, maskadd1_0);
    vec_u8_t srv16add1_1 = s2;
    vec_u8_t srv16add1_2 = s2;
    vec_u8_t srv16add1_3 = srv16_1;
    vec_u8_t srv16add1_4 = srv16_1; 
    vec_u8_t srv16add1_5 = srv16_3;
    vec_u8_t srv16add1_6 = srv16_3; 
    vec_u8_t srv16add1_7 = srv16_6;
    vec_u8_t srv16add1_8 = srv16_6; 
    vec_u8_t srv16add1_9 = srv16_7;
    vec_u8_t srv16add1_10 = srv16_7;
    vec_u8_t srv16add1_11 = srv16_9;
    vec_u8_t srv16add1_12 = srv16_9;
    vec_u8_t srv16add1_13 = srv16_11;
    vec_u8_t srv16add1_14 = srv16_11; 
    vec_u8_t srv16add1_15 = srv16_14;

    // "+1" vector per row (row 0 comes from maskadd1_0): s1,s1, srv1,srv1, srv3,srv3, srv6,srv6, srv7,srv7, srv9,srv9, srv11,srv11, srv14, srv15,srv15, srv16,srv16, srv18,srv18, srv20,srv20, srv22,srv22, srv24,srv24, srv26,srv26, srv28,srv28

    vec_u8_t  srv16add1 = srv15;  
    vec_u8_t  srv17add1 = srv15;
    vec_u8_t  srv18add1 = srv16;
    vec_u8_t  srv19add1 = srv16;
    vec_u8_t  srv20add1 = srv18;
    vec_u8_t  srv21add1 = srv18;
    vec_u8_t  srv22add1 = srv20;
    vec_u8_t  srv23add1 = srv20;
    vec_u8_t  srv24add1 = srv22;
    vec_u8_t  srv25add1 = srv22;
    vec_u8_t  srv26add1 = srv24;
    vec_u8_t  srv27add1 = srv24;
    vec_u8_t  srv28add1 = srv26;
    vec_u8_t  srv29add1 = srv26;
    vec_u8_t  srv30add1 = srv28;
    vec_u8_t  srv31add1 = srv28;

    vec_u8_t  srv16add1_16 = srv16_15;   
    vec_u8_t  srv16add1_17 = srv16_15;
    vec_u8_t  srv16add1_18 = srv16_16;
    vec_u8_t  srv16add1_19 = srv16_16;
    vec_u8_t  srv16add1_20 = srv16_18;
    vec_u8_t  srv16add1_21 = srv16_18;
    vec_u8_t  srv16add1_22 = srv16_20;
    vec_u8_t  srv16add1_23 = srv16_20;
    vec_u8_t  srv16add1_24 = srv16_22;
    vec_u8_t  srv16add1_25 = srv16_22;
    vec_u8_t  srv16add1_26 = srv16_24;
    vec_u8_t  srv16add1_27 = srv16_24;
    vec_u8_t  srv16add1_28 = srv16_26;
    vec_u8_t  srv16add1_29 = srv16_26;
    vec_u8_t  srv16add1_30 = srv16_28;
    vec_u8_t  srv16add1_31 = srv16_28;

vec_u8_t vfrac16_0 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_1 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_4 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_5 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_6 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_9 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_10 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_12 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_13 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_14 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_17 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_18 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_19 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_20 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_21 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_22 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_24 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_25 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_26 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_27 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_28 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_29 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_30 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_1 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_4 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_5 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_6 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_9 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_10 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_12 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_13 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_14 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_17 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_18 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_19 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_20 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_21 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_22 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_24 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_25 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_26 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_27 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_28 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_29 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_30 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

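    /* rows 16-31: same pattern, reusing the vout_* registers */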
    one_line(srv16, srv16add1, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void one_ang_pred_altivec<4, 14>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, };
    vec_u8_t mask1={0x2, 0x3, 0x4, 0x5, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, };

/*
    vec_u8_t srv_left=vec_xl(8, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_4={0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);    
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);
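    /* projected reference for angle -13: srcPix0[2] is the above-row sample
       projected past the top-left (0x00), and 0x10-0x13 take the left column
       loaded from srcPix0 + 9 */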

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); /* need to update for each mode y=0, offset[0]; y=1, offset[1]; y=2, offset[2]...*/
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t vfrac4 = (vec_u8_t){19, 19, 19, 19, 6, 6, 6, 6, 25, 25, 25, 25, 12, 12, 12, 12};
    vec_u8_t vfrac4_32 = (vec_u8_t){13, 13, 13, 13, 26, 26, 26, 26, 7, 7, 7, 7, 20, 20, 20, 20};
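    /* mode 14 uses intraPredAngle -13: frac(y) = ((y + 1) * -13) & 31 gives
       19, 6, 25, 12; vfrac4_32 is the matching 32-frac. */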

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 14>(pixel* dst, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask1={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, };
vec_u8_t mask2={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask3={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask4={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask5={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

/*      
    vec_u8_t srv_left=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_8={0x7, 0x5, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(17, srcPix0); 
    vec_u8_t refmask_8={0x7, 0x5, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);


vec_u8_t vfrac8_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac8_1 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac8_2 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac8_3 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac8_32_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac8_32_1 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac8_32_2 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac8_32_3 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 8, 8, 8, 8, 8, 8, 8, 8};
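/* two 8-pixel rows per vector again: row fractions 19/6, 25/12, 31/18, 5/24,
   i.e. ((y + 1) * -13) & 31, with vfrac8_32_* holding 32-frac */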

one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 14>(pixel* dst, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
//vec_u8_t mask1={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask2={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask3={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask4={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask5={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask6={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask7={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask8={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask9={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask10={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask11={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask12={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask14={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
/*vec_u8_t maskadd1_1={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t maskadd1_2={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_3={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t maskadd1_4={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_5={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_6={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_7={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_8={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_9={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_10={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_11={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_15={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };*/

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*    
    vec_u8_t srv_left=vec_xl(32, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_16={0xf, 0xc, 0xa, 0x7, 0x5, 0x2, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(10, srcPix0);  
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0);   /* top-left + left column (source of the projected negative-offset references) */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* above row, at srcPix0 + 2*width + 1 */
    vec_u8_t refmask_16={0xf, 0xc, 0xa, 0x7, 0x5, 0x2, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
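    /* refmask_16 builds ref[-7..-1] from projected left pixels
       (srcPix0[15, 12, 10, 7, 5, 2]) plus the top-left; -7 is the largest
       row offset for this mode: (16 * -13) >> 5 = -7. */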
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(42, srcPix0);  
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = srv2;
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = srv4; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7; 
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = srv9;
    vec_u8_t srv12= vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = srv12;
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0_add1;
    vec_u8_t srv2_add1 = srv0;
    vec_u8_t srv3_add1 = srv0;
    vec_u8_t srv4_add1 = srv2; 
    vec_u8_t srv5_add1 = srv2; 
    vec_u8_t srv6_add1 = srv2;
    vec_u8_t srv7_add1 = srv4; 
    vec_u8_t srv8_add1 = srv4;
    vec_u8_t srv9_add1 = srv7;
    vec_u8_t srv10_add1 = srv7;
    vec_u8_t srv11_add1 = srv7;
    vec_u8_t srv12_add1= srv9; 
    vec_u8_t srv13_add1 = srv9;
    vec_u8_t srv14_add1 = srv12; 
    vec_u8_t srv15_add1 = srv12;
vec_u8_t vfrac16_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_5 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_6 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_9 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_10 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_13 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_14 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

vec_u8_t vfrac16_32_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_5 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_6 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_9 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_10 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_13 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_14 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
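
    /* The per-row weights are consistent with intraPredAngle = -13 (mode 14):
       frac[y] = ((y + 1) * -13) & 31 -> 19, 6, 25, 12, 31, 18, 5, 24, ...
       and the vfrac16_32_* vectors hold 32 - frac[y]. */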
    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
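    /* one_line(ref, ref_add1, f32, f, out) is assumed to expand to the
       widening-multiply / add-16 / shift-right-5 / pack sequence written out
       explicitly in one_ang_pred_altivec<4, 13> below, using the vmle/vmlo
       temporaries declared here. */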
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
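
/* For reference: a minimal scalar sketch of what the vector code above
   computes, assuming intraPredAngle = -13 for mode 14 and a reference
   array laid out as built by refmask_16 (projected left pixels at the
   negative indices). Illustrative only; not compiled into the build. */
#if 0
static void one_ang_pred_scalar_16x16_mode14(pixel* dst, const pixel* ref)
{
    for (int y = 0; y < 16; y++)
    {
        int delta = (y + 1) * -13; /* intraPredAngle for mode 14 */
        int off   = delta >> 5;    /* integer reference offset (arithmetic shift floors) */
        int frac  = delta & 31;    /* 5-bit fraction, matches the vfrac16 vectors above */
        for (int x = 0; x < 16; x++)
            dst[y * 16 + x] = (pixel)(((32 - frac) * ref[off + x]
                                     + frac * ref[off + x + 1] + 16) >> 5);
    }
}
#endif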

template<>
void one_ang_pred_altivec<32, 14>(pixel* dst, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
//vec_u8_t mask1={0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, };
vec_u8_t mask2={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
//vec_u8_t mask3={0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, };
vec_u8_t mask4={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
//vec_u8_t mask5={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
//vec_u8_t mask6={0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, };
vec_u8_t mask7={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
//vec_u8_t mask8={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };
vec_u8_t mask9={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
//vec_u8_t mask10={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
//vec_u8_t mask11={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask12={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask13={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask14={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
//vec_u8_t mask15={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };

//vec_u8_t mask16={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask17={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask18={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask19={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask20={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask21={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask22={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask23={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask24={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask25={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask26={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask27={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask28={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask29={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*
    vec_u8_t srv_left0 = vec_xl(64, srcPix0); 
    vec_u8_t srv_left1 = vec_xl(80, srcPix0); 
    vec_u8_t srv_right = vec_xl(0, srcPix0);;
    vec_u8_t refmask_32_0 ={0x1e, 0x1b, 0x19, 0x16, 0x14, 0x11, 0xf, 0xc, 0xa, 0x7, 0x5, 0x2, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t refmask_32_1 ={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x10, 0x11, 0x12, 0x13};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );
    vec_u8_t s1 = vec_xl(4, srcPix0);;
    vec_u8_t s2 = vec_xl(20, srcPix0);  
 */
    vec_u8_t srv_left0=vec_xl(0, srcPix0);  /* top-left + left column (source of the projected references) */
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); /* above row, at srcPix0 + 2*width + 1 */
    vec_u8_t refmask_32_0={0x1e, 0x1b, 0x19, 0x16, 0x14, 0x11, 0xf, 0xc, 0xa, 0x7, 0x5, 0x2, 0x00, 0x0, 0x0, 0x0};
    vec_u8_t refmask_32_1={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0x10, 0x11, 0x12};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );    
    vec_u8_t s1 = vec_xl(68, srcPix0);  
    vec_u8_t s2 = vec_xl(84, srcPix0);  

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = vec_perm(s0, s1, mask2);
    vec_u8_t srv3 = srv2;
    vec_u8_t srv4 = vec_perm(s0, s1, mask4); 
    vec_u8_t srv5 = srv4;
    vec_u8_t srv6 = srv4; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7;
    vec_u8_t srv9 = vec_perm(s0, s1, mask9);
    vec_u8_t srv10 = srv9;
    vec_u8_t srv11 = srv9;
    vec_u8_t srv12= vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = srv12;
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;

    vec_u8_t srv16_0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv16_1 = srv16_0;
    vec_u8_t srv16_2 = vec_perm(s1, s2, mask2);
    vec_u8_t srv16_3 = srv16_2;
    vec_u8_t srv16_4 = vec_perm(s1, s2, mask4); 
    vec_u8_t srv16_5 = srv16_4;
    vec_u8_t srv16_6 = srv16_4; 
    vec_u8_t srv16_7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv16_8 = srv16_7; 
    vec_u8_t srv16_9 = vec_perm(s1, s2, mask9);
    vec_u8_t srv16_10 = srv16_9;
    vec_u8_t srv16_11 = srv16_9;
    vec_u8_t srv16_12=  vec_perm(s1, s2, mask12); 
    vec_u8_t srv16_13 = srv16_12;
    vec_u8_t srv16_14 = vec_perm(s1, s2, mask14); 
    vec_u8_t srv16_15 = srv16_14;

    /* per-row base vector: rows 0-1 use mask0, 2-3 mask2, 4-6 mask4, 7-8 mask7, 9-11 mask9,
       12-13 mask12, 14-16 mask14, 17-18 mask17, 19-21 mask19, 22-23 mask22, 24-26 mask24,
       27-28 mask27, 29-31 s0 */

    vec_u8_t  srv16 = srv14;  
    vec_u8_t  srv17 = vec_perm(s0, s1, mask17);
    vec_u8_t  srv18 = srv17;
    vec_u8_t  srv19 = vec_perm(s0, s1, mask19);
    vec_u8_t  srv20 = srv19;
    vec_u8_t  srv21 = srv19;
    vec_u8_t  srv22 = vec_perm(s0, s1, mask22);
    vec_u8_t  srv23 = srv22;
    vec_u8_t  srv24 = vec_perm(s0, s1, mask24);
    vec_u8_t  srv25 = srv24;
    vec_u8_t  srv26 = srv24;
    vec_u8_t  srv27 = vec_perm(s0, s1, mask27);
    vec_u8_t  srv28 = srv27;
    vec_u8_t  srv29 = s0;
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = srv16_14;  
    vec_u8_t  srv16_17 = vec_perm(s1, s2, mask17);
    vec_u8_t  srv16_18 = srv16_17;
    vec_u8_t  srv16_19 = vec_perm(s1, s2, mask19);
    vec_u8_t  srv16_20 = srv16_19;
    vec_u8_t  srv16_21 = srv16_19;
    vec_u8_t  srv16_22 = vec_perm(s1, s2, mask22);
    vec_u8_t  srv16_23 = srv16_22;
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask24);
    vec_u8_t  srv16_25 = srv16_24;
    vec_u8_t  srv16_26 = srv16_24;
    vec_u8_t  srv16_27 = vec_perm(s1, s2, mask27);
    vec_u8_t  srv16_28 = srv16_27;
    vec_u8_t  srv16_29 = s1;
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;

    vec_u8_t srv0add1 = vec_perm(s0, s1, maskadd1_0);
    vec_u8_t srv1add1 = srv0add1;
    vec_u8_t srv2add1 = srv0;
    vec_u8_t srv3add1 = srv0;
    vec_u8_t srv4add1 = srv2; 
    vec_u8_t srv5add1 = srv2; 
    vec_u8_t srv6add1 = srv2;
    vec_u8_t srv7add1 = srv4; 
    vec_u8_t srv8add1 = srv4;
    vec_u8_t srv9add1 = srv7;
    vec_u8_t srv10add1 = srv7;
    vec_u8_t srv11add1 = srv7;
    vec_u8_t srv12add1= srv9; 
    vec_u8_t srv13add1 = srv9;
    vec_u8_t srv14add1 = srv12; 
    vec_u8_t srv15add1 = srv12;

    vec_u8_t srv16add1_0 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv16add1_1 = srv16add1_0;
    vec_u8_t srv16add1_2 = srv16_0;
    vec_u8_t srv16add1_3 = srv16_0;
    vec_u8_t srv16add1_4 = srv16_2; 
    vec_u8_t srv16add1_5 = srv16_2;
    vec_u8_t srv16add1_6 = srv16_2; 
    vec_u8_t srv16add1_7 = srv16_4;
    vec_u8_t srv16add1_8 = srv16_4; 
    vec_u8_t srv16add1_9 = srv16_7;
    vec_u8_t srv16add1_10 = srv16_7;
    vec_u8_t srv16add1_11 = srv16_7;
    vec_u8_t srv16add1_12= srv16_9; 
    vec_u8_t srv16add1_13 = srv16_9;
    vec_u8_t srv16add1_14 = srv16_12; 
    vec_u8_t srv16add1_15 = srv16_12;

    /* +1 reference per row: rows 0-1 use maskadd1_0; 2-3 srv0; 4-6 srv2; 7-8 srv4; 9-11 srv7;
       12-13 srv9; 14-16 srv12; 17-18 srv14; 19-21 srv17; 22-23 srv19; 24-26 srv22;
       27-28 srv24; 29-31 srv27 */

    vec_u8_t  srv16add1 = srv12;  
    vec_u8_t  srv17add1 = srv14;
    vec_u8_t  srv18add1 = srv14;
    vec_u8_t  srv19add1 = srv17;
    vec_u8_t  srv20add1 = srv17;
    vec_u8_t  srv21add1 = srv17;
    vec_u8_t  srv22add1 = srv19;
    vec_u8_t  srv23add1 = srv19;
    vec_u8_t  srv24add1 = srv22;
    vec_u8_t  srv25add1 = srv22;
    vec_u8_t  srv26add1 = srv22;
    vec_u8_t  srv27add1 = srv24;
    vec_u8_t  srv28add1 = srv24;
    vec_u8_t  srv29add1 = srv27;
    vec_u8_t  srv30add1 = srv27;
    vec_u8_t  srv31add1 = srv27;

    vec_u8_t  srv16add1_16 = srv16_12;   
    vec_u8_t  srv16add1_17 = srv16_14;
    vec_u8_t  srv16add1_18 = srv16_14;
    vec_u8_t  srv16add1_19 = srv16_17;
    vec_u8_t  srv16add1_20 = srv16_17;
    vec_u8_t  srv16add1_21 = srv16_17;
    vec_u8_t  srv16add1_22 = srv16_19;
    vec_u8_t  srv16add1_23 = srv16_19;
    vec_u8_t  srv16add1_24 = srv16_22;
    vec_u8_t  srv16add1_25 = srv16_22;
    vec_u8_t  srv16add1_26 = srv16_22;
    vec_u8_t  srv16add1_27 = srv16_24;
    vec_u8_t  srv16add1_28 = srv16_24;
    vec_u8_t  srv16add1_29 = srv16_27;
    vec_u8_t  srv16add1_30 = srv16_27;
    vec_u8_t  srv16add1_31 = srv16_27;

vec_u8_t vfrac16_0 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_5 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_6 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_9 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_10 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_13 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_14 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_17 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_18 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_20 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_21 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_22 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_24 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_25 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_26 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_28 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_29 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_30 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_5 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_6 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_9 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_10 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_13 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_14 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_17 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_18 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_20 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_21 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_22 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_24 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_25 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_26 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_28 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_29 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_30 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
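    /* A 32-wide row is produced as two 16-byte halves: srvN covers columns 0-15
       and srv16_N covers columns 16-31 of row N; the two halves are stored at
       byte offsets 32*N and 32*N + 16 below. */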
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

    one_line(srv16, srv16add1, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv16_16, srv16add1_16,  vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<4, 13>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x0, 0x1, 0x2, 0x3, };
    vec_u8_t mask1={0x2, 0x3, 0x4, 0x5, 0x2, 0x3, 0x4, 0x5, 0x2, 0x3, 0x4, 0x5, 0x1, 0x2, 0x3, 0x4, };

/*
    vec_u8_t srv_left=vec_xl(8, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_4={0x4, 0x10, 0x11, 0x12, 0x13, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);    
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0);  /* top-left + left column */
    vec_u8_t srv_right=vec_xl(9, srcPix0); /* above row, at srcPix0 + 2*width + 1 */
    vec_u8_t refmask_4={0x4, 0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; /* ref[-2..-1] = {left[3], top-left}; max row offset is (4 * -9) >> 5 = -2 */
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t vfrac4 = (vec_u8_t){23, 23, 23, 23, 14, 14, 14, 14, 5, 5, 5, 5, 28, 28, 28, 28};
    vec_u8_t vfrac4_32 = (vec_u8_t){9, 9, 9, 9, 18, 18, 18, 18, 27, 27, 27, 27, 4, 4, 4, 4};
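
    /* All four 4-pixel rows share one 16-byte vector; the weights are
       consistent with intraPredAngle = -9 (mode 13):
       frac[y] = ((y + 1) * -9) & 31 -> 23, 14, 5, 28, and vfrac4_32 holds 32 - frac[y]. */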

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); /* (32 - fraction) * ref[offset + x], x=0-3 */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); /* fraction * ref[offset + x + 1], x=0-3 */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); /* even lanes: weighted sum + rounding 16 */
    vec_u16_t ve = vec_sra(vsume, u16_5);                     /* >> 5 */
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); /* odd lanes: weighted sum + rounding 16 */
    vec_u16_t vo = vec_sra(vsumo, u16_5);                     /* >> 5 */
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo)); /* re-interleave even/odd lanes and narrow back to bytes */

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 13>(pixel* dst, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask1={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, };
vec_u8_t mask2={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask3={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask4={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask5={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

/*      
    vec_u8_t srv_left=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0);
    vec_u8_t refmask_8={0x7, 0x4, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0);   /* top-left + left column */
    vec_u8_t srv_right=vec_xl(17, srcPix0); /* above row, at srcPix0 + 2*width + 1 */
    vec_u8_t refmask_8={0x7, 0x4, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, }; /* ref[-3..-1] = {left[6], left[3], top-left}; max row offset is (8 * -9) >> 5 = -3 */
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);


vec_u8_t vfrac8_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac8_1 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac8_2 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac8_3 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac8_32_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac8_32_1 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac8_32_2 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac8_32_3 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 8, 8, 8, 8, 8, 8, 8, 8};
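
/* Each 16-byte vector covers two 8-pixel rows, so the four one_line() calls
   below emit all eight rows; e.g. vfrac8_0 = {frac[0] x 8, frac[1] x 8}. */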

one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 13>(pixel* dst, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask1={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask2={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask3={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask4={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask5={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask6={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask8={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask9={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask10={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask11={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask12={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
//vec_u8_t mask13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask14={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
//vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };

vec_u8_t maskadd1_0={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
/*vec_u8_t maskadd1_1={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_2={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t maskadd1_3={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_4={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_5={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_6={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t maskadd1_7={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_8={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_9={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_10={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_11={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_15={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };*/

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*      
    vec_u8_t srv_left=vec_xl(32, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_16={0xe, 0xb, 0x7, 0x4, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(12, srcPix0);  
*/      
    vec_u8_t srv_left=vec_xl(0, srcPix0);   /* top-left + left column (source of the projected negative-offset references) */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* above row, at srcPix0 + 2*width + 1 */
    vec_u8_t refmask_16={0xe, 0xb, 0x7, 0x4, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
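    /* refmask_16 builds ref[-5..-1] from projected left pixels
       (srcPix0[14, 11, 7, 4]) plus the top-left; -5 is the largest row
       offset for this mode: (16 * -9) >> 5 = -5. */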
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(44, srcPix0);  

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = srv0;
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = srv3; 
    vec_u8_t srv5 = srv3;
    vec_u8_t srv6 = srv3; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7; 
    vec_u8_t srv9 = srv7;
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = srv10;
    vec_u8_t srv12= srv10; 
    vec_u8_t srv13 = srv10;
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0_add1;
    vec_u8_t srv2_add1 = srv0_add1;
    vec_u8_t srv3_add1 = srv0;
    vec_u8_t srv4_add1 = srv0; 
    vec_u8_t srv5_add1 = srv0; 
    vec_u8_t srv6_add1 = srv0;
    vec_u8_t srv7_add1 = srv3; 
    vec_u8_t srv8_add1 = srv3;
    vec_u8_t srv9_add1 = srv3;
    vec_u8_t srv10_add1 = srv7;
    vec_u8_t srv11_add1 = srv7;
    vec_u8_t srv12_add1= srv7; 
    vec_u8_t srv13_add1 = srv7;
    vec_u8_t srv14_add1 = srv10; 
    vec_u8_t srv15_add1 = srv10;
vec_u8_t vfrac16_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_2 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_4 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_5 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_6 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_9 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_10 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_12 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_13 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_14 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_2 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_4 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_5 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_6 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_9 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_10 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_12 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_13 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_14 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 13>(pixel* dst, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
//vec_u8_t mask1={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
//vec_u8_t mask2={0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, };
vec_u8_t mask3={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask4={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask5={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
//vec_u8_t mask6={0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, };
vec_u8_t mask7={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
//vec_u8_t mask8={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
//vec_u8_t mask9={0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, };
vec_u8_t mask10={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask11={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask12={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
//vec_u8_t mask13={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };
vec_u8_t mask14={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
//vec_u8_t mask15={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };

//vec_u8_t mask16={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask17={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask18={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask19={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
//vec_u8_t mask20={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask21={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask22={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
//vec_u8_t mask23={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask24={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
/*vec_u8_t mask25={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask26={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask27={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask28={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask29={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };*/
vec_u8_t maskadd1_0={0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*
    vec_u8_t srv_left0 = vec_xl(64, srcPix0); 
    vec_u8_t srv_left1 = vec_xl(80, srcPix0); 
    vec_u8_t srv_right = vec_xl(0, srcPix0);;
    vec_u8_t refmask_32_0 ={0x1c, 0x19, 0x15, 0x12, 0xe, 0xb, 0x7, 0x4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t refmask_32_1 ={0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );
    vec_u8_t s1 = vec_xl(8, srcPix0);;
    vec_u8_t s2 = vec_xl(24, srcPix0);  
*/
    vec_u8_t srv_left0=vec_xl(0, srcPix0);  /* top-left + left column */
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); /* above row, at srcPix0 + 2*width + 1 */
    vec_u8_t refmask_32_0={0x1c, 0x19, 0x15, 0x12, 0xe, 0xb, 0x7, 0x4, 0x00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    vec_u8_t refmask_32_1={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );    
    vec_u8_t s1 = vec_xl(72, srcPix0);  
    vec_u8_t s2 = vec_xl(88, srcPix0);  
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = srv0;
    vec_u8_t srv3 = vec_perm(s0, s1, mask3);
    vec_u8_t srv4 = srv3; 
    vec_u8_t srv5 = srv3;
    vec_u8_t srv6 = srv3; 
    vec_u8_t srv7 = vec_perm(s0, s1, mask7);
    vec_u8_t srv8 = srv7;
    vec_u8_t srv9 = srv7;
    vec_u8_t srv10 = vec_perm(s0, s1, mask10);
    vec_u8_t srv11 = srv10;
    vec_u8_t srv12= srv10; 
    vec_u8_t srv13 = srv10;
    vec_u8_t srv14 = vec_perm(s0, s1, mask14); 
    vec_u8_t srv15 = srv14;

    /* per-row base vector: rows 0-2 use mask0, 3-6 mask3, 7-9 mask7, 10-13 mask10,
       14-16 mask14, 17-20 mask17, 21-23 mask21, 24-27 mask24, 28-31 s0 */

    vec_u8_t srv16_0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv16_1 = srv16_0;
    vec_u8_t srv16_2 = srv16_0;
    vec_u8_t srv16_3 = vec_perm(s1, s2, mask3);
    vec_u8_t srv16_4 = srv16_3; 
    vec_u8_t srv16_5 = srv16_3;
    vec_u8_t srv16_6 = srv16_3; 
    vec_u8_t srv16_7 = vec_perm(s1, s2, mask7);
    vec_u8_t srv16_8 = srv16_7; 
    vec_u8_t srv16_9 = srv16_7;
    vec_u8_t srv16_10 = vec_perm(s1, s2, mask10);
    vec_u8_t srv16_11 = srv16_10;
    vec_u8_t srv16_12=  srv16_10; 
    vec_u8_t srv16_13 = srv16_10;
    vec_u8_t srv16_14 = vec_perm(s1, s2, mask14); 
    vec_u8_t srv16_15 = srv16_14;

    vec_u8_t  srv16 = srv14;  
    vec_u8_t  srv17 = vec_perm(s0, s1, mask17);
    vec_u8_t  srv18 = srv17;
    vec_u8_t  srv19 = srv17;
    vec_u8_t  srv20 = srv17;
    vec_u8_t  srv21 = vec_perm(s0, s1, mask21);
    vec_u8_t  srv22 = srv21;
    vec_u8_t  srv23 = srv21;
    vec_u8_t  srv24 = vec_perm(s0, s1, mask24);
    vec_u8_t  srv25 = srv24;
    vec_u8_t  srv26 = srv24;
    vec_u8_t  srv27 = srv24;
    vec_u8_t  srv28 = s0;
    vec_u8_t  srv29 = s0;
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = srv16_14;  
    vec_u8_t  srv16_17 = vec_perm(s1, s2, mask17);
    vec_u8_t  srv16_18 = srv16_17;
    vec_u8_t  srv16_19 = srv16_17;
    vec_u8_t  srv16_20 = srv16_17;
    vec_u8_t  srv16_21 = vec_perm(s1, s2, mask21);
    vec_u8_t  srv16_22 = srv16_21;
    vec_u8_t  srv16_23 = srv16_21;
    vec_u8_t  srv16_24 = vec_perm(s1, s2, mask24);
    vec_u8_t  srv16_25 = srv16_24;
    vec_u8_t  srv16_26 = srv16_24;
    vec_u8_t  srv16_27 = srv16_24;
    vec_u8_t  srv16_28 = s1;
    vec_u8_t  srv16_29 = s1;
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;

    vec_u8_t srv0add1 = vec_perm(s0, s1, maskadd1_0);
    vec_u8_t srv1add1 = srv0add1;
    vec_u8_t srv2add1 = srv0add1;
    vec_u8_t srv3add1 = srv0;
    vec_u8_t srv4add1 = srv0; 
    vec_u8_t srv5add1 = srv0; 
    vec_u8_t srv6add1 = srv0;
    vec_u8_t srv7add1 = srv3; 
    vec_u8_t srv8add1 = srv3;
    vec_u8_t srv9add1 = srv3;
    vec_u8_t srv10add1 = srv7;
    vec_u8_t srv11add1 = srv7;
    vec_u8_t srv12add1= srv7; 
    vec_u8_t srv13add1 = srv7;
    vec_u8_t srv14add1 = srv10; 
    vec_u8_t srv15add1 = srv10;
    /* +1 reference per row: rows 0-2 use maskadd1_0; 3-6 srv0; 7-9 srv3; 10-13 srv7;
       14-16 srv10; 17-20 srv14; 21-23 srv17; 24-27 srv21; 28-31 srv24 */
    vec_u8_t srv16add1_0 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv16add1_1 = srv16add1_0;
    vec_u8_t srv16add1_2 = srv16add1_0;
    vec_u8_t srv16add1_3 = srv16_0;
    vec_u8_t srv16add1_4 = srv16_0; 
    vec_u8_t srv16add1_5 = srv16_0;
    vec_u8_t srv16add1_6 = srv16_0; 
    vec_u8_t srv16add1_7 = srv16_3;
    vec_u8_t srv16add1_8 = srv16_3; 
    vec_u8_t srv16add1_9 = srv16_3;
    vec_u8_t srv16add1_10 = srv16_7;
    vec_u8_t srv16add1_11 = srv16_7;
    vec_u8_t srv16add1_12= srv16_7; 
    vec_u8_t srv16add1_13 = srv16_7;
    vec_u8_t srv16add1_14 = srv16_10; 
    vec_u8_t srv16add1_15 = srv16_10;

    vec_u8_t  srv16add1 = srv10;  
    vec_u8_t  srv17add1 = srv14;
    vec_u8_t  srv18add1 = srv14;
    vec_u8_t  srv19add1 = srv14;
    vec_u8_t  srv20add1 = srv14;
    vec_u8_t  srv21add1 = srv17;
    vec_u8_t  srv22add1 = srv17;
    vec_u8_t  srv23add1 = srv17;
    vec_u8_t  srv24add1 = srv21;
    vec_u8_t  srv25add1 = srv21;
    vec_u8_t  srv26add1 = srv21;
    vec_u8_t  srv27add1 = srv21;
    vec_u8_t  srv28add1 = srv24;
    vec_u8_t  srv29add1 = srv24;
    vec_u8_t  srv30add1 = srv24;
    vec_u8_t  srv31add1 = srv24;

    vec_u8_t  srv16add1_16 = srv16_10;   
    vec_u8_t  srv16add1_17 = srv16_14;
    vec_u8_t  srv16add1_18 = srv16_14;
    vec_u8_t  srv16add1_19 = srv16_14;
    vec_u8_t  srv16add1_20 = srv16_14;
    vec_u8_t  srv16add1_21 = srv16_17;
    vec_u8_t  srv16add1_22 = srv16_17;
    vec_u8_t  srv16add1_23 = srv16_17;
    vec_u8_t  srv16add1_24 = srv16_21;
    vec_u8_t  srv16add1_25 = srv16_21;
    vec_u8_t  srv16add1_26 = srv16_21;
    vec_u8_t  srv16add1_27 = srv16_21;
    vec_u8_t  srv16add1_28 = srv16_24;
    vec_u8_t  srv16add1_29 = srv16_24;
    vec_u8_t  srv16add1_30 = srv16_24;
    vec_u8_t  srv16add1_31 = srv16_24;

vec_u8_t vfrac16_0 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_1 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_2 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_3 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_4 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_5 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_6 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_9 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_10 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_11 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_12 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_13 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_14 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_17 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_18 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_19 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_20 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_21 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_22 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_24 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_25 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_26 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_27 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_28 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_29 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_30 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_1 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_2 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_3 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_4 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_5 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_6 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_9 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_10 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_11 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_12 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_13 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_14 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_17 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_18 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_19 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_20 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_21 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_22 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_24 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_25 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_26 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_27 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_28 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_29 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_30 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[x] + f[y] * ref[x + 1] + 16) >> 5) */
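    /* one_line(ref, refp1, f32, f, out) computes the same two-tap blend that is
       written out inline in one_ang_pred_altivec<4, 12> below: it widens both
       byte operands with vec_mule/vec_mulo, forms f32*ref + f*refp1 + 16 in
       16-bit lanes, shifts right by 5, and repacks the result to bytes. */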
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

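    // store the first 16 rows: dst is a 32x32 block here, so every row takes
    // two 16-byte stores, at byte offsets 32*y and 32*y + 16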
    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

    one_line(srv16, srv16add1, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void one_ang_pred_altivec<4, 12>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, };
    vec_u8_t mask1={0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, };

    //vec_u8_t srv = vec_xl(0, srcPix0);        

    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
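    // 4x4, mode 12: deltaInt = ((y+1) * -5) >> 5 is -1 for every row, so only
    // refMain[0..4] is needed -- the corner srcPix0[0] plus the four left
    // neighbours at srcPix0[9..12]; no inverse-angle projection is required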
    vec_u8_t refmask_4={0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);
    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
        
    vec_u8_t vfrac4 = (vec_u8_t){27, 27, 27, 27, 22, 22, 22, 22, 17, 17, 17, 17, 12, 12, 12, 12};
    vec_u8_t vfrac4_32 = (vec_u8_t){5, 5, 5, 5, 10, 10, 10, 10, 15, 15, 15, 15, 20, 20, 20, 20};
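    // per-row fractions: f[y] = ((y+1) * -5) & 31 = {27, 22, 17, 12}, replicated
    // four times per row; vfrac4_32 holds the complementary weights 32 - f[y]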

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[x] + f[y] * ref[x + 1] + 16) >> 5) */
    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32);  /* (32-f) * ref[x], even lanes */
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32);  /* (32-f) * ref[x], odd lanes */
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4);     /* f * ref[x+1], even lanes */
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4);     /* f * ref[x+1], odd lanes */
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16);  /* + rounding offset */
    vec_u16_t ve = vec_sra(vsume, u16_5);                      /* >> 5 */
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16);
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    /* re-interleave the even/odd 16-bit results and narrow back to bytes */
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}
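
/* A scalar sketch of what the vectorised <4, 12> case above computes, kept in a
   comment so it is not compiled; "pixel", dst and the srcPix0 layout are as used
   throughout this file, and refMain[] stands for the 5-byte reference row
   assembled by refmask_4:

       for (int y = 0; y < 4; y++)
       {
           int f   = ((y + 1) * -5) & 31;   // 27, 22, 17, 12
           int f32 = 32 - f;
           for (int x = 0; x < 4; x++)
               dst[y * 4 + x] = (pixel)((f32 * refMain[x] + f * refMain[x + 1] + 16) >> 5);
       }
*/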

template<>
void one_ang_pred_altivec<8, 12>(pixel* dst, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask1={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask2={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask3={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask4={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };
vec_u8_t mask5={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, };
vec_u8_t mask6={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
vec_u8_t mask7={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;

/*      
    vec_u8_t srv_left=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_8={0x6, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(17, srcPix0); 
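    // 8x8 reaches refMain[-1]: with invAngle = 8192/5 = 1638 for mode 12, the
    // projected neighbour is srcPix0[(1*1638 + 128) >> 8] = srcPix0[6], hence
    // refmask_8 leads with 0x6 before the corner and the left neighbours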
    vec_u8_t refmask_8={0x6, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8);    

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t srv2 = vec_perm(srv, srv, mask2);
    vec_u8_t srv3 = vec_perm(srv, srv, mask3);
    vec_u8_t srv4 = vec_perm(srv, srv, mask4); 
    vec_u8_t srv5 = vec_perm(srv, srv, mask5);
    vec_u8_t srv6 = vec_perm(srv, srv, mask6); 
    vec_u8_t srv7 = vec_perm(srv, srv, mask7);

vec_u8_t vfrac8_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac8_1 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac8_2 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac8_3 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac8_32_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac8_32_1 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac8_32_2 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac8_32_3 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8};
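// width 8 packs two rows per 16-byte vector, so each vfrac8_* vector carries
// the fractions for a pair of rows (e.g. vfrac8_0 covers rows 0 and 1)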

one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
one_line(srv2, srv3, vfrac8_32_1, vfrac8_1, vout_1);
one_line(srv4, srv5, vfrac8_32_2, vfrac8_2, vout_2);
one_line(srv6, srv7, vfrac8_32_3, vfrac8_3, vout_3);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<16, 12>(pixel* dst, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
/*vec_u8_t mask1={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask2={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask3={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask4={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask5={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };*/
vec_u8_t mask6={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
/*vec_u8_t mask7={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask8={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask9={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask10={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask11={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };*/
vec_u8_t mask12={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
/*vec_u8_t mask13={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask14={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask15={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };*/

vec_u8_t maskadd1_0={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
/*vec_u8_t maskadd1_1={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_2={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_3={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_4={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_5={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t maskadd1_6={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_7={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_8={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_9={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_10={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_11={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t maskadd1_12={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_13={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_14={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t maskadd1_15={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };*/

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*    
    vec_u8_t srv_left=vec_xl(32, srcPix0); 
    vec_u8_t srv_right=vec_xl(0, srcPix0); 
    vec_u8_t refmask_16={0xd, 0x6, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(14, srcPix0);  
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(33, srcPix0); 
    vec_u8_t refmask_16={0xd, 0x6, 0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(46, srcPix0);
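    // s0 = refMain[-2..13]: two projected pixels srcPix0[(k*1638 + 128) >> 8]
    // for k = 2, 1 (offsets 13 = 0xd and 6), the corner, then left neighbours
    // from srcPix0[33]; s1 = refMain[14..29] continues at srcPix0[46] = 33 + 13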
        
    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = srv0;
    vec_u8_t srv3 = srv0;
    vec_u8_t srv4 = srv0; 
    vec_u8_t srv5 = srv0;
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = srv6;
    vec_u8_t srv8 = srv6; 
    vec_u8_t srv9 = srv6;
    vec_u8_t srv10 = srv6;
    vec_u8_t srv11 = srv6;
    vec_u8_t srv12 = vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = srv12;
    vec_u8_t srv14 = srv12; 
    vec_u8_t srv15 = srv12;
        
    vec_u8_t srv0_add1 = vec_perm(s0, s1, maskadd1_0); 
    vec_u8_t srv1_add1 = srv0_add1;
    vec_u8_t srv2_add1 = srv0_add1;
    vec_u8_t srv3_add1 = srv0_add1;
    vec_u8_t srv4_add1 = srv0_add1; 
    vec_u8_t srv5_add1 = srv0_add1; 
    vec_u8_t srv6_add1 = srv0;
    vec_u8_t srv7_add1 = srv0; 
    vec_u8_t srv8_add1 = srv0;
    vec_u8_t srv9_add1 = srv0;
    vec_u8_t srv10_add1 = srv0;
    vec_u8_t srv11_add1 = srv0;
    vec_u8_t srv12_add1 = srv6; 
    vec_u8_t srv13_add1 = srv6;
    vec_u8_t srv14_add1 = srv6; 
    vec_u8_t srv15_add1 = srv6;
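// per-row fractions for mode 12: f[y] = ((y+1) * -5) & 31 gives the sequence
// 27, 22, 17, ... below; the vfrac16_32_* vectors hold the matching 32 - f[y]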
vec_u8_t vfrac16_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_2 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_6 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_10 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_14 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_2 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_6 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_10 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_14 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[x] + f[y] * ref[x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv0_add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv1, srv1_add1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv2, srv2_add1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv3, srv3_add1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv4, srv4_add1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv5, srv5_add1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv6, srv6_add1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv7, srv7_add1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv8, srv8_add1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv9, srv9_add1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv10, srv10_add1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv11, srv11_add1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv12, srv12_add1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv13, srv13_add1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv14, srv14_add1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv15, srv15_add1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 12>(pixel* dst, const pixel *srcPix0, int bFilter)
{
vec_u8_t mask0={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
/*vec_u8_t mask1={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask2={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask3={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask4={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };
vec_u8_t mask5={0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, };*/
vec_u8_t mask6={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
/*vec_u8_t mask7={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask8={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask9={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask10={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };
vec_u8_t mask11={0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, };*/
vec_u8_t mask12={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
/*vec_u8_t mask13={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask14={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask15={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };

vec_u8_t mask16={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask17={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };
vec_u8_t mask18={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };*/
vec_u8_t mask19={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
/*vec_u8_t mask20={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask21={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask22={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask23={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask24={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
vec_u8_t mask25={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask26={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask27={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask28={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask29={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask30={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };
vec_u8_t mask31={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, };*/

vec_u8_t maskadd1_0={0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};

/*
    vec_u8_t srv_left0 = vec_xl(64, srcPix0); 
    vec_u8_t srv_left1 = vec_xl(80, srcPix0); 
    vec_u8_t srv_right = vec_xl(0, srcPix0);;
    vec_u8_t refmask_32_0 ={0x1a, 0x13, 0xd, 0x6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
    vec_u8_t refmask_32_1 ={0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );
    vec_u8_t s1 = vec_xl(12, srcPix0);
    vec_u8_t s2 = vec_xl(28, srcPix0);  
*/
    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
    vec_u8_t refmask_32_0={0x1a, 0x13, 0xd, 0x6, 0x00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    vec_u8_t refmask_32_1={0x0, 0x1, 0x2, 0x3, 0x4, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );    
    vec_u8_t s1 = vec_xl(76, srcPix0);  
    vec_u8_t s2 = vec_xl(92, srcPix0);  
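    // s0 = refMain[-4..11]: four projected pixels srcPix0[(k*1638 + 128) >> 8]
    // for k = 4..1 (offsets 26, 19, 13, 6), the corner, then the left neighbours
    // from srcPix0[65]; s1 and s2 continue the left reference in 16-byte steps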

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv1 = srv0;
    vec_u8_t srv2 = srv0;
    vec_u8_t srv3 = srv0;
    vec_u8_t srv4 = srv0; 
    vec_u8_t srv5 = srv0;
    vec_u8_t srv6 = vec_perm(s0, s1, mask6); 
    vec_u8_t srv7 = srv6;
    vec_u8_t srv8 = srv6;
    vec_u8_t srv9 = srv6;
    vec_u8_t srv10 = srv6;
    vec_u8_t srv11 = srv6;
    vec_u8_t srv12 = vec_perm(s0, s1, mask12); 
    vec_u8_t srv13 = srv12;
    vec_u8_t srv14 = srv12; 
    vec_u8_t srv15 = srv12;

    // row -> mask grouping for mode 12: rows 0-5 mask0, 6-11 mask6, 12-18 mask12, 19-24 mask19, 25-31 s0

    vec_u8_t srv16_0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv16_1 = srv16_0;
    vec_u8_t srv16_2 = srv16_0;
    vec_u8_t srv16_3 = srv16_0;
    vec_u8_t srv16_4 = srv16_0; 
    vec_u8_t srv16_5 = srv16_0;
    vec_u8_t srv16_6 = vec_perm(s1, s2, mask6); 
    vec_u8_t srv16_7 = srv16_6;
    vec_u8_t srv16_8 = srv16_6; 
    vec_u8_t srv16_9 = srv16_6;
    vec_u8_t srv16_10 = srv16_6;
    vec_u8_t srv16_11 = srv16_6;
    vec_u8_t srv16_12 = vec_perm(s1, s2, mask12); 
    vec_u8_t srv16_13 = srv16_12;
    vec_u8_t srv16_14 = srv16_12; 
    vec_u8_t srv16_15 = srv16_12;

    vec_u8_t  srv16 = srv12;  
    vec_u8_t  srv17 = srv12;
    vec_u8_t  srv18 = srv12;
    vec_u8_t  srv19 = vec_perm(s0, s1, mask19);
    vec_u8_t  srv20 = srv19;
    vec_u8_t  srv21 = srv19;
    vec_u8_t  srv22 = srv19;
    vec_u8_t  srv23 = srv19;
    vec_u8_t  srv24 = srv19;
    vec_u8_t  srv25 = s0;
    vec_u8_t  srv26 = s0;
    vec_u8_t  srv27 = s0;
    vec_u8_t  srv28 = s0;
    vec_u8_t  srv29 = s0;
    vec_u8_t  srv30 = s0;
    vec_u8_t  srv31 = s0;

    vec_u8_t  srv16_16 = srv16_12;  
    vec_u8_t  srv16_17 = srv16_12;
    vec_u8_t  srv16_18 = srv16_12;
    vec_u8_t  srv16_19 = vec_perm(s1, s2, mask19);
    vec_u8_t  srv16_20 = srv16_19;
    vec_u8_t  srv16_21 = srv16_19;
    vec_u8_t  srv16_22 = srv16_19;
    vec_u8_t  srv16_23 = srv16_19;
    vec_u8_t  srv16_24 = srv16_19;
    vec_u8_t  srv16_25 = s1;
    vec_u8_t  srv16_26 = s1;
    vec_u8_t  srv16_27 = s1;
    vec_u8_t  srv16_28 = s1;
    vec_u8_t  srv16_29 = s1;
    vec_u8_t  srv16_30 = s1;
    vec_u8_t  srv16_31 = s1;

    vec_u8_t srv0add1 = vec_perm(s0, s1, maskadd1_0);
    vec_u8_t srv1add1 = srv0add1;
    vec_u8_t srv2add1 = srv0add1;
    vec_u8_t srv3add1 = srv0add1;
    vec_u8_t srv4add1 = srv0add1; 
    vec_u8_t srv5add1 = srv0add1; 
    vec_u8_t srv6add1 = srv0;
    vec_u8_t srv7add1 = srv0; 
    vec_u8_t srv8add1 = srv0;
    vec_u8_t srv9add1 = srv0;
    vec_u8_t srv10add1 = srv0;
    vec_u8_t srv11add1 = srv0;
    vec_u8_t srv12add1 = srv6; 
    vec_u8_t srv13add1 = srv6;
    vec_u8_t srv14add1 = srv6; 
    vec_u8_t srv15add1 = srv6;

    vec_u8_t srv16add1_0 = vec_perm(s1, s2, maskadd1_0);
    vec_u8_t srv16add1_1 = srv16add1_0;
    vec_u8_t srv16add1_2 = srv16add1_0;
    vec_u8_t srv16add1_3 = srv16add1_0;
    vec_u8_t srv16add1_4 = srv16add1_0; 
    vec_u8_t srv16add1_5 = srv16add1_0;
    vec_u8_t srv16add1_6 = srv16_0; 
    vec_u8_t srv16add1_7 = srv16_0;
    vec_u8_t srv16add1_8 = srv16_0; 
    vec_u8_t srv16add1_9 = srv16_0;
    vec_u8_t srv16add1_10 = srv16_0;
    vec_u8_t srv16add1_11 = srv16_0;
    vec_u8_t srv16add1_12 = srv16_6; 
    vec_u8_t srv16add1_13 = srv16_6;
    vec_u8_t srv16add1_14 = srv16_6; 
    vec_u8_t srv16add1_15 = srv16_6;

    vec_u8_t  srv16add1 = srv6;  
    vec_u8_t  srv17add1 = srv6;
    vec_u8_t  srv18add1 = srv6;
    vec_u8_t  srv19add1 = srv12;
    vec_u8_t  srv20add1 = srv12;
    vec_u8_t  srv21add1 = srv12;
    vec_u8_t  srv22add1 = srv12;
    vec_u8_t  srv23add1 = srv12;
    vec_u8_t  srv24add1 = srv12;
    vec_u8_t  srv25add1 = srv19;
    vec_u8_t  srv26add1 = srv19;
    vec_u8_t  srv27add1 = srv19;
    vec_u8_t  srv28add1 = srv19;
    vec_u8_t  srv29add1 = srv19;
    vec_u8_t  srv30add1 = srv19;
    vec_u8_t  srv31add1 = srv19;

    vec_u8_t  srv16add1_16 = srv16_6;   
    vec_u8_t  srv16add1_17 = srv16_6;
    vec_u8_t  srv16add1_18 = srv16_6;
    vec_u8_t  srv16add1_19 = srv16_12;
    vec_u8_t  srv16add1_20 = srv16_12;
    vec_u8_t  srv16add1_21 = srv16_12;
    vec_u8_t  srv16add1_22 = srv16_12;
    vec_u8_t  srv16add1_23 = srv16_12;
    vec_u8_t  srv16add1_24 = srv16_12;
    vec_u8_t  srv16add1_25 = srv16_19;
    vec_u8_t  srv16add1_26 = srv16_19;
    vec_u8_t  srv16add1_27 = srv16_19;
    vec_u8_t  srv16add1_28 = srv16_19;
    vec_u8_t  srv16add1_29 = srv16_19;
    vec_u8_t  srv16add1_30 = srv16_19;
    vec_u8_t  srv16add1_31 = srv16_19;

vec_u8_t vfrac16_0 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_1 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_2 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_3 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_4 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_5 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_6 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_7 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_8 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_9 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_10 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_11 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_12 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_13 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_14 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_16 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_17 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_18 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_19 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_20 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_21 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_22 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_23 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_24 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_25 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_26 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_27 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_28 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_29 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_30 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_31 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
vec_u8_t vfrac16_32_0 = (vec_u8_t){5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5};
vec_u8_t vfrac16_32_1 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
vec_u8_t vfrac16_32_2 = (vec_u8_t){15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15};
vec_u8_t vfrac16_32_3 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
vec_u8_t vfrac16_32_4 = (vec_u8_t){25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25};
vec_u8_t vfrac16_32_5 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
vec_u8_t vfrac16_32_6 = (vec_u8_t){3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
vec_u8_t vfrac16_32_7 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
vec_u8_t vfrac16_32_8 = (vec_u8_t){13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
vec_u8_t vfrac16_32_9 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
vec_u8_t vfrac16_32_10 = (vec_u8_t){23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23};
vec_u8_t vfrac16_32_11 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
vec_u8_t vfrac16_32_12 = (vec_u8_t){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
vec_u8_t vfrac16_32_13 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
vec_u8_t vfrac16_32_14 = (vec_u8_t){11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11};
vec_u8_t vfrac16_32_15 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
vec_u8_t vfrac16_32_16 = (vec_u8_t){21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21};
vec_u8_t vfrac16_32_17 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
vec_u8_t vfrac16_32_18 = (vec_u8_t){31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31};
vec_u8_t vfrac16_32_19 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
vec_u8_t vfrac16_32_20 = (vec_u8_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9};
vec_u8_t vfrac16_32_21 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
vec_u8_t vfrac16_32_22 = (vec_u8_t){19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19};
vec_u8_t vfrac16_32_23 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
vec_u8_t vfrac16_32_24 = (vec_u8_t){29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
vec_u8_t vfrac16_32_25 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
vec_u8_t vfrac16_32_26 = (vec_u8_t){7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
vec_u8_t vfrac16_32_27 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
vec_u8_t vfrac16_32_28 = (vec_u8_t){17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
vec_u8_t vfrac16_32_29 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
vec_u8_t vfrac16_32_30 = (vec_u8_t){27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27};
vec_u8_t vfrac16_32_31 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};

    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[x] + f[y] * ref[x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv1, srv1add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_1, srv16add1_1, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv2, srv2add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_2, srv16add1_2, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv3, srv3add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_3, srv16add1_3, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv4, srv4add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_4, srv16add1_4, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv5, srv5add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_5, srv16add1_5, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv6, srv6add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_6, srv16add1_6, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv7, srv7add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_7, srv16add1_7, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv8, srv8add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_8, srv16add1_8, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv9, srv9add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_9, srv16add1_9, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv10, srv10add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_10, srv16add1_10, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv11, srv11add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_11, srv16add1_11, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv12, srv12add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_12, srv16add1_12, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv13, srv13add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_13, srv16add1_13, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv14, srv14add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_14, srv16add1_14, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv15, srv15add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_15, srv16add1_15, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

    one_line(srv16, srv16add1, vfrac16_32_16, vfrac16_16, vout_0);
    one_line(srv16_16, srv16add1_16, vfrac16_32_16, vfrac16_16, vout_1);

    one_line(srv17, srv17add1, vfrac16_32_17, vfrac16_17, vout_2);
    one_line(srv16_17, srv16add1_17, vfrac16_32_17, vfrac16_17, vout_3);

    one_line(srv18, srv18add1, vfrac16_32_18, vfrac16_18, vout_4);
    one_line(srv16_18, srv16add1_18, vfrac16_32_18, vfrac16_18, vout_5);

    one_line(srv19, srv19add1, vfrac16_32_19, vfrac16_19, vout_6);
    one_line(srv16_19, srv16add1_19, vfrac16_32_19, vfrac16_19, vout_7);

    one_line(srv20, srv20add1, vfrac16_32_20, vfrac16_20, vout_8);
    one_line(srv16_20, srv16add1_20, vfrac16_32_20, vfrac16_20, vout_9);

    one_line(srv21, srv21add1, vfrac16_32_21, vfrac16_21, vout_10);
    one_line(srv16_21, srv16add1_21, vfrac16_32_21, vfrac16_21, vout_11);

    one_line(srv22, srv22add1, vfrac16_32_22, vfrac16_22, vout_12);
    one_line(srv16_22, srv16add1_22, vfrac16_32_22, vfrac16_22, vout_13);

    one_line(srv23, srv23add1, vfrac16_32_23, vfrac16_23, vout_14);
    one_line(srv16_23, srv16add1_23, vfrac16_32_23, vfrac16_23, vout_15);

    one_line(srv24, srv24add1, vfrac16_32_24, vfrac16_24, vout_16);
    one_line(srv16_24, srv16add1_24, vfrac16_32_24, vfrac16_24, vout_17);

    one_line(srv25, srv25add1, vfrac16_32_25, vfrac16_25, vout_18);
    one_line(srv16_25, srv16add1_25, vfrac16_32_25, vfrac16_25, vout_19);

    one_line(srv26, srv26add1, vfrac16_32_26, vfrac16_26, vout_20);
    one_line(srv16_26, srv16add1_26, vfrac16_32_26, vfrac16_26, vout_21);

    one_line(srv27, srv27add1, vfrac16_32_27, vfrac16_27, vout_22);
    one_line(srv16_27, srv16add1_27, vfrac16_32_27, vfrac16_27, vout_23);

    one_line(srv28, srv28add1, vfrac16_32_28, vfrac16_28, vout_24);
    one_line(srv16_28, srv16add1_28, vfrac16_32_28, vfrac16_28, vout_25);

    one_line(srv29, srv29add1, vfrac16_32_29, vfrac16_29, vout_26);
    one_line(srv16_29, srv16add1_29, vfrac16_32_29, vfrac16_29, vout_27);

    one_line(srv30, srv30add1, vfrac16_32_30, vfrac16_30, vout_28);
    one_line(srv16_30, srv16add1_30, vfrac16_32_30, vfrac16_30, vout_29);

    one_line(srv31, srv31add1, vfrac16_32_31, vfrac16_31, vout_30);
    one_line(srv16_31, srv16add1_31, vfrac16_32_31, vfrac16_31, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void one_ang_pred_altivec<4, 11>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, 0x0, 0x1, 0x2, 0x3, };
    vec_u8_t mask1={0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4, };
/*
    vec_u8_t srv=vec_xl(0, srcPix0); 
*/
    vec_u8_t srv_left=vec_xl(0, srcPix0); 
    vec_u8_t srv_right=vec_xl(9, srcPix0); 
    vec_u8_t refmask_4={0x00, 0x10, 0x11, 0x12, 0x13, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_4);

    vec_u8_t srv0 = vec_perm(srv, srv, mask0); 
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);
    vec_u8_t vfrac4 = (vec_u8_t){30, 30, 30, 30, 28, 28, 28, 28, 26, 26, 26, 26, 24, 24, 24, 24};
    vec_u8_t vfrac4_32 = (vec_u8_t){2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8};
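    // mode 11 uses intraPredAngle = -2: f[y] = ((y+1) * -2) & 31 = {30, 28, 26, 24};
    // deltaInt stays at -1, so the reference is just the corner plus the left
    // neighbours and no inverse-angle projection is needed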

    vec_u16_t vmle0 = vec_mule(srv0, vfrac4_32); 
    vec_u16_t vmlo0 = vec_mulo(srv0, vfrac4_32); 
    vec_u16_t vmle1 = vec_mule(srv1, vfrac4); 
    vec_u16_t vmlo1 = vec_mulo(srv1, vfrac4); 
    vec_u16_t vsume = vec_add(vec_add(vmle0, vmle1), u16_16); 
    vec_u16_t ve = vec_sra(vsume, u16_5);
    vec_u16_t vsumo = vec_add(vec_add(vmlo0, vmlo1), u16_16); 
    vec_u16_t vo = vec_sra(vsumo, u16_5);
    vec_u8_t vout = vec_pack(vec_mergeh(ve, vo), vec_mergel(ve, vo));

    vec_xst(vout, 0, dst);              

#ifdef DEBUG
        for (int y = 0; y < 4; y++)
        {
            for (int x = 0; x < 4; x++)
            {
                printf("%d ",dst[y * 4 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<8, 11>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    /* the same two permutations serve all four output pairs */
    vec_u8_t mask0={0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, };
    vec_u8_t mask1={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, };


    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t vout_0, vout_1, vout_2, vout_3;    
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
        
    vec_u8_t srv_left=vec_xl(0, srcPix0);   /* top-left sample at srcPix0[0] */
    vec_u8_t srv_right=vec_xl(17, srcPix0); /* left column starts at srcPix0[17] for width 8 */
    vec_u8_t refmask_8={0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, };
    vec_u8_t srv = vec_perm(srv_left, srv_right, refmask_8); /* [top-left, left(0..7)] */
        
    vec_u8_t srv0 = vec_perm(srv, srv, mask0);
    vec_u8_t srv1 = vec_perm(srv, srv, mask1);

    vec_u8_t vfrac8_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac8_1 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac8_2 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac8_3 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac8_32_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac8_32_1 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac8_32_2 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac8_32_3 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 16, 16, 16, 16, 16, 16, 16, 16};

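    /* one_line (a macro defined earlier in this file) performs the same
     * vec_mule/vec_mulo multiply, +16 rounding, >>5 shift and pack sequence
     * that is written out explicitly in the <4, 11> specialization above. */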
    one_line(srv0, srv1, vfrac8_32_0, vfrac8_0, vout_0);
    one_line(srv0, srv1, vfrac8_32_1, vfrac8_1, vout_1);
    one_line(srv0, srv1, vfrac8_32_2, vfrac8_2, vout_2);
    one_line(srv0, srv1, vfrac8_32_3, vfrac8_3, vout_3);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 48, dst);           

#ifdef DEBUG
        for (int y = 0; y < 8; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                printf("%d ",dst[y * 8 + x] );                  
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


template<>
void one_ang_pred_altivec<16, 11>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t maskadd1_0={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    
    vec_u8_t srv_left=vec_xl(0, srcPix0);   /* top-left sample at srcPix0[0] */
    vec_u8_t srv_right=vec_xl(33, srcPix0); /* left column starts at srcPix0[33] for width 16 */
    vec_u8_t refmask_16={0x00, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e};
    vec_u8_t s0 = vec_perm(srv_left, srv_right, refmask_16);    
    vec_u8_t s1 = vec_xl(48, srcPix0);  

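    /* With intraPredAngle = -2, ((y + 1) * -2) >> 5 stays at -1 for every row
     * y = 0..15, so all sixteen rows interpolate between the same two source
     * vectors; only the per-row fractional weights change. */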
    vec_u8_t srv0 = s0; 
    vec_u8_t srv1 = vec_perm(s0, s1, maskadd1_0);

    vec_u8_t vfrac16_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_1 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_2 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_4 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_5 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_6 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_8 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_9 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_10 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_12 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_13 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_14 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
    vec_u8_t vfrac16_32_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_2 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_4 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_5 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_32_8 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_9 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_13 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_32_14 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;

    one_line(srv0, srv1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv0, srv1, vfrac16_32_1, vfrac16_1, vout_1);
    one_line(srv0, srv1, vfrac16_32_2, vfrac16_2, vout_2);
    one_line(srv0, srv1, vfrac16_32_3, vfrac16_3, vout_3);
    one_line(srv0, srv1, vfrac16_32_4, vfrac16_4, vout_4);
    one_line(srv0, srv1, vfrac16_32_5, vfrac16_5, vout_5);
    one_line(srv0, srv1, vfrac16_32_6, vfrac16_6, vout_6);
    one_line(srv0, srv1, vfrac16_32_7, vfrac16_7, vout_7);
    one_line(srv0, srv1, vfrac16_32_8, vfrac16_8, vout_8);
    one_line(srv0, srv1, vfrac16_32_9, vfrac16_9, vout_9);
    one_line(srv0, srv1, vfrac16_32_10, vfrac16_10, vout_10);
    one_line(srv0, srv1, vfrac16_32_11, vfrac16_11, vout_11);
    one_line(srv0, srv1, vfrac16_32_12, vfrac16_12, vout_12);
    one_line(srv0, srv1, vfrac16_32_13, vfrac16_13, vout_13);
    one_line(srv0, srv1, vfrac16_32_14, vfrac16_14, vout_14);
    one_line(srv0, srv1, vfrac16_32_15, vfrac16_15, vout_15);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 16*2, dst);         
    vec_xst(vout_3, 16*3, dst);         
    vec_xst(vout_4, 16*4, dst);         
    vec_xst(vout_5, 16*5, dst);         
    vec_xst(vout_6, 16*6, dst);         
    vec_xst(vout_7, 16*7, dst);         
    vec_xst(vout_8, 16*8, dst);         
    vec_xst(vout_9, 16*9, dst);         
    vec_xst(vout_10, 16*10, dst);               
    vec_xst(vout_11, 16*11, dst);               
    vec_xst(vout_12, 16*12, dst);               
    vec_xst(vout_13, 16*13, dst);               
    vec_xst(vout_14, 16*14, dst);               
    vec_xst(vout_15, 16*15, dst);               

#ifdef DEBUG
        for (int y = 0; y < 16; y++)
        {
            for (int x = 0; x < 16; x++)
            {
                printf("%d ",dst[y * 16 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}

template<>
void one_ang_pred_altivec<32, 11>(pixel* dst, const pixel *srcPix0, int bFilter)
{
    vec_u8_t mask0={0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, };
    vec_u8_t maskadd1_0={0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, };

    vec_u16_t u16_16 = {16, 16, 16, 16, 16, 16, 16, 16};
    vec_u16_t u16_5 = {5, 5, 5, 5, 5, 5, 5, 5};
    vec_u8_t srv_left0=vec_xl(0, srcPix0); 
    vec_u8_t srv_left1=vec_xl(16, srcPix0); 
    vec_u8_t srv_right=vec_xl(65, srcPix0); 
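    /* Splice the extended reference for the negative angle: s0 byte 0 comes
     * from srcPix0[16], byte 1 is the top-left sample srcPix0[0], and bytes
     * 2..15 are the left column from srcPix0[65]; s1 and s2 continue it. */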
    vec_u8_t refmask_32_0={0x10, 0x00, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    vec_u8_t refmask_32_1={0x0, 0x1, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d};
    vec_u8_t s0 = vec_perm( vec_perm(srv_left0, srv_left1, refmask_32_0), srv_right, refmask_32_1 );    
    vec_u8_t s1 = vec_xl(79, srcPix0);  
    vec_u8_t s2 = vec_xl(95, srcPix0);  

    vec_u8_t srv0 = vec_perm(s0, s1, mask0); 
    vec_u8_t srv16_0 = vec_perm(s1, s2, mask0); 
    vec_u8_t srv0add1 = vec_perm(s0, s1, maskadd1_0);
    vec_u8_t srv16add1_0 = vec_perm(s1, s2, maskadd1_0);

    vec_u8_t vfrac16_0 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_1 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_2 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_3 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_4 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_5 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_6 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_8 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_9 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_10 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_11 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_12 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_13 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_14 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_15 = (vec_u8_t){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    vec_u8_t vfrac16_32_0 = (vec_u8_t){2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
    vec_u8_t vfrac16_32_1 = (vec_u8_t){4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4};
    vec_u8_t vfrac16_32_2 = (vec_u8_t){6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6};
    vec_u8_t vfrac16_32_3 = (vec_u8_t){8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
    vec_u8_t vfrac16_32_4 = (vec_u8_t){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10};
    vec_u8_t vfrac16_32_5 = (vec_u8_t){12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12};
    vec_u8_t vfrac16_32_6 = (vec_u8_t){14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
    vec_u8_t vfrac16_32_7 = (vec_u8_t){16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};
    vec_u8_t vfrac16_32_8 = (vec_u8_t){18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18};
    vec_u8_t vfrac16_32_9 = (vec_u8_t){20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
    vec_u8_t vfrac16_32_10 = (vec_u8_t){22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22};
    vec_u8_t vfrac16_32_11 = (vec_u8_t){24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24};
    vec_u8_t vfrac16_32_12 = (vec_u8_t){26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26};
    vec_u8_t vfrac16_32_13 = (vec_u8_t){28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28};
    vec_u8_t vfrac16_32_14 = (vec_u8_t){30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30};
    vec_u8_t vfrac16_32_15 = (vec_u8_t){32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};


    /*  dst[y * dstStride + x] = (pixel)((f32[y] * ref[0 + x] + f[y] * ref[0 + x + 1] + 16) >> 5) */
    vec_u16_t vmle0, vmlo0, vmle1, vmlo1, vsume, ve, vsumo, vo;
    vec_u8_t vout_0, vout_1, vout_2, vout_3, vout_4, vout_5, vout_6, vout_7;
    vec_u8_t vout_8, vout_9, vout_10, vout_11, vout_12, vout_13, vout_14, vout_15;
    vec_u8_t vout_16, vout_17, vout_18, vout_19, vout_20, vout_21, vout_22, vout_23;
    vec_u8_t vout_24, vout_25, vout_26, vout_27, vout_28, vout_29, vout_30, vout_31;

    one_line(srv0, srv0add1, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(srv16_0, srv16add1_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(srv0, srv0add1, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(srv16_0, srv16add1_0, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(srv0, srv0add1, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(srv16_0, srv16add1_0, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(srv0, srv0add1, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(srv16_0, srv16add1_0, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(srv0, srv0add1, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(srv16_0, srv16add1_0, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(srv0, srv0add1, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(srv16_0, srv16add1_0, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(srv0, srv0add1, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(srv16_0, srv16add1_0, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(srv0, srv0add1, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(srv16_0, srv16add1_0, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(srv0, srv0add1, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(srv16_0, srv16add1_0, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(srv0, srv0add1, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(srv16_0, srv16add1_0, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(srv0, srv0add1, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(srv16_0, srv16add1_0, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(srv0, srv0add1, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(srv16_0, srv16add1_0, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(srv0, srv0add1, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(srv16_0, srv16add1_0, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(srv0, srv0add1, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(srv16_0, srv16add1_0, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(srv0, srv0add1, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(srv16_0, srv16add1_0, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(srv0, srv0add1, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(srv16_0, srv16add1_0, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 0, dst);            
    vec_xst(vout_1, 16, dst);           
    vec_xst(vout_2, 32, dst);           
    vec_xst(vout_3, 32+16, dst);                
    vec_xst(vout_4, 32*2, dst);         
    vec_xst(vout_5, 32*2+16, dst);              
    vec_xst(vout_6, 32*3, dst);         
    vec_xst(vout_7, 32*3+16, dst);              
    vec_xst(vout_8, 32*4, dst);         
    vec_xst(vout_9, 32*4+16, dst);              
    vec_xst(vout_10, 32*5, dst);                
    vec_xst(vout_11, 32*5+16, dst);             
    vec_xst(vout_12, 32*6, dst);                
    vec_xst(vout_13, 32*6+16, dst);             
    vec_xst(vout_14, 32*7, dst);                
    vec_xst(vout_15, 32*7+16, dst);             
    vec_xst(vout_16, 32*8, dst);                
    vec_xst(vout_17, 32*8+16, dst);             
    vec_xst(vout_18, 32*9, dst);                
    vec_xst(vout_19, 32*9+16, dst);             
    vec_xst(vout_20, 32*10, dst);               
    vec_xst(vout_21, 32*10+16, dst);            
    vec_xst(vout_22, 32*11, dst);               
    vec_xst(vout_23, 32*11+16, dst);            
    vec_xst(vout_24, 32*12, dst);               
    vec_xst(vout_25, 32*12+16, dst);            
    vec_xst(vout_26, 32*13, dst);               
    vec_xst(vout_27, 32*13+16, dst);            
    vec_xst(vout_28, 32*14, dst);               
    vec_xst(vout_29, 32*14+16, dst);            
    vec_xst(vout_30, 32*15, dst);               
    vec_xst(vout_31, 32*15+16, dst);            

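    /* Rows 16..31: ((y + 1) * -2) >> 5 drops to -2, so the reference window
     * shifts one sample to the left; (s0, srv0) and (s1, srv16_0) replace
     * (srv0, srv0add1) and (srv16_0, srv16add1_0) as the sample pairs. */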
    one_line(s0, srv0, vfrac16_32_0, vfrac16_0, vout_0);
    one_line(s1, srv16_0, vfrac16_32_0, vfrac16_0, vout_1);

    one_line(s0, srv0, vfrac16_32_1, vfrac16_1, vout_2);
    one_line(s1, srv16_0, vfrac16_32_1, vfrac16_1, vout_3);

    one_line(s0, srv0, vfrac16_32_2, vfrac16_2, vout_4);
    one_line(s1, srv16_0, vfrac16_32_2, vfrac16_2, vout_5);

    one_line(s0, srv0, vfrac16_32_3, vfrac16_3, vout_6);
    one_line(s1, srv16_0, vfrac16_32_3, vfrac16_3, vout_7);

    one_line(s0, srv0, vfrac16_32_4, vfrac16_4, vout_8);
    one_line(s1, srv16_0, vfrac16_32_4, vfrac16_4, vout_9);

    one_line(s0, srv0, vfrac16_32_5, vfrac16_5, vout_10);
    one_line(s1, srv16_0, vfrac16_32_5, vfrac16_5, vout_11);

    one_line(s0, srv0, vfrac16_32_6, vfrac16_6, vout_12);
    one_line(s1, srv16_0, vfrac16_32_6, vfrac16_6, vout_13);

    one_line(s0, srv0, vfrac16_32_7, vfrac16_7, vout_14);
    one_line(s1, srv16_0, vfrac16_32_7, vfrac16_7, vout_15);

    one_line(s0, srv0, vfrac16_32_8, vfrac16_8, vout_16);
    one_line(s1, srv16_0, vfrac16_32_8, vfrac16_8, vout_17);

    one_line(s0, srv0, vfrac16_32_9, vfrac16_9, vout_18);
    one_line(s1, srv16_0, vfrac16_32_9, vfrac16_9, vout_19);

    one_line(s0, srv0, vfrac16_32_10, vfrac16_10, vout_20);
    one_line(s1, srv16_0, vfrac16_32_10, vfrac16_10, vout_21);

    one_line(s0, srv0, vfrac16_32_11, vfrac16_11, vout_22);
    one_line(s1, srv16_0, vfrac16_32_11, vfrac16_11, vout_23);

    one_line(s0, srv0, vfrac16_32_12, vfrac16_12, vout_24);
    one_line(s1, srv16_0, vfrac16_32_12, vfrac16_12, vout_25);

    one_line(s0, srv0, vfrac16_32_13, vfrac16_13, vout_26);
    one_line(s1, srv16_0, vfrac16_32_13, vfrac16_13, vout_27);

    one_line(s0, srv0, vfrac16_32_14, vfrac16_14, vout_28);
    one_line(s1, srv16_0, vfrac16_32_14, vfrac16_14, vout_29);

    one_line(s0, srv0, vfrac16_32_15, vfrac16_15, vout_30);
    one_line(s1, srv16_0, vfrac16_32_15, vfrac16_15, vout_31);

    vec_xst(vout_0, 32*16, dst);                
    vec_xst(vout_1, 32*16+16, dst);             
    vec_xst(vout_2, 32*17, dst);                
    vec_xst(vout_3, 32*17+16, dst);             
    vec_xst(vout_4, 32*18, dst);                
    vec_xst(vout_5, 32*18+16, dst);             
    vec_xst(vout_6, 32*19, dst);                
    vec_xst(vout_7, 32*19+16, dst);             
    vec_xst(vout_8, 32*20, dst);                
    vec_xst(vout_9, 32*20+16, dst);             
    vec_xst(vout_10, 32*21, dst);               
    vec_xst(vout_11, 32*21+16, dst);            
    vec_xst(vout_12, 32*22, dst);               
    vec_xst(vout_13, 32*22+16, dst);            
    vec_xst(vout_14, 32*23, dst);               
    vec_xst(vout_15, 32*23+16, dst);            
    vec_xst(vout_16, 32*24, dst);               
    vec_xst(vout_17, 32*24+16, dst);            
    vec_xst(vout_18, 32*25, dst);               
    vec_xst(vout_19, 32*25+16, dst);            
    vec_xst(vout_20, 32*26, dst);               
    vec_xst(vout_21, 32*26+16, dst);            
    vec_xst(vout_22, 32*27, dst);               
    vec_xst(vout_23, 32*27+16, dst);            
    vec_xst(vout_24, 32*28, dst);               
    vec_xst(vout_25, 32*28+16, dst);            
    vec_xst(vout_26, 32*29, dst);               
    vec_xst(vout_27, 32*29+16, dst);            
    vec_xst(vout_28, 32*30, dst);               
    vec_xst(vout_29, 32*30+16, dst);            
    vec_xst(vout_30, 32*31, dst);               
    vec_xst(vout_31, 32*31+16, dst);            


#ifdef DEBUG
        for (int y = 0; y < 32; y++)
        {
            for (int x = 0; x < 32; x++)
            {
                printf("%d ",dst[y * 32 + x] );                 
            }
            printf("\n");                       
        }
        printf("\n\n");                 
#endif          
}


#define ONE_ANG(log2Size, mode, dest, refPix, filtPix, bLuma)\
{\
    const int width = 1 << log2Size;\
    /* pick the filtered reference when this mode/size combination requires it */\
    pixel *srcPix0  = (g_intraFilterFlags[mode] & width ? filtPix : refPix);\
    /* each mode writes its own width*width block of the all-angs buffer */\
    pixel *dst = dest + ((mode - 2) << (log2Size * 2));\
    one_ang_pred_altivec<width, mode>(dst, srcPix0, bLuma);\
}
        
        
template<int log2Size>
void all_angs_pred_altivec(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
{
    ONE_ANG(log2Size, 2, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 3, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 4, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 5, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 6, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 7, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 8, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 9, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 10, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 11, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 12, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 13, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 14, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 15, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 16, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 17, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 18, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 19, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 20, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 21, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 22, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 23, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 24, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 25, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 26, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 27, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 28, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 29, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 30, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 31, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 32, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 33, dest, refPix, filtPix, bLuma);
    ONE_ANG(log2Size, 34, dest, refPix, filtPix, bLuma);
    return;
}
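
/* Usage sketch (assuming the reference layout used above: the top-left
 * sample, then 2*width above samples, then 2*width left samples, i.e.
 * 4*width + 1 samples in refPix/filtPix): for an 8x8 block, dest must hold
 * 33 * 8 * 8 pixels and the call is
 *     all_angs_pred_altivec<3>(dest, refPix, filtPix, bLuma);
 * after which mode m occupies dest + (m - 2) * 64. */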

void setupIntraPrimitives_altivec(EncoderPrimitives &p)
{
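    /* Only the angular modes (2 .. NUM_INTRA_MODE - 1) are overridden here;
     * planar (0) and DC (1) keep the primitives installed before this call. */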
    for (int i = 2; i < NUM_INTRA_MODE; i++)
    {
        p.cu[BLOCK_4x4].intra_pred[i] = intra_pred_ang_altivec<4>;
        p.cu[BLOCK_8x8].intra_pred[i] = intra_pred_ang_altivec<8>;
        p.cu[BLOCK_16x16].intra_pred[i] = intra_pred_ang_altivec<16>;
        p.cu[BLOCK_32x32].intra_pred[i] = intra_pred_ang_altivec<32>;
    }

    p.cu[BLOCK_4x4].intra_pred_allangs = all_angs_pred_altivec<2>;
    p.cu[BLOCK_8x8].intra_pred_allangs = all_angs_pred_altivec<3>;
    p.cu[BLOCK_16x16].intra_pred_allangs = all_angs_pred_altivec<4>;
    p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_altivec<5>;
}

} // namespace X265_NS

