// This source file includes the following definitions:
// - can_run_code
// - wildcard_match (two overloads)
// - wildcard_search
// - check_one
// - check
// - check_sse_all
// - check_neon_all
// - check_hvx_all
// - check_altivec_all
// - test_all
// - main
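//
// Correctness-and-codegen test harness: for each target ISA we build small
// Halide expressions, vectorize them, compile to assembly, and grep the .s
// output for the instruction the pattern is expected to lower to. When the
// host can execute the generated code, we also run the vectorized and
// scalar versions and compare their results.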
#include "Halide.h"
#include <fstream>
#include <future>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include "test/common/halide_test_dirs.h"
using namespace Halide;
using namespace Halide::ConciseCasts;
using std::vector;
using std::string;
constexpr int W = 256*3;
constexpr int H = 128;
constexpr int max_i8 = 127;
constexpr int max_i16 = 32767;
constexpr int max_i32 = 0x7fffffff;
constexpr int max_u8 = 255;
constexpr int max_u16 = 65535;
const Expr max_u32 = UInt(32).max();
const Var x{"x"}, y{"y"};
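// The outcome of checking one op: the instruction pattern that was sought,
// and an error message (empty on success).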
struct TestResult {
string op;
string error_msg;
};
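// A deferred check, queued by check() and processed later: the instruction
// pattern to look for, a unique test name, the vector width to use, and the
// expression to compile.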
struct Task {
string op;
string name;
int vector_width;
Expr expr;
};
size_t num_threads = Halide::Internal::ThreadPool<void>::num_processors_online();
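// Holds the target-feature flags, the pattern filter, the shared input
// buffers, and the queue of checks to run.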
struct Test {
bool use_avx2{false};
bool use_avx512{false};
bool use_avx512_cannonlake{false};
bool use_avx512_knl{false};
bool use_avx512_skylake{false};
bool use_avx{false};
bool use_power_arch_2_07{false};
bool use_sse41{false};
bool use_sse42{false};
bool use_ssse3{false};
bool use_vsx{false};
string filter{"*"};
string output_directory{Internal::get_test_tmp_dir()};
vector<Task> tasks;
Target target;
ImageParam in_f32{Float(32), 1, "in_f32"};
ImageParam in_f64{Float(64), 1, "in_f64"};
ImageParam in_i8 {Int(8), 1, "in_i8"};
ImageParam in_u8 {UInt(8), 1, "in_u8"};
ImageParam in_i16{Int(16), 1, "in_i16"};
ImageParam in_u16{UInt(16), 1, "in_u16"};
ImageParam in_i32{Int(32), 1, "in_i32"};
ImageParam in_u32{UInt(32), 1, "in_u32"};
ImageParam in_i64{Int(64), 1, "in_i64"};
ImageParam in_u64{UInt(64), 1, "in_u64"};
const vector<ImageParam> image_params{in_f32, in_f64, in_i8, in_u8, in_i16, in_u16, in_i32, in_u32, in_i64, in_u64};
const vector<Argument> arg_types{in_f32, in_f64, in_i8, in_u8, in_i16, in_u16, in_i32, in_u32, in_i64, in_u64};
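// Derive the feature flags from the environment target, and fill the input
// buffers with random data if the host can execute the generated code.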
Test() {
target = get_target_from_environment()
.with_feature(Target::NoBoundsQuery)
.with_feature(Target::NoAsserts)
.with_feature(Target::NoRuntime);
use_avx512_knl = target.has_feature(Target::AVX512_KNL);
use_avx512_cannonlake = target.has_feature(Target::AVX512_Cannonlake);
use_avx512_skylake = use_avx512_cannonlake || target.has_feature(Target::AVX512_Skylake);
use_avx512 = use_avx512_knl || use_avx512_skylake || use_avx512_cannonlake || target.has_feature(Target::AVX512);
use_avx2 = use_avx512 || target.has_feature(Target::AVX2);
use_avx = use_avx2 || target.has_feature(Target::AVX);
use_sse41 = use_avx || target.has_feature(Target::SSE41);
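// There are no separate Target features for these: assume SSSE3 in lockstep
// with SSE4.1, and SSE4.2 whenever AVX is enabled.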
use_ssse3 = use_sse41;
use_sse42 = use_avx;
use_vsx = target.has_feature(Target::VSX);
use_power_arch_2_07 = target.has_feature(Target::POWER_ARCH_2_07);
const bool can_run = can_run_code();
for (auto p : image_params) {
p.set_host_alignment(128);
p.dim(0).set_min(0);
if (can_run) {
Buffer<> b(p.type(), {W*4+H, H});
Expr r;
if (p.type().is_float()) {
r = cast(p.type(), random_float() * 1024 - 512);
} else {
r = cast(p.type(), random_int() / 4);
}
lambda(x, y, r).realize(b);
p.set(b);
}
}
}
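// We can only JIT and run the generated code when the target matches the
// host's architecture, bits, and OS, and agrees on every SIMD-related
// feature we might be testing.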
bool can_run_code() const {
Target host_target = get_host_target();
bool can_run_the_code =
(target.arch == host_target.arch &&
target.bits == host_target.bits &&
target.os == host_target.os);
for (Target::Feature f : {Target::SSE41, Target::AVX,
Target::AVX2, Target::AVX512,
Target::FMA, Target::FMA4, Target::F16C,
Target::VSX, Target::POWER_ARCH_2_07,
Target::ARMv7s, Target::NoNEON, Target::MinGW}) {
if (target.has_feature(f) != host_target.has_feature(f)) {
can_run_the_code = false;
}
}
return can_run_the_code;
}
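// Minimal wildcard matcher for scanning assembly: '*' matches any substring,
// and a space on either side of the comparison may be skipped, so patterns
// tolerate variable whitespace in the disassembly. For example,
// wildcard_match("pcmp*b", "pcmpeqb") is true.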
bool wildcard_match(const char* p, const char* str) const {
while (*p && *str && *p == *str && *p != '*') {
str++;
p++;
}
if (!*p) {
return *str == 0;
} else if (*p == '*') {
p++;
do {
if (wildcard_match(p, str)) {
return true;
}
} while (*str++);
} else if (*p == ' ') {
p++;
if (wildcard_match(p, str)) {
return true;
}
} else if (*str == ' ') {
str++;
if (wildcard_match(p, str)) {
return true;
}
}
return !*p;
}
bool wildcard_match(const string& p, const string& str) const {
return wildcard_match(p.c_str(), str.c_str());
}
bool wildcard_search(const string& p, const string& str) const {
return wildcard_match("*" + p + "*", str);
}
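// Compile the expression both vectorized and as a scalar reference. First
// scan the generated assembly for the op pattern (rejecting matches that are
// merely part of a symbol name, i.e. preceded by '_'); then, if the host can
// run the code, realize the maximum absolute difference between the two
// versions and report any disagreement above 0.001.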
TestResult check_one(const string &op, const string &name, int vector_width, Expr e) const {
std::ostringstream error_msg;
Func f(name);
f(x, y) = e;
f.bound(x, 0, W).vectorize(x, vector_width);
f.compute_root();
Func f_scalar("scalar_" + name);
f_scalar(x, y) = e;
f_scalar.bound(x, 0, W);
f_scalar.compute_root();
RDom r(0, W, 0, H);
Func error("error_" + name);
error() = cast<double>(maximum(absd(f(r.x, r.y), f_scalar(r.x, r.y))));
{
string asm_filename = output_directory + "check_" + name + ".s";
f.compile_to_assembly(asm_filename, arg_types, target);
std::ifstream asm_file;
asm_file.open(asm_filename);
bool found_it = false;
std::ostringstream msg;
msg << op << " was not generated. Instead we got:\n";
string line;
while (getline(asm_file, line)) {
msg << line << "\n";
found_it |= wildcard_search(op, line) && !wildcard_search("_" + op, line);
}
if (!found_it) {
error_msg << "Failed: " << msg.str() << "\n";
}
asm_file.close();
}
string fn_name = "test_" + name;
error.compile_to_file(output_directory + fn_name, arg_types, fn_name, target);
bool can_run_the_code = can_run_code();
if (can_run_the_code) {
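// JIT execution needs the Halide runtime, so drop NoRuntime for realize.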
Realization r = error.realize(target.without_feature(Target::NoRuntime));
double e = Buffer<double>(r[0])();
if (e > 0.001) {
error_msg << "The vector and scalar versions of " << name << " disagree. Maximum error: " << e << "\n";
}
}
return { op, error_msg.str() };
}
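// Sanitize the op into a unique function name, apply the user's filter, and
// queue the check to be compiled later. For example,
// check("paddb", 16, u8_1 + u8_2) queues a test that expects "paddb" in the
// assembly for a 16-wide u8 add.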
void check(string op, int vector_width, Expr e) {
string name = "op_" + op;
for (size_t i = 0; i < name.size(); i++) {
if (!isalnum(name[i])) name[i] = '_';
}
name += "_" + std::to_string(tasks.size());
if (!wildcard_match(filter, op)) return;
tasks.emplace_back(Task {op, name, vector_width, e});
}
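// x86 checks. On LLVM newer than 3.9 the expected mnemonics carry a "*ymm"
// operand suffix, appended via the YMM macro below.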
void check_sse_all() {
#if LLVM_VERSION > 39
#define YMM "*ymm"
#else
#define YMM
#endif
Expr f64_1 = in_f64(x), f64_2 = in_f64(x+16), f64_3 = in_f64(x+32);
Expr f32_1 = in_f32(x), f32_2 = in_f32(x+16), f32_3 = in_f32(x+32);
Expr i8_1 = in_i8(x), i8_2 = in_i8(x+16), i8_3 = in_i8(x+32);
Expr u8_1 = in_u8(x), u8_2 = in_u8(x+16), u8_3 = in_u8(x+32);
Expr i16_1 = in_i16(x), i16_2 = in_i16(x+16), i16_3 = in_i16(x+32);
Expr u16_1 = in_u16(x), u16_2 = in_u16(x+16), u16_3 = in_u16(x+32);
Expr i32_1 = in_i32(x), i32_2 = in_i32(x+16), i32_3 = in_i32(x+32);
Expr u32_1 = in_u32(x), u32_2 = in_u32(x+16), u32_3 = in_u32(x+32);
Expr i64_1 = in_i64(x), i64_2 = in_i64(x+16), i64_3 = in_i64(x+32);
Expr u64_1 = in_u64(x), u64_2 = in_u64(x+16), u64_3 = in_u64(x+32);
Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f);
for (int w = 1; w <= 4; w++) {
if (w > 1) {
check("paddb", 8*w, u8_1 + u8_2);
check("psubb", 8*w, u8_1 - u8_2);
check("paddw", 4*w, u16_1 + u16_2);
check("psubw", 4*w, u16_1 - u16_2);
check("pmullw", 4*w, i16_1 * i16_2);
check("paddd", 2*w, i32_1 + i32_2);
check("psubd", 2*w, i32_1 - i32_2);
}
check("paddsb", 8*w, i8_sat(i16(i8_1) + i16(i8_2)));
check("paddsb", 8*w, i8_sat(i16(i8_1) + i16(3)));
check("psubsb", 8*w, i8_sat(i16(i8_1) - i16(i8_2)));
check("paddusb", 8*w, u8(min(u16(u8_1) + u16(u8_2), max_u8)));
check("psubusb", 8*w, u8(max(i16(u8_1) - i16(u8_2), 0)));
check("paddsw", 4*w, i16_sat(i32(i16_1) + i32(i16_2)));
check("psubsw", 4*w, i16_sat(i32(i16_1) - i32(i16_2)));
check("paddusw", 4*w, u16(min(u32(u16_1) + u32(u16_2), max_u16)));
check("psubusw", 4*w, u16(max(i32(u16_1) - i32(u16_2), 0)));
check("pmulhw", 4*w, i16((i32(i16_1) * i32(i16_2)) / (256*256)));
check("pmulhw", 4*w, i16((i32(i16_1) * i32(i16_2)) >> 16));
check("pmulhw", 4*w, i16((3 * i32(i16_2)) / (256*256)));
check("pmulhw", 4*w, select(in_u8(0) == 0,
i16((3 * i32(i16_2)) / (256*256)),
i16((5 * i32(i16_2)) / (256*256))));
check("pmulhuw", 4*w, i16_1 / 15);
check("pcmp*b", 8*w, select(u8_1 == u8_2, u8(1), u8(2)));
check("pcmp*b", 8*w, select(u8_1 > u8_2, u8(1), u8(2)));
check("pcmp*w", 4*w, select(u16_1 == u16_2, u16(1), u16(2)));
check("pcmp*w", 4*w, select(u16_1 > u16_2, u16(1), u16(2)));
check("pcmp*d", 2*w, select(u32_1 == u32_2, u32(1), u32(2)));
check("pcmp*d", 2*w, select(u32_1 > u32_2, u32(1), u32(2)));
check("addps", 2*w, f32_1 + f32_2);
check("subps", 2*w, f32_1 - f32_2);
check("mulps", 2*w, f32_1 * f32_2);
check(use_avx512_skylake ? "vrsqrt14ps" : "rsqrtps", 2*w, fast_inverse_sqrt(f32_1));
check(use_avx512_skylake ? "vrcp14ps" : "rcpps", 2*w, fast_inverse(f32_1));
check("sqrtps", 2*w, sqrt(f32_2));
check("maxps", 2*w, max(f32_1, f32_2));
check("minps", 2*w, min(f32_1, f32_2));
check("pavgb", 8*w, u8((u16(u8_1) + u16(u8_2) + 1)/2));
check("pavgb", 8*w, u8((u16(u8_1) + u16(u8_2) + 1)>>1));
check("pavgw", 4*w, u16((u32(u16_1) + u32(u16_2) + 1)/2));
check("pavgw", 4*w, u16((u32(u16_1) + u32(u16_2) + 1)>>1));
check("pmaxsw", 4*w, max(i16_1, i16_2));
check("pminsw", 4*w, min(i16_1, i16_2));
check("pmaxub", 8*w, max(u8_1, u8_2));
check("pminub", 8*w, min(u8_1, u8_2));
check("pmulhuw", 4*w, u16((u32(u16_1) * u32(u16_2))/(256*256)));
check("pmulhuw", 4*w, u16((u32(u16_1) * u32(u16_2))>>16));
check("pmulhuw", 4*w, u16_1 / 15);
check("cmpeqps", 2*w, select(f32_1 == f32_2, 1.0f, 2.0f));
check("cmpltps", 2*w, select(f32_1 < f32_2, 1.0f, 2.0f));
}
check(use_avx512_skylake ? "korw" : "orps", 4, bool_1 | bool_2);
check(use_avx512_skylake ? "kxorw" : "xorps", 4, bool_1 ^ bool_2);
if (!use_avx512) {
check("andps", 4, bool_1 & bool_2);
}
check("shufps", 4, in_f32(2*x));
for (int w = 2; w <= 4; w++) {
check("addpd", w, f64_1 + f64_2);
check("subpd", w, f64_1 - f64_2);
check("mulpd", w, f64_1 * f64_2);
check("divpd", w, f64_1 / f64_2);
check("sqrtpd", w, sqrt(f64_2));
check("maxpd", w, max(f64_1, f64_2));
check("minpd", w, min(f64_1, f64_2));
check("cmpeqpd", w, select(f64_1 == f64_2, 1.0f, 2.0f));
check("cmpltpd", w, select(f64_1 < f64_2, 1.0f, 2.0f));
check("paddq", w, i64_1 + i64_2);
check("psubq", w, i64_1 - i64_2);
check(use_avx512_skylake ? "vpmullq" : "pmuludq", w, u64_1 * u64_2);
check("packssdw", 4*w, i16_sat(i32_1));
check("packsswb", 8*w, i8_sat(i16_1));
check("packuswb", 8*w, u8_sat(i16_1));
}
if (use_ssse3) {
for (int w = 2; w <= 4; w++) {
check("pabsb", 8*w, abs(i8_1));
check("pabsw", 4*w, abs(i16_1));
check("pabsd", 2*w, abs(i32_1));
}
}
for (int w = 2; w <= 4; w++) {
check("pmaddwd", 2*w, i32(i16_1) * 3 + i32(i16_2) * 4);
check("pmaddwd", 2*w, i32(i16_1) * 3 - i32(i16_2) * 4);
}
if (use_avx2) {
check("vpmaddwd", 8, i32(i16_1) * 3 + i32(i16_2) * 4);
} else {
check("pmaddwd", 8, i32(i16_1) * 3 + i32(i16_2) * 4);
}
if (use_sse41) {
for (int w = 2; w <= 4; w++) {
if (!use_avx512) {
check("pmuludq", 2*w, u64(u32_1) * u64(u32_2));
}
check("pmulld", 2*w, i32_1 * i32_2);
check((use_avx512_skylake && w > 2) ? "vinsertf32x8" : "blend*ps", 2*w, select(f32_1 > 0.7f, f32_1, f32_2));
check((use_avx512 && w > 2) ? "vinsertf64x4" : "blend*pd", w, select(f64_1 > cast<double>(0.7f), f64_1, f64_2));
check("pblend*b", 8*w, select(u8_1 > 7, u8_1, u8_2));
check("pblend*b", 8*w, select(u8_1 == 7, u8_1, u8_2));
check("pblend*b", 8*w, select(u8_1 <= 7, i8_1, i8_2));
check("pmaxsb", 8*w, max(i8_1, i8_2));
check("pminsb", 8*w, min(i8_1, i8_2));
check("pmaxuw", 4*w, max(u16_1, u16_2));
check("pminuw", 4*w, min(u16_1, u16_2));
check("pmaxud", 2*w, max(u32_1, u32_2));
check("pminud", 2*w, min(u32_1, u32_2));
check("pmaxsd", 2*w, max(i32_1, i32_2));
check("pminsd", 2*w, min(i32_1, i32_2));
check("roundps", 2*w, round(f32_1));
check("roundpd", w, round(f64_1));
check("roundps", 2*w, floor(f32_1));
check("roundpd", w, floor(f64_1));
check("roundps", 2*w, ceil(f32_1));
check("roundpd", w, ceil(f64_1));
check("pcmpeqq", w, select(i64_1 == i64_2, i64(1), i64(2)));
check("packusdw", 4*w, u16_sat(i32_1));
}
}
if (use_sse42) {
check("pcmpgtq", 2, select(i64_1 > i64_2, i64(1), i64(2)));
}
if (use_avx) {
check("vsqrtps" YMM, 8, sqrt(f32_1));
check("vsqrtpd" YMM, 4, sqrt(f64_1));
check(use_avx512_skylake ? "vrsqrt14ps" : "vrsqrtps" YMM, 8, fast_inverse_sqrt(f32_1));
check(use_avx512_skylake ? "vrcp14ps" : "vrcpps" YMM, 8, fast_inverse(f32_1));
#if 0
check("vandnps", 8, bool1 & (!bool2));
check("vandps", 8, bool1 & bool2);
check("vorps", 8, bool1 | bool2);
check("vxorps", 8, bool1 ^ bool2);
#endif
check("vaddps" YMM, 8, f32_1 + f32_2);
check("vaddpd" YMM, 4, f64_1 + f64_2);
check("vmulps" YMM, 8, f32_1 * f32_2);
check("vmulpd" YMM, 4, f64_1 * f64_2);
check("vsubps" YMM, 8, f32_1 - f32_2);
check("vsubpd" YMM, 4, f64_1 - f64_2);
check("vminps" YMM, 8, min(f32_1, f32_2));
check("vminpd" YMM, 4, min(f64_1, f64_2));
check("vmaxps" YMM, 8, max(f32_1, f32_2));
check("vmaxpd" YMM, 4, max(f64_1, f64_2));
check("vroundps" YMM, 8, round(f32_1));
check("vroundpd" YMM, 4, round(f64_1));
check("vcmpeqpd" YMM, 4, select(f64_1 == f64_2, 1.0f, 2.0f));
check("vcmpltpd" YMM, 4, select(f64_1 < f64_2, 1.0f, 2.0f));
check("vcmpeqps" YMM, 8, select(f32_1 == f32_2, 1.0f, 2.0f));
check("vcmpltps" YMM, 8, select(f32_1 < f32_2, 1.0f, 2.0f));
check(use_avx512_skylake ? "vinsertf32x8" : "vblend*ps" YMM, 8, select(f32_1 > 0.7f, f32_1, f32_2));
check(use_avx512 ? "vinsertf64x4" : "vblend*pd" YMM, 4, select(f64_1 > cast<double>(0.7f), f64_1, f64_2));
check("vcvttps2dq" YMM, 8, i32(f32_1));
check("vcvtdq2ps" YMM, 8, f32(i32_1));
check("vcvttpd2dqy", 8, i32(f64_1));
check("vcvtdq2pd" YMM, 8, f64(i32_1));
check("vcvtps2pd" YMM, 8, f64(f32_1));
check("vcvtpd2psy", 8, f32(f64_1));
}
if (use_avx2) {
check("vpaddb" YMM, 32, u8_1 + u8_2);
check("vpsubb" YMM, 32, u8_1 - u8_2);
check("vpaddsb", 32, i8_sat(i16(i8_1) + i16(i8_2)));
check("vpsubsb", 32, i8_sat(i16(i8_1) - i16(i8_2)));
check("vpaddusb", 32, u8(min(u16(u8_1) + u16(u8_2), max_u8)));
check("vpsubusb", 32, u8(max(i16(u8_1) - i16(u8_2), 0)));
check("vpaddw" YMM, 16, u16_1 + u16_2);
check("vpsubw" YMM, 16, u16_1 - u16_2);
check("vpaddsw", 16, i16_sat(i32(i16_1) + i32(i16_2)));
check("vpsubsw", 16, i16_sat(i32(i16_1) - i32(i16_2)));
check("vpaddusw", 16, u16(min(u32(u16_1) + u32(u16_2), max_u16)));
check("vpsubusw", 16, u16(max(i32(u16_1) - i32(u16_2), 0)));
check("vpaddd" YMM, 8, i32_1 + i32_2);
check("vpsubd" YMM, 8, i32_1 - i32_2);
check("vpmulhw" YMM, 16, i16((i32(i16_1) * i32(i16_2)) / (256*256)));
check("vpmulhw" YMM, 16, i16((i32(i16_1) * i32(i16_2)) >> 16));
check("vpmullw" YMM, 16, i16_1 * i16_2);
check("vpcmp*b" YMM, 32, select(u8_1 == u8_2, u8(1), u8(2)));
check("vpcmp*b" YMM, 32, select(u8_1 > u8_2, u8(1), u8(2)));
check("vpcmp*w" YMM, 16, select(u16_1 == u16_2, u16(1), u16(2)));
check("vpcmp*w" YMM, 16, select(u16_1 > u16_2, u16(1), u16(2)));
check("vpcmp*d" YMM, 8, select(u32_1 == u32_2, u32(1), u32(2)));
check("vpcmp*d" YMM, 8, select(u32_1 > u32_2, u32(1), u32(2)));
check("vpavgb", 32, u8((u16(u8_1) + u16(u8_2) + 1)/2));
check("vpavgw", 16, u16((u32(u16_1) + u32(u16_2) + 1)/2));
check("vpmaxsw" YMM, 16, max(i16_1, i16_2));
check("vpminsw" YMM, 16, min(i16_1, i16_2));
check("vpmaxub" YMM, 32, max(u8_1, u8_2));
check("vpminub" YMM, 32, min(u8_1, u8_2));
check("vpmulhuw" YMM, 16, u16((u32(u16_1) * u32(u16_2))/(256*256)));
check("vpmulhuw" YMM, 16, u16((u32(u16_1) * u32(u16_2))>>16));
check("vpaddq" YMM, 8, i64_1 + i64_2);
check("vpsubq" YMM, 8, i64_1 - i64_2);
check(use_avx512_skylake ? "vpmullq" : "vpmuludq", 8, u64_1 * u64_2);
check("vpackssdw", 16, i16_sat(i32_1));
check("vpacksswb", 32, i8_sat(i16_1));
check("vpackuswb", 32, u8_sat(i16_1));
check("vpabsb", 32, abs(i8_1));
check("vpabsw", 16, abs(i16_1));
check("vpabsd", 8, abs(i32_1));
if (!use_avx512) {
check("vpmuludq" YMM, 8, u64(u32_1) * u64(u32_2));
}
check("vpmulld" YMM, 8, i32_1 * i32_2);
check("vpblend*b" YMM, 32, select(u8_1 > 7, u8_1, u8_2));
check("vpmaxsb" YMM, 32, max(i8_1, i8_2));
check("vpminsb" YMM, 32, min(i8_1, i8_2));
check("vpmaxuw" YMM, 16, max(u16_1, u16_2));
check("vpminuw" YMM, 16, min(u16_1, u16_2));
check("vpmaxud" YMM, 16, max(u32_1, u32_2));
check("vpminud" YMM, 16, min(u32_1, u32_2));
check("vpmaxsd" YMM, 8, max(i32_1, i32_2));
check("vpminsd" YMM, 8, min(i32_1, i32_2));
check("vpcmpeqq" YMM, 4, select(i64_1 == i64_2, i64(1), i64(2)));
check("vpackusdw", 16, u16(clamp(i32_1, 0, max_u16)));
check("vpcmpgtq" YMM, 4, select(i64_1 > i64_2, i64(1), i64(2)));
}
if (use_avx512) {
#if 0
check("vrangeps", 16, clamp(f32_1, 3.0f, 9.0f));
check("vrangepd", 8, clamp(f64_1, f64(3), f64(9)));
check("vreduceps", 16, f32_1 - floor(f32_1));
check("vreduceps", 16, f32_1 - floor(f32_1*8)/8);
check("vreduceps", 16, f32_1 - trunc(f32_1));
check("vreduceps", 16, f32_1 - trunc(f32_1*8)/8);
check("vreducepd", 8, f64_1 - floor(f64_1));
check("vreducepd", 8, f64_1 - floor(f64_1*8)/8);
check("vreducepd", 8, f64_1 - trunc(f64_1));
check("vreducepd", 8, f64_1 - trunc(f64_1*8)/8);
#endif
}
if (use_avx512_skylake) {
check("vpabsq", 8, abs(i64_1));
check("vpmaxuq", 8, max(u64_1, u64_2));
check("vpminuq", 8, min(u64_1, u64_2));
check("vpmaxsq", 8, max(i64_1, i64_2));
check("vpminsq", 8, min(i64_1, i64_2));
}
}
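// ARM checks. Each pattern picks the 32-bit NEON mnemonic or the AArch64
// one based on the target, and w scales the vector width from one to four
// times the natural width.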
void check_neon_all() {
Expr f64_1 = in_f64(x), f64_2 = in_f64(x+16), f64_3 = in_f64(x+32);
Expr f32_1 = in_f32(x), f32_2 = in_f32(x+16), f32_3 = in_f32(x+32);
Expr i8_1 = in_i8(x), i8_2 = in_i8(x+16), i8_3 = in_i8(x+32);
Expr u8_1 = in_u8(x), u8_2 = in_u8(x+16), u8_3 = in_u8(x+32);
Expr i16_1 = in_i16(x), i16_2 = in_i16(x+16), i16_3 = in_i16(x+32);
Expr u16_1 = in_u16(x), u16_2 = in_u16(x+16), u16_3 = in_u16(x+32);
Expr i32_1 = in_i32(x), i32_2 = in_i32(x+16), i32_3 = in_i32(x+32);
Expr u32_1 = in_u32(x), u32_2 = in_u32(x+16), u32_3 = in_u32(x+32);
Expr i64_1 = in_i64(x), i64_2 = in_i64(x+16), i64_3 = in_i64(x+32);
Expr u64_1 = in_u64(x), u64_2 = in_u64(x+16), u64_3 = in_u64(x+32);
Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f);
bool arm32 = (target.bits == 32);
for (int w = 1; w <= 4; w++) {
check(arm32 ? "vaba.s8" : "saba", 8*w, i8_1 + absd(i8_2, i8_3));
check(arm32 ? "vaba.u8" : "uaba", 8*w, u8_1 + absd(u8_2, u8_3));
check(arm32 ? "vaba.s16" : "saba", 4*w, i16_1 + absd(i16_2, i16_3));
check(arm32 ? "vaba.u16" : "uaba", 4*w, u16_1 + absd(u16_2, u16_3));
check(arm32 ? "vaba.s32" : "saba", 2*w, i32_1 + absd(i32_2, i32_3));
check(arm32 ? "vaba.u32" : "uaba", 2*w, u32_1 + absd(u32_2, u32_3));
check(arm32 ? "vabal.s8" : "sabal", 8*w, i16_1 + absd(i8_2, i8_3));
check(arm32 ? "vabal.u8" : "uabal", 8*w, u16_1 + absd(u8_2, u8_3));
check(arm32 ? "vabal.s16" : "sabal", 4*w, i32_1 + absd(i16_2, i16_3));
check(arm32 ? "vabal.u16" : "uabal", 4*w, u32_1 + absd(u16_2, u16_3));
check(arm32 ? "vabal.s32" : "sabal", 2*w, i64_1 + absd(i32_2, i32_3));
check(arm32 ? "vabal.u32" : "uabal", 2*w, u64_1 + absd(u32_2, u32_3));
check(arm32 ? "vabd.s8" : "sabd", 8*w, absd(i8_2, i8_3));
check(arm32 ? "vabd.u8" : "uabd", 8*w, absd(u8_2, u8_3));
check(arm32 ? "vabd.s16" : "sabd", 4*w, absd(i16_2, i16_3));
check(arm32 ? "vabd.u16" : "uabd", 4*w, absd(u16_2, u16_3));
check(arm32 ? "vabd.s32" : "sabd", 2*w, absd(i32_2, i32_3));
check(arm32 ? "vabd.u32" : "uabd", 2*w, absd(u32_2, u32_3));
check(arm32 ? "vabd.s8" : "sabd", 8*w, u8(abs(i16(i8_2) - i8_3)));
check(arm32 ? "vabd.u8" : "uabd", 8*w, u8(abs(i16(u8_2) - u8_3)));
check(arm32 ? "vabd.s16" : "sabd", 4*w, u16(abs(i32(i16_2) - i16_3)));
check(arm32 ? "vabd.u16" : "uabd", 4*w, u16(abs(i32(u16_2) - u16_3)));
check(arm32 ? "vabd.s32" : "sabd", 2*w, u32(abs(i64(i32_2) - i32_3)));
check(arm32 ? "vabd.u32" : "uabd", 2*w, u32(abs(i64(u32_2) - u32_3)));
check(arm32 ? "vabdl.s8" : "sabdl", 8*w, i16(absd(i8_2, i8_3)));
check(arm32 ? "vabdl.u8" : "uabdl", 8*w, u16(absd(u8_2, u8_3)));
check(arm32 ? "vabdl.s16" : "sabdl", 4*w, i32(absd(i16_2, i16_3)));
check(arm32 ? "vabdl.u16" : "uabdl", 4*w, u32(absd(u16_2, u16_3)));
check(arm32 ? "vabdl.s32" : "sabdl", 2*w, i64(absd(i32_2, i32_3)));
check(arm32 ? "vabdl.u32" : "uabdl", 2*w, u64(absd(u32_2, u32_3)));
check(arm32 ? "vabdl.s8" : "sabdl", 8*w, abs(i16(i8_2) - i16(i8_3)));
check(arm32 ? "vabdl.u8" : "uabdl", 8*w, abs(i16(u8_2) - i16(u8_3)));
check(arm32 ? "vabdl.s16" : "sabdl", 4*w, abs(i32(i16_2) - i32(i16_3)));
check(arm32 ? "vabdl.u16" : "uabdl", 4*w, abs(i32(u16_2) - i32(u16_3)));
check(arm32 ? "vabdl.s32" : "sabdl", 2*w, abs(i64(i32_2) - i64(i32_3)));
check(arm32 ? "vabdl.u32" : "uabdl", 2*w, abs(i64(u32_2) - i64(u32_3)));
check(arm32 ? "vabs.f32" : "fabs", 2*w, abs(f32_1));
check(arm32 ? "vabs.s32" : "abs", 2*w, abs(i32_1));
check(arm32 ? "vabs.s16" : "abs", 4*w, abs(i16_1));
check(arm32 ? "vabs.s8" : "abs", 8*w, abs(i8_1));
check(arm32 ? "vadd.i8" : "add", 8*w, i8_1 + i8_2);
check(arm32 ? "vadd.i8" : "add", 8*w, u8_1 + u8_2);
check(arm32 ? "vadd.i16" : "add", 4*w, i16_1 + i16_2);
check(arm32 ? "vadd.i16" : "add", 4*w, u16_1 + u16_2);
check(arm32 ? "vadd.i32" : "add", 2*w, i32_1 + i32_2);
check(arm32 ? "vadd.i32" : "add", 2*w, u32_1 + u32_2);
check(arm32 ? "vadd.f32" : "fadd", 2*w, f32_1 + f32_2);
check(arm32 ? "vadd.i64" : "add", 2*w, i64_1 + i64_2);
check(arm32 ? "vadd.i64" : "add", 2*w, u64_1 + u64_2);
check(arm32 ? "vaddhn.i16" : "addhn", 8*w, i8((i16_1 + i16_2)/256));
check(arm32 ? "vaddhn.i16" : "addhn", 8*w, u8((u16_1 + u16_2)/256));
check(arm32 ? "vaddhn.i32" : "addhn", 4*w, i16((i32_1 + i32_2)/65536));
check(arm32 ? "vaddhn.i32" : "addhn", 4*w, u16((u32_1 + u32_2)/65536));
check(arm32 ? "vaddl.s8" : "saddl", 8*w, i16(i8_1) + i16(i8_2));
check(arm32 ? "vaddl.u8" : "uaddl", 8*w, u16(u8_1) + u16(u8_2));
check(arm32 ? "vaddl.s16" : "saddl", 4*w, i32(i16_1) + i32(i16_2));
check(arm32 ? "vaddl.u16" : "uaddl", 4*w, u32(u16_1) + u32(u16_2));
check(arm32 ? "vaddl.s32" : "saddl", 2*w, i64(i32_1) + i64(i32_2));
check(arm32 ? "vaddl.u32" : "uaddl", 2*w, u64(u32_1) + u64(u32_2));
check(arm32 ? "vaddw.s8" : "saddw", 8*w, i8_1 + i16_1);
check(arm32 ? "vaddw.u8" : "uaddw", 8*w, u8_1 + u16_1);
check(arm32 ? "vaddw.s16" : "saddw", 4*w, i16_1 + i32_1);
check(arm32 ? "vaddw.u16" : "uaddw", 4*w, u16_1 + u32_1);
check(arm32 ? "vaddw.s32" : "saddw", 2*w, i32_1 + i64_1);
check(arm32 ? "vaddw.u32" : "uaddw", 2*w, u32_1 + u64_1);
check(arm32 ? "vbsl" : "bsl", 2*w, select(f32_1 > f32_2, 1.0f, 2.0f));
check(arm32 ? "vceq.i8" : "cmeq", 8*w, select(i8_1 == i8_2, i8(1), i8(2)));
check(arm32 ? "vceq.i8" : "cmeq", 8*w, select(u8_1 == u8_2, u8(1), u8(2)));
check(arm32 ? "vceq.i16" : "cmeq", 4*w, select(i16_1 == i16_2, i16(1), i16(2)));
check(arm32 ? "vceq.i16" : "cmeq", 4*w, select(u16_1 == u16_2, u16(1), u16(2)));
check(arm32 ? "vceq.i32" : "cmeq", 2*w, select(i32_1 == i32_2, i32(1), i32(2)));
check(arm32 ? "vceq.i32" : "cmeq", 2*w, select(u32_1 == u32_2, u32(1), u32(2)));
check(arm32 ? "vceq.f32" : "fcmeq", 2*w, select(f32_1 == f32_2, 1.0f, 2.0f));
#if 0
check("vcge.s8", 16, select(i8_1 >= i8_2, i8(1), i8(2)));
check("vcge.u8", 16, select(u8_1 >= u8_2, u8(1), u8(2)));
check("vcge.s16", 8, select(i16_1 >= i16_2, i16(1), i16(2)));
check("vcge.u16", 8, select(u16_1 >= u16_2, u16(1), u16(2)));
check("vcge.s32", 4, select(i32_1 >= i32_2, i32(1), i32(2)));
check("vcge.u32", 4, select(u32_1 >= u32_2, u32(1), u32(2)));
check("vcge.f32", 4, select(f32_1 >= f32_2, 1.0f, 2.0f));
check("vcge.s8", 8, select(i8_1 >= i8_2, i8(1), i8(2)));
check("vcge.u8", 8, select(u8_1 >= u8_2, u8(1), u8(2)));
check("vcge.s16", 4, select(i16_1 >= i16_2, i16(1), i16(2)));
check("vcge.u16", 4, select(u16_1 >= u16_2, u16(1), u16(2)));
check("vcge.s32", 2, select(i32_1 >= i32_2, i32(1), i32(2)));
check("vcge.u32", 2, select(u32_1 >= u32_2, u32(1), u32(2)));
check("vcge.f32", 2, select(f32_1 >= f32_2, 1.0f, 2.0f));
#endif
check(arm32 ? "vcgt.s8" : "cmgt", 8*w, select(i8_1 > i8_2, i8(1), i8(2)));
check(arm32 ? "vcgt.u8" : "cmhi", 8*w, select(u8_1 > u8_2, u8(1), u8(2)));
check(arm32 ? "vcgt.s16" : "cmgt", 4*w, select(i16_1 > i16_2, i16(1), i16(2)));
check(arm32 ? "vcgt.u16" : "cmhi", 4*w, select(u16_1 > u16_2, u16(1), u16(2)));
check(arm32 ? "vcgt.s32" : "cmgt", 2*w, select(i32_1 > i32_2, i32(1), i32(2)));
check(arm32 ? "vcgt.u32" : "cmhi", 2*w, select(u32_1 > u32_2, u32(1), u32(2)));
check(arm32 ? "vcgt.f32" : "fcmgt", 2*w, select(f32_1 > f32_2, 1.0f, 2.0f));
check(arm32 ? "vcvt.f32.u32" : "ucvtf", 2*w, f32(u32_1));
check(arm32 ? "vcvt.f32.s32" : "scvtf", 2*w, f32(i32_1));
check(arm32 ? "vcvt.u32.f32" : "fcvtzu", 2*w, u32(f32_1));
check(arm32 ? "vcvt.s32.f32" : "fcvtzs", 2*w, i32(f32_1));
check(arm32 ? "vdiv.f32" : "fdiv", 2*w, f32_1/f32_2);
check(arm32 ? "vdiv.f64" : "fdiv", 2*w, f64_1/f64_2);
check(arm32 ? "vdup.8" : "dup", 16*w, i8(y));
check(arm32 ? "vdup.8" : "dup", 16*w, u8(y));
check(arm32 ? "vdup.16" : "dup", 8*w, i16(y));
check(arm32 ? "vdup.16" : "dup", 8*w, u16(y));
check(arm32 ? "vdup.32" : "dup", 4*w, i32(y));
check(arm32 ? "vdup.32" : "dup", 4*w, u32(y));
check(arm32 ? "vdup.32" : "dup", 4*w, f32(y));
#if 0
check("vext.8", 16, in_i8(x+1));
check("vext.16", 8, in_i16(x+1));
check("vext.32", 4, in_i32(x+1));
#endif
check(arm32 ? "vhadd.s8" : "shadd", 8*w, i8((i16(i8_1) + i16(i8_2))/2));
check(arm32 ? "vhadd.u8" : "uhadd", 8*w, u8((u16(u8_1) + u16(u8_2))/2));
check(arm32 ? "vhadd.s16" : "shadd", 4*w, i16((i32(i16_1) + i32(i16_2))/2));
check(arm32 ? "vhadd.u16" : "uhadd", 4*w, u16((u32(u16_1) + u32(u16_2))/2));
check(arm32 ? "vhadd.s32" : "shadd", 2*w, i32((i64(i32_1) + i64(i32_2))/2));
check(arm32 ? "vhadd.u32" : "uhadd", 2*w, u32((u64(u32_1) + u64(u32_2))/2));
check(arm32 ? "vhadd.s32" : "shadd", 2*w, (i32_1 + i32_2)/2);
check(arm32 ? "vhsub.s8" : "shsub", 8*w, i8((i16(i8_1) - i16(i8_2))/2));
check(arm32 ? "vhsub.u8" : "uhsub", 8*w, u8((u16(u8_1) - u16(u8_2))/2));
check(arm32 ? "vhsub.s16" : "shsub", 4*w, i16((i32(i16_1) - i32(i16_2))/2));
check(arm32 ? "vhsub.u16" : "uhsub", 4*w, u16((u32(u16_1) - u32(u16_2))/2));
check(arm32 ? "vhsub.s32" : "shsub", 2*w, i32((i64(i32_1) - i64(i32_2))/2));
check(arm32 ? "vhsub.u32" : "uhsub", 2*w, u32((u64(u32_1) - u64(u32_2))/2));
check(arm32 ? "vhsub.s32" : "shsub", 2*w, (i32_1 - i32_2)/2);
check(arm32 ? "vld1.8" : "ldr", 8*w, in_i8(x+y));
check(arm32 ? "vld1.8" : "ldr", 8*w, in_u8(x+y));
check(arm32 ? "vld1.16" : "ldr", 4*w, in_i16(x+y));
check(arm32 ? "vld1.16" : "ldr", 4*w, in_u16(x+y));
if (w > 1) {
check(arm32 ? "vld1.32" : "ldr", 2*w, in_i32(x+y));
check(arm32 ? "vld1.32" : "ldr", 2*w, in_u32(x+y));
check(arm32 ? "vld1.32" : "ldr", 2*w, in_f32(x+y));
}
check(arm32 ? "vld2.32" : "ld2", 4*w, in_i32(x*2) + in_i32(x*2+1));
check(arm32 ? "vld2.32" : "ld2", 4*w, in_u32(x*2) + in_u32(x*2+1));
check(arm32 ? "vld2.32" : "ld2", 4*w, in_f32(x*2) + in_f32(x*2+1));
check(arm32 ? "vld2.8" : "ld2", 8*w, in_i8(x*2) + in_i8(x*2+1));
check(arm32 ? "vld2.8" : "ld2", 8*w, in_u8(x*2) + in_u8(x*2+1));
check(arm32 ? "vld2.16" : "ld2", 4*w, in_i16(x*2) + in_i16(x*2+1));
check(arm32 ? "vld2.16" : "ld2", 4*w, in_u16(x*2) + in_u16(x*2+1));
check(arm32 ? "vld3.32" : "ld3", 4*w, in_i32(x*3+y));
check(arm32 ? "vld3.32" : "ld3", 4*w, in_u32(x*3+y));
check(arm32 ? "vld3.32" : "ld3", 4*w, in_f32(x*3+y));
check(arm32 ? "vld3.8" : "ld3", 8*w, in_i8(x*3+y));
check(arm32 ? "vld3.8" : "ld3", 8*w, in_u8(x*3+y));
check(arm32 ? "vld3.16" : "ld3", 4*w, in_i16(x*3+y));
check(arm32 ? "vld3.16" : "ld3", 4*w, in_u16(x*3+y));
check(arm32 ? "vld4.32" : "ld4", 4*w, in_i32(x*4+y));
check(arm32 ? "vld4.32" : "ld4", 4*w, in_u32(x*4+y));
check(arm32 ? "vld4.32" : "ld4", 4*w, in_f32(x*4+y));
check(arm32 ? "vld4.8" : "ld4", 8*w, in_i8(x*4+y));
check(arm32 ? "vld4.8" : "ld4", 8*w, in_u8(x*4+y));
check(arm32 ? "vld4.16" : "ld4", 4*w, in_i16(x*4+y));
check(arm32 ? "vld4.16" : "ld4", 4*w, in_u16(x*4+y));
check(arm32 ? "vmax.s8" : "smax", 8*w, max(i8_1, i8_2));
check(arm32 ? "vmax.u8" : "umax", 8*w, max(u8_1, u8_2));
check(arm32 ? "vmax.s16" : "smax", 4*w, max(i16_1, i16_2));
check(arm32 ? "vmax.u16" : "umax", 4*w, max(u16_1, u16_2));
check(arm32 ? "vmax.s32" : "smax", 2*w, max(i32_1, i32_2));
check(arm32 ? "vmax.u32" : "umax", 2*w, max(u32_1, u32_2));
check(arm32 ? "vmax.f32" : "fmax", 2*w, max(f32_1, f32_2));
check(arm32 ? "vmin.s8" : "smin", 8*w, min(i8_1, i8_2));
check(arm32 ? "vmin.u8" : "umin", 8*w, min(u8_1, u8_2));
check(arm32 ? "vmin.s16" : "smin", 4*w, min(i16_1, i16_2));
check(arm32 ? "vmin.u16" : "umin", 4*w, min(u16_1, u16_2));
check(arm32 ? "vmin.s32" : "smin", 2*w, min(i32_1, i32_2));
check(arm32 ? "vmin.u32" : "umin", 2*w, min(u32_1, u32_2));
check(arm32 ? "vmin.f32" : "fmin", 2*w, min(f32_1, f32_2));
check(arm32 ? "vmla.i8" : "mla", 8*w, i8_1 + i8_2*i8_3);
check(arm32 ? "vmla.i8" : "mla", 8*w, u8_1 + u8_2*u8_3);
check(arm32 ? "vmla.i16" : "mla", 4*w, i16_1 + i16_2*i16_3);
check(arm32 ? "vmla.i16" : "mla", 4*w, u16_1 + u16_2*u16_3);
check(arm32 ? "vmla.i32" : "mla", 2*w, i32_1 + i32_2*i32_3);
check(arm32 ? "vmla.i32" : "mla", 2*w, u32_1 + u32_2*u32_3);
if (w == 1 || w == 2) {
check(arm32 ? "vmla.f32" : "fmla", 2*w, f32_1 + f32_2*f32_3);
}
check(arm32 ? "vmls.i8" : "mls", 8*w, i8_1 - i8_2*i8_3);
check(arm32 ? "vmls.i8" : "mls", 8*w, u8_1 - u8_2*u8_3);
check(arm32 ? "vmls.i16" : "mls", 4*w, i16_1 - i16_2*i16_3);
check(arm32 ? "vmls.i16" : "mls", 4*w, u16_1 - u16_2*u16_3);
check(arm32 ? "vmls.i32" : "mls", 2*w, i32_1 - i32_2*i32_3);
check(arm32 ? "vmls.i32" : "mls", 2*w, u32_1 - u32_2*u32_3);
if (w == 1 || w == 2) {
check(arm32 ? "vmls.f32" : "fmls", 2*w, f32_1 - f32_2*f32_3);
}
check(arm32 ? "vmlal.s8" : "smlal", 8*w, i16_1 + i16(i8_2)*i8_3);
check(arm32 ? "vmlal.u8" : "umlal", 8*w, u16_1 + u16(u8_2)*u8_3);
check(arm32 ? "vmlal.s16" : "smlal", 4*w, i32_1 + i32(i16_2)*i16_3);
check(arm32 ? "vmlal.u16" : "umlal", 4*w, u32_1 + u32(u16_2)*u16_3);
check(arm32 ? "vmlal.s32" : "smlal", 2*w, i64_1 + i64(i32_2)*i32_3);
check(arm32 ? "vmlal.u32" : "umlal", 2*w, u64_1 + u64(u32_2)*u32_3);
check(arm32 ? "vmlsl.s8" : "smlsl", 8*w, i16_1 - i16(i8_2)*i8_3);
check(arm32 ? "vmlsl.u8" : "umlsl", 8*w, u16_1 - u16(u8_2)*u8_3);
check(arm32 ? "vmlsl.s16" : "smlsl", 4*w, i32_1 - i32(i16_2)*i16_3);
check(arm32 ? "vmlsl.u16" : "umlsl", 4*w, u32_1 - u32(u16_2)*u16_3);
check(arm32 ? "vmlsl.s32" : "smlsl", 2*w, i64_1 - i64(i32_2)*i32_3);
check(arm32 ? "vmlsl.u32" : "umlsl", 2*w, u64_1 - u64(u32_2)*u32_3);
check(arm32 ? "vmovl.s8" : "sshll", 8*w, i16(i8_1));
check(arm32 ? "vmovl.u8" : "ushll", 8*w, u16(u8_1));
check(arm32 ? "vmovl.u8" : "ushll", 8*w, i16(u8_1));
check(arm32 ? "vmovl.s16" : "sshll", 4*w, i32(i16_1));
check(arm32 ? "vmovl.u16" : "ushll", 4*w, u32(u16_1));
check(arm32 ? "vmovl.u16" : "ushll", 4*w, i32(u16_1));
check(arm32 ? "vmovl.s32" : "sshll", 2*w, i64(i32_1));
check(arm32 ? "vmovl.u32" : "ushll", 2*w, u64(u32_1));
check(arm32 ? "vmovl.u32" : "ushll", 2*w, i64(u32_1));
check(arm32 ? "vmovn.i16" : "xtn", 8*w, i8(i16_1));
check(arm32 ? "vmovn.i16" : "xtn", 8*w, u8(u16_1));
check(arm32 ? "vmovn.i32" : "xtn", 4*w, i16(i32_1));
check(arm32 ? "vmovn.i32" : "xtn", 4*w, u16(u32_1));
check(arm32 ? "vmovn.i64" : "xtn", 2*w, i32(i64_1));
check(arm32 ? "vmovn.i64" : "xtn", 2*w, u32(u64_1));
check(arm32 ? "vmul.f64" : "fmul", 2*w, f64_2*f64_1);
check(arm32 ? "vmul.i8" : "mul", 8*w, i8_2*i8_1);
check(arm32 ? "vmul.i8" : "mul", 8*w, u8_2*u8_1);
check(arm32 ? "vmul.i16" : "mul", 4*w, i16_2*i16_1);
check(arm32 ? "vmul.i16" : "mul", 4*w, u16_2*u16_1);
check(arm32 ? "vmul.i32" : "mul", 2*w, i32_2*i32_1);
check(arm32 ? "vmul.i32" : "mul", 2*w, u32_2*u32_1);
check(arm32 ? "vmul.f32" : "fmul", 2*w, f32_2*f32_1);
check(arm32 ? "vmull.s8" : "smull", 8*w, i16(i8_1)*i8_2);
check(arm32 ? "vmull.u8" : "umull", 8*w, u16(u8_1)*u8_2);
check(arm32 ? "vmull.s16" : "smull", 4*w, i32(i16_1)*i16_2);
check(arm32 ? "vmull.u16" : "umull", 4*w, u32(u16_1)*u16_2);
check(arm32 ? "vmull.s32" : "smull", 2*w, i64(i32_1)*i32_2);
check(arm32 ? "vmull.u32" : "umull", 2*w, u64(u32_1)*u32_2);
check(arm32 ? "vmull.u8" : "umull", 8*w, i8_1/37);
check(arm32 ? "vmull.u8" : "umull", 8*w, u8_1/37);
check(arm32 ? "vmull.u16" : "umull", 4*w, i16_1/37);
check(arm32 ? "vmull.u16" : "umull", 4*w, u16_1/37);
check(arm32 ? "vmull.u32" : "umull", 2*w, i32_1/37);
check(arm32 ? "vmull.u32" : "umull", 2*w, u32_1/37);
check(arm32 ? "vneg.s8" : "neg", 8*w, -i8_1);
check(arm32 ? "vneg.s16" : "neg", 4*w, -i16_1);
check(arm32 ? "vneg.s32" : "neg", 2*w, -i32_1);
check(arm32 ? "vneg.f32" : "fneg", 4*w, -f32_1);
check(arm32 ? "vneg.f64" : "fneg", 2*w, -f64_1);
#if 0
check("vnmla.f32", 4, -(f32_1 + f32_2*f32_3));
check("vnmla.f64", 2, -(f64_1 + f64_2*f64_3));
check("vnmls.f32", 4, -(f32_1 - f32_2*f32_3));
check("vnmls.f64", 2, -(f64_1 - f64_2*f64_3));
check("vnmul.f32", 4, -(f32_1*f32_2));
check("vnmul.f64", 2, -(f64_1*f64_2));
#endif
#if 0
check("vqabs.s8", 16, abs(max(i8_1, -max_i8)));
check("vqabs.s8", 8, abs(max(i8_1, -max_i8)));
check("vqabs.s16", 8, abs(max(i16_1, -max_i16)));
check("vqabs.s16", 4, abs(max(i16_1, -max_i16)));
check("vqabs.s32", 4, abs(max(i32_1, -max_i32)));
check("vqabs.s32", 2, abs(max(i32_1, -max_i32)));
#endif
check(arm32 ? "vqadd.s8" : "sqadd", 8*w, i8_sat(i16(i8_1) + i16(i8_2)));
check(arm32 ? "vqadd.s16" : "sqadd", 4*w, i16_sat(i32(i16_1) + i32(i16_2)));
check(arm32 ? "vqadd.s32" : "sqadd", 2*w, i32_sat(i64(i32_1) + i64(i32_2)));
check(arm32 ? "vqadd.u8" : "uqadd", 8*w, u8(min(u16(u8_1) + u16(u8_2), max_u8)));
check(arm32 ? "vqadd.u16" : "uqadd", 4*w, u16(min(u32(u16_1) + u32(u16_2), max_u16)));
check(arm32 ? "vqadd.u8" : "uqadd", 8*w, u8(min(u16(u8_1) + 17, max_u8)));
check(arm32 ? "vqadd.u16" : "uqadd", 4*w, u16(min(u32(u16_1) + 17, max_u16)));
check(arm32 ? "vqmovn.s16" : "sqxtn", 8*w, i8_sat(i16_1));
check(arm32 ? "vqmovn.s32" : "sqxtn", 4*w, i16_sat(i32_1));
check(arm32 ? "vqmovn.s64" : "sqxtn", 2*w, i32_sat(i64_1));
check(arm32 ? "vqmovn.u16" : "uqxtn", 8*w, u8(min(u16_1, max_u8)));
check(arm32 ? "vqmovn.u32" : "uqxtn", 4*w, u16(min(u32_1, max_u16)));
check(arm32 ? "vqmovn.u64" : "uqxtn", 2*w, u32(min(u64_1, max_u32)));
check(arm32 ? "vqmovun.s16" : "sqxtun", 8*w, u8_sat(i16_1));
check(arm32 ? "vqmovun.s32" : "sqxtun", 4*w, u16_sat(i32_1));
check(arm32 ? "vqmovun.s64" : "sqxtun", 2*w, u32_sat(i64_1));
check(arm32 ? "vqneg.s8" : "sqneg", 8*w, -max(i8_1, -max_i8));
check(arm32 ? "vqneg.s16" : "sqneg", 4*w, -max(i16_1, -max_i16));
check(arm32 ? "vqneg.s32" : "sqneg", 2*w, -max(i32_1, -max_i32));
check(arm32 ? "vqrdmulh.s16" : "sqrdmulh", 4*w, i16_sat((i32(i16_1) * i32(i16_2) + (1<<14)) / (1 << 15)));
check(arm32 ? "vqrdmulh.s32" : "sqrdmulh", 2*w, i32_sat((i64(i32_1) * i64(i32_2) + (1<<30)) /
(Expr(int64_t(1)) << 31)));
check(arm32 ? "vqshl.s8" : "sqshl", 8*w, i8_sat(i16(i8_1)*16));
check(arm32 ? "vqshl.s16" : "sqshl", 4*w, i16_sat(i32(i16_1)*16));
check(arm32 ? "vqshl.s32" : "sqshl", 2*w, i32_sat(i64(i32_1)*16));
check(arm32 ? "vqshl.u8" : "uqshl", 8*w, u8(min(u16(u8_1 )*16, max_u8)));
check(arm32 ? "vqshl.u16" : "uqshl", 4*w, u16(min(u32(u16_1)*16, max_u16)));
check(arm32 ? "vqshl.u32" : "uqshl", 2*w, u32(min(u64(u32_1)*16, max_u32)));
check(arm32 ? "vqshlu.s8" : "sqshlu", 8*w, u8_sat(i16(i8_1)*16));
check(arm32 ? "vqshlu.s16" : "sqshlu", 4*w, u16_sat(i32(i16_1)*16));
check(arm32 ? "vqshlu.s32" : "sqshlu", 2*w, u32_sat(i64(i32_1)*16));
check(arm32 ? "vqshrn.s16" : "sqshrn", 8*w, i8_sat(i16_1/16));
check(arm32 ? "vqshrn.s32" : "sqshrn", 4*w, i16_sat(i32_1/16));
check(arm32 ? "vqshrn.s64" : "sqshrn", 2*w, i32_sat(i64_1/16));
check(arm32 ? "vqshrun.s16" : "sqshrun", 8*w, u8_sat(i16_1/16));
check(arm32 ? "vqshrun.s32" : "sqshrun", 4*w, u16_sat(i32_1/16));
check(arm32 ? "vqshrun.s64" : "sqshrun", 2*w, u32_sat(i64_1/16));
check(arm32 ? "vqshrn.u16" : "uqshrn", 8*w, u8(min(u16_1/16, max_u8)));
check(arm32 ? "vqshrn.u32" : "uqshrn", 4*w, u16(min(u32_1/16, max_u16)));
check(arm32 ? "vqshrn.u64" : "uqshrn", 2*w, u32(min(u64_1/16, max_u32)));
check(arm32 ? "vqsub.s8" : "sqsub", 8*w, i8_sat(i16(i8_1) - i16(i8_2)));
check(arm32 ? "vqsub.s16" : "sqsub", 4*w, i16_sat(i32(i16_1) - i32(i16_2)));
check(arm32 ? "vqsub.s32" : "sqsub", 2*w, i32_sat(i64(i32_1) - i64(i32_2)));
check(arm32 ? "vqsub.u8" : "uqsub", 8*w, u8_sat(i16(u8_1) - i16(u8_2)));
check(arm32 ? "vqsub.u16" : "uqsub", 4*w, u16_sat(i32(u16_1) - i32(u16_2)));
check(arm32 ? "vqsub.u32" : "uqsub", 2*w, u32_sat(i64(u32_1) - i64(u32_2)));
#if 0
check("vraddhn.i16", 8, i8((i16_1 + i16_2 + 128)/256));
check("vraddhn.i16", 8, u8((u16_1 + u16_2 + 128)/256));
check("vraddhn.i32", 4, i16((i32_1 + i32_2 + 32768)/65536));
check("vraddhn.i32", 4, u16((u32_1 + u32_2 + 32768)/65536));
#endif
check(arm32 ? "vrecpe.f32" : "frecpe", 2*w, fast_inverse(f32_1));
check(arm32 ? "vrecps.f32" : "frecps", 2*w, fast_inverse(f32_1));
check(arm32 ? "vrhadd.s8" : "srhadd", 8*w, i8((i16(i8_1 ) + i16(i8_2 ) + 1)/2));
check(arm32 ? "vrhadd.u8" : "urhadd", 8*w, u8((u16(u8_1 ) + u16(u8_2 ) + 1)/2));
check(arm32 ? "vrhadd.s16" : "srhadd", 4*w, i16((i32(i16_1) + i32(i16_2) + 1)/2));
check(arm32 ? "vrhadd.u16" : "urhadd", 4*w, u16((u32(u16_1) + u32(u16_2) + 1)/2));
check(arm32 ? "vrhadd.s32" : "srhadd", 2*w, i32((i64(i32_1) + i64(i32_2) + 1)/2));
check(arm32 ? "vrhadd.u32" : "urhadd", 2*w, u32((u64(u32_1) + u64(u32_2) + 1)/2));
check(arm32 ? "vrsqrte.f32" : "frsqrte", 4*w, fast_inverse_sqrt(f32_1));
check(arm32 ? "vrsqrts.f32" : "frsqrts", 4*w, fast_inverse_sqrt(f32_1));
check(arm32 ? "vshl.i64" : "shl", 2*w, i64_1*16);
check(arm32 ? "vshl.i8" : "shl", 8*w, i8_1*16);
check(arm32 ? "vshl.i16" : "shl", 4*w, i16_1*16);
check(arm32 ? "vshl.i32" : "shl", 2*w, i32_1*16);
check(arm32 ? "vshl.i64" : "shl", 2*w, u64_1*16);
check(arm32 ? "vshl.i8" : "shl", 8*w, u8_1*16);
check(arm32 ? "vshl.i16" : "shl", 4*w, u16_1*16);
check(arm32 ? "vshl.i32" : "shl", 2*w, u32_1*16);
check(arm32 ? "vshll.s8" : "sshll", 8*w, i16(i8_1)*16);
check(arm32 ? "vshll.s16" : "sshll", 4*w, i32(i16_1)*16);
check(arm32 ? "vshll.s32" : "sshll", 2*w, i64(i32_1)*16);
check(arm32 ? "vshll.u8" : "ushll", 8*w, u16(u8_1)*16);
check(arm32 ? "vshll.u16" : "ushll", 4*w, u32(u16_1)*16);
check(arm32 ? "vshll.u32" : "ushll", 2*w, u64(u32_1)*16);
check(arm32 ? "vshr.s64" : "sshr", 2*w, i64_1/16);
check(arm32 ? "vshr.s8" : "sshr", 8*w, i8_1/16);
check(arm32 ? "vshr.s16" : "sshr", 4*w, i16_1/16);
check(arm32 ? "vshr.s32" : "sshr", 2*w, i32_1/16);
check(arm32 ? "vshr.u64" : "ushr", 2*w, u64_1/16);
check(arm32 ? "vshr.u8" : "ushr", 8*w, u8_1/16);
check(arm32 ? "vshr.u16" : "ushr", 4*w, u16_1/16);
check(arm32 ? "vshr.u32" : "ushr", 2*w, u32_1/16);
check(arm32 ? "vshrn.i16" : "shrn", 8*w, i8(i16_1/256));
check(arm32 ? "vshrn.i32" : "shrn", 4*w, i16(i32_1/65536));
check(arm32 ? "vshrn.i16" : "shrn", 8*w, u8(u16_1/256));
check(arm32 ? "vshrn.i32" : "shrn", 4*w, u16(u32_1/65536));
check(arm32 ? "vshrn.i16" : "shrn", 8*w, i8(i16_1/16));
check(arm32 ? "vshrn.i32" : "shrn", 4*w, i16(i32_1/16));
check(arm32 ? "vshrn.i16" : "shrn", 8*w, u8(u16_1/16));
check(arm32 ? "vshrn.i32" : "shrn", 4*w, u16(u32_1/16));
check(arm32 ? "vsqrt.f32" : "fsqrt", 4*w, sqrt(f32_1));
check(arm32 ? "vsqrt.f64" : "fsqrt", 2*w, sqrt(f64_1));
check(arm32 ? "vsra.s64" : "ssra", 2*w, i64_2 + i64_1/16);
check(arm32 ? "vsra.s8" : "ssra", 8*w, i8_2 + i8_1/16);
check(arm32 ? "vsra.s16" : "ssra", 4*w, i16_2 + i16_1/16);
check(arm32 ? "vsra.s32" : "ssra", 2*w, i32_2 + i32_1/16);
check(arm32 ? "vsra.u64" : "usra", 2*w, u64_2 + u64_1/16);
check(arm32 ? "vsra.u8" : "usra", 8*w, u8_2 + u8_1/16);
check(arm32 ? "vsra.u16" : "usra", 4*w, u16_2 + u16_1/16);
check(arm32 ? "vsra.u32" : "usra", 2*w, u32_2 + u32_1/16);
check(arm32 ? "vsub.i64" : "sub", 2*w, i64_1 - i64_2);
check(arm32 ? "vsub.i64" : "sub", 2*w, u64_1 - u64_2);
check(arm32 ? "vsub.f32" : "fsub", 4*w, f32_1 - f32_2);
check(arm32 ? "vsub.i8" : "sub", 8*w, i8_1 - i8_2);
check(arm32 ? "vsub.i8" : "sub", 8*w, u8_1 - u8_2);
check(arm32 ? "vsub.i16" : "sub", 4*w, i16_1 - i16_2);
check(arm32 ? "vsub.i16" : "sub", 4*w, u16_1 - u16_2);
check(arm32 ? "vsub.i32" : "sub", 2*w, i32_1 - i32_2);
check(arm32 ? "vsub.i32" : "sub", 2*w, u32_1 - u32_2);
check(arm32 ? "vsub.f32" : "fsub", 2*w, f32_1 - f32_2);
check(arm32 ? "vsubhn.i16" : "subhn", 8*w, i8((i16_1 - i16_2)/256));
check(arm32 ? "vsubhn.i16" : "subhn", 8*w, u8((u16_1 - u16_2)/256));
check(arm32 ? "vsubhn.i32" : "subhn", 4*w, i16((i32_1 - i32_2)/65536));
check(arm32 ? "vsubhn.i32" : "subhn", 4*w, u16((u32_1 - u32_2)/65536));
check(arm32 ? "vsubl.s8" : "ssubl", 8*w, i16(i8_1) - i16(i8_2));
check(arm32 ? "vsubl.u8" : "usubl", 8*w, u16(u8_1) - u16(u8_2));
check(arm32 ? "vsubl.s16" : "ssubl", 4*w, i32(i16_1) - i32(i16_2));
check(arm32 ? "vsubl.u16" : "usubl", 4*w, u32(u16_1) - u32(u16_2));
check(arm32 ? "vsubl.s32" : "ssubl", 2*w, i64(i32_1) - i64(i32_2));
check(arm32 ? "vsubl.u32" : "usubl", 2*w, u64(u32_1) - u64(u32_2));
check(arm32 ? "vsubw.s8" : "ssubw", 8*w, i16_1 - i8_1);
check(arm32 ? "vsubw.u8" : "usubw", 8*w, u16_1 - u8_1);
check(arm32 ? "vsubw.s16" : "ssubw", 4*w, i32_1 - i16_1);
check(arm32 ? "vsubw.u16" : "usubw", 4*w, u32_1 - u16_1);
check(arm32 ? "vsubw.s32" : "ssubw", 2*w, i64_1 - i32_1);
check(arm32 ? "vsubw.u32" : "usubw", 2*w, u64_1 - u32_1);
check(arm32 ? "vst1.8" : "st", 8*w, i8_1);
}
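// Interleaving stores: build 2-, 3-, and 4-way interleaves at several bit
// widths and expect them to lower to vst2/vst3/vst4 (st2/st3/st4 on
// AArch64).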
for (int sign = 0; sign <= 1; sign++) {
for (int width = 128; width <= 128*4; width *= 2) {
for (int bits = 8; bits < 64; bits *= 2) {
if (width <= bits*2) continue;
Func tmp1, tmp2;
tmp1(x) = cast(sign ? Int(bits) : UInt(bits), x);
tmp1.compute_root();
tmp2(x, y) = select(x%2 == 0, tmp1(x/2), tmp1(x/2 + 16));
tmp2.compute_root().vectorize(x, width/bits);
string op = "vst2." + std::to_string(bits);
check(arm32 ? op : string("st2"), width/bits, tmp2(0, 0) + tmp2(0, 63));
}
}
}
for (int sign = 0; sign <= 1; sign++) {
for (int width = 128; width <= 128*4; width *= 2) {
for (int bits = 8; bits < 64; bits *= 2) {
if (width <= bits*2) continue;
Func tmp1, tmp2;
tmp1(x) = cast(sign ? Int(bits) : UInt(bits), x);
tmp1.compute_root();
Expr e = (tmp1(x/2)*2 + 7)/4;
tmp2(x, y) = select(x%2 == 0, e*3, e + 17);
tmp2.compute_root().vectorize(x, width/bits);
string op = "vst2." + std::to_string(bits);
check(arm32 ? op : string("st2"), width/bits, tmp2(0, 0) + tmp2(0, 127));
}
}
}
for (int sign = 0; sign <= 1; sign++) {
for (int width = 192; width <= 192*4; width *= 2) {
for (int bits = 8; bits < 64; bits *= 2) {
if (width <= bits*3) continue;
Func tmp1, tmp2;
tmp1(x) = cast(sign ? Int(bits) : UInt(bits), x);
tmp1.compute_root();
tmp2(x, y) = select(x%3 == 0, tmp1(x/3),
x%3 == 1, tmp1(x/3 + 16),
tmp1(x/3 + 32));
tmp2.compute_root().vectorize(x, width/bits);
string op = "vst3." + std::to_string(bits);
check(arm32 ? op : string("st3"), width/bits, tmp2(0, 0) + tmp2(0, 127));
}
}
}
for (int sign = 0; sign <= 1; sign++) {
for (int width = 256; width <= 256*4; width *= 2) {
for (int bits = 8; bits < 64; bits *= 2) {
if (width <= bits*4) continue;
Func tmp1, tmp2;
tmp1(x) = cast(sign ? Int(bits) : UInt(bits), x);
tmp1.compute_root();
tmp2(x, y) = select(x%4 == 0, tmp1(x/4),
x%4 == 1, tmp1(x/4 + 16),
x%4 == 2, tmp1(x/4 + 32),
tmp1(x/4 + 48));
tmp2.compute_root().vectorize(x, width/bits);
string op = "vst4." + std::to_string(bits);
check(arm32 ? op : string("st4"), width/bits, tmp2(0, 0) + tmp2(0, 127));
}
}
}
}
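// Hexagon HVX checks. hvx_width is the vector size in bytes (64 or 128
// depending on the target feature), so hvx_width/1 is the lane count for
// 8-bit types, hvx_width/2 for 16-bit types, and so on.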
void check_hvx_all() {
Expr f32_1 = in_f32(x), f32_2 = in_f32(x+16), f32_3 = in_f32(x+32);
Expr f64_1 = in_f64(x), f64_2 = in_f64(x+16), f64_3 = in_f64(x+32);
Expr i8_1 = in_i8(x), i8_2 = in_i8(x+16), i8_3 = in_i8(x+32), i8_4 = in_i8(x + 48);
Expr u8_1 = in_u8(x), u8_2 = in_u8(x+16), u8_3 = in_u8(x+32), u8_4 = in_u8(x + 48);
Expr u8_even = in_u8(2*x), u8_odd = in_u8(2*x+1);
Expr i16_1 = in_i16(x), i16_2 = in_i16(x+16), i16_3 = in_i16(x+32);
Expr u16_1 = in_u16(x), u16_2 = in_u16(x+16), u16_3 = in_u16(x+32);
Expr i32_1 = in_i32(x), i32_2 = in_i32(x+16), i32_3 = in_i32(x+32);
Expr u32_1 = in_u32(x), u32_2 = in_u32(x+16), u32_3 = in_u32(x+32);
Expr i64_1 = in_i64(x), i64_2 = in_i64(x+16), i64_3 = in_i64(x+32);
Expr u64_1 = in_u64(x), u64_2 = in_u64(x+16), u64_3 = in_u64(x+32);
Expr bool_1 = (f32_1 > 0.3f), bool_2 = (f32_1 < -0.3f), bool_3 = (f32_1 != -0.34f);
int hvx_width = 0;
if (target.has_feature(Target::HVX_64)) {
hvx_width = 64;
} else if (target.has_feature(Target::HVX_128)) {
hvx_width = 128;
}
bool is_v62 = target.has_feature(Target::HVX_v62);
check("valign(v*,v*,#7)", hvx_width/1, in_u8(x + 7));
check("vlalign(v*,v*,#7)", hvx_width/1, in_u8(x + hvx_width - 7));
check("valign(v*,v*,r*)", hvx_width/1, in_u8(x + 8));
check("valign(v*,v*,r*)", hvx_width/1, in_u8(x + hvx_width - 8));
check("valign(v*,v*,#6)", hvx_width/1, in_u16(x + 3));
check("vlalign(v*,v*,#6)", hvx_width/1, in_u16(x + hvx_width - 3));
check("valign(v*,v*,r*)", hvx_width/1, in_u16(x + 4));
check("valign(v*,v*,r*)", hvx_width/1, in_u16(x + hvx_width - 4));
check("vunpack(v*.ub)", hvx_width/1, u16(u8_1));
check("vunpack(v*.ub)", hvx_width/1, i16(u8_1));
check("vunpack(v*.uh)", hvx_width/2, u32(u16_1));
check("vunpack(v*.uh)", hvx_width/2, i32(u16_1));
check("vunpack(v*.b)", hvx_width/1, u16(i8_1));
check("vunpack(v*.b)", hvx_width/1, i16(i8_1));
check("vunpack(v*.h)", hvx_width/2, u32(i16_1));
check("vunpack(v*.h)", hvx_width/2, i32(i16_1));
check("vunpack(v*.ub)", hvx_width/1, u32(u8_1));
check("vunpack(v*.ub)", hvx_width/1, i32(u8_1));
check("vunpack(v*.b)", hvx_width/1, u32(i8_1));
check("vunpack(v*.b)", hvx_width/1, i32(i8_1));
#if 0
check("vzxt(v*.ub)", hvx_width/1, u16(u8_1));
check("vzxt(v*.ub)", hvx_width/1, i16(u8_1));
check("vzxt(v*.uh)", hvx_width/2, u32(u16_1));
check("vzxt(v*.uh)", hvx_width/2, i32(u16_1));
check("vsxt(v*.b)", hvx_width/1, u16(i8_1));
check("vsxt(v*.b)", hvx_width/1, i16(i8_1));
check("vsxt(v*.h)", hvx_width/2, u32(i16_1));
check("vsxt(v*.h)", hvx_width/2, i32(i16_1));
check("vzxt(v*.ub)", hvx_width/1, u32(u8_1));
check("vzxt(v*.ub)", hvx_width/1, i32(u8_1));
check("vsxt(v*.b)", hvx_width/1, u32(i8_1));
check("vsxt(v*.b)", hvx_width/1, i32(i8_1));
#endif
check("vadd(v*.b,v*.b)", hvx_width/1, u8_1 + u8_2);
check("vadd(v*.h,v*.h)", hvx_width/2, u16_1 + u16_2);
check("vadd(v*.w,v*.w)", hvx_width/4, u32_1 + u32_2);
check("vadd(v*.b,v*.b)", hvx_width/1, i8_1 + i8_2);
check("vadd(v*.h,v*.h)", hvx_width/2, i16_1 + i16_2);
check("vadd(v*.w,v*.w)", hvx_width/4, i32_1 + i32_2);
check("v*.h = vadd(v*.ub,v*.ub)", hvx_width/1, u16(u8_1) + u16(u8_2));
check("v*.w = vadd(v*.uh,v*.uh)", hvx_width/2, u32(u16_1) + u32(u16_2));
check("v*.w = vadd(v*.h,v*.h)", hvx_width/2, i32(i16_1) + i32(i16_2));
check("vadd(v*.ub,v*.ub):sat", hvx_width/1, u8_sat(u16(u8_1) + u16(u8_2)));
check("vadd(v*.uh,v*.uh):sat", hvx_width/2, u16_sat(u32(u16_1) + u32(u16_2)));
check("vadd(v*.h,v*.h):sat", hvx_width/2, i16_sat(i32(i16_1) + i32(i16_2)));
check("vadd(v*.w,v*.w):sat", hvx_width/4, i32_sat(i64(i32_1) + i64(i32_2)));
if (is_v62) {
check("vadd(v*.uw,v*.uw):sat", hvx_width/4, u32_sat(u64(u32_1) + u64(u32_2)));
}
check("vsub(v*.b,v*.b)", hvx_width/1, u8_1 - u8_2);
check("vsub(v*.h,v*.h)", hvx_width/2, u16_1 - u16_2);
check("vsub(v*.w,v*.w)", hvx_width/4, u32_1 - u32_2);
check("vsub(v*.b,v*.b)", hvx_width/1, i8_1 - i8_2);
check("vsub(v*.h,v*.h)", hvx_width/2, i16_1 - i16_2);
check("vsub(v*.w,v*.w)", hvx_width/4, i32_1 - i32_2);
check("v*.h = vsub(v*.ub,v*.ub)", hvx_width/1, u16(u8_1) - u16(u8_2));
check("v*.w = vsub(v*.uh,v*.uh)", hvx_width/2, u32(u16_1) - u32(u16_2));
check("v*.w = vsub(v*.h,v*.h)", hvx_width/2, i32(i16_1) - i32(i16_2));
check("vsub(v*.ub,v*.ub):sat", hvx_width/1, u8_sat(i16(u8_1) - i16(u8_2)));
check("vsub(v*.uh,v*.uh):sat", hvx_width/2, u16_sat(i32(u16_1) - i32(u16_2)));
check("vsub(v*.h,v*.h):sat", hvx_width/2, i16_sat(i32(i16_1) - i32(i16_2)));
check("vsub(v*.w,v*.w):sat", hvx_width/4, i32_sat(i64(i32_1) - i64(i32_2)));
check("vadd(v*:*.b,v*:*.b)", hvx_width*2, u8_1 + u8_2);
check("vadd(v*:*.h,v*:*.h)", hvx_width/1, u16_1 + u16_2);
check("vadd(v*:*.w,v*:*.w)", hvx_width/2, u32_1 + u32_2);
check("vadd(v*:*.b,v*:*.b)", hvx_width*2, i8_1 + i8_2);
check("vadd(v*:*.h,v*:*.h)", hvx_width/1, i16_1 + i16_2);
check("vadd(v*:*.w,v*:*.w)", hvx_width/2, i32_1 + i32_2);
check("vadd(v*:*.ub,v*:*.ub):sat", hvx_width*2, u8_sat(u16(u8_1) + u16(u8_2)));
check("vadd(v*:*.uh,v*:*.uh):sat", hvx_width/1, u16_sat(u32(u16_1) + u32(u16_2)));
check("vadd(v*:*.h,v*:*.h):sat", hvx_width/1, i16_sat(i32(i16_1) + i32(i16_2)));
check("vadd(v*:*.w,v*:*.w):sat", hvx_width/2, i32_sat(i64(i32_1) + i64(i32_2)));
if (is_v62) {
check("vadd(v*:*.uw,v*:*.uw):sat", hvx_width/2, u32_sat(u64(u32_1) + u64(u32_2)));
}
check("vsub(v*:*.b,v*:*.b)", hvx_width*2, u8_1 - u8_2);
check("vsub(v*:*.h,v*:*.h)", hvx_width/1, u16_1 - u16_2);
check("vsub(v*:*.w,v*:*.w)", hvx_width/2, u32_1 - u32_2);
check("vsub(v*:*.b,v*:*.b)", hvx_width*2, i8_1 - i8_2);
check("vsub(v*:*.h,v*:*.h)", hvx_width/1, i16_1 - i16_2);
check("vsub(v*:*.w,v*:*.w)", hvx_width/2, i32_1 - i32_2);
check("vsub(v*:*.ub,v*:*.ub):sat", hvx_width*2, u8_sat(i16(u8_1) - i16(u8_2)));
check("vsub(v*:*.uh,v*:*.uh):sat", hvx_width/1, u16_sat(i32(u16_1) - i32(u16_2)));
check("vsub(v*:*.h,v*:*.h):sat", hvx_width/1, i16_sat(i32(i16_1) - i32(i16_2)));
check("vsub(v*:*.w,v*:*.w):sat", hvx_width/2, i32_sat(i64(i32_1) - i64(i32_2)));
check("vavg(v*.ub,v*.ub)", hvx_width/1, u8((u16(u8_1) + u16(u8_2))/2));
check("vavg(v*.ub,v*.ub):rnd", hvx_width/1, u8((u16(u8_1) + u16(u8_2) + 1)/2));
check("vavg(v*.uh,v*.uh)", hvx_width/2, u16((u32(u16_1) + u32(u16_2))/2));
check("vavg(v*.uh,v*.uh):rnd", hvx_width/2, u16((u32(u16_1) + u32(u16_2) + 1)/2));
check("vavg(v*.h,v*.h)", hvx_width/2, i16((i32(i16_1) + i32(i16_2))/2));
check("vavg(v*.h,v*.h):rnd", hvx_width/2, i16((i32(i16_1) + i32(i16_2) + 1)/2));
check("vavg(v*.w,v*.w)", hvx_width/4, i32((i64(i32_1) + i64(i32_2))/2));
check("vavg(v*.w,v*.w):rnd", hvx_width/4, i32((i64(i32_1) + i64(i32_2) + 1)/2));
check("vnavg(v*.ub,v*.ub)", hvx_width/1, i8_sat((i16(u8_1) - i16(u8_2))/2));
check("vnavg(v*.h,v*.h)", hvx_width/2, i16_sat((i32(i16_1) - i32(i16_2))/2));
check("vnavg(v*.w,v*.w)", hvx_width/4, i32_sat((i64(i32_1) - i64(i32_2))/2));
check("vlsr(v*.h,v*.h)", hvx_width/1, u8_1 >> (u8_2 % 8));
check("vlsr(v*.h,v*.h)", hvx_width/2, u16_1 >> (u16_2 % 16));
check("vlsr(v*.w,v*.w)", hvx_width/4, u32_1 >> (u32_2 % 32));
check("vasr(v*.h,v*.h)", hvx_width/1, i8_1 >> (i8_2 % 8));
check("vasr(v*.h,v*.h)", hvx_width/2, i16_1 >> (i16_2 % 16));
check("vasr(v*.w,v*.w)", hvx_width/4, i32_1 >> (i32_2 % 32));
check("vasr(v*.h,v*.h,r*):sat", hvx_width/1, u8_sat(i16_1 >> 4));
check("vasr(v*.w,v*.w,r*):sat", hvx_width/2, u16_sat(i32_1 >> 8));
check("vasr(v*.w,v*.w,r*):sat", hvx_width/2, i16_sat(i32_1 >> 8));
check("vasr(v*.w,v*.w,r*)", hvx_width/2, i16(i32_1 >> 8));
check("vasl(v*.h,v*.h)", hvx_width/1, u8_1 << (u8_2 % 8));
check("vasl(v*.h,v*.h)", hvx_width/2, u16_1 << (u16_2 % 16));
check("vasl(v*.w,v*.w)", hvx_width/4, u32_1 << (u32_2 % 32));
check("vasl(v*.h,v*.h)", hvx_width/1, i8_1 << (i8_2 % 8));
check("vasl(v*.h,v*.h)", hvx_width/2, i16_1 << (i16_2 % 16));
check("vasl(v*.w,v*.w)", hvx_width/4, i32_1 << (i32_2 % 32));
check("vlsr(v*.uh,r*)", hvx_width/1, u8_1 >> (u8(y) % 8));
check("vlsr(v*.uh,r*)", hvx_width/2, u16_1 >> (u16(y) % 16));
check("vlsr(v*.uw,r*)", hvx_width/4, u32_1 >> (u32(y) % 32));
check("vasr(v*.h,r*)", hvx_width/1, i8_1 >> (i8(y) % 8));
check("vasr(v*.h,r*)", hvx_width/2, i16_1 >> (i16(y) % 16));
check("vasr(v*.w,r*)", hvx_width/4, i32_1 >> (i32(y) % 32));
check("vasl(v*.h,r*)", hvx_width/1, u8_1 << (u8(y) % 8));
check("vasl(v*.h,r*)", hvx_width/2, u16_1 << (u16(y) % 16));
check("vasl(v*.w,r*)", hvx_width/4, u32_1 << (u32(y) % 32));
check("vasl(v*.h,r*)", hvx_width/1, i8_1 << (i8(y) % 8));
check("vasl(v*.h,r*)", hvx_width/2, i16_1 << (i16(y) % 16));
check("vasl(v*.w,r*)", hvx_width/4, i32_1 << (i32(y) % 32));
check("vpacke(v*.h,v*.h)", hvx_width/1, u8(u16_1));
check("vpacke(v*.h,v*.h)", hvx_width/1, u8(i16_1));
check("vpacke(v*.h,v*.h)", hvx_width/1, i8(u16_1));
check("vpacke(v*.h,v*.h)", hvx_width/1, i8(i16_1));
check("vpacke(v*.w,v*.w)", hvx_width/2, u16(u32_1));
check("vpacke(v*.w,v*.w)", hvx_width/2, u16(i32_1));
check("vpacke(v*.w,v*.w)", hvx_width/2, i16(u32_1));
check("vpacke(v*.w,v*.w)", hvx_width/2, i16(i32_1));
check("vpacko(v*.h,v*.h)", hvx_width/1, u8(u16_1 >> 8));
check("vpacko(v*.h,v*.h)", hvx_width/1, u8(i16_1 >> 8));
check("vpacko(v*.h,v*.h)", hvx_width/1, i8(u16_1 >> 8));
check("vpacko(v*.h,v*.h)", hvx_width/1, i8(i16_1 >> 8));
check("vpacko(v*.w,v*.w)", hvx_width/2, u16(u32_1 >> 16));
check("vpacko(v*.w,v*.w)", hvx_width/2, u16(i32_1 >> 16));
check("vpacko(v*.w,v*.w)", hvx_width/2, i16(u32_1 >> 16));
check("vpacko(v*.w,v*.w)", hvx_width/2, i16(i32_1 >> 16));
check("vshuffe(v*.b,v*.b)", hvx_width/1, u8(u16(u8_1) * 127));
check("vshuffe(v*.b,v*.b)", hvx_width/1, u8(i16(i8_1) * 63));
check("vshuffe(v*.b,v*.b)", hvx_width/1, i8(u16(u8_1) * 127));
check("vshuffe(v*.b,v*.b)", hvx_width/1, i8(i16(i8_1) * 63));
check("vshuffe(v*.h,v*.h)", hvx_width/2, u16(u32(u16_1) * 32767));
check("vshuffe(v*.h,v*.h)", hvx_width/2, u16(i32(i16_1) * 16383));
check("vshuffe(v*.h,v*.h)", hvx_width/2, i16(u32(u16_1) * 32767));
check("vshuffe(v*.h,v*.h)", hvx_width/2, i16(i32(i16_1) * 16383));
check("vshuffo(v*.b,v*.b)", hvx_width/1, u8((u16(u8_1) * 127) >> 8));
check("vshuffo(v*.b,v*.b)", hvx_width/1, u8((i16(i8_1) * 63) >> 8));
check("vshuffo(v*.b,v*.b)", hvx_width/1, i8((u16(u8_1) * 127) >> 8));
check("vshuffo(v*.b,v*.b)", hvx_width/1, i8((i16(i8_1) * 63) >> 8));
check("vshuffo(v*.h,v*.h)", hvx_width/2, u16((u32(u16_1) * 32767) >> 16));
check("vshuffo(v*.h,v*.h)", hvx_width/2, u16((i32(i16_1) * 16383) >> 16));
check("vshuffo(v*.h,v*.h)", hvx_width/2, i16((u32(u16_1) * 32767) >> 16));
check("vshuffo(v*.h,v*.h)", hvx_width/2, i16((i32(i16_1) * 16383) >> 16));
check("vpacke(v*.h,v*.h)", hvx_width/1, in_u8(2*x));
check("vpacke(v*.w,v*.w)", hvx_width/2, in_u16(2*x));
check("vdeal(v*,v*,r*)", hvx_width/4, in_u32(2*x));
check("vpacko(v*.h,v*.h)", hvx_width/1, in_u8(2*x + 1));
check("vpacko(v*.w,v*.w)", hvx_width/2, in_u16(2*x + 1));
check("vdeal(v*,v*,r*)", hvx_width/4, in_u32(2*x + 1));
check("vlut32(v*.b,v*.b,r*)", hvx_width/1, in_u8(3*x/2));
check("vlut16(v*.b,v*.h,r*)", hvx_width/2, in_u16(3*x/2));
check("vlut32(v*.b,v*.b,r*)", hvx_width/1, in_u8(u8_1));
check("vlut32(v*.b,v*.b,r*)", hvx_width/1, in_u8(clamp(u16_1, 0, 63)));
check("vlut16(v*.b,v*.h,r*)", hvx_width/2, in_u16(u8_1));
check("vlut16(v*.b,v*.h,r*)", hvx_width/2, in_u16(clamp(u16_1, 0, 15)));
check("v*.ub = vpack(v*.h,v*.h):sat", hvx_width/1, u8_sat(i16_1));
check("v*.b = vpack(v*.h,v*.h):sat", hvx_width/1, i8_sat(i16_1));
check("v*.uh = vpack(v*.w,v*.w):sat", hvx_width/2, u16_sat(i32_1));
check("v*.h = vpack(v*.w,v*.w):sat", hvx_width/2, i16_sat(i32_1));
check("v*.ub = vsat(v*.h,v*.h)", hvx_width/1, u8_sat(i16(i8_1) << 8));
check("v*.uh = vasr(v*.w,v*.w,r*):sat", hvx_width/2, u16_sat(i32(i16_1) << 16));
check("v*.h = vasr(v*.w,v*.w,r*):sat", hvx_width/2, u8_sat(i32(i16_1) >> 4));
check("v*.h = vsat(v*.w,v*.w)", hvx_width/2, i16_sat(i32(i16_1) << 16));
check("v*.ub = vpack(v*.h,v*.h):sat", hvx_width/1, u8_sat(i32_1));
check("v*.b = vpack(v*.h,v*.h):sat", hvx_width/1, i8_sat(i32_1));
check("v*.h = vsat(v*.w,v*.w)", hvx_width/1, u8_sat(i32(i16_1) << 8));
if (is_v62) {
check("v*.uh = vsat(v*.uw, v*.uw)", hvx_width/2, u16_sat(u32_1));
}
check("vround(v*.h,v*.h)", hvx_width/1, u8_sat((i32(i16_1) + 128)/256));
check("vround(v*.h,v*.h)", hvx_width/1, i8_sat((i32(i16_1) + 128)/256));
check("vround(v*.w,v*.w)", hvx_width/2, u16_sat((i64(i32_1) + 32768)/65536));
check("vround(v*.w,v*.w)", hvx_width/2, i16_sat((i64(i32_1) + 32768)/65536));
check("vshuff(v*,v*,r*)", hvx_width*2, select((x%2) == 0, in_u8(x/2), in_u8((x+16)/2)));
check("vshuff(v*,v*,r*)", hvx_width*2, select((x%2) == 0, in_i8(x/2), in_i8((x+16)/2)));
check("vshuff(v*,v*,r*)", (hvx_width*2)/2, select((x%2) == 0, in_u16(x/2), in_u16((x+16)/2)));
check("vshuff(v*,v*,r*)", (hvx_width*2)/2, select((x%2) == 0, in_i16(x/2), in_i16((x+16)/2)));
check("vshuff(v*,v*,r*)", (hvx_width*2)/4, select((x%2) == 0, in_u32(x/2), in_u32((x+16)/2)));
check("vshuff(v*,v*,r*)", (hvx_width*2)/4, select((x%2) == 0, in_i32(x/2), in_i32((x+16)/2)));
check("vshuff(v*,v*,r*)", hvx_width*2, select((x%2) == 0, u8(x/2), u8(x/2)));
check("vshuff(v*,v*,r*)", hvx_width*2, select((x%2) == 0, i8(x/2), i8(x/2)));
check("vshuff(v*,v*,r*)", (hvx_width*2)/2, select((x%2) == 0, u16(x/2), u16(x/2)));
check("vshuff(v*,v*,r*)", (hvx_width*2)/2, select((x%2) == 0, i16(x/2), i16(x/2)));
check("vshuff(v*,v*,r*)", (hvx_width*2)/4, select((x%2) == 0, u32(x/2), u32(x/2)));
check("vshuff(v*,v*,r*)", (hvx_width*2)/4, select((x%2) == 0, i32(x/2), i32(x/2)));
check("vmax(v*.ub,v*.ub)", hvx_width/1, max(u8_1, u8_2));
check("vmax(v*.uh,v*.uh)", hvx_width/2, max(u16_1, u16_2));
check("vmax(v*.h,v*.h)", hvx_width/2, max(i16_1, i16_2));
check("vmax(v*.w,v*.w)", hvx_width/4, max(i32_1, i32_2));
check("vmin(v*.ub,v*.ub)", hvx_width/1, min(u8_1, u8_2));
check("vmin(v*.uh,v*.uh)", hvx_width/2, min(u16_1, u16_2));
check("vmin(v*.h,v*.h)", hvx_width/2, min(i16_1, i16_2));
check("vmin(v*.w,v*.w)", hvx_width/4, min(i32_1, i32_2));
check("vcmp.gt(v*.b,v*.b)", hvx_width/1, select(i8_1 < i8_2, i8_1, i8_2));
check("vcmp.gt(v*.ub,v*.ub)", hvx_width/1, select(u8_1 < u8_2, u8_1, u8_2));
check("vcmp.gt(v*.h,v*.h)", hvx_width/2, select(i16_1 < i16_2, i16_1, i16_2));
check("vcmp.gt(v*.uh,v*.uh)", hvx_width/2, select(u16_1 < u16_2, u16_1, u16_2));
check("vcmp.gt(v*.w,v*.w)", hvx_width/4, select(i32_1 < i32_2, i32_1, i32_2));
check("vcmp.gt(v*.uw,v*.uw)", hvx_width/4, select(u32_1 < u32_2, u32_1, u32_2));
check("vcmp.gt(v*.b,v*.b)", hvx_width/1, select(i8_1 > i8_2, i8_1, i8_2));
check("vcmp.gt(v*.ub,v*.ub)", hvx_width/1, select(u8_1 > u8_2, u8_1, u8_2));
check("vcmp.gt(v*.h,v*.h)", hvx_width/2, select(i16_1 > i16_2, i16_1, i16_2));
check("vcmp.gt(v*.uh,v*.uh)", hvx_width/2, select(u16_1 > u16_2, u16_1, u16_2));
check("vcmp.gt(v*.w,v*.w)", hvx_width/4, select(i32_1 > i32_2, i32_1, i32_2));
check("vcmp.gt(v*.uw,v*.uw)", hvx_width/4, select(u32_1 > u32_2, u32_1, u32_2));
check("vcmp.gt(v*.b,v*.b)", hvx_width/1, select(i8_1 <= i8_2, i8_1, i8_2));
check("vcmp.gt(v*.ub,v*.ub)", hvx_width/1, select(u8_1 <= u8_2, u8_1, u8_2));
check("vcmp.gt(v*.h,v*.h)", hvx_width/2, select(i16_1 <= i16_2, i16_1, i16_2));
check("vcmp.gt(v*.uh,v*.uh)", hvx_width/2, select(u16_1 <= u16_2, u16_1, u16_2));
check("vcmp.gt(v*.w,v*.w)", hvx_width/4, select(i32_1 <= i32_2, i32_1, i32_2));
check("vcmp.gt(v*.uw,v*.uw)", hvx_width/4, select(u32_1 <= u32_2, u32_1, u32_2));
check("vcmp.gt(v*.b,v*.b)", hvx_width/1, select(i8_1 >= i8_2, i8_1, i8_2));
check("vcmp.gt(v*.ub,v*.ub)", hvx_width/1, select(u8_1 >= u8_2, u8_1, u8_2));
check("vcmp.gt(v*.h,v*.h)", hvx_width/2, select(i16_1 >= i16_2, i16_1, i16_2));
check("vcmp.gt(v*.uh,v*.uh)", hvx_width/2, select(u16_1 >= u16_2, u16_1, u16_2));
check("vcmp.gt(v*.w,v*.w)", hvx_width/4, select(i32_1 >= i32_2, i32_1, i32_2));
check("vcmp.gt(v*.uw,v*.uw)", hvx_width/4, select(u32_1 >= u32_2, u32_1, u32_2));
check("vcmp.eq(v*.b,v*.b)", hvx_width/1, select(i8_1 == i8_2, i8_1, i8_2));
check("vcmp.eq(v*.b,v*.b)", hvx_width/1, select(u8_1 == u8_2, u8_1, u8_2));
check("vcmp.eq(v*.h,v*.h)", hvx_width/2, select(i16_1 == i16_2, i16_1, i16_2));
check("vcmp.eq(v*.h,v*.h)", hvx_width/2, select(u16_1 == u16_2, u16_1, u16_2));
check("vcmp.eq(v*.w,v*.w)", hvx_width/4, select(i32_1 == i32_2, i32_1, i32_2));
check("vcmp.eq(v*.w,v*.w)", hvx_width/4, select(u32_1 == u32_2, u32_1, u32_2));
check("vcmp.eq(v*.b,v*.b)", hvx_width/1, select(i8_1 != i8_2, i8_1, i8_2));
check("vcmp.eq(v*.b,v*.b)", hvx_width/1, select(u8_1 != u8_2, u8_1, u8_2));
check("vcmp.eq(v*.h,v*.h)", hvx_width/2, select(i16_1 != i16_2, i16_1, i16_2));
check("vcmp.eq(v*.h,v*.h)", hvx_width/2, select(u16_1 != u16_2, u16_1, u16_2));
check("vcmp.eq(v*.w,v*.w)", hvx_width/4, select(i32_1 != i32_2, i32_1, i32_2));
check("vcmp.eq(v*.w,v*.w)", hvx_width/4, select(u32_1 != u32_2, u32_1, u32_2));
check("vabsdiff(v*.ub,v*.ub)", hvx_width/1, absd(u8_1, u8_2));
check("vabsdiff(v*.uh,v*.uh)", hvx_width/2, absd(u16_1, u16_2));
check("vabsdiff(v*.h,v*.h)", hvx_width/2, absd(i16_1, i16_2));
check("vabsdiff(v*.w,v*.w)", hvx_width/4, absd(i32_1, i32_2));
check("vand(v*,v*)", hvx_width/1, u8_1 & u8_2);
check("vand(v*,v*)", hvx_width/2, u16_1 & u16_2);
check("vand(v*,v*)", hvx_width/4, u32_1 & u32_2);
check("vor(v*,v*)", hvx_width/1, u8_1 | u8_2);
check("vor(v*,v*)", hvx_width/2, u16_1 | u16_2);
check("vor(v*,v*)", hvx_width/4, u32_1 | u32_2);
check("vxor(v*,v*)", hvx_width/1, u8_1 ^ u8_2);
check("vxor(v*,v*)", hvx_width/2, u16_1 ^ u16_2);
check("vxor(v*,v*)", hvx_width/4, u32_1 ^ u32_2);
check("vnot(v*)", hvx_width/1, ~u8_1);
check("vnot(v*)", hvx_width/2, ~u16_1);
check("vnot(v*)", hvx_width/4, ~u32_1);
if (is_v62) {
check("v*.b = vsplat(r*)", hvx_width/1, in_u8(0));
check("v*.h = vsplat(r*)", hvx_width/2, in_u16(0));
} else {
check("vsplat(r*)", hvx_width/1, in_u8(0));
check("vsplat(r*)", hvx_width/2, in_u16(0));
}
check("vsplat(r*)", hvx_width/4, in_u32(0));
check("vmux(q*,v*,v*)", hvx_width/1, select(i8_1 == i8_2, i8_1, i8_2));
check("vmux(q*,v*,v*)", hvx_width/2, select(i16_1 == i16_2, i16_1, i16_2));
check("vmux(q*,v*,v*)", hvx_width/4, select(i32_1 == i32_2, i32_1, i32_2));
check("vabs(v*.h)", hvx_width/2, abs(i16_1));
check("vabs(v*.w)", hvx_width/4, abs(i32_1));
check("vmpy(v*.ub,v*.ub)", hvx_width/1, u16(u8_1) * u16(u8_2));
check("vmpy(v*.b,v*.b)", hvx_width/1, i16(i8_1) * i16(i8_2));
check("vmpy(v*.uh,v*.uh)", hvx_width/2, u32(u16_1) * u32(u16_2));
check("vmpy(v*.h,v*.h)", hvx_width/2, i32(i16_1) * i32(i16_2));
check("vmpyi(v*.h,v*.h)", hvx_width/2, i16_1 * i16_2);
check("vmpyio(v*.w,v*.h)", hvx_width/2, i32_1 * i32(i16_1));
check("vmpyie(v*.w,v*.uh)", hvx_width/2, i32_1 * i32(u16_1));
check("vmpy(v*.uh,v*.uh)", hvx_width/2, u32_1 * u32(u16_1));
check("vmpyieo(v*.h,v*.h)", hvx_width/4, i32_1 * i32_2);
check("vmpy(v*.ub,v*.b)", hvx_width/1, i16(u8_1) * i16(i8_2));
check("vmpy(v*.h,v*.uh)", hvx_width/2, i32(u16_1) * i32(i16_2));
check("vmpy(v*.ub,v*.b)", hvx_width/1, i16(i8_1) * i16(u8_2));
check("vmpy(v*.h,v*.uh)", hvx_width/2, i32(i16_1) * i32(u16_2));
check("vmpy(v*.ub,r*.b)", hvx_width/1, i16(u8_1) * 3);
check("vmpy(v*.h,r*.h)", hvx_width/2, i32(i16_1) * 10);
check("vmpy(v*.ub,r*.ub)", hvx_width/1, u16(u8_1) * 3);
check("vmpy(v*.uh,r*.uh)", hvx_width/2, u32(u16_1) * 10);
check("vmpy(v*.ub,r*.b)", hvx_width/1, 3*i16(u8_1));
check("vmpy(v*.h,r*.h)", hvx_width/2, 10*i32(i16_1));
check("vmpy(v*.ub,r*.ub)", hvx_width/1, 3*u16(u8_1));
check("vmpy(v*.uh,r*.uh)", hvx_width/2, 10*u32(u16_1));
check("vmpyi(v*.h,r*.b)", hvx_width/2, i16_1 * 127);
check("vmpyi(v*.h,r*.b)", hvx_width/2, 127 * i16_1);
check("vmpyi(v*.w,r*.h)", hvx_width/4, i32_1 * 32767);
check("vmpyi(v*.w,r*.h)", hvx_width/4, 32767 * i32_1);
check("v*.h += vmpyi(v*.h,v*.h)", hvx_width/2, i16_1 + i16_2*i16_3);
check("v*.h += vmpyi(v*.h,r*.b)", hvx_width/2, i16_1 + i16_2 * 127);
check("v*.w += vmpyi(v*.w,r*.h)", hvx_width/4, i32_1 + i32_2 * 32767);
check("v*.h += vmpyi(v*.h,r*.b)", hvx_width/2, i16_1 + 127 * i16_2);
check("v*.w += vmpyi(v*.w,r*.h)", hvx_width/4, i32_1 + 32767 * i32_2);
check("v*.uh += vmpy(v*.ub,v*.ub)", hvx_width/1, u16_1 + u16(u8_1) * u16(u8_2));
check("v*.uw += vmpy(v*.uh,v*.uh)", hvx_width/2, u32_1 + u32(u16_1) * u32(u16_2));
check("v*.h += vmpy(v*.b,v*.b)", hvx_width/1, i16_1 + i16(i8_1) * i16(i8_2));
check("v*.w += vmpy(v*.h,v*.h)", hvx_width/2, i32_1 + i32(i16_1) * i32(i16_2));
check("v*.h += vmpy(v*.ub,v*.b)", hvx_width/1, i16_1 + i16(u8_1) * i16(i8_2));
check("v*.w += vmpy(v*.h,v*.uh)", hvx_width/2, i32_1 + i32(i16_1) * i32(u16_2));
check("v*.h += vmpy(v*.ub,v*.b)", hvx_width/1, i16_1 + i16(u8_1) * i16(i8_2));
check("v*.w += vmpy(v*.h,v*.uh)", hvx_width/2, i32_1 + i32(i16_1) * i32(u16_2));
check("v*.h += vmpy(v*.ub,v*.b)", hvx_width/1, i16_1 + i16(i8_1) * i16(u8_2));
check("v*.w += vmpy(v*.h,v*.uh)", hvx_width/2, i32_1 + i32(u16_1) * i32(i16_2));
check("v*.h += vmpy(v*.ub,v*.b)", hvx_width/1, i16_1 + i16(i8_1) * i16(u8_2));
check("v*.w += vmpy(v*.h,v*.uh)", hvx_width/2, i32_1 + i32(u16_1) * i32(i16_2));
check("v*.uh += vmpy(v*.ub,r*.ub)", hvx_width/1, u16_1 + u16(u8_1) * 255);
check("v*.h += vmpy(v*.ub,r*.b)", hvx_width/1, i16_1 + i16(u8_1) * 127);
check("v*.uw += vmpy(v*.uh,r*.uh)", hvx_width/2, u32_1 + u32(u16_1) * 65535);
check("v*.uh += vmpy(v*.ub,r*.ub)", hvx_width/1, u16_1 + 255 * u16(u8_1));
check("v*.h += vmpy(v*.ub,r*.b)", hvx_width/1, i16_1 + 127 * i16(u8_1));
check("v*.uw += vmpy(v*.uh,r*.uh)", hvx_width/2, u32_1 + 65535 * u32(u16_1));
check("v*.h += vmpy(v*.ub,r*.b)", hvx_width/1, i16_1 - i16(u8_1) * -127);
check("v*.h += vmpyi(v*.h,r*.b)", hvx_width/2, i16_1 - i16_2 * -127);
check("v*.w += vmpy(v*.h,r*.h)", hvx_width/1, i32_1 + i32(i16_1)*32767);
check("v*.w += vmpy(v*.h,r*.h)", hvx_width/1, i32_1 + 32767*i32(i16_1));
check("vmpy(v*.h,v*.h):<<1:rnd:sat", hvx_width/2, i16_sat((i32(i16_1)*i32(i16_2) + 16384)/32768));
check("vmpy(v*.h,r*.h):<<1:sat", hvx_width/2, i16_sat((i32(i16_1)*32767)/32768));
check("vmpy(v*.h,r*.h):<<1:sat", hvx_width/2, i16_sat((32767*i32(i16_1))/32768));
check("vmpy(v*.h,r*.h):<<1:rnd:sat", hvx_width/2, i16_sat((i32(i16_1)*32767 + 16384)/32768));
check("vmpy(v*.h,r*.h):<<1:rnd:sat", hvx_width/2, i16_sat((32767*i32(i16_1) + 16384)/32768));
check("vmpyo(v*.w,v*.h)", hvx_width/4, i32((i64(i32_1)*i64(i32_2))/(i64(1) << 32)));
check("vmpyo(v*.w,v*.h):<<1:sat", hvx_width/4, i32_sat((i64(i32_1)*i64(i32_2))/(i64(1) << 31)));
check("vmpyo(v*.w,v*.h):<<1:rnd:sat", hvx_width/4, i32_sat((i64(i32_1)*i64(i32_2) + (1 << 30))/(i64(1) << 31)));
check("vmpa(v*.ub,r*.b)", hvx_width/1, i16(u8_1)*127 + i16(u8_2)*-128);
check("vmpa(v*.ub,r*.b)", hvx_width/1, i16(u8_1)*127 + 126*i16(u8_2));
check("vmpa(v*.ub,r*.b)", hvx_width/1, -100*i16(u8_1) + 40*i16(u8_2));
check("v*.h += vmpa(v*.ub,r*.b)", hvx_width/1, 2*i16(u8_1) + 3*i16(u8_2) + i16_1);
check("vmpa(v*.h,r*.b)", hvx_width/2, i32(i16_1)*2 + i32(i16_2)*3);
check("vmpa(v*.h,r*.b)", hvx_width/2, i32(i16_1)*2 + 3*i32(i16_2));
check("vmpa(v*.h,r*.b)", hvx_width/2, 2*i32(i16_1) + 3*i32(i16_2));
check("v*.w += vmpa(v*.h,r*.b)", hvx_width/2, 2*i32(i16_1) + 3*i32(i16_2) + i32_1);
check("vdmpy(v*.ub,r*.b)", hvx_width/2, i16(in_u8(2*x))*127 + i16(in_u8(2*x + 1))*-128);
check("vdmpy(v*.h,r*.b)", hvx_width/4, i32(in_i16(2*x))*2 + i32(in_i16(2*x + 1))*3);
check("v*.h += vdmpy(v*.ub,r*.b)", hvx_width/2, i16(in_u8(2*x))*120 + i16(in_u8(2*x + 1))*-50 + i16_1);
check("v*.w += vdmpy(v*.h,r*.b)", hvx_width/4, i32(in_i16(2*x))*80 + i32(in_i16(2*x + 1))*33 + i32_1);
#if 0
check("vdmpy(v*:*.ub,r*.b)", (hvx_width/2)*2, i16(in_u8(2*x))*2 + i16(in_u8(2*x + 1))*3);
check("vdmpy(v*:*.h,r*.b)", (hvx_width/4)*2, i32(in_i16(2*x))*2 + i32(in_i16(2*x + 1))*3);
check("v*:*.h += vdmpy(v*:*.ub,r*.b)", (hvx_width/2)*2, i16(in_u8(2*x))*2 + i16(in_u8(2*x + 1))*3 + i16_1);
check("v*:*.w += vdmpy(v*:*.h,r*.b)", (hvx_width/4)*2, i32(in_i16(2*x))*2 + i32(in_i16(2*x + 1))*3 + i32_1);
#endif
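// vrmpy: 4-way dot-product reductions against scalar or vector multiplicands.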
check("vrmpy(v*.ub,r*.ub)", hvx_width, u32(u8_1)*255 + u32(u8_2)*254 + u32(u8_3)*253 + u32(u8_4)*252);
check("vrmpy(v*.ub,r*.b)", hvx_width, i32(u8_1)*127 + i32(u8_2)*-128 + i32(u8_3)*126 + i32(u8_4)*-127);
check("v*.uw += vrmpy(v*.ub,r*.ub)", hvx_width, u32_1 + u32(u8_1)*2 + u32(u8_2)*3 + u32(u8_3)*4 + u32(u8_4)*5);
check("v*.w += vrmpy(v*.ub,r*.b)", hvx_width, i32_1 + i32(u8_1)*2 + i32(u8_2)*-3 + i32(u8_3)*-4 + i32(u8_4)*5);
check("vrmpy(v*.ub,r*.b)", hvx_width, i32(u8_1) + i32(u8_2)*-2 + i32(u8_3)*3 + i32(u8_4)*-4);
check("v*.w += vrmpy(v*.ub,r*.b)", hvx_width, i32_1 + i32(u8_1) + i32(u8_2)*2 + i32(u8_3)*3 + i32(u8_4)*4);
check("vrmpy(v*.ub,r*.ub)", hvx_width, u32(u16(u8_1)*255) + u32(u16(u8_2)*254) + u32(u16(u8_3)*253) + u32(u16(u8_4)*252));
check("v*.w += vrmpy(v*.ub,r*.b)", hvx_width, i32_1 + i32(i16(u8_1)*2) + i32(i16(u8_2)*-3) + i32(i16(u8_3)*-4) + i32(i16(u8_4)*5));
check("vrmpy(v*.ub,v*.ub)", hvx_width, u32(u8_1)*u8_1 + u32(u8_2)*u8_2 + u32(u8_3)*u8_3 + u32(u8_4)*u8_4);
check("vrmpy(v*.b,v*.b)", hvx_width, i32(i8_1)*i8_1 + i32(i8_2)*i8_2 + i32(i8_3)*i8_3 + i32(i8_4)*i8_4);
check("v*.uw += vrmpy(v*.ub,v*.ub)", hvx_width, u32_1 + u32(u8_1)*u8_1 + u32(u8_2)*u8_2 + u32(u8_3)*u8_3 + u32(u8_4)*u8_4);
check("v*.w += vrmpy(v*.b,v*.b)", hvx_width, i32_1 + i32(i8_1)*i8_1 + i32(i8_2)*i8_2 + i32(i8_3)*i8_3 + i32(i8_4)*i8_4);
#if 0
check("vrmpy(v*.ub,v*.b)", hvx_width, i32(u8_1)*i8_1) + i32(u8_2)*i8_2) + i32(u8_3)*i8_3 + i32(u8_4)*i8_4);
check("v*.w += vrmpy(v*.ub,v*.b)", hvx_width, i32_1 + i32(u8_1)*i8_1 + i32(u8_2)*i8_2 + i32(u8_3)*i8_3 + i32(u8_4)*i8_4);
check("vrmpy(v*.ub,v*.b)", hvx_width, i16(u8_1)*i8_1 + i16(u8_2)*i8_2 + i16(u8_3)*i8_3 + i16(u8_4)*i8_4);
#endif
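// Four adjacent byte loads, to check that an explicitly unrolled 4-wide reduction still matches vrmpy/vmpa.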
Expr u8_4x4[] = {
in_u8(4*x + 0),
in_u8(4*x + 1),
in_u8(4*x + 2),
in_u8(4*x + 3),
};
check("vrmpy(v*.ub,r*.b)", hvx_width/2, i16(u8_4x4[0])*127 + i16(u8_4x4[1])*126 + i16(u8_4x4[2])*-125 + i16(u8_4x4[3])*124);
check("vmpa(v*.ub,r*.b)", hvx_width, i16(u8_1)*127 + i16(u8_2)*-126 + i16(u8_3)*125 + i16(u8_4)*124);
check("v*.w += vasl(v*.w,r*)", hvx_width/4, u32_1 + (u32_2 * 8));
check("v*.w += vasl(v*.w,r*)", hvx_width/4, i32_1 + (i32_2 * 8));
check("v*.w += vasr(v*.w,r*)", hvx_width/4, i32_1 + (i32_2 / 8));
check("v*.w += vasl(v*.w,r*)", hvx_width/4, i32_1 + (i32_2 << (y % 32)));
check("v*.w += vasr(v*.w,r*)", hvx_width/4, i32_1 + (i32_2 >> (y % 32)));
check("vcl0(v*.uh)", hvx_width/2, count_leading_zeros(u16_1));
check("vcl0(v*.uw)", hvx_width/4, count_leading_zeros(u32_1));
check("vnormamt(v*.h)", hvx_width/2, max(count_leading_zeros(i16_1), count_leading_zeros(~i16_1)));
check("vnormamt(v*.w)", hvx_width/4, max(count_leading_zeros(i32_1), count_leading_zeros(~i32_1)));
check("vpopcount(v*.h)", hvx_width/2, popcount(u16_1));
}
void check_altivec_all() {
Expr f32_1 = in_f32(x), f32_2 = in_f32(x+16), f32_3 = in_f32(x+32);
Expr f64_1 = in_f64(x), f64_2 = in_f64(x+16), f64_3 = in_f64(x+32);
Expr i8_1 = in_i8(x), i8_2 = in_i8(x+16), i8_3 = in_i8(x+32);
Expr u8_1 = in_u8(x), u8_2 = in_u8(x+16), u8_3 = in_u8(x+32);
Expr i16_1 = in_i16(x), i16_2 = in_i16(x+16), i16_3 = in_i16(x+32);
Expr u16_1 = in_u16(x), u16_2 = in_u16(x+16), u16_3 = in_u16(x+32);
Expr i32_1 = in_i32(x), i32_2 = in_i32(x+16), i32_3 = in_i32(x+32);
Expr u32_1 = in_u32(x), u32_2 = in_u32(x+16), u32_3 = in_u32(x+32);
Expr i64_1 = in_i64(x), i64_2 = in_i64(x+16), i64_3 = in_i64(x+32);
Expr u64_1 = in_u64(x), u64_2 = in_u64(x+16), u64_3 = in_u64(x+32);
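// w scales each test from one up to four native 128-bit vectors.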
for (int w = 1; w <= 4; w++) {
check("vaddsbs", 16*w, i8_sat(i16( i8_1) + i16( i8_2)));
check("vaddshs", 8*w, i16_sat(i32(i16_1) + i32(i16_2)));
check("vaddsws", 4*w, i32_sat(i64(i32_1) + i64(i32_2)));
check("vaddubm", 16*w, i8_1 + i8_2);
check("vadduhm", 8*w, i16_1 + i16_2);
check("vadduwm", 4*w, i32_1 + i32_2);
check("vaddubs", 16*w, u8(min(u16( u8_1) + u16( u8_2), max_u8)));
check("vadduhs", 8*w, u16(min(u32(u16_1) + u32(u16_2), max_u16)));
check("vadduws", 4*w, u32(min(u64(u32_1) + u64(u32_2), max_u32)));
check("vsubsbs", 16*w, i8_sat(i16( i8_1) - i16( i8_2)));
check("vsubshs", 8*w, i16_sat(i32(i16_1) - i32(i16_2)));
check("vsubsws", 4*w, i32_sat(i64(i32_1) - i64(i32_2)));
check("vsububm", 16*w, i8_1 - i8_2);
check("vsubuhm", 8*w, i16_1 - i16_2);
check("vsubuwm", 4*w, i32_1 - i32_2);
check("vsububs", 16*w, u8(max(i16( u8_1) - i16( u8_2), 0)));
check("vsubuhs", 8*w, u16(max(i32(u16_1) - i32(u16_2), 0)));
check("vsubuws", 4*w, u32(max(i64(u32_1) - i64(u32_2), 0)));
check("vavgsb", 16*w, i8((i16( i8_1) + i16( i8_2) + 1)/2));
check("vavgub", 16*w, u8((u16( u8_1) + u16( u8_2) + 1)/2));
check("vavgsh", 8*w, i16((i32(i16_1) + i32(i16_2) + 1)/2));
check("vavguh", 8*w, u16((u32(u16_1) + u32(u16_2) + 1)/2));
check("vavgsw", 4*w, i32((i64(i32_1) + i64(i32_2) + 1)/2));
check("vavguw", 4*w, u32((u64(u32_1) + u64(u32_2) + 1)/2));
check("vmaxsb", 16*w, max( i8_1, i8_2));
check("vmaxub", 16*w, max( u8_1, u8_2));
check("vmaxsh", 8*w, max(i16_1, i16_2));
check("vmaxuh", 8*w, max(u16_1, u16_2));
check("vmaxsw", 4*w, max(i32_1, i32_2));
check("vmaxuw", 4*w, max(u32_1, u32_2));
check("vminsb", 16*w, min( i8_1, i8_2));
check("vminub", 16*w, min( u8_1, u8_2));
check("vminsh", 8*w, min(i16_1, i16_2));
check("vminuh", 8*w, min(u16_1, u16_2));
check("vminsw", 4*w, min(i32_1, i32_2));
check("vminuw", 4*w, min(u32_1, u32_2));
check(use_vsx ? "xvaddsp" : "vaddfp", 4*w, f32_1 + f32_2);
check(use_vsx ? "xvsubsp" : "vsubfp", 4*w, f32_1 - f32_2);
check(use_vsx ? "xvmaddasp" : "vmaddfp", 4*w, f32_1 * f32_2 + f32_3);
check("vmaxfp", 4*w, max(f32_1, f32_2));
check("vminfp", 4*w, min(f32_1, f32_2));
}
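// VSX adds single- and double-precision float arithmetic (the xv* ops).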
if (use_vsx) {
for (int w = 1; w <= 4; w++) {
check("xvadddp", 2*w, f64_1 + f64_2);
check("xvmuldp", 2*w, f64_1 * f64_2);
check("xvsubdp", 2*w, f64_1 - f64_2);
check("xvaddsp", 4*w, f32_1 + f32_2);
check("xvmulsp", 4*w, f32_1 * f32_2);
check("xvsubsp", 4*w, f32_1 - f32_2);
check("xvmaxdp", 2*w, max(f64_1, f64_2));
check("xvmindp", 2*w, min(f64_1, f64_2));
}
}
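// POWER ISA 2.07 adds 64-bit integer vector arithmetic.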
if (use_power_arch_2_07) {
for (int w = 1; w <= 4; w++) {
check("vaddudm", 2*w, i64_1 + i64_2);
check("vsubudm", 2*w, i64_1 - i64_2);
check("vmaxsd", 2*w, max(i64_1, i64_2));
check("vmaxud", 2*w, max(u64_1, u64_2));
check("vminsd", 2*w, min(i64_1, i64_2));
check("vminud", 2*w, min(u64_1, u64_2));
}
}
}
bool test_all() {
if (target.arch == Target::X86) {
check_sse_all();
} else if (target.arch == Target::ARM) {
check_neon_all();
} else if (target.arch == Target::Hexagon) {
check_hvx_all();
} else if (target.arch == Target::POWERPC) {
check_altivec_all();
}
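// The per-architecture checks above only queue Tasks; compile and verify them here on a thread pool.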
Halide::Internal::ThreadPool<TestResult> pool(num_threads);
std::vector<std::future<TestResult>> futures;
for (const Task &task : tasks) {
futures.push_back(pool.async([this, task]() {
return check_one(task.op, task.name, task.vector_width, task.expr);
}));
}
bool success = true;
for (auto &f : futures) {
const TestResult &result = f.get();
std::cout << result.op << "\n";
if (!result.error_msg.empty()) {
std::cerr << result.error_msg;
success = false;
}
}
return success;
}
};
int main(int argc, char **argv) {
Test test;
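// Usage: simd_op_check [op_filter] [output_dir]. Filtered runs are forced single-threaded.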
if (argc > 1) {
test.filter = argv[1];
num_threads = 1;
}
if (argc > 2) {
test.output_directory = argv[2];
}
bool success = test.test_all();
compile_standalone_runtime(test.output_directory + "simd_op_check_runtime.o", test.target);
if (!success) {
return -1;
}
printf("Success!\n");
return 0;
}