From 12e9711813ccdca852a59af0c91c633d1fd031b0 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 20 Mar 2023 19:16:57 -0700 Subject: [PATCH] Cleanup some HWIntrinsic logic to ensure the right gtType and simdSize are being set (#83516) * Cleanup some HWIntrinsic logic to ensure the right gtType and simdSize are being set * Apply formatting patch * Add a missing GetLower call * Fix an assert for lowering TYP_SIMD12 where its handled as TYP_SIMD16 * Ensure GetLower is used in Dot for TYP_SIMD32 * Apply formatting patch * Insert after, not before, for the _GetLower to avoid a codegen regression * Put the _GetLower in the right place to avoid the codegen regression * Don't change the simd size of TYP_SIMD8/12 DotProduct unnecessarily --- src/coreclr/jit/gentree.cpp | 59 ++++++------ src/coreclr/jit/hwintrinsicarm64.cpp | 8 +- src/coreclr/jit/hwintrinsicxarch.cpp | 2 +- src/coreclr/jit/lower.h | 2 +- src/coreclr/jit/lowerxarch.cpp | 128 ++++++++++++--------------- 5 files changed, 89 insertions(+), 110 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index f811a72061f9..7f9f38da566b 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19544,7 +19544,8 @@ GenTree* Compiler::gtNewSimdBinOpNode(genTreeOps op, assert(varTypeIsArithmetic(simdBaseType)); assert(op1 != nullptr); - assert(op1->TypeIs(type, simdBaseType, genActualType(simdBaseType))); + assert(op1->TypeIs(type, simdBaseType, genActualType(simdBaseType)) || + (op1->TypeIs(TYP_SIMD12) && type == TYP_SIMD16)); assert(op2 != nullptr); @@ -19554,7 +19555,8 @@ GenTree* Compiler::gtNewSimdBinOpNode(genTreeOps op, } else { - assert(op2->TypeIs(type, simdBaseType, genActualType(simdBaseType))); + assert(op2->TypeIs(type, simdBaseType, genActualType(simdBaseType)) || + (op2->TypeIs(TYP_SIMD12) && type == TYP_SIMD16)); } NamedIntrinsic intrinsic = NI_Illegal; @@ -22425,11 +22427,9 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, GenTree* vecCon2 = gtCloneExpr(vecCon1); - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, vecCon1, NI_SSE2_And, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); - tmp2 = gtNewSimdHWIntrinsicNode(type, op2, vecCon2, NI_SSE2_And, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); - tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE, + tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE, simdSize, isSimdAsHWIntrinsic); CorInfoType permuteBaseJitType = (simdBaseType == TYP_BYTE) ? CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG; @@ -22468,11 +22468,9 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, GenTree* vecCon2 = gtCloneExpr(vecCon1); - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, vecCon1, NI_SSE2_And, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); - tmp2 = gtNewSimdHWIntrinsicNode(type, op2, vecCon2, NI_SSE2_And, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); - tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE41_PackUnsignedSaturate, CORINFO_TYPE_USHORT, + tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_PackUnsignedSaturate, CORINFO_TYPE_USHORT, simdSize, isSimdAsHWIntrinsic); CorInfoType permuteBaseJitType = (simdBaseType == TYP_BYTE) ? CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG; @@ -22576,10 +22574,8 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, GenTree* vecCon2 = gtCloneExpr(vecCon1); - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, vecCon1, NI_SSE2_And, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); - tmp2 = gtNewSimdHWIntrinsicNode(type, op2, vecCon2, NI_SSE2_And, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); + tmp1 = gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp2 = gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE, simdSize, isSimdAsHWIntrinsic); @@ -22618,10 +22614,10 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, GenTree* vecCon2 = gtCloneExpr(vecCon1); - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, vecCon1, NI_SSE2_And, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); - tmp2 = gtNewSimdHWIntrinsicNode(type, op2, vecCon2, NI_SSE2_And, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); + tmp1 = + gtNewSimdBinOpNode(GT_AND, type, op1, vecCon1, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp2 = + gtNewSimdBinOpNode(GT_AND, type, op2, vecCon2, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE41_PackUnsignedSaturate, CORINFO_TYPE_USHORT, simdSize, isSimdAsHWIntrinsic); @@ -22928,8 +22924,8 @@ GenTree* Compiler::gtNewSimdShuffleNode(var_types type, simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE; GenTree* op1Dup = fgMakeMultiUse(&op1, clsHnd); - GenTree* op1Lower = gtNewSimdHWIntrinsicNode(type, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); + GenTree* op1Lower = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); op2 = gtNewVconNode(TYP_SIMD16); op2->AsVecCon()->gtSimd16Val = vecCns.v128[0]; @@ -22937,7 +22933,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(var_types type, op1Lower = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Lower, op2, NI_SSSE3_Shuffle, simdBaseJitType, 16, isSimdAsHWIntrinsic); - GenTree* op1Upper = gtNewSimdHWIntrinsicNode(type, op1Dup, gtNewIconNode(1), NI_AVX_ExtractVector128, + GenTree* op1Upper = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1Dup, gtNewIconNode(1), NI_AVX_ExtractVector128, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); op2 = gtNewVconNode(TYP_SIMD16); @@ -23346,12 +23342,12 @@ GenTree* Compiler::gtNewSimdSumNode( op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode(0x01, TYP_INT), NI_AVX_ExtractVector128, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); - tmp = gtNewSimdHWIntrinsicNode(simdType, tmp, NI_Vector256_GetLower, simdBaseJitType, simdSize, + tmp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp, NI_Vector256_GetLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, tmp, intrinsic, simdBaseJitType, 16, isSimdAsHWIntrinsic); } - return gtNewSimdHWIntrinsicNode(type, op1, NI_Vector128_ToScalar, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, op1, NI_Vector128_ToScalar, simdBaseJitType, 16, isSimdAsHWIntrinsic); #elif defined(TARGET_ARM64) switch (simdBaseType) { @@ -23544,8 +23540,8 @@ GenTree* Compiler::gtNewSimdWidenLowerNode( assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); - tmp1 = - gtNewSimdHWIntrinsicNode(type, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); switch (simdBaseType) { @@ -23673,7 +23669,8 @@ GenTree* Compiler::gtNewSimdWidenLowerNode( if (simdSize == 8) { - tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_GetLower, simdBaseJitType, 16, isSimdAsHWIntrinsic); + tmp1 = + gtNewSimdHWIntrinsicNode(TYP_SIMD8, tmp1, NI_Vector128_GetLower, simdBaseJitType, 16, isSimdAsHWIntrinsic); } return tmp1; @@ -23706,8 +23703,8 @@ GenTree* Compiler::gtNewSimdWidenUpperNode( assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, gtNewIconNode(1), NI_AVX_ExtractVector128, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode(1), NI_AVX_ExtractVector128, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); switch (simdBaseType) { @@ -23860,7 +23857,7 @@ GenTree* Compiler::gtNewSimdWidenUpperNode( zero = gtNewZeroConNode(TYP_SIMD16); tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, zero, gtNewIconNode(index), NI_AdvSimd_ExtractVector128, simdBaseJitType, 16, isSimdAsHWIntrinsic); - return gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_GetLower, simdBaseJitType, simdSize, + return gtNewSimdHWIntrinsicNode(TYP_SIMD8, tmp1, NI_Vector128_GetLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } #else diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 57f3498ad401..01f7a4420df1 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -424,7 +424,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, assert(retType == TYP_SIMD8); op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_Vector128_GetLower, simdBaseJitType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_Vector128_GetLower, simdBaseJitType, simdSize); break; } @@ -1056,7 +1056,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, if (varTypeIsByte(simdBaseType) && (simdSize == 16)) { - CORINFO_CLASS_HANDLE simdClsHnd = gtGetStructHandleForSimdOrHW(simdType, simdBaseJitType); + CORINFO_CLASS_HANDLE simdClsHnd = gtGetStructHandleForSimdOrHW(TYP_SIMD16, simdBaseJitType); op1 = impCloneExpr(op1, &op2, simdClsHnd, CHECK_SPILL_ALL, nullptr DEBUGARG("Clone op1 for vector extractmostsignificantbits")); @@ -1069,10 +1069,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, /* isSimdAsHWIntrinsic */ false); op1 = gtNewCastNode(TYP_INT, op1, /* isUnsigned */ true, TYP_INT); - GenTree* zero = gtNewZeroConNode(simdType); + GenTree* zero = gtNewZeroConNode(TYP_SIMD16); ssize_t index = 8 / genTypeSize(simdBaseType); - op2 = gtNewSimdHWIntrinsicNode(simdType, op2, zero, gtNewIconNode(index), NI_AdvSimd_ExtractVector128, + op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, zero, gtNewIconNode(index), NI_AdvSimd_ExtractVector128, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op2, NI_Vector128_GetLower, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 977bb1b320a8..e94123307059 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1382,7 +1382,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, simdVal.u64[3] = 0x8080808080808080; shuffleIntrinsic = NI_AVX2_Shuffle; - moveMaskIntrinsic = NI_AVX2_MoveMask; + moveMaskIntrinsic = NI_SSE2_MoveMask; } else if (compOpportunisticallyDependsOn(InstructionSet_SSSE3)) { diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 2a7eb25ca9c7..db5aa9dc2119 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -349,13 +349,13 @@ class Lowering final : public Phase GenTree* LowerHWIntrinsic(GenTreeHWIntrinsic* node); void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition); GenTree* LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp); - void LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicDot(GenTreeHWIntrinsic* node); #if defined(TARGET_XARCH) void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node); void LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node); void LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node); + GenTree* LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node); GenTree* TryLowerAndOpToResetLowestSetBit(GenTreeOp* andNode); GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 56fd1c30c1a6..f7b087b9db07 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1049,8 +1049,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Vector128_ConditionalSelect: case NI_Vector256_ConditionalSelect: { - LowerHWIntrinsicCndSel(node); - break; + return LowerHWIntrinsicCndSel(node); } case NI_Vector128_Create: @@ -1787,7 +1786,7 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm // Arguments: // node - The hardware intrinsic node. // -void Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) +GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) { var_types simdType = node->gtType; CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); @@ -1844,7 +1843,7 @@ void Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) { // result = BlendVariable op3 (right) op2 (left) op1 (mask) node->ResetHWIntrinsicId(blendVariableId, comp, op3, op2, op1); - return; + return LowerNode(node); } } @@ -1860,6 +1859,7 @@ void Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) GenTree* tmp1; GenTree* tmp2; GenTree* tmp3; + GenTree* tmp4; LIR::Use op1Use(BlockRange(), &node->Op(1), node); ReplaceWithLclVar(op1Use); @@ -1882,42 +1882,24 @@ void Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) BlockRange().InsertAfter(op3, tmp3); LowerNode(tmp3); - // determine which Or intrinsic to use, depending on target architecture - NamedIntrinsic orIntrinsic = NI_Illegal; - - if (simdSize == 32) - { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + // ... + // tmp4 = tmp2 | tmp3 + // ... + tmp4 = comp->gtNewSimdBinOpNode(GT_OR, simdType, tmp2, tmp3, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + BlockRange().InsertBefore(node, tmp4); - if (varTypeIsFloating(simdBaseType)) - { - orIntrinsic = NI_AVX_Or; - } - else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) - { - orIntrinsic = NI_AVX2_Or; - } - else - { - // Since this is a bitwise operation, we can still support it by lying - // about the type and doing the operation using a supported instruction - orIntrinsic = NI_AVX_Or; - simdBaseJitType = CORINFO_TYPE_FLOAT; - } - } - else if (simdBaseType == TYP_FLOAT) + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) { - orIntrinsic = NI_SSE_Or; + use.ReplaceWith(tmp4); } else { - orIntrinsic = NI_SSE2_Or; + tmp4->SetUnusedValue(); } - // ... - // result = tmp2 | tmp3 - node->ResetHWIntrinsicId(orIntrinsic, tmp2, tmp3); - node->SetSimdBaseJitType(simdBaseJitType); + BlockRange().Remove(node); + return LowerNode(tmp4); } //---------------------------------------------------------------------------------------------- @@ -1998,6 +1980,9 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) { if (isCreateScalar) { + node->gtType = TYP_SIMD16; + node->SetSimdSize(16); + switch (simdBaseType) { case TYP_BYTE: @@ -3284,6 +3269,7 @@ void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node) unreached(); } + node->SetSimdSize(16); node->ResetHWIntrinsicId(resIntrinsic, op1); } else @@ -3328,11 +3314,10 @@ void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node) unreached(); } + node->SetSimdSize(16); node->ResetHWIntrinsicId(resIntrinsic, op1, op2); } - node->SetSimdSize(16); - if (node->GetHWIntrinsicId() != intrinsicId) { LowerNode(node); @@ -3738,9 +3723,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) GenTree* tmp2 = nullptr; GenTree* tmp3 = nullptr; - NamedIntrinsic multiply = NI_Illegal; NamedIntrinsic horizontalAdd = NI_Illegal; - NamedIntrinsic add = NI_Illegal; NamedIntrinsic shuffle = NI_Illegal; if (simdSize == 32) @@ -3753,10 +3736,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) case TYP_UINT: { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); - - multiply = NI_AVX2_MultiplyLow; horizontalAdd = NI_AVX2_HorizontalAdd; - add = NI_AVX2_Add; break; } @@ -3769,11 +3749,13 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) // /--* op1 simd16 // +--* op2 simd16 // +--* idx int - // tmp1 = * HWINTRINSIC simd16 T DotProduct - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 + // tmp1 = * HWINTRINSIC simd32 T DotProduct + // /--* tmp1 simd32 + // * STORE_LCL_VAR simd32 + // tmp1 = LCL_VAR simd32 + // /--* tmp1 simd32 + // tmp1 = * HWINTRINSIC simd16 T GetLower + // tmp2 = LCL_VAR simd32 // idx = CNS_INT int 0x01 // /--* tmp2 simd16 // +--* idx int @@ -3814,8 +3796,13 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) BlockRange().InsertAfter(idx, tmp2); LowerNode(tmp2); - tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_SSE_Add, simdBaseJitType, 16); - BlockRange().InsertAfter(tmp2, tmp3); + tmp1 = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, NI_Vector256_GetLower, simdBaseJitType, simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + tmp3 = comp->gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, tmp2, tmp1, simdBaseJitType, 16, false); + BlockRange().InsertAfter(tmp1, tmp3); LowerNode(tmp3); node->SetSimdSize(16); @@ -3828,10 +3815,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) case TYP_DOUBLE: { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); - - multiply = NI_AVX_Multiply; horizontalAdd = NI_AVX_HorizontalAdd; - add = NI_AVX_Add; break; } @@ -3850,9 +3834,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) case TYP_SHORT: case TYP_USHORT: { - multiply = NI_SSE2_MultiplyLow; horizontalAdd = NI_SSSE3_HorizontalAdd; - add = NI_SSE2_Add; if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSSE3)) { @@ -3864,11 +3846,8 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) case TYP_INT: case TYP_UINT: { - multiply = NI_SSE41_MultiplyLow; - horizontalAdd = NI_SSSE3_HorizontalAdd; - add = NI_SSE2_Add; - assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41)); + horizontalAdd = NI_SSSE3_HorizontalAdd; break; } @@ -3927,9 +3906,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) return LowerNode(node); } - multiply = NI_SSE_Multiply; horizontalAdd = NI_SSE3_HorizontalAdd; - add = NI_SSE_Add; if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) { @@ -3981,9 +3958,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) return LowerNode(node); } - multiply = NI_SSE2_Multiply; horizontalAdd = NI_SSE3_HorizontalAdd; - add = NI_SSE2_Add; if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) { @@ -4046,10 +4021,9 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) GenTreeVecCon* vecCon1 = comp->gtNewVconNode(simdType); memcpy(&vecCon1->gtSimdVal, &simd16Val, sizeof(simd16_t)); - BlockRange().InsertAfter(op1, vecCon1); - op1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, vecCon1, NI_SSE_And, simdBaseJitType, simdSize); + op1 = comp->gtNewSimdBinOpNode(GT_AND, simdType, op1, vecCon1, simdBaseJitType, simdSize, false); BlockRange().InsertAfter(vecCon1, op1); LowerNode(vecCon1); @@ -4075,10 +4049,9 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) GenTreeVecCon* vecCon2 = comp->gtNewVconNode(simdType); memcpy(&vecCon2->gtSimdVal, &simd16Val, sizeof(simd16_t)); - BlockRange().InsertAfter(op2, vecCon2); - op2 = comp->gtNewSimdHWIntrinsicNode(simdType, op2, vecCon2, NI_SSE_And, simdBaseJitType, simdSize); + op2 = comp->gtNewSimdBinOpNode(GT_AND, simdType, op2, vecCon2, simdBaseJitType, simdSize, false); BlockRange().InsertAfter(vecCon2, op2); LowerNode(vecCon2); @@ -4096,7 +4069,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) // var tmp1 = Isa.Multiply(op1, op2); // ... - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, multiply, simdBaseJitType, simdSize); + tmp1 = comp->gtNewSimdBinOpNode(GT_MUL, simdType, op1, op2, simdBaseJitType, simdSize, false); BlockRange().InsertBefore(node, tmp1); LowerNode(tmp1); @@ -4301,7 +4274,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) // tmp1 = Isa.Add(tmp1, tmp2); // ... - tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, add, simdBaseJitType, simdSize); + tmp1 = comp->gtNewSimdBinOpNode(GT_ADD, simdType, tmp1, tmp2, simdBaseJitType, simdSize, false); } BlockRange().InsertAfter(tmp2, tmp1); @@ -4312,12 +4285,14 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) { // We will be constructing the following parts: // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd32 + // * STORE_LCL_VAR simd32 + // tmp1 = LCL_VAR simd32 + // /--* tmp1 simd32 + // tmp1 = * HWINTRINSIC simd16 T GetLower + // tmp2 = LCL_VAR simd32 // idx = CNS_INT int 0x01 - // /--* tmp2 simd16 + // /--* tmp2 simd32 // +--* idx int // tmp2 = * HWINTRINSIC simd16 T ExtractVector128 // /--* tmp1 simd16 @@ -4329,9 +4304,11 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) // ... // var tmp2 = tmp1; // tmp2 = Avx.ExtractVector128(tmp2, 0x01); - // var tmp1 = Isa.Add(tmp1, tmp2); + // var tmp1 = Isa.Add(tmp1.GetLower(), tmp2); // ... + assert(simdBaseType != TYP_FLOAT); + node->Op(1) = tmp1; LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); ReplaceWithLclVar(tmp1Use); @@ -4348,11 +4325,16 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) BlockRange().InsertAfter(idx, tmp2); LowerNode(tmp2); - tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, add, simdBaseJitType, 16); + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, NI_Vector256_GetLower, simdBaseJitType, simdSize); BlockRange().InsertAfter(tmp2, tmp1); LowerNode(tmp1); + tmp3 = comp->gtNewSimdBinOpNode(GT_ADD, TYP_SIMD16, tmp2, tmp1, simdBaseJitType, 16, false); + BlockRange().InsertAfter(tmp1, tmp3); + LowerNode(tmp3); + node->SetSimdSize(16); + tmp1 = tmp3; } if (varTypeIsSIMD(node->gtType))