-
Notifications
You must be signed in to change notification settings - Fork 45
/
avx2.spec
222 lines (177 loc) · 4.73 KB
/
avx2.spec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# See: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html
# 256 bit logical and (maybe change in ecBDep to VPAND)
# Bitwise AND of two 256-bit words.
AND_u256(a@256, b@256) -> @256 =
and<256>(a, b)
# 8-bit (modulo 2^8) addition.
ADD_8(a@8, b@8) -> @8 =
add<8>(a, b)
# 8-bit two's-complement negation: -a = (a XOR 0xff) + 1.
OPP_8(a@8) -> @8 =
add<8>(xor<8>(a, 255), 1)
# Intel intrinsic: _mm256_permutexvar_epi32
# For each of the 8 32-bit lanes of widx, select the 32-bit word of w
# indexed by that lane's low 3 bits (8 dwords => 3 index bits).
VPERMD(widx@256, w@256) -> @256 =
map<32, 8>(
fun idx@32 . let i = idx[0:3] in w[@32|i],
widx
)
# Intel intrinsic: _mm256_permute4x64_epi64
# Each destination qword is selected from w by the corresponding 2-bit
# field of the immediate control byte i.
VPERMQ(w@256, i@8) -> @256 =
let permute (i@2) = w[@64|i] in
concat<64>(
permute(i[@2|0]),
permute(i[@2|1]),
permute(i[@2|2]),
permute(i[@2|3])
)
# Intel intrinsic: _mm256_add_epi16
# Lane-wise 16-bit addition over 16 lanes.
VPADD_16u16(w1@256, w2@256) -> @256 =
map<16, 16>(add<16>, w1, w2)
# Intel intrinsic: _mm256_add_epi8
# Lane-wise 8-bit addition over 32 lanes.
VPADD_32u8(w1@256, w2@256) -> @256 =
map<8, 32>(add<8>, w1, w2)
# Intel intrinsic: _mm256_sub_epi16
# Lane-wise 16-bit subtraction over 16 lanes.
VPSUB_16u16(w1@256, w2@256) -> @256 =
map<16, 16>(sub<16>, w1, w2)
# Intel intrinsic: _mm256_sub_epi8
# Lane-wise 8-bit subtraction over 32 lanes.
# (The comment previously said _mm256_sub_epi16 -- copy-paste slip; the
# definition below is clearly the 32x8-bit variant.)
VPSUB_32u8(w1@256, w2@256) -> @256 =
map<8, 32>(sub<8>, w1, w2)
# Intel intrinsic: _mm256_and_si256
# 256-bit bitwise AND.
VPAND_256(w1@256, w2@256) -> @256 =
and<256>(w1, w2)
# Intel intrinsic: _mm256_andnot_si256
# NOTE(review): _mm256_andnot_si256 computes (NOT w1) AND w2, but the
# body below is NOT(w1 AND w2), i.e. a NAND (consistent with the name
# VPNAND). Confirm which semantics callers of this spec expect.
VPNAND_256(w1@256, w2@256) -> @256 =
not<256>(and<256>(w1, w2))
# Intel intrinsic: _mm256_broadcastw_epi16
# Replicate the 16-bit input into all 16 lanes of the 256-bit result.
VPBROADCAST_16u16(w@16) -> @256 =
repeat<16>(w[@16|0], 16)
# Intel intrinsic: _mm256_mulhi_epu16
# Lane-wise high 16 bits of the unsigned 16x16 -> 32-bit product.
VPMULH_16u16(w1@256, w2@256) -> @256 =
map<16, 16>(umulhi<16>, w1, w2)
# Intel intrinsic: _mm256_mulhrs_epi16
# Per 16-bit lane: signed 16x16 -> 32-bit product, shifted right by 14,
# incremented, then bits [1:16] taken -- i.e. the rounded high half
# ((x*y >> 14) + 1) >> 1, matching the Intel pseudocode.
VPMULHRS_16u16(w1@256, w2@256) -> @256 =
map<16, 16>(
fun x@16 y@16 .
let w = smul<16>(x, y) in
let w = incr<32>(srl<32>(w, 14)) in
w[1:16],
w1,
w2
)
# Intel intrinsic: _mm256_srai_epi16
# Lane-wise arithmetic (sign-propagating) right shift by count.
VPSRA_16u16(w@256, count@8) -> @256 =
map<16, 16>(sra<16>(., count), w)
# Intel intrinsic: _mm256_srli_epi16
# Lane-wise logical (zero-filling) right shift by count.
VPSRL_16u16(w@256, count@8) -> @256 =
map<16, 16>(srl<16>(., count), w)
# Intel intrinsic: _mm256_srli_epi64
# Lane-wise 64-bit logical right shift by count.
VPSRL_4u64(w@256, count@8) -> @256 =
map<64, 4>(srl<64>(., count), w)
# Intel intrinsic: _mm256_sll_epi64
# Lane-wise 64-bit logical left shift by count.
# NOTE(review): with an 8-bit immediate count this actually matches
# _mm256_slli_epi64 (cf. VPSRL_4u64 above citing _mm256_srli_epi64);
# _mm256_sll_epi64 takes a __m128i count. Confirm the intended citation.
VPSLL_4u64(w@256, count@8) -> @256 =
map<64, 4>(sll<64>(., count), w)
# VPSLLDQ: shift each 128-bit lane left by count (cf. _mm256_slli_si256).
# NOTE(review): Intel PSLLDQ counts *bytes*; sll<128> here presumably
# counts bits -- confirm whether callers pre-scale count by 8.
VPSLLDQ_256(w@256, count@8) -> @256 =
map<128, 2>(sll<128>(., count), w)
# VPSRLDQ: shift each 128-bit lane right by count (cf. _mm256_srli_si256).
# NOTE(review): Intel PSRLDQ counts *bytes*; srl<128> here presumably
# counts bits -- confirm whether callers pre-scale count by 8.
VPSRLDQ_256(w@256, count@8) -> @256 =
map<128, 2>(srl<128>(., count), w)
# 128-bit variant of VPSLLDQ (cf. _mm_slli_si128); same byte-vs-bit
# count caveat as VPSLLDQ_256 above.
VPSLLDQ_128(w@128, count@8) -> @128 =
sll<128>(w, count)
# 128-bit variant of VPSRLDQ (cf. _mm_srli_si128); same byte-vs-bit
# count caveat as VPSRLDQ_256 above.
VPSRLDQ_128(w@128, count@8) -> @128 =
srl<128>(w, count)
# Intel intrinsic: _mm256_maddubs_epi16
# Per 16-bit lane: multiply the unsigned bytes of w1 by the signed bytes
# of w2 pairwise (usmul), then add the two 16-bit products with signed
# saturation (ssadd).
VPMADDUBSW_256(w1@256, w2@256) -> @256 =
map<16, 16>(
fun x@16 y@16 .
ssadd<16>(
usmul<8>(x[@8|0], y[@8|0]),
usmul<8>(x[@8|1], y[@8|1])
),
w1,
w2
)
# Intel intrinsic: _mm256_packus_epi16
# Pack 16-bit lanes to 8 bits with unsigned saturation; the 128-bit
# halves of w1 and w2 are interleaved as the hardware does
# (w1.lo, w2.lo, w1.hi, w2.hi).
VPACKUS_16u16(w1@256, w2@256) -> @256 =
let pack (w@128) = map<16, 8>(usat<16, 8>, w) in
concat<64>(
pack(w1[@128|0]),
pack(w2[@128|0]),
pack(w1[@128|1]),
pack(w2[@128|1])
)
# Intel intrinsic: _mm256_packs_epi16
# Pack 16-bit lanes to 8 bits with signed saturation; halves interleaved
# as in VPACKUS_16u16.
VPACKSS_16u16(w1@256, w2@256) -> @256 =
let pack (w@128) = map<16, 8>(ssat<16, 8>, w) in
concat<64>(
pack(w1[@128|0]),
pack(w2[@128|0]),
pack(w1[@128|1]),
pack(w2[@128|1])
)
# Intel intrinsic: _mm256_shuffle_epi8
# Per 128-bit lane: each control byte of widx selects a byte of the same
# lane of w by its low 4 bits, or yields 0 when its top bit (bit 7) is set.
VPSHUFB_256(w@256, widx@256) -> @256 =
map<128, 2>(
fun w@128 widx@128 .
map<8, 16>(
fun idx@8 . idx[7] ? 0 : w[@8|idx[0:4]],
widx
),
w,
widx
)
# Intel intrinsic: _mm256_blend_epi16
# FIXME: we need an heterogeneous `map' combinator
# The 8-bit immediate is repeated for both 128-bit lanes; each control
# bit (extended to a 16-bit lane) selects w2 when set, w1 otherwise.
VPBLEND_16u16(w1@256, w2@256, c@8) -> @256 =
let c = repeat<8>(c, 2) in
let c = map<1, 16>(uextend<1, 16>, c) in
map<16, 16>(
fun c@16 w1@16 w2@16 . c[0] ? w2 : w1,
c,
w1,
w2
)
# Intel intrinsic: _mm256_cmpgt_epi16
# Lane-wise signed greater-than: all-ones mask where w1 > w2, else zero.
VPCMPGT_16u16(w1@256, w2@256) -> @256 =
map<16, 16>(
fun w1@16 w2@16 . sgt<16>(w1, w2) ? 0xffff@16 : 0x0000@16,
w1,
w2
)
# Intel intrinsic: _mm256_movemask_epi8
# Gather the top bit (bit 7) of each of the 32 bytes into a 32-bit mask.
# NOTE(review): the name says u64 but the declared result is @32 -- the
# body produces 32 bits; confirm the naming convention.
VPMOVMSKB_u256u64(w@256) -> @32 =
map<8, 32>(fun i@8 . i[7], w)
# Intel intrinsic: _mm256_unpacklo_epi8
# Byte-interleave the low qword of each 128-bit lane of w1 with the
# corresponding low qword of w2 (qword indices 0 and 2 of the 256-bit
# inputs), lane by lane as the hardware does.
VPUNPCKL_32u8(w1@256, w2@256) -> @256 =
let interleave (w1@64, w2@64) =
map<8, 8>(
fun w1@8 w2@8 . concat<8>(w1, w2),
w1,
w2
)
in
concat<128>(
interleave(w1[@64|0], w2[@64|0]),
interleave(w1[@64|2], w2[@64|2])
)
# Intel intrinsic: _mm256_extracti128_si256
# Select the low (i[0] = 0) or high (i[0] = 1) 128-bit half of w.
VPEXTRACTI128(w@256, i@8) -> @128 =
w[@128|i[0]]
# Same operation as VPEXTRACTI128 under the VEX mnemonic name:
# select a 128-bit half of w by i[0].
VEXTRACTI128(w@256, i@8) -> @128 =
w[@128|i[0]]
# Intel intrinsic: _mm256_inserti128_si256
# Replace the 128-bit half of w selected by i[0] with m.
VPINSERTI128(w@256, m@128, i@8) -> @256 =
w[@128|i[0] <- m]
# XOR
# 256-bit bitwise XOR (cf. _mm256_xor_si256).
VPXOR_256(w1@256, w2@256) -> @256 =
xor<256>(w1, w2)
# 128-bit bitwise XOR (cf. _mm_xor_si128).
VPXOR_128(w1@128, w2@128) -> @128 =
xor<128>(w1, w2)
# FIXME
# Build a 256-bit word from two 128-bit halves; the arguments are passed
# to concat in reversed order (b first), presumably so that b becomes
# the low half and a the high half -- confirm concat's lane ordering.
concat_2u128(a@128, b@128) -> @256 =
concat<128>(b, a)
# Add later
# Keep only the low 128-bit half of a 256-bit word.
truncateu128(w@256) -> @128 =
w[@128|0]
## Auxiliary stuff
# Compress a 16-bit value to 4 bits via a multiply-by-reciprocal:
# result = ((w << 4) + 1665) * 80635 >> 28, low 4 bits.
# NOTE(review): the constants match ML-KEM/Kyber Compress_q(w, 4) with
# q = 3329 (1665 = (q+1)/2 rounding offset, 80635 ~= 2^28 / q) -- i.e.
# round(w * 16 / q) mod 16; confirm against the caller's use.
COMPRESS(w@16) -> @4 =
srl<32>(umul<16>(
add<16>(
sll<16>(w, 4),
1665)
, 80635), 28)[@4|0]