Removed more "/KYBER_Q" from the source code; if compiled to DIV, they might be turned into a plaintext-checking oracle (thanks to Prasanna Ravi and Matthias Kannwischer for reporting!)
pq-crystals · Dec 30, 2023 · 11d00ff · 11d00ff
1 parent bc8e640
commit 11d00ff
Show file tree

Hide file tree

Showing 6 changed files with 46 additions and 106 deletions.
diff --git a/avx2/poly.c b/avx2/poly.c
@@ -22,94 +22,7 @@
 *                            (of length KYBER_POLYCOMPRESSEDBYTES)
 *              - const poly *a: pointer to input polynomial
 **************************************************/
-#if (KYBER_POLYCOMPRESSEDBYTES == 96)
-void poly_compress(uint8_t r[96], const poly * restrict a)
-{
-  unsigned int i;
-  __m256i f0, f1, f2, f3;
-  __m128i t0, t1;
-  const __m256i v = _mm256_load_si256(&qdata.vec[_16XV/16]);
-  const __m256i shift1 = _mm256_set1_epi16(1 << 8);
-  const __m256i mask = _mm256_set1_epi16(7);
-  const __m256i shift2 = _mm256_set1_epi16((8 << 8) + 1);
-  const __m256i shift3 = _mm256_set1_epi32((64 << 16) + 1);
-  const __m256i sllvdidx = _mm256_set1_epi64x(12LL << 32);
-  const __m256i shufbidx = _mm256_set_epi8( 8, 2, 1, 0,-1,-1,-1,-1,14,13,12, 6, 5, 4,10, 9,
-                                           -1,-1,-1,-1,14,13,12, 6, 5, 4,10, 9, 8, 2, 1, 0);
-
-  for(i=0;i<KYBER_N/64;i++) {
-    f0 = _mm256_load_si256(&a->vec[4*i+0]);
-    f1 = _mm256_load_si256(&a->vec[4*i+1]);
-    f2 = _mm256_load_si256(&a->vec[4*i+2]);
-    f3 = _mm256_load_si256(&a->vec[4*i+3]);
-    f0 = _mm256_mulhi_epi16(f0,v);
-    f1 = _mm256_mulhi_epi16(f1,v);
-    f2 = _mm256_mulhi_epi16(f2,v);
-    f3 = _mm256_mulhi_epi16(f3,v);
-    f0 = _mm256_mulhrs_epi16(f0,shift1);
-    f1 = _mm256_mulhrs_epi16(f1,shift1);
-    f2 = _mm256_mulhrs_epi16(f2,shift1);
-    f3 = _mm256_mulhrs_epi16(f3,shift1);
-    f0 = _mm256_and_si256(f0,mask);
-    f1 = _mm256_and_si256(f1,mask);
-    f2 = _mm256_and_si256(f2,mask);
-    f3 = _mm256_and_si256(f3,mask);
-    f0 = _mm256_packus_epi16(f0,f1);
-    f2 = _mm256_packus_epi16(f2,f3);
-    f0 = _mm256_maddubs_epi16(f0,shift2);	// a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
-    f2 = _mm256_maddubs_epi16(f2,shift2);	// c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7
-    f0 = _mm256_madd_epi16(f0,shift3);		// a0 a1 b0 b1 a2 a3 b2 b3
-    f2 = _mm256_madd_epi16(f2,shift3);		// c0 c1 d0 d1 c2 c3 d2 d3
-    f0 = _mm256_sllv_epi32(f0,sllvdidx);
-    f2 = _mm256_sllv_epi32(f2,sllvdidx);
-    f0 = _mm256_hadd_epi32(f0,f2);		// a0 c0 c0 d0 a1 b1 c1 d1
-    f0 = _mm256_permute4x64_epi64(f0,0xD8);	// a0 b0 a1 b1 c0 d0 c1 d1
-    f0 = _mm256_shuffle_epi8(f0,shufbidx);
-    t0 = _mm256_castsi256_si128(f0);
-    t1 = _mm256_extracti128_si256(f0,1);
-    t0 = _mm_blend_epi32(t0,t1,0x08);
-    _mm_storeu_si128((__m128i *)&r[24*i+ 0],t0);
-    _mm_storel_epi64((__m128i *)&r[24*i+16],t1);
-  }
-}
-
-/*************************************************
-* Name:        poly_decompress
-*
-* Description: De-serialization and subsequent decompression of a polynomial;
-*              approximate inverse of poly_compress
-*
-* Arguments:   - poly *r: pointer to output polynomial
-*              - const uint8_t *a: pointer to input byte array
-*                                  (of length KYBER_POLYCOMPRESSEDBYTES bytes)
-**************************************************/
-void poly_decompress(poly * restrict r, const uint8_t a[96])
-{
-  unsigned int i;
-  __m128i t;
-  __m256i f;
-  const __m256i q = _mm256_load_si256(&qdata.vec[_16XQ/16]);
-  const __m256i shufbidx = _mm256_set_epi8(5,5,5,5,5,4,4,4,4,4,4,3,3,3,3,3,
-                                           2,2,2,2,2,1,1,1,1,1,1,0,0,0,0,0);
-  const __m256i mask = _mm256_set_epi16(224,28,896,112,14,448,56,7,
-                                        224,28,896,112,14,448,56,7);
-  const __m256i shift = _mm256_set_epi16(128,1024,32,256,2048,64,512,4096,
-                                         128,1024,32,256,2048,64,512,4096);
-
-  for(i=0;i<KYBER_N/16;i++) {
-    t = _mm_castps_si128(_mm_load_ss((float *)&a[6*i+0])));
-    t = _mm_insert_epi16(t,*(int16_t *)&a[6*i+4],2);
-    f = _mm256_broadcastsi128_si256(t);
-    f = _mm256_blend_epi16(f,g,0x);
-    f = _mm256_shuffle_epi8(f,shufbidx);
-    f = _mm256_and_si256(f,mask);
-    f = _mm256_mullo_epi16(f,shift);
-    f = _mm256_mulhrs_epi16(f,q);
-    _mm256_store_si256(&r->vec[i],f);
-  }
-}
-
-#elif (KYBER_POLYCOMPRESSEDBYTES == 128)
+#if (KYBER_POLYCOMPRESSEDBYTES == 128)
 void poly_compress(uint8_t r[128], const poly * restrict a)
 {
   unsigned int i;

diff --git a/ref/poly.c b/ref/poly.c
@@ -18,16 +18,23 @@
 void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
 {
   unsigned int i,j;
-  int16_t u;
+  int32_t u;
+  uint32_t d0;
   uint8_t t[8];
 
 #if (KYBER_POLYCOMPRESSEDBYTES == 128)
+
   for(i=0;i<KYBER_N/8;i++) {
     for(j=0;j<8;j++) {
       // map to positive standard representatives
       u  = a->coeffs[8*i+j];
       u += (u >> 15) & KYBER_Q;
-      t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15;
+/*    t[j] = ((((uint16_t)u << 4) + KYBER_Q/2)/KYBER_Q) & 15; */
+      d0 = u << 4;
+      d0 += 1665;
+      d0 *= 80635;
+      d0 >>= 28;
+      t[j] = d0 & 0xf;
     }
 
     r[0] = t[0] | (t[1] << 4);
@@ -42,7 +49,12 @@ void poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a)
       // map to positive standard representatives
       u  = a->coeffs[8*i+j];
       u += (u >> 15) & KYBER_Q;
-      t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31;
+/*    t[j] = ((((uint32_t)u << 5) + KYBER_Q/2)/KYBER_Q) & 31; */
+      d0 = u << 5;
+      d0 += 1664;
+      d0 *= 40318;
+      d0 >>= 27;
+      t[j] = d0 & 0x1f;
     }
 
     r[0] = (t[0] >> 0) | (t[1] << 5);

diff --git a/ref/polyvec.c b/ref/polyvec.c
@@ -15,6 +15,7 @@
 void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
 {
   unsigned int i,j,k;
+  uint64_t d0;
 
 #if (KYBER_POLYVECCOMPRESSEDBYTES == (KYBER_K * 352))
   uint16_t t[8];
@@ -23,7 +24,14 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
       for(k=0;k<8;k++) {
         t[k]  = a->vec[i].coeffs[8*j+k];
         t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-        t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff;
+/*      t[k]  = ((((uint32_t)t[k] << 11) + KYBER_Q/2)/KYBER_Q) & 0x7ff; */
+        d0 = t[k];
+        d0 <<= 11;
+        d0 += 1664;
+        d0 *= 645084;
+        d0 >>= 31;
+        t[k] = d0 & 0x7ff;
+
       }
 
       r[ 0] = (t[0] >>  0);
@@ -47,7 +55,13 @@ void polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a)
       for(k=0;k<4;k++) {
         t[k]  = a->vec[i].coeffs[4*j+k];
         t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
-        t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff;
+/*      t[k]  = ((((uint32_t)t[k] << 10) + KYBER_Q/2)/ KYBER_Q) & 0x3ff; */
+        d0 = t[k];
+        d0 <<= 10;
+        d0 += 1665;
+        d0 *= 1290167;
+        d0 >>= 32;
+        t[k] = d0 & 0x3ff;
       }
 
       r[0] = (t[0] >> 0);

diff --git a/ref/test/test_kyber.c b/ref/test/test_kyber.c
@@ -6,7 +6,7 @@
 
 #define NTESTS 1000
 
-static int test_keys()
+static int test_keys(void)
 {
   uint8_t pk[CRYPTO_PUBLICKEYBYTES];
   uint8_t sk[CRYPTO_SECRETKEYBYTES];
@@ -31,7 +31,7 @@ static int test_keys()
   return 0;
 }
 
-static int test_invalid_sk_a()
+static int test_invalid_sk_a(void)
 {
   uint8_t pk[CRYPTO_PUBLICKEYBYTES];
   uint8_t sk[CRYPTO_SECRETKEYBYTES];
@@ -59,7 +59,7 @@ static int test_invalid_sk_a()
   return 0;
 }
 
-static int test_invalid_ciphertext()
+static int test_invalid_ciphertext(void)
 {
   uint8_t pk[CRYPTO_PUBLICKEYBYTES];
   uint8_t sk[CRYPTO_SECRETKEYBYTES];

diff --git a/ref/test/test_speed.c b/ref/test/test_speed.c
@@ -16,7 +16,7 @@
 uint64_t t[NTESTS];
 uint8_t seed[KYBER_SYMBYTES] = {0};
 
-int main()
+int main(void)
 {
   unsigned int i;
   uint8_t pk[CRYPTO_PUBLICKEYBYTES];

diff --git a/runtests.sh b/runtests.sh
@@ -11,21 +11,22 @@ else
 fi
 
 if [ "$ARCH" = "amd64" -o "$ARCH" = "arm64" ]; then
-  export CC=/usr/bin/clang 
-  export CFLAGS="-fsanitize=undefined,address ${CFLAGS}"
+  export CC=/usr/bin/gcc
+#  export CFLAGS="-fsanitize=undefined,address ${CFLAGS}"
 fi
 
 for dir in $DIRS; do
   make -j$(nproc) -C $dir clean
   make -j$(nproc) -C $dir
   for alg in 512 768 1024; do
-    #valgrind --vex-guest-max-insns=25 ./$dir/test_kyber$alg
-    ./$dir/test/test_kyber$alg &
-    PID1=$!
-    echo testvec$alg
-    ./$dir/test/test_vectors$alg > tvecs$alg &
-    PID2=$!
-    wait $PID1 $PID2
+    valgrind --vex-guest-max-insns=25 ./$dir/test/test_kyber$alg
+    echo test_kyber$alg
+    ./$dir/test/test_kyber$alg 
+#    PID1=$!
+#    echo testvec$alg
+    ./$dir/test/test_vectors$alg > tvecs$alg 
+#    PID2=$!
+#    wait $PID1 $PID2
   done
   shasum -a256 -c SHA256SUMS
 done