From b2625f38cc3a200e840861f2a3371eb8f5afdd73 Mon Sep 17 00:00:00 2001
From: Luca Bruno
Date: Wed, 6 Mar 2024 20:56:10 +0100
Subject: [PATCH] simd: split cursor advancing from value matching (#156)

This refactors all SIMD modules in order to make the value-matching
logic self-contained. Thus, all bytes-cursor manipulations are now
grouped and performed once at the end, outside of SIMD logic.
---
 src/lib.rs          |  33 +++++----
 src/simd/avx2.rs    |  48 +++++++------
 src/simd/mod.rs     |  30 ++++----
 src/simd/neon.rs    | 165 ++++++++++++++++++++++----------------
 src/simd/runtime.rs |  10 +--
 src/simd/sse42.rs   |  54 +++++++++------
 src/simd/swar.rs    | 123 ++++++++++++++------------------
 7 files changed, 229 insertions(+), 234 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 846ecee..3232852 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -953,18 +953,20 @@ fn parse_token<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
 #[allow(missing_docs)]
 // WARNING: Exported for internal benchmarks, not fit for public consumption
 pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
-    let start = bytes.pos();
-    simd::match_uri_vectored(bytes);
     // URI must have at least one char
-    if bytes.pos() == start {
+    let uri_len = simd::match_uri_vectored(bytes.as_ref());
+    if uri_len == 0 {
         return Err(Error::Token);
     }
+    // SAFETY: these bytes have just been matched here above.
+    unsafe { bytes.advance(uri_len) };
+    let uri_slice = bytes.slice();
 
-    if next!(bytes) == b' ' {
-        return Ok(Status::Complete(
-            // SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
-            unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
-        ));
+    let space_delim = next!(bytes);
+    if space_delim == b' ' {
+        // SAFETY: all bytes within `uri_slice` must have been `is_token` and therefore also utf-8.
+        let uri = unsafe { str::from_utf8_unchecked(uri_slice) };
+        Ok(Status::Complete(uri))
     } else {
         Err(Error::Token)
     }
@@ -1179,15 +1181,15 @@ fn parse_headers_iter_uninit<'a>(
             #[allow(clippy::never_loop)]
             // parse header name until colon
             let header_name: &str = 'name: loop {
-                simd::match_header_name_vectored(bytes);
-                let mut b = next!(bytes);
-
-                // SAFETY: previously bumped by 1 with next! -> always safe.
-                let bslice = unsafe { bytes.slice_skip(1) };
+                let len = simd::match_header_name_vectored(bytes.as_ref());
+                // SAFETY: these bytes have just been matched here above.
+                unsafe { bytes.advance(len) };
+                let bslice = bytes.slice();
 
                 // SAFETY: previous call to match_header_name_vectored ensured all bytes are valid
                 // header name chars, and as such also valid utf-8.
                 let name = unsafe { str::from_utf8_unchecked(bslice) };
 
+                let mut b = next!(bytes);
                 if b == b':' {
                     break 'name name;
                 }
@@ -1213,6 +1215,7 @@ fn parse_headers_iter_uninit<'a>(
             // eat white space between colon and value
             'whitespace_after_colon: loop {
                 b = next!(bytes);
+
                 if b == b' ' || b == b'\t' {
                     bytes.slice();
                     continue 'whitespace_after_colon;
@@ -1239,7 +1242,9 @@ fn parse_headers_iter_uninit<'a>(
             'value_lines: loop {
                 // parse value till EOL
 
-                simd::match_header_value_vectored(bytes);
+                let len = simd::match_header_value_vectored(bytes.as_ref());
+                // SAFETY: these bytes have just been matched here above.
+                unsafe { bytes.advance(len) };
                 let b = next!(bytes);
 
                 //found_ctl
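
NOTE: the calling convention the whole patch converges on, as a minimal
sketch; `Cursor` and `match_digits` here are hypothetical stand-ins for
the crate's `Bytes` cursor and its SIMD matchers, not httparse API:

    // Matchers become pure functions over `&[u8]`: they report how many
    // leading bytes matched and never touch the cursor themselves.
    fn match_digits(bytes: &[u8]) -> usize {
        bytes.iter().take_while(|b| b.is_ascii_digit()).count()
    }

    struct Cursor<'a> {
        buf: &'a [u8],
        pos: usize,
    }

    fn main() {
        let mut cur = Cursor { buf: b"123 rest", pos: 0 };
        // Value matching: side-effect free.
        let n = match_digits(&cur.buf[cur.pos..]);
        // Cursor manipulation: grouped in one place, at the call site.
        cur.pos += n;
        assert_eq!(cur.pos, 3);
    }
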
diff --git a/src/simd/avx2.rs b/src/simd/avx2.rs
index 556620c..4f7d6fa 100644
--- a/src/simd/avx2.rs
+++ b/src/simd/avx2.rs
@@ -1,24 +1,27 @@
-use crate::iter::Bytes;
-
 #[cfg(target_arch = "x86")]
-pub unsafe fn match_uri_vectored(_: &mut Bytes) {
+pub(crate) unsafe fn match_uri_vectored(_: &[u8]) -> usize {
     unreachable!("AVX2 detection should be disabled for x86");
 }
 
 #[inline]
 #[cfg(target_arch = "x86_64")]
 #[target_feature(enable = "avx2", enable = "sse4.2")]
-pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
-    while bytes.as_ref().len() >= 32 {
-        let advance = match_url_char_32_avx(bytes.as_ref());
-        bytes.advance(advance);
+pub(crate) unsafe fn match_uri_vectored(bytes: &[u8]) -> usize {
+    let mut len = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() >= 32 {
+        let advance = match_url_char_32_avx(remaining);
+        len = len.saturating_add(advance);
+        remaining = &bytes[len..];
 
         if advance != 32 {
-            return;
+            return len;
         }
     }
     // do both, since avx2 only works when bytes.len() >= 32
-    super::sse42::match_uri_vectored(bytes)
+    let advance = super::sse42::match_uri_vectored(remaining);
+    len = len.saturating_add(advance);
+    len
 }
 
 #[inline(always)]
@@ -64,23 +67,28 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
 }
 
 #[cfg(target_arch = "x86")]
-pub unsafe fn match_header_value_vectored(_: &mut Bytes) {
+pub(crate) unsafe fn match_header_value_vectored(_: &[u8]) -> usize {
     unreachable!("AVX2 detection should be disabled for x86");
 }
 
 #[cfg(target_arch = "x86_64")]
 #[target_feature(enable = "avx2", enable = "sse4.2")]
-pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) {
-    while bytes.as_ref().len() >= 32 {
-        let advance = match_header_value_char_32_avx(bytes.as_ref());
-        bytes.advance(advance);
+pub(crate) unsafe fn match_header_value_vectored(bytes: &[u8]) -> usize {
+    let mut len = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() >= 32 {
+        let advance = match_header_value_char_32_avx(remaining);
+        len = len.saturating_add(advance);
+        remaining = &bytes[len..];
 
         if advance != 32 {
-            return;
+            return len;
         }
     }
     // do both, since avx2 only works when bytes.len() >= 32
-    super::sse42::match_header_value_vectored(bytes)
+    let advance = super::sse42::match_header_value_vectored(remaining);
+    len = len.saturating_add(advance);
+    len
 }
 
 #[inline(always)]
@@ -152,7 +160,7 @@ fn avx2_code_matches_header_value_chars_table() {
 }
 
 #[cfg(test)]
-unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool {
+unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &[u8]) -> usize) -> bool {
     let slice = [
         b'_', b'_', b'_', b'_', b'_', b'_', b'_', b'_',
         b'_', b'_', b'_', b'_', b'_', b'_', b'_', b'_',
         b'_', b'_', b'_', b'_', b'_', b'_', b'_', b'_',
         b'_', b'_', byte, b'_', b'_', b'_', b'_', b'_',
     ];
 
-    let mut bytes = Bytes::new(&slice);
-
-    f(&mut bytes);
-
-    match bytes.pos() {
+    let pos = f(&slice);
+    match pos {
         32 => true,
         26 => false,
         _ => unreachable!(),
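
NOTE: every backend in this patch shares the same driver shape; a scalar
model of it (the closures are hypothetical stand-ins for the intrinsics,
and `match_chunk` is assumed to return at most `width`):

    // Consume full-width chunks while they match completely, then hand the
    // remainder to the next-narrower backend (AVX2 -> SSE4.2 -> SWAR).
    fn drive(
        bytes: &[u8],
        width: usize,
        match_chunk: impl Fn(&[u8]) -> usize,
        narrower: impl Fn(&[u8]) -> usize,
    ) -> usize {
        let mut len = 0;
        while bytes.len() - len >= width {
            let advance = match_chunk(&bytes[len..]);
            len += advance;
            if advance != width {
                // A chunk matched only partially: the answer is final here.
                return len;
            }
        }
        // Too few bytes left for this width: defer to the narrower backend.
        len + narrower(&bytes[len..])
    }
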
diff --git a/src/simd/mod.rs b/src/simd/mod.rs
index 63464b4..a2da053 100644
--- a/src/simd/mod.rs
+++ b/src/simd/mod.rs
@@ -11,7 +11,7 @@ mod swar;
         )
     ),
 )))]
-pub use self::swar::*;
+pub(crate) use self::swar::*;
 
 #[cfg(all(
     httparse_simd,
@@ -59,7 +59,7 @@ mod runtime;
         target_arch = "x86_64",
     ),
 ))]
-pub use self::runtime::*;
+pub(crate) use self::runtime::*;
 
 #[cfg(all(
     httparse_simd,
@@ -72,18 +72,18 @@ pub use self::runtime::*;
 ))]
 mod sse42_compile_time {
     #[inline(always)]
-    pub fn match_header_name_vectored(b: &mut crate::iter::Bytes<'_>) {
-        super::swar::match_header_name_vectored(b);
+    pub(crate) fn match_header_name_vectored(b: &[u8]) -> usize {
+        super::swar::match_header_name_vectored(b)
     }
 
     #[inline(always)]
-    pub fn match_uri_vectored(b: &mut crate::iter::Bytes<'_>) {
+    pub(crate) fn match_uri_vectored(b: &[u8]) -> usize {
         // SAFETY: calls are guarded by a compile time feature check
         unsafe { crate::simd::sse42::match_uri_vectored(b) }
     }
-    
+
     #[inline(always)]
-    pub fn match_header_value_vectored(b: &mut crate::iter::Bytes<'_>) {
+    pub(crate) fn match_header_value_vectored(b: &[u8]) -> usize {
         // SAFETY: calls are guarded by a compile time feature check
         unsafe { crate::simd::sse42::match_header_value_vectored(b) }
     }
@@ -98,7 +98,7 @@ mod sse42_compile_time {
         target_arch = "x86_64",
     ),
 ))]
-pub use self::sse42_compile_time::*;
+pub(crate) use self::sse42_compile_time::*;
 
 #[cfg(all(
     httparse_simd,
@@ -110,18 +110,18 @@ pub use self::sse42_compile_time::*;
 ))]
 mod avx2_compile_time {
     #[inline(always)]
-    pub fn match_header_name_vectored(b: &mut crate::iter::Bytes<'_>) {
-        super::swar::match_header_name_vectored(b);
+    pub(crate) fn match_header_name_vectored(b: &[u8]) -> usize {
+        super::swar::match_header_name_vectored(b)
    }
 
     #[inline(always)]
-    pub fn match_uri_vectored(b: &mut crate::iter::Bytes<'_>) {
+    pub(crate) fn match_uri_vectored(b: &[u8]) -> usize {
         // SAFETY: calls are guarded by a compile time feature check
         unsafe { crate::simd::avx2::match_uri_vectored(b) }
     }
-    
+
     #[inline(always)]
-    pub fn match_header_value_vectored(b: &mut crate::iter::Bytes<'_>) {
+    pub(crate) fn match_header_value_vectored(b: &[u8]) -> usize {
         // SAFETY: calls are guarded by a compile time feature check
         unsafe { crate::simd::avx2::match_header_value_vectored(b) }
     }
@@ -135,7 +135,7 @@ mod avx2_compile_time {
         target_arch = "x86_64",
    ),
 ))]
-pub use self::avx2_compile_time::*;
+pub(crate) use self::avx2_compile_time::*;
 
 #[cfg(all(
     httparse_simd,
@@ -149,4 +149,4 @@ mod neon;
     target_arch = "aarch64",
     httparse_simd_neon_intrinsics,
 ))]
-pub use self::neon::*;
+pub(crate) use self::neon::*;
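
NOTE: the `*_compile_time` wrappers above lean on a standard Rust pattern;
a self-contained sketch, with a stand-in body (x86_64 only, names assumed):

    // A `#[target_feature]` fn is unsafe to call: the CPU must support the
    // feature. When a cfg gate also guarantees the feature at compile time,
    // a safe wrapper can discharge that precondition once.
    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "sse4.2")]
    unsafe fn kernel(bytes: &[u8]) -> usize {
        // stand-in for the real SIMD body
        bytes.len()
    }

    #[cfg(all(target_arch = "x86_64", target_feature = "sse4.2"))]
    fn kernel_safe(bytes: &[u8]) -> usize {
        // SAFETY: sse4.2 is statically enabled for this target, so the
        // feature precondition holds on every CPU this binary may run on.
        unsafe { kernel(bytes) }
    }
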
diff --git a/src/simd/neon.rs b/src/simd/neon.rs
index c6b86a8..1e85589 100644
--- a/src/simd/neon.rs
+++ b/src/simd/neon.rs
@@ -1,52 +1,60 @@
-use crate::iter::Bytes;
 use core::arch::aarch64::*;
 
 #[inline]
-pub fn match_header_name_vectored(bytes: &mut Bytes) {
-    while bytes.as_ref().len() >= 16 {
-        // SAFETY: ensured that there are at least 16 bytes remaining
-        unsafe {
-            let advance = match_header_name_char_16_neon(bytes.as_ref().as_ptr());
-            bytes.advance(advance);
-
-            if advance != 16 {
-                return;
-            }
+pub(crate) fn match_header_name_vectored(bytes: &[u8]) -> usize {
+    let mut len = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() >= 16 {
+        // SAFETY: ensured that there are at least 16 bytes remaining.
+        let advance = unsafe { match_header_name_char_16_neon(remaining) };
+        len = len.saturating_add(advance);
+        remaining = &bytes[len..];
+
+        if advance != 16 {
+            return len;
         }
     }
-    super::swar::match_header_name_vectored(bytes);
+    let advance = super::swar::match_header_name_vectored(remaining);
+    len = len.saturating_add(advance);
+    len
 }
 
 #[inline]
-pub fn match_header_value_vectored(bytes: &mut Bytes) {
-    while bytes.as_ref().len() >= 16 {
-        // SAFETY: ensured that there are at least 16 bytes remaining
-        unsafe {
-            let advance = match_header_value_char_16_neon(bytes.as_ref().as_ptr());
-            bytes.advance(advance);
-
-            if advance != 16 {
-                return;
-            }
+pub(crate) fn match_header_value_vectored(bytes: &[u8]) -> usize {
+    let mut len = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() >= 16 {
+        // SAFETY: ensured that there are at least 16 bytes remaining.
+        let advance = unsafe { match_header_value_char_16_neon(remaining) };
+        len = len.saturating_add(advance);
+        remaining = &bytes[len..];
+
+        if advance != 16 {
+            return len;
        }
     }
-    super::swar::match_header_value_vectored(bytes);
+    let advance = super::swar::match_header_value_vectored(remaining);
+    len = len.saturating_add(advance);
+    len
 }
 
 #[inline]
-pub fn match_uri_vectored(bytes: &mut Bytes) {
-    while bytes.as_ref().len() >= 16 {
-        // SAFETY: ensured that there are at least 16 bytes remaining
-        unsafe {
-            let advance = match_url_char_16_neon(bytes.as_ref().as_ptr());
-            bytes.advance(advance);
-
-            if advance != 16 {
-                return;
-            }
+pub(crate) fn match_uri_vectored(bytes: &[u8]) -> usize {
+    let mut len = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() >= 16 {
+        // SAFETY: ensured that there are at least 16 bytes remaining.
+        let advance = unsafe { match_url_char_16_neon(remaining) };
+        len = len.saturating_add(advance);
+        remaining = &bytes[len..];
+
+        if advance != 16 {
+            return len;
        }
     }
-    super::swar::match_uri_vectored(bytes);
+    let advance = super::swar::match_uri_vectored(remaining);
+    len = len.saturating_add(advance);
+    len
 }
 
 const fn bit_set(x: u8) -> bool {
@@ -81,7 +89,7 @@ const BITMAPS: ([u8; 16], [u8; 16]) = build_bitmap();
 
 // NOTE: adapted from 256-bit version, with upper 128-bit ops commented out
 #[inline]
-unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize {
+unsafe fn match_header_name_char_16_neon(bytes: &[u8]) -> usize {
     let bitmaps = BITMAPS;
     // NOTE: ideally compile-time constants
     let (bitmap_0_7, _bitmap_8_15) = bitmaps;
@@ -94,7 +102,7 @@ unsafe fn match_header_name_char_16_neon(bytes: &[u8]) -> usize {
     let bitmask_lookup = vld1q_u8(BITMASK_LOOKUP_DATA.as_ptr());
 
     // Load 16 input bytes.
-    let input = vld1q_u8(ptr);
+    let input = vld1q_u8(bytes.as_ptr());
 
     // Extract indices for row_0_7.
     let indices_0_7 = vandq_u8(input, vdupq_n_u8(0x8F)); // 0b1000_1111;
@@ -122,8 +130,8 @@
 }
 
 #[inline]
-unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize {
-    let input = vld1q_u8(ptr);
+unsafe fn match_url_char_16_neon(bytes: &[u8]) -> usize {
+    let input = vld1q_u8(bytes.as_ptr());
 
     // Check that b'!' <= input <= b'~'
     let result = vandq_u8(
@@ -141,8 +149,8 @@
 }
 
 #[inline]
-unsafe fn match_header_value_char_16_neon(ptr: *const u8) -> usize {
-    let input = vld1q_u8(ptr);
+unsafe fn match_header_value_char_16_neon(bytes: &[u8]) -> usize {
+    let input = vld1q_u8(bytes.as_ptr());
 
     // Check that b' ' <= and b != 127 or b == 9
     let result = vcleq_u8(vdupq_n_u8(b' '), input);
@@ -195,67 +203,56 @@ unsafe fn offsetnz(x: uint8x16_t) -> u32 {
 
 #[test]
 fn neon_code_matches_uri_chars_table() {
-    #[allow(clippy::undocumented_unsafe_blocks)]
-    unsafe {
-        assert!(byte_is_allowed(b'_', match_uri_vectored));
-
-        for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() {
-            assert_eq!(
-                byte_is_allowed(b as u8, match_uri_vectored),
-                allowed,
-                "byte_is_allowed({:?}) should be {:?}",
-                b,
-                allowed,
-            );
-        }
+    assert!(byte_is_allowed(b'_', match_uri_vectored));
+
+    for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() {
+        assert_eq!(
+            byte_is_allowed(b as u8, match_uri_vectored),
+            allowed,
+            "byte_is_allowed({:?}) should be {:?}",
+            b,
+            allowed,
+        );
     }
 }
 
 #[test]
 fn neon_code_matches_header_value_chars_table() {
-    #[allow(clippy::undocumented_unsafe_blocks)]
-    unsafe {
-        assert!(byte_is_allowed(b'_', match_header_value_vectored));
-
-        for (b, allowed) in crate::HEADER_VALUE_MAP.iter().cloned().enumerate() {
-            assert_eq!(
-                byte_is_allowed(b as u8, match_header_value_vectored),
-                allowed,
-                "byte_is_allowed({:?}) should be {:?}",
-                b,
-                allowed,
-            );
-        }
+    assert!(byte_is_allowed(b'_', match_header_value_vectored));
+
+    for (b, allowed) in crate::HEADER_VALUE_MAP.iter().cloned().enumerate() {
+        assert_eq!(
+            byte_is_allowed(b as u8, match_header_value_vectored),
+            allowed,
+            "byte_is_allowed({:?}) should be {:?}",
+            b,
+            allowed,
+        );
     }
 }
 
 #[test]
 fn neon_code_matches_header_name_chars_table() {
-    #[allow(clippy::undocumented_unsafe_blocks)]
-    unsafe {
-        assert!(byte_is_allowed(b'_', match_header_name_vectored));
-
-        for (b, allowed) in crate::HEADER_NAME_MAP.iter().cloned().enumerate() {
-            assert_eq!(
-                byte_is_allowed(b as u8, match_header_name_vectored),
-                allowed,
-                "byte_is_allowed({:?}) should be {:?}",
-                b,
-                allowed,
-            );
-        }
+    assert!(byte_is_allowed(b'_', match_header_name_vectored));
+
+    for (b, allowed) in crate::HEADER_NAME_MAP.iter().cloned().enumerate() {
+        assert_eq!(
+            byte_is_allowed(b as u8, match_header_name_vectored),
+            allowed,
+            "byte_is_allowed({:?}) should be {:?}",
+            b,
+            allowed,
+        );
     }
 }
 
 #[cfg(test)]
-unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool {
+fn byte_is_allowed(byte: u8, f: fn(bytes: &[u8]) -> usize) -> bool {
     let mut slice = [b'_'; 16];
     slice[10] = byte;
-    let mut bytes = Bytes::new(&slice);
 
-    f(&mut bytes);
-
-    match bytes.pos() {
+    let pos = f(&slice);
+    match pos {
         16 => true,
         10 => false,
         x => panic!("unexpected pos: {}", x),
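
NOTE: each `*_char_16_neon` kernel implements the same contract; a scalar
model of what the vector code (bitmap lookup plus `offsetnz`) computes:

    // Return how many leading bytes of the 16-byte block satisfy the
    // predicate: 16 means "whole block matched, keep looping"; anything
    // less is the offset of the first rejected byte.
    fn match_16(block: &[u8; 16], allowed: impl Fn(u8) -> bool) -> usize {
        block.iter().position(|&b| !allowed(b)).unwrap_or(16)
    }
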
diff --git a/src/simd/runtime.rs b/src/simd/runtime.rs
index c523a92..c51e317 100644
--- a/src/simd/runtime.rs
+++ b/src/simd/runtime.rs
@@ -1,5 +1,5 @@
 use std::sync::atomic::{AtomicU8, Ordering};
-use crate::iter::Bytes;
+
 use super::avx2;
 use super::sse42;
 
@@ -30,11 +30,11 @@ fn get_runtime_feature() -> u8 {
     feature
 }
 
-pub fn match_header_name_vectored(bytes: &mut Bytes) {
-    super::swar::match_header_name_vectored(bytes);
+pub(crate) fn match_header_name_vectored(bytes: &[u8]) -> usize {
+    super::swar::match_header_name_vectored(bytes)
 }
 
-pub fn match_uri_vectored(bytes: &mut Bytes) {
+pub(crate) fn match_uri_vectored(bytes: &[u8]) -> usize {
     // SAFETY: calls are guarded by a feature check
     unsafe {
         match get_runtime_feature() {
@@ -45,7 +45,7 @@ pub fn match_uri_vectored(bytes: &mut Bytes) {
     }
 }
 
-pub fn match_header_value_vectored(bytes: &mut Bytes) {
+pub(crate) fn match_header_value_vectored(bytes: &[u8]) -> usize {
     // SAFETY: calls are guarded by a feature check
     unsafe {
         match get_runtime_feature() {
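
NOTE: the dispatch above hinges on `get_runtime_feature()`, which this
patch leaves unchanged; a sketch of that detect-once pattern (the constant
names and values here are assumptions, not the crate's):

    use std::sync::atomic::{AtomicU8, Ordering};

    const UNKNOWN: u8 = 0;
    const SWAR: u8 = 1;
    const SSE42: u8 = 2;
    const AVX2: u8 = 3;

    static FEATURE: AtomicU8 = AtomicU8::new(UNKNOWN);

    #[cfg(target_arch = "x86_64")]
    fn feature() -> u8 {
        let mut f = FEATURE.load(Ordering::Relaxed);
        if f == UNKNOWN {
            // Detect once and cache; a benign race at worst repeats the
            // cheap detection on another thread.
            f = if is_x86_feature_detected!("avx2") {
                AVX2
            } else if is_x86_feature_detected!("sse4.2") {
                SSE42
            } else {
                SWAR
            };
            FEATURE.store(f, Ordering::Relaxed);
        }
        f
    }
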
diff --git a/src/simd/sse42.rs b/src/simd/sse42.rs
index d6fbf02..d835f19 100644
--- a/src/simd/sse42.rs
+++ b/src/simd/sse42.rs
@@ -1,16 +1,20 @@
-use crate::iter::Bytes;
-
 #[target_feature(enable = "sse4.2")]
-pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
-    while bytes.as_ref().len() >= 16 {
-        let advance = match_url_char_16_sse(bytes.as_ref());
-        bytes.advance(advance);
+pub(crate) unsafe fn match_uri_vectored(bytes: &[u8]) -> usize {
+    let mut len = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() >= 16 {
+        // SAFETY: ensured that there are at least 16 bytes remaining.
+        let advance = match_url_char_16_sse(remaining);
+        len = len.saturating_add(advance);
+        remaining = &bytes[len..];
 
         if advance != 16 {
-            return;
+            return len;
         }
     }
-    super::swar::match_uri_vectored(bytes);
+    let advance = super::swar::match_uri_vectored(remaining);
+    len = len.saturating_add(advance);
+    len
 }
 
 #[inline(always)]
@@ -62,16 +66,22 @@ unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
 }
 
 #[target_feature(enable = "sse4.2")]
-pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) {
-    while bytes.as_ref().len() >= 16 {
-        let advance = match_header_value_char_16_sse(bytes.as_ref());
-        bytes.advance(advance);
-
-        if advance != 16 {
-            return;
-        }
+pub(crate) unsafe fn match_header_value_vectored(bytes: &[u8]) -> usize {
+    let mut len = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() >= 16 {
+        // SAFETY: ensured that there are at least 16 bytes remaining.
+        let advance = match_header_value_char_16_sse(remaining);
+        len = len.saturating_add(advance);
+        remaining = &bytes[len..];
+
+        if advance != 16 {
+            return len;
+        }
     }
-    super::swar::match_header_value_vectored(bytes);
+    let advance = super::swar::match_header_value_vectored(remaining);
+    len = len.saturating_add(advance);
+    len
 }
 
 #[inline(always)]
@@ -143,18 +153,16 @@ fn sse_code_matches_header_value_chars_table() {
 }
 
 #[allow(clippy::missing_safety_doc)]
 #[cfg(test)]
-unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool {
-    let slice = [
+unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &[u8]) -> usize) -> bool {
+    let slice = &[
         b'_', b'_', b'_', b'_', b'_', b'_', b'_', b'_',
         b'_', b'_', byte, b'_', b'_', b'_', b'_', b'_',
     ];
 
-    let mut bytes = Bytes::new(&slice);
-
-    f(&mut bytes);
-
-    match bytes.pos() {
+    let pos = f(slice);
+    match pos {
         16 => true,
         10 => false,
         _ => unreachable!(),
diff --git a/src/simd/swar.rs b/src/simd/swar.rs
index 4b352ba..940f500 100644
--- a/src/simd/swar.rs
+++ b/src/simd/swar.rs
@@ -1,83 +1,73 @@
 /// SWAR: SIMD Within A Register
 /// SIMD validator backend that validates register-sized chunks of data at a time.
-use crate::{is_header_name_token, is_header_value_token, is_uri_token, Bytes};
+use crate::{is_header_name_token, is_header_value_token, is_uri_token};
+use core::convert::TryInto;
 
 // Adapt block-size to match native register size, i.e: 32bit => 4, 64bit => 8
 const BLOCK_SIZE: usize = core::mem::size_of::<usize>();
 type ByteBlock = [u8; BLOCK_SIZE];
 
 #[inline]
-pub fn match_uri_vectored(bytes: &mut Bytes) {
-    loop {
-        if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) {
-            let n = match_uri_char_8_swar(bytes8);
-            // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes
-            // in `bytes`, so calling `advance(n)` is safe.
-            unsafe {
-                bytes.advance(n);
-            }
-            if n == BLOCK_SIZE {
-                continue;
-            }
+pub(crate) fn match_uri_vectored(bytes: &[u8]) -> usize {
+    let mut len = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() >= BLOCK_SIZE {
+        let block = &remaining[..BLOCK_SIZE];
+        let advance = match_uri_char_8_swar(block.try_into().unwrap());
+        len = len.saturating_add(advance);
+        remaining = &bytes[len..];
+        if advance != BLOCK_SIZE {
+            // NOTE: must continue to tail-matching logic below, due to known
+            // false-negatives that need to be individually checked.
+            break;
         }
-        if let Some(b) = bytes.peek() {
-            if is_uri_token(b) {
-                // SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte
-                // in bytes, so calling advance is safe.
-                unsafe {
-                    bytes.advance(1);
-                }
-                continue;
-            }
-        }
-        break;
     }
+    let tail_len = match_tail(is_uri_token, remaining);
+    len = len.saturating_add(tail_len);
+    len
 }
 
 #[inline]
-pub fn match_header_value_vectored(bytes: &mut Bytes) {
-    loop {
-        if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) {
-            let n = match_header_value_char_8_swar(bytes8);
-            // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes
-            // in `bytes`, so calling `advance(n)` is safe.
-            unsafe {
-                bytes.advance(n);
-            }
-            if n == BLOCK_SIZE {
-                continue;
-            }
+pub(crate) fn match_header_value_vectored(bytes: &[u8]) -> usize {
+    let mut len = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() >= BLOCK_SIZE {
+        let block = &remaining[..BLOCK_SIZE];
+        let advance = match_header_value_char_8_swar(block.try_into().unwrap());
+        len = len.saturating_add(advance);
+        remaining = &bytes[len..];
+        if advance != BLOCK_SIZE {
+            // NOTE: must continue to tail-matching logic below, due to known
+            // false-negatives that need to be individually checked.
+            break;
        }
-        if let Some(b) = bytes.peek() {
-            if is_header_value_token(b) {
-                // SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte
-                // in bytes, so calling advance is safe.
-                unsafe {
-                    bytes.advance(1);
-                }
-                continue;
-            }
-        }
-        break;
     }
+    let tail_len = match_tail(is_header_value_token, remaining);
+    len = len.saturating_add(tail_len);
+    len
 }
 
 #[inline]
-pub fn match_header_name_vectored(bytes: &mut Bytes) {
-    while let Some(block) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) {
-        let n = match_block(is_header_name_token, block);
-        // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes
-        // in `bytes`, so calling `advance(n)` is safe.
-        unsafe {
-            bytes.advance(n);
-        }
-        if n != BLOCK_SIZE {
-            return;
-        }
+pub(crate) fn match_header_name_vectored(bytes: &[u8]) -> usize {
+    let mut len = 0usize;
+    let mut remaining = bytes;
+    while remaining.len() >= BLOCK_SIZE {
+        let block = &remaining[..BLOCK_SIZE];
+        let advance = block.iter().position(|b| !is_header_name_token(*b));
+        match advance {
+            None => {
+                len = len.saturating_add(BLOCK_SIZE);
+                remaining = &bytes[len..];
+            }
+            Some(v) => {
+                len = len.saturating_add(v);
+                return len;
+            }
+        };
     }
-    // SAFETY: match_tail processes at most the remaining data in `bytes`. advances `bytes` to the
-    // end, but no further.
-    unsafe { bytes.advance(match_tail(is_header_name_token, bytes.as_ref())) };
+    let tail_len = match_tail(is_header_name_token, remaining);
+    len = len.saturating_add(tail_len);
+    len
 }
 
 // Matches "tail", i.e: when we have <BLOCK_SIZE bytes remaining.
@@ ... @@ fn match_tail(f: impl Fn(u8) -> bool, bytes: &[u8]) -> usize {
     bytes.len()
 }
 
-// Naive fallback block matcher
-#[inline(always)]
-fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize {
-    for (i, &b) in block.iter().enumerate() {
-        if !f(b) {
-            return i;
-        }
-    }
-    BLOCK_SIZE
-}
-
 // A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44)
 // creates a u64 whose bytes are each equal to b
 const fn uniform_block(b: u8) -> usize {
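
NOTE: on the swar "false-negatives" comments above: the register-wide
arithmetic test may under-report (stop before a byte that is actually
valid) but never over-reports, so a partial block result means "stop
vectoring", not "stop matching". A contract model of the driver, with
hypothetical closures standing in for the block matcher and token table:

    use std::convert::TryInto;

    // `block8` may return n < 8 even when bytes[n] is in fact a valid
    // token byte; everything before n is always valid. The driver thus
    // re-checks the remainder byte-by-byte (the role of `match_tail`)
    // instead of trusting n as the final answer.
    fn drive_swar(
        bytes: &[u8],
        block8: impl Fn(&[u8; 8]) -> usize,
        is_token: impl Fn(u8) -> bool,
    ) -> usize {
        let mut len = 0;
        while bytes.len() - len >= 8 {
            let block: &[u8; 8] = bytes[len..len + 8].try_into().unwrap();
            let n = block8(block);
            len += n;
            if n != 8 {
                break;
            }
        }
        len + bytes[len..].iter().take_while(|&&b| is_token(b)).count()
    }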