netmap: Reply to ARP requests from gateway for scan source IPs (#807)

* netmap: Reply to ARP requests from gateway for scan source IPs

In netmap mode, the OS network stack never gets to see incoming packets
unless we explicitly forward them to the host rings, so the kernel will
not respond to ARP requests. To remove the need for static ARP entries
on the gateway, respond to ARP requests from the gateway for any of the
source IPs of the scan.

* PR feedback: Document why ARP is handled in netmap mode

---------

Co-authored-by: Phillip Stephens <phillip@cs.stanford.edu>
droe and phillip-stephens committed Mar 5, 2024
1 parent f0ba1ad commit 660f7d9
Showing 3 changed files with 208 additions and 55 deletions.
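
Before the per-file diffs, a minimal standalone sketch of the ARP-reply logic this change adds to recv-netmap.c may help orientation. It is a simplified illustration, not the committed code: my_mac and my_ip stand in for zconf.hw_mac and a scan source IP, the gateway-MAC check and the netmap TX path are left out, and memcpy is used for the 32-bit address fields instead of the cast-based access macros used below.

/* Sketch only: parse an incoming Ethernet/ARP request and, if it asks
 * for my_ip, build a reply claiming my_mac. out must hold
 * ARP_SKETCH_PKT_LEN bytes; returns the reply length, or 0 to ignore. */
#include <net/ethernet.h>
#include <net/if_arp.h>
#include <netinet/in.h>
#include <stdint.h>
#include <string.h>

#define ARP_SKETCH_PKT_LEN (sizeof(struct ether_header) + sizeof(struct arphdr) + \
                            2 * ETHER_ADDR_LEN + 2 * sizeof(uint32_t))

static size_t
arp_reply_sketch(const uint8_t *req, size_t req_len,
                 const uint8_t my_mac[ETHER_ADDR_LEN], uint32_t my_ip,
                 uint8_t *out)
{
	if (req_len < ARP_SKETCH_PKT_LEN)
		return 0;
	const struct ether_header *req_eh = (const struct ether_header *)req;
	const struct arphdr *req_ah = (const struct arphdr *)(req_eh + 1);
	/* Ethernet/IPv4 ARP body: sender MAC, sender IP, target MAC, target IP. */
	const uint8_t *sha = (const uint8_t *)(req_ah + 1);
	const uint8_t *spa = sha + ETHER_ADDR_LEN;
	const uint8_t *tpa = spa + sizeof(uint32_t) + ETHER_ADDR_LEN;

	uint32_t target_ip;
	memcpy(&target_ip, tpa, sizeof(target_ip));
	if (req_eh->ether_type != htons(ETHERTYPE_ARP) ||
	    req_ah->ar_hrd != htons(ARPHRD_ETHER) ||
	    req_ah->ar_pro != htons(ETHERTYPE_IP) ||
	    req_ah->ar_op != htons(ARPOP_REQUEST) ||
	    target_ip != my_ip)
		return 0;

	/* Ethernet header: from us, unicast back to the requester. */
	struct ether_header *eh = (struct ether_header *)out;
	memcpy(eh->ether_shost, my_mac, ETHER_ADDR_LEN);
	memcpy(eh->ether_dhost, sha, ETHER_ADDR_LEN);
	eh->ether_type = htons(ETHERTYPE_ARP);

	/* ARP reply: we are the sender, the requester becomes the target. */
	struct arphdr *ah = (struct arphdr *)(eh + 1);
	ah->ar_hrd = htons(ARPHRD_ETHER);
	ah->ar_pro = htons(ETHERTYPE_IP);
	ah->ar_hln = ETHER_ADDR_LEN;
	ah->ar_pln = sizeof(uint32_t);
	ah->ar_op = htons(ARPOP_REPLY);
	uint8_t *body = (uint8_t *)(ah + 1);
	memcpy(body, my_mac, ETHER_ADDR_LEN);                                        /* sender MAC */
	memcpy(body + ETHER_ADDR_LEN, tpa, sizeof(uint32_t));                        /* sender IP  */
	memcpy(body + ETHER_ADDR_LEN + sizeof(uint32_t), sha, ETHER_ADDR_LEN);       /* target MAC */
	memcpy(body + 2 * ETHER_ADDR_LEN + sizeof(uint32_t), spa, sizeof(uint32_t)); /* target IP  */
	return ARP_SKETCH_PKT_LEN;
}
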
166 changes: 130 additions & 36 deletions src/recv-netmap.c
@@ -23,6 +23,7 @@
#include "../lib/logger.h"

#include <net/netmap_user.h>
#include <net/if_arp.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <fcntl.h>
@@ -33,41 +34,121 @@
#include <assert.h>
#include <inttypes.h>

static struct pollfd fds;
static struct netmap_if *nm_if;
static bool *in_multi_seg_packet;
static void handle_packet_wait_ping(uint32_t buflen, const uint8_t *bytes, UNUSED const struct timespec ts);
static void (*handle_packet_func)(uint32_t buflen, const uint8_t *bytes, const struct timespec ts);
static if_stats_ctx_t *stats_ctx;
static bool need_recv_counter;
static uint64_t recv_counter;
typedef size_t (*make_packet_func_t)(uint8_t *buf, void const *arg);

// Send a packet on a netmap ring and fd directly.
// Used to send packets before send threads are up.
static void
handle_packet_wait_ping(uint32_t buflen, const uint8_t *bytes, UNUSED const struct timespec ts)
send_packet(make_packet_func_t mkpkt, void const *arg)
{
if (buflen < sizeof(struct ether_header) + sizeof(struct ip) + ICMP_MINLEN) {
// Synthesize a sock_t for the main netmap fd.
// We're syncing all TX rings this way, not just ring 0.
sock_t sock;
sock.nm.tx_ring_idx = 0;
sock.nm.tx_ring_fd = zconf.nm.nm_fd;

batch_t *batch = create_packet_batch(1);
batch->lens[0] = (int)mkpkt((uint8_t *)batch->packets, arg);
assert(batch->lens[0] <= MAX_PACKET_SIZE);
batch->ips[0] = 0; // unused by netmap
batch->len = 1;
if (send_batch_internal(sock, batch) != 1) {
log_fatal("recv-netmap", "Failed to send packet: %d: %s", errno, strerror(errno));
}
free_packet_batch(batch);
}

// Submit a packet for sending by send thread 0.
// Used to send packets after send threads are up.
// Submitted packets are sent once per scan batch.
static void
submit_packet(make_packet_func_t mkpkt, void const *arg)
{
batch_t *batch = create_packet_batch(1);
batch->lens[0] = (int)mkpkt((uint8_t *)batch->packets, arg);
assert(batch->lens[0] <= MAX_PACKET_SIZE);
batch->ips[0] = 0; // unused by netmap
batch->len = 1;
submit_batch_internal(batch); // consumes batch
}

// In netmap mode, the OS network stack never gets to see incoming packets
// unless we explicitly forward them to the host rings; hence the kernel will
// not be responding to ARP requests. To remove the need for static ARP
// entries on the gateway, respond to ARP requests from the gateway for any of
// the source IPs of the scan.

#define ARP_ETHER_INET_PKT_LEN (sizeof(struct ether_header) + sizeof(struct arphdr) + 2 * ETHER_ADDR_LEN + 2 * sizeof(uint32_t))
#define x_ar_sha(ap) ((uint8_t *)((ap) + 1))
#define x_ar_spa(ap) (((uint8_t *)((ap) + 1)) + ETHER_ADDR_LEN)
#define x_ar_tha(ap) (((uint8_t *)((ap) + 1)) + ETHER_ADDR_LEN + sizeof(uint32_t))
#define x_ar_tpa(ap) (((uint8_t *)((ap) + 1)) + 2 * ETHER_ADDR_LEN + sizeof(uint32_t))

static size_t
make_arp_resp(uint8_t *buf, void const *arg)
{
struct arphdr const *req_ah = (struct arphdr const *)arg;

struct ether_header *eh = (struct ether_header *)buf;
memcpy(eh->ether_shost, zconf.hw_mac, ETHER_ADDR_LEN);
memcpy(eh->ether_dhost, x_ar_sha(req_ah), ETHER_ADDR_LEN);
eh->ether_type = htons(ETHERTYPE_ARP);

struct arphdr *ah = (struct arphdr *)(eh + 1);
ah->ar_hrd = htons(ARPHRD_ETHER);
ah->ar_pro = htons(ETHERTYPE_IP);
ah->ar_hln = ETHER_ADDR_LEN;
ah->ar_pln = sizeof(uint32_t);
ah->ar_op = htons(ARPOP_REPLY);
memcpy(x_ar_sha(ah), zconf.hw_mac, ETHER_ADDR_LEN);
*(uint32_t *)x_ar_spa(ah) = *(uint32_t *)x_ar_tpa(req_ah);
memcpy(x_ar_tha(ah), x_ar_sha(req_ah), ETHER_ADDR_LEN);
*(uint32_t *)x_ar_tpa(ah) = *(uint32_t *)x_ar_spa(req_ah);

return ARP_ETHER_INET_PKT_LEN;
}

static void
handle_packet_arp(uint32_t buflen, const uint8_t *bytes, UNUSED const struct timespec ts)
{
if (buflen < ARP_ETHER_INET_PKT_LEN) {
return;
}
struct ether_header *eh = (struct ether_header *)bytes;
if (eh->ether_type != htons(ETHERTYPE_IP)) {
if (eh->ether_type != htons(ETHERTYPE_ARP)) {
return;
}
struct ip *iph = (struct ip *)(eh + 1);
if (iph->ip_v != 4 ||
iph->ip_p != IPPROTO_ICMP ||
iph->ip_src.s_addr != zconf.nm.wait_ping_dstip) {
struct arphdr *ah = (struct arphdr *)(eh + 1);
if (ah->ar_op != htons(ARPOP_REQUEST) ||
ah->ar_hrd != htons(ARPHRD_ETHER) ||
ah->ar_pro != htons(ETHERTYPE_IP) ||
ah->ar_hln != ETHER_ADDR_LEN ||
ah->ar_pln != sizeof(uint32_t)) {
return;
}
struct icmp *icmph = (struct icmp *)(iph + 1);
if (icmph->icmp_type != ICMP_ECHOREPLY) {
macaddr_t *sender_hardware_address = (macaddr_t *)x_ar_sha(ah);
if (memcmp(sender_hardware_address, eh->ether_shost, ETHER_ADDR_LEN) != 0 ||
memcmp(sender_hardware_address, zconf.gw_mac, ETHER_ADDR_LEN) != 0) {
return;
}

log_debug("recv-netmap", "Received ICMP echo reply, ready to commence scan");
handle_packet_func = handle_packet;
in_addr_t target_protocol_address = *(in_addr_t *)x_ar_tpa(ah);
for (size_t i = 0; i < zconf.number_source_ips; i++) {
if (target_protocol_address == zconf.source_ip_addresses[i]) {
log_debug("recv-netmap", "Received ARP request from gateway");
if (handle_packet_func == handle_packet_wait_ping) {
send_packet(make_arp_resp, (void const *)ah);
} else {
submit_packet(make_arp_resp, (void const *)ah);
}
return;
}
}
}

static size_t
make_wait_ping_req(uint8_t *buf)
make_wait_ping_req(uint8_t *buf, UNUSED void const *arg)
{
struct ether_header *eh = (struct ether_header *)buf;
make_eth_header(eh, zconf.hw_mac, zconf.gw_mac);
@@ -89,17 +170,28 @@ make_wait_ping_req(uint8_t *buf)
}

static void
send_wait_ping_req(sock_t sock)
handle_packet_wait_ping(uint32_t buflen, const uint8_t *bytes, UNUSED const struct timespec ts)
{
batch_t *batch = create_packet_batch(1);
batch->lens[0] = (int)make_wait_ping_req((uint8_t *)batch->packets);
batch->ips[0] = zconf.nm.wait_ping_dstip;
batch->len = 1;
if (send_batch(sock, batch, 1) != 1) {
log_fatal("recv-netmap", "Failed to send ICMP echo request: %d: %s", errno, strerror(errno));
if (buflen < sizeof(struct ether_header) + sizeof(struct ip) + ICMP_MINLEN) {
return;
}
free_packet_batch(batch);
log_debug("recv-netmap", "Sent ICMP echo request");
struct ether_header *eh = (struct ether_header *)bytes;
if (eh->ether_type != htons(ETHERTYPE_IP)) {
return;
}
struct ip *iph = (struct ip *)(eh + 1);
if (iph->ip_v != 4 ||
iph->ip_p != IPPROTO_ICMP ||
iph->ip_src.s_addr != zconf.nm.wait_ping_dstip) {
return;
}
struct icmp *icmph = (struct icmp *)(iph + 1);
if (icmph->icmp_type != ICMP_ECHOREPLY) {
return;
}

log_debug("recv-netmap", "Received ICMP echo reply, ready to commence scan");
handle_packet_func = handle_packet;
}

#ifndef NSEC_PER_SEC
@@ -135,13 +227,6 @@ wait_for_e2e_connectivity(void)
{
static const time_t timeout_secs = 60;

// Synthesize a sock_t for the main netmap fd.
// This is safe as long as send threads are not spun up yet.
// We're syncing all TX rings this way, not just ring 0.
sock_t sock;
sock.nm.tx_ring_idx = 0;
sock.nm.tx_ring_fd = zconf.nm.nm_fd;

struct timespec t_start;
timespec_get_monotonic(&t_start);
struct timespec t_last_send;
@@ -159,14 +244,22 @@ wait_for_e2e_connectivity(void)
}

if (timespec_diff(&t_now, &t_last_send).tv_sec >= 1) {
send_wait_ping_req(sock);
send_packet(make_wait_ping_req, NULL);
timespec_get_monotonic(&t_last_send);
log_debug("recv-netmap", "Sent ICMP echo request");
}

recv_packets();
}
}

static struct pollfd fds;
static struct netmap_if *nm_if;
static bool *in_multi_seg_packet;
static if_stats_ctx_t *stats_ctx;
static bool need_recv_counter;
static uint64_t recv_counter;

void recv_init(void)
{
fds.fd = zconf.nm.nm_fd;
@@ -276,6 +369,7 @@ recv_packets(void)
if (need_recv_counter) {
recv_counter++;
}
handle_packet_arp(slot->len, (uint8_t *)buf, ts);
handle_packet_func(slot->len, (uint8_t *)buf, ts);
}
rxring->cur = rxring->head = head;
3 changes: 3 additions & 0 deletions src/send-internal.h
@@ -16,6 +16,9 @@ int send_batch(sock_t sock, batch_t *batch, int retries);

#if defined(PFRING)
#include "send-pfring.h"
#elif defined(NETMAP)
void submit_batch_internal(batch_t *batch);
int send_batch_internal(sock_t sock, batch_t *batch);
#elif defined(__linux__)
#include "send-linux.h"
#endif
94 changes: 75 additions & 19 deletions src/send-netmap.c
@@ -20,16 +20,32 @@
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>

#include "../lib/includes.h"
#include "../lib/logger.h"
#include "../lib/queue.h"

#include "socket.h"
#include "state.h"

static pthread_once_t submit_queue_inited = PTHREAD_ONCE_INIT;
static zqueue_t *submit_queue;

static void
submit_queue_init_once(void)
{
submit_queue = queue_init();
assert(submit_queue);
}

int
send_run_init(sock_t sock)
{
if (sock.nm.tx_ring_idx == 0) {
pthread_once(&submit_queue_inited, submit_queue_init_once);
}

struct pollfd fds = {
.fd = sock.nm.tx_ring_fd,
.events = POLLOUT,
@@ -42,27 +58,23 @@ send_run_init(sock_t sock)
return 0;
}

// This implementation does not use attempts, because retries do not
// make sense based on the premise that syncing a TX ring will never
// fail for transient reasons.
//
// This implementation never reports batches as partially failed,
// because the netmap API does not have partial failure semantics.
// All we know is that a poll or ioctl syscall failed, not if or
// how many of the packets we placed in the ringbuffer were sent.
//
// ZMap's current architecture forces us to copy packet data here.
// An even more optimised implementation might reuse packet data
// in buffers (unless NS_BUF_CHANGED has been set by the kernel on
// a slot), and only update the fields that need to change, such
// as dst IP, checksum etc depending on scan type and params.
int
send_batch(sock_t sock, batch_t *batch, UNUSED int attempts)
// Called from the recv thread to submit a batch of packets
// for sending on thread 0; typically batch size is just 1.
// Used for responding to ARP requests.
// The way this works is rather inefficient and only makes
// sense for low volume packets.
// Since we don't know if send_run_init() has been called
// yet or not, we need to ensure the queue is initialized.
void
submit_batch_internal(batch_t *batch)
{
if (batch->len == 0) {
return 0;
}
pthread_once(&submit_queue_inited, submit_queue_init_once);
push_back((void *)batch, submit_queue);
}

int
send_batch_internal(sock_t sock, batch_t *batch)
{
struct netmap_ring *ring = NETMAP_TXRING(zconf.nm.nm_if, sock.nm.tx_ring_idx);
struct pollfd fds = {
.fd = sock.nm.tx_ring_fd,
@@ -96,3 +108,47 @@ send_batch(sock_t sock, batch_t *batch, UNUSED int attempts)

return batch->len;
}

// Netmap's send_batch does not use attempts, because retries do
// not make sense based on the premise that syncing a TX ring will
// never fail for transient reasons.
//
// Netmap's send_batch never reports batches as partially failed,
// because the netmap API does not have partial failure semantics.
// All we know is that a poll or ioctl syscall failed, not if or
// how many of the packets we placed in the ringbuffer were sent.
//
// There is a bit of unused optimization potential here; ZMap's
// current architecture requires us to copy packet data on the
// send path, we cannot supply netmap buffers to ZMap to write
// into directly. And even though netmap would allow us to reuse
// data still in buffers (unless NS_BUF_CHANGED has been set by
// the kernel), we cannot take advantage of that currently.
int
send_batch(sock_t sock, batch_t *batch, UNUSED int attempts)
{
// On send thread 0, send any batches that have been
// submitted onto the submit_queue before sending the
// actual batch. There should only be packets in the
// submit_queue very infrequently.
if (sock.nm.tx_ring_idx == 0) {
while (!is_empty(submit_queue)) {
znode_t *node = pop_front(submit_queue);
batch_t *extra_batch = (batch_t *)node->data;
assert(extra_batch->len > 0);
free(node);
if (send_batch_internal(sock, extra_batch) != extra_batch->len) {
log_error("send-netmap", "Failed to send extra batch of %u submitted packet(s)", extra_batch->len);
} else {
log_debug("send-netmap", "Sent extra batch of %u submitted packet(s)", extra_batch->len);
}
free_packet_batch(extra_batch);
}
}

if (batch->len == 0) {
return 0;
}

return send_batch_internal(sock, batch);
}
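
The submit_queue mechanism above is, at its core, a small producer/consumer handoff: the recv thread may need to emit an ARP reply before or after the send threads exist, so the queue is created lazily via pthread_once and send thread 0 drains it on its next pass through send_batch. A generic sketch of that pattern follows, using illustrative names rather than ZMap's batch_t and lib/queue.h API (queue_init, push_back, pop_front in the diff play the same roles).

#include <pthread.h>
#include <stdlib.h>

struct work { struct work *next; /* packet bytes would live here */ };

static pthread_once_t q_once = PTHREAD_ONCE_INIT;
static pthread_mutex_t q_lock;
static struct work *q_head, *q_tail;

static void q_init(void) { pthread_mutex_init(&q_lock, NULL); }

/* Producer (recv thread): safe whether or not the consumer has started. */
static void q_submit(struct work *w)
{
	pthread_once(&q_once, q_init);
	pthread_mutex_lock(&q_lock);
	w->next = NULL;
	if (q_tail) q_tail->next = w; else q_head = w;
	q_tail = w;
	pthread_mutex_unlock(&q_lock);
}

/* Consumer (send thread 0): drain submitted work before the regular batch.
 * Takes ownership of each item and frees it after sending. */
static void q_drain(void (*send_one)(struct work *))
{
	pthread_once(&q_once, q_init);
	for (;;) {
		pthread_mutex_lock(&q_lock);
		struct work *w = q_head;
		if (w) { q_head = w->next; if (!q_head) q_tail = NULL; }
		pthread_mutex_unlock(&q_lock);
		if (!w) break;
		send_one(w);
		free(w);
	}
}
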
