netmap: Reply to ARP requests from gateway for scan source IPs (#807)

* netmap: Reply to ARP requests from gateway for scan source IPs

In netmap mode, the OS network stack never gets to see incoming packets
unless we explicitly forward them to the host rings, so the kernel will
not respond to ARP requests. To remove the need for static ARP entries
on the gateway, respond to ARP requests from the gateway for any of the
source IPs of the scan.

* PR feedback: Document why ARP is handled in netmap mode

---------

Co-authored-by: Phillip Stephens <phillip@cs.stanford.edu>
droe and phillip-stephens committed Mar 5, 2024
1 parent f0ba1ad commit 660f7d9
Showing 3 changed files with 208 additions and 55 deletions.
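
Before the per-file diffs, a minimal standalone sketch of the ARP-reply logic this change adds to recv-netmap.c may help orientation. It is a simplified illustration, not the committed code: my_mac and my_ip stand in for zconf.hw_mac and a scan source IP, the gateway-MAC check and the netmap TX path are left out, and memcpy is used for the 32-bit address fields instead of the cast-based access macros used below.

/* Sketch only: parse an incoming Ethernet/ARP request and, if it asks
 * for my_ip, build a reply claiming my_mac. out must hold
 * ARP_SKETCH_PKT_LEN bytes; returns the reply length, or 0 to ignore. */
#include <net/ethernet.h>
#include <net/if_arp.h>
#include <netinet/in.h>
#include <stdint.h>
#include <string.h>

#define ARP_SKETCH_PKT_LEN (sizeof(struct ether_header) + sizeof(struct arphdr) + \
                            2 * ETHER_ADDR_LEN + 2 * sizeof(uint32_t))

static size_t
arp_reply_sketch(const uint8_t *req, size_t req_len,
                 const uint8_t my_mac[ETHER_ADDR_LEN], uint32_t my_ip,
                 uint8_t *out)
{
	if (req_len < ARP_SKETCH_PKT_LEN)
		return 0;
	const struct ether_header *req_eh = (const struct ether_header *)req;
	const struct arphdr *req_ah = (const struct arphdr *)(req_eh + 1);
	/* Ethernet/IPv4 ARP body: sender MAC, sender IP, target MAC, target IP. */
	const uint8_t *sha = (const uint8_t *)(req_ah + 1);
	const uint8_t *spa = sha + ETHER_ADDR_LEN;
	const uint8_t *tpa = spa + sizeof(uint32_t) + ETHER_ADDR_LEN;

	uint32_t target_ip;
	memcpy(&target_ip, tpa, sizeof(target_ip));
	if (req_eh->ether_type != htons(ETHERTYPE_ARP) ||
	    req_ah->ar_hrd != htons(ARPHRD_ETHER) ||
	    req_ah->ar_pro != htons(ETHERTYPE_IP) ||
	    req_ah->ar_op != htons(ARPOP_REQUEST) ||
	    target_ip != my_ip)
		return 0;

	/* Ethernet header: from us, unicast back to the requester. */
	struct ether_header *eh = (struct ether_header *)out;
	memcpy(eh->ether_shost, my_mac, ETHER_ADDR_LEN);
	memcpy(eh->ether_dhost, sha, ETHER_ADDR_LEN);
	eh->ether_type = htons(ETHERTYPE_ARP);

	/* ARP reply: we are the sender, the requester becomes the target. */
	struct arphdr *ah = (struct arphdr *)(eh + 1);
	ah->ar_hrd = htons(ARPHRD_ETHER);
	ah->ar_pro = htons(ETHERTYPE_IP);
	ah->ar_hln = ETHER_ADDR_LEN;
	ah->ar_pln = sizeof(uint32_t);
	ah->ar_op = htons(ARPOP_REPLY);
	uint8_t *body = (uint8_t *)(ah + 1);
	memcpy(body, my_mac, ETHER_ADDR_LEN);                                        /* sender MAC */
	memcpy(body + ETHER_ADDR_LEN, tpa, sizeof(uint32_t));                        /* sender IP  */
	memcpy(body + ETHER_ADDR_LEN + sizeof(uint32_t), sha, ETHER_ADDR_LEN);       /* target MAC */
	memcpy(body + 2 * ETHER_ADDR_LEN + sizeof(uint32_t), spa, sizeof(uint32_t)); /* target IP  */
	return ARP_SKETCH_PKT_LEN;
}
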
166 changes: 130 additions & 36 deletions src/recv-netmap.c
@@ -23,6 +23,7 @@
#include "../lib/logger.h"

#include <net/netmap_user.h>
#include <net/if_arp.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <fcntl.h>
@@ -33,41 +34,121 @@
#include <assert.h>
#include <inttypes.h>

static struct pollfd fds;
static struct netmap_if *nm_if;
static bool *in_multi_seg_packet;
static void handle_packet_wait_ping(uint32_t buflen, const uint8_t *bytes, UNUSED const struct timespec ts);
static void (*handle_packet_func)(uint32_t buflen, const uint8_t *bytes, const struct timespec ts);
static if_stats_ctx_t *stats_ctx;
static bool need_recv_counter;
static uint64_t recv_counter;
typedef size_t (*make_packet_func_t)(uint8_t *buf, void const *arg);

// Send a packet on a netmap ring and fd directly.
// Used to send packets before send threads are up.
static void
handle_packet_wait_ping(uint32_t buflen, const uint8_t *bytes, UNUSED const struct timespec ts)
send_packet(make_packet_func_t mkpkt, void const *arg)
{
if (buflen < sizeof(struct ether_header) + sizeof(struct ip) + ICMP_MINLEN) {
// Synthesize a sock_t for the main netmap fd.
// We're syncing all TX rings this way, not just ring 0.
sock_t sock;
sock.nm.tx_ring_idx = 0;
sock.nm.tx_ring_fd = zconf.nm.nm_fd;

batch_t *batch = create_packet_batch(1);
batch->lens[0] = (int)mkpkt((uint8_t *)batch->packets, arg);
assert(batch->lens[0] <= MAX_PACKET_SIZE);
batch->ips[0] = 0; // unused by netmap
batch->len = 1;
if (send_batch_internal(sock, batch) != 1) {
log_fatal("recv-netmap", "Failed to send packet: %d: %s", errno, strerror(errno));
}
free_packet_batch(batch);
}

// Submit a packet for sending by send thread 0.
// Used to send packets after send threads are up.
// Submitted packets are sent once per scan batch.
static void
submit_packet(make_packet_func_t mkpkt, void const *arg)
{
batch_t *batch = create_packet_batch(1);
batch->lens[0] = (int)mkpkt((uint8_t *)batch->packets, arg);
assert(batch->lens[0] <= MAX_PACKET_SIZE);
batch->ips[0] = 0; // unused by netmap
batch->len = 1;
submit_batch_internal(batch); // consumes batch
}

// In netmap mode, the OS network stack never gets to see incoming packets
// unless we explicitly forward them to the host rings; hence the kernel will
// not be responding to ARP requests. To remove the need for static ARP
// entries on the gateway, respond to ARP requests from the gateway for any of
// the source IPs of the scan.

#define ARP_ETHER_INET_PKT_LEN (sizeof(struct ether_header) + sizeof(struct arphdr) + 2 * ETHER_ADDR_LEN + 2 * sizeof(uint32_t))
#define x_ar_sha(ap) ((uint8_t *)((ap) + 1))
#define x_ar_spa(ap) (((uint8_t *)((ap) + 1)) + ETHER_ADDR_LEN)
#define x_ar_tha(ap) (((uint8_t *)((ap) + 1)) + ETHER_ADDR_LEN + sizeof(uint32_t))
#define x_ar_tpa(ap) (((uint8_t *)((ap) + 1)) + 2 * ETHER_ADDR_LEN + sizeof(uint32_t))

static size_t
make_arp_resp(uint8_t *buf, void const *arg)
{
struct arphdr const *req_ah = (struct arphdr const *)arg;

struct ether_header *eh = (struct ether_header *)buf;
memcpy(eh->ether_shost, zconf.hw_mac, ETHER_ADDR_LEN);
memcpy(eh->ether_dhost, x_ar_sha(req_ah), ETHER_ADDR_LEN);
eh->ether_type = htons(ETHERTYPE_ARP);

struct arphdr *ah = (struct arphdr *)(eh + 1);
ah->ar_hrd = htons(ARPHRD_ETHER);
ah->ar_pro = htons(ETHERTYPE_IP);
ah->ar_hln = ETHER_ADDR_LEN;
ah->ar_pln = sizeof(uint32_t);
ah->ar_op = htons(ARPOP_REPLY);
memcpy(x_ar_sha(ah), zconf.hw_mac, ETHER_ADDR_LEN);
*(uint32_t *)x_ar_spa(ah) = *(uint32_t *)x_ar_tpa(req_ah);
memcpy(x_ar_tha(ah), x_ar_sha(req_ah), ETHER_ADDR_LEN);
*(uint32_t *)x_ar_tpa(ah) = *(uint32_t *)x_ar_spa(req_ah);

return ARP_ETHER_INET_PKT_LEN;
}

static void
handle_packet_arp(uint32_t buflen, const uint8_t *bytes, UNUSED const struct timespec ts)
{
if (buflen < ARP_ETHER_INET_PKT_LEN) {
return;
}
struct ether_header *eh = (struct ether_header *)bytes;
if (eh->ether_type != htons(ETHERTYPE_IP)) {
if (eh->ether_type != htons(ETHERTYPE_ARP)) {
return;
}
struct ip *iph = (struct ip *)(eh + 1);
if (iph->ip_v != 4 ||
iph->ip_p != IPPROTO_ICMP ||
iph->ip_src.s_addr != zconf.nm.wait_ping_dstip) {
struct arphdr *ah = (struct arphdr *)(eh + 1);
if (ah->ar_op != htons(ARPOP_REQUEST) ||
ah->ar_hrd != htons(ARPHRD_ETHER) ||
ah->ar_pro != htons(ETHERTYPE_IP) ||
ah->ar_hln != ETHER_ADDR_LEN ||
ah->ar_pln != sizeof(uint32_t)) {
return;
}
struct icmp *icmph = (struct icmp *)(iph + 1);
if (icmph->icmp_type != ICMP_ECHOREPLY) {
macaddr_t *sender_hardware_address = (macaddr_t *)x_ar_sha(ah);
if (memcmp(sender_hardware_address, eh->ether_shost, ETHER_ADDR_LEN) != 0 ||
memcmp(sender_hardware_address, zconf.gw_mac, ETHER_ADDR_LEN) != 0) {
return;
}

log_debug("recv-netmap", "Received ICMP echo reply, ready to commence scan");
handle_packet_func = handle_packet;
in_addr_t target_protocol_address = *(in_addr_t *)x_ar_tpa(ah);
for (size_t i = 0; i < zconf.number_source_ips; i++) {
if (target_protocol_address == zconf.source_ip_addresses[i]) {
log_debug("recv-netmap", "Received ARP request from gateway");
if (handle_packet_func == handle_packet_wait_ping) {
send_packet(make_arp_resp, (void const *)ah);
} else {
submit_packet(make_arp_resp, (void const *)ah);
}
return;
}
}
}

static size_t
make_wait_ping_req(uint8_t *buf)
make_wait_ping_req(uint8_t *buf, UNUSED void const *arg)
{
struct ether_header *eh = (struct ether_header *)buf;
make_eth_header(eh, zconf.hw_mac, zconf.gw_mac);
@@ -89,17 +170,28 @@ make_wait_ping_req(uint8_t *buf)
}

static void
send_wait_ping_req(sock_t sock)
handle_packet_wait_ping(uint32_t buflen, const uint8_t *bytes, UNUSED const struct timespec ts)
{
batch_t *batch = create_packet_batch(1);
batch->lens[0] = (int)make_wait_ping_req((uint8_t *)batch->packets);
batch->ips[0] = zconf.nm.wait_ping_dstip;
batch->len = 1;
if (send_batch(sock, batch, 1) != 1) {
log_fatal("recv-netmap", "Failed to send ICMP echo request: %d: %s", errno, strerror(errno));
if (buflen < sizeof(struct ether_header) + sizeof(struct ip) + ICMP_MINLEN) {
return;
}
free_packet_batch(batch);
log_debug("recv-netmap", "Sent ICMP echo request");
struct ether_header *eh = (struct ether_header *)bytes;
if (eh->ether_type != htons(ETHERTYPE_IP)) {
return;
}
struct ip *iph = (struct ip *)(eh + 1);
if (iph->ip_v != 4 ||
iph->ip_p != IPPROTO_ICMP ||
iph->ip_src.s_addr != zconf.nm.wait_ping_dstip) {
return;
}
struct icmp *icmph = (struct icmp *)(iph + 1);
if (icmph->icmp_type != ICMP_ECHOREPLY) {
return;
}

log_debug("recv-netmap", "Received ICMP echo reply, ready to commence scan");
handle_packet_func = handle_packet;
}

#ifndef NSEC_PER_SEC
@@ -135,13 +227,6 @@ wait_for_e2e_connectivity(void)
{
static const time_t timeout_secs = 60;

// Synthesize a sock_t for the main netmap fd.
// This is safe as long as send threads are not spun up yet.
// We're syncing all TX rings this way, not just ring 0.
sock_t sock;
sock.nm.tx_ring_idx = 0;
sock.nm.tx_ring_fd = zconf.nm.nm_fd;

struct timespec t_start;
timespec_get_monotonic(&t_start);
struct timespec t_last_send;
@@ -159,14 +244,22 @@ wait_for_e2e_connectivity(void)
}

if (timespec_diff(&t_now, &t_last_send).tv_sec >= 1) {
send_wait_ping_req(sock);
send_packet(make_wait_ping_req, NULL);
timespec_get_monotonic(&t_last_send);
log_debug("recv-netmap", "Sent ICMP echo request");
}

recv_packets();
}
}

static struct pollfd fds;
static struct netmap_if *nm_if;
static bool *in_multi_seg_packet;
static if_stats_ctx_t *stats_ctx;
static bool need_recv_counter;
static uint64_t recv_counter;

void recv_init(void)
{
fds.fd = zconf.nm.nm_fd;
@@ -276,6 +369,7 @@ recv_packets(void)
if (need_recv_counter) {
recv_counter++;
}
handle_packet_arp(slot->len, (uint8_t *)buf, ts);
handle_packet_func(slot->len, (uint8_t *)buf, ts);
}
rxring->cur = rxring->head = head;
3 changes: 3 additions & 0 deletions src/send-internal.h
@@ -16,6 +16,9 @@ int send_batch(sock_t sock, batch_t *batch, int retries);

#if defined(PFRING)
#include "send-pfring.h"
#elif defined(NETMAP)
void submit_batch_internal(batch_t *batch);
int send_batch_internal(sock_t sock, batch_t *batch);
#elif defined(__linux__)
#include "send-linux.h"
#endif
94 changes: 75 additions & 19 deletions src/send-netmap.c
@@ -20,16 +20,32 @@
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>

#include "../lib/includes.h"
#include "../lib/logger.h"
#include "../lib/queue.h"

#include "socket.h"
#include "state.h"

static pthread_once_t submit_queue_inited = PTHREAD_ONCE_INIT;
static zqueue_t *submit_queue;

static void
submit_queue_init_once(void)
{
submit_queue = queue_init();
assert(submit_queue);
}

int
send_run_init(sock_t sock)
{
if (sock.nm.tx_ring_idx == 0) {
pthread_once(&submit_queue_inited, submit_queue_init_once);
}

struct pollfd fds = {
.fd = sock.nm.tx_ring_fd,
.events = POLLOUT,
@@ -42,27 +58,23 @@ send_run_init(sock_t sock)
return 0;
}

// This implementation does not use attempts, because retries do not
// make sense based on the premise that syncing a TX ring will never
// fail for transient reasons.
//
// This implementation never reports batches as partially failed,
// because the netmap API does not have partial failure semantics.
// All we know is that a poll or ioctl syscall failed, not if or
// how many of the packets we placed in the ringbuffer were sent.
//
// ZMap's current architecture forces us to copy packet data here.
// An even more optimised implementation might reuse packet data
// in buffers (unless NS_BUF_CHANGED has been set by the kernel on
// a slot), and only update the fields that need to change, such
// as dst IP, checksum etc depending on scan type and params.
int
send_batch(sock_t sock, batch_t *batch, UNUSED int attempts)
// Called from the recv thread to submit a batch of packets
// for sending on thread 0; typically batch size is just 1.
// Used for responding to ARP requests.
// The way this works is rather inefficient and only makes
// sense for low volume packets.
// Since we don't know if send_run_init() has been called
// yet or not, we need to ensure the queue is initialized.
void
submit_batch_internal(batch_t *batch)
{
if (batch->len == 0) {
return 0;
}
pthread_once(&submit_queue_inited, submit_queue_init_once);
push_back((void *)batch, submit_queue);
}

int
send_batch_internal(sock_t sock, batch_t *batch)
{
struct netmap_ring *ring = NETMAP_TXRING(zconf.nm.nm_if, sock.nm.tx_ring_idx);
struct pollfd fds = {
.fd = sock.nm.tx_ring_fd,
@@ -96,3 +108,47 @@ send_batch(sock_t sock, batch_t *batch, UNUSED int attempts)

return batch->len;
}

// Netmap's send_batch does not use attempts, because retries do
// not make sense based on the premise that syncing a TX ring will
// never fail for transient reasons.
//
// Netmap's send_batch never reports batches as partially failed,
// because the netmap API does not have partial failure semantics.
// All we know is that a poll or ioctl syscall failed, not if or
// how many of the packets we placed in the ringbuffer were sent.
//
// There is a bit of unused optimization potential here; ZMap's
// current architecture requires us to copy packet data on the
// send path, we cannot supply netmap buffers to ZMap to write
// into directly. And even though netmap would allow us to reuse
// data still in buffers (unless NS_BUF_CHANGED has been set by
// the kernel), we cannot take advantage of that currently.
int
send_batch(sock_t sock, batch_t *batch, UNUSED int attempts)
{
// On send thread 0, send any batches that have been
// submitted onto the submit_queue before sending the
// actual batch. There should only be packets in the
// submit_queue very infrequently.
if (sock.nm.tx_ring_idx == 0) {
while (!is_empty(submit_queue)) {
znode_t *node = pop_front(submit_queue);
batch_t *extra_batch = (batch_t *)node->data;
assert(extra_batch->len > 0);
free(node);
if (send_batch_internal(sock, extra_batch) != extra_batch->len) {
log_error("send-netmap", "Failed to send extra batch of %u submitted packet(s)", extra_batch->len);
} else {
log_debug("send-netmap", "Sent extra batch of %u submitted packet(s)", extra_batch->len);
}
free_packet_batch(extra_batch);
}
}

if (batch->len == 0) {
return 0;
}

return send_batch_internal(sock, batch);
}
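
The submit_queue mechanism above is, at its core, a small producer/consumer handoff: the recv thread may need to emit an ARP reply before or after the send threads exist, so the queue is created lazily via pthread_once and send thread 0 drains it on its next pass through send_batch. A generic sketch of that pattern follows, using illustrative names rather than ZMap's batch_t and lib/queue.h API (queue_init, push_back, pop_front in the diff play the same roles).

#include <pthread.h>
#include <stdlib.h>

struct work { struct work *next; /* packet bytes would live here */ };

static pthread_once_t q_once = PTHREAD_ONCE_INIT;
static pthread_mutex_t q_lock;
static struct work *q_head, *q_tail;

static void q_init(void) { pthread_mutex_init(&q_lock, NULL); }

/* Producer (recv thread): safe whether or not the consumer has started. */
static void q_submit(struct work *w)
{
	pthread_once(&q_once, q_init);
	pthread_mutex_lock(&q_lock);
	w->next = NULL;
	if (q_tail) q_tail->next = w; else q_head = w;
	q_tail = w;
	pthread_mutex_unlock(&q_lock);
}

/* Consumer (send thread 0): drain submitted work before the regular batch.
 * Takes ownership of each item and frees it after sending. */
static void q_drain(void (*send_one)(struct work *))
{
	pthread_once(&q_once, q_init);
	for (;;) {
		pthread_mutex_lock(&q_lock);
		struct work *w = q_head;
		if (w) { q_head = w->next; if (!q_head) q_tail = NULL; }
		pthread_mutex_unlock(&q_lock);
		if (!w) break;
		send_one(w);
		free(w);
	}
}
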
