Skip to content

Commit

Permalink
better performance
Browse files Browse the repository at this point in the history
This implements dynamic priority-based scheduling in the LAArbiter.
It also improves the host-side Python code.
  • Loading branch information
anuejn committed Nov 15, 2023
1 parent a9faebc commit dede629
Show file tree
Hide file tree
Showing 7 changed files with 346 additions and 155 deletions.
169 changes: 110 additions & 59 deletions software/glasgow/applet/interface/better_la/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict
import io
import logging
import argparse
from vcd import VCDWriter
Expand All @@ -9,27 +10,49 @@
from ....gateware.analyzer import *
from ... import *
from .signal_compressor import SignalCompressor
from .arbeiter import LAArbeiter

# This LA uses a simple protocol for sending compressed values over the FIFO:
# Each packet starts with an 8-bit size word. If the size is 0, the packet consists of only that
# word. If the size is n != 0, the packet is n*2 bytes long. Each 16-bit word is encoded according
# to the format described in the SignalCompressor value. The packets are round-robin for each pin.
from .arbiter import LAarbiter

# This LA uses a simple protocol for sending compressed values over the FIFO which is explained
# in the arbiter.py (high level chunks) and signal_compressor.py (low level packets) files.
# The basic architecture is as follows:
#          +------------------+       +--------+
# Pin0 --->| SignalCompressor |------>|  FIFO  |-----+
#          +------------------+       +--------+     |
#                                                    |
#          +------------------+       +--------+     |
# Pin1 --->| SignalCompressor |------>|  FIFO  |-----+     +-----------+      +----------+
#          +------------------+       +--------+     |     |           |      |          |
#                                                    +---->| LAArbiter |----->| USB-FIFO |
#          +------------------+       +--------+     |     |           |      |          |
# Pin2 --->| SignalCompressor |------>|  FIFO  |-----+     +-----------+      +----------+
#          +------------------+       +--------+     |
#                                                    |
#          +------------------+       +--------+     |
# PinN --->|       ...        |------>|  ...   |-----+
#          +------------------+       +--------+

class BetterLASubtarget(Elaboratable):
def __init__(self, pads, in_fifo):
def __init__(self, pads, in_fifo, counter_target=False):
self.pads = pads
self.in_fifo = in_fifo
self.counter_target = counter_target

self.la = LAArbeiter(in_fifo)
self.la = LAarbiter(in_fifo)

def elaborate(self, platform):
m = Module()
m.submodules += self.la

pins_i = Signal.like(self.pads.i_t.i)
m.submodules += FFSynchronizer(self.pads.i_t.i, pins_i)
m.d.comb += self.la.input.eq(pins_i)
if self.counter_target:
print("building bitstream with simulated counter target")
counter = Signal(len(self.pads.i_t.i)+2)
m.d.sync += counter.eq(counter + 1)
m.d.comb += self.la.input.eq(counter[2:])
else:
print("building bitstream connected to real target")
pins_i = Signal.like(self.pads.i_t.i)
m.submodules += FFSynchronizer(self.pads.i_t.i, pins_i)
m.d.comb += self.la.input.eq(pins_i)

return m

Expand All @@ -46,12 +69,17 @@ def add_build_arguments(cls, parser, access):
super().add_build_arguments(parser, access)

access.add_pin_set_argument(parser, "i", width=range(1, 17), default=1)
parser.add_argument(
"--counter-target", default=False, action="store_true",
help="simulate a target with a counter signal",
)

def build(self, target, args):
self.mux_interface = iface = target.multiplexer.claim_interface(self, args)
iface.add_subtarget(BetterLASubtarget(
pads=iface.get_pads(args, pin_sets=("i",)),
in_fifo=iface.get_in_fifo(depth=512*16),
in_fifo=iface.get_in_fifo(depth=512*16, auto_flush=False),
counter_target=args.counter_target
))

self._sample_freq = target.sys_clk_freq
Expand Down Expand Up @@ -85,53 +113,76 @@ def add_interact_arguments(cls, parser):
parser.add_argument(
"file", metavar="VCD-FILE", type=argparse.FileType("w"),
help="write VCD waveforms to VCD-FILE")
parser.add_argument("--buffer-size", type=int, default=10,
help="how much data to capture in MB")

async def interact(self, device, args, iface):
pins = defaultdict(list)
overrun = False

zero_chunks = 0
chunks = 0
try: # this try catches Ctrl+C for being able to manually interrupt capture
while not overrun:
for p in self._pins:
pkgs = await LAArbeiter.read_chunk(iface.read)
if len(pkgs) == 0:
zero_chunks += 1
chunks += 1
pins[p].extend(pkgs)
if len(pkgs) > 255 - len(self._pins):
overrun = True
print("overrun")
# Step 1: record a buffer
# we do this first, before any parsing, to get the full USB performance and avoid lag spikes during capture
try:
print(f"starting capture of {args.buffer_size} MB")
buffer = await iface.read(1024*1024 * args.buffer_size)
except KeyboardInterrupt:
pass
finally:
events = []
cycles = 0
for p, pkgs in pins.items():
cycle = 0
for pkg in pkgs:
for value, duration in SignalCompressor.decode_pkg(pkg):
timestamp = cycle * 1_000_000_000 // self._sample_freq
events.append((timestamp, p, value))
cycle += duration
cycles = max(cycle, cycles)
events.sort(key=lambda e: e[0])

total_pkgs = sum(len(pkgs) for pkgs in pins.values())
total_bytes = chunks + total_pkgs * 2

print(f"captured {cycles} cycles")
print(f"chunking overhead: {chunks / total_bytes * 100}%")
print(f"zero chunks overhead: {zero_chunks / total_bytes * 100}%")
print(f"compression gain: {100 - (total_bytes * 8 / cycle * 100)}%")


vcd_writer = VCDWriter(args.file, timescale="1 ns", check_values=False)
vcd_signals = {
p: vcd_writer.register_var(scope="", name="pin[{}]".format(p), var_type="wire",
size=1, init=0)
for p in pins.keys()
}
for timestamp, p, value in events:
signal = vcd_signals[p]
vcd_writer.change(signal, timestamp, value)
vcd_writer.close(timestamp)
print("captured buffer, converting...")


# Step 2: parse the packets from the captured buffer and sort them into channels
ptr = 0
async def read(size, ) -> bytes:
nonlocal ptr
to_return = buffer[ptr:ptr+size]
ptr += size
if ptr >= len(buffer):
return None
return to_return
channels = defaultdict(list)
chunks = 0
while True:
read_result = await LAarbiter.read_chunk(read)
if read_result is None:
break
channel, chunk = read_result
if len(chunk) == 255:
print(f"channel {channel} overrun")
break
channels[self._pins[channel]].extend(chunk)
chunks += 1

# Step 3: convert each channel's packets into events, attach timestamps and sort them by
# timestamp
events = []
cycles = None
for p, pkgs in channels.items():
cycle = 0
for pkg in pkgs:
for value, duration in SignalCompressor.decode_pkg(pkg):
events.append((cycle, p, value))
cycle += duration
cycles = cycle if cycles is None else cycle if cycle < cycles else cycles
events.sort(key=lambda e: e[0])

# Step 3.5: report statistics
total_pkgs = sum(len(pkgs) for pkgs in channels.values())
total_bytes = chunks + total_pkgs * 2
print(f"captured {cycles} samples ({cycles / self._sample_freq * 1000}ms)")
print(f"chunking overhead: {chunks / total_bytes * 100}%")
print(f"compression gain: {100 - (total_bytes * 8 / (cycle * len(self._pins)) * 100)}%")


# Step 4: write out VCD file
vcd_writer = VCDWriter(args.file, timescale="1 ns", check_values=False)
vcd_signals = {
p: vcd_writer.register_var(scope="", name="pin[{}]".format(p), var_type="wire",
size=1, init=0)
for p in self._pins
}
for cycle, p, value in events:
if cycle > cycles:
# we don't write any timestamps for which we don't have data on all channels
break
signal = vcd_signals[p]
timestamp = cycle * 1_000_000_000 // self._sample_freq
vcd_writer.change(signal, timestamp, value)
vcd_writer.close(timestamp)
86 changes: 0 additions & 86 deletions software/glasgow/applet/interface/better_la/arbeiter.py

This file was deleted.

0 comments on commit dede629

Please sign in to comment.