From 31e8b50b7c7d44f3a033225df2db000077a07b2e Mon Sep 17 00:00:00 2001 From: Alexander Boettcher Date: Wed, 7 Feb 2024 09:09:38 +0100 Subject: [PATCH] nic/ipxe: batch TX requests A bunch of transmit requests received by the Uplink server (nic_router) are currently added one by one to the ring buffer, and each time the hardware is notified to process that single request. Instead, add as many transmit requests as possible to the ring buffer of the hardware and, when done, trigger the hardware to process the ring. Additionally, don't receive a "processed" TX IRQ for each element in the ring, which causes high CPU load. With this commit the number of TX IRQs in the ipxe driver for an iperf -c X.X.X.X -t 60 from within a VM to the outside iperf server is reduced from about 2'600'000 IRQs to about 200'000. The overall CPU load for the driver (when executed alone on CPU 0) is reduced from ~85 percent load to ~45 percent load. Issue #5149 --- repos/dde_ipxe/include/dde_ipxe/nic.h | 2 + repos/dde_ipxe/patches/intel_tx_batch.patch | 111 ++++++++++++++++++++ repos/dde_ipxe/ports/dde_ipxe.hash | 2 +- repos/dde_ipxe/ports/dde_ipxe.port | 3 +- repos/dde_ipxe/src/drivers/nic/main.cc | 5 + repos/dde_ipxe/src/lib/dde_ipxe/nic.c | 14 ++- 6 files changed, 132 insertions(+), 5 deletions(-) create mode 100644 repos/dde_ipxe/patches/intel_tx_batch.patch diff --git a/repos/dde_ipxe/include/dde_ipxe/nic.h b/repos/dde_ipxe/include/dde_ipxe/nic.h index 5d1d55cbc9..8c45df8b35 100644 --- a/repos/dde_ipxe/include/dde_ipxe/nic.h +++ b/repos/dde_ipxe/include/dde_ipxe/nic.h @@ -68,6 +68,8 @@ extern void dde_ipxe_nic_unregister_callbacks(); */ extern int dde_ipxe_nic_tx(unsigned if_index, const char *packet, unsigned packet_len); +extern void dde_ipxe_nic_tx_done(); + /** * Get MAC address of device * diff --git a/repos/dde_ipxe/patches/intel_tx_batch.patch b/repos/dde_ipxe/patches/intel_tx_batch.patch new file mode 100644 index 0000000000..3617101ce4 --- /dev/null +++ 
b/repos/dde_ipxe/patches/intel_tx_batch.patch @@ -0,0 +1,111 @@ +--- a/src/drivers/net/intel.c ++++ b/src/drivers/net/intel.c +@@ -717,6 +717,23 @@ + intel_reset ( intel ); + } + ++static int intel_transmit_done ( struct net_device *netdev) ++{ ++ struct intel_nic *intel = netdev->priv; ++ ++ unsigned int tx_tail; ++ ++ tx_tail = ( intel->tx.prod % INTEL_NUM_TX_DESC ); ++ ++ wmb(); ++ ++ /* Notify card that there are packets ready to transmit */ ++ writel ( tx_tail, intel->regs + intel->tx.reg + INTEL_xDT ); ++ ++ return 0; ++ ++} ++ + /** + * Transmit packet + * +@@ -745,10 +762,11 @@ + address = virt_to_bus ( iobuf->data ); + len = iob_len ( iobuf ); + intel->tx.describe ( tx, address, len ); +- wmb(); + +- /* Notify card that there are packets ready to transmit */ +- writel ( tx_tail, intel->regs + intel->tx.reg + INTEL_xDT ); ++ /* ptr check just in case we make transmit_done configurable - set/unset */ ++ if ((intel->tx.cons == ((tx_tail + 1) % INTEL_NUM_TX_DESC)) || !netdev->op->transmit_done) { ++ intel_transmit_done(netdev); ++ } + + DBGC2 ( intel, "INTEL %p TX %d is [%llx,%llx)\n", intel, tx_idx, + ( ( unsigned long long ) address ), +@@ -845,7 +863,7 @@ + return; + + /* Poll for TX completions, if applicable */ +- if ( icr & INTEL_IRQ_TXDW ) ++ if ( icr & (INTEL_IRQ_TXDW | INTEL_IRQ_TXQE)) + intel_poll_tx ( netdev ); + + /* Poll for RX completions, if applicable */ +@@ -882,7 +900,7 @@ + struct intel_nic *intel = netdev->priv; + uint32_t mask; + +- mask = ( INTEL_IRQ_TXDW | INTEL_IRQ_LSC | INTEL_IRQ_RXT0 ); ++ mask = ( INTEL_IRQ_TXQE | INTEL_IRQ_LSC | INTEL_IRQ_RXT0 ); + if ( enable ) { + writel ( mask, intel->regs + INTEL_IMS ); + } else { +@@ -897,6 +915,7 @@ + .transmit = intel_transmit, + .poll = intel_poll, + .irq = intel_irq, ++ .transmit_done = intel_transmit_done, + }; + + /****************************************************************************** +--- a/src/include/ipxe/netdevice.h ++++ b/src/include/ipxe/netdevice.h +@@ -260,6 +260,8 @@ + * 
supported. + */ + void ( * irq ) ( struct net_device *netdev, int enable ); ++ ++ int ( * transmit_done ) ( struct net_device *netdev); + }; + + /** Network device error */ +@@ -575,6 +577,7 @@ + extern void netdev_link_err ( struct net_device *netdev, int rc ); + extern void netdev_link_down ( struct net_device *netdev ); + extern int netdev_tx ( struct net_device *netdev, struct io_buffer *iobuf ); ++extern int netdev_tx_done ( struct net_device *netdev ); + extern void netdev_tx_defer ( struct net_device *netdev, + struct io_buffer *iobuf ); + extern void netdev_tx_err ( struct net_device *netdev, +--- a/src/net/netdevice.c ++++ b/src/net/netdevice.c +@@ -213,6 +213,22 @@ + return rc; + } + ++int netdev_tx_done ( struct net_device *netdev ) { ++ ++ int rc; ++ ++ /* Avoid calling transmit() on unopened network devices */ ++ if ( ! netdev_is_open ( netdev ) ) { ++ rc = -ENETUNREACH; ++ return rc; ++ } ++ ++ if ( !netdev->op || !netdev->op->transmit_done ) ++ return -ENETUNREACH; ++ ++ return netdev->op->transmit_done ( netdev ); ++} ++ + /** + * Defer transmitted packet + * diff --git a/repos/dde_ipxe/ports/dde_ipxe.hash b/repos/dde_ipxe/ports/dde_ipxe.hash index be9467c951..17cbf4fa29 100644 --- a/repos/dde_ipxe/ports/dde_ipxe.hash +++ b/repos/dde_ipxe/ports/dde_ipxe.hash @@ -1 +1 @@ -79cd5d4ab2b1451966107c40fb0890a4a206279d +c20a39b17bca4d9cefcee43f366f5c23296396e0 diff --git a/repos/dde_ipxe/ports/dde_ipxe.port b/repos/dde_ipxe/ports/dde_ipxe.port index 1fbd8d6b9d..426ec7c82b 100644 --- a/repos/dde_ipxe/ports/dde_ipxe.port +++ b/repos/dde_ipxe/ports/dde_ipxe.port @@ -10,7 +10,8 @@ PATCHES := patches/dde_ipxe.patch \ patches/intel.patch \ patches/intel_update.patch \ patches/tg3.patch \ - patches/realtek.patch + patches/realtek.patch \ + patches/intel_tx_batch.patch PATCH_OPT := -p1 -d ${DIR(ipxe)} diff --git a/repos/dde_ipxe/src/drivers/nic/main.cc b/repos/dde_ipxe/src/drivers/nic/main.cc index 2f3a59b4d0..b364b15e9b 100644 --- 
a/repos/dde_ipxe/src/drivers/nic/main.cc +++ b/repos/dde_ipxe/src/drivers/nic/main.cc @@ -94,6 +94,11 @@ class Uplink_client : public Uplink_client_base return Transmit_result::REJECTED; } + void _drv_finish_transmitted_pkts() override + { + dde_ipxe_nic_tx_done(); + } + public: Uplink_client(Env &env, diff --git a/repos/dde_ipxe/src/lib/dde_ipxe/nic.c b/repos/dde_ipxe/src/lib/dde_ipxe/nic.c index ddf656deba..905f9dec95 100644 --- a/repos/dde_ipxe/src/lib/dde_ipxe/nic.c +++ b/repos/dde_ipxe/src/lib/dde_ipxe/nic.c @@ -239,6 +239,14 @@ int dde_ipxe_nic_link_state(unsigned if_index) } +void dde_ipxe_nic_tx_done() +{ + dde_lock_enter(); + netdev_tx_done(net_dev); + dde_lock_leave(); +} + + int dde_ipxe_nic_tx(unsigned if_index, const char *packet, unsigned packet_len) { if (if_index != 1) @@ -248,13 +256,13 @@ int dde_ipxe_nic_tx(unsigned if_index, const char *packet, unsigned packet_len) struct io_buffer *iobuf = alloc_iob(packet_len); - dde_lock_leave(); + if (!iobuf) { + dde_lock_leave(); - if (!iobuf) return -1; + } memcpy(iob_put(iobuf, packet_len), packet, packet_len); - dde_lock_enter(); netdev_poll(net_dev); netdev_tx(net_dev, iob_disown(iobuf));