Update the DPDK version from 19.11 to 20.11 and add hns3 PMD support for Kunpeng 920 and Kunpeng 930.

Signed-off-by: speech_white <humin29@huawei.com>
From 73fce35c11863c8cccc5444f054e5820f303e9bd Mon Sep 17 00:00:00 2001
From: Chengwen Feng <fengchengwen@huawei.com>
Date: Fri, 30 Apr 2021 14:28:48 +0800
Subject: [PATCH 149/189] net/hns3: improve IO path data cache usage

This patch improves data cache usage by:
1. Rearranging the frequently accessed rxq fields in the IO path into
   the first 128B.
2. Rearranging the frequently accessed txq fields in the IO path into
   the first 64B.
3. Aligning the ptype table to the full cacheline size (128B) instead
   of the minimum cacheline size (64B), because the L1/L2 cachelines
   are 64B while the L3 cacheline is 128B on the Kunpeng ARM platform.

The performance gain is 1.5% in the 64B packet macfwd scenario.

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
Signed-off-by: Min Hu (Connor) <humin29@huawei.com>
---
 drivers/net/hns3/hns3_ethdev.h |   4 +-
 drivers/net/hns3/hns3_rxtx.h   | 126 ++++++++++++++++++++++++-----------------
 2 files changed, 77 insertions(+), 53 deletions(-)
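
A note on points 1 and 2 of the commit message: the layout goal can be
checked at compile time. The C sketch below is illustrative and not part
of the patch (git am ignores text placed here, between the diffstat and
the diff); demo_rx_queue is a trimmed, hypothetical stand-in for
hns3_rx_queue, and the 128B bound assumes the Kunpeng L3 cacheline.

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical stand-in for hns3_rx_queue: hot (I/O path) fields
     * first, cold (control path) fields after them. */
    struct demo_rx_queue {
            /* hot: touched on every receive burst */
            volatile void *io_head_reg;
            void *ptype_tbl;
            void *mb_pool;
            void *rx_ring;
            void *sw_ring;
            uint16_t port_id;
            uint16_t nb_rx_desc;
            /* cold: configuration and control path only */
            void *io_base;
            uint64_t rx_ring_phys_addr;
            uint16_t queue_id;
    };

    /* The L3 cacheline on Kunpeng is 128B: every hot field must sit
     * before the first cold field, and that boundary within 128B. */
    static_assert(offsetof(struct demo_rx_queue, io_base) <= 128,
                  "hot rx queue fields must fit in the first 128 bytes");

Asserting on the offset of the first cold field keeps the check valid
when hot fields are reordered; a txq analogue would bound the hot prefix
at 64B instead.
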
diff --git a/drivers/net/hns3/hns3_ethdev.h b/drivers/net/hns3/hns3_ethdev.h
index 00fedf0..c70950a 100644
--- a/drivers/net/hns3/hns3_ethdev.h
+++ b/drivers/net/hns3/hns3_ethdev.h
@@ -738,7 +738,7 @@ struct hns3_ptype_table {
	 * descriptor, it functions only when firmware report the capability of
	 * HNS3_CAPS_RXD_ADV_LAYOUT_B and driver enabled it.
	 */
-	uint32_t ptype[HNS3_PTYPE_NUM] __rte_cache_min_aligned;
+	uint32_t ptype[HNS3_PTYPE_NUM] __rte_cache_aligned;
 };
 
 #define HNS3_FIXED_MAX_TQP_NUM_MODE	0
@@ -842,7 +842,7 @@ struct hns3_adapter {
 
	uint64_t dev_caps_mask;
 
-	struct hns3_ptype_table ptype_tbl __rte_cache_min_aligned;
+	struct hns3_ptype_table ptype_tbl __rte_cache_aligned;
 };
 
 enum {
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index 92f01ed..811be96 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -289,22 +289,14 @@ struct hns3_rx_bd_errors_stats {
 };
 
 struct hns3_rx_queue {
-	void *io_base;
	volatile void *io_head_reg;
-	struct hns3_adapter *hns;
	struct hns3_ptype_table *ptype_tbl;
	struct rte_mempool *mb_pool;
	struct hns3_desc *rx_ring;
-	uint64_t rx_ring_phys_addr; /* RX ring DMA address */
-	const struct rte_memzone *mz;
	struct hns3_entry *sw_ring;
-	struct rte_mbuf *pkt_first_seg;
-	struct rte_mbuf *pkt_last_seg;
 
-	uint16_t queue_id;
	uint16_t port_id;
	uint16_t nb_rx_desc;
-	uint16_t rx_buf_len;
	/*
	 * threshold for the number of BDs waited to passed to hardware. If the
	 * number exceeds the threshold, driver will pass these BDs to hardware.
@@ -318,8 +310,6 @@
	/* 4 if DEV_RX_OFFLOAD_KEEP_CRC offload set, 0 otherwise */
	uint8_t crc_len;
 
-	bool rx_deferred_start; /* don't start this queue in dev start */
-	bool configured; /* indicate if rx queue has been configured */
	/*
	 * Indicate whether ignore the outer VLAN field in the Rx BD reported
	 * by the Hardware. Because the outer VLAN is the PVID if the PVID is
@@ -331,23 +321,45 @@
	 * driver does not need to perform PVID-related operation in Rx. At this
	 * point, the pvid_sw_discard_en will be false.
	 */
-	bool pvid_sw_discard_en;
-	bool ptype_en; /* indicate if the ptype field enabled */
-	bool enabled; /* indicate if Rx queue has been enabled */
+	uint8_t pvid_sw_discard_en:1;
+	uint8_t ptype_en:1; /* indicate if the ptype field enabled */
+
+	uint64_t mbuf_initializer; /* value to init mbufs used with vector rx */
+	/* offset_table: used for vector, to solve execute re-order problem */
+	uint8_t offset_table[HNS3_VECTOR_RX_OFFSET_TABLE_LEN + 1];
+
+	uint16_t bulk_mbuf_num; /* indicate bulk_mbuf valid nums */
 
	struct hns3_rx_basic_stats basic_stats;
+
+	struct rte_mbuf *pkt_first_seg;
+	struct rte_mbuf *pkt_last_seg;
+
+	struct rte_mbuf *bulk_mbuf[HNS3_BULK_ALLOC_MBUF_NUM];
+
	/* DFX statistics that driver does not need to discard packets */
	struct hns3_rx_dfx_stats dfx_stats;
	/* Error statistics that driver needs to discard packets */
	struct hns3_rx_bd_errors_stats err_stats;
 
-	struct rte_mbuf *bulk_mbuf[HNS3_BULK_ALLOC_MBUF_NUM];
-	uint16_t bulk_mbuf_num;
-
-	/* offset_table: used for vector, to solve execute re-order problem */
-	uint8_t offset_table[HNS3_VECTOR_RX_OFFSET_TABLE_LEN + 1];
-	uint64_t mbuf_initializer; /* value to init mbufs used with vector rx */
	struct rte_mbuf fake_mbuf; /* fake mbuf used with vector rx */
+
+
+	/*
+	 * The following fields are not accessed in the I/O path, so they are
+	 * placed at the end.
+	 */
+	void *io_base;
+	struct hns3_adapter *hns;
+	uint64_t rx_ring_phys_addr; /* RX ring DMA address */
+	const struct rte_memzone *mz;
+
+	uint16_t queue_id;
+	uint16_t rx_buf_len;
+
+	bool configured; /* indicate if rx queue has been configured */
+	bool rx_deferred_start; /* don't start this queue in dev start */
+	bool enabled; /* indicate if Rx queue has been enabled */
 };
 
 struct hns3_tx_basic_stats {
@@ -407,16 +419,10 @@
 };
 
 struct hns3_tx_queue {
-	void *io_base;
	volatile void *io_tail_reg;
-	struct hns3_adapter *hns;
	struct hns3_desc *tx_ring;
-	uint64_t tx_ring_phys_addr; /* TX ring DMA address */
-	const struct rte_memzone *mz;
	struct hns3_entry *sw_ring;
 
-	uint16_t queue_id;
-	uint16_t port_id;
	uint16_t nb_tx_desc;
	/*
	 * index of next BD whose corresponding rte_mbuf can be released by
@@ -432,21 +438,12 @@
	uint16_t tx_free_thresh;
 
	/*
-	 * For better performance in tx datapath, releasing mbuf in batches is
-	 * required.
-	 * Only checking the VLD bit of the last descriptor in a batch of the
-	 * thresh descriptors does not mean that these descriptors are all sent
-	 * by hardware successfully. So we need to check that the VLD bits of
-	 * all descriptors are cleared. and then free all mbufs in the batch.
-	 * - tx_rs_thresh
-	 *   Number of mbufs released at a time.
-	 *
-	 * - free
-	 *   Tx mbuf free array used for preserving temporarily address of mbuf
-	 *   released back to mempool, when releasing mbuf in batches.
+	 * The minimum length of the packet supported by hardware in the Tx
+	 * direction.
	 */
-	uint16_t tx_rs_thresh;
-	struct rte_mbuf **free;
+	uint8_t min_tx_pkt_len;
+
+	uint8_t max_non_tso_bd_num; /* max BD number of one non-TSO packet */
 
	/*
	 * tso mode.
@@ -464,7 +461,7 @@
	 * checksum of packets that need TSO, so network driver software
	 * not need to recalculate it.
	 */
-	uint8_t tso_mode;
+	uint16_t tso_mode:1;
	/*
	 * udp checksum mode.
	 * value range:
@@ -480,16 +477,10 @@
	 * In this mode, HW does not have the preceding problems and can
	 * directly calculate the checksum of these UDP packets.
	 */
-	uint8_t udp_cksum_mode;
-	/*
-	 * The minimum length of the packet supported by hardware in the Tx
-	 * direction.
-	 */
-	uint32_t min_tx_pkt_len;
+	uint16_t udp_cksum_mode:1;
 
-	uint8_t max_non_tso_bd_num; /* max BD number of one non-TSO packet */
-	bool tx_deferred_start; /* don't start this queue in dev start */
-	bool configured; /* indicate if tx queue has been configured */
+	uint16_t simple_bd_enable:1;
+	uint16_t tx_push_enable:1; /* check whether the tx push is enabled */
	/*
	 * Indicate whether add the vlan_tci of the mbuf to the inner VLAN field
	 * of Tx BD. Because the outer VLAN will always be the PVID when the
@@ -502,11 +493,44 @@
	 * PVID-related operations in Tx. And pvid_sw_shift_en will be false at
	 * this point.
	 */
-	bool pvid_sw_shift_en;
-	bool enabled; /* indicate if Tx queue has been enabled */
+	uint16_t pvid_sw_shift_en:1;
+
+	/*
+	 * For better performance in tx datapath, releasing mbuf in batches is
+	 * required.
+	 * Only checking the VLD bit of the last descriptor in a batch of the
+	 * thresh descriptors does not mean that these descriptors are all sent
+	 * by hardware successfully. So we need to check that the VLD bits of
+	 * all descriptors are cleared. and then free all mbufs in the batch.
+	 * - tx_rs_thresh
+	 *   Number of mbufs released at a time.
+	 *
+	 * - free
+	 *   Tx mbuf free array used for preserving temporarily address of mbuf
+	 *   released back to mempool, when releasing mbuf in batches.
+	 */
+	uint16_t tx_rs_thresh;
+	struct rte_mbuf **free;
 
	struct hns3_tx_basic_stats basic_stats;
	struct hns3_tx_dfx_stats dfx_stats;
+
+
+	/*
+	 * The following fields are not accessed in the I/O path, so they are
+	 * placed at the end.
+	 */
+	void *io_base;
+	struct hns3_adapter *hns;
+	uint64_t tx_ring_phys_addr; /* TX ring DMA address */
+	const struct rte_memzone *mz;
+
+	uint16_t port_id;
+	uint16_t queue_id;
+
+	bool configured; /* indicate if tx queue has been configured */
+	bool tx_deferred_start; /* don't start this queue in dev start */
+	bool enabled; /* indicate if Tx queue has been enabled */
 };
 
 #define HNS3_GET_TX_QUEUE_PEND_BD_NUM(txq) \
-- 
2.7.4
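
A closing note on the __rte_cache_min_aligned to __rte_cache_aligned
change above: DPDK fixes RTE_CACHE_LINE_MIN_SIZE at 64, while
RTE_CACHE_LINE_SIZE is a per-platform build constant and is 128 on
Kunpeng ARM builds. The minimal sketch below mirrors the two macros
locally so it compiles stand-alone; the mirrored values and the
256-entry table size (the driver uses HNS3_PTYPE_NUM) are assumptions
for illustration.

    #include <assert.h>
    #include <stdint.h>

    #define CACHE_LINE_MIN_SIZE 64   /* mirrors RTE_CACHE_LINE_MIN_SIZE */
    #define CACHE_LINE_SIZE     128  /* mirrors RTE_CACHE_LINE_SIZE on Kunpeng */

    #define cache_min_aligned __attribute__((__aligned__(CACHE_LINE_MIN_SIZE)))
    #define cache_aligned     __attribute__((__aligned__(CACHE_LINE_SIZE)))

    /* Before: 64B alignment allows the table to start in the middle of
     * a 128B L3 line, sharing that line with whatever precedes it. */
    struct ptype_tbl_before {
            uint32_t ptype[256] cache_min_aligned;
    };

    /* After: 128B alignment starts the table exactly on an L3 line. */
    struct ptype_tbl_after {
            uint32_t ptype[256] cache_aligned;
    };

    static_assert(_Alignof(struct ptype_tbl_before) == 64,
                  "min cacheline alignment is 64B");
    static_assert(_Alignof(struct ptype_tbl_after) == 128,
                  "full cacheline alignment is 128B on this platform");

On x86 builds RTE_CACHE_LINE_SIZE is 64, so the two attributes coincide
there and the change only widens alignment on 128B-cacheline platforms.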