Sync some patches from upstream. The modifications are as follows: - maintainers: update for hns3 driver - app/testpmd: add command to flush multicast MAC addresses - app/testpmd: fix help string - app/testpmd: fix multicast address pool leak - net/hns3: optimize SVE Rx performance - net/hns3: optimize rearm mbuf for SVE Rx - net/hns3: optimize free mbuf for SVE Tx - net/hns3: fix order in NEON Rx - net/hns3: fix traffic management dump text alignment - net/hns3: fix traffic management thread safety - net/hns3: fix flushing multicast MAC address - net/hns3: fix error code for multicast resource - net/hns3: fix VF default MAC modified when set failed - net/hns3: fix index to look up table in NEON Rx - net/hns3: fix non-zero weight for disabled TC - config/arm: add HiSilicon HIP10 Signed-off-by: Dengdui Huang <huangdengdui@huawei.com>
243 lines
9.1 KiB
Diff
243 lines
9.1 KiB
Diff
From 5e6c0f58eff79c06edf3638108c096e792b81a3b Mon Sep 17 00:00:00 2001
|
|
From: Huisong Li <lihuisong@huawei.com>
|
|
Date: Tue, 11 Jul 2023 18:24:48 +0800
|
|
Subject: [PATCH 362/366] net/hns3: optimize SVE Rx performance
|
|
|
|
[ upstream commit f1ad6decfbd44c3dc2d73dcda3fa8fb37b140186 ]
|
|
|
|
This patch optimizes SVE Rx performance in the following ways:
|
|
1> optimize the calculation of valid BD number.
|
|
2> remove a temporary variable (key_fields)
|
|
3> use C language to parse some descriptor fields, instead of
|
|
SVE instructions.
|
|
4> prefetch descriptors in small steps.
|
|
|
|
In rxonly forwarding mode, the performance of a single queue
|
|
or 64B packet is improved by ~40%.
|
|
|
|
Signed-off-by: Huisong Li <lihuisong@huawei.com>
|
|
Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
|
|
---
|
|
drivers/net/hns3/hns3_rxtx_vec_sve.c | 137 ++++++---------------------
|
|
1 file changed, 27 insertions(+), 110 deletions(-)
|
|
|
|
diff --git a/drivers/net/hns3/hns3_rxtx_vec_sve.c b/drivers/net/hns3/hns3_rxtx_vec_sve.c
|
|
index 1251939..88b484d 100644
|
|
--- a/drivers/net/hns3/hns3_rxtx_vec_sve.c
|
|
+++ b/drivers/net/hns3/hns3_rxtx_vec_sve.c
|
|
@@ -20,40 +20,36 @@
|
|
|
|
#define BD_SIZE 32
|
|
#define BD_FIELD_ADDR_OFFSET 0
|
|
-#define BD_FIELD_L234_OFFSET 8
|
|
-#define BD_FIELD_XLEN_OFFSET 12
|
|
-#define BD_FIELD_RSS_OFFSET 16
|
|
-#define BD_FIELD_OL_OFFSET 24
|
|
#define BD_FIELD_VALID_OFFSET 28
|
|
|
|
-typedef struct {
|
|
- uint32_t l234_info[HNS3_SVE_DEFAULT_DESCS_PER_LOOP];
|
|
- uint32_t ol_info[HNS3_SVE_DEFAULT_DESCS_PER_LOOP];
|
|
- uint32_t bd_base_info[HNS3_SVE_DEFAULT_DESCS_PER_LOOP];
|
|
-} HNS3_SVE_KEY_FIELD_S;
|
|
-
|
|
static inline uint32_t
|
|
hns3_desc_parse_field_sve(struct hns3_rx_queue *rxq,
|
|
struct rte_mbuf **rx_pkts,
|
|
- HNS3_SVE_KEY_FIELD_S *key,
|
|
+ struct hns3_desc *rxdp,
|
|
uint32_t bd_vld_num)
|
|
{
|
|
+ uint32_t l234_info, ol_info, bd_base_info;
|
|
uint32_t retcode = 0;
|
|
int ret, i;
|
|
|
|
for (i = 0; i < (int)bd_vld_num; i++) {
|
|
/* init rte_mbuf.rearm_data last 64-bit */
|
|
rx_pkts[i]->ol_flags = RTE_MBUF_F_RX_RSS_HASH;
|
|
-
|
|
- ret = hns3_handle_bdinfo(rxq, rx_pkts[i], key->bd_base_info[i],
|
|
- key->l234_info[i]);
|
|
+ rx_pkts[i]->hash.rss = rxdp[i].rx.rss_hash;
|
|
+ rx_pkts[i]->pkt_len = rte_le_to_cpu_16(rxdp[i].rx.pkt_len) -
|
|
+ rxq->crc_len;
|
|
+ rx_pkts[i]->data_len = rx_pkts[i]->pkt_len;
|
|
+
|
|
+ l234_info = rxdp[i].rx.l234_info;
|
|
+ ol_info = rxdp[i].rx.ol_info;
|
|
+ bd_base_info = rxdp[i].rx.bd_base_info;
|
|
+ ret = hns3_handle_bdinfo(rxq, rx_pkts[i], bd_base_info, l234_info);
|
|
if (unlikely(ret)) {
|
|
retcode |= 1u << i;
|
|
continue;
|
|
}
|
|
|
|
- rx_pkts[i]->packet_type = hns3_rx_calc_ptype(rxq,
|
|
- key->l234_info[i], key->ol_info[i]);
|
|
+ rx_pkts[i]->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
|
|
|
|
/* Increment bytes counter */
|
|
rxq->basic_stats.bytes += rx_pkts[i]->pkt_len;
|
|
@@ -77,46 +73,16 @@ hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
|
|
uint16_t nb_pkts,
|
|
uint64_t *bd_err_mask)
|
|
{
|
|
-#define XLEN_ADJUST_LEN 32
|
|
-#define RSS_ADJUST_LEN 16
|
|
-#define GEN_VLD_U8_ZIP_INDEX svindex_s8(28, -4)
|
|
uint16_t rx_id = rxq->next_to_use;
|
|
struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
|
|
struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
|
|
- struct hns3_desc *rxdp2;
|
|
- HNS3_SVE_KEY_FIELD_S key_field;
|
|
+ struct hns3_desc *rxdp2, *next_rxdp;
|
|
uint64_t bd_valid_num;
|
|
uint32_t parse_retcode;
|
|
uint16_t nb_rx = 0;
|
|
int pos, offset;
|
|
|
|
- uint16_t xlen_adjust[XLEN_ADJUST_LEN] = {
|
|
- 0, 0xffff, 1, 0xffff, /* 1st mbuf: pkt_len and dat_len */
|
|
- 2, 0xffff, 3, 0xffff, /* 2st mbuf: pkt_len and dat_len */
|
|
- 4, 0xffff, 5, 0xffff, /* 3st mbuf: pkt_len and dat_len */
|
|
- 6, 0xffff, 7, 0xffff, /* 4st mbuf: pkt_len and dat_len */
|
|
- 8, 0xffff, 9, 0xffff, /* 5st mbuf: pkt_len and dat_len */
|
|
- 10, 0xffff, 11, 0xffff, /* 6st mbuf: pkt_len and dat_len */
|
|
- 12, 0xffff, 13, 0xffff, /* 7st mbuf: pkt_len and dat_len */
|
|
- 14, 0xffff, 15, 0xffff, /* 8st mbuf: pkt_len and dat_len */
|
|
- };
|
|
-
|
|
- uint32_t rss_adjust[RSS_ADJUST_LEN] = {
|
|
- 0, 0xffff, /* 1st mbuf: rss */
|
|
- 1, 0xffff, /* 2st mbuf: rss */
|
|
- 2, 0xffff, /* 3st mbuf: rss */
|
|
- 3, 0xffff, /* 4st mbuf: rss */
|
|
- 4, 0xffff, /* 5st mbuf: rss */
|
|
- 5, 0xffff, /* 6st mbuf: rss */
|
|
- 6, 0xffff, /* 7st mbuf: rss */
|
|
- 7, 0xffff, /* 8st mbuf: rss */
|
|
- };
|
|
-
|
|
svbool_t pg32 = svwhilelt_b32(0, HNS3_SVE_DEFAULT_DESCS_PER_LOOP);
|
|
- svuint16_t xlen_tbl1 = svld1_u16(PG16_256BIT, xlen_adjust);
|
|
- svuint16_t xlen_tbl2 = svld1_u16(PG16_256BIT, &xlen_adjust[16]);
|
|
- svuint32_t rss_tbl1 = svld1_u32(PG32_256BIT, rss_adjust);
|
|
- svuint32_t rss_tbl2 = svld1_u32(PG32_256BIT, &rss_adjust[8]);
|
|
|
|
/* compile-time verifies the xlen_adjust mask */
|
|
RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
|
|
@@ -126,30 +92,21 @@ hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
|
|
|
|
for (pos = 0; pos < nb_pkts; pos += HNS3_SVE_DEFAULT_DESCS_PER_LOOP,
|
|
rxdp += HNS3_SVE_DEFAULT_DESCS_PER_LOOP) {
|
|
- svuint64_t vld_clz, mbp1st, mbp2st, mbuf_init;
|
|
- svuint64_t xlen1st, xlen2st, rss1st, rss2st;
|
|
- svuint32_t l234, ol, vld, vld2, xlen, rss;
|
|
- svuint8_t vld_u8;
|
|
+ svuint64_t mbp1st, mbp2st, mbuf_init;
|
|
+ svuint32_t vld;
|
|
+ svbool_t vld_op;
|
|
|
|
/* calc how many bd valid: part 1 */
|
|
vld = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp,
|
|
svindex_u32(BD_FIELD_VALID_OFFSET, BD_SIZE));
|
|
- vld2 = svlsl_n_u32_z(pg32, vld,
|
|
- HNS3_UINT32_BIT - 1 - HNS3_RXD_VLD_B);
|
|
- vld2 = svreinterpret_u32_s32(svasr_n_s32_z(pg32,
|
|
- svreinterpret_s32_u32(vld2), HNS3_UINT32_BIT - 1));
|
|
+ vld = svand_n_u32_z(pg32, vld, BIT(HNS3_RXD_VLD_B));
|
|
+ vld_op = svcmpne_n_u32(pg32, vld, BIT(HNS3_RXD_VLD_B));
|
|
+ bd_valid_num = svcntp_b32(pg32, svbrkb_b_z(pg32, vld_op));
|
|
+ if (bd_valid_num == 0)
|
|
+ break;
|
|
|
|
/* load 4 mbuf pointer */
|
|
mbp1st = svld1_u64(PG64_256BIT, (uint64_t *)&sw_ring[pos]);
|
|
-
|
|
- /* calc how many bd valid: part 2 */
|
|
- vld_u8 = svtbl_u8(svreinterpret_u8_u32(vld2),
|
|
- svreinterpret_u8_s8(GEN_VLD_U8_ZIP_INDEX));
|
|
- vld_clz = svnot_u64_z(PG64_64BIT, svreinterpret_u64_u8(vld_u8));
|
|
- vld_clz = svclz_u64_z(PG64_64BIT, vld_clz);
|
|
- svst1_u64(PG64_64BIT, &bd_valid_num, vld_clz);
|
|
- bd_valid_num /= HNS3_UINT8_BIT;
|
|
-
|
|
/* load 4 more mbuf pointer */
|
|
mbp2st = svld1_u64(PG64_256BIT, (uint64_t *)&sw_ring[pos + 4]);
|
|
|
|
@@ -159,65 +116,25 @@ hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
|
|
|
|
/* store 4 mbuf pointer into rx_pkts */
|
|
svst1_u64(PG64_256BIT, (uint64_t *)&rx_pkts[pos], mbp1st);
|
|
-
|
|
- /* load key field to vector reg */
|
|
- l234 = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
|
|
- svindex_u32(BD_FIELD_L234_OFFSET, BD_SIZE));
|
|
- ol = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
|
|
- svindex_u32(BD_FIELD_OL_OFFSET, BD_SIZE));
|
|
-
|
|
/* store 4 mbuf pointer into rx_pkts again */
|
|
svst1_u64(PG64_256BIT, (uint64_t *)&rx_pkts[pos + 4], mbp2st);
|
|
|
|
- /* load datalen, pktlen and rss_hash */
|
|
- xlen = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
|
|
- svindex_u32(BD_FIELD_XLEN_OFFSET, BD_SIZE));
|
|
- rss = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
|
|
- svindex_u32(BD_FIELD_RSS_OFFSET, BD_SIZE));
|
|
-
|
|
- /* store key field to stash buffer */
|
|
- svst1_u32(pg32, (uint32_t *)key_field.l234_info, l234);
|
|
- svst1_u32(pg32, (uint32_t *)key_field.bd_base_info, vld);
|
|
- svst1_u32(pg32, (uint32_t *)key_field.ol_info, ol);
|
|
-
|
|
- /* sub crc_len for pkt_len and data_len */
|
|
- xlen = svreinterpret_u32_u16(svsub_n_u16_z(PG16_256BIT,
|
|
- svreinterpret_u16_u32(xlen), rxq->crc_len));
|
|
-
|
|
/* init mbuf_initializer */
|
|
mbuf_init = svdup_n_u64(rxq->mbuf_initializer);
|
|
-
|
|
- /* extract datalen, pktlen and rss from xlen and rss */
|
|
- xlen1st = svreinterpret_u64_u16(
|
|
- svtbl_u16(svreinterpret_u16_u32(xlen), xlen_tbl1));
|
|
- xlen2st = svreinterpret_u64_u16(
|
|
- svtbl_u16(svreinterpret_u16_u32(xlen), xlen_tbl2));
|
|
- rss1st = svreinterpret_u64_u32(
|
|
- svtbl_u32(svreinterpret_u32_u32(rss), rss_tbl1));
|
|
- rss2st = svreinterpret_u64_u32(
|
|
- svtbl_u32(svreinterpret_u32_u32(rss), rss_tbl2));
|
|
-
|
|
/* save mbuf_initializer */
|
|
svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp1st,
|
|
offsetof(struct rte_mbuf, rearm_data), mbuf_init);
|
|
svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp2st,
|
|
offsetof(struct rte_mbuf, rearm_data), mbuf_init);
|
|
|
|
- /* save datalen and pktlen and rss */
|
|
- svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp1st,
|
|
- offsetof(struct rte_mbuf, pkt_len), xlen1st);
|
|
- svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp1st,
|
|
- offsetof(struct rte_mbuf, hash.rss), rss1st);
|
|
- svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp2st,
|
|
- offsetof(struct rte_mbuf, pkt_len), xlen2st);
|
|
- svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp2st,
|
|
- offsetof(struct rte_mbuf, hash.rss), rss2st);
|
|
-
|
|
- rte_prefetch_non_temporal(rxdp +
|
|
- HNS3_SVE_DEFAULT_DESCS_PER_LOOP);
|
|
+ next_rxdp = rxdp + HNS3_SVE_DEFAULT_DESCS_PER_LOOP;
|
|
+ rte_prefetch_non_temporal(next_rxdp);
|
|
+ rte_prefetch_non_temporal(next_rxdp + 2);
|
|
+ rte_prefetch_non_temporal(next_rxdp + 4);
|
|
+ rte_prefetch_non_temporal(next_rxdp + 6);
|
|
|
|
parse_retcode = hns3_desc_parse_field_sve(rxq, &rx_pkts[pos],
|
|
- &key_field, bd_valid_num);
|
|
+ &rxdp2[offset], bd_valid_num);
|
|
if (unlikely(parse_retcode))
|
|
(*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;
|
|
|
|
--
|
|
2.41.0.windows.2
|
|
|