diff --git a/0040-fix-sock-invalid-address.patch b/0040-fix-sock-invalid-address.patch new file mode 100644 index 0000000..5ca2825 --- /dev/null +++ b/0040-fix-sock-invalid-address.patch @@ -0,0 +1,35 @@ +From 912ac954d0f462418bb09d2ad91e7092d5ad37be Mon Sep 17 00:00:00 2001 +From: jiangheng +Date: Tue, 19 Apr 2022 19:10:10 +0800 +Subject: [PATCH 01/18] fix sock invalid address + +The sockets pointer is allocated in gazelle_network_init(). +If select_path() is invoked before the sockets pointer is initialized, sock is invalid. +--- + src/lstack/api/lstack_wrap.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index 6ee5639..0164069 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -52,10 +52,15 @@ static inline enum KERNEL_LWIP_PATH select_path(int fd) + } + return PATH_KERNEL; + } ++ + if (unlikely(posix_api->is_chld)) { ++ return PATH_KERNEL; ++ } ++ + struct lwip_sock *sock = posix_api->get_socket(fd); + + /* AF_UNIX case */ +- if (!sock || unlikely(posix_api->is_chld)) { ++ if (!sock) { + return PATH_KERNEL; + } + +-- +2.23.0 + diff --git a/0041-exit-lstack-process-after-ltran-instance-logout.patch b/0041-exit-lstack-process-after-ltran-instance-logout.patch new file mode 100644 index 0000000..806c2a0 --- /dev/null +++ b/0041-exit-lstack-process-after-ltran-instance-logout.patch @@ -0,0 +1,70 @@ +From 336703252c327d82f49d40f79b1d1e4e65a9281e Mon Sep 17 00:00:00 2001 +From: jiangheng +Date: Tue, 19 Apr 2022 19:49:06 +0800 +Subject: [PATCH 02/18] exit lstack process after ltran instance logout + +Closing the fd notifies ltran to execute the lstack instance logout. +200ms is an empirical value for the instance logout to complete. +--- + src/lstack/api/lstack_signal.c | 4 +++- + src/lstack/core/lstack_control_plane.c | 9 +++++++++ + src/lstack/include/lstack_control_plane.h | 1 + + 3 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/src/lstack/api/lstack_signal.c b/src/lstack/api/lstack_signal.c +index f4763e8..87cbdda 100644 +--- a/src/lstack/api/lstack_signal.c ++++ b/src/lstack/api/lstack_signal.c +@@ -19,8 +19,9 @@ + #include + + #include "lstack_log.h" ++#include "lstack_control_plane.h" + +-static int g_hijack_signal[] = { SIGTERM, SIGINT, SIGSEGV, SIGBUS, SIGFPE, SIGILL }; ++static int g_hijack_signal[] = { SIGTERM, SIGINT, SIGSEGV, SIGBUS, SIGFPE, SIGILL, SIGKILL}; + #define HIJACK_SIGNAL_COUNT (sizeof(g_hijack_signal) / sizeof(g_hijack_signal[0])) + #define BACKTRACE_SIZE 64 + static void dump_stack(void) +@@ -54,6 +55,7 @@ static inline bool match_hijack_signal(int sig) + static void lstack_sig_default_handler(int sig) + { + LSTACK_LOG(ERR, LSTACK, "lstack dumped,caught signal:%d\n", sig); ++ control_fd_close(); + dump_stack(); + lwip_exit(); + (void)kill(getpid(), sig); +diff --git a/src/lstack/core/lstack_control_plane.c b/src/lstack/core/lstack_control_plane.c +index c782d51..01b2ff0 100644 +--- a/src/lstack/core/lstack_control_plane.c ++++ b/src/lstack/core/lstack_control_plane.c +@@ -446,6 +446,15 @@ int32_t client_reg_thrd_ring(void) + return 0; + } + ++void control_fd_close(void) ++{ ++ if (g_data_fd != 0) { ++ close(g_data_fd); ++ /* 200ms: wait ltran instance logout */ ++ rte_delay_ms(200); ++ } ++} ++ + int32_t control_init_client(bool is_reconnect) + { + int32_t ret; +diff --git a/src/lstack/include/lstack_control_plane.h b/src/lstack/include/lstack_control_plane.h +index 0af891a..1fa84e6 100644 +--- a/src/lstack/include/lstack_control_plane.h ++++ 
b/src/lstack/include/lstack_control_plane.h +@@ -32,5 +32,6 @@ void control_server_thread(void *arg); + bool get_register_state(void); + void thread_register_phase1(struct rpc_msg *msg); + void thread_register_phase2(struct rpc_msg *msg); ++void control_fd_close(void); + + #endif /* GAZELLE_CONTROL_PLANE_H */ +-- +2.23.0 + diff --git a/0042-use-atomic-variales-to-count.patch b/0042-use-atomic-variales-to-count.patch new file mode 100644 index 0000000..088aba2 --- /dev/null +++ b/0042-use-atomic-variales-to-count.patch @@ -0,0 +1,78 @@ +From 900685bd99e25f832c4aeac202dfb7d5f5075833 Mon Sep 17 00:00:00 2001 +From: jiangheng +Date: Tue, 19 Apr 2022 20:01:34 +0800 +Subject: [PATCH 03/18] use atomic variables to count + +name_tick and pool_index are shared by multiple threads. +atomic variables are required. +--- + src/lstack/core/lstack_lwip.c | 9 +++++---- + src/lstack/core/lstack_thread_rpc.c | 3 ++- + 2 files changed, 7 insertions(+), 5 deletions(-) + +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 887464d..4143f2f 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -11,6 +11,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -138,25 +139,25 @@ void gazelle_init_sock(int32_t fd) + + reset_sock_data(sock); + +- sock->recv_ring = create_ring("sock_recv", SOCK_RECV_RING_SIZE, 0, name_tick++); ++ sock->recv_ring = create_ring("sock_recv", SOCK_RECV_RING_SIZE, 0, atomic_fetch_add(&name_tick, 1)); + if (sock->recv_ring == NULL) { + LSTACK_LOG(ERR, LSTACK, "sock_recv create failed. errno: %d.\n", rte_errno); + return; + } + +- sock->recv_wait_free = create_ring("wait_free", SOCK_RECV_RING_SIZE, 0, name_tick++); ++ sock->recv_wait_free = create_ring("wait_free", SOCK_RECV_RING_SIZE, 0, atomic_fecth_add(&name_tick, 1)); + if (sock->recv_wait_free == NULL) { + LSTACK_LOG(ERR, LSTACK, "wait_free create failed. errno: %d.\n", rte_errno); + return; + } + +- sock->send_ring = create_ring("sock_send", SOCK_SEND_RING_SIZE, 0, name_tick++); ++ sock->send_ring = create_ring("sock_send", SOCK_SEND_RING_SIZE, 0, atomic_fecth_add(&name_tick, 1)); + if (sock->send_ring == NULL) { + LSTACK_LOG(ERR, LSTACK, "sock_send create failed. errno: %d.\n", rte_errno); + return; + } + +- sock->send_idle_ring = create_ring("idle_send", SOCK_SEND_RING_SIZE, 0, name_tick++); ++ sock->send_idle_ring = create_ring("idle_send", SOCK_SEND_RING_SIZE, 0, atomic_fetch_add(&name_tick, 1)); + if (sock->send_idle_ring == NULL) { + LSTACK_LOG(ERR, LSTACK, "idle_send create failed. errno: %d.\n", rte_errno); + return; +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index 26725f7..312e192 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -10,6 +10,7 @@ + * See the Mulan PSL v2 for more details. 
+ */ + #include ++#include + #include + #include + #include +@@ -36,7 +37,7 @@ struct rpc_msg *rpc_msg_alloc(struct protocol_stack *stack, rpc_msg_func func) + + static uint16_t pool_index = 0; + if (rpc_pool == NULL) { +- rpc_pool = create_rpc_mempool("rpc_msg", pool_index++); ++ rpc_pool = create_rpc_mempool("rpc_msg", atomic_fetch_add(&pool_index, 1)); + if (rpc_pool == NULL) { + return NULL; + } +-- +2.23.0 + diff --git a/0043-re-arrange-the-program-to-invoke-rte_eth_dev_start-b.patch b/0043-re-arrange-the-program-to-invoke-rte_eth_dev_start-b.patch new file mode 100644 index 0000000..10624e2 --- /dev/null +++ b/0043-re-arrange-the-program-to-invoke-rte_eth_dev_start-b.patch @@ -0,0 +1,45 @@ +From 7f4143cd462cba5499cda0434fedd498c0967623 Mon Sep 17 00:00:00 2001 +From: jiangheng +Date: Tue, 19 Apr 2022 21:28:00 +0800 +Subject: [PATCH 04/18] re-arrange the program to invoke rte_eth_dev_start + before rss_setup + +in rss_setup(), the program invokes rte_eth_dev_rss_reta_update(). +this API should be invoked after rte_eth_dev_start(). +--- + src/lstack/core/lstack_dpdk.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index 3f446ea..a5b2ddc 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -495,6 +495,12 @@ int32_t dpdk_ethdev_init(void) + return ret; + } + ++ ret = dpdk_ethdev_start(); ++ if (ret < 0) { ++ LSTACK_LOG(ERR, LSTACK, "dpdk_ethdev_start failed\n"); ++ return ret; ++ } ++ + if (rss_enable) { + rss_setup(port_id, nb_queues); + } +@@ -604,12 +610,6 @@ int32_t init_dpdk_ethdev(void) + return -1; + } + +- ret = dpdk_ethdev_start(); +- if (ret < 0) { +- LSTACK_LOG(ERR, LSTACK, "dpdk_ethdev_start failed\n"); +- return -1; +- } +- + if (get_global_cfg_params()->kni_switch) { + ret = dpdk_init_lstack_kni(); + if (ret < 0) { +-- +2.23.0 + diff --git a/0044-delete-redundant-file.patch b/0044-delete-redundant-file.patch new file mode 100644 index 0000000..beaeaab --- /dev/null +++ b/0044-delete-redundant-file.patch @@ -0,0 +1,183 @@ +From f3059a5a1e2fcf5b7bfa2ad50865598f79eccf16 Mon Sep 17 00:00:00 2001 +From: wuchangsheng +Date: Thu, 21 Apr 2022 15:24:20 +0800 +Subject: [PATCH 05/18] delete redundant file + +--- + src/lstack/include/lstack_lstack.h | 32 -------- + src/lstack/include/lstack_weakup.h | 124 ----------------------------- + 2 files changed, 156 deletions(-) + delete mode 100644 src/lstack/include/lstack_lstack.h + delete mode 100644 src/lstack/include/lstack_weakup.h + +diff --git a/src/lstack/include/lstack_lstack.h b/src/lstack/include/lstack_lstack.h +deleted file mode 100644 +index c2e6733..0000000 +--- a/src/lstack/include/lstack_lstack.h ++++ /dev/null +@@ -1,32 +0,0 @@ +-/* +-* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. +-* gazelle is licensed under the Mulan PSL v2. +-* You can use this software according to the terms and conditions of the Mulan PSL v2. +-* You may obtain a copy of Mulan PSL v2 at: +-* http://license.coscl.org.cn/MulanPSL2 +-* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +-* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +-* PURPOSE. +-* See the Mulan PSL v2 for more details. 
+-*/ +- +-#ifndef _LSTACK_H +-#define _LSTACK_H +- +-#if defined __GNUC__ +-#define LSTACK_EXPORT_SYMBOL __attribute__((visibility("default"))) +- +-#elif defined(_MSC_VER) +-#define LSTACK_EXPORT_SYMBOL extern __declspec(dllexport) +- +-#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550) +-#define LSTACK_EXPORT_SYMBOL __global +- +-#else +-#define LSTACK_EXPORT_SYMBOL /* unknown compiler */ +-#endif +- +-/* Return string describing version of currently running lstack. */ +-LSTACK_EXPORT_SYMBOL const char *get_lstack_version(void); +- +-#endif /* lstack.h */ +diff --git a/src/lstack/include/lstack_weakup.h b/src/lstack/include/lstack_weakup.h +deleted file mode 100644 +index 77f3b9d..0000000 +--- a/src/lstack/include/lstack_weakup.h ++++ /dev/null +@@ -1,124 +0,0 @@ +-/* +-* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. +-* gazelle is licensed under the Mulan PSL v2. +-* You can use this software according to the terms and conditions of the Mulan PSL v2. +-* You may obtain a copy of Mulan PSL v2 at: +-* http://license.coscl.org.cn/MulanPSL2 +-* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR +-* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR +-* PURPOSE. +-* See the Mulan PSL v2 for more details. +-*/ +- +-#ifndef __GAZELLE_WEAKUP_THREAD_H__ +-#define __GAZELLE_WEAKUP_THREAD_H__ +- +-#include +-#include "lstack_dpdk.h" +- +-#define EPOLL_MAX_EVENTS 512 +- +-struct weakup_poll { +- sem_t event_sem; +- struct lwip_sock *sock_list[EPOLL_MAX_EVENTS]; +- struct rte_ring *event_ring; +- struct rte_ring *self_ring; +-}; +- +-#define WEAKUP_MAX (32) +- +-static inline void wakeup_list_sock(struct list_node *wakeup_list) +-{ +- struct list_node *node, *temp; +- +- list_for_each_safe(node, temp, wakeup_list) { +- struct lwip_sock *sock = container_of(node, struct lwip_sock, wakeup_list); +- +- struct weakup_poll *weakup = sock->weakup; +- struct protocol_stack *stack = sock->stack; +- if (weakup == NULL || stack == NULL) { +- list_del_node_init(&sock->wakeup_list); +- continue; +- } +- +- int32_t ret = rte_ring_mp_enqueue(weakup->event_ring, (void *)sock); +- if (ret == 0) { +- list_del_node_init(&sock->wakeup_list); +- sem_post(&weakup->event_sem); +- stack->stats.lwip_events++; +- } else { +- break; +- } +- } +-} +- +-static inline int32_t weakup_attach_sock(struct list_node *attach_list) +-{ +- struct list_node *node, *temp; +- int32_t wakeuped = -1; +- +- list_for_each_safe(node, temp, attach_list) { +- struct lwip_sock *sock = container_of(node, struct lwip_sock, attach_list); +- +- struct weakup_poll *weakup = sock->weakup; +- struct protocol_stack *stack = sock->stack; +- if (weakup == NULL || stack == NULL) { +- continue; +- } +- +- int32_t ret = rte_ring_mp_enqueue(weakup->event_ring, (void *)sock); +- if (ret == 0) { +- sem_post(&weakup->event_sem); +- stack->stats.lwip_events++; +- wakeuped = 0; +- } +- } +- +- return wakeuped; +-} +- +-static inline void weakup_thread(struct rte_ring *weakup_ring, struct list_node *wakeup_list) +-{ +- struct lwip_sock *sock; +- +- for (uint32_t i = 0; i < WEAKUP_MAX; ++i) { +- int32_t ret = rte_ring_sc_dequeue(weakup_ring, (void **)&sock); +- if (ret != 0) { +- break; +- } +- +- struct weakup_poll *weakup = sock->weakup; +- struct protocol_stack *stack = sock->stack; +- if (weakup == NULL || stack == NULL) { +- continue; +- } +- +- ret = rte_ring_mp_enqueue(weakup->event_ring, (void *)sock); +- if (ret == 0) { +- 
sem_post(&weakup->event_sem); +- stack->stats.lwip_events++; +- } +- +- /* listen notice attach sock */ +- int32_t wakeuped = -1; +- if (!list_is_empty(&sock->attach_list)) { +- wakeuped = weakup_attach_sock(&sock->attach_list); +- } +- +- /* notice any epoll enough */ +- if (ret != 0 && wakeuped != 0) { +- if (list_is_empty(&sock->wakeup_list)) { +- list_add_node(wakeup_list, &sock->wakeup_list); +- } +- break; +- } +- } +-} +- +-static inline __attribute__((always_inline)) +-int weakup_enqueue(struct rte_ring *weakup_ring, struct lwip_sock *sock) +-{ +- return rte_ring_sp_enqueue(weakup_ring, (void *)sock); +-} +- +-#endif +-- +2.23.0 + diff --git a/0045-lstack-all-exit-move-to-init.patch b/0045-lstack-all-exit-move-to-init.patch new file mode 100644 index 0000000..637d8dd --- /dev/null +++ b/0045-lstack-all-exit-move-to-init.patch @@ -0,0 +1,297 @@ +From 625c01d808ffd9a21bcd5705b7237d00acd7ee66 Mon Sep 17 00:00:00 2001 +From: wuchangsheng +Date: Thu, 21 Apr 2022 15:34:36 +0800 +Subject: [PATCH 06/18] lstack:all exit move to init + +--- + src/lstack/core/lstack_control_plane.c | 5 +-- + src/lstack/core/lstack_dpdk.c | 23 ++++++++------ + src/lstack/core/lstack_init.c | 20 +++++++++++- + src/lstack/core/lstack_lwip.c | 4 +-- + src/lstack/core/lstack_protocol_stack.c | 37 ++++++++++++---------- + src/lstack/include/lstack_dpdk.h | 2 +- + src/lstack/include/lstack_protocol_stack.h | 4 ++- + 7 files changed, 61 insertions(+), 34 deletions(-) + +diff --git a/src/lstack/core/lstack_control_plane.c b/src/lstack/core/lstack_control_plane.c +index 01b2ff0..13a2ed3 100644 +--- a/src/lstack/core/lstack_control_plane.c ++++ b/src/lstack/core/lstack_control_plane.c +@@ -699,12 +699,13 @@ void control_server_thread(void *arg) + { + int32_t listenfd = control_init_server(); + if (listenfd < 0) { +- LSTACK_EXIT(1, "control_init_server failed\n"); ++ LSTACK_LOG(ERR, LSTACK, "control_init_server failed\n"); ++ return; + } + + int32_t epfd = init_epoll(listenfd); + if (epfd < 0) { +- LSTACK_EXIT(1, "init_epoll failed\n"); ++ LSTACK_LOG(ERR, LSTACK, "control_init_server failed\n"); + return; + } + +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index a5b2ddc..aa91201 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -86,21 +86,26 @@ int32_t thread_affinity_init(int32_t cpu_id) + return 0; + } + +-void dpdk_eal_init(void) ++int32_t dpdk_eal_init(void) + { + int32_t ret; + struct cfg_params *global_params = get_global_cfg_params(); + + ret = rte_eal_init(global_params->dpdk_argc, global_params->dpdk_argv); + if (ret < 0) { +- if (rte_errno == EALREADY) ++ if (rte_errno == EALREADY) { + LSTACK_PRE_LOG(LSTACK_INFO, "rte_eal_init aleady init\n"); +- else ++ /* maybe other program inited, merge init param share init */ ++ ret = 0; ++ } ++ else { + LSTACK_PRE_LOG(LSTACK_ERR, "rte_eal_init failed init, rte_errno %d\n", rte_errno); +- +- LSTACK_EXIT(1, "pthread_getaffinity_np failed\n"); ++ } ++ } else { ++ LSTACK_PRE_LOG(LSTACK_INFO, "dpdk_eal_init success\n"); + } +- LSTACK_PRE_LOG(LSTACK_INFO, "dpdk_eal_init success\n"); ++ ++ return ret; + } + + static struct rte_mempool *create_pktmbuf_mempool(const char *name, uint32_t nb_mbuf, +@@ -116,13 +121,11 @@ static struct rte_mempool *create_pktmbuf_mempool(const char *name, uint32_t nb_ + } + + /* time stamp before pbuf_custom as priv_data */ +- pthread_mutex_lock(get_mem_mutex()); + pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf, mbuf_cache_size, + sizeof(struct pbuf_custom) + 
GAZELLE_MBUFF_PRIV_SIZE, MBUF_SZ, rte_socket_id()); + if (pool == NULL) { + LSTACK_LOG(ERR, LSTACK, "cannot create %s pool rte_err=%d\n", pool_name, rte_errno); + } +- pthread_mutex_unlock(get_mem_mutex()); + return pool; + } + +@@ -136,13 +139,13 @@ struct rte_mempool *create_rpc_mempool(const char *name, uint16_t queue_id) + if (ret < 0) { + return NULL; + } +- pthread_mutex_lock(get_mem_mutex()); ++ + pool = rte_mempool_create(pool_name, CALL_POOL_SZ, sizeof(struct rpc_msg), 0, 0, NULL, NULL, NULL, + NULL, rte_socket_id(), 0); + if (pool == NULL) { + LSTACK_LOG(ERR, LSTACK, "cannot create %s pool rte_err=%d\n", pool_name, rte_errno); + } +- pthread_mutex_unlock(get_mem_mutex()); ++ + return pool; + } + +diff --git a/src/lstack/core/lstack_init.c b/src/lstack/core/lstack_init.c +index 774d0f3..335d834 100644 +--- a/src/lstack/core/lstack_init.c ++++ b/src/lstack/core/lstack_init.c +@@ -48,6 +48,13 @@ + #define LSTACK_PRELOAD_NAME_LEN PATH_MAX + #define LSTACK_PRELOAD_ENV_PROC "GAZELLE_BIND_PROCNAME" + ++static volatile int32_t g_init_fail = 0; ++ ++void set_init_fail(void) ++{ ++ g_init_fail = 1; ++} ++ + struct lstack_preload { + int32_t preload_switch; + char env_procname[LSTACK_PRELOAD_NAME_LEN]; +@@ -207,7 +214,11 @@ __attribute__((constructor)) void gazelle_network_init(void) + } + ret = pthread_create(&tid, NULL, (void *(*)(void *))control_client_thread, NULL); + } else { +- dpdk_eal_init(); ++ ret = dpdk_eal_init(); ++ if (ret < 0) { ++ LSTACK_EXIT(1, "dpdk_eal_init failed ret=%d errno=%d\n", ret, errno); ++ } ++ + ret = pthread_create(&tid, NULL, (void *(*)(void *))control_server_thread, NULL); + } + if (ret != 0) { +@@ -248,7 +259,14 @@ __attribute__((constructor)) void gazelle_network_init(void) + * Phase 10: register core sig handler func to dumped stack */ + lstack_signal_init(); + ++ /* wait stack thread and kernel_event thread init finish */ ++ wait_sem_value(&get_protocol_stack_group()->all_init, get_protocol_stack_group()->stack_num); ++ if (g_init_fail) { ++ LSTACK_EXIT(1, "stack thread or kernel_event thread failed\n"); ++ } ++ + lstack_prelog_uninit(); + posix_api->is_chld = 0; + LSTACK_LOG(INFO, LSTACK, "gazelle_network_init success\n"); ++ rte_smp_mb(); + } +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 4143f2f..00a82fb 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -145,13 +145,13 @@ void gazelle_init_sock(int32_t fd) + return; + } + +- sock->recv_wait_free = create_ring("wait_free", SOCK_RECV_RING_SIZE, 0, atomic_fecth_add(&name_tick, 1)); ++ sock->recv_wait_free = create_ring("wait_free", SOCK_RECV_RING_SIZE, 0, atomic_fetch_add(&name_tick, 1)); + if (sock->recv_wait_free == NULL) { + LSTACK_LOG(ERR, LSTACK, "wait_free create failed. errno: %d.\n", rte_errno); + return; + } + +- sock->send_ring = create_ring("sock_send", SOCK_SEND_RING_SIZE, 0, atomic_fecth_add(&name_tick, 1)); ++ sock->send_ring = create_ring("sock_send", SOCK_SEND_RING_SIZE, 0, atomic_fetch_add(&name_tick, 1)); + if (sock->send_ring == NULL) { + LSTACK_LOG(ERR, LSTACK, "sock_send create failed. 
errno: %d.\n", rte_errno); + return; +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index da320e2..8f0b785 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -39,19 +39,14 @@ + static PER_THREAD uint16_t g_stack_idx = PROTOCOL_STACK_MAX; + static struct protocol_stack_group g_stack_group = {0}; + static PER_THREAD long g_stack_tid = 0; +-static pthread_mutex_t g_mem_mutex = PTHREAD_MUTEX_INITIALIZER; + ++void set_init_fail(void); + typedef void *(*stack_thread_func)(void *arg); + + #ifdef GAZELLE_USE_EPOLL_EVENT_STACK + void update_stack_events(struct protocol_stack *stack); + #endif + +-pthread_mutex_t *get_mem_mutex(void) +-{ +- return &g_mem_mutex; +-} +- + int32_t bind_to_stack_numa(struct protocol_stack *stack) + { + int32_t ret; +@@ -248,6 +243,14 @@ static void init_stack_value(struct protocol_stack *stack, uint16_t queue_id) + stack_group->stacks[queue_id] = stack; + } + ++void wait_sem_value(sem_t *sem, int32_t wait_value) ++{ ++ int32_t sem_val; ++ do { ++ sem_getvalue(sem, &sem_val); ++ } while (sem_val < wait_value); ++} ++ + static struct protocol_stack * stack_thread_init(uint16_t queue_id) + { + struct protocol_stack_group *stack_group = get_protocol_stack_group(); +@@ -304,10 +307,10 @@ static struct protocol_stack * stack_thread_init(uint16_t queue_id) + + sem_post(&stack_group->thread_phase1); + +- int32_t sem_val; +- do { +- sem_getvalue(&stack_group->ethdev_init, &sem_val); +- } while (!sem_val && !use_ltran()); ++ if (!use_ltran()) { ++ wait_sem_value(&stack_group->ethdev_init, 1); ++ } ++ + + ret = ethdev_init(stack); + if (ret != 0) { +@@ -332,10 +335,13 @@ static void* gazelle_stack_thread(void *arg) + + struct protocol_stack *stack = stack_thread_init(queue_id); + if (stack == NULL) { +- pthread_mutex_lock(&g_mem_mutex); +- LSTACK_EXIT(1, "stack_thread_init failed\n"); +- pthread_mutex_unlock(&g_mem_mutex); ++ /* exit in main thread, avoid create mempool and exit at the same time */ ++ set_init_fail(); ++ sem_post(&get_protocol_stack_group()->all_init); ++ LSTACK_LOG(ERR, LSTACK, "stack_thread_init failed queue_id=%d\n", queue_id); ++ return NULL; + } ++ sem_post(&get_protocol_stack_group()->all_init); + LSTACK_LOG(INFO, LSTACK, "stack_%02d init success\n", queue_id); + + for (;;) { +@@ -386,10 +392,7 @@ int32_t init_protocol_stack(void) + } + } + +- int32_t thread_inited_num; +- do { +- sem_getvalue(&stack_group->thread_phase1, &thread_inited_num); +- } while (thread_inited_num < stack_group->stack_num); ++ wait_sem_value(&stack_group->thread_phase1, stack_group->stack_num); + + ret = init_stack_numa_cpuset(); + if (ret < 0) { +diff --git a/src/lstack/include/lstack_dpdk.h b/src/lstack/include/lstack_dpdk.h +index e8080e1..4295f01 100644 +--- a/src/lstack/include/lstack_dpdk.h ++++ b/src/lstack/include/lstack_dpdk.h +@@ -58,7 +58,7 @@ int thread_affinity_default(void); + int thread_affinity_init(int cpu_id); + + int32_t fill_mbuf_to_ring(struct rte_mempool *mempool, struct rte_ring *ring, uint32_t mbuf_num); +-void dpdk_eal_init(void); ++int32_t dpdk_eal_init(void); + int32_t pktmbuf_pool_init(struct protocol_stack *stack, uint16_t stack_num); + struct rte_ring *create_ring(const char *name, uint32_t count, uint32_t flags, int32_t queue_id); + int32_t create_shared_ring(struct protocol_stack *stack); +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index 9753385..9852878 100644 +--- 
a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -64,6 +64,7 @@ struct protocol_stack_group { + uint16_t port_id; + sem_t thread_phase1; + sem_t ethdev_init; ++ sem_t all_init; + struct rte_mempool *kni_pktmbuf_pool; + struct eth_params *eth_params; + struct protocol_stack *stacks[PROTOCOL_STACK_MAX]; +@@ -82,7 +83,6 @@ struct wakeup_poll { + }; + + long get_stack_tid(void); +-pthread_mutex_t *get_mem_mutex(void); + struct protocol_stack *get_protocol_stack(void); + struct protocol_stack *get_protocol_stack_by_fd(int32_t fd); + struct protocol_stack *get_minconn_protocol_stack(void); +@@ -92,6 +92,8 @@ int32_t init_protocol_stack(void); + int32_t bind_to_stack_numa(struct protocol_stack *stack); + int32_t init_dpdk_ethdev(void); + ++void wait_sem_value(sem_t *sem, int32_t wait_value); ++ + /* any protocol stack thread receives arp packet and sync it to other threads so that it can have the arp table */ + void stack_broadcast_arp(struct rte_mbuf *mbuf, struct protocol_stack *cur_stack); + +-- +2.23.0 + diff --git a/0046-clean-code-fix-huge-func.patch b/0046-clean-code-fix-huge-func.patch new file mode 100644 index 0000000..77a0144 --- /dev/null +++ b/0046-clean-code-fix-huge-func.patch @@ -0,0 +1,356 @@ +From 77a8fb02a7c4352fee106d0aa83500d81c20d315 Mon Sep 17 00:00:00 2001 +From: wuchangsheng +Date: Thu, 21 Apr 2022 16:42:01 +0800 +Subject: [PATCH 07/18] clean code:fix huge func + +--- + src/lstack/core/lstack_control_plane.c | 2 +- + src/lstack/core/lstack_init.c | 101 +++++++++++++----------- + src/lstack/core/lstack_protocol_stack.c | 89 +++++++++++---------- + 3 files changed, 105 insertions(+), 87 deletions(-) + +diff --git a/src/lstack/core/lstack_control_plane.c b/src/lstack/core/lstack_control_plane.c +index 13a2ed3..26a1b1c 100644 +--- a/src/lstack/core/lstack_control_plane.c ++++ b/src/lstack/core/lstack_control_plane.c +@@ -705,7 +705,7 @@ void control_server_thread(void *arg) + + int32_t epfd = init_epoll(listenfd); + if (epfd < 0) { +- LSTACK_LOG(ERR, LSTACK, "control_init_server failed\n"); ++ LSTACK_LOG(ERR, LSTACK, "init_epoll failed\n"); + return; + } + +diff --git a/src/lstack/core/lstack_init.c b/src/lstack/core/lstack_init.c +index 335d834..037b8fd 100644 +--- a/src/lstack/core/lstack_init.c ++++ b/src/lstack/core/lstack_init.c +@@ -155,12 +155,52 @@ __attribute__((destructor)) void gazelle_network_exit(void) + } + } + +-__attribute__((constructor)) void gazelle_network_init(void) ++static void create_control_thread(void) + { + int32_t ret; + ++ pthread_t tid; ++ if (use_ltran()) { ++ dpdk_skip_nic_init(); ++ if (control_init_client(false) != 0) { ++ LSTACK_EXIT(1, "control_init_client failed\n"); ++ } ++ ret = pthread_create(&tid, NULL, (void *(*)(void *))control_client_thread, NULL); ++ } else { ++ ret = dpdk_eal_init(); ++ if (ret < 0) { ++ LSTACK_EXIT(1, "dpdk_eal_init failed ret=%d errno=%d\n", ret, errno); ++ } ++ ++ ret = pthread_create(&tid, NULL, (void *(*)(void *))control_server_thread, NULL); ++ } ++ if (ret != 0) { ++ LSTACK_EXIT(1, "pthread_create failed ret=%d errno=%d\n", ret, errno); ++ } ++ ++ if (pthread_setname_np(tid, CONTROL_THREAD_NAME) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "pthread_setname_np failed errno=%d\n", errno); ++ } ++ LSTACK_LOG(INFO, LSTACK, "create control_easy_thread success\n"); ++} ++ ++static void gazelle_signal_init(void) ++{ ++ /* to prevent crash , just ignore SIGPIPE when socket is closed */ ++ if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { ++ LSTACK_PRE_LOG(LSTACK_ERR, "signal error, 
errno:%d.", errno); ++ LSTACK_EXIT(1, "signal SIGPIPE SIG_IGN\n"); ++ } ++ ++ /* ++ * register core sig handler func to dumped stack */ ++ lstack_signal_init(); ++} ++ ++__attribute__((constructor)) void gazelle_network_init(void) ++{ + /* +- * Phase 1: Init POSXI API and prelog */ ++ * Init POSXI API and prelog */ + lstack_prelog_init("LSTACK"); + if (posix_api_init() != 0) { + LSTACK_PRE_LOG(LSTACK_ERR, "posix_api_init failed\n"); +@@ -168,7 +208,7 @@ __attribute__((constructor)) void gazelle_network_init(void) + } + + /* +- * Phase 2: Init LD_PRELOAD */ ++ * Init LD_PRELOAD */ + if (preload_info_init() < 0) { + return; + } +@@ -177,7 +217,7 @@ __attribute__((constructor)) void gazelle_network_init(void) + } + + /* +- * Phase 3: Read configure from lstack.cfg */ ++ * Read configure from lstack.cfg */ + if (cfg_init() != 0) { + LSTACK_PRE_LOG(LSTACK_ERR, "cfg_init failed\n"); + LSTACK_EXIT(1, "cfg_init failed\n"); +@@ -185,87 +225,56 @@ __attribute__((constructor)) void gazelle_network_init(void) + LSTACK_PRE_LOG(LSTACK_INFO, "cfg_init success\n"); + + /* +- * Phase 4: check conflict */ ++ * check conflict */ + if (check_process_conflict() < 0) { + LSTACK_PRE_LOG(LSTACK_INFO, "Have another same primary process. WARNING: Posix API will use kernel mode!\n"); + return; + } + + /* +- * Phase 5: save initial affinity */ ++ * save initial affinity */ + if (thread_affinity_default() < 0) { + LSTACK_PRE_LOG(LSTACK_ERR, "pthread_getaffinity_np failed\n"); + LSTACK_EXIT(1, "pthread_getaffinity_np failed\n"); + } + +- /* to prevent crash , just ignore SIGPIPE when socket is closed */ +- if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { +- LSTACK_PRE_LOG(LSTACK_ERR, "signal error, errno:%d.", errno); +- LSTACK_EXIT(1, "signal SIGPIPE SIG_IGN\n"); +- } ++ gazelle_signal_init(); + + /* +- * Phase 6: Init control plane and dpdk init */ +- pthread_t tid; +- if (use_ltran()) { +- dpdk_skip_nic_init(); +- if (control_init_client(false) != 0) { +- LSTACK_EXIT(1, "control_init_client failed\n"); +- } +- ret = pthread_create(&tid, NULL, (void *(*)(void *))control_client_thread, NULL); +- } else { +- ret = dpdk_eal_init(); +- if (ret < 0) { +- LSTACK_EXIT(1, "dpdk_eal_init failed ret=%d errno=%d\n", ret, errno); +- } +- +- ret = pthread_create(&tid, NULL, (void *(*)(void *))control_server_thread, NULL); +- } +- if (ret != 0) { +- LSTACK_EXIT(1, "pthread_create failed errno=%d\n", errno); +- } +- if (pthread_setname_np(tid, CONTROL_THREAD_NAME) != 0) { +- LSTACK_LOG(ERR, LSTACK, "pthread_setname_np failed errno=%d\n", errno); +- } +- LSTACK_LOG(INFO, LSTACK, "create control_easy_thread success\n"); ++ * Init control plane and dpdk init */ ++ create_control_thread(); + + /* +- * Phase 7: cancel the core binding from DPDK initialization */ ++ * cancel the core binding from DPDK initialization */ + if (thread_affinity_default() < 0) { + LSTACK_EXIT(1, "pthread_setaffinity_np failed\n"); + } + + lstack_log_level_init(); ++ lstack_prelog_uninit(); + +- ret = init_protocol_stack(); +- if (ret != 0) { ++ if (init_protocol_stack() != 0) { + LSTACK_EXIT(1, "init_protocol_stack failed\n"); + } + + /* +- * Phase 8: nic */ ++ * nic */ + if (!use_ltran()) { +- ret = init_dpdk_ethdev(); +- if (ret != 0) { ++ if (init_dpdk_ethdev() != 0) { + LSTACK_EXIT(1, "init_dpdk_ethdev failed\n"); + } + } + + /* +- * Phase 9: lwip initialization */ ++ * lwip initialization */ + lwip_sock_init(); + +- /* +- * Phase 10: register core sig handler func to dumped stack */ +- lstack_signal_init(); +- + /* wait stack thread and kernel_event thread 
init finish */ + wait_sem_value(&get_protocol_stack_group()->all_init, get_protocol_stack_group()->stack_num); + if (g_init_fail) { + LSTACK_EXIT(1, "stack thread or kernel_event thread failed\n"); + } + +- lstack_prelog_uninit(); + posix_api->is_chld = 0; + LSTACK_LOG(INFO, LSTACK, "gazelle_network_init success\n"); + rte_smp_mb(); +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index 8f0b785..565d19b 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -217,7 +217,7 @@ static void* gazelle_weakup_thread(void *arg) + return NULL; + } + +-static void init_stack_value(struct protocol_stack *stack, uint16_t queue_id) ++static int32_t init_stack_value(struct protocol_stack *stack, uint16_t queue_id) + { + struct protocol_stack_group *stack_group = get_protocol_stack_group(); + +@@ -241,6 +241,31 @@ static void init_stack_value(struct protocol_stack *stack, uint16_t queue_id) + stack_stat_init(); + + stack_group->stacks[queue_id] = stack; ++ ++ cpu_set_t cpuset; ++ CPU_ZERO(&cpuset); ++ CPU_SET(stack->cpu_id, &cpuset); ++ if (rte_thread_set_affinity(&cpuset) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "rte_thread_set_affinity failed\n"); ++ return -1; ++ } ++ RTE_PER_LCORE(_lcore_id) = stack->cpu_id; ++ ++ stack->socket_id = numa_node_of_cpu(stack->cpu_id); ++ if (stack->socket_id < 0) { ++ LSTACK_LOG(ERR, LSTACK, "numa_node_of_cpu failed\n"); ++ return -1; ++ } ++ ++ if (pktmbuf_pool_init(stack, stack_group->stack_num) != 0) { ++ return -1; ++ } ++ ++ if (create_shared_ring(stack) != 0) { ++ return -1; ++ } ++ ++ return 0; + } + + void wait_sem_value(sem_t *sem, int32_t wait_value) +@@ -260,33 +285,8 @@ static struct protocol_stack * stack_thread_init(uint16_t queue_id) + LSTACK_LOG(ERR, LSTACK, "malloc stack failed\n"); + return NULL; + } +- init_stack_value(stack, queue_id); +- +- cpu_set_t cpuset; +- CPU_ZERO(&cpuset); +- CPU_SET(stack->cpu_id, &cpuset); +- if (rte_thread_set_affinity(&cpuset) != 0) { +- LSTACK_LOG(ERR, LSTACK, "rte_thread_set_affinity failed\n"); +- free(stack); +- return NULL; +- } +- RTE_PER_LCORE(_lcore_id) = stack->cpu_id; +- +- stack->socket_id = numa_node_of_cpu(stack->cpu_id); +- if (stack->socket_id < 0) { +- LSTACK_LOG(ERR, LSTACK, "numa_node_of_cpu failed\n"); +- free(stack); +- return NULL; +- } + +- int32_t ret = pktmbuf_pool_init(stack, stack_group->stack_num); +- if (ret != 0) { +- free(stack); +- return NULL; +- } +- +- ret = create_shared_ring(stack); +- if (ret != 0) { ++ if (init_stack_value(stack, queue_id) != 0) { + free(stack); + return NULL; + } +@@ -298,8 +298,7 @@ static struct protocol_stack * stack_thread_init(uint16_t queue_id) + tcpip_init(NULL, NULL); + + if (use_ltran()) { +- ret = client_reg_thrd_ring(); +- if (ret != 0) { ++ if (client_reg_thrd_ring() != 0) { + free(stack); + return NULL; + } +@@ -311,15 +310,13 @@ static struct protocol_stack * stack_thread_init(uint16_t queue_id) + wait_sem_value(&stack_group->ethdev_init, 1); + } + +- +- ret = ethdev_init(stack); +- if (ret != 0) { ++ if (ethdev_init(stack) != 0) { + free(stack); + return NULL; + } + + if (stack_group->wakeup_enable) { +- ret = create_thread(stack->queue_id, "gazelleweakup", gazelle_weakup_thread); ++ int32_t ret = create_thread(stack->queue_id, "gazelleweakup", gazelle_weakup_thread); + if (ret != 0) { + free(stack); + return NULL; +@@ -363,25 +360,37 @@ static void* gazelle_stack_thread(void *arg) + return NULL; + } + +-int32_t init_protocol_stack(void) ++static int32_t 
init_protocol_sem(void) + { +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); + int32_t ret; +- +- stack_group->stack_num = get_global_cfg_params()->num_cpu; +- stack_group->wakeup_enable = (get_global_cfg_params()->num_wakeup > 0) ? true : false; ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); + + if (!use_ltran()) { + ret = sem_init(&stack_group->ethdev_init, 0, 0); + if (ret < 0) { +- LSTACK_LOG(ERR, PORT, "sem_init failed\n"); ++ LSTACK_LOG(ERR, PORT, "sem_init failed ret=%d errno=%d\n", ret, errno); + return -1; + } + } + + ret = sem_init(&stack_group->thread_phase1, 0, 0); + if (ret < 0) { +- LSTACK_LOG(ERR, PORT, "sem_init failed\n"); ++ LSTACK_LOG(ERR, PORT, "sem_init failed ret=%d errno=%d\n", ret, errno); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++int32_t init_protocol_stack(void) ++{ ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ int32_t ret; ++ ++ stack_group->stack_num = get_global_cfg_params()->num_cpu; ++ stack_group->wakeup_enable = (get_global_cfg_params()->num_wakeup > 0) ? true : false; ++ ++ if (init_protocol_sem() != 0) { + return -1; + } + +-- +2.23.0 + diff --git a/0047-add-kernel-path-in-epoll-funcs.patch b/0047-add-kernel-path-in-epoll-funcs.patch new file mode 100644 index 0000000..041e448 --- /dev/null +++ b/0047-add-kernel-path-in-epoll-funcs.patch @@ -0,0 +1,58 @@ +From 17fa541e456acccae61669fcc403b44e4aabb2d5 Mon Sep 17 00:00:00 2001 +From: wuchangsheng +Date: Thu, 21 Apr 2022 16:59:44 +0800 +Subject: [PATCH 08/18] add kernel path in epoll funcs + +--- + src/lstack/api/lstack_wrap.c | 21 ++++++++++++++++++--- + 1 file changed, 18 insertions(+), 3 deletions(-) + +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index 0164069..f623da3 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -77,6 +77,15 @@ static inline enum KERNEL_LWIP_PATH select_path(int fd) + + static inline int32_t do_epoll_create(int32_t size) + { ++ if (posix_api == NULL) { ++ /* link liblstack.so using LD_PRELOAD mode will read liblstack.so, ++ poisx_api need to be initialized here */ ++ if (posix_api_init() != 0) { ++ LSTACK_PRE_LOG(LSTACK_ERR, "posix_api_init failed\n"); ++ } ++ return posix_api->epoll_create_fn(size); ++ } ++ + if (unlikely(posix_api->is_chld)) { + return posix_api->epoll_create_fn(size); + } +@@ -90,16 +99,22 @@ static inline int32_t do_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct + return posix_api->epoll_ctl_fn(epfd, op, fd, event); + } + ++ struct lwip_sock *sock = get_socket_by_fd(epfd); ++ if (sock == NULL || sock->wakeup == NULL) { ++ return posix_api->epoll_ctl_fn(epfd, op, fd, event); ++ } ++ + return lstack_epoll_ctl(epfd, op, fd, event); + } + + static inline int32_t do_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout) + { +- if (events == NULL || maxevents == 0) { +- GAZELLE_RETURN(EINVAL); ++ if (unlikely(posix_api->is_chld)) { ++ return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); + } + +- if (unlikely(posix_api->is_chld)) { ++ struct lwip_sock *sock = get_socket_by_fd(epfd); ++ if (sock == NULL || sock->wakeup == NULL) { + return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); + } + +-- +2.23.0 + diff --git a/0048-refactor-kernel-event-poll-epoll.patch b/0048-refactor-kernel-event-poll-epoll.patch new file mode 100644 index 0000000..5580bef --- /dev/null +++ b/0048-refactor-kernel-event-poll-epoll.patch @@ -0,0 +1,836 @@ +From 
a74d5b38b2021397d13b13aaa30f41f69be6f475 Mon Sep 17 00:00:00 2001 +From: wuchangsheng +Date: Thu, 21 Apr 2022 17:21:59 +0800 +Subject: [PATCH 09/18] refactor kernel event poll/epoll + +--- + src/lstack/api/lstack_epoll.c | 343 +++++++++++++++------ + src/lstack/api/lstack_wrap.c | 21 +- + src/lstack/core/lstack_dpdk.c | 4 +- + src/lstack/core/lstack_init.c | 2 +- + src/lstack/core/lstack_lwip.c | 1 + + src/lstack/core/lstack_protocol_stack.c | 85 ++++- + src/lstack/include/lstack_cfg.h | 1 - + src/lstack/include/lstack_protocol_stack.h | 8 +- + src/lstack/include/posix/lstack_epoll.h | 24 ++ + 9 files changed, 350 insertions(+), 139 deletions(-) + +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index b8d53f6..cba67ea 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -32,10 +33,14 @@ + #include "gazelle_base_func.h" + #include "lstack_lwip.h" + #include "lstack_protocol_stack.h" ++#include "posix/lstack_epoll.h" + + #define EPOLL_KERNEL_INTERVAL 10 /* ms */ +-#define EPOLL_NSEC_TO_SEC 1000000000 ++#define SEC_TO_NSEC 1000000000 ++#define SEC_TO_MSEC 1000 ++#define MSEC_TO_NSEC 1000000 + #define EPOLL_MAX_EVENTS 512 ++#define POLL_KERNEL_EVENTS 32 + + static PER_THREAD struct wakeup_poll g_wakeup_poll = {0}; + static bool g_use_epoll = false; /* FIXME: when no epoll close prepare event for performance testing */ +@@ -149,12 +154,12 @@ int32_t lstack_epoll_create(int32_t size) + posix_api->close_fn(fd); + GAZELLE_RETURN(EINVAL); + } +- + memset_s(wakeup, sizeof(struct wakeup_poll), 0, sizeof(struct wakeup_poll)); +- sem_init(&wakeup->event_sem, 0, 0); + +- sock->wakeup = wakeup; + init_list_node(&wakeup->event_list); ++ wakeup->epollfd = fd; ++ sem_init(&wakeup->event_sem, 0, 0); ++ sock->wakeup = wakeup; + + g_use_epoll = true; + return fd; +@@ -162,6 +167,8 @@ int32_t lstack_epoll_create(int32_t size) + + int32_t lstack_epoll_close(int32_t fd) + { ++ posix_api->close_fn(fd); ++ + struct lwip_sock *sock = get_socket_by_fd(fd); + if (sock == NULL) { + LSTACK_LOG(ERR, LSTACK, "fd=%d sock is NULL errno=%d\n", fd, errno); +@@ -176,6 +183,43 @@ int32_t lstack_epoll_close(int32_t fd) + return 0; + } + ++static uint16_t find_max_cnt_stack(int32_t *stack_count, uint16_t stack_num, struct protocol_stack *last_stack) ++{ ++ uint16_t max_index = 0; ++ bool all_same_cnt = true; ++ ++ for (uint16_t i = 1; i < stack_num; i++) { ++ if (stack_count[i] != stack_count[0]) { ++ all_same_cnt = false; ++ } ++ ++ if (stack_count[i] > stack_count[max_index]) { ++ max_index = i; ++ } ++ } ++ ++ /* all stack same, don't change */ ++ if (all_same_cnt && last_stack) { ++ return last_stack->queue_id; ++ } ++ ++ /* first bind and all stack same. 
choice tick as queue_id, avoid all bind to statck_0.*/ ++ static uint16_t tick = 0; ++ if (all_same_cnt && stack_num) { ++ max_index = atomic_fetch_add(&tick, 1) % stack_num; ++ } ++ ++ return max_index; ++} ++ ++static void update_epoll_max_stack(struct wakeup_poll *wakeup) ++{ ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ uint16_t bind_id = find_max_cnt_stack(wakeup->stack_fd_cnt, stack_group->stack_num, wakeup->max_stack); ++ ++ wakeup->max_stack = stack_group->stacks[bind_id]; ++} ++ + int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event *event) + { + LSTACK_LOG(DEBUG, LSTACK, "op=%d events: fd: %d\n", op, fd); +@@ -185,35 +229,38 @@ int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_even + GAZELLE_RETURN(EINVAL); + } + ++ struct lwip_sock *epoll_sock = get_socket_by_fd(epfd); ++ if (epoll_sock == NULL || epoll_sock->wakeup == NULL) { ++ return posix_api->epoll_ctl_fn(epfd, op, fd, event); ++ } ++ + struct lwip_sock *sock = get_socket(fd); + if (sock == NULL) { ++ epoll_sock->wakeup->have_kernel_fd = true; + return posix_api->epoll_ctl_fn(epfd, op, fd, event); + } + + if (CONN_TYPE_HAS_HOST(sock->conn)) { ++ epoll_sock->wakeup->have_kernel_fd = true; + int32_t ret = posix_api->epoll_ctl_fn(epfd, op, fd, event); + if (ret < 0) { + return ret; + } + } + +- struct lwip_sock *epoll_sock = get_socket_by_fd(epfd); +- if (epoll_sock == NULL || epoll_sock->wakeup == NULL) { +- LSTACK_LOG(ERR, LSTACK, "epfd=%d\n", fd); +- GAZELLE_RETURN(EINVAL); +- } +- +- uint32_t events = event->events | EPOLLERR | EPOLLHUP; + do { + switch (op) { + case EPOLL_CTL_ADD: + sock->wakeup = epoll_sock->wakeup; ++ if (sock->stack) { ++ epoll_sock->wakeup->stack_fd_cnt[sock->stack->queue_id]++; ++ } + if (list_is_empty(&sock->event_list)) { + list_add_node(&sock->wakeup->event_list, &sock->event_list); + } + /* fall through */ + case EPOLL_CTL_MOD: +- sock->epoll_events = events; ++ sock->epoll_events = event->events | EPOLLERR | EPOLLHUP; + sock->ep_data = event->data; + if (sock->conn && NETCONNTYPE_GROUP(netconn_type(sock->conn)) == NETCONN_TCP) { + raise_pending_events(sock); +@@ -222,6 +269,9 @@ int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_even + case EPOLL_CTL_DEL: + list_del_node_init(&sock->event_list); + sock->epoll_events = 0; ++ if (sock->stack) { ++ epoll_sock->wakeup->stack_fd_cnt[sock->stack->queue_id]--; ++ } + break; + default: + GAZELLE_RETURN(EINVAL); +@@ -230,6 +280,7 @@ int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_even + sock = get_socket(fd); + } while (fd > 0 && sock != NULL); + ++ update_epoll_max_stack(epoll_sock->wakeup); + return 0; + } + +@@ -346,129 +397,196 @@ static int32_t poll_lwip_event(struct pollfd *fds, nfds_t nfds) + return event_num; + } + +-static inline bool have_kernel_fd(int32_t epfd, struct pollfd *fds, nfds_t nfds) ++static void ms_to_timespec(struct timespec *timespec, int32_t timeout) + { +- /* when epfd > 0 is epoll type */ +- for (uint32_t i = 0; i < nfds && epfd < 0; i++) { +- if (get_socket(fds[i].fd) == NULL) { +- return true; ++ clock_gettime(CLOCK_REALTIME, timespec); ++ timespec->tv_sec += timeout / SEC_TO_MSEC; ++ timespec->tv_nsec += (timeout % SEC_TO_MSEC) * MSEC_TO_NSEC; ++ timespec->tv_sec += timespec->tv_nsec / SEC_TO_NSEC; ++ timespec->tv_nsec = timespec->tv_nsec % SEC_TO_NSEC; ++} ++ ++static void change_epollfd_kernel_thread(struct wakeup_poll *wakeup, struct protocol_stack *old_stack, ++ struct protocol_stack 
*new_stack) ++{ ++ if (old_stack) { ++ if (posix_api->epoll_ctl_fn(old_stack->epollfd, EPOLL_CTL_DEL, wakeup->epollfd, NULL) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); + } + } + +- return false; ++ /* avoid kernel thread post too much, use EPOLLET */ ++ struct epoll_event event; ++ event.data.ptr = &wakeup->event_sem; ++ event.events = EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLET; ++ if (posix_api->epoll_ctl_fn(new_stack->epollfd, EPOLL_CTL_ADD, wakeup->epollfd, &event) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); ++ } + } + +-static inline int32_t poll_kernel_event(struct pollfd *fds, nfds_t nfds) ++static void epoll_bind_statck(struct wakeup_poll *wakeup) + { +- int32_t event_num = 0; +- +- for (uint32_t i = 0; i < nfds; i++) { +- /* lwip event */ +- if (get_socket(fds[i].fd) != NULL || fds[i].fd < 0) { +- continue; +- } +- +- int32_t ret = posix_api->poll_fn(&fds[i], 1, 0); +- if (ret < 0) { +- if (errno != EINTR) { +- return ret; +- } +- } else { +- event_num += ret; +- } ++ /* all fd is kernel, set rand stack */ ++ if (wakeup->bind_stack == NULL && wakeup->max_stack== NULL) { ++ update_epoll_max_stack(wakeup); + } + +- return event_num; ++ if (wakeup->bind_stack != wakeup->max_stack && wakeup->max_stack) { ++ bind_to_stack_numa(wakeup->max_stack); ++ change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, wakeup->max_stack); ++ wakeup->bind_stack = wakeup->max_stack; ++ } + } + +-static int32_t get_event(struct wakeup_poll *wakeup, int32_t epfd, void *out, int32_t maxevents, int32_t timeout) ++int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout) + { +- struct pollfd *fds = (struct pollfd *)out; +- struct epoll_event *events = (struct epoll_event *)out; +- bool have_kernel = have_kernel_fd(epfd, fds, maxevents); ++ struct lwip_sock *sock = get_socket_by_fd(epfd); ++ if (sock == NULL || sock->wakeup == NULL) { ++ return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); ++ } ++ + int32_t event_num = 0; +- int32_t poll_time = 0; + int32_t ret; + +- /* when epfd > 0 is epoll type */ ++ struct timespec epoll_time; ++ if (timeout >= 0) { ++ ms_to_timespec(&epoll_time, timeout); ++ } ++ ++ epoll_bind_statck(sock->wakeup); ++ + do { +- event_num += (epfd > 0) ? epoll_lwip_event(wakeup, &events[event_num], maxevents - event_num) : +- poll_lwip_event(fds, maxevents); +- +- if (have_kernel) { +- int32_t event_kernel_num = (epfd > 0) ? +- posix_api->epoll_wait_fn(epfd, &events[event_num], maxevents - event_num, 0) : +- poll_kernel_event(fds, maxevents); +- if (event_kernel_num < 0) { +- return event_kernel_num; +- } +- event_num += event_kernel_num; +- if (timeout >= 0 && poll_time >= timeout) { +- break; +- } +- poll_time += EPOLL_KERNEL_INTERVAL; ++ event_num += epoll_lwip_event(sock->wakeup, &events[event_num], maxevents - event_num); ++ ++ if (sock->wakeup->have_kernel_fd) { ++ event_num += posix_api->epoll_wait_fn(epfd, &events[event_num], maxevents - event_num, 0); + } + + if (event_num > 0) { + break; + } + +- int32_t interval = (have_kernel) ? 
EPOLL_KERNEL_INTERVAL : timeout; +- struct timespec epoll_interval; +- clock_gettime(CLOCK_REALTIME, &epoll_interval); +- epoll_interval.tv_sec += interval / 1000; +- epoll_interval.tv_nsec += (interval % 1000) * 1000000; +- epoll_interval.tv_sec += epoll_interval.tv_nsec / 1000000000; +- epoll_interval.tv_nsec = epoll_interval.tv_nsec % 1000000000; +- +- if (timeout < 0 && !have_kernel) { +- ret = sem_wait(&wakeup->event_sem); ++ if (timeout < 0) { ++ ret = sem_wait(&sock->wakeup->event_sem); + } else { +- ret = sem_timedwait(&wakeup->event_sem, &epoll_interval); ++ ret = sem_timedwait(&sock->wakeup->event_sem, &epoll_time); + } +- +- if (!have_kernel && ret < 0) { +- break; +- } +- } while (event_num <= maxevents); ++ } while (ret == 0); + + return event_num; + } + +-int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout) ++static void init_poll_wakeup_data(struct wakeup_poll *wakeup) + { +- /* avoid the starvation of epoll events from both netstack */ +- maxevents = LWIP_MIN(LWIP_EPOOL_MAX_EVENTS, maxevents); ++ sem_init(&wakeup->event_sem, 0, 0); + +- struct lwip_sock *sock = get_socket_by_fd(epfd); +- if (sock == NULL) { +- GAZELLE_RETURN(EINVAL); ++ wakeup->last_fds = calloc(POLL_KERNEL_EVENTS, sizeof(struct pollfd)); ++ if (wakeup->last_fds == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); + } + +- if (sock->wakeup == NULL) { +- return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); ++ wakeup->events = calloc(POLL_KERNEL_EVENTS, sizeof(struct epoll_event)); ++ if (wakeup->events == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); + } + +- return get_event(sock->wakeup, epfd, events, maxevents, timeout); ++ wakeup->last_max_nfds = POLL_KERNEL_EVENTS; ++ ++ wakeup->epollfd = posix_api->epoll_create_fn(POLL_KERNEL_EVENTS); ++ if (wakeup->epollfd < 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_create_fn errno=%d\n", errno); ++ } + } + +-static void poll_init(struct pollfd *fds, nfds_t nfds, struct wakeup_poll *wakeup) ++static void resize_kernel_poll(struct wakeup_poll *wakeup, nfds_t nfds) + { +- int32_t stack_count[PROTOCOL_STACK_MAX] = {0}; ++ wakeup->last_fds = realloc(wakeup->last_fds, nfds * sizeof(struct pollfd)); ++ if (wakeup->last_fds == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); ++ } ++ ++ wakeup->events = realloc(wakeup->events, nfds * sizeof(struct epoll_event)); ++ if (wakeup->events == NULL) { ++ LSTACK_LOG(ERR, LSTACK, "calloc failed errno=%d\n", errno); ++ } ++ ++ wakeup->last_max_nfds = nfds; ++ memset_s(wakeup->last_fds, nfds * sizeof(struct pollfd), 0, nfds * sizeof(struct pollfd)); ++} ++ ++static void poll_bind_statck(struct wakeup_poll *wakeup, int32_t *stack_count) ++{ ++ struct protocol_stack_group *stack_group = get_protocol_stack_group(); ++ uint16_t bind_id = find_max_cnt_stack(stack_count, stack_group->stack_num, wakeup->bind_stack); ++ ++ if (wakeup->bind_stack && wakeup->bind_stack->queue_id == bind_id) { ++ return; ++ } + ++ change_epollfd_kernel_thread(wakeup, wakeup->bind_stack, stack_group->stacks[bind_id]); ++ bind_to_stack_numa(stack_group->stacks[bind_id]); ++ wakeup->bind_stack = stack_group->stacks[bind_id]; ++} ++ ++static void update_kernel_poll(struct wakeup_poll *wakeup, uint32_t index, struct pollfd *new_fd) ++{ ++ posix_api->epoll_ctl_fn(wakeup->epollfd, EPOLL_CTL_DEL, wakeup->last_fds[index].fd, NULL); ++ ++ if (new_fd == NULL) { ++ return; ++ } ++ ++ struct epoll_event event; ++ event.data.u32 = index; ++ 
event.events = new_fd->events; ++ if (posix_api->epoll_ctl_fn(wakeup->epollfd, EPOLL_CTL_ADD, new_fd->fd, &event) != 0) { ++ LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); ++ } ++ ++ wakeup->last_fds[index].fd = new_fd->fd; ++ wakeup->last_fds[index].events = new_fd->events; ++ ++ wakeup->have_kernel_fd = true; ++} ++ ++static void poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfds) ++{ + if (!wakeup->init) { + wakeup->init = true; +- sem_init(&wakeup->event_sem, 0, 0); ++ init_poll_wakeup_data(wakeup); + } else { + while (sem_trywait(&wakeup->event_sem) == 0) {} + } + ++ if (nfds > wakeup->last_max_nfds) { ++ resize_kernel_poll(wakeup, nfds); ++ } ++ ++ int32_t stack_count[PROTOCOL_STACK_MAX] = {0}; ++ int32_t poll_change = 0; ++ ++ /* poll fds num less, del old fd */ ++ for (uint32_t i = nfds; i < wakeup->last_nfds; i++) { ++ update_kernel_poll(wakeup, i, NULL); ++ poll_change = 1; ++ } ++ + for (uint32_t i = 0; i < nfds; i++) { +- int32_t fd = fds[i].fd; + fds[i].revents = 0; + ++ if (fds[i].fd == wakeup->last_fds[i].fd && fds[i].events == wakeup->last_fds[i].events) { ++ continue; ++ } ++ poll_change = 1; ++ ++ int32_t fd = fds[i].fd; ++ struct lwip_sock *sock = get_socket(fd); ++ if (sock == NULL || CONN_TYPE_HAS_HOST(sock->conn)) { ++ update_kernel_poll(wakeup, i, fds + i); ++ } ++ + do { +- struct lwip_sock *sock = get_socket(fd); ++ sock = get_socket(fd); + if (sock == NULL || sock->conn == NULL) { + break; + } +@@ -481,25 +599,50 @@ static void poll_init(struct pollfd *fds, nfds_t nfds, struct wakeup_poll *wakeu + } while (fd > 0); + } + +- if (wakeup->bind_stack) { ++ wakeup->last_nfds = nfds; ++ if (poll_change == 0) { + return; + } + +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- uint32_t bind_id = 0; +- for (uint32_t i = 0; i < stack_group->stack_num; i++) { +- if (stack_count[i] > stack_count[bind_id]) { +- bind_id = i; +- } +- } +- +- bind_to_stack_numa(stack_group->stacks[bind_id]); +- wakeup->bind_stack = stack_group->stacks[bind_id]; ++ poll_bind_statck(wakeup, stack_count); + } + + int32_t lstack_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) + { +- poll_init(fds, nfds, &g_wakeup_poll); ++ poll_init(&g_wakeup_poll, fds, nfds); + +- return get_event(&g_wakeup_poll, -1, fds, nfds, timeout); ++ int32_t event_num = 0; ++ int32_t ret; ++ ++ struct timespec poll_time; ++ if (timeout >= 0) { ++ ms_to_timespec(&poll_time, timeout); ++ } ++ ++ /* when epfd > 0 is epoll type */ ++ do { ++ event_num += poll_lwip_event(fds, nfds); ++ ++ /* reduce syscall epoll_wait */ ++ if (g_wakeup_poll.have_kernel_fd) { ++ int32_t kernel_num = posix_api->epoll_wait_fn(g_wakeup_poll.epollfd, g_wakeup_poll.events, nfds, 0); ++ for (int32_t i = 0; i < kernel_num; i++) { ++ uint32_t index = g_wakeup_poll.events[i].data.u32; ++ fds[index].revents = g_wakeup_poll.events[i].events; ++ } ++ event_num += kernel_num >= 0 ? 
kernel_num : 0; ++ } ++ ++ if (event_num > 0) { ++ break; ++ } ++ ++ if (timeout < 0) { ++ ret = sem_wait(&g_wakeup_poll.event_sem); ++ } else { ++ ret = sem_timedwait(&g_wakeup_poll.event_sem, &poll_time); ++ } ++ } while (ret == 0); ++ ++ return event_num; + } +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index f623da3..bf5dcb4 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -45,8 +45,7 @@ enum KERNEL_LWIP_PATH { + static inline enum KERNEL_LWIP_PATH select_path(int fd) + { + if (posix_api == NULL) { +- /* link liblstack.so using LD_PRELOAD mode will read liblstack.so, +- poisx_api need to be initialized here */ ++ /* posix api maybe call before gazelle init */ + if (posix_api_init() != 0) { + LSTACK_PRE_LOG(LSTACK_ERR, "posix_api_init failed\n"); + } +@@ -78,8 +77,7 @@ static inline enum KERNEL_LWIP_PATH select_path(int fd) + static inline int32_t do_epoll_create(int32_t size) + { + if (posix_api == NULL) { +- /* link liblstack.so using LD_PRELOAD mode will read liblstack.so, +- poisx_api need to be initialized here */ ++ /* posix api maybe call before gazelle init */ + if (posix_api_init() != 0) { + LSTACK_PRE_LOG(LSTACK_ERR, "posix_api_init failed\n"); + } +@@ -99,11 +97,6 @@ static inline int32_t do_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct + return posix_api->epoll_ctl_fn(epfd, op, fd, event); + } + +- struct lwip_sock *sock = get_socket_by_fd(epfd); +- if (sock == NULL || sock->wakeup == NULL) { +- return posix_api->epoll_ctl_fn(epfd, op, fd, event); +- } +- + return lstack_epoll_ctl(epfd, op, fd, event); + } + +@@ -113,11 +106,6 @@ static inline int32_t do_epoll_wait(int32_t epfd, struct epoll_event* events, in + return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); + } + +- struct lwip_sock *sock = get_socket_by_fd(epfd); +- if (sock == NULL || sock->wakeup == NULL) { +- return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); +- } +- + if (epfd < 0) { + GAZELLE_RETURN(EBADF); + } +@@ -362,6 +350,11 @@ static inline ssize_t do_sendmsg(int32_t s, const struct msghdr *message, int32_ + + static inline int32_t do_close(int32_t s) + { ++ struct lwip_sock *sock = get_socket_by_fd(s); ++ if (sock && sock->wakeup && sock->wakeup->epollfd == s) { ++ return lstack_epoll_close(s); ++ } ++ + if (select_path(s) == PATH_KERNEL) { + return posix_api->close_fn(s); + } +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index aa91201..cdd2c05 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -95,8 +95,8 @@ int32_t dpdk_eal_init(void) + if (ret < 0) { + if (rte_errno == EALREADY) { + LSTACK_PRE_LOG(LSTACK_INFO, "rte_eal_init aleady init\n"); +- /* maybe other program inited, merge init param share init */ +- ret = 0; ++ /* maybe other program inited, merge init param share init */ ++ ret = 0; + } + else { + LSTACK_PRE_LOG(LSTACK_ERR, "rte_eal_init failed init, rte_errno %d\n", rte_errno); +diff --git a/src/lstack/core/lstack_init.c b/src/lstack/core/lstack_init.c +index 037b8fd..f8e96bf 100644 +--- a/src/lstack/core/lstack_init.c ++++ b/src/lstack/core/lstack_init.c +@@ -270,7 +270,7 @@ __attribute__((constructor)) void gazelle_network_init(void) + lwip_sock_init(); + + /* wait stack thread and kernel_event thread init finish */ +- wait_sem_value(&get_protocol_stack_group()->all_init, get_protocol_stack_group()->stack_num); ++ wait_sem_value(&get_protocol_stack_group()->all_init, get_protocol_stack_group()->stack_num * 2); + if 
(g_init_fail) { + LSTACK_EXIT(1, "stack thread or kernel_event thread failed\n"); + } +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 00a82fb..8544ef7 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -29,6 +29,7 @@ + #include "lstack_log.h" + #include "lstack_dpdk.h" + #include "lstack_stack_stat.h" ++#include "posix/lstack_epoll.h" + #include "lstack_lwip.h" + + #define HALF_DIVISOR (2) +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index 565d19b..eb975c0 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -35,6 +35,7 @@ + #define READ_LIST_MAX 32 + #define SEND_LIST_MAX 32 + #define HANDLE_RPC_MSG_MAX 32 ++#define KERNEL_EPOLL_MAX 256 + + static PER_THREAD uint16_t g_stack_idx = PROTOCOL_STACK_MAX; + static struct protocol_stack_group g_stack_group = {0}; +@@ -43,9 +44,6 @@ static PER_THREAD long g_stack_tid = 0; + void set_init_fail(void); + typedef void *(*stack_thread_func)(void *arg); + +-#ifdef GAZELLE_USE_EPOLL_EVENT_STACK +-void update_stack_events(struct protocol_stack *stack); +-#endif + + int32_t bind_to_stack_numa(struct protocol_stack *stack) + { +@@ -206,6 +204,10 @@ static void* gazelle_weakup_thread(void *arg) + LSTACK_LOG(INFO, LSTACK, "weakup_%02d start\n", stack->queue_id); + + for (;;) { ++ if (rte_ring_count(stack->wakeup_ring) == 0) { ++ continue; ++ } ++ + sem_t *event_sem; + if (rte_ring_sc_dequeue(stack->wakeup_ring, (void **)&event_sem)) { + continue; +@@ -268,6 +270,61 @@ static int32_t init_stack_value(struct protocol_stack *stack, uint16_t queue_id) + return 0; + } + ++static void* gazelle_kernel_event(void *arg) ++{ ++ uint16_t queue_id = *(uint16_t *)arg; ++ ++ int32_t epoll_fd = posix_api->epoll_create_fn(GAZELLE_LSTACK_MAX_CONN); ++ if (epoll_fd < 0) { ++ LSTACK_LOG(ERR, LSTACK, "queue_id=%d epoll_fd=%d errno=%d\n", queue_id, epoll_fd, errno); ++ /* exit in main thread, avoid create mempool and exit at the same time */ ++ set_init_fail(); ++ sem_post(&get_protocol_stack_group()->all_init); ++ return NULL; ++ } ++ ++ struct protocol_stack *stack = get_protocol_stack_group()->stacks[queue_id]; ++ stack->epollfd = epoll_fd; ++ ++ sem_post(&get_protocol_stack_group()->all_init); ++ LSTACK_LOG(INFO, LSTACK, "kernel_event_%02d start\n", stack->queue_id); ++ ++ struct epoll_event events[KERNEL_EPOLL_MAX]; ++ for (;;) { ++ int32_t event_num = posix_api->epoll_wait_fn(epoll_fd, events, KERNEL_EPOLL_MAX, -1); ++ if (event_num <= 0) { ++ continue; ++ } ++ ++ for (int32_t i = 0; i < event_num; i++) { ++ if (events[i].data.ptr) { ++ sem_post((sem_t *)events[i].data.ptr); ++ } ++ } ++ } ++ ++ return NULL; ++} ++ ++static int32_t create_companion_thread(struct protocol_stack_group *stack_group, struct protocol_stack *stack) ++{ ++ int32_t ret; ++ ++ if (stack_group->wakeup_enable) { ++ ret = create_thread(stack->queue_id, "gazelleweakup", gazelle_weakup_thread); ++ if (ret != 0) { ++ LSTACK_LOG(ERR, LSTACK, "gazelleweakup ret=%d errno=%d\n", ret, errno); ++ return ret; ++ } ++ } ++ ++ ret = create_thread(stack->queue_id, "gazellekernel", gazelle_kernel_event); ++ if (ret != 0) { ++ LSTACK_LOG(ERR, LSTACK, "gazellekernelEvent ret=%d errno=%d\n", ret, errno); ++ } ++ return ret; ++} ++ + void wait_sem_value(sem_t *sem, int32_t wait_value) + { + int32_t sem_val; +@@ -315,12 +372,9 @@ static struct protocol_stack * stack_thread_init(uint16_t queue_id) + return NULL; + } + +- if 
(stack_group->wakeup_enable) { +- int32_t ret = create_thread(stack->queue_id, "gazelleweakup", gazelle_weakup_thread); +- if (ret != 0) { +- free(stack); +- return NULL; +- } ++ if (create_companion_thread(stack_group, stack) != 0) { ++ free(stack); ++ return NULL; + } + + return stack; +@@ -338,6 +392,7 @@ static void* gazelle_stack_thread(void *arg) + LSTACK_LOG(ERR, LSTACK, "stack_thread_init failed queue_id=%d\n", queue_id); + return NULL; + } ++ + sem_post(&get_protocol_stack_group()->all_init); + LSTACK_LOG(INFO, LSTACK, "stack_%02d init success\n", queue_id); + +@@ -351,10 +406,6 @@ static void* gazelle_stack_thread(void *arg) + send_stack_list(stack, SEND_LIST_MAX); + + sys_timer_run(); +- +-#ifdef GAZELLE_USE_EPOLL_EVENT_STACK +- update_stack_events(stack); +-#endif + } + + return NULL; +@@ -378,7 +429,13 @@ static int32_t init_protocol_sem(void) + LSTACK_LOG(ERR, PORT, "sem_init failed ret=%d errno=%d\n", ret, errno); + return -1; + } +- ++ ++ ret = sem_init(&stack_group->all_init, 0, 0); ++ if (ret < 0) { ++ LSTACK_LOG(ERR, PORT, "sem_init failed ret=%d errno=%d\n", ret, errno); ++ return -1; ++ } ++ + return 0; + } + +diff --git a/src/lstack/include/lstack_cfg.h b/src/lstack/include/lstack_cfg.h +index 345a373..987828d 100644 +--- a/src/lstack/include/lstack_cfg.h ++++ b/src/lstack/include/lstack_cfg.h +@@ -33,7 +33,6 @@ + #define LOG_DIR_PATH PATH_MAX + #define LOG_LEVEL_LEN 16 + #define GAZELLE_MAX_NUMA_NODES 8 +-#define LWIP_EPOOL_MAX_EVENTS 512 + + /* Default value of low power mode parameters */ + #define LSTACK_LPM_DETECT_MS_MIN (5 * 1000) +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index 9852878..bc4e4bd 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -51,6 +51,7 @@ struct protocol_stack { + struct list_node send_list; + struct list_node event_list; + pthread_spinlock_t event_lock; ++ int32_t epollfd; /* kernel event thread epoll fd */ + + struct gazelle_stat_pkts stats; + struct gazelle_stack_latency latency; +@@ -75,13 +76,6 @@ struct protocol_stack_group { + uint64_t call_alloc_fail; + }; + +-struct wakeup_poll { +- bool init; +- struct protocol_stack *bind_stack; +- struct list_node event_list; /* epoll temp use poll */ +- sem_t event_sem; +-}; +- + long get_stack_tid(void); + struct protocol_stack *get_protocol_stack(void); + struct protocol_stack *get_protocol_stack_by_fd(int32_t fd); +diff --git a/src/lstack/include/posix/lstack_epoll.h b/src/lstack/include/posix/lstack_epoll.h +index cac640b..a83f41f 100644 +--- a/src/lstack/include/posix/lstack_epoll.h ++++ b/src/lstack/include/posix/lstack_epoll.h +@@ -18,6 +18,30 @@ extern "C" { + #endif + + #include ++#include ++#include ++ ++#include "lstack_protocol_stack.h" ++ ++struct wakeup_poll { ++ bool init; ++ struct protocol_stack *bind_stack; ++ sem_t event_sem; ++ ++ int32_t epollfd; ++ bool have_kernel_fd; ++ ++ /* poll */ ++ struct pollfd *last_fds; ++ nfds_t last_nfds; ++ nfds_t last_max_nfds; ++ struct epoll_event *events; ++ ++ /* epoll */ ++ int32_t stack_fd_cnt[PROTOCOL_STACK_MAX]; ++ struct protocol_stack *max_stack; ++ struct list_node event_list; /* epoll temp use */ ++}; + + int32_t lstack_epoll_create(int32_t size); + int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event *event); +-- +2.23.0 + diff --git a/0049-post-thread_phase1-sem-to-avoid-block-main-thread-wh.patch b/0049-post-thread_phase1-sem-to-avoid-block-main-thread-wh.patch new file mode 100644 
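The next patch hardens the startup handshake used above: the main thread blocks in `wait_sem_value()` until every stack thread and its companion kernel-event thread has posted `all_init` (hence the `stack_num * 2` wait count), and it waits on `thread_phase1` the same way, so every early-return path in `stack_thread_init()` must post the semaphore or the main thread hangs forever. Below is a minimal, self-contained sketch of that pattern; `STACK_NUM`, `thread_body`, and the failure argument are illustrative stand-ins, not code from the patch, though `g_init_fail` mirrors the flag the series actually checks.

```c
/* Sketch of the init handshake: every thread posts the semaphore exactly
 * once, on success and on every failure path alike; the main thread waits
 * for STACK_NUM * 2 posts and only then checks the failure flag. */
#include <pthread.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stdio.h>

#define STACK_NUM 2   /* illustrative: one stack thread + one kernel-event thread each */

static sem_t g_all_init;
static bool g_init_fail = false;

static void *thread_body(void *arg)
{
    if (arg == NULL) {        /* stand-in for a real init failure */
        g_init_fail = true;   /* record the error ...             */
    }
    sem_post(&g_all_init);    /* ... but always post, or main blocks forever */
    return NULL;
}

int main(void)
{
    pthread_t tid[STACK_NUM * 2];

    sem_init(&g_all_init, 0, 0);
    for (int i = 0; i < STACK_NUM * 2; i++) {
        pthread_create(&tid[i], NULL, thread_body, (void *)(long)(i + 1));
    }
    for (int i = 0; i < STACK_NUM * 2; i++) {
        sem_wait(&g_all_init);   /* mirrors wait_sem_value(&all_init, stack_num * 2) */
    }
    printf("init %s\n", g_init_fail ? "failed" : "ok");
    for (int i = 0; i < STACK_NUM * 2; i++) {
        pthread_join(tid[i], NULL);
    }
    return 0;
}
```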
index 0000000..cccd67d --- /dev/null +++ b/0049-post-thread_phase1-sem-to-avoid-block-main-thread-wh.patch @@ -0,0 +1,39 @@ +From e0d26480fc3919affe10b55ccbb5837aa88c5c57 Mon Sep 17 00:00:00 2001 +From: jiangheng +Date: Mon, 16 May 2022 18:34:55 +0800 +Subject: [PATCH 10/18] post thread_phase1 sem to avoid block main thread when + stack error + +--- + src/lstack/core/lstack_protocol_stack.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c +index eb975c0..88513ba 100644 +--- a/src/lstack/core/lstack_protocol_stack.c ++++ b/src/lstack/core/lstack_protocol_stack.c +@@ -339,11 +339,13 @@ static struct protocol_stack * stack_thread_init(uint16_t queue_id) + + struct protocol_stack *stack = malloc(sizeof(*stack)); + if (stack == NULL) { ++ sem_post(&stack_group->thread_phase1); + LSTACK_LOG(ERR, LSTACK, "malloc stack failed\n"); + return NULL; + } + + if (init_stack_value(stack, queue_id) != 0) { ++ sem_post(&stack_group->thread_phase1); + free(stack); + return NULL; + } +@@ -356,6 +358,7 @@ static struct protocol_stack * stack_thread_init(uint16_t queue_id) + + if (use_ltran()) { + if (client_reg_thrd_ring() != 0) { ++ sem_post(&stack_group->thread_phase1); + free(stack); + return NULL; + } +-- +2.23.0 + diff --git a/0050-adjust-the-number-of-RX-TX-mbufs-of-each-stack-threa.patch b/0050-adjust-the-number-of-RX-TX-mbufs-of-each-stack-threa.patch new file mode 100644 index 0000000..7d111bb --- /dev/null +++ b/0050-adjust-the-number-of-RX-TX-mbufs-of-each-stack-threa.patch @@ -0,0 +1,46 @@ +From 98f76a2d2d512338d40cd435b4a75f6989aa13bf Mon Sep 17 00:00:00 2001 +From: jiangheng +Date: Mon, 16 May 2022 18:49:18 +0800 +Subject: [PATCH 11/18] adjust the number of RX/TX mbufs of each stack thread + +--- + src/lstack/core/lstack_dpdk.c | 4 ++-- + src/lstack/include/lstack_dpdk.h | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index cdd2c05..df0332b 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -175,13 +175,13 @@ int32_t pktmbuf_pool_init(struct protocol_stack *stack, uint16_t stack_num) + return -1; + } + +- stack->rx_pktmbuf_pool = create_pktmbuf_mempool("rx_mbuf", RX_NB_MBUF / stack_num, RX_MBUF_CACHE_SZ, ++ stack->rx_pktmbuf_pool = create_pktmbuf_mempool("rx_mbuf", RX_NB_MBUF, RX_MBUF_CACHE_SZ, + stack->queue_id); + if (stack->rx_pktmbuf_pool == NULL) { + return -1; + } + +- stack->tx_pktmbuf_pool = create_pktmbuf_mempool("tx_mbuf", TX_NB_MBUF / stack_num, TX_MBUF_CACHE_SZ, ++ stack->tx_pktmbuf_pool = create_pktmbuf_mempool("tx_mbuf", TX_NB_MBUF, TX_MBUF_CACHE_SZ, + stack->queue_id); + if (stack->tx_pktmbuf_pool == NULL) { + return -1; +diff --git a/src/lstack/include/lstack_dpdk.h b/src/lstack/include/lstack_dpdk.h +index 4295f01..bb9be21 100644 +--- a/src/lstack/include/lstack_dpdk.h ++++ b/src/lstack/include/lstack_dpdk.h +@@ -23,7 +23,7 @@ + #include "dpdk_common.h" + struct protocol_stack; + +-#define RX_NB_MBUF ((5 * MAX_CLIENTS) + (VDEV_RX_QUEUE_SZ * DEFAULT_BACKUP_RING_SIZE_FACTOR)) ++#define RX_NB_MBUF ((5 * (MAX_CLIENTS / 4)) + (VDEV_RX_QUEUE_SZ * DEFAULT_BACKUP_RING_SIZE_FACTOR)) + #define RX_MBUF_CACHE_SZ (VDEV_RX_QUEUE_SZ) + #define TX_NB_MBUF (128 * DEFAULT_RING_SIZE) + #define TX_MBUF_CACHE_SZ (DEFAULT_RING_SIZE) +-- +2.23.0 + diff --git a/0051-modify-README.patch b/0051-modify-README.patch new file mode 100644 index 0000000..1eff3ba --- /dev/null +++ 
b/0051-modify-README.patch @@ -0,0 +1,365 @@ +From 981a5ddaa7708b7e2c34fc52dae592b703fd64f2 Mon Sep 17 00:00:00 2001 +From: jiangheng +Date: Fri, 20 May 2022 17:14:07 +0800 +Subject: [PATCH 2/6] modify readme + +--- + README.md | 262 ++++++++++++++++++++++++---------------------------- + 2 files changed, 121 insertions(+), 404 deletions(-) + delete mode 100644 Gazelle.md + +diff --git a/README.md b/README.md +index 5a99633..61f298c 100644 +--- a/README.md ++++ b/README.md +@@ -1,90 +1,75 @@ +-gazelle ++# 用户态协议栈Gazelle用户指南 + +-# gazelle ++## 简介 + +-## Introduction +-gazelle是高性能的用户态协议栈,通过dpdk在用户态直接读写网卡报文,共享大页内存传递报文,并使用轻量级lwip协议栈。能够大幅提高应用的网络IO吞吐能力. ++Gazelle是一款高性能用户态协议栈。它基于DPDK在用户态直接读写网卡报文,共享大页内存传递报文,使用轻量级LwIP协议栈。能够大幅提高应用的网络I/O吞吐能力。专注于数据库网络性能加速,如MySQL、redis等。 ++- 高性能 ++报文零拷贝,无锁,灵活scale-out,自适应调度。 ++- 通用性 ++完全兼容POSIX,零修改,适用不同类型的应用。 + +-## Compile +-- 编译依赖软件包 +-cmake gcc-c++ lwip dpdk-devel(>=21.11-2) +-numactl-devel libpcap-devel libconfig-devel libboundscheck rpm-build +-- 编译 +-``` sh +-#创建目录 +-mkdir -p ~/rpmbuild/SPECS +-mkdir -p ~/rpmbuild/SOURCES +- +-#创建压缩包 +-mkdir gazelle-1.0.0 +-mv build gazelle-1.0.0 +-mv src gazelle-1.0.0 +-tar zcvf gazelle-1.0.0.tar.gz gazelle-1.0.0/ +- +-#编包 +-mv gazelle-1.0.0.tar.gz ~/rpmbuild/SPECS +-cp gazelle.spec ~/rpmbuild/SPECS +-cd ~/rpmbuild/SPECS +-rpmbuild -bb gazelle.spec +- +-#编出的包 +-ls ~/rpmbuild/RPMS +-``` ++单进程且网卡支持多队列时,只需使用liblstack.so有更短的报文路径。其余场景使用ltran进程分发报文到各个线程。 + +-## Install +-``` sh ++## 安装 ++配置openEuler的yum源,直接使用yum命令安装 ++```sh + #dpdk >= 21.11-2 + yum install dpdk + yum install libconfig +-yum install numacttl ++yum install numactl + yum install libboundscheck + yum install libpcap + yum install gazelle +- + ``` + +-## Use +-### 1. 安装ko模块 ++## 使用方法 ++配置运行环境,使用Gazelle加速应用程序步骤如下: ++### 1. 使用root权限安装ko ++根据实际情况选择使用ko,提供虚拟网口、绑定网卡到用户态功能。 ++若使用虚拟网口功能,则使用rte_kni.ko + ``` sh +-modprobe uio +-insmod /usr/lib/modules/5.10.0-54.0.0.27.oe1.x86_64/extra/dpdk/igb_uio.ko +-insmod /usr/lib/modules/5.10.0-54.0.0.27.oe1.x86_64/extra/dpdk/rte_kni.ko carrier="on" ++modprobe rte_kni carrier="on" ++``` ++网卡从内核驱动绑为用户态驱动的ko,根据实际情况选择一种 ++``` sh ++#若IOMMU能使用 ++modprobe vfio-pci ++ ++#若IOMMU不能使用,且VFIO支持noiommu ++modprobe vfio enable_unsafe_noiommu_mode=1 ++modprobe vfio-pci ++ ++#其它情况 ++modprobe igb_uio + ``` + ++ + ### 2. dpdk绑定网卡 +-- 对于虚拟网卡或一般物理网卡,绑定到驱动igb_uio ++将网卡绑定到步骤1选择的驱动。为用户态网卡驱动提供网卡资源访问接口。 + ``` sh ++#使用vfio-pci ++dpdk-devbind -b vfio-pci enp3s0 ++ ++#使用igb_uio + dpdk-devbind -b igb_uio enp3s0 + ``` +-- 1822网卡绑定到驱动vfio-pci(由kernel提供) ++ ++### 3. 大页内存配置 ++Gazelle使用大页内存提高效率。使用root权限配置系统预留大页内存,可选用任意页大小。因每页内存都需要一个fd,使用内存较大时,建议使用1G的大页,避免占用过多fd。 ++根据实际情况,选择一种页大小,配置足够的大页内存即可。配置大页操作如下: + ``` sh +-modprobe vfio-pci +-dpdk-devbind -b vfio-pci enp3s0 +-``` ++#配置2M大页内存:在node0上配置 2M * 1024 = 2G ++echo 1024 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages + +-### 3. 
大页内存配置 +-dpdk提供了高效的大页内存管理和共享机制,gazelle的报文数据、无锁队列等都使用了大页内存。大页内存需要root用户配置。2M或1G大页按实际需要配置,推荐使用2M大页内存,该内存是本机上ltran和所有lstack可以使用的总内存,具体方法如下: +-- 2M大页配置 +- - 配置系统大页数量 +- ``` sh +- #示例:在node0上配置2M * 2000 = 4000M +- echo 2000 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages +- echo 0 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages +- echo 0 > /sys/devices/system/node/node2/hugepages/hugepages-2048kB/nr_hugepages +- echo 0 > /sys/devices/system/node/node3/hugepages/hugepages-2048kB/nr_hugepages +- # 查看配置结果 +- grep Huge /proc/meminfo +- ``` +-- 1G大页配置 +-1G大页配置方法与2M类似 +- - 配置系统大页数量 +- ``` sh +- #示例:在node0上配置1G * 5 = 5G +- echo 5 > /sys/devices/system/node/node0/hugepages/hugepages-1048576kB/nr_hugepages +- ``` ++#配置1G大页内存:在node0上配置1G * 5 = 5G ++echo 5 > /sys/devices/system/node/node0/hugepages/hugepages-1048576kB/nr_hugepages ++ ++#查看配置结果 ++grep Huge /proc/meminfo ++``` + + ### 4. 挂载大页内存 +-创建两个目录,分别给lstack的进程、ltran进程使用。操作步骤如下: ++创建两个目录,分别给lstack的进程、ltran进程访问大页内存使用。操作步骤如下: + ``` sh + mkdir -p /mnt/hugepages + mkdir -p /mnt/hugepages-2M +@@ -94,34 +79,35 @@ mount -t hugetlbfs nodev /mnt/hugepages + mount -t hugetlbfs nodev /mnt/hugepages-2M + ``` + +-### 5. 应用程序从内核协议栈切换至用户态协议栈 +-+ 一种方式:重新编译程序 +-修改应用的makefile文件,使其链接liblstack.so。示例如下: +-``` makefile +-#在makefile中添加 +-ifdef USE_GAZELLE +- -include /etc/gazelle/lstack.Makefile +-endif +-gcc test.c -o test $(LSTACK_LIBS) ++### 5. 应用程序使用Gazelle ++有两种使能Gazelle方法,根据需要选择其一 ++- 重新编译应用程序,链接Gazelle的库 ++修改应用makefile文件链接liblstack.so,示例如下: + ``` ++#makefile中添加Gazelle的Makefile ++-include /etc/gazelle/lstack.Makefile + +-+ 另一个方式:使用LD_PRELOAD ++#编译添加LSTACK_LIBS变量 ++gcc test.c -o test ${LSTACK_LIBS} + ``` +-GAZELLE_BIND_PROCNAME=test(具体进程名) LD_PRELOAD=/usr/lib64/liblstack.so ./test ++ ++- 使用LD_PRELOAD加载Gazelle的库 ++GAZELLE_BIND_PROCNAME环境变量指定进程名,LD_PRELOAD指定Gazelle库路径 ++``` ++GAZELLE_BIND_PROCNAME=test LD_PRELOAD=/usr/lib64/liblstack.so ./test + ``` + + ### 6. 配置文件 +-- lstack.conf用于指定lstack的启动参数,Gazelle发布件会包括ltran.conf供用户参考,路径为/etc/gazelle/lstack.conf, 配置文件参数如下 ++- lstack.conf用于指定lstack的启动参数,默认路径为/etc/gazelle/lstack.conf, 配置文件参数如下 + + |选项|参数格式|说明| + |:---|:---|:---| + |dpdk_args|--socket-mem(必需)
--huge-dir(必需)<br>--proc-type(必需)<br>--legacy-mem<br>--map-perfect<br>
等|dpdk初始化参数,参考dpdk说明| + |use_ltran| 0/1 | 是否使用ltran | +-|num_cpus|"0,2,4 ..."|lstack线程绑定的cpu编号,编号的数量为lstack线程个数(小于等于网卡多队列数量),仅在use_ltran=0时生效,如果机器不支持网卡多队列,lstack线程数量应该为1| +-|num_weakup|"1,3,5 ..."|weakup线程绑定的cpu编号,编号的数量为weakup线程个数,与lstack线程的数量保持一致| +-|numa_bind|0/1|是否支持将用户线程绑定到与某lstack线程相同numa内| ++|num_cpus|"0,2,4 ..."|lstack线程绑定的cpu编号,编号的数量为lstack线程个数(小于等于网卡多队列数量)。可按NUMA选择cpu| ++|num_wakeup|"1,3,5 ..."|wakeup线程绑定的cpu编号,编号的数量为wakeup线程个数,与lstack线程的数量保持一致。与numcpus选择对应NUMA的cpu。不配置则为不使用唤醒线程| + |low_power_mode|0/1|是否开启低功耗模式,暂不支持| +-|kni_swith|0/1|rte_kni开关,默认为0| ++|kni_swith|0/1|rte_kni开关,默认为0。只有不使用ltran时才能开启| + |host_addr|"192.168.xx.xx"|协议栈的IP地址,必须和redis-server配置
文件里的“bind”字段保存一致。| + |mask_addr|"255.255.xx.xx"|掩码地址| + |gateway_addr|"192.168.xx.1"|网关地址| +@@ -137,10 +123,8 @@ kni_switch=0 + + low_power_mode=0 + +-num_cpus="2" +-num_weakup="3" +- +-numa_bind=1 ++num_cpus="2,22" ++num_wakeup="3,23" + + host_addr="192.168.1.10" + mask_addr="255.255.255.0" +@@ -148,7 +132,7 @@ gateway_addr="192.168.1.1" + devices="aa:bb:cc:dd:ee:ff" + ``` + +-- ltran.conf用于指定ltran启动的参数,Gazelle发布件会包括ltran.conf供用户参考,路径为/etc/gazelle/ltran.conf,仅在lstack.conf内配置use_ltran=1时生效,配置文件格式如下 ++- ltran.conf用于指定ltran启动的参数,默认路径为/etc/gazelle/ltran.conf。使用ltran时,lstack.conf内配置use_ltran=1,配置参数如下: + + |功能分类|选项|参数格式|说明| + |:---|:---|:---|:---| +@@ -182,24 +166,26 @@ bond_macs="aa:bb:cc:dd:ee:ff" + bond_ports="0x1" + + tcp_conn_scan_interval=10 +-``` +-### 7. 启动 +-- 不使用ltran模式(use_ltran=0)时,不需要启动ltran +-- 启动ltran,如果不指定--config-file,则使用默认路径/etc/gazelle/ltran.conf ++``` ++### 7. 启动应用程序 ++- 启动ltran进程 ++单进程且网卡支持多队列,则直接使用网卡多队列分发报文到各线程,不启动ltran进程,lstack.conf的use_ltran配置为0. ++启动ltran时不使用-config-file指定配置文件,则使用默认路径/etc/gazelle/ltran.conf + ``` sh + ltran --config-file ./ltran.conf + ``` +-- 启动redis,如果不指定环境变量LSTACK_CONF_PATH,则使用默认路径/etc/gazelle/lstack.conf ++- 启动应用程序 ++启动应用程序前不使用环境变量LSTACK_CONF_PATH指定配置文件,则使用默认路径/etc/gazelle/lstack.conf + ``` sh + export LSTACK_CONF_PATH=./lstack.conf +-redis-server redis.conf ++LD_PRELOAD=/usr/lib64/liblstack.so GAZELLE_BIND_PROCNAME=redis-server redis-server redis.conf + ``` + + ### 8. API +-liblstack.so编译进应用程序后wrap网络编程标准接口,应用程序无需修改代码。 ++Gazelle wrap应用程序POSIX接口,应用程序无需修改代码。 + +-### 9. gazellectl +-- 不使用ltran模式时不支持gazellectl ltran xxx命令,以及-r, rate命令 ++### 9. 调测命令 ++- 不使用ltran模式时不支持gazellectl ltran xxx命令,以及lstack -r命令 + ``` + Usage: gazellectl [-h | help] + or: gazellectl ltran {quit | show} [LTRAN_OPTIONS] [time] +@@ -228,56 +214,50 @@ Usage: gazellectl [-h | help] + #### 1. dpdk配置文件的位置 + 如果是root用户,dpdk启动后的配置文件将会放到/var/run/dpdk目录下; + 如果是非root用户,dpdk配置文件的路径将由环境变量XDG_RUNTIME_DIR决定; +-+ 如果XDG_RUNTIME_DIR为空,dpdk配置文件放到/tmp/dpdk目录下; +-+ 如果XDG_RUNTIME_DIR不为空,dpdk配置文件放到变量XDG_RUNTIME_DIR下; +-+ 注意有些机器会默认设置XDG_RUNTIME_DIR ++- 如果XDG_RUNTIME_DIR为空,dpdk配置文件放到/tmp/dpdk目录下; ++- 如果XDG_RUNTIME_DIR不为空,dpdk配置文件放到变量XDG_RUNTIME_DIR下; ++- 注意有些机器会默认设置XDG_RUNTIME_DIR ++ ++## 约束限制 + +-## Constraints +-- 提供的命令行、配置文件以及配置大页内存需要root权限执行或修改。非root用户使用,需先提权以及修改文件权限。 +-- 若要把用户态网卡绑回内核驱动,必须先将Gazelle退出。 ++使用 Gazelle 存在一些约束限制: ++#### 功能约束 + - 不支持accept阻塞模式或者connect阻塞模式。 +-- 最多只支持1500个连接。 +-- 协议栈当前只支持tcp、icmp、arp、ipv4。 +-- 大页内存不支持在挂载点里创建子目录重新挂载。 +-- 在对端ping时,要求指定报文长度小于等于14000。 ++- 最多支持1500个TCP连接。 ++- 当前仅支持TCP、ICMP、ARP、IPv4 协议。 ++- 在对端ping Gazelle时,要求指定报文长度小于等于14000B。 + - 不支持使用透明大页。 +-- 需要保证ltran的可用大页内存 >=1G +-- 需要保证应用实例协议栈线程的可用大页内存 >=800M +-- 不支持32位系统使用。 + - ltran不支持使用多种类型的网卡混合组bond。 + - ltran的bond1主备模式,只支持链路层故障主备切换(例如网线断开),不支持物理层故障主备切换(例如网卡下电、拔网卡)。 +-- 构建X86版本使用-march=native选项,基于构建环境的CPU(Intel® Xeon® Gold 5118 CPU @ 2.30GHz)指令集进行优化。要求运行环境CPU支持SSE4.2、AVX、AVX2、AVX-512指令集。 +-- 最大IP分片数为10(ping最大包长14790),TCP协议不使用IP分片。 +-- sysctl配置网卡rp_filter参数为1,否则可能使用内核协议栈 +-- 虚拟机网卡不支持多队列。 +-- 不使用ltran模式,kni网口只支持本地通讯使用,且需要启动前配置NetworkManager不管理kni网卡 +-- 虚拟kni网口的ip及mac地址,需要与lstack配置文件保持一致 +-- gazelle运行过程中,不允许删除运行文件,如果删除,需要重启gazelle +-- lstack配置的ip需要与应用程序的ip保持一致 +- +-## Security risk note +-gazelle有如下安全风险,用户需要评估使用场景风险 +-1. 
共享内存 ++- 虚拟机网卡不支持多队列。 ++#### 操作约束 ++- 提供的命令行、配置文件默认root权限。非root用户使用,需先提权以及修改文件所有者。 ++- 将用户态网卡绑回到内核驱动,必须先退出Gazelle。 ++- 大页内存不支持在挂载点里创建子目录重新挂载。 ++- ltran需要最低大页内存为1GB。 ++- 每个应用实例协议栈线程最低大页内存为800MB 。 ++- 仅支持64位系统。 ++- 构建x86版本的Gazelle使用了-march=native选项,基于构建环境的CPU(Intel® Xeon® Gold 5118 CPU @ 2.30GHz指令集进行优化。要求运行环境CPU支持 SSE4.2、AVX、AVX2、AVX-512 指令集。 ++- 最大IP分片数为10(ping 最大包长14790B),TCP协议不使用IP分片。 ++- sysctl配置网卡rp_filter参数为1,否则可能不按预期使用Gazelle协议栈,而是依然使用内核协议栈。 ++- 不使用ltran模式,KNI网口不可配置只支持本地通讯使用,且需要启动前配置NetworkManager不管理KNI网卡。 ++- 虚拟KNI网口的IP及mac地址,需要与lstack.conf配置文件保持一致 。 ++ ++## 风险提示 ++Gazelle可能存在如下安全风险,用户需要根据使用场景评估风险。 ++ ++**共享内存** + - 现状 +-大页内存mount至/mnt/hugepages-2M目录,链接liblstack.so的进程初始化时在/mnt/hugepages-2M目录下创建文件,每个文件对应2M大页内存,并mmap这些文件。ltran在收到lstask的注册信息后,根据大页内存配置信息也mmap目录下文件,实现大页内存共享。 +-ltran在/mnt/hugepages目录的大页内存同理。 +-- 当前消减措施 +-大页文件权限600,只有OWNER用户才能访问文件,默认root用户,支持配置成其它用户; +-大页文件有dpdk文件锁,不能直接写或者mmap。 +-- 风险点 +-属于同一用户的恶意进程模仿DPDK实现逻辑,通过大页文件共享大页内存,写破坏大页内存,导致gazelle程序crash。建议用户下的进程属于同一信任域。 +-2. 流量限制 +-- 风险点 +-gazelle没有做流量限制,用户有能力发送最大网卡线速流量的报文到网络。 +-3. 进程仿冒 +-- 风险点 +-合法注册到ltran的两个lstack进程,进程A可仿冒进程B发送仿冒消息给ltran,修改ltran的转发控制信息,造成进程B通讯异常,进程B报文转发给进程A等问题。建议lstack进程都为可信任进程。 +- +-## How to Contribute +-We are happy to provide guidance for the new contributors. +-Please sign the CLA before contributing. +- +-## Licensing +-gazelle is licensed under the Mulan PSL v2. +- +- ++ 大页内存 mount 至 /mnt/hugepages-2M 目录,链接 liblstack.so 的进程初始化时在 /mnt/hugepages-2M 目录下创建文件,每个文件对应 2M 大页内存,并 mmap 这些文件。ltran 在收到 lstask 的注册信息后,根据大页内存配置信息也 mmap 目录下文件,实现大页内存共享。 ++ ltran 在 /mnt/hugepages 目录的大页内存同理。 ++- 当前消减措施 ++ 大页文件权限 600,只有 OWNER 用户才能访问文件,默认 root 用户,支持配置成其它用户; ++ 大页文件有 DPDK 文件锁,不能直接写或者映射。 ++- 风险点 ++ 属于同一用户的恶意进程模仿DPDK实现逻辑,通过大页文件共享大页内存,写破坏大页内存,导致Gazelle程序crash。建议用户下的进程属于同一信任域。 ++ ++**流量限制** ++Gazelle没有做流量限制,用户有能力发送最大网卡线速流量的报文到网络,可能导致网络流量拥塞。 ++ ++**进程仿冒** ++合法注册到ltran的两个lstack进程,进程A可仿冒进程B发送仿冒消息给ltran,修改ltran的转发控制信息,造成进程B通讯异常,进程B报文转发给进程A信息泄露等问题。建议lstack进程都为可信任进程。 +-- +2.23.0 + diff --git a/0052-bugfix-https-gitee.com-src-openeuler-gazelle-issues-.patch b/0052-bugfix-https-gitee.com-src-openeuler-gazelle-issues-.patch new file mode 100644 index 0000000..e5ea239 --- /dev/null +++ b/0052-bugfix-https-gitee.com-src-openeuler-gazelle-issues-.patch @@ -0,0 +1,40 @@ +From 94a5043e03d3d83b661af64c3723d5f775c10706 Mon Sep 17 00:00:00 2001 +From: zhangqiang +Date: Fri, 1 Jul 2022 01:26:09 +0800 +Subject: [PATCH 3/6] =?UTF-8?q?=20=E6=8C=89=E7=85=A7POSIX=E6=A0=87?= + =?UTF-8?q?=E5=87=86,sigaction=E5=87=BD=E6=95=B0act=3D=3DNULL,=E8=B0=83?= + =?UTF-8?q?=E7=94=A8=E8=80=85=E5=8F=AF=E4=BB=A5=E9=80=9A=E8=BF=87oldact?= + =?UTF-8?q?=E8=8E=B7=E5=8F=96=E4=B9=8B=E5=89=8D=E7=9A=84=E4=BF=A1=E5=8F=B7?= + =?UTF-8?q?action=20=E6=8C=89=E7=85=A7lstack=E7=9A=84=E7=90=86=E8=A7=A3,?= + =?UTF-8?q?=E5=AE=83HOOK=20sigaction=E5=87=BD=E6=95=B0,=E6=98=AF=E5=BD=93?= + =?UTF-8?q?=E8=B0=83=E7=94=A8=E8=80=85=E8=AE=BE=E7=BD=AE=E6=96=B0=E7=9A=84?= + =?UTF-8?q?=E4=BF=A1=E5=8F=B7=E5=87=BD=E6=95=B0=E4=B8=BA=E7=BC=BA=E7=9C=81?= + =?UTF-8?q?=E5=A4=84=E7=BD=AE=E8=A1=8C=E4=B8=BA=E6=97=B6,=E5=AF=B9?= + =?UTF-8?q?=E6=8C=87=E5=AE=9A=E4=BF=A1=E5=8F=B7=E8=BF=9B=E8=A1=8C=E6=8B=A6?= + =?UTF-8?q?=E6=88=AA,=E8=AF=A5=E6=94=B9=E5=8A=A8=E4=B8=8D=E5=BD=B1?= + =?UTF-8?q?=E5=93=8D=E5=8E=9F=E5=8A=9F=E8=83=BD?= +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +--- + src/lstack/api/lstack_signal.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/lstack/api/lstack_signal.c b/src/lstack/api/lstack_signal.c +index 87cbdda..4dba472 
100644 +--- a/src/lstack/api/lstack_signal.c ++++ b/src/lstack/api/lstack_signal.c +@@ -78,7 +78,7 @@ int lstack_sigaction(int sig_num, const struct sigaction *action, struct sigacti + { + struct sigaction new_action; + +- if ((match_hijack_signal(sig_num) != 0) && (action->sa_handler == SIG_DFL)) { ++ if ((match_hijack_signal(sig_num) != 0) && (action && action->sa_handler == SIG_DFL)) { + new_action = *action; + new_action.sa_flags |= SA_RESETHAND; + new_action.sa_handler = lstack_sig_default_handler; +-- +2.23.0 + diff --git a/0053-update-README.md.patch b/0053-update-README.md.patch new file mode 100644 index 0000000..1067020 --- /dev/null +++ b/0053-update-README.md.patch @@ -0,0 +1,332 @@ +From 5249ab9e033d2f02ca78eca1b96db540f9990641 Mon Sep 17 00:00:00 2001 +From: xiusailong +Date: Wed, 29 Jun 2022 11:28:10 +0800 +Subject: [PATCH 4/6] update README.md + +--- + README.md | 10 +- + ...50\346\210\267\346\214\207\345\215\227.md" | 159 +++++++----------- + 2 files changed, 62 insertions(+), 107 deletions(-) + +diff --git a/README.md b/README.md +index 61f298c..3c1487c 100644 +--- a/README.md ++++ b/README.md +@@ -75,12 +75,12 @@ mkdir -p /mnt/hugepages + mkdir -p /mnt/hugepages-2M + chmod -R 700 /mnt/hugepages + chmod -R 700 /mnt/hugepages-2M +-mount -t hugetlbfs nodev /mnt/hugepages +-mount -t hugetlbfs nodev /mnt/hugepages-2M ++mount -t hugetlbfs nodev /mnt/hugepages -o pagesize=2M ++mount -t hugetlbfs nodev /mnt/hugepages-2M -o pagesize=2M + ``` + + ### 5. 应用程序使用Gazelle +-有两种使能Gazelle方法,根据需要选择其一 ++有两种使用Gazelle方法,根据需要选择其一 + - 重新编译应用程序,链接Gazelle的库 + 修改应用makefile文件链接liblstack.so,示例如下: + ``` +@@ -256,8 +256,8 @@ Gazelle可能存在如下安全风险,用户需要根据使用场景评估风 + - 风险点 + 属于同一用户的恶意进程模仿DPDK实现逻辑,通过大页文件共享大页内存,写破坏大页内存,导致Gazelle程序crash。建议用户下的进程属于同一信任域。 + +-**流量限制** ++**流量限制** + Gazelle没有做流量限制,用户有能力发送最大网卡线速流量的报文到网络,可能导致网络流量拥塞。 + +-**进程仿冒** ++**进程仿冒** + 合法注册到ltran的两个lstack进程,进程A可仿冒进程B发送仿冒消息给ltran,修改ltran的转发控制信息,造成进程B通讯异常,进程B报文转发给进程A信息泄露等问题。建议lstack进程都为可信任进程。 +diff --git "a/doc/gazelle-\347\224\250\346\210\267\346\214\207\345\215\227.md" "b/doc/gazelle-\347\224\250\346\210\267\346\214\207\345\215\227.md" +index bfe0f7c..3f40bf1 100644 +--- "a/doc/gazelle-\347\224\250\346\210\267\346\214\207\345\215\227.md" ++++ "b/doc/gazelle-\347\224\250\346\210\267\346\214\207\345\215\227.md" +@@ -1,54 +1,52 @@ +-[toc] ++# gazelle-用户指南 + +-# 1 简介 ++## 1 简介 + + EulerOS提供了利用Gazelle runtime优化数据库服务性能的完整解决方案,Gazelle + runtime基于bypass内核的架构设计,能够提供更高性能运行环境,优化服务软件性 + 能,可以很好地满足产品的性能需求。 + 本文主要介绍Gazelle软件如何安装使用。 + +-# 2 安装软件包,编译应用程序(以redis 为例) ++## 2 安装软件包,编译应用程序(以redis 为例) + +-1. 在使用之前,需要安装Gazelle软件包。 ++在使用之前,需要安装Gazelle软件包。 + +-#### 操作步骤 ++### 操作步骤 + +- Gazelle是高性能用户态协议栈软件包,使用如下命令安装Gazelle软件包: ++Gazelle是高性能用户态协议栈软件包,使用如下命令安装Gazelle软件包: + + ``` +- yum install gazelle ++yum install gazelle + ``` + +-#### 安装正确性验证 ++### 安装正确性验证 + +- 1. 安装完成后,使用“rpm -qa | grep -E gazelle”命令确认是否已经正确安装。示例如下: ++1. 安装完成后,使用“rpm -qa | grep -E gazelle”命令确认是否已经正确安装。示例如下: + +- 如果已正确安装则结果显示如下: ++如果已正确安装则结果显示如下: + ``` +- gazelle-1.0.0-h1.eulerosv2r10.aarch64 ++gazelle-1.0.0-h1.eulerosv2r10.aarch64 + ``` + + 2. 确认以下dpdk组件是否存在: + ``` +- +- /lib/modules/4.19.90-vhulk2007.2.0.h188.eulerosv2r10.aarch64/extra/dpdk/igb_uio.ko +- /usr/share/dpdk +- /usr/share/dpdk/usertools/dpdk-devbind.py ++rpm -ql dpdk | grep igb_uio ++/usr/share/dpdk ++/usr/share/dpdk/usertools/dpdk-devbind.py + ``` + + 3. 
确认以下Gazelle组件是否存在: +- +-``` + +- /usr/bin/gazellectl +- /usr/bin/ltran +- /usr/lib64/liblstack.so +- /etc/gazelle/ltran.conf +- /etc/gazelle/lstack.conf +- /etc/gazelle/lstack.Makefile ++``` ++/usr/bin/gazellectl ++/usr/bin/ltran ++/usr/lib64/liblstack.so ++/etc/gazelle/ltran.conf ++/etc/gazelle/lstack.conf ++/etc/gazelle/lstack.Makefile + ``` + +-#### 链接Gazelle编译redis-server ++### 链接Gazelle编译redis-server + + 1. 获取开源redis代码,更改redis/src/Makefile文件,使其链接lstack,示例如下: + +@@ -78,10 +76,10 @@ index 4b2a31c..92fa17d 100644 + + 2. 编译redis: + +- ``` +- localhost:/euler/zyk/serverless/lredis/src # make distclean +- localhost:/euler/zyk/serverless/lredis/src # make USE_GAZELLE=1 -j32 +- ``` ++ ``` ++ localhost:/euler/zyk/serverless/lredis/src # make distclean ++ localhost:/euler/zyk/serverless/lredis/src # make USE_GAZELLE=1 -j32 ++ ``` + + #### 注意 + +@@ -89,7 +87,7 @@ index 4b2a31c..92fa17d 100644 + + 支持LD_PRELOAD方式免编译使用gazelle,可跳过编译应用程序步骤。 + +-#3 设置配置文件(以redis为例) ++## 3 设置配置文件(以redis为例) + + 安装完软件包后,运行Gazelle服务需要设置必要的配置文件。 + +@@ -168,13 +166,13 @@ tcp_conn_scan_interval=10 + + redis.conf为redis服务的配置文件,可以参考开源的配置文件,需要注意的是,redis.conf侦听的ip必须和其使用的lstack.conf里面的host_addr值保持一致。 + +-# 4 环境初始化 ++## 4 环境初始化 + + 配置文件完成后,需要配置大页、插入igb_uio.ko、绑定dpdk网卡等环境初始化工作才可以运行Gazelle服务。 + + **说明:igb_uio.ko依赖于uio.ko,需要用户先确保已安装uio.ko模块。** + +-#### 操作步骤 ++### 操作步骤 + + 1. **配置大页内存** + +@@ -203,68 +201,24 @@ tcp_conn_scan_interval=10 + 将需要通信的网卡绑定到dpdk,示例如下: + + ``` +- [root@ARM159server usertools]# ./dpdk-devbind.py --bind=igb_uio eth4 ++[root@ARM159server usertools]# dpdk-devbind.py --bind=igb_uio eth4 + ``` + + 如果是1822网卡,必须绑定vfio-pci驱动: +-``` +- [root@ARM159server usertools]# ./dpdk-devbind.py --bind=vfio-pci eth4 +-``` +- +-4. **一键部署脚本使用** +- +- 提供gazelle_setup脚本,用于快速自动化部署gazelle运行环境。 +- +- 一键部署脚本执行示例: +- +-``` +- gazelle_setup.sh –i/--nic eth0 –n/--numa 1024,1024 –d/--daemon 1/0 –k/--kni 1/0 –l/--lowpower 1/0 --ltrancore 0,1 --lstackcore 2-3 +-``` +- +- 参数描述: +- +- -i/--nic:设置待绑定网卡,此参数必须配置,且网卡需要有ip、路由和网关等必须参数,否则会读取配置失败,必选。 +- +- -n/--numa:lstack大页内存(不包括ltran的,ltran默认为1024M,setup脚本不对其做修改),根据numa节点配置,并用","(英文的逗号)分离,这里需要根据系统环境内存配置对应的大小,默认为1024, 可选。 +- +- -d/--daemon:是否开启deamon模式,开启为1,关闭为0;默认为1,可选。 +- +- -k/--kni:是否开启kni,开启为1,关闭为0;默认为0,可选。 +- +- -l/--lowpower:是否开启低功耗模式,开启为1,关闭为0;默认为0,可选。 +- +- --ltrancore:ltran的绑核参数,参考dpdk的参数配置,此处不做参数校验;默认为0,1,可选。 +- +- --lstackcore:lstack的绑核参数,同--ltrancore,默认为2,可选。 +- +- **说明** +- +- 1. gazelle_setup.sh支持非root用户启动;(即执行脚本对应的用户及用户组)。 +- 2. 默认配置文件的目录为:/etc/gazelle。 +- 3. gazelle_setup.sh会按照启动参数生成lstack.conf;对于lstack多进程场景,若多进程要使用不同配置文件,则需要自己拷贝修改每个进程的lstack.conf。 +- 4. 部署脚本会启动ltran进程。 +- +- 一键退出脚本执行实例: + + ``` +- gazelle_exit.sh ++[root@ARM159server usertools]# dpdk-devbind.py --bind=vfio-pci eth4 + ``` + +- **说明** +- +- 若启动了ltran的守护任务(gazelle_setup.sh指定了 -d/--daemon 1),那么在杀死ltran之后,守护任务仍会将ltran拉起,所以此时若要完全退出ltran,需要执行gazelle_exit.sh。 +- +- ++## 5 运行Gazelle + +-# 5 运行Gazelle +- +-#### 前提条件 ++### 前提条件 + + - 已完成软件包的安装。 + - 已设置完配置文件。 + - 已初始化环境。 + +-#### 操作步骤 ++### 操作步骤 + + 1. 
**启动ltran** + +@@ -275,21 +229,22 @@ tcp_conn_scan_interval=10 + **说明:一键部署脚本已启动ltran,若使用一键部署脚本,无需此步骤。** + + ``` +- [root@localhost deploy_open_source]# ltran --config-file /usr/share/gazelle/ltran.confEAL: Detected 96 lcore(s) +- EAL: Detected 4 NUMA nodes +- EAL: Multi-process socket /var/run/dpdk/(null)/mp_socket +- EAL: Selected IOVA mode 'PA' +- EAL: No free hugepages reported in hugepages-2048kB +- EAL: No free hugepages reported in hugepages-2048kB +- EAL: No free hugepages reported in hugepages-2048kB +- EAL: No available hugepages reported in hugepages-1048576kB +- EAL: Probing VFIO support... +- EAL: VFIO support initialized +- EAL: PCI device 0000:03:00.0 on NUMA socket 0...... +- EAL: Finished Process ltran_core_init. +- EAL: Finished Process ctrl_thread_fn. +- EAL: Finished Process client_rx_buf_init. +- EAL: Runing Process forward. ++[root@localhost deploy_open_source]# ltran --config-file /usr/share/gazelle/ltran.conf ++EAL: Detected 96 lcore(s) ++EAL: Detected 4 NUMA nodes ++EAL: Multi-process socket /var/run/dpdk/(null)/mp_socket ++EAL: Selected IOVA mode 'PA' ++EAL: No free hugepages reported in hugepages-2048kB ++EAL: No free hugepages reported in hugepages-2048kB ++EAL: No free hugepages reported in hugepages-2048kB ++EAL: No available hugepages reported in hugepages-1048576kB ++EAL: Probing VFIO support... ++EAL: VFIO support initialized ++EAL: PCI device 0000:03:00.0 on NUMA socket 0...... ++EAL: Finished Process ltran_core_init. ++EAL: Finished Process ctrl_thread_fn. ++EAL: Finished Process client_rx_buf_init. ++EAL: Runing Process forward. + ``` + + 2. **启动redis** +@@ -367,7 +322,7 @@ GAZELLE_BIND_PROCNAME=benchmark_ker GAZELLE_BIND_THREADNAME=disp LD_PRELOAD=/lib + 1. 不支持使用export方式单独声明LD_PRELOAD环境变量。 + 2. GAZELLE_BIND_PROCNAME指定lstack绑定的进程名称;GAZELLE_BIND_THREADNAME指定lstack绑定的进程中的具体线程名,且支持字串匹配,如设置disp,表示进程中所有线程名包含disp 字串的线程都会绑定lstack。 + +-# 6 最佳性能配置 ++## 6 最佳性能配置 + + 为获得最佳的性能,Gazelle在启动时对cpu和内存配置有一定要求。最佳性能配置如下: + +@@ -377,7 +332,7 @@ GAZELLE_BIND_PROCNAME=benchmark_ker GAZELLE_BIND_THREADNAME=disp LD_PRELOAD=/lib + 4. 应用进程绑核可提高性能,与网卡同一个numa节点对应的核如果有空闲,应当优先绑定。 + 5. Gazelle的网络高性能只有在远程访问的时候才有保证,本机的tcp连接功能也支持,但如果本机有频繁的数据访问,会导致实例整体性能下降,不建议这么部署。 + +-# 7 使用约束 ++## 7 使用约束 + + 1. Gazelle提供的命令行及配置文件仅root权限开源执行或修改。配置大页内存需要root用户执行操作。 + 2. 
在将网卡绑定到igb_uio后,禁止将网卡绑回ixgbe。 +@@ -408,7 +363,7 @@ GAZELLE_BIND_PROCNAME=benchmark_ker GAZELLE_BIND_THREADNAME=disp LD_PRELOAD=/lib + + + +-# 8 升级说明 ++## 8 升级说明 + + 后续升级需要变更的组件包括ltran、liblstack.so、gazellectl、lstack.Makefile、lstack.conf、ltran.conf、gazelle_setup.sh、gazelle_exit.sh、gazelle_crontab.sh、gazelle_common.sh。 + +@@ -436,9 +391,9 @@ GAZELLE_BIND_PROCNAME=benchmark_ker GAZELLE_BIND_THREADNAME=disp LD_PRELOAD=/lib + + + +-#9 调测工具 ++## 9 调测工具 + +-##9.1 获取ltran统计数据说明 ++### 9.1 获取ltran统计数据说明 + + #### 概述 + +@@ -501,7 +456,7 @@ GAZELLE_BIND_PROCNAME=benchmark_ker GAZELLE_BIND_THREADNAME=disp LD_PRELOAD=/lib + + + +-## 9.2 获取Lstack统计数据说明 ++### 9.2 获取Lstack统计数据说明 + + #### 概述 + +@@ -553,4 +508,4 @@ lstack作为网络协议栈底座,使用时必须指定需要获取的lstack + + ``` + gazellectl lstack set {client_ip} lowpower {0 | 1}1:enable lowpower mode +-``` +\ No newline at end of file ++``` +-- +2.23.0 + diff --git a/0054-ltran-fix-use-after-free-issue.patch b/0054-ltran-fix-use-after-free-issue.patch new file mode 100644 index 0000000..c9c93cd --- /dev/null +++ b/0054-ltran-fix-use-after-free-issue.patch @@ -0,0 +1,28 @@ +From 0f37499d7922d59fa46b7961bde36ab0b20eac62 Mon Sep 17 00:00:00 2001 +From: Honggang LI +Date: Thu, 23 Jun 2022 13:56:31 +0800 +Subject: [PATCH 5/6] ltran: fix use-after-free issue + +Signed-off-by: Honggang LI +--- + src/ltran/ltran_timer.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/ltran/ltran_timer.c b/src/ltran/ltran_timer.c +index 29b307c..1327203 100644 +--- a/src/ltran/ltran_timer.c ++++ b/src/ltran/ltran_timer.c +@@ -63,9 +63,9 @@ void gazelle_detect_sock_logout(struct gazelle_tcp_sock_htable *tcp_sock_htable) + hlist_del_init(&tcp_sock->tcp_sock_node); + tcp_sock_htable->cur_tcp_sock_num--; + tcp_sock_htable->array[i].chain_size--; +- free(tcp_sock); + LTRAN_DEBUG("delete the tcp sock htable: tid %u ip %u port %u\n", + tcp_sock->tid, tcp_sock->ip, (uint32_t)ntohs(tcp_sock->port)); ++ free(tcp_sock); + } + } + } +-- +2.23.0 + diff --git a/0055-refactor-pkt-read-send-performance.patch b/0055-refactor-pkt-read-send-performance.patch new file mode 100644 index 0000000..23c8ecc --- /dev/null +++ b/0055-refactor-pkt-read-send-performance.patch @@ -0,0 +1,3232 @@ +From f0e65d55ace8b4e5c1bd2023f1b62da181f421c5 Mon Sep 17 00:00:00 2001 +From: jiangheng14 +Date: Thu, 7 Jul 2022 21:31:20 +0800 +Subject: [PATCH 6/6] refactor-pkt-read-send-performance + +--- + src/common/dpdk_common.h | 145 +++++++ + src/common/gazelle_dfx_msg.h | 50 ++- + src/lstack/api/lstack_epoll.c | 321 ++++++-------- + src/lstack/api/lstack_wrap.c | 15 +- + src/lstack/core/lstack_cfg.c | 18 + + src/lstack/core/lstack_control_plane.c | 4 +- + src/lstack/core/lstack_dpdk.c | 32 +- + src/lstack/core/lstack_init.c | 5 +- + src/lstack/core/lstack_lwip.c | 477 ++++++++++----------- + src/lstack/core/lstack_protocol_stack.c | 276 ++++++------ + src/lstack/core/lstack_stack_stat.c | 80 +++- + src/lstack/core/lstack_thread_rpc.c | 85 ++-- + src/lstack/include/lstack_cfg.h | 1 + + src/lstack/include/lstack_dpdk.h | 5 +- + src/lstack/include/lstack_lockless_queue.h | 10 +- + src/lstack/include/lstack_lwip.h | 11 +- + src/lstack/include/lstack_protocol_stack.h | 31 +- + src/lstack/include/lstack_stack_stat.h | 3 + + src/lstack/include/lstack_thread_rpc.h | 5 +- + src/lstack/include/posix/lstack_epoll.h | 22 +- + src/lstack/lstack.conf | 1 + + src/lstack/netif/lstack_ethdev.c | 9 +- + src/lstack/netif/lstack_vdev.c | 14 +- + src/ltran/ltran_dfx.c | 52 ++- + src/ltran/ltran_forward.c | 18 +- + src/ltran/ltran_stat.c | 19 
+- + 26 files changed, 911 insertions(+), 798 deletions(-) + +diff --git a/src/common/dpdk_common.h b/src/common/dpdk_common.h +index 595e85f..4a7bd37 100644 +--- a/src/common/dpdk_common.h ++++ b/src/common/dpdk_common.h +@@ -14,6 +14,7 @@ + #define __GAZELLE_DPDK_COMMON_H__ + + #include ++#include + + #define GAZELLE_KNI_NAME "kni" // will be removed during dpdk update + +@@ -35,6 +36,7 @@ static __rte_always_inline void copy_mbuf(struct rte_mbuf *dst, struct rte_mbuf + return; + + dst->ol_flags = src->ol_flags; ++ dst->tx_offload = src->tx_offload; + // there is buf_len in rx_descriptor_fields1, copy it is dangerous acturely. 16 : mbuf desc size + rte_memcpy((uint8_t *)dst->rx_descriptor_fields1, (const uint8_t *)src->rx_descriptor_fields1, 16); + +@@ -65,4 +67,147 @@ int32_t dpdk_kni_init(uint16_t port, struct rte_mempool *pool); + int32_t kni_process_tx(struct rte_mbuf **pkts_burst, uint32_t count); + void kni_process_rx(uint16_t port); + ++/* ++ gazelle custom rte ring interface ++ lightweight ring reduce atomic and smp_mb. ++ only surpport single-consumers or the single-consumer. ++ */ ++static __rte_always_inline uint32_t gazelle_light_ring_enqueue_busrt(struct rte_ring *r, void **obj_table, uint32_t n) ++{ ++ uint32_t cons = __atomic_load_n(&r->cons.tail, __ATOMIC_ACQUIRE); ++ uint32_t prod = r->prod.tail; ++ uint32_t free_entries = r->capacity + cons - prod; ++ ++ if (n > free_entries) { ++ return 0; ++ } ++ ++ __rte_ring_enqueue_elems(r, prod, obj_table, sizeof(void *), n); ++ ++ __atomic_store_n(&r->prod.tail, prod + n, __ATOMIC_RELEASE); ++ ++ return n; ++} ++ ++static __rte_always_inline uint32_t gazelle_light_ring_dequeue_burst(struct rte_ring *r, void **obj_table, uint32_t n) ++{ ++ uint32_t prod = __atomic_load_n(&r->prod.tail, __ATOMIC_ACQUIRE); ++ uint32_t cons = r->cons.tail; ++ uint32_t entries = prod - cons; ++ ++ if (n > entries) { ++ n = entries; ++ } ++ ++ if (n == 0) { ++ return 0; ++ } ++ ++ __rte_ring_dequeue_elems(r, cons, obj_table, sizeof(void *), n); ++ ++ __atomic_store_n(&r->cons.tail, cons + n, __ATOMIC_RELEASE); ++ ++ return n; ++} ++ ++/* ++ gazelle custom rte ring interface ++ one thread enqueue and dequeue, other thread read object use and object still in queue. ++ so malloc and free in same thread. only surpport single-consumers or the single-consumer. 
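++
++ usage sketch (illustrative): thread A owns alloc/free, thread B only reads:
++   A: gazelle_ring_sp_enqueue(r, objs, n);   publish objects to B
++   B: n = gazelle_ring_read(r, objs, n);     use objects still in the ring
++   B: gazelle_ring_read_over(r);             mark them consumed
++   A: gazelle_ring_sc_dequeue(r, objs, n);   reclaim consumed objects and free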
++ ++ cons.tail prod.tail prod.head cons.head ++ gazelle_ring_sp_enqueue: cons.head-->> cons.tal, enqueue object ++ gazelle_ring_sc_dequeue: cons.tal -->> prod.tail, dequeue object ++ gazelle_ring_read: prod.tail-->> cons.head, read object, prod.head = prod.tail + N ++ gazelle_ring_read_over: prod.tail = prod.head, update prod.tail ++ */ ++static __rte_always_inline uint32_t gazelle_ring_sp_enqueue(struct rte_ring *r, void **obj_table, uint32_t n) ++{ ++ uint32_t head = __atomic_load_n(&r->cons.head, __ATOMIC_ACQUIRE); ++ uint32_t tail = __atomic_load_n(&r->cons.tail, __ATOMIC_ACQUIRE); ++ ++ uint32_t entries = r->capacity + tail - head; ++ if (n > entries) { ++ return 0; ++ } ++ ++ ++ __rte_ring_enqueue_elems(r, head, obj_table, sizeof(void *), n); ++ ++ __atomic_store_n(&r->cons.head, head + n, __ATOMIC_RELEASE); ++ ++ return n; ++} ++ ++static __rte_always_inline uint32_t gazelle_ring_sc_dequeue(struct rte_ring *r, void **obj_table, uint32_t n) ++{ ++ uint32_t cons = __atomic_load_n(&r->cons.tail, __ATOMIC_ACQUIRE); ++ uint32_t prod = __atomic_load_n(&r->prod.tail, __ATOMIC_ACQUIRE); ++ ++ uint32_t entries = prod - cons; ++ if (n > entries) { ++ n = entries; ++ } ++ if (unlikely(n == 0)) { ++ return 0; ++ } ++ ++ ++ __rte_ring_dequeue_elems(r, cons, obj_table, sizeof(void *), n); ++ ++ __atomic_store_n(&r->cons.tail, cons + n, __ATOMIC_RELEASE); ++ ++ return n; ++} ++ ++static __rte_always_inline uint32_t gazelle_ring_read(struct rte_ring *r, void **obj_table, uint32_t n) ++{ ++ uint32_t cons = __atomic_load_n(&r->cons.head, __ATOMIC_ACQUIRE); ++ uint32_t prod = r->prod.head; ++ ++ const uint32_t entries = cons - prod; ++ if (n > entries) { ++ n = entries; ++ } ++ if (unlikely(n == 0)) { ++ return 0; ++ } ++ ++ __rte_ring_dequeue_elems(r, prod, obj_table, sizeof(void *), n); ++ ++ r->prod.head = prod + n; ++ ++ return n; ++} ++ ++static __rte_always_inline void gazelle_ring_read_n(struct rte_ring *r, uint32_t n) ++{ ++ __atomic_store_n(&r->prod.tail, r->prod.tail + n, __ATOMIC_RELEASE); ++} ++ ++static __rte_always_inline void gazelle_ring_read_over(struct rte_ring *r) ++{ ++ __atomic_store_n(&r->prod.tail, r->prod.head, __ATOMIC_RELEASE); ++} ++ ++static __rte_always_inline uint32_t gazelle_ring_readover_count(struct rte_ring *r) ++{ ++ rte_smp_rmb(); ++ return r->prod.tail - r->cons.tail; ++} ++static __rte_always_inline uint32_t gazelle_ring_readable_count(const struct rte_ring *r) ++{ ++ rte_smp_rmb(); ++ return r->cons.head - r->prod.tail; ++} ++ ++static __rte_always_inline uint32_t gazelle_ring_count(const struct rte_ring *r) ++{ ++ rte_smp_rmb(); ++ return r->cons.head - r->cons.tail; ++} ++static __rte_always_inline uint32_t gazelle_ring_free_count(const struct rte_ring *r) ++{ ++ return r->capacity - gazelle_ring_count(r); ++} + #endif +diff --git a/src/common/gazelle_dfx_msg.h b/src/common/gazelle_dfx_msg.h +index 6db67ee..cf435cd 100644 +--- a/src/common/gazelle_dfx_msg.h ++++ b/src/common/gazelle_dfx_msg.h +@@ -57,34 +57,37 @@ enum GAZELLE_LATENCY_TYPE { + GAZELLE_LATENCY_READ, + }; + +-struct gazelle_stat_pkts { +- uint64_t tx; +- uint64_t rx; +- uint64_t tx_drop; +- uint64_t rx_drop; +- uint64_t rx_allocmbuf_fail; +- uint64_t tx_allocmbuf_fail; +- uint64_t call_msg_cnt; +- uint16_t conn_num; +- uint16_t send_idle_ring_cnt; +- uint64_t event_list; ++struct gazelle_stack_stat { ++ uint64_t wakeup_events; ++ uint64_t write_lwip_cnt; ++ uint64_t send_self_rpc; + uint64_t read_lwip_drop; + uint64_t read_lwip_cnt; +- uint64_t write_lwip_drop; +- uint64_t write_lwip_cnt; ++ 
uint64_t rx_allocmbuf_fail; ++ uint64_t tx_allocmbuf_fail; ++ uint64_t call_null; ++ uint64_t rx_drop; ++ uint64_t rx; ++ uint64_t tx_drop; ++ uint64_t tx; ++}; ++ ++struct gazelle_wakeup_stat { ++ uint64_t app_events; ++ uint64_t app_write_idlefail; + uint64_t app_write_cnt; + uint64_t app_read_cnt; +- uint64_t app_write_idlefail; +- uint64_t app_write_drop; +- uint64_t recv_list; +- uint64_t wakeup_events; +- uint64_t app_events; +- uint64_t call_alloc_fail; + uint64_t read_null; +- uint64_t call_null; +- uint64_t arp_copy_fail; +- uint64_t send_self_rpc; +- uint64_t send_list; ++}; ++ ++struct gazelle_stat_pkts { ++ uint64_t call_msg_cnt; ++ uint16_t conn_num; ++ uint64_t recv_list_cnt; ++ uint64_t call_alloc_fail; ++ uint64_t send_list_cnt; ++ struct gazelle_stack_stat stack_stat; ++ struct gazelle_wakeup_stat wakeup_stat; + }; + + /* same as define in lwip/stats.h - struct stats_mib2 */ +@@ -159,6 +162,7 @@ struct gazelle_stat_lstack_conn_info { + uint32_t recv_ring_cnt; + uint32_t tcp_sub_state; + int32_t sem_cnt; ++ int32_t fd; + }; + + struct gazelle_stat_lstack_conn { +diff --git a/src/lstack/api/lstack_epoll.c b/src/lstack/api/lstack_epoll.c +index cba67ea..4978f02 100644 +--- a/src/lstack/api/lstack_epoll.c ++++ b/src/lstack/api/lstack_epoll.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -39,36 +40,33 @@ + #define SEC_TO_NSEC 1000000000 + #define SEC_TO_MSEC 1000 + #define MSEC_TO_NSEC 1000000 +-#define EPOLL_MAX_EVENTS 512 + #define POLL_KERNEL_EVENTS 32 + +-static PER_THREAD struct wakeup_poll g_wakeup_poll = {0}; +-static bool g_use_epoll = false; /* FIXME: when no epoll close prepare event for performance testing */ +- + void add_epoll_event(struct netconn *conn, uint32_t event) + { + /* conn sock nerver null, because lwip call this func */ +- struct lwip_sock *sock = get_socket(conn->socket); +- +- if ((event & sock->epoll_events) == 0) { ++ struct lwip_sock *sock = get_socket_by_fd(conn->socket); ++ if (sock->wakeup == NULL || (event & sock->epoll_events) == 0) { + return; + } ++ struct wakeup_poll *wakeup = sock->wakeup; ++ struct protocol_stack *stack = sock->stack; + +- sock->events |= event & sock->epoll_events; +- +-#ifdef GAZELLE_USE_EPOLL_EVENT_STACK +- if (g_use_epoll && list_is_empty(&sock->event_list)) { +- list_add_node(&sock->stack->event_list, &sock->event_list); ++ if (wakeup->type == WAKEUP_EPOLL) { ++ pthread_spin_lock(&wakeup->event_list_lock); ++ sock->events |= (event == EPOLLERR) ? 
(EPOLLIN | EPOLLERR) : (event & sock->epoll_events); ++ if (list_is_null(&sock->event_list)) { ++ list_add_node(&wakeup->event_list, &sock->event_list); ++ } ++ pthread_spin_unlock(&wakeup->event_list_lock); + } +-#endif + +- if (sock->wakeup) { +- sock->stack->stats.wakeup_events++; +- if (get_protocol_stack_group()->wakeup_enable) { +- rte_ring_sp_enqueue(sock->stack->wakeup_ring, &sock->wakeup->event_sem); +- } else { +- sem_post(&sock->wakeup->event_sem); +- } ++ stack->stats.wakeup_events++; ++ sem_t *sem = &wakeup->event_sem; ++ if (get_protocol_stack_group()->wakeup_enable) { ++ gazelle_light_ring_enqueue_busrt(stack->wakeup_ring, (void **)&sem, 1); ++ } else { ++ sem_post(sem); + } + } + +@@ -77,61 +75,34 @@ static inline uint32_t update_events(struct lwip_sock *sock) + uint32_t event = 0; + + if (sock->epoll_events & EPOLLIN) { +- if (sock->attach_fd > 0 && NETCONN_IS_ACCEPTIN(sock)) { +- event |= EPOLLIN; +- } +- +- if (sock->attach_fd < 0 && NETCONN_IS_DATAIN(sock)) { ++ if (NETCONN_IS_DATAIN(sock) || NETCONN_IS_ACCEPTIN(sock)) { + event |= EPOLLIN; + } + } + + if ((sock->epoll_events & EPOLLOUT) && NETCONN_IS_OUTIDLE(sock)) { +- event |= EPOLLOUT; ++ /* lwip_netconn_do_connected set LIBOS FLAGS when connected */ ++ if (sock->conn && CONN_TYPE_IS_LIBOS(sock->conn)) { ++ event |= EPOLLOUT; ++ } + } + +- if ((sock->epoll_events & EPOLLERR) && (sock->events & EPOLLERR)) { ++ if (sock->errevent > 0) { + event |= EPOLLERR | EPOLLIN; + } + + return event; + } + +-#ifdef GAZELLE_USE_EPOLL_EVENT_STACK +-void update_stack_events(struct protocol_stack *stack) ++static void raise_pending_events(struct wakeup_poll *wakeup, struct lwip_sock *sock) + { +- if (!g_use_epoll) { +- return; +- } +- +- struct list_node *node, *temp; +- list_for_each_safe(node, temp, &stack->event_list) { +- struct lwip_sock *sock = container_of(node, struct lwip_sock, event_list); +- +- sock->events = update_events(sock); +- if (sock->events != 0) { +- continue; +- } +- +- if (pthread_spin_trylock(&stack->event_lock)) { +- continue; ++ sock->events = update_events(sock); ++ if (sock->events) { ++ pthread_spin_lock(&wakeup->event_list_lock); ++ if (list_is_null(&sock->event_list)) { ++ list_add_node(&wakeup->event_list, &sock->event_list); + } +- list_del_node_init(&sock->event_list); +- pthread_spin_unlock(&stack->event_lock); +- } +-} +-#endif +- +-static void raise_pending_events(struct lwip_sock *sock) +-{ +- struct lwip_sock *attach_sock = (sock->attach_fd > 0) ? 
get_socket_by_fd(sock->attach_fd) : sock; +- if (attach_sock == NULL) { +- return; +- } +- +- attach_sock->events = update_events(attach_sock); +- if (attach_sock->events & attach_sock->epoll_events) { +- rpc_call_addevent(attach_sock->stack, attach_sock); ++ pthread_spin_unlock(&wakeup->event_list_lock); + } + } + +@@ -157,11 +128,15 @@ int32_t lstack_epoll_create(int32_t size) + memset_s(wakeup, sizeof(struct wakeup_poll), 0, sizeof(struct wakeup_poll)); + + init_list_node(&wakeup->event_list); +- wakeup->epollfd = fd; + sem_init(&wakeup->event_sem, 0, 0); ++ pthread_spin_init(&wakeup->event_list_lock, PTHREAD_PROCESS_PRIVATE); ++ ++ wakeup->type = WAKEUP_EPOLL; ++ wakeup->epollfd = fd; + sock->wakeup = wakeup; + +- g_use_epoll = true; ++ register_wakeup(wakeup); ++ + return fd; + } + +@@ -176,6 +151,9 @@ int32_t lstack_epoll_close(int32_t fd) + } + + if (sock->wakeup) { ++ unregister_wakeup(sock->wakeup); ++ sem_destroy(&sock->wakeup->event_sem); ++ pthread_spin_destroy(&sock->wakeup->event_list_lock); + free(sock->wakeup); + } + sock->wakeup = NULL; +@@ -236,161 +214,116 @@ int32_t lstack_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_even + + struct lwip_sock *sock = get_socket(fd); + if (sock == NULL) { +- epoll_sock->wakeup->have_kernel_fd = true; + return posix_api->epoll_ctl_fn(epfd, op, fd, event); + } + + if (CONN_TYPE_HAS_HOST(sock->conn)) { +- epoll_sock->wakeup->have_kernel_fd = true; + int32_t ret = posix_api->epoll_ctl_fn(epfd, op, fd, event); + if (ret < 0) { +- return ret; ++ LSTACK_LOG(ERR, LSTACK, "fd=%d epfd=%d op=%d\n", fd, epfd, op); + } + } + ++ struct wakeup_poll *wakeup = epoll_sock->wakeup; + do { + switch (op) { + case EPOLL_CTL_ADD: +- sock->wakeup = epoll_sock->wakeup; +- if (sock->stack) { +- epoll_sock->wakeup->stack_fd_cnt[sock->stack->queue_id]++; +- } +- if (list_is_empty(&sock->event_list)) { +- list_add_node(&sock->wakeup->event_list, &sock->event_list); +- } ++ sock->wakeup = wakeup; ++ wakeup->stack_fd_cnt[sock->stack->queue_id]++; + /* fall through */ + case EPOLL_CTL_MOD: + sock->epoll_events = event->events | EPOLLERR | EPOLLHUP; + sock->ep_data = event->data; +- if (sock->conn && NETCONNTYPE_GROUP(netconn_type(sock->conn)) == NETCONN_TCP) { +- raise_pending_events(sock); +- } ++ raise_pending_events(wakeup, sock); + break; + case EPOLL_CTL_DEL: +- list_del_node_init(&sock->event_list); + sock->epoll_events = 0; +- if (sock->stack) { +- epoll_sock->wakeup->stack_fd_cnt[sock->stack->queue_id]--; +- } ++ wakeup->stack_fd_cnt[sock->stack->queue_id]--; ++ pthread_spin_lock(&wakeup->event_list_lock); ++ list_del_node_null(&sock->event_list); ++ pthread_spin_unlock(&wakeup->event_list_lock); + break; + default: + GAZELLE_RETURN(EINVAL); + } +- fd = sock->nextfd; +- sock = get_socket(fd); +- } while (fd > 0 && sock != NULL); ++ sock = sock->listen_next; ++ } while (sock != NULL); + +- update_epoll_max_stack(epoll_sock->wakeup); ++ update_epoll_max_stack(wakeup); + return 0; + } + +-#ifdef GAZELLE_USE_EPOLL_EVENT_STACK +-static int32_t epoll_lwip_event(struct wakeup_poll *wakeup, struct epoll_event *events, uint32_t maxevents) ++static void del_node_array(struct epoll_event *events, int32_t event_num, int32_t del_index) + { +- int32_t event_num = 0; +- struct protocol_stack_group *stack_group = get_protocol_stack_group(); +- +- maxevents = LWIP_MIN(EPOLL_MAX_EVENTS, maxevents); +- for (uint32_t i = 0; i < stack_group->stack_num && event_num < maxevents; i++) { +- struct protocol_stack *stack = stack_group->stacks[i]; +- int32_t 
start_event_num = event_num; +- +- if (pthread_spin_trylock(&stack->event_lock)) { +- continue; +- } +- +- struct list_node *node, *temp; +- list_for_each_safe(node, temp, &stack->event_list) { +- struct lwip_sock *sock = container_of(node, struct lwip_sock, event_list); +- +- uint32_t event = sock->events & sock->epoll_events; +- if (event == 0 || sock->wait_close) { +- continue; +- } +- +- events[event_num].events = event; +- events[event_num].data = sock->ep_data; +- event_num++; ++ for (int32_t i = del_index; i + 1 < event_num; i++) { ++ events[i] = events[i + 1]; ++ } ++} + +- if (event_num >= maxevents) { +- break; ++static int32_t del_duplicate_event(struct epoll_event *events, int32_t event_num) ++{ ++ for (int32_t i = 0; i < event_num; i++) { ++ for (int32_t j = i + 1; j < event_num; j++) { ++ if (events[i].data.u64 == events[j].data.u64) { ++ del_node_array(events, event_num, j); ++ event_num--; + } + } +- +- pthread_spin_unlock(&stack->event_lock); +- +- __sync_fetch_and_add(&stack->stats.app_events, event_num - start_event_num); + } + + return event_num; + } +-#else ++ + static int32_t epoll_lwip_event(struct wakeup_poll *wakeup, struct epoll_event *events, uint32_t maxevents) + { + int32_t event_num = 0; + struct list_node *node, *temp; ++ int32_t accept_num = 0; ++ ++ pthread_spin_lock(&wakeup->event_list_lock); ++ + list_for_each_safe(node, temp, &wakeup->event_list) { + struct lwip_sock *sock = container_of(node, struct lwip_sock, event_list); +- if (sock->conn == NULL) { +- list_del_node_init(&sock->event_list); +- continue; ++ if (sock->conn && sock->conn->acceptmbox) { ++ accept_num++; + } + +- struct lwip_sock *temp_sock = sock; +- do { +- struct lwip_sock *attach_sock = (temp_sock->attach_fd > 0) ? get_socket(temp_sock->attach_fd) : temp_sock; +- if (attach_sock == NULL || temp_sock->wait_close) { +- temp_sock = (temp_sock->nextfd > 0) ? get_socket(temp_sock->nextfd) : NULL; +- continue; +- } ++ events[event_num].events = sock->events; ++ events[event_num].data = sock->ep_data; ++ event_num++; + +- uint32_t event = update_events(attach_sock); +- if (event != 0) { +- events[event_num].events = event; +- events[event_num].data = temp_sock->ep_data; +- event_num++; +- if (event_num >= maxevents) { +- break; +- } +- } ++ if (event_num >= maxevents) { ++ break; ++ } ++ } + +- temp_sock = (temp_sock->nextfd > 0) ? get_socket(temp_sock->nextfd) : NULL; +- } while (temp_sock); ++ pthread_spin_unlock(&wakeup->event_list_lock); ++ ++ if (accept_num > 1) { ++ event_num = del_duplicate_event(events, event_num); + } + ++ // atomic_fetch_add(&wakeup->bind_stack->stats.app_events, event_num); + return event_num; + } +-#endif + + static int32_t poll_lwip_event(struct pollfd *fds, nfds_t nfds) + { + int32_t event_num = 0; + + for (uint32_t i = 0; i < nfds; i++) { +- /* listenfd nextfd pointerto next stack listen, others nextfd=-1 */ ++ /* sock->listen_next pointerto next stack listen */ + int32_t fd = fds[i].fd; +- while (fd > 0) { +- struct lwip_sock *sock = get_socket(fd); +- if (sock == NULL) { +- break; +- } +- +- /* attach listen is empty, all event in attached listen. attached listen attach_fd is self */ +- struct lwip_sock *attach_sock = (sock->attach_fd > 0) ? 
get_socket(sock->attach_fd) : sock; +- if (attach_sock == NULL || sock->wait_close) { +- fd = sock->nextfd; +- continue; +- } +- +- uint32_t events = update_events(attach_sock); ++ struct lwip_sock *sock = get_socket_by_fd(fd); ++ while (sock && sock->conn) { ++ uint32_t events = update_events(sock); + if (events) { + fds[i].revents = events; +- __sync_fetch_and_add(&sock->stack->stats.app_events, 1); + event_num++; + break; + } + +- fd = sock->nextfd; ++ sock = sock->listen_next;; + } + } + +@@ -417,7 +350,7 @@ static void change_epollfd_kernel_thread(struct wakeup_poll *wakeup, struct prot + + /* avoid kernel thread post too much, use EPOLLET */ + struct epoll_event event; +- event.data.ptr = &wakeup->event_sem; ++ event.data.ptr = wakeup; + event.events = EPOLLIN | EPOLLOUT | EPOLLERR | EPOLLHUP | EPOLLET; + if (posix_api->epoll_ctl_fn(new_stack->epollfd, EPOLL_CTL_ADD, wakeup->epollfd, &event) != 0) { + LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); +@@ -457,15 +390,18 @@ int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxe + + do { + event_num += epoll_lwip_event(sock->wakeup, &events[event_num], maxevents - event_num); ++ sock->wakeup->stat.app_events += event_num; + +- if (sock->wakeup->have_kernel_fd) { ++ if (__atomic_load_n(&sock->wakeup->have_kernel_event, __ATOMIC_RELAXED)) { + event_num += posix_api->epoll_wait_fn(epfd, &events[event_num], maxevents - event_num, 0); + } + + if (event_num > 0) { ++ while (sem_trywait(&sock->wakeup->event_sem) == 0); + break; + } + ++ sock->wakeup->have_kernel_event = false; + if (timeout < 0) { + ret = sem_wait(&sock->wakeup->event_sem); + } else { +@@ -479,6 +415,7 @@ int32_t lstack_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxe + static void init_poll_wakeup_data(struct wakeup_poll *wakeup) + { + sem_init(&wakeup->event_sem, 0, 0); ++ wakeup->type = WAKEUP_POLL; + + wakeup->last_fds = calloc(POLL_KERNEL_EVENTS, sizeof(struct pollfd)); + if (wakeup->last_fds == NULL) { +@@ -542,11 +479,6 @@ static void update_kernel_poll(struct wakeup_poll *wakeup, uint32_t index, struc + if (posix_api->epoll_ctl_fn(wakeup->epollfd, EPOLL_CTL_ADD, new_fd->fd, &event) != 0) { + LSTACK_LOG(ERR, LSTACK, "epoll_ctl_fn errno=%d\n", errno); + } +- +- wakeup->last_fds[index].fd = new_fd->fd; +- wakeup->last_fds[index].events = new_fd->events; +- +- wakeup->have_kernel_fd = true; + } + + static void poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfds) +@@ -554,17 +486,17 @@ static void poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfd + if (!wakeup->init) { + wakeup->init = true; + init_poll_wakeup_data(wakeup); +- } else { +- while (sem_trywait(&wakeup->event_sem) == 0) {} +- } +- +- if (nfds > wakeup->last_max_nfds) { +- resize_kernel_poll(wakeup, nfds); ++ register_wakeup(wakeup); + } + + int32_t stack_count[PROTOCOL_STACK_MAX] = {0}; + int32_t poll_change = 0; + ++ /* poll fds num more, recalloc fds size */ ++ if (nfds > wakeup->last_max_nfds) { ++ resize_kernel_poll(wakeup, nfds); ++ poll_change = 1; ++ } + /* poll fds num less, del old fd */ + for (uint32_t i = nfds; i < wakeup->last_nfds; i++) { + update_kernel_poll(wakeup, i, NULL); +@@ -572,44 +504,51 @@ static void poll_init(struct wakeup_poll *wakeup, struct pollfd *fds, nfds_t nfd + } + + for (uint32_t i = 0; i < nfds; i++) { ++ int32_t fd = fds[i].fd; + fds[i].revents = 0; ++ struct lwip_sock *sock = get_socket_by_fd(fd); + +- if (fds[i].fd == wakeup->last_fds[i].fd && fds[i].events == 
wakeup->last_fds[i].events) { +- continue; ++ if (fd == wakeup->last_fds[i].fd && fds[i].events == wakeup->last_fds[i].events) { ++ /* fd close then socket may get same fd. */ ++ if (sock == NULL || sock->wakeup != NULL) { ++ continue; ++ } + } ++ wakeup->last_fds[i].fd = fd; ++ wakeup->last_fds[i].events = fds[i].events; + poll_change = 1; + +- int32_t fd = fds[i].fd; +- struct lwip_sock *sock = get_socket(fd); +- if (sock == NULL || CONN_TYPE_HAS_HOST(sock->conn)) { ++ if (sock == NULL || sock->conn == NULL || CONN_TYPE_HAS_HOST(sock->conn)) { + update_kernel_poll(wakeup, i, fds + i); + } + +- do { +- sock = get_socket(fd); +- if (sock == NULL || sock->conn == NULL) { +- break; ++ while (sock && sock->conn) { ++ if (sock->epoll_events != (fds[i].events | POLLERR)) { ++ sock->epoll_events = fds[i].events | POLLERR; ++ } ++ if (sock->wakeup != wakeup) { ++ sock->wakeup = wakeup; + } +- sock->epoll_events = fds[i].events | POLLERR; +- sock->wakeup = wakeup; + +- /* listenfd list */ +- fd = sock->nextfd; + stack_count[sock->stack->queue_id]++; +- } while (fd > 0); ++ /* listenfd list */ ++ sock = sock->listen_next; ++ } + } + +- wakeup->last_nfds = nfds; + if (poll_change == 0) { + return; + } ++ wakeup->last_nfds = nfds; + + poll_bind_statck(wakeup, stack_count); + } + + int32_t lstack_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) + { +- poll_init(&g_wakeup_poll, fds, nfds); ++ static PER_THREAD struct wakeup_poll wakeup_poll = {0}; ++ ++ poll_init(&wakeup_poll, fds, nfds); + + int32_t event_num = 0; + int32_t ret; +@@ -624,23 +563,25 @@ int32_t lstack_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) + event_num += poll_lwip_event(fds, nfds); + + /* reduce syscall epoll_wait */ +- if (g_wakeup_poll.have_kernel_fd) { +- int32_t kernel_num = posix_api->epoll_wait_fn(g_wakeup_poll.epollfd, g_wakeup_poll.events, nfds, 0); ++ if (__atomic_load_n(&wakeup_poll.have_kernel_event, __ATOMIC_RELAXED)) { ++ int32_t kernel_num = posix_api->epoll_wait_fn(wakeup_poll.epollfd, wakeup_poll.events, nfds, 0); + for (int32_t i = 0; i < kernel_num; i++) { +- uint32_t index = g_wakeup_poll.events[i].data.u32; +- fds[index].revents = g_wakeup_poll.events[i].events; ++ uint32_t index = wakeup_poll.events[i].data.u32; ++ fds[index].revents = wakeup_poll.events[i].events; + } + event_num += kernel_num >= 0 ? 
kernel_num : 0; + } + + if (event_num > 0) { ++ while (sem_trywait(&wakeup_poll.event_sem) == 0); + break; + } + ++ wakeup_poll.have_kernel_event = false; + if (timeout < 0) { +- ret = sem_wait(&g_wakeup_poll.event_sem); ++ ret = sem_wait(&wakeup_poll.event_sem); + } else { +- ret = sem_timedwait(&g_wakeup_poll.event_sem, &poll_time); ++ ret = sem_timedwait(&wakeup_poll.event_sem, &poll_time); + } + } while (ret == 0); + +diff --git a/src/lstack/api/lstack_wrap.c b/src/lstack/api/lstack_wrap.c +index bf5dcb4..ec68d62 100644 +--- a/src/lstack/api/lstack_wrap.c ++++ b/src/lstack/api/lstack_wrap.c +@@ -52,7 +52,7 @@ static inline enum KERNEL_LWIP_PATH select_path(int fd) + return PATH_KERNEL; + } + +- if (unlikely(posix_api->is_chld)) { ++ if (unlikely(posix_api->ues_posix)) { + return PATH_KERNEL; + } + +@@ -84,7 +84,7 @@ static inline int32_t do_epoll_create(int32_t size) + return posix_api->epoll_create_fn(size); + } + +- if (unlikely(posix_api->is_chld)) { ++ if (unlikely(posix_api->ues_posix)) { + return posix_api->epoll_create_fn(size); + } + +@@ -93,7 +93,7 @@ static inline int32_t do_epoll_create(int32_t size) + + static inline int32_t do_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct epoll_event* event) + { +- if (unlikely(posix_api->is_chld)) { ++ if (unlikely(posix_api->ues_posix)) { + return posix_api->epoll_ctl_fn(epfd, op, fd, event); + } + +@@ -102,7 +102,7 @@ static inline int32_t do_epoll_ctl(int32_t epfd, int32_t op, int32_t fd, struct + + static inline int32_t do_epoll_wait(int32_t epfd, struct epoll_event* events, int32_t maxevents, int32_t timeout) + { +- if (unlikely(posix_api->is_chld)) { ++ if (unlikely(posix_api->ues_posix)) { + return posix_api->epoll_wait_fn(epfd, events, maxevents, timeout); + } + +@@ -203,7 +203,8 @@ static inline int32_t do_listen(int32_t s, int32_t backlog) + return posix_api->listen_fn(s, backlog); + } + +- int32_t ret = stack_broadcast_listen(s, backlog); ++ int32_t ret = get_global_cfg_params()->listen_shadow ? 
stack_broadcast_listen(s, backlog) : ++ stack_single_listen(s, backlog); + if (ret != 0) { + return ret; + } +@@ -264,7 +265,7 @@ static inline int32_t do_setsockopt(int32_t s, int32_t level, int32_t optname, c + static inline int32_t do_socket(int32_t domain, int32_t type, int32_t protocol) + { + if ((domain != AF_INET && domain != AF_UNSPEC) +- || posix_api->is_chld) { ++ || posix_api->ues_posix) { + return posix_api->socket_fn(domain, type, protocol); + } + +@@ -368,7 +369,7 @@ static int32_t do_poll(struct pollfd *fds, nfds_t nfds, int32_t timeout) + GAZELLE_RETURN(EINVAL); + } + +- if (unlikely(posix_api->is_chld) || nfds == 0) { ++ if (unlikely(posix_api->ues_posix) || nfds == 0) { + return posix_api->poll_fn(fds, nfds, timeout); + } + +diff --git a/src/lstack/core/lstack_cfg.c b/src/lstack/core/lstack_cfg.c +index 13086a3..ca2b979 100644 +--- a/src/lstack/core/lstack_cfg.c ++++ b/src/lstack/core/lstack_cfg.c +@@ -56,6 +56,7 @@ static int32_t parse_devices(void); + static int32_t parse_dpdk_args(void); + static int32_t parse_gateway_addr(void); + static int32_t parse_kni_switch(void); ++static int32_t parse_listen_shadow(void); + + struct config_vector_t { + const char *name; +@@ -73,6 +74,7 @@ static struct config_vector_t g_config_tbl[] = { + { "num_wakeup", parse_wakeup_cpu_number }, + { "low_power_mode", parse_low_power_mode }, + { "kni_switch", parse_kni_switch }, ++ { "listen_shadow", parse_listen_shadow }, + { NULL, NULL } + }; + +@@ -670,6 +672,22 @@ static int32_t parse_use_ltran(void) + return 0; + } + ++static int32_t parse_listen_shadow(void) ++{ ++ const config_setting_t *arg = NULL; ++ ++ arg = config_lookup(&g_config, "listen_shadow"); ++ if (arg == NULL) { ++ g_config_params.listen_shadow = false; ++ return 0; ++ } ++ ++ int32_t val = config_setting_get_int(arg); ++ g_config_params.listen_shadow = (val == 0) ? 
false : true; ++ ++ return 0; ++} ++ + static int32_t parse_kni_switch(void) + { + const config_setting_t *arg = NULL; +diff --git a/src/lstack/core/lstack_control_plane.c b/src/lstack/core/lstack_control_plane.c +index 26a1b1c..ef38fb5 100644 +--- a/src/lstack/core/lstack_control_plane.c ++++ b/src/lstack/core/lstack_control_plane.c +@@ -713,7 +713,7 @@ void control_server_thread(void *arg) + struct epoll_event evt_array; + while (1) { + /* wait init finish */ +- if (posix_api->is_chld) { ++ if (posix_api->ues_posix) { + usleep(GAZELLE_10MS); + continue; + } +@@ -759,7 +759,7 @@ void control_client_thread(void *arg) + + while (1) { + /* wait init finish */ +- if (posix_api->is_chld) { ++ if (posix_api->ues_posix) { + usleep(GAZELLE_10MS); + continue; + } +diff --git a/src/lstack/core/lstack_dpdk.c b/src/lstack/core/lstack_dpdk.c +index df0332b..6675d7b 100644 +--- a/src/lstack/core/lstack_dpdk.c ++++ b/src/lstack/core/lstack_dpdk.c +@@ -129,26 +129,6 @@ static struct rte_mempool *create_pktmbuf_mempool(const char *name, uint32_t nb_ + return pool; + } + +-struct rte_mempool *create_rpc_mempool(const char *name, uint16_t queue_id) +-{ +- char pool_name[PATH_MAX]; +- struct rte_mempool *pool; +- int32_t ret; +- +- ret = snprintf_s(pool_name, sizeof(pool_name), PATH_MAX - 1, "%s_%hu", name, queue_id); +- if (ret < 0) { +- return NULL; +- } +- +- pool = rte_mempool_create(pool_name, CALL_POOL_SZ, sizeof(struct rpc_msg), 0, 0, NULL, NULL, NULL, +- NULL, rte_socket_id(), 0); +- if (pool == NULL) { +- LSTACK_LOG(ERR, LSTACK, "cannot create %s pool rte_err=%d\n", pool_name, rte_errno); +- } +- +- return pool; +-} +- + static struct reg_ring_msg *create_reg_mempool(const char *name, uint16_t queue_id) + { + int ret; +@@ -175,13 +155,13 @@ int32_t pktmbuf_pool_init(struct protocol_stack *stack, uint16_t stack_num) + return -1; + } + +- stack->rx_pktmbuf_pool = create_pktmbuf_mempool("rx_mbuf", RX_NB_MBUF, RX_MBUF_CACHE_SZ, ++ stack->rx_pktmbuf_pool = create_pktmbuf_mempool("rx_mbuf", RX_NB_MBUF / stack_num, RX_MBUF_CACHE_SZ, + stack->queue_id); + if (stack->rx_pktmbuf_pool == NULL) { + return -1; + } + +- stack->tx_pktmbuf_pool = create_pktmbuf_mempool("tx_mbuf", TX_NB_MBUF, TX_MBUF_CACHE_SZ, ++ stack->tx_pktmbuf_pool = create_pktmbuf_mempool("tx_mbuf", TX_NB_MBUF / stack_num, TX_MBUF_CACHE_SZ, + stack->queue_id); + if (stack->tx_pktmbuf_pool == NULL) { + return -1; +@@ -220,12 +200,14 @@ int32_t create_shared_ring(struct protocol_stack *stack) + lockless_queue_init(&stack->rpc_queue); + + if (get_protocol_stack_group()->wakeup_enable) { +- stack->wakeup_ring = create_ring("WAKEUP_RING", VDEV_WAKEUP_QUEUE_SZ, 0, stack->queue_id); ++ stack->wakeup_ring = create_ring("WAKEUP_RING", VDEV_WAKEUP_QUEUE_SZ, RING_F_SP_ENQ | RING_F_SC_DEQ, ++ stack->queue_id); + if (stack->wakeup_ring == NULL) { + return -1; + } + } + ++ + if (use_ltran()) { + stack->rx_ring = create_ring("RING_RX", VDEV_RX_QUEUE_SZ, RING_F_SP_ENQ | RING_F_SC_DEQ, stack->queue_id); + if (stack->rx_ring == NULL) { +@@ -255,7 +237,7 @@ int32_t fill_mbuf_to_ring(struct rte_mempool *mempool, struct rte_ring *ring, ui + struct rte_mbuf *free_buf[FREE_RX_QUEUE_SZ]; + + while (remain > 0) { +- batch = LWIP_MIN(remain, FREE_RX_QUEUE_SZ); ++ batch = LWIP_MIN(remain, RING_SIZE(FREE_RX_QUEUE_SZ)); + + ret = gazelle_alloc_pktmbuf(mempool, free_buf, batch); + if (ret != 0) { +@@ -263,7 +245,7 @@ int32_t fill_mbuf_to_ring(struct rte_mempool *mempool, struct rte_ring *ring, ui + return -1; + } + +- ret = rte_ring_en_enqueue_bulk(ring, (void **)free_buf, 
batch); ++ ret = gazelle_ring_sp_enqueue(ring, (void **)free_buf, batch); + if (ret == 0) { + LSTACK_LOG(ERR, LSTACK, "cannot enqueue to ring, count: %d\n", (int32_t)batch); + return -1; +diff --git a/src/lstack/core/lstack_init.c b/src/lstack/core/lstack_init.c +index f8e96bf..78040b0 100644 +--- a/src/lstack/core/lstack_init.c ++++ b/src/lstack/core/lstack_init.c +@@ -143,7 +143,7 @@ static int32_t check_preload_bind_proc(void) + + __attribute__((destructor)) void gazelle_network_exit(void) + { +- if (posix_api != NULL && !posix_api->is_chld) { ++ if (posix_api != NULL && !posix_api->ues_posix) { + lwip_exit(); + } + +@@ -275,7 +275,6 @@ __attribute__((constructor)) void gazelle_network_init(void) + LSTACK_EXIT(1, "stack thread or kernel_event thread failed\n"); + } + +- posix_api->is_chld = 0; ++ posix_api->ues_posix = 0; + LSTACK_LOG(INFO, LSTACK, "gazelle_network_init success\n"); +- rte_smp_mb(); + } +diff --git a/src/lstack/core/lstack_lwip.c b/src/lstack/core/lstack_lwip.c +index 8544ef7..156fc1f 100644 +--- a/src/lstack/core/lstack_lwip.c ++++ b/src/lstack/core/lstack_lwip.c +@@ -35,33 +35,24 @@ + #define HALF_DIVISOR (2) + #define USED_IDLE_WATERMARK (VDEV_IDLE_QUEUE_SZ >> 2) + +-void listen_list_add_node(int32_t head_fd, int32_t add_fd) +-{ +- struct lwip_sock *sock = NULL; +- int32_t fd = head_fd; +- +- while (fd > 0) { +- sock = get_socket(fd); +- if (sock == NULL) { +- LSTACK_LOG(ERR, LSTACK, "tid %ld, %d get sock null\n", get_stack_tid(), fd); +- return; +- } +- fd = sock->nextfd; +- } +- sock->nextfd = add_fd; +-} ++static int32_t lwip_alloc_pbufs(pbuf_layer layer, uint16_t length, pbuf_type type, void **pbufs, uint32_t num); + + static void free_ring_pbuf(struct rte_ring *ring) + { +- while (1) { +- struct pbuf *pbuf = NULL; +- int32_t ret = rte_ring_sc_dequeue(ring, (void **)&pbuf); +- if (ret != 0) { +- break; +- } ++ void *pbufs[SOCK_RECV_RING_SIZE]; + +- pbuf_free(pbuf); +- } ++ do { ++ gazelle_ring_read(ring, pbufs, RING_SIZE(SOCK_RECV_RING_SIZE)); ++ gazelle_ring_read_over(ring); ++ } while (gazelle_ring_readable_count(ring)); ++ ++ do { ++ uint32_t num = gazelle_ring_sc_dequeue(ring, pbufs, RING_SIZE(SOCK_RECV_RING_SIZE)); ++ ++ for (uint32_t i = 0; i < num; i++) { ++ pbuf_free(pbufs[i]); ++ } ++ } while (gazelle_ring_readover_count(ring)); + } + + static void reset_sock_data(struct lwip_sock *sock) +@@ -73,11 +64,6 @@ static void reset_sock_data(struct lwip_sock *sock) + } + sock->recv_ring = NULL; + +- if (sock->recv_wait_free) { +- free_ring_pbuf(sock->recv_wait_free); +- rte_ring_free(sock->recv_wait_free); +- } +- sock->recv_wait_free = NULL; + + if (sock->send_ring) { + free_ring_pbuf(sock->send_ring); +@@ -85,19 +71,11 @@ static void reset_sock_data(struct lwip_sock *sock) + } + sock->send_ring = NULL; + +- if (sock->send_idle_ring) { +- free_ring_pbuf(sock->send_idle_ring); +- rte_ring_free(sock->send_idle_ring); +- } +- sock->send_idle_ring = NULL; + + sock->stack = NULL; + sock->wakeup = NULL; +- sock->events = 0; +- sock->nextfd = -1; +- sock->attach_fd = -1; ++ sock->listen_next = NULL; + sock->wait_close = false; +- sock->shadowed_sock = NULL; + sock->epoll_events = 0; + sock->events = 0; + +@@ -105,34 +83,29 @@ static void reset_sock_data(struct lwip_sock *sock) + pbuf_free(sock->recv_lastdata); + } + sock->recv_lastdata = NULL; +- +- if (sock->send_lastdata) { +- pbuf_free(sock->send_lastdata); +- } +- sock->send_lastdata = NULL; + } + + static void replenish_send_idlembuf(struct rte_ring *ring) + { +- uint32_t replenish_cnt = 
rte_ring_free_count(ring); ++ void *pbuf[SOCK_SEND_RING_SIZE]; + +- for (uint32_t i = 0; i < replenish_cnt; i++) { +- struct pbuf *pbuf = lwip_alloc_pbuf(PBUF_TRANSPORT, TCP_MSS, PBUF_RAM); +- if (pbuf == NULL) { +- break; +- } ++ uint32_t replenish_cnt = gazelle_ring_free_count(ring); ++ uint32_t alloc_num = LWIP_MIN(replenish_cnt, RING_SIZE(SOCK_SEND_RING_SIZE)); + +- int32_t ret = rte_ring_sp_enqueue(ring, (void *)pbuf); +- if (ret < 0) { +- pbuf_free(pbuf); +- break; +- } ++ if (lwip_alloc_pbufs(PBUF_TRANSPORT, TCP_MSS, PBUF_RAM, (void **)pbuf, alloc_num) != 0) { ++ return; ++ } ++ ++ uint32_t num = gazelle_ring_sp_enqueue(ring, pbuf, alloc_num); ++ for (uint32_t i = num; i < alloc_num; i++) { ++ pbuf_free(pbuf[i]); + } + } + + void gazelle_init_sock(int32_t fd) + { + static uint32_t name_tick = 0; ++ struct protocol_stack *stack = get_protocol_stack(); + struct lwip_sock *sock = get_socket(fd); + if (sock == NULL) { + return; +@@ -140,38 +113,26 @@ void gazelle_init_sock(int32_t fd) + + reset_sock_data(sock); + +- sock->recv_ring = create_ring("sock_recv", SOCK_RECV_RING_SIZE, 0, atomic_fetch_add(&name_tick, 1)); ++ sock->recv_ring = create_ring("sock_recv", SOCK_RECV_RING_SIZE, RING_F_SP_ENQ | RING_F_SC_DEQ, ++ atomic_fetch_add(&name_tick, 1)); + if (sock->recv_ring == NULL) { + LSTACK_LOG(ERR, LSTACK, "sock_recv create failed. errno: %d.\n", rte_errno); + return; + } + +- sock->recv_wait_free = create_ring("wait_free", SOCK_RECV_RING_SIZE, 0, atomic_fetch_add(&name_tick, 1)); +- if (sock->recv_wait_free == NULL) { +- LSTACK_LOG(ERR, LSTACK, "wait_free create failed. errno: %d.\n", rte_errno); +- return; +- } +- +- sock->send_ring = create_ring("sock_send", SOCK_SEND_RING_SIZE, 0, atomic_fetch_add(&name_tick, 1)); ++ sock->send_ring = create_ring("sock_send", SOCK_SEND_RING_SIZE, RING_F_SP_ENQ | RING_F_SC_DEQ, ++ atomic_fetch_add(&name_tick, 1)); + if (sock->send_ring == NULL) { + LSTACK_LOG(ERR, LSTACK, "sock_send create failed. errno: %d.\n", rte_errno); + return; + } ++ replenish_send_idlembuf(sock->send_ring); + +- sock->send_idle_ring = create_ring("idle_send", SOCK_SEND_RING_SIZE, 0, atomic_fetch_add(&name_tick, 1)); +- if (sock->send_idle_ring == NULL) { +- LSTACK_LOG(ERR, LSTACK, "idle_send create failed. 
errno: %d.\n", rte_errno); +- return; +- } +- replenish_send_idlembuf(sock->send_idle_ring); +- +- sock->stack = get_protocol_stack(); ++ sock->stack = stack; + sock->stack->conn_num++; +- init_list_node(&sock->recv_list); +- init_list_node(&sock->attach_list); +- init_list_node(&sock->listen_list); +- init_list_node(&sock->event_list); +- init_list_node(&sock->send_list); ++ init_list_node_null(&sock->recv_list); ++ init_list_node_null(&sock->event_list); ++ init_list_node_null(&sock->send_list); + } + + void gazelle_clean_sock(int32_t fd) +@@ -181,17 +142,18 @@ void gazelle_clean_sock(int32_t fd) + return; + } + ++ if (sock->wakeup && sock->wakeup->type == WAKEUP_EPOLL) { ++ pthread_spin_lock(&sock->wakeup->event_list_lock); ++ list_del_node_null(&sock->event_list); ++ pthread_spin_unlock(&sock->wakeup->event_list_lock); ++ } ++ + sock->stack->conn_num--; + + reset_sock_data(sock); + +- list_del_node_init(&sock->recv_list); +- list_del_node_init(&sock->attach_list); +- list_del_node_init(&sock->listen_list); +-#ifdef GAZELLE_USE_EPOLL_EVENT_STACK +- list_del_node_init(&sock->event_list); +-#endif +- list_del_node_init(&sock->send_list); ++ list_del_node_null(&sock->recv_list); ++ list_del_node_null(&sock->send_list); + } + + void gazelle_free_pbuf(struct pbuf *pbuf) +@@ -201,45 +163,14 @@ void gazelle_free_pbuf(struct pbuf *pbuf) + } + + struct rte_mbuf *mbuf = pbuf_to_mbuf(pbuf); +- if (mbuf->pool != NULL) { +- rte_pktmbuf_free(mbuf); +- } else { +- rte_free(mbuf); +- } +-} +- +-static int32_t alloc_mbufs(struct rte_mempool *pool, struct rte_mbuf **mbufs, uint32_t num) +-{ +- // alloc mbuf from pool +- if (rte_pktmbuf_alloc_bulk(pool, mbufs, num) == 0) { +- return 0; +- } +- +- // alloc mbuf from system +- for (uint32_t i = 0; i < num; i++) { +- struct rte_mbuf *mbuf = (struct rte_mbuf *)rte_malloc(NULL, pool->elt_size, sizeof(uint64_t)); +- if (mbuf == NULL) { +- for (uint32_t j = 0; j < i; j++) { +- rte_free(mbufs[j]); +- mbufs[j] = NULL; +- } +- return -1; +- } +- +- mbufs[i] = mbuf; +- rte_pktmbuf_init(pool, NULL, mbuf, 0); +- rte_pktmbuf_reset(mbuf); +- mbuf->pool = NULL; +- } +- +- return 0; ++ rte_pktmbuf_free(mbuf); + } + + int32_t gazelle_alloc_pktmbuf(struct rte_mempool *pool, struct rte_mbuf **mbufs, uint32_t num) + { + struct pbuf_custom *pbuf_custom = NULL; + +- int32_t ret = alloc_mbufs(pool, mbufs, num); ++ int32_t ret = rte_pktmbuf_alloc_bulk(pool, mbufs, num); + if (ret != 0) { + return ret; + } +@@ -252,86 +183,98 @@ int32_t gazelle_alloc_pktmbuf(struct rte_mempool *pool, struct rte_mbuf **mbufs, + return 0; + } + +-struct pbuf *lwip_alloc_pbuf(pbuf_layer layer, uint16_t length, pbuf_type type) ++static struct pbuf *init_mbuf_to_pbuf(struct rte_mbuf *mbuf, pbuf_layer layer, uint16_t length, pbuf_type type) + { +- struct rte_mbuf *mbuf; +- int32_t ret = alloc_mbufs(get_protocol_stack()->tx_pktmbuf_pool, &mbuf, 1); +- if (ret != 0) { +- get_protocol_stack()->stats.tx_allocmbuf_fail++; +- return NULL; +- } +- + struct pbuf_custom *pbuf_custom = mbuf_to_pbuf(mbuf); + pbuf_custom->custom_free_function = gazelle_free_pbuf; + + void *data = rte_pktmbuf_mtod(mbuf, void *); + struct pbuf *pbuf = pbuf_alloced_custom(layer, length, type, pbuf_custom, data, MAX_PACKET_SZ); +- +-#if CHECKSUM_CHECK_IP_HW || CHECKSUM_CHECK_TCP_HW + if (pbuf) { + pbuf->ol_flags = 0; + pbuf->l2_len = 0; + pbuf->l3_len = 0; + } +-#endif + + return pbuf; + } + +-struct pbuf *write_lwip_data(struct lwip_sock *sock, uint16_t remain_size, uint8_t *apiflags) ++static int32_t lwip_alloc_pbufs(pbuf_layer 
layer, uint16_t length, pbuf_type type, void **bufs, uint32_t num) + { +- struct pbuf *pbuf = NULL; ++ int32_t ret = rte_pktmbuf_alloc_bulk(get_protocol_stack()->tx_pktmbuf_pool, (struct rte_mbuf **)bufs, num); ++ if (ret != 0) { ++ get_protocol_stack()->stats.tx_allocmbuf_fail++; ++ return -1; ++ } + +- if (sock->send_lastdata) { +- pbuf = sock->send_lastdata; +- sock->send_lastdata = NULL; +- } else { +- int32_t ret = rte_ring_sc_dequeue(sock->send_ring, (void **)&pbuf); +- if (ret != 0) { +- *apiflags &= ~TCP_WRITE_FLAG_MORE; +- return NULL; +- } ++ for (uint32_t i = 0; i < num; i++) { ++ bufs[i] = init_mbuf_to_pbuf(bufs[i], layer, length, type); + } + +- if (pbuf->tot_len >= remain_size) { +- sock->send_lastdata = pbuf; +- *apiflags |= TCP_WRITE_FLAG_MORE; /* set TCP_PSH flag */ ++ return 0; ++} ++ ++struct pbuf *lwip_alloc_pbuf(pbuf_layer layer, uint16_t length, pbuf_type type) ++{ ++ struct pbuf *pbuf; ++ ++ if (lwip_alloc_pbufs(layer, length, type, (void **)&pbuf, 1) != 0) { + return NULL; + } + +- replenish_send_idlembuf(sock->send_idle_ring); ++ return pbuf; ++} + +- if ((sock->epoll_events & EPOLLOUT) && rte_ring_free_count(sock->send_ring)) { +- add_epoll_event(sock->conn, EPOLLOUT); ++struct pbuf *write_lwip_data(struct lwip_sock *sock, uint16_t remain_size, uint8_t *apiflags) ++{ ++ struct pbuf *pbuf = NULL; ++ ++ if (gazelle_ring_sc_dequeue(sock->send_ring, (void **)&pbuf, 1) != 1) { ++ *apiflags &= ~TCP_WRITE_FLAG_MORE; ++ return NULL; + } + + sock->stack->stats.write_lwip_cnt++; + return pbuf; + } + ++static inline void del_data_out_event(struct lwip_sock *sock) ++{ ++ pthread_spin_lock(&sock->wakeup->event_list_lock); ++ ++ /* check again avoid cover event add in stack thread */ ++ if (!NETCONN_IS_OUTIDLE(sock)) { ++ sock->events &= ~EPOLLOUT; ++ ++ if (sock->events == 0) { ++ list_del_node_null(&sock->event_list); ++ } ++ } ++ ++ pthread_spin_unlock(&sock->wakeup->event_list_lock); ++} ++ + ssize_t write_stack_data(struct lwip_sock *sock, const void *buf, size_t len) + { +- if (sock->events & EPOLLERR) { ++ if (sock->errevent > 0) { + return 0; + } + +- uint32_t free_count = rte_ring_free_count(sock->send_ring); ++ uint32_t free_count = gazelle_ring_readable_count(sock->send_ring); + if (free_count == 0) { + return -1; + } + +- uint32_t avaible_cont = rte_ring_count(sock->send_idle_ring); +- avaible_cont = LWIP_MIN(free_count, avaible_cont); +- + struct pbuf *pbuf = NULL; + ssize_t send_len = 0; + size_t copy_len; + uint32_t send_pkt = 0; + +- while (send_len < len && send_pkt < avaible_cont) { +- int32_t ret = rte_ring_sc_dequeue(sock->send_idle_ring, (void **)&pbuf); +- if (ret < 0) { +- sock->stack->stats.app_write_idlefail++; ++ while (send_len < len && send_pkt < free_count) { ++ if (gazelle_ring_read(sock->send_ring, (void **)&pbuf, 1) != 1) { ++ if (sock->wakeup) { ++ sock->wakeup->stat.app_write_idlefail++; ++ } + break; + } + +@@ -339,21 +282,42 @@ ssize_t write_stack_data(struct lwip_sock *sock, const void *buf, size_t len) + pbuf_take(pbuf, (char *)buf + send_len, copy_len); + pbuf->tot_len = pbuf->len = copy_len; + +- ret = rte_ring_sp_enqueue(sock->send_ring, pbuf); +- if (ret != 0) { +- sock->stack->stats.app_write_drop++; +- pbuf_free(pbuf); +- break; +- } +- + send_len += copy_len; + send_pkt++; + } +- __sync_fetch_and_add(&sock->stack->stats.app_write_cnt, send_pkt); ++ gazelle_ring_read_over(sock->send_ring); ++ ++ if (sock->wakeup) { ++ sock->wakeup->stat.app_write_cnt += send_pkt; ++ if (sock->wakeup->type == WAKEUP_EPOLL && (sock->events & EPOLLOUT)) 
{
++ del_data_out_event(sock);
++ }
++ }
+
+ return (send_len <= 0) ? -1 : send_len;
+ }
+
++static void do_lwip_send(int32_t fd, struct lwip_sock *sock, int32_t flags)
++{
++ /* drain the whole send_ring, so pass lwip the max send len. */
++ ssize_t len = lwip_send(fd, sock, UINT16_MAX, flags);
++ if (len == 0) {
++ /* FIXME: should use POLLRDHUP when the connection is closed; lwip event-callback has no POLLRDHUP */
++ sock->errevent = 1;
++ add_epoll_event(sock->conn, EPOLLERR);
++ }
++
++ if (gazelle_ring_readable_count(sock->send_ring) < SOCK_SEND_REPLENISH_THRES) {
++ replenish_send_idlembuf(sock->send_ring);
++ }
++
++ if (len > 0) {
++ if ((sock->epoll_events & EPOLLOUT) && NETCONN_IS_OUTIDLE(sock)) {
++ add_epoll_event(sock->conn, EPOLLOUT);
++ }
++ }
++}
++
+ void stack_send(struct rpc_msg *msg)
+ {
+ int32_t fd = msg->args[MSG_ARG_0].i;
+@@ -369,17 +333,11 @@ void stack_send(struct rpc_msg *msg)
+ return;
+ }
+
+- /* send all send_ring, so len set lwip send max. */
+- ssize_t len = lwip_send(fd, sock, UINT16_MAX, flags);
+- if (len == 0) {
+- /* FIXME: should use POLLRDHUP, when connection be closed. lwip event-callback no POLLRDHUP */
+- add_epoll_event(sock->conn, EPOLLERR);
+- }
++ do_lwip_send(fd, sock, flags);
+
+ /* have remain data add sendlist */
+ if (NETCONN_IS_DATAOUT(sock)) {
+- if (list_is_empty(&sock->send_list)) {
+- sock->send_flags = flags;
++ if (list_is_null(&sock->send_list)) {
+ list_add_node(&sock->stack->send_list, &sock->send_list);
+ }
+ sock->stack->stats.send_self_rpc++;
+@@ -396,20 +354,14 @@ void send_stack_list(struct protocol_stack *stack, uint32_t send_max)
+ sock = container_of(node, struct lwip_sock, send_list);
+
+ if (sock->conn == NULL || !NETCONN_IS_DATAOUT(sock)) {
+- list_del_node_init(&sock->send_list);
++ list_del_node_null(&sock->send_list);
+ continue;
+ }
+
+- /* send all send_ring, so len set lwip send max. */
+- ssize_t len = lwip_send(sock->conn->socket, sock, UINT16_MAX, sock->send_flags);
+- if (len == 0) {
+- /* FIXME: should use POLLRDHUP, when connection be closed. 
lwip event-callback no POLLRDHUP */ +- add_epoll_event(sock->conn, EPOLLERR); +- list_del_node_init(&sock->send_list); +- } ++ do_lwip_send(sock->conn->socket, sock, 0); + + if (!NETCONN_IS_DATAOUT(sock)) { +- list_del_node_init(&sock->send_list); ++ list_del_node_null(&sock->send_list); + } + + if (++read_num >= send_max) { +@@ -418,26 +370,39 @@ void send_stack_list(struct protocol_stack *stack, uint32_t send_max) + } + } + ++static inline void free_recv_ring_readover(struct rte_ring *ring) ++{ ++ void *pbufs[SOCK_RECV_RING_SIZE]; ++ uint32_t num = gazelle_ring_sc_dequeue(ring, pbufs, RING_SIZE(SOCK_RECV_RING_SIZE)); ++ for (uint32_t i = 0; i < num; i++) { ++ pbuf_free(pbufs[i]); ++ } ++} ++ + ssize_t read_lwip_data(struct lwip_sock *sock, int32_t flags, u8_t apiflags) + { + if (sock->conn->recvmbox == NULL) { + return 0; + } + +- if (rte_ring_count(sock->recv_wait_free)) { +- free_ring_pbuf(sock->recv_wait_free); ++ if (gazelle_ring_readover_count(sock->recv_ring) >= SOCK_RECV_FREE_THRES) { ++ free_recv_ring_readover(sock->recv_ring); ++ } ++ ++ uint32_t free_count = gazelle_ring_free_count(sock->recv_ring); ++ if (free_count == 0) { ++ GAZELLE_RETURN(EAGAIN); + } + +- uint32_t free_count = rte_ring_free_count(sock->recv_ring); + uint32_t data_count = rte_ring_count(sock->conn->recvmbox->ring); +- uint32_t read_max = LWIP_MIN(free_count, data_count); +- struct pbuf *pbuf = NULL; ++ uint32_t read_num = LWIP_MIN(free_count, data_count); ++ read_num = LWIP_MIN(read_num, SOCK_RECV_RING_SIZE); ++ struct pbuf *pbufs[SOCK_RECV_RING_SIZE]; + uint32_t read_count = 0; + ssize_t recv_len = 0; +- int32_t ret; + +- for (uint32_t i = 0; i < read_max; i++) { +- err_t err = netconn_recv_tcp_pbuf_flags(sock->conn, &pbuf, apiflags); ++ for (uint32_t i = 0; i < read_num; i++) { ++ err_t err = netconn_recv_tcp_pbuf_flags(sock->conn, &pbufs[i], apiflags); + if (err != ERR_OK) { + if (recv_len > 0) { + /* already received data, return that (this trusts in getting the same error from +@@ -448,35 +413,28 @@ ssize_t read_lwip_data(struct lwip_sock *sock, int32_t flags, u8_t apiflags) + return (err == ERR_CLSD) ? 
0 : -1; + } + +- if (!(flags & MSG_PEEK)) { +- ret = rte_ring_sp_enqueue(sock->recv_ring, pbuf); +- if (ret != 0) { +- pbuf_free(pbuf); +- sock->stack->stats.read_lwip_drop++; +- break; +- } +- read_count++; +- } +- +- if (get_protocol_stack_group()->latency_start) { +- calculate_lstack_latency(&sock->stack->latency, pbuf, GAZELLE_LATENCY_LWIP); +- } +- +- recv_len += pbuf->len; ++ recv_len += pbufs[i]->tot_len; ++ read_count++; + + /* once we have some data to return, only add more if we don't need to wait */ + apiflags |= NETCONN_DONTBLOCK | NETCONN_NOFIN; + } + +- if (data_count > read_count) { +- add_recv_list(sock->conn->socket); ++ if (!(flags & MSG_PEEK)) { ++ uint32_t enqueue_num = gazelle_ring_sp_enqueue(sock->recv_ring, (void **)pbufs, read_count); ++ for (uint32_t i = enqueue_num; i < read_count; i++) { ++ /* update receive window */ ++ tcp_recved(sock->conn->pcb.tcp, pbufs[i]->tot_len); ++ pbuf_free(pbufs[i]); ++ sock->stack->stats.read_lwip_drop++; ++ } + } + +- if (recv_len > 0 && (flags & MSG_PEEK) == 0) { +- add_epoll_event(sock->conn, EPOLLIN); ++ for (uint32_t i = 0; get_protocol_stack_group()->latency_start && i < read_count; i++) { ++ calculate_lstack_latency(&sock->stack->latency, pbufs[i], GAZELLE_LATENCY_LWIP); + } +- sock->stack->stats.read_lwip_cnt += read_count; + ++ sock->stack->stats.read_lwip_cnt += read_count; + if (recv_len == 0) { + GAZELLE_RETURN(EAGAIN); + } +@@ -532,14 +490,12 @@ ssize_t gazelle_send(int32_t fd, const void *buf, size_t len, int32_t flags) + GAZELLE_RETURN(EINVAL); + } + +- sock->send_flags = flags; + ssize_t send = write_stack_data(sock, buf, len); + if (send < 0) { + GAZELLE_RETURN(EAGAIN); + } else if (send == 0) { + return 0; + } +- rte_smp_mb(); + + rpc_call_send(fd, NULL, send, flags); + return send; +@@ -574,22 +530,52 @@ ssize_t sendmsg_to_stack(int32_t s, const struct msghdr *message, int32_t flags) + return buflen; + } + ++static inline void del_data_in_event(struct lwip_sock *sock) ++{ ++ pthread_spin_lock(&sock->wakeup->event_list_lock); ++ ++ /* check again avoid cover event add in stack thread */ ++ if (!NETCONN_IS_DATAIN(sock)) { ++ sock->events &= ~EPOLLIN; ++ ++ if (sock->events == 0) { ++ list_del_node_null(&sock->event_list); ++ } ++ } ++ ++ pthread_spin_unlock(&sock->wakeup->event_list_lock); ++} ++ ++static struct pbuf *pbuf_free_partial(struct pbuf *pbuf, uint16_t free_len) ++{ ++ while (free_len && pbuf) { ++ if (free_len >= pbuf->len) { ++ struct pbuf *p = pbuf; ++ pbuf = pbuf->next; ++ free_len = free_len - p->len; ++ } else { ++ pbuf_remove_header(pbuf, free_len); ++ break; ++ } ++ } ++ ++ return pbuf; ++} ++ + ssize_t read_stack_data(int32_t fd, void *buf, size_t len, int32_t flags) + { + size_t recv_left = len; + struct pbuf *pbuf = NULL; + ssize_t recvd = 0; +- int32_t ret; +- u16_t copy_len; ++ uint16_t copy_len; + + struct lwip_sock *sock = get_socket(fd); + if (sock == NULL) { + LSTACK_LOG(ERR, LSTACK, "get_socket null fd %d.\n", fd); + GAZELLE_RETURN(EINVAL); + } +- sock->recv_flags = flags; + +- if ((sock->events & EPOLLERR) && !NETCONN_IS_DATAIN(sock)) { ++ if (sock->errevent > 0) { + return 0; + } + +@@ -598,35 +584,39 @@ ssize_t read_stack_data(int32_t fd, void *buf, size_t len, int32_t flags) + pbuf = sock->recv_lastdata; + sock->recv_lastdata = NULL; + } else { +- ret = rte_ring_sc_dequeue(sock->recv_ring, (void **)&pbuf); +- if (ret != 0) { ++ if (gazelle_ring_read(sock->recv_ring, (void **)&pbuf, 1) != 1) { + break; + } + } + +- copy_len = (recv_left > pbuf->tot_len) ? 
pbuf->tot_len : (u16_t)recv_left; ++ copy_len = (recv_left > pbuf->len) ? pbuf->len : (uint16_t)recv_left; + pbuf_copy_partial(pbuf, (char *)buf + recvd, copy_len, 0); + + recvd += copy_len; + recv_left -= copy_len; + +- if (pbuf->tot_len > copy_len) { +- sock->recv_lastdata = pbuf_free_header(pbuf, copy_len); ++ if (pbuf->len > copy_len || pbuf->next) { ++ sock->recv_lastdata = pbuf_free_partial(pbuf, copy_len); + } else { ++ if (sock->wakeup) { ++ sock->wakeup->stat.app_read_cnt += 1; ++ } + if (get_protocol_stack_group()->latency_start) { + calculate_lstack_latency(&sock->stack->latency, pbuf, GAZELLE_LATENCY_READ); + } +- ret = rte_ring_sp_enqueue(sock->recv_wait_free, pbuf); +- if (ret != 0) { +- pbuf_free(pbuf); +- } +- sock->recv_lastdata = NULL; +- __sync_fetch_and_add(&sock->stack->stats.app_read_cnt, 1); ++ gazelle_ring_read_over(sock->recv_ring); + } + } + ++ /* rte_ring_count reduce lock */ ++ if (sock->wakeup && sock->wakeup->type == WAKEUP_EPOLL && (sock->events & EPOLLIN)) { ++ del_data_in_event(sock); ++ } ++ + if (recvd == 0) { +- sock->stack->stats.read_null++; ++ if (sock->wakeup) { ++ sock->wakeup->stat.read_null++; ++ } + GAZELLE_RETURN(EAGAIN); + } + return recvd; +@@ -634,9 +624,9 @@ ssize_t read_stack_data(int32_t fd, void *buf, size_t len, int32_t flags) + + void add_recv_list(int32_t fd) + { +- struct lwip_sock *sock = get_socket(fd); ++ struct lwip_sock *sock = get_socket_by_fd(fd); + +- if (sock->stack && list_is_empty(&sock->recv_list)) { ++ if (sock && sock->stack && list_is_null(&sock->recv_list)) { + list_add_node(&sock->stack->recv_list, &sock->recv_list); + } + } +@@ -648,24 +638,26 @@ void read_recv_list(struct protocol_stack *stack, uint32_t max_num) + struct lwip_sock *sock; + uint32_t read_num = 0; + ++ struct list_node *last_node = list->prev; + list_for_each_safe(node, temp, list) { + sock = container_of(node, struct lwip_sock, recv_list); + +- if (sock->conn == NULL || sock->recv_ring == NULL || sock->send_ring == NULL || sock->conn->pcb.tcp == NULL) { +- list_del_node_init(&sock->recv_list); ++ if (sock->conn == NULL || sock->conn->recvmbox == NULL || rte_ring_count(sock->conn->recvmbox->ring) == 0) { ++ list_del_node_null(&sock->recv_list); + continue; + } + +- if (rte_ring_free_count(sock->recv_ring)) { +- list_del_node_init(&sock->recv_list); +- ssize_t len = lwip_recv(sock->conn->socket, NULL, 0, sock->recv_flags); +- if (len == 0) { +- /* FIXME: should use POLLRDHUP, when connection be closed. lwip event-callback no POLLRDHUP */ +- add_epoll_event(sock->conn, EPOLLERR); +- } ++ ssize_t len = lwip_recv(sock->conn->socket, NULL, 0, 0); ++ if (len == 0) { ++ /* FIXME: should use POLLRDHUP, when connection be closed. lwip event-callback no POLLRDHUP */ ++ sock->errevent = 1; ++ add_epoll_event(sock->conn, EPOLLERR); ++ } else if (len > 0) { ++ add_epoll_event(sock->conn, EPOLLIN); + } + +- if (++read_num >= max_num) { ++ /* last_node:recv only once per sock. 
max_num avoids spending too much time in this loop */
++ if (++read_num >= max_num || last_node == node) {
+ break;
+ }
+ }
+@@ -684,14 +676,14 @@ static void copy_pcb_to_conn(struct gazelle_stat_lstack_conn_info *conn, const s
+
+ if (netconn != NULL && netconn->recvmbox != NULL) {
+ conn->recv_cnt = rte_ring_count(netconn->recvmbox->ring);
++ conn->fd = netconn->socket;
+
+ struct lwip_sock *sock = get_socket(netconn->socket);
+ if (netconn->socket > 0 && sock != NULL && sock->recv_ring != NULL && sock->send_ring != NULL) {
+- conn->recv_ring_cnt = rte_ring_count(sock->recv_ring);
++ conn->recv_ring_cnt = gazelle_ring_readable_count(sock->recv_ring);
+ conn->recv_ring_cnt += (sock->recv_lastdata) ? 1 : 0;
+
+- conn->send_ring_cnt = rte_ring_count(sock->send_ring);
+- conn->send_ring_cnt += (sock->send_lastdata) ? 1 : 0;
++ conn->send_ring_cnt = gazelle_ring_readover_count(sock->send_ring);
+
+ if (sock->wakeup) {
+ sem_getvalue(&sock->wakeup->event_sem, &conn->sem_cnt);
+@@ -756,9 +748,11 @@ void create_shadow_fd(struct rpc_msg *msg)
+ }
+
+ clone_lwip_socket_opt(clone_sock, sock);
+- clone_sock->shadowed_sock = sock;
+
+- listen_list_add_node(fd, clone_fd);
++ while (sock->listen_next) {
++ sock = sock->listen_next;
++ }
++ sock->listen_next = clone_sock;
+
+ int32_t ret = lwip_bind(clone_fd, addr, addr_len);
+ if (ret < 0) {
+@@ -843,11 +837,6 @@ static uint32_t get_list_count(struct list_node *list)
+ return count;
+ }
+
+-void stack_eventlist_count(struct rpc_msg *msg)
+-{
+- msg->result = get_list_count(&get_protocol_stack()->event_list);
+-}
+-
+ void stack_sendlist_count(struct rpc_msg *msg)
+ {
+ msg->result = get_list_count(&get_protocol_stack()->send_list);
+diff --git a/src/lstack/core/lstack_protocol_stack.c b/src/lstack/core/lstack_protocol_stack.c
+index 88513ba..a1f3790 100644
+--- a/src/lstack/core/lstack_protocol_stack.c
++++ b/src/lstack/core/lstack_protocol_stack.c
+@@ -11,6 +11,7 @@
+ */
+ #define _GNU_SOURCE
+ #include
++#include
+
+ #include
+ #include
+@@ -30,6 +31,7 @@
+ #include "lstack_protocol_stack.h"
+ #include "lstack_cfg.h"
+ #include "lstack_control_plane.h"
++#include "posix/lstack_epoll.h"
+ #include "lstack_stack_stat.h"
+
+ #define READ_LIST_MAX 32
+@@ -39,7 +41,6 @@
+
+ static PER_THREAD uint16_t g_stack_idx = PROTOCOL_STACK_MAX;
+ static struct protocol_stack_group g_stack_group = {0};
+-static PER_THREAD long g_stack_tid = 0;
+
+ void set_init_fail(void);
+ typedef void *(*stack_thread_func)(void *arg);
+@@ -66,6 +67,8 @@ static inline void set_stack_idx(uint16_t idx)
+
+ long get_stack_tid(void)
+ {
++ static PER_THREAD long g_stack_tid = 0;
++
+ if (g_stack_tid == 0) {
+ g_stack_tid = syscall(__NR_gettid);
+ }
+@@ -96,17 +99,37 @@ struct protocol_stack *get_protocol_stack_by_fd(int32_t fd)
+ return sock->stack;
+ }
+
+-struct protocol_stack *get_minconn_protocol_stack(void)
++struct protocol_stack *get_bind_protocol_stack(void)
+ {
+- int32_t min_index = 0;
++ static PER_THREAD struct protocol_stack *bind_stack = NULL;
++
++ /* same app communication thread bind same stack */
++ if (bind_stack) {
++ return bind_stack;
++ }
+
+- for (int32_t i = 1; i < g_stack_group.stack_num; i++) {
+- if (g_stack_group.stacks[i]->conn_num < g_stack_group.stacks[min_index]->conn_num) {
+- min_index = i;
++ struct protocol_stack_group *stack_group = get_protocol_stack_group();
++ uint16_t index = 0;
++
++ /* close listen shadow, per app communication thread select only one stack */
++ if (get_global_cfg_params()->listen_shadow == false) {
++ static uint16_t stack_index = 
0; ++ index = atomic_fetch_add(&stack_index, 1); ++ if (index >= stack_group->stack_num) { ++ LSTACK_LOG(ERR, LSTACK, "thread =%hu larger than stack num = %hu\n", index, stack_group->stack_num); ++ return NULL; ++ } ++ /* use listen shadow, app communication thread maybe more than stack num, select the least load stack */ ++ } else { ++ for (uint16_t i = 1; i < stack_group->stack_num; i++) { ++ if (stack_group->stacks[i]->conn_num < stack_group->stacks[index]->conn_num) { ++ index = i; ++ } + } + } + +- return g_stack_group.stacks[min_index]; ++ bind_stack = stack_group->stacks[index]; ++ return stack_group->stacks[index]; + } + + void lstack_low_power_idling(void) +@@ -193,7 +216,7 @@ static int32_t create_thread(uint16_t queue_id, char *thread_name, stack_thread_ + return 0; + } + +-static void* gazelle_weakup_thread(void *arg) ++static void* gazelle_wakeup_thread(void *arg) + { + uint16_t queue_id = *(uint16_t *)arg; + struct protocol_stack *stack = get_protocol_stack_group()->stacks[queue_id]; +@@ -203,17 +226,13 @@ static void* gazelle_weakup_thread(void *arg) + + LSTACK_LOG(INFO, LSTACK, "weakup_%02d start\n", stack->queue_id); + ++ sem_t *event_sem[WAKEUP_MAX_NUM]; ++ int num; + for (;;) { +- if (rte_ring_count(stack->wakeup_ring) == 0) { +- continue; +- } +- +- sem_t *event_sem; +- if (rte_ring_sc_dequeue(stack->wakeup_ring, (void **)&event_sem)) { +- continue; ++ num = gazelle_light_ring_dequeue_burst(stack->wakeup_ring, (void **)event_sem, WAKEUP_MAX_NUM); ++ for (int i = 0; i < num; i++) { ++ sem_post(event_sem[i]); + } +- +- sem_post(event_sem); + } + + return NULL; +@@ -233,12 +252,8 @@ static int32_t init_stack_value(struct protocol_stack *stack, uint16_t queue_id) + stack->lwip_stats = &lwip_stats; + + init_list_node(&stack->recv_list); +- init_list_node(&stack->listen_list); +- init_list_node(&stack->event_list); + init_list_node(&stack->send_list); + +- pthread_spin_init(&stack->event_lock, PTHREAD_PROCESS_SHARED); +- + sys_calibrate_tsc(); + stack_stat_init(); + +@@ -297,8 +312,10 @@ static void* gazelle_kernel_event(void *arg) + } + + for (int32_t i = 0; i < event_num; i++) { +- if (events[i].data.ptr) { +- sem_post((sem_t *)events[i].data.ptr); ++ struct wakeup_poll *wakeup = events[i].data.ptr; ++ if (wakeup) { ++ __atomic_store_n(&wakeup->have_kernel_event, true, __ATOMIC_RELEASE); ++ sem_post(&wakeup->event_sem); + } + } + } +@@ -311,7 +328,7 @@ static int32_t create_companion_thread(struct protocol_stack_group *stack_group, + int32_t ret; + + if (stack_group->wakeup_enable) { +- ret = create_thread(stack->queue_id, "gazelleweakup", gazelle_weakup_thread); ++ ret = create_thread(stack->queue_id, "gazelleweakup", gazelle_wakeup_thread); + if (ret != 0) { + LSTACK_LOG(ERR, LSTACK, "gazelleweakup ret=%d errno=%d\n", ret, errno); + return ret; +@@ -339,13 +356,11 @@ static struct protocol_stack * stack_thread_init(uint16_t queue_id) + + struct protocol_stack *stack = malloc(sizeof(*stack)); + if (stack == NULL) { +- sem_post(&stack_group->thread_phase1); + LSTACK_LOG(ERR, LSTACK, "malloc stack failed\n"); + return NULL; + } + + if (init_stack_value(stack, queue_id) != 0) { +- sem_post(&stack_group->thread_phase1); + free(stack); + return NULL; + } +@@ -358,7 +373,6 @@ static struct protocol_stack * stack_thread_init(uint16_t queue_id) + + if (use_ltran()) { + if (client_reg_thrd_ring() != 0) { +- sem_post(&stack_group->thread_phase1); + free(stack); + return NULL; + } +@@ -419,6 +433,8 @@ static int32_t init_protocol_sem(void) + int32_t ret; + struct protocol_stack_group 
*stack_group = get_protocol_stack_group(); + ++ pthread_spin_init(&stack_group->wakeup_list_lock, PTHREAD_PROCESS_PRIVATE); ++ + if (!use_ltran()) { + ret = sem_init(&stack_group->ethdev_init, 0, 0); + if (ret < 0) { +@@ -449,6 +465,7 @@ int32_t init_protocol_stack(void) + + stack_group->stack_num = get_global_cfg_params()->num_cpu; + stack_group->wakeup_enable = (get_global_cfg_params()->num_wakeup > 0) ? true : false; ++ stack_group->wakeup_list = NULL; + + if (init_protocol_sem() != 0) { + return -1; +@@ -486,58 +503,10 @@ void stack_socket(struct rpc_msg *msg) + } + } + +-static inline bool is_real_close(int32_t fd) +-{ +- struct lwip_sock *sock = get_socket_by_fd(fd); +- +- /* last sock */ +- if (list_is_empty(&sock->attach_list)) { +- return true; +- } +- +- /* listen sock, but have attach sock */ +- if (sock->attach_fd == fd) { +- sock->wait_close = true; +- return false; +- } else { /* attach sock */ +- /* listen sock is normal */ +- struct lwip_sock *listen_sock = get_socket_by_fd(sock->attach_fd); +- if (listen_sock == NULL || !listen_sock->wait_close) { +- list_del_node_init(&sock->attach_list); +- return true; +- } +- +- /* listen sock is wait clsoe. check this is last attach sock */ +- struct list_node *list = &(sock->attach_list); +- struct list_node *node, *temp; +- uint32_t list_count = 0; +- list_for_each_safe(node, temp, list) { +- list_count++; +- } +- /* 2:listen sock is wait close and closing attach sock. close listen sock here */ +- if (list_count == 2) { +- lwip_close(listen_sock->attach_fd); +- gazelle_clean_sock(listen_sock->attach_fd); +- posix_api->close_fn(listen_sock->attach_fd); +- list_del_node_init(&listen_sock->attach_list); +- } +- list_del_node_init(&sock->attach_list); +- return true; +- } +- +- list_del_node_init(&sock->attach_list); +- return true; +-} +- + void stack_close(struct rpc_msg *msg) + { + int32_t fd = msg->args[MSG_ARG_0].i; + +- if (!is_real_close(fd)) { +- msg->result = 0; +- return; +- } +- + msg->result = lwip_close(fd); + if (msg->result != 0) { + LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d failed %ld\n", get_stack_tid(), msg->args[MSG_ARG_0].i, msg->result); +@@ -556,26 +525,8 @@ void stack_bind(struct rpc_msg *msg) + } + } + +-static inline struct lwip_sock *reuse_listen(struct protocol_stack *stack, struct lwip_sock *listen_sock) +-{ +- struct list_node *node, *temp; +- struct list_node *list = &(stack->listen_list); +- struct lwip_sock *sock; +- +- list_for_each_safe(node, temp, list) { +- sock = container_of(node, struct lwip_sock, listen_list); +- if (sock->conn->pcb.tcp->local_port == listen_sock->conn->pcb.tcp->local_port && +- sock->conn->pcb.tcp->local_ip.addr == listen_sock->conn->pcb.tcp->local_ip.addr) { +- return sock; +- } +- } +- +- return NULL; +-} +- + void stack_listen(struct rpc_msg *msg) + { +- struct protocol_stack *stack = get_protocol_stack(); + int32_t fd = msg->args[MSG_ARG_0].i; + int32_t backlog = msg->args[MSG_ARG_1].i; + +@@ -585,25 +536,9 @@ void stack_listen(struct rpc_msg *msg) + return; + } + +- /* stack have listen same ip+port, then attach to old listen */ +- struct lwip_sock *listen_sock = reuse_listen(stack, sock); +- if (listen_sock) { +- if (list_is_empty(&sock->attach_list)) { +- list_add_node(&listen_sock->attach_list, &sock->attach_list); +- } +- sock->attach_fd = listen_sock->conn->socket; +- msg->result = 0; +- return; +- } +- + /* new listen add to stack listen list */ + msg->result = lwip_listen(fd, backlog); +- if (msg->result == 0) { +- if (list_is_empty(&sock->listen_list)) { +- 
list_add_node(&stack->listen_list, &sock->listen_list);
+- }
+- sock->attach_fd = fd;
+- } else {
++ if (msg->result != 0) {
+ LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d failed %ld\n", get_stack_tid(), msg->args[MSG_ARG_0].i, msg->result);
+ }
+ }
+@@ -611,28 +546,35 @@ void stack_listen(struct rpc_msg *msg)
+ void stack_accept(struct rpc_msg *msg)
+ {
+ int32_t fd = msg->args[MSG_ARG_0].i;
++ msg->result = -1;
+
+ int32_t accept_fd = lwip_accept(fd, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].p);
+- if (accept_fd > 0) {
+- struct lwip_sock *sock = get_socket(accept_fd);
+- if (sock && sock->stack) {
+- msg->result = accept_fd;
+- return;
+- }
++ if (accept_fd < 0) {
++ LSTACK_LOG(ERR, LSTACK, "fd %d ret %d\n", fd, accept_fd);
++ return;
++ }
+
++ struct lwip_sock *sock = get_socket(accept_fd);
++ if (sock == NULL || sock->stack == NULL) {
+ lwip_close(accept_fd);
+ gazelle_clean_sock(accept_fd);
+ posix_api->close_fn(accept_fd);
++ LSTACK_LOG(ERR, LSTACK, "fd %d ret %d\n", fd, accept_fd);
++ return;
+ }
+
+- LSTACK_LOG(ERR, LSTACK, "tid %ld, fd %d attach_fd %d failed %d\n", get_stack_tid(), msg->args[MSG_ARG_0].i,
+- fd, accept_fd);
+- msg->result = -1;
++ msg->result = accept_fd;
++ if (rte_ring_count(sock->conn->recvmbox->ring)) {
++ add_recv_list(accept_fd);
++ }
+ }
+
+ void stack_connect(struct rpc_msg *msg)
+ {
+ msg->result = lwip_connect(msg->args[MSG_ARG_0].i, msg->args[MSG_ARG_1].p, msg->args[MSG_ARG_2].socklen);
++ if (msg->result < 0) {
++ msg->result = -errno;
++ }
+ }
+
+ void stack_getpeername(struct rpc_msg *msg)
+ {
+@@ -723,6 +665,7 @@ void stack_broadcast_arp(struct rte_mbuf *mbuf, struct protocol_stack *cur_stack
+
+ ret = gazelle_alloc_pktmbuf(stack->rx_pktmbuf_pool, &mbuf_copy, 1);
+ if (ret != 0) {
++ stack->stats.rx_allocmbuf_fail++;
+ return;
+ }
+ copy_mbuf(mbuf_copy, mbuf);
+@@ -737,22 +680,28 @@ void stack_broadcast_arp(struct rte_mbuf *mbuf, struct protocol_stack *cur_stack
+ /* when fd is listenfd, listenfd of all protocol stack thread will be closed */
+ int32_t stack_broadcast_close(int32_t fd)
+ {
+- struct lwip_sock *sock = NULL;
+- int32_t next_fd;
++ struct lwip_sock *sock = get_socket(fd);
++ int32_t ret = 0;
++
++ do {
++ sock = sock->listen_next;
++ if (rpc_call_close(fd)) {
++ ret = -1;
++ }
+
+- while (fd > 0) {
+- sock = get_socket(fd);
+ if (sock == NULL) {
+- LSTACK_LOG(ERR, LSTACK, "tid %ld, %d get sock null\n", get_stack_tid(), fd);
+- GAZELLE_RETURN(EINVAL);
++ break;
+ }
+- next_fd = sock->nextfd;
++ fd = sock->conn->socket;
++ } while (sock);
+
+- rpc_call_close(fd);
+- fd = next_fd;
+- }
++ return ret;
++}
+
+- return 0;
++/* choose one stack to listen */
++int32_t stack_single_listen(int32_t fd, int32_t backlog)
++{
++ return rpc_call_listen(fd, backlog);
+ }
+
+ /* listen sync to all protocol stack thread, so that any protocol stack thread can build connect */
+@@ -797,44 +746,59 @@ int32_t stack_broadcast_listen(int32_t fd, int32_t backlog)
+ return 0;
+ }
+
+-/* ergodic the protocol stack thread to find the connection, because all threads are listening */
+-int32_t stack_broadcast_accept(int32_t fd, struct sockaddr *addr, socklen_t *addrlen)
++static struct lwip_sock *get_min_accept_sock(int32_t fd)
+ {
+ struct lwip_sock *sock = get_socket(fd);
+- if (sock == NULL || sock->attach_fd < 0) {
+- errno = EINVAL;
+- return -1;
+- }
+- fd = sock->attach_fd;
+-
+ struct lwip_sock *min_sock = NULL;
+- int32_t min_fd = fd;
+- while (fd > 0) {
+- sock = get_socket(fd);
+- if (sock == NULL) {
+- GAZELLE_RETURN(EINVAL);
+- }
+- struct lwip_sock 
*attach_sock = get_socket(sock->attach_fd);
+- if (attach_sock == NULL) {
+- GAZELLE_RETURN(EINVAL);
+- }
+
+- if (!NETCONN_IS_ACCEPTIN(attach_sock)) {
+- fd = sock->nextfd;
++ while (sock) {
++ if (!NETCONN_IS_ACCEPTIN(sock)) {
++ sock = sock->listen_next;
+ continue;
+ }
+
+- if (min_sock == NULL || min_sock->stack->conn_num > attach_sock->stack->conn_num) {
+- min_sock = attach_sock;
+- min_fd = sock->attach_fd;
++ if (min_sock == NULL || min_sock->stack->conn_num > sock->stack->conn_num) {
++ min_sock = sock;
+ }
+
+- fd = sock->nextfd;
++ sock = sock->listen_next;
++ }
++
++ return min_sock;
++}
++
++static inline void del_accept_in_event(struct lwip_sock *sock)
++{
++ pthread_spin_lock(&sock->wakeup->event_list_lock);
++
++ if (!NETCONN_IS_ACCEPTIN(sock)) {
++ sock->events &= ~EPOLLIN;
++ if (sock->events == 0) {
++ list_del_node_null(&sock->event_list);
++ }
+ }
+
++ pthread_spin_unlock(&sock->wakeup->event_list_lock);
++}
++
++/* ergodic the protocol stack thread to find the connection, because all threads are listening */
++int32_t stack_broadcast_accept(int32_t fd, struct sockaddr *addr, socklen_t *addrlen)
++{
+ int32_t ret = -1;
+- if (min_sock) {
+- ret = rpc_call_accept(min_fd, addr, addrlen);
++
++ struct lwip_sock *sock = get_socket(fd);
++ if (sock == NULL) {
++ errno = EINVAL;
++ return -1;
++ }
++
++ struct lwip_sock *min_sock = get_min_accept_sock(fd);
++ if (min_sock && min_sock->conn) {
++ ret = rpc_call_accept(min_sock->conn->socket, addr, addrlen);
++ }
++
++ if (min_sock && min_sock->wakeup && min_sock->wakeup->type == WAKEUP_EPOLL) {
++ del_accept_in_event(min_sock);
+ }
+
+ if (ret < 0) {
+diff --git a/src/lstack/core/lstack_stack_stat.c b/src/lstack/core/lstack_stack_stat.c
+index 743857f..06fac5c 100644
+--- a/src/lstack/core/lstack_stack_stat.c
++++ b/src/lstack/core/lstack_stack_stat.c
+@@ -25,6 +25,7 @@
+ #include "gazelle_dfx_msg.h"
+ #include "lstack_thread_rpc.h"
+ #include "lstack_stack_stat.h"
++#include "posix/lstack_epoll.h"
+
+ #define US_PER_SEC 1000000
+
+@@ -87,6 +88,68 @@ static void set_latency_start_flag(bool start)
+ }
+ }
+
++void register_wakeup(struct wakeup_poll *wakeup)
++{
++ struct protocol_stack_group *stack_group = get_protocol_stack_group();
++
++ pthread_spin_lock(&stack_group->wakeup_list_lock);
++
++ wakeup->next = stack_group->wakeup_list;
++ stack_group->wakeup_list = wakeup;
++
++ pthread_spin_unlock(&stack_group->wakeup_list_lock);
++}
++
++void unregister_wakeup(struct wakeup_poll *wakeup)
++{
++ struct protocol_stack_group *stack_group = get_protocol_stack_group();
++
++ pthread_spin_lock(&stack_group->wakeup_list_lock);
++
++ struct wakeup_poll *node = stack_group->wakeup_list;
++ struct wakeup_poll *pre = NULL;
++
++ while (node && node != wakeup) {
++ pre = node;
++ node = node->next;
++ }
++
++ if (node == NULL) {
++ pthread_spin_unlock(&stack_group->wakeup_list_lock);
++ return;
++ }
++
++ if (pre) {
++ pre->next = node->next;
++ } else {
++ stack_group->wakeup_list = node->next;
++ }
++
++ pthread_spin_unlock(&stack_group->wakeup_list_lock);
++}
++
++static void get_wakeup_stat(struct protocol_stack *stack, struct gazelle_wakeup_stat *stat)
++{
++ struct protocol_stack_group *stack_group = get_protocol_stack_group();
++
++ pthread_spin_lock(&stack_group->wakeup_list_lock);
++
++ struct wakeup_poll *node = stack_group->wakeup_list;
++ while (node) {
++ if (node->bind_stack == stack) {
++ stat->app_events += node->stat.app_events;
++ stat->read_null += node->stat.read_null;
++ stat->app_write_cnt += 
node->stat.app_write_cnt; ++ stat->app_write_idlefail += node->stat.app_write_idlefail; ++ stat->app_read_cnt += node->stat.app_read_cnt; ++ } ++ ++ node = node->next; ++ } ++ ++ pthread_spin_unlock(&stack_group->wakeup_list_lock); ++} ++ + void lstack_get_low_power_info(struct gazelle_stat_low_power_info *low_power_info) + { + struct cfg_params *cfg = get_global_cfg_params(); +@@ -102,21 +165,24 @@ static void get_stack_stats(struct gazelle_stack_dfx_data *dfx, struct protocol_ + struct protocol_stack_group *stack_group = get_protocol_stack_group(); + + dfx->loglevel = rte_log_get_level(RTE_LOGTYPE_LSTACK); ++ + lstack_get_low_power_info(&dfx->low_power_info); +- memcpy_s(&dfx->data.pkts, sizeof(dfx->data.pkts), &stack->stats, sizeof(dfx->data.pkts)); ++ ++ memcpy_s(&dfx->data.pkts.stack_stat, sizeof(struct gazelle_stack_stat), &stack->stats, ++ sizeof(struct gazelle_stack_stat)); ++ ++ get_wakeup_stat(stack, &dfx->data.pkts.wakeup_stat); ++ + dfx->data.pkts.call_alloc_fail = stack_group->call_alloc_fail; + + int32_t rpc_call_result = rpc_call_msgcnt(stack); + dfx->data.pkts.call_msg_cnt = (rpc_call_result < 0) ? 0 : rpc_call_result; + + rpc_call_result = rpc_call_recvlistcnt(stack); +- dfx->data.pkts.recv_list = (rpc_call_result < 0) ? 0 : rpc_call_result; +- +- rpc_call_result = rpc_call_eventlistcnt(stack); +- dfx->data.pkts.event_list = (rpc_call_result < 0) ? 0 : rpc_call_result; ++ dfx->data.pkts.recv_list_cnt = (rpc_call_result < 0) ? 0 : rpc_call_result; + + rpc_call_result = rpc_call_sendlistcnt(stack); +- dfx->data.pkts.send_list = (rpc_call_result < 0) ? 0 : rpc_call_result; ++ dfx->data.pkts.send_list_cnt = (rpc_call_result < 0) ? 0 : rpc_call_result; + + dfx->data.pkts.conn_num = stack->conn_num; + } +@@ -182,6 +248,8 @@ int32_t handle_stack_cmd(int32_t fd, enum GAZELLE_STAT_MODE stat_mode) + + for (uint32_t i = 0; i < stack_group->stack_num; i++) { + struct protocol_stack *stack = stack_group->stacks[i]; ++ ++ memset_s(&dfx, sizeof(dfx), 0, sizeof(dfx)); + get_stack_dfx_data(&dfx, stack, stat_mode); + + if (!use_ltran() && +diff --git a/src/lstack/core/lstack_thread_rpc.c b/src/lstack/core/lstack_thread_rpc.c +index 312e192..8937920 100644 +--- a/src/lstack/core/lstack_thread_rpc.c ++++ b/src/lstack/core/lstack_thread_rpc.c +@@ -23,34 +23,53 @@ + #include "lstack_dpdk.h" + #include "lstack_thread_rpc.h" + +-static PER_THREAD struct rte_mempool *rpc_pool = NULL; ++#define RPC_MSG_MAX 32 ++struct rpc_msg_pool { ++ struct rpc_msg msgs[RPC_MSG_MAX]; ++ uint32_t prod __rte_cache_aligned; ++ uint32_t cons __rte_cache_aligned; ++}; ++ ++static PER_THREAD struct rpc_msg_pool *g_rpc_pool = NULL; ++ ++static inline __attribute__((always_inline)) struct rpc_msg *get_rpc_msg(struct rpc_msg_pool *rpc_pool) ++{ ++ uint32_t cons = __atomic_load_n(&rpc_pool->cons, __ATOMIC_ACQUIRE); ++ uint32_t prod = rpc_pool->prod + 1; ++ ++ if (prod == cons) { ++ return NULL; ++ } ++ ++ rpc_pool->prod = prod; ++ return &rpc_pool->msgs[prod]; ++} + + static inline __attribute__((always_inline)) + struct rpc_msg *rpc_msg_alloc(struct protocol_stack *stack, rpc_msg_func func) + { +- int32_t ret; + struct rpc_msg *msg = NULL; + + if (stack == NULL) { + return NULL; + } + +- static uint16_t pool_index = 0; +- if (rpc_pool == NULL) { +- rpc_pool = create_rpc_mempool("rpc_msg", atomic_fetch_add(&pool_index, 1)); +- if (rpc_pool == NULL) { ++ if (g_rpc_pool == NULL) { ++ g_rpc_pool = calloc(1, sizeof(struct rpc_msg_pool)); ++ if (g_rpc_pool == NULL) { ++ get_protocol_stack_group()->call_alloc_fail++; + return 
NULL;
+ }
+ }
+
+- ret = rte_mempool_get(rpc_pool, (void **)&msg);
+- if (ret < 0) {
++ msg = get_rpc_msg(g_rpc_pool);
++ if (msg == NULL) {
+ get_protocol_stack_group()->call_alloc_fail++;
+ return NULL;
+ }
+- msg->pool = rpc_pool;
++ msg->pool = g_rpc_pool;
+
+- pthread_spin_init(&msg->lock, PTHREAD_PROCESS_SHARED);
++ pthread_spin_init(&msg->lock, PTHREAD_PROCESS_PRIVATE);
+ msg->func = func;
+ msg->self_release = 1;
+
+@@ -64,7 +83,8 @@ void rpc_msg_free(struct rpc_msg *msg)
+
+ msg->self_release = 0;
+ msg->func = NULL;
+- rte_mempool_put(msg->pool, (void *)msg);
++
++ atomic_fetch_add(&msg->pool->cons, 1);
+ }
+
+ static inline __attribute__((always_inline))
+@@ -109,8 +129,6 @@ void poll_rpc_msg(struct protocol_stack *stack, uint32_t max_num)
+ stack->stats.call_null++;
+ }
+
+- rte_mb();
+-
+ if (msg->self_release) {
+ pthread_spin_unlock(&msg->lock);
+ } else {
+@@ -192,16 +210,6 @@ int32_t rpc_call_thread_regphase2(struct protocol_stack *stack, void *conn)
+ return rpc_sync_call(&stack->rpc_queue, msg);
+ }
+
+-int32_t rpc_call_eventlistcnt(struct protocol_stack *stack)
+-{
+- struct rpc_msg *msg = rpc_msg_alloc(stack, stack_eventlist_count);
+- if (msg == NULL) {
+- return -1;
+- }
+-
+- return rpc_sync_call(&stack->rpc_queue, msg);
+-}
+-
+ int32_t rpc_call_sendlistcnt(struct protocol_stack *stack)
+ {
+ struct rpc_msg *msg = rpc_msg_alloc(stack, stack_sendlist_count);
+@@ -222,28 +230,6 @@ int32_t rpc_call_recvlistcnt(struct protocol_stack *stack)
+ return rpc_sync_call(&stack->rpc_queue, msg);
+ }
+
+-void add_epoll_event(struct netconn *conn, uint32_t event);
+-static void rpc_add_event(struct rpc_msg *msg)
+-{
+- struct lwip_sock *sock = (struct lwip_sock *)msg->args[MSG_ARG_0].p;
+- if (sock->conn) {
+- add_epoll_event(sock->conn, sock->events);
+- }
+-}
+-
+-void rpc_call_addevent(struct protocol_stack *stack, void *sock)
+-{
+- struct rpc_msg *msg = rpc_msg_alloc(stack, rpc_add_event);
+- if (msg == NULL) {
+- return;
+- }
+-
+- msg->args[MSG_ARG_0].p = sock;
+-
+- msg->self_release = 0;
+- rpc_call(&stack->rpc_queue, msg);
+-}
+-
+ int32_t rpc_call_arp(struct protocol_stack *stack, struct rte_mbuf *mbuf)
+ {
+ struct rpc_msg *msg = rpc_msg_alloc(stack, stack_arp);
+@@ -260,7 +246,7 @@ int32_t rpc_call_arp(struct protocol_stack *stack, struct rte_mbuf *mbuf)
+
+ int32_t rpc_call_socket(int32_t domain, int32_t type, int32_t protocol)
+ {
+- struct protocol_stack *stack = get_minconn_protocol_stack();
++ struct protocol_stack *stack = get_bind_protocol_stack();
+ struct rpc_msg *msg = rpc_msg_alloc(stack, stack_socket);
+ if (msg == NULL) {
+ return -1;
+@@ -342,7 +328,12 @@ int32_t rpc_call_connect(int fd, const struct sockaddr *addr, socklen_t addrlen)
+ msg->args[MSG_ARG_1].cp = addr;
+ msg->args[MSG_ARG_2].socklen = addrlen;
+
+- return rpc_sync_call(&stack->rpc_queue, msg);
++ int32_t ret = rpc_sync_call(&stack->rpc_queue, msg);
++ if (ret < 0) {
++ errno = -ret;
++ return -1;
++ }
++ return ret;
+ }
+
+ int32_t rpc_call_getpeername(int fd, struct sockaddr *addr, socklen_t *addrlen)
+diff --git a/src/lstack/include/lstack_cfg.h b/src/lstack/include/lstack_cfg.h
+index 987828d..aeffbb3 100644
+--- a/src/lstack/include/lstack_cfg.h
++++ b/src/lstack/include/lstack_cfg.h
+@@ -75,6 +75,7 @@ struct cfg_params {
+ uint32_t lpm_pkts_in_detect;
+ bool use_ltran; // ture:lstack read from nic false:read form ltran
+ bool kni_switch;
++ bool listen_shadow; // true: listen in all stack threads. false: listen in one stack thread.
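The per-thread rpc_msg_pool above swaps the shared DPDK rte_mempool for a fixed slot array driven by free-running producer/consumer counters: only the owning app thread advances prod when it allocates a message, and whichever thread finishes with the message bumps cons. A minimal self-contained model of that counter-based slot pool (slot_pool, slot_get and slot_put are hypothetical names; index masking is made explicit here, and slots are assumed to be released in allocation order):

#include <stdatomic.h>
#include <stdint.h>

#define SLOT_MAX 32u                      /* power of two, mirrors RPC_MSG_MAX */

struct slot_pool {
    int slots[SLOT_MAX];                  /* payload stands in for struct rpc_msg */
    uint32_t prod;                        /* advanced only by the owning thread */
    _Atomic uint32_t cons;                /* bumped by whichever thread frees */
};

/* Allocate: single producer, so prod needs no atomic read-modify-write. */
static int *slot_get(struct slot_pool *p)
{
    uint32_t cons = atomic_load_explicit(&p->cons, memory_order_acquire);
    if (p->prod - cons >= SLOT_MAX) {
        return NULL;                      /* every slot still outstanding */
    }
    return &p->slots[p->prod++ & (SLOT_MAX - 1u)];
}

/* Free: releases arrive in allocation order, so a counter bump suffices. */
static void slot_put(struct slot_pool *p)
{
    atomic_fetch_add_explicit(&p->cons, 1u, memory_order_release);
}

Keeping the pool per thread is what removes the shared-mempool contention; the acquire/release pair on cons is the only cross-thread synchronization left on the allocation path.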
+ int dpdk_argc; + char **dpdk_argv; + struct secondary_attach_arg sec_attach_arg; +diff --git a/src/lstack/include/lstack_dpdk.h b/src/lstack/include/lstack_dpdk.h +index bb9be21..6ffcc41 100644 +--- a/src/lstack/include/lstack_dpdk.h ++++ b/src/lstack/include/lstack_dpdk.h +@@ -23,7 +23,7 @@ + #include "dpdk_common.h" + struct protocol_stack; + +-#define RX_NB_MBUF ((5 * (MAX_CLIENTS / 4)) + (VDEV_RX_QUEUE_SZ * DEFAULT_BACKUP_RING_SIZE_FACTOR)) ++#define RX_NB_MBUF ((5 * MAX_CLIENTS) + (VDEV_RX_QUEUE_SZ * DEFAULT_BACKUP_RING_SIZE_FACTOR)) + #define RX_MBUF_CACHE_SZ (VDEV_RX_QUEUE_SZ) + #define TX_NB_MBUF (128 * DEFAULT_RING_SIZE) + #define TX_MBUF_CACHE_SZ (DEFAULT_RING_SIZE) +@@ -34,13 +34,13 @@ struct protocol_stack; + + #define MAX_PACKET_SZ 2048 + ++#define RING_SIZE(x) ((x) - 1) + + #define MBUF_SZ (MAX_PACKET_SZ + RTE_PKTMBUF_HEADROOM) + + #define MAX_CORE_NUM 256 + #define CALL_MSG_RING_SIZE (unsigned long long)32 + #define CALL_CACHE_SZ 0 +-#define CALL_POOL_SZ 128 + + /* Layout: + * | rte_mbuf | pbuf | custom_free_function | payload | +@@ -62,7 +62,6 @@ int32_t dpdk_eal_init(void); + int32_t pktmbuf_pool_init(struct protocol_stack *stack, uint16_t stack_num); + struct rte_ring *create_ring(const char *name, uint32_t count, uint32_t flags, int32_t queue_id); + int32_t create_shared_ring(struct protocol_stack *stack); +-struct rte_mempool *create_rpc_mempool(const char *name, uint16_t queue_id); + void lstack_log_level_init(void); + int dpdk_ethdev_init(void); + int dpdk_ethdev_start(void); +diff --git a/src/lstack/include/lstack_lockless_queue.h b/src/lstack/include/lstack_lockless_queue.h +index c00d3a2..c70b56a 100644 +--- a/src/lstack/include/lstack_lockless_queue.h ++++ b/src/lstack/include/lstack_lockless_queue.h +@@ -1,5 +1,13 @@ + /* +-* Copyright (c) 2010-2011 Dmitry Vyukov. All rights reserved. ++* Copyright (c) Huawei Technologies Co., Ltd. 2020-2021. All rights reserved. ++* gazelle is licensed under the Mulan PSL v2. ++* You can use this software according to the terms and conditions of the Mulan PSL v2. ++* You may obtain a copy of Mulan PSL v2 at: ++* http://license.coscl.org.cn/MulanPSL2 ++* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR ++* IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR ++* PURPOSE. ++* See the Mulan PSL v2 for more details. 
+ */ + + #ifndef __GAZELLE_LOCKLESS_QUEUE_H__ +diff --git a/src/lstack/include/lstack_lwip.h b/src/lstack/include/lstack_lwip.h +index c73e3a7..ba57541 100644 +--- a/src/lstack/include/lstack_lwip.h ++++ b/src/lstack/include/lstack_lwip.h +@@ -14,18 +14,16 @@ + #define __GAZELLE_LWIP_H__ + + #include "lstack_thread_rpc.h" ++#include "dpdk_common.h" + #include "lwipsock.h" + +-#define SOCK_RECV_RING_SIZE (128) +-#define SOCK_SEND_RING_SIZE (32) + + #define NETCONN_IS_ACCEPTIN(sock) (((sock)->conn->acceptmbox != NULL) && !sys_mbox_empty((sock)->conn->acceptmbox)) +-#define NETCONN_IS_DATAIN(sock) ((rte_ring_count((sock)->recv_ring) || (sock)->recv_lastdata)) +-#define NETCONN_IS_DATAOUT(sock) ((rte_ring_count((sock)->send_ring) || (sock)->send_lastdata)) +-#define NETCONN_IS_OUTIDLE(sock) rte_ring_free_count((sock)->send_ring) ++#define NETCONN_IS_DATAIN(sock) ((gazelle_ring_readable_count((sock)->recv_ring) || (sock)->recv_lastdata)) ++#define NETCONN_IS_DATAOUT(sock) gazelle_ring_readover_count((sock)->send_ring) ++#define NETCONN_IS_OUTIDLE(sock) gazelle_ring_readable_count((sock)->send_ring) + + void create_shadow_fd(struct rpc_msg *msg); +-void listen_list_add_node(int32_t head_fd, int32_t add_fd); + void gazelle_init_sock(int32_t fd); + int32_t gazelle_socket(int domain, int type, int protocol); + void gazelle_clean_sock(int32_t fd); +@@ -37,7 +35,6 @@ void read_recv_list(struct protocol_stack *stack, uint32_t max_num); + void send_stack_list(struct protocol_stack *stack, uint32_t send_max); + void add_recv_list(int32_t fd); + void stack_sendlist_count(struct rpc_msg *msg); +-void stack_eventlist_count(struct rpc_msg *msg); + void get_lwip_conntable(struct rpc_msg *msg); + void get_lwip_connnum(struct rpc_msg *msg); + void stack_recvlist_count(struct rpc_msg *msg); +diff --git a/src/lstack/include/lstack_protocol_stack.h b/src/lstack/include/lstack_protocol_stack.h +index bc4e4bd..8a6aa9d 100644 +--- a/src/lstack/include/lstack_protocol_stack.h ++++ b/src/lstack/include/lstack_protocol_stack.h +@@ -14,48 +14,52 @@ + #define __GAZELLE_PROTOCOL_STACK_H__ + + #include +-#include + #include + #include ++#include + #include "dpdk_common.h" + #include "lstack_thread_rpc.h" + #include "gazelle_dfx_msg.h" + #include "lstack_lockless_queue.h" + ++#define SOCK_RECV_RING_SIZE (128) ++#define SOCK_RECV_FREE_THRES (32) ++#define SOCK_SEND_RING_SIZE (32) ++#define SOCK_SEND_REPLENISH_THRES (16) ++#define WAKEUP_MAX_NUM (32) ++ + struct protocol_stack { + uint32_t tid; + uint16_t queue_id; + uint16_t port_id; + uint16_t socket_id; + uint16_t cpu_id; +- volatile uint16_t conn_num; + cpu_set_t idle_cpuset; /* idle cpu in numa of stack, app thread bind to it */ ++ int32_t epollfd; /* kernel event thread epoll fd */ + +- lockless_queue rpc_queue; + struct rte_mempool *rx_pktmbuf_pool; + struct rte_mempool *tx_pktmbuf_pool; + struct rte_ring *rx_ring; + struct rte_ring *tx_ring; + struct rte_ring *reg_ring; + struct rte_ring *wakeup_ring; +- + struct reg_ring_msg *reg_buf; + ++ volatile uint16_t conn_num __rte_cache_aligned; ++ lockless_queue rpc_queue __rte_cache_aligned; ++ char pad __rte_cache_aligned; ++ + struct netif netif; ++ struct eth_dev_ops *dev_ops; + uint32_t rx_ring_used; + uint32_t tx_ring_used; +- struct eth_dev_ops *dev_ops; + + struct list_node recv_list; +- struct list_node listen_list; + struct list_node send_list; +- struct list_node event_list; +- pthread_spinlock_t event_lock; +- int32_t epollfd; /* kernel event thread epoll fd */ + +- struct gazelle_stat_pkts stats; +- struct 
gazelle_stack_latency latency; + struct stats_ *lwip_stats; ++ struct gazelle_stack_latency latency; ++ struct gazelle_stack_stat stats __rte_cache_aligned; + }; + + struct eth_params; +@@ -74,12 +78,14 @@ struct protocol_stack_group { + /* dfx stats */ + bool latency_start; + uint64_t call_alloc_fail; ++ pthread_spinlock_t wakeup_list_lock; ++ struct wakeup_poll *wakeup_list __rte_cache_aligned; + }; + + long get_stack_tid(void); + struct protocol_stack *get_protocol_stack(void); + struct protocol_stack *get_protocol_stack_by_fd(int32_t fd); +-struct protocol_stack *get_minconn_protocol_stack(void); ++struct protocol_stack *get_bind_protocol_stack(void); + struct protocol_stack_group *get_protocol_stack_group(void); + + int32_t init_protocol_stack(void); +@@ -96,6 +102,7 @@ int32_t stack_broadcast_close(int32_t fd); + + /* listen sync to all protocol stack thread, so that any protocol stack thread can build connect */ + int32_t stack_broadcast_listen(int32_t fd, int backlog); ++int32_t stack_single_listen(int32_t fd, int32_t backlog); + + /* ergodic the protocol stack thread to find the connection, because all threads are listening */ + int32_t stack_broadcast_accept(int32_t fd, struct sockaddr *addr, socklen_t *addrlen); +diff --git a/src/lstack/include/lstack_stack_stat.h b/src/lstack/include/lstack_stack_stat.h +index 2c3bf8f..e152fe6 100644 +--- a/src/lstack/include/lstack_stack_stat.h ++++ b/src/lstack/include/lstack_stack_stat.h +@@ -24,4 +24,7 @@ void stack_stat_init(void); + int32_t handle_stack_cmd(int fd, enum GAZELLE_STAT_MODE stat_mode); + uint64_t get_current_time(void); + void lstack_get_low_power_info(struct gazelle_stat_low_power_info *low_power_info); ++void register_wakeup(struct wakeup_poll *wakeup); ++void unregister_wakeup(struct wakeup_poll *wakeup); ++ + #endif /* GAZELLE_STACK_STAT_H */ +diff --git a/src/lstack/include/lstack_thread_rpc.h b/src/lstack/include/lstack_thread_rpc.h +index 61bcd38..35e6b1e 100644 +--- a/src/lstack/include/lstack_thread_rpc.h ++++ b/src/lstack/include/lstack_thread_rpc.h +@@ -37,12 +37,13 @@ union rpc_msg_arg { + socklen_t socklen; + size_t size; + }; ++struct rpc_msg_pool; + struct rpc_msg { + pthread_spinlock_t lock; /* msg handler unlock notice sender msg process done */ + int32_t self_release; /* 0:msg handler release msg 1:msg sender release msg */ + int64_t result; /* func return val */ + lockless_queue_node queue_node; +- struct rte_mempool *pool; ++ struct rpc_msg_pool *pool; + + rpc_msg_func func; /* msg handle func hook */ + union rpc_msg_arg args[RPM_MSG_ARG_SIZE]; /* resolve by type */ +@@ -50,11 +51,9 @@ struct rpc_msg { + + struct protocol_stack; + void poll_rpc_msg(struct protocol_stack *stack, uint32_t max_num); +-void rpc_call_addevent(struct protocol_stack *stack, void *sock); + int32_t rpc_call_msgcnt(struct protocol_stack *stack); + int32_t rpc_call_shadow_fd(struct protocol_stack *stack, int32_t fd, const struct sockaddr *addr, socklen_t addrlen); + int32_t rpc_call_recvlistcnt(struct protocol_stack *stack); +-int32_t rpc_call_eventlistcnt(struct protocol_stack *stack); + int32_t rpc_call_sendlistcnt(struct protocol_stack *stack); + int32_t rpc_call_thread_regphase1(struct protocol_stack *stack, void *conn); + int32_t rpc_call_thread_regphase2(struct protocol_stack *stack, void *conn); +diff --git a/src/lstack/include/posix/lstack_epoll.h b/src/lstack/include/posix/lstack_epoll.h +index a83f41f..e9f9b91 100644 +--- a/src/lstack/include/posix/lstack_epoll.h ++++ b/src/lstack/include/posix/lstack_epoll.h +@@ -20,16 
+20,27 @@ extern "C" { + #include + #include + #include ++#include ++#include + + #include "lstack_protocol_stack.h" + ++enum wakeup_type { ++ WAKEUP_EPOLL = 0, ++ WAKEUP_POLL, ++}; + struct wakeup_poll { ++ /* stack thread read frequently */ ++ sem_t event_sem __rte_cache_aligned; ++ enum wakeup_type type __rte_cache_aligned; ++ volatile bool have_kernel_event __rte_cache_aligned; ++ struct gazelle_wakeup_stat stat __rte_cache_aligned; ++ char pad __rte_cache_aligned; ++ + bool init; + struct protocol_stack *bind_stack; +- sem_t event_sem; +- +- int32_t epollfd; +- bool have_kernel_fd; ++ int32_t epollfd; /* epoll kernel fd, ctl add into gazelle_kernel_event thread */ ++ struct wakeup_poll *next; + + /* poll */ + struct pollfd *last_fds; +@@ -40,7 +51,8 @@ struct wakeup_poll { + /* epoll */ + int32_t stack_fd_cnt[PROTOCOL_STACK_MAX]; + struct protocol_stack *max_stack; +- struct list_node event_list; /* epoll temp use */ ++ struct list_node event_list; ++ pthread_spinlock_t event_list_lock; + }; + + int32_t lstack_epoll_create(int32_t size); +diff --git a/src/lstack/lstack.conf b/src/lstack/lstack.conf +index 696dfb9..fdca602 100644 +--- a/src/lstack/lstack.conf ++++ b/src/lstack/lstack.conf +@@ -16,6 +16,7 @@ kni_switch=0 + low_power_mode=0 + + num_cpus="2" ++num_wakeup="3" + + host_addr="192.168.1.10" + mask_addr="255.255.255.0" +diff --git a/src/lstack/netif/lstack_ethdev.c b/src/lstack/netif/lstack_ethdev.c +index 382f3bc..7938520 100644 +--- a/src/lstack/netif/lstack_ethdev.c ++++ b/src/lstack/netif/lstack_ethdev.c +@@ -39,10 +39,11 @@ void eth_dev_recv(struct rte_mbuf *mbuf) + struct pbuf_custom *pc = NULL; + struct protocol_stack *stack = get_protocol_stack(); + struct rte_mbuf *m = mbuf; +- uint16_t len; ++ uint16_t len, pkt_len; + ++ pkt_len = (uint16_t)rte_pktmbuf_pkt_len(m); + while (m != NULL) { +- len = (uint16_t)rte_pktmbuf_pkt_len(m); ++ len = (uint16_t)rte_pktmbuf_data_len(m); + payload = rte_pktmbuf_mtod(m, void *); + pc = mbuf_to_pbuf(m); + pc->custom_free_function = gazelle_free_pbuf; +@@ -51,6 +52,7 @@ void eth_dev_recv(struct rte_mbuf *mbuf) + stack->stats.rx_allocmbuf_fail++; + break; + } ++ next->tot_len = pkt_len; + #if CHECKSUM_CHECK_IP_HW || CHECKSUM_CHECK_TCP_HW + next->ol_flags = m->ol_flags; + #endif +@@ -71,7 +73,6 @@ void eth_dev_recv(struct rte_mbuf *mbuf) + if (ret != ERR_OK) { + LSTACK_LOG(ERR, LSTACK, "eth_dev_recv: failed to handle rx pbuf ret=%d\n", ret); + stack->stats.rx_drop++; +- pbuf_free(head); + } + } + } +@@ -181,7 +182,7 @@ int32_t ethdev_init(struct protocol_stack *stack) + + if (use_ltran()) { + stack->rx_ring_used = 0; +- int32_t ret = fill_mbuf_to_ring(stack->rx_pktmbuf_pool, stack->rx_ring, VDEV_RX_QUEUE_SZ - 1); ++ int32_t ret = fill_mbuf_to_ring(stack->rx_pktmbuf_pool, stack->rx_ring, RING_SIZE(VDEV_RX_QUEUE_SZ)); + if (ret != 0) { + LSTACK_LOG(ERR, LSTACK, "fill mbuf to rx_ring failed ret=%d\n", ret); + return ret; +diff --git a/src/lstack/netif/lstack_vdev.c b/src/lstack/netif/lstack_vdev.c +index 5a4e86a..287ac8f 100644 +--- a/src/lstack/netif/lstack_vdev.c ++++ b/src/lstack/netif/lstack_vdev.c +@@ -42,14 +42,14 @@ static uint32_t ltran_rx_poll(struct protocol_stack *stack, struct rte_mbuf **pk + uint32_t nr_pkts; + struct rte_mbuf *free_buf[DPDK_PKT_BURST_SIZE]; + +- rcvd_pkts = rte_ring_en_dequeue_burst(stack->rx_ring, (void **)pkts, max_mbuf); ++ rcvd_pkts = gazelle_ring_sc_dequeue(stack->rx_ring, (void **)pkts, max_mbuf); + + stack->rx_ring_used += rcvd_pkts; + if (unlikely(stack->rx_ring_used >= USED_RX_PKTS_WATERMARK)) { +- 
uint32_t free_cnt = LWIP_MIN(stack->rx_ring_used, DPDK_PKT_BURST_SIZE); ++ uint32_t free_cnt = LWIP_MIN(stack->rx_ring_used, RING_SIZE(DPDK_PKT_BURST_SIZE)); + int32_t ret = gazelle_alloc_pktmbuf(stack->rx_pktmbuf_pool, (struct rte_mbuf **)free_buf, free_cnt); + if (likely(ret == 0)) { +- nr_pkts = rte_ring_en_enqueue_bulk(stack->rx_ring, (void **)free_buf, free_cnt); ++ nr_pkts = gazelle_ring_sp_enqueue(stack->rx_ring, (void **)free_buf, free_cnt); + stack->rx_ring_used -= nr_pkts; + } else { + stack->stats.rx_allocmbuf_fail++; +@@ -72,14 +72,14 @@ static uint32_t ltran_tx_xmit(struct protocol_stack *stack, struct rte_mbuf **pk + + do { + if (unlikely(stack->tx_ring_used >= INUSE_TX_PKTS_WATERMARK)) { +- uint32_t free_pkts = rte_ring_en_dequeue_burst(stack->tx_ring, (void **)free_buf, stack->tx_ring_used); ++ uint32_t free_pkts = gazelle_ring_sc_dequeue(stack->tx_ring, (void **)free_buf, stack->tx_ring_used); + for (uint32_t i = 0; i < free_pkts; i++) { + rte_pktmbuf_free(free_buf[i]); + } + stack->tx_ring_used -= free_pkts; + } + +- sent_pkts += rte_ring_en_enqueue_bulk(stack->tx_ring, (void **)(&pkts[sent_pkts]), nr_pkts - sent_pkts); ++ sent_pkts += gazelle_ring_sp_enqueue(stack->tx_ring, (void **)(&pkts[sent_pkts]), nr_pkts - sent_pkts); + } while ((sent_pkts < nr_pkts) && (ENQUEUE_RING_RETRY_TIMEOUT > sys_now() - tbegin) && get_register_state()); + + stack->tx_ring_used += sent_pkts; +@@ -128,7 +128,7 @@ int32_t vdev_reg_xmit(enum reg_ring_type type, struct gazelle_quintuple *qtuple) + } + + do { +- (void)rte_ring_en_dequeue_burst(stack->reg_ring, free_buf, VDEV_REG_QUEUE_SZ); ++ (void)gazelle_ring_sc_dequeue(stack->reg_ring, free_buf, VDEV_REG_QUEUE_SZ); + + if (get_reg_ring_free_count(stack->reg_ring) == 0) { + continue; +@@ -144,7 +144,7 @@ int32_t vdev_reg_xmit(enum reg_ring_type type, struct gazelle_quintuple *qtuple) + } + + free_buf[0] = tmp_buf; +- sent_pkts = rte_ring_en_enqueue_bulk(stack->reg_ring, free_buf, 1); ++ sent_pkts = gazelle_ring_sp_enqueue(stack->reg_ring, free_buf, 1); + } while ((sent_pkts < 1) && (ENQUEUE_RING_RETRY_TIMEOUT > sys_now() - tbegin) && get_register_state()); + + if (sent_pkts == 1) { +diff --git a/src/ltran/ltran_dfx.c b/src/ltran/ltran_dfx.c +index 8d71966..7db1adc 100644 +--- a/src/ltran/ltran_dfx.c ++++ b/src/ltran/ltran_dfx.c +@@ -546,32 +546,28 @@ static void gazelle_print_lstack_stat_brief(struct gazelle_stat_lstack_total *st + static void show_lstack_stats(struct gazelle_stack_dfx_data *lstack_stat) + { + printf("\n------ stack tid: %6u ------\n", lstack_stat->tid); +- printf("rx_pkts: %-20"PRIu64" ", lstack_stat->data.pkts.rx); +- printf("rx_drop: %-20"PRIu64" ", lstack_stat->data.pkts.rx_drop); +- printf("rx_allocmbuf_fail: %-10"PRIu64"\n", lstack_stat->data.pkts.rx_allocmbuf_fail); +- printf("tx_pkts: %-20"PRIu64" ", lstack_stat->data.pkts.tx); +- printf("tx_drop: %-20"PRIu64" ", lstack_stat->data.pkts.tx_drop); +- printf("tx_allocmbuf_fail: %-10"PRIu64"\n", lstack_stat->data.pkts.tx_allocmbuf_fail); +- printf("app_read: %-19"PRIu64" ", lstack_stat->data.pkts.app_read_cnt); +- printf("read_lwip: %-18"PRIu64" ", lstack_stat->data.pkts.read_lwip_cnt); +- printf("read_lwip_drop: %-13"PRIu64" \n", lstack_stat->data.pkts.read_lwip_drop); +- printf("app_write: %-18"PRIu64" ", lstack_stat->data.pkts.app_write_cnt); +- printf("write_lwip: %-17"PRIu64" ", lstack_stat->data.pkts.write_lwip_cnt); +- printf("app_get_idlefail: %-11"PRIu64" \n", lstack_stat->data.pkts.app_write_idlefail); +- printf("app_write_drop: %-13"PRIu64" ", 
lstack_stat->data.pkts.app_write_drop); +- printf("write_lwip_drop: %-12"PRIu64" ", lstack_stat->data.pkts.write_lwip_drop); +- printf("app_write_idlebuf: %-10"PRIu16" \n", lstack_stat->data.pkts.send_idle_ring_cnt); +- printf("event_list: %-17"PRIu64" ", lstack_stat->data.pkts.event_list); +- printf("recv_list: %-18"PRIu64" ", lstack_stat->data.pkts.recv_list); ++ printf("rx_pkts: %-20"PRIu64" ", lstack_stat->data.pkts.stack_stat.rx); ++ printf("rx_drop: %-20"PRIu64" ", lstack_stat->data.pkts.stack_stat.rx_drop); ++ printf("rx_allocmbuf_fail: %-10"PRIu64"\n", lstack_stat->data.pkts.stack_stat.rx_allocmbuf_fail); ++ printf("tx_pkts: %-20"PRIu64" ", lstack_stat->data.pkts.stack_stat.tx); ++ printf("tx_drop: %-20"PRIu64" ", lstack_stat->data.pkts.stack_stat.tx_drop); ++ printf("tx_allocmbuf_fail: %-10"PRIu64"\n", lstack_stat->data.pkts.stack_stat.tx_allocmbuf_fail); ++ printf("app_read: %-19"PRIu64" ", lstack_stat->data.pkts.wakeup_stat.app_read_cnt); ++ printf("read_lwip: %-18"PRIu64" ", lstack_stat->data.pkts.stack_stat.read_lwip_cnt); ++ printf("read_lwip_drop: %-13"PRIu64" \n", lstack_stat->data.pkts.stack_stat.read_lwip_drop); ++ printf("app_write: %-18"PRIu64" ", lstack_stat->data.pkts.wakeup_stat.app_write_cnt); ++ printf("write_lwip: %-17"PRIu64" ", lstack_stat->data.pkts.stack_stat.write_lwip_cnt); ++ printf("app_get_idlefail: %-11"PRIu64" \n", lstack_stat->data.pkts.wakeup_stat.app_write_idlefail); ++ printf("recv_list: %-18"PRIu64" ", lstack_stat->data.pkts.recv_list_cnt); ++ printf("send_list: %-18"PRIu64" ", lstack_stat->data.pkts.send_list_cnt); + printf("conn_num: %-19"PRIu16" \n", lstack_stat->data.pkts.conn_num); +- printf("wakeup_events: %-14"PRIu64" ", lstack_stat->data.pkts.wakeup_events); +- printf("app_events: %-17"PRIu64" ", lstack_stat->data.pkts.app_events); +- printf("read_null: %-18"PRIu64" \n", lstack_stat->data.pkts.read_null); ++ printf("wakeup_events: %-14"PRIu64" ", lstack_stat->data.pkts.stack_stat.wakeup_events); ++ printf("app_events: %-17"PRIu64" ", lstack_stat->data.pkts.wakeup_stat.app_events); ++ printf("read_null: %-18"PRIu64" \n", lstack_stat->data.pkts.wakeup_stat.read_null); + printf("call_msg: %-19"PRIu64" ", lstack_stat->data.pkts.call_msg_cnt); + printf("call_alloc_fail: %-12"PRIu64" ", lstack_stat->data.pkts.call_alloc_fail); +- printf("call_null: %-18"PRIu64" \n", lstack_stat->data.pkts.call_null); +- printf("send_self_rpc: %-14"PRIu64" ", lstack_stat->data.pkts.send_self_rpc); +- printf("send_list: %-18"PRIu64" \n", lstack_stat->data.pkts.send_list); ++ printf("call_null: %-18"PRIu64" \n", lstack_stat->data.pkts.stack_stat.call_null); ++ printf("send_self_rpc: %-14"PRIu64" \n", lstack_stat->data.pkts.stack_stat.send_self_rpc); + } + + static void gazelle_print_lstack_stat_detail(struct gazelle_stack_dfx_data *lstack_stat, +@@ -873,8 +869,8 @@ static void gazelle_print_lstack_stat_conn(void *buf, const struct gazelle_stat_ + printf("Active Internet connections (servers and established)\n"); + do { + printf("\n------ stack tid: %6u ------\n", stat->tid); +- printf("No. Proto recv_cnt recv_ring in_send send_ring sem_cnt Local Address " +- " Foreign Address State\n"); ++ printf("No. 
Proto recv_cnt recv_ring in_send send_ring sem_cnt fd Local Address " ++ " Foreign Address State\n"); + uint32_t unread_pkts = 0; + uint32_t unsend_pkts = 0; + for (i = 0; i < conn->conn_num && i < GAZELLE_LSTACK_MAX_CONN; i++) { +@@ -883,13 +879,13 @@ static void gazelle_print_lstack_stat_conn(void *buf, const struct gazelle_stat_ + rip.s_addr = conn_info->rip; + lip.s_addr = conn_info->lip; + if ((conn_info->state == GAZELLE_ACTIVE_LIST) || (conn_info->state == GAZELLE_TIME_WAIT_LIST)) { +- printf("%-6utcp %-10u%-11u%-9u%-11u%-9d%s:%hu\t%s:%hu\t%s\n", i, conn_info->recv_cnt, ++ printf("%-6utcp %-10u%-11u%-9u%-11u%-9d%-7d%s:%hu\t %s:%hu\t %s\n", i, conn_info->recv_cnt, + conn_info->recv_ring_cnt, conn_info->in_send, conn_info->send_ring_cnt, conn_info->sem_cnt, +- inet_ntop(AF_INET, &lip, str_ip, sizeof(str_ip)), conn_info->l_port, ++ conn_info->fd, inet_ntop(AF_INET, &lip, str_ip, sizeof(str_ip)), conn_info->l_port, + inet_ntop(AF_INET, &rip, str_rip, sizeof(str_rip)), conn_info->r_port, + tcp_state_to_str(conn_info->tcp_sub_state)); + } else if (conn_info->state == GAZELLE_LISTEN_LIST) { +- printf("%-6utcp %-50u%s:%hu\t0.0.0.0:*\t\tLISTEN\n", i, conn_info->recv_cnt, ++ printf("%-6utcp %-57u%s:%hu\t 0.0.0.0:*\t\t LISTEN\n", i, conn_info->recv_cnt, + inet_ntop(AF_INET, &lip, str_ip, sizeof(str_ip)), conn_info->l_port); + } else { + printf("Got unknow tcp conn::%s:%5hu, state:%u\n", +diff --git a/src/ltran/ltran_forward.c b/src/ltran/ltran_forward.c +index b264ad3..776692d 100644 +--- a/src/ltran/ltran_forward.c ++++ b/src/ltran/ltran_forward.c +@@ -92,7 +92,7 @@ static __rte_always_inline void backup_bufs_enque_rx_ring(struct gazelle_stack * + struct rte_mbuf *free_buf[RING_MAX_SIZE]; + + flush_cnt = (stack->backup_pkt_cnt < RING_MAX_SIZE) ? stack->backup_pkt_cnt : RING_MAX_SIZE; +- free_cnt = rte_ring_cn_dequeue_burst(stack->rx_ring, (void **)free_buf, flush_cnt); ++ free_cnt = gazelle_ring_read(stack->rx_ring, (void **)free_buf, flush_cnt); + + for (uint32_t j = 0; j < free_cnt; j++) { + index = (stack->backup_start + j) % backup_size; +@@ -102,7 +102,7 @@ static __rte_always_inline void backup_bufs_enque_rx_ring(struct gazelle_stack * + stack->stack_stats.rx += free_cnt; + stack->backup_pkt_cnt -= free_cnt; + stack->backup_start = (stack->backup_start + free_cnt) % backup_size; +- rte_ring_cn_enqueue(stack->rx_ring); ++ gazelle_ring_read_over(stack->rx_ring); + } + + static __rte_always_inline void pktbufs_move_to_backup_bufs(struct gazelle_stack *stack, struct rte_mbuf **mbuf, +@@ -135,7 +135,7 @@ static __rte_always_inline uint32_t pkt_bufs_enque_rx_ring(struct gazelle_stack + struct rte_mbuf **cl_buffer = stack->pkt_buf; + struct rte_mbuf *free_buf[GAZELLE_PACKET_READ_SIZE]; + +- free_cnt = rte_ring_cn_dequeue_burst(stack->rx_ring, (void **)free_buf, stack->pkt_cnt); ++ free_cnt = gazelle_ring_read(stack->rx_ring, (void **)free_buf, stack->pkt_cnt); + stack->stack_stats.rx += free_cnt; + + /* this prefetch and copy code, only 50~60 instruction, but never spend less than 70 cycle. 
+@@ -187,7 +187,7 @@ static __rte_always_inline uint32_t pkt_bufs_enque_rx_ring(struct gazelle_stack + } + + if (likely(free_cnt != 0)) { +- rte_ring_cn_enqueue(stack->rx_ring); ++ gazelle_ring_read_over(stack->rx_ring); + } + + return free_cnt; +@@ -520,14 +520,14 @@ static __rte_always_inline void tcp_hash_table_handle(struct gazelle_stack *stac + return; + } + +- uint32_t num = rte_ring_cn_dequeue_burst(stack->reg_ring, pkts, PACKET_READ_SIZE); ++ uint32_t num = gazelle_ring_read(stack->reg_ring, pkts, PACKET_READ_SIZE); + + for (uint32_t i = 0; i < num; i++) { + tcp_hash_table_modify(stack, pkts[i]); + pkts[i] = NULL; + } + +- rte_ring_cn_enqueue(stack->reg_ring); ++ gazelle_ring_read_over(stack->reg_ring); + if (pthread_mutex_unlock(&sock_htable->mlock) != 0) { + LTRAN_WARN("write tcp_htable: unlock failed, errno %d\n", errno); + } +@@ -675,7 +675,7 @@ static __rte_always_inline void downstream_forward_one(struct gazelle_stack *sta + uint32_t used_cnt; + + struct rte_mbuf *used_pkts[GAZELLE_PACKET_READ_SIZE]; +- used_cnt = rte_ring_cn_dequeue_burst(stack->tx_ring, (void **)used_pkts, GAZELLE_PACKET_READ_SIZE); ++ used_cnt = gazelle_ring_read(stack->tx_ring, (void **)used_pkts, GAZELLE_PACKET_READ_SIZE); + if (used_cnt == 0) { + return; + } +@@ -686,7 +686,7 @@ static __rte_always_inline void downstream_forward_one(struct gazelle_stack *sta + if (ret != 0) { + /* free pkts that not have be sent. */ + LTRAN_ERR("down alloc error, rx_pkts:%u ret=%d.\n", used_cnt, ret); +- rte_ring_cn_enqueue(stack->tx_ring); ++ gazelle_ring_read_over(stack->tx_ring); + stack->stack_stats.tx_drop += used_cnt; + rte_exit(EXIT_FAILURE, "down alloc error\n"); + } +@@ -696,7 +696,7 @@ static __rte_always_inline void downstream_forward_one(struct gazelle_stack *sta + tx_bytes += used_pkts[tx_pkts]->data_len; + stack->stack_stats.tx_bytes += used_pkts[tx_pkts]->data_len; + } +- rte_ring_cn_enqueue(stack->tx_ring); ++ gazelle_ring_read_over(stack->tx_ring); + + /* send packets anyway. */ + tx_pkts = 0; +diff --git a/src/ltran/ltran_stat.c b/src/ltran/ltran_stat.c +index 7080424..c6805a6 100644 +--- a/src/ltran/ltran_stat.c ++++ b/src/ltran/ltran_stat.c +@@ -25,6 +25,7 @@ + #include "gazelle_dfx_msg.h" + #include "ltran_timer.h" + #include "ltran_ethdev.h" ++#include "dpdk_common.h" + #include "ltran_forward.h" + + static uint64_t g_start_time_stamp = 0; +@@ -32,25 +33,11 @@ static int32_t g_start_latency = GAZELLE_OFF; + volatile int32_t g_ltran_stop_flag = GAZELLE_FALSE; + static struct statistics g_statistics; + +-static uint32_t get_rx_ring_count(const struct rte_ring *r) +-{ +- return rte_ring_count(r); +-} +- + uint64_t get_start_time_stamp(void) + { + return g_start_time_stamp; + } + +-static uint32_t get_tx_ring_count(const struct rte_ring *r) +-{ +- uint32_t prod_tail = r->prod.tail; +- uint32_t cons_head = r->cons.head; +- +- uint32_t count = (cons_head - prod_tail) & r->mask; +- return (count > r->capacity) ? 
r->capacity : count; +-} +- + void set_start_latency_flag(int32_t flag) + { + struct gazelle_instance_mgr *instance_mgr = get_instance_mgr(); +@@ -203,8 +190,8 @@ static int32_t gazelle_filling_lstack_stat_total(struct gazelle_stat_lstack_tota + stat->latency_pkts = stack->stack_stats.latency_pkts; + stat->latency_total = stack->stack_stats.latency_total; + stat->reg_ring_cnt = rte_ring_cn_count(stack->reg_ring); +- stat->rx_ring_cnt = get_rx_ring_count(stack->rx_ring); +- stat->tx_ring_cnt = get_tx_ring_count(stack->tx_ring); ++ stat->rx_ring_cnt = gazelle_ring_readover_count(stack->rx_ring); ++ stat->tx_ring_cnt = gazelle_ring_readable_count(stack->tx_ring); + + return GAZELLE_OK; + } +-- +2.23.0 + diff --git a/gazelle.spec b/gazelle.spec index d9f35e1..f0cc795 100644 --- a/gazelle.spec +++ b/gazelle.spec @@ -2,7 +2,7 @@ Name: gazelle Version: 1.0.1 -Release: 9 +Release: 10 Summary: gazelle is a high performance user-mode stack License: Mulan PSL v2 URL: https://gitee.com/openeuler/gazelle @@ -54,6 +54,22 @@ Patch9036: 0036-the-sending-of-sock-last-data-is-triggered-by-lstack.patch Patch9037: 0037-add-gazellectl-lstack-constraint.patch Patch9038: 0038-refactor-event.patch Patch9039: 0039-update-license-lockless-queue.patch +Patch9040: 0040-fix-sock-invalid-address.patch +Patch9041: 0041-exit-lstack-process-after-ltran-instance-logout.patch +Patch9042: 0042-use-atomic-variales-to-count.patch +Patch9043: 0043-re-arrange-the-program-to-invoke-rte_eth_dev_start-b.patch +Patch9044: 0044-delete-redundant-file.patch +Patch9045: 0045-lstack-all-exit-move-to-init.patch +Patch9046: 0046-clean-code-fix-huge-func.patch +Patch9047: 0047-add-kernel-path-in-epoll-funcs.patch +Patch9048: 0048-refactor-kernel-event-poll-epoll.patch +Patch9049: 0049-post-thread_phase1-sem-to-avoid-block-main-thread-wh.patch +Patch9050: 0050-adjust-the-number-of-RX-TX-mbufs-of-each-stack-threa.patch +Patch9051: 0051-modify-README.patch +Patch9052: 0052-bugfix-https-gitee.com-src-openeuler-gazelle-issues-.patch +Patch9053: 0053-update-README.md.patch +Patch9054: 0054-ltran-fix-use-after-free-issue.patch +Patch9055: 0055-refactor-pkt-read-send-performance.patch %description %{name} is a high performance user-mode stack. @@ -94,6 +110,15 @@ install -Dpm 0640 %{_builddir}/%{name}-%{version}/src/ltran/ltran.conf %{b %config(noreplace) %{conf_path}/ltran.conf %changelog +* Thu Jul 7 2022 jiangheng - 1.0.1-10 +- Type:bugfix +- CVE: +- SUG:NA +- DESC:update README + fix some bugs + refactor pkt read/send to improve performance + refactor kernel event to improve performance + + * Fri May 27 2022 xiusailong - 1.0.1-9 - update license lockless queue