dpdk/0235-dmadev-introduce-DMA-device-support.patch
speech_white 1c77287214 sync to master branch
sync patches ranging from version 9 to 17 from master branch

Signed-off-by: speech_white <humin29@huawei.com>
2021-12-17 10:45:19 +08:00


From 8f6d527cd1ab3c7f5e6490425795b46cb5ebc42a Mon Sep 17 00:00:00 2001
From: "Min Hu (Connor)" <humin29@huawei.com>
Date: Fri, 12 Nov 2021 09:36:45 +0800
Subject: [PATCH] dmadev: introduce DMA device support

Signed-off-by: Min Hu (Connor) <humin29@huawei.com>
---
app/test/meson.build | 4 +
app/test/test_dmadev.c | 867 ++++++++++++++++++
app/test/test_dmadev_api.c | 574 ++++++++++++
app/test/test_dmadev_api.h | 5 +
doc/api/doxy-api-index.md | 1 +
doc/api/doxy-api.conf.in | 1 +
doc/guides/dmadevs/index.rst | 12 +
doc/guides/index.rst | 1 +
doc/guides/prog_guide/dmadev.rst | 90 ++
doc/guides/prog_guide/img/dmadev.svg | 283 ++++++
doc/guides/prog_guide/index.rst | 1 +
drivers/dma/hisilicon/hisi_dmadev.c | 925 +++++++++++++++++++
drivers/dma/hisilicon/hisi_dmadev.h | 236 +++++
drivers/dma/hisilicon/meson.build | 19 +
drivers/dma/hisilicon/version.map | 3 +
drivers/dma/meson.build | 13 +
drivers/dma/skeleton/meson.build | 11 +
drivers/dma/skeleton/skeleton_dmadev.c | 596 +++++++++++++
drivers/dma/skeleton/skeleton_dmadev.h | 61 ++
drivers/dma/skeleton/version.map | 3 +
drivers/meson.build | 1 +
examples/dma/Makefile | 51 ++
examples/dma/dmafwd.c | 1105 +++++++++++++++++++++++
examples/dma/meson.build | 15 +
examples/meson.build | 2 +-
lib/librte_dmadev/meson.build | 5 +
lib/librte_dmadev/rte_dmadev.c | 866 ++++++++++++++++++
lib/librte_dmadev/rte_dmadev.h | 1138 ++++++++++++++++++++++++
lib/librte_dmadev/rte_dmadev_core.h | 80 ++
lib/librte_dmadev/rte_dmadev_pmd.h | 171 ++++
lib/librte_dmadev/version.map | 31 +
lib/meson.build | 2 +-
32 files changed, 7171 insertions(+), 2 deletions(-)
create mode 100644 app/test/test_dmadev.c
create mode 100644 app/test/test_dmadev_api.c
create mode 100644 app/test/test_dmadev_api.h
create mode 100644 doc/guides/dmadevs/index.rst
create mode 100644 doc/guides/prog_guide/dmadev.rst
create mode 100644 doc/guides/prog_guide/img/dmadev.svg
create mode 100644 drivers/dma/hisilicon/hisi_dmadev.c
create mode 100644 drivers/dma/hisilicon/hisi_dmadev.h
create mode 100644 drivers/dma/hisilicon/meson.build
create mode 100644 drivers/dma/hisilicon/version.map
create mode 100644 drivers/dma/meson.build
create mode 100644 drivers/dma/skeleton/meson.build
create mode 100644 drivers/dma/skeleton/skeleton_dmadev.c
create mode 100644 drivers/dma/skeleton/skeleton_dmadev.h
create mode 100644 drivers/dma/skeleton/version.map
create mode 100644 examples/dma/Makefile
create mode 100644 examples/dma/dmafwd.c
create mode 100644 examples/dma/meson.build
create mode 100644 lib/librte_dmadev/meson.build
create mode 100644 lib/librte_dmadev/rte_dmadev.c
create mode 100644 lib/librte_dmadev/rte_dmadev.h
create mode 100644 lib/librte_dmadev/rte_dmadev_core.h
create mode 100644 lib/librte_dmadev/rte_dmadev_pmd.h
create mode 100644 lib/librte_dmadev/version.map
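
Not part of the patch itself, but for orientation: the control path added by
lib/librte_dmadev (and exercised by app/test/test_dmadev.c below) boils down to
configure -> vchan setup -> start. A minimal sketch using only calls introduced
by this patch; the single vchan and the 512-descriptor ring are illustrative
values, and setup_first_dmadev() is a hypothetical helper:

#include <rte_dmadev.h>

/* Hypothetical helper: pick the first dmadev that reports at least one vchan,
 * give it a single MEM_TO_MEM virtual channel and start it.
 */
static int16_t
setup_first_dmadev(void)
{
	const struct rte_dma_conf dev_conf = { .nb_vchans = 1 };
	const struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = 512,			/* example ring size */
	};
	struct rte_dma_info info;
	int16_t dev_id;

	RTE_DMA_FOREACH_DEV(dev_id) {
		if (rte_dma_info_get(dev_id, &info) != 0 || info.max_vchans < 1)
			continue;
		if (rte_dma_configure(dev_id, &dev_conf) != 0 ||
				rte_dma_vchan_setup(dev_id, 0, &qconf) != 0 ||
				rte_dma_start(dev_id) != 0)
			continue;
		return dev_id;		/* ready for copy/fill enqueues */
	}
	return -1;			/* no usable dmadev found */
}

The data path (rte_dma_copy / rte_dma_submit / rte_dma_completed) is sketched
after the dmadev programmer's guide further down in this patch.
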
diff --git a/app/test/meson.build b/app/test/meson.build
index 94fd39fec..88bed64ad 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -34,6 +34,8 @@ test_sources = files('commands.c',
'test_debug.c',
'test_distributor.c',
'test_distributor_perf.c',
+ 'test_dmadev.c',
+ 'test_dmadev_api.c',
'test_eal_flags.c',
'test_eal_fs.c',
'test_efd.c',
@@ -151,6 +153,7 @@ test_deps = ['acl',
'cmdline',
'cryptodev',
'distributor',
+ 'dmadev',
'efd',
'ethdev',
'eventdev',
@@ -320,6 +323,7 @@ driver_test_names = [
'cryptodev_sw_mvsam_autotest',
'cryptodev_sw_snow3g_autotest',
'cryptodev_sw_zuc_autotest',
+ 'dmadev_autotest',
'eventdev_selftest_octeontx',
'eventdev_selftest_sw',
'rawdev_autotest',
diff --git a/app/test/test_dmadev.c b/app/test/test_dmadev.c
new file mode 100644
index 000000000..b206db27a
--- /dev/null
+++ b/app/test/test_dmadev.c
@@ -0,0 +1,867 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <inttypes.h>
+
+#include <rte_dmadev.h>
+#include <rte_mbuf.h>
+#include <rte_pause.h>
+#include <rte_cycles.h>
+#include <rte_random.h>
+#include <rte_bus_vdev.h>
+#include <rte_dmadev_pmd.h>
+
+#include "test.h"
+#include "test_dmadev_api.h"
+
+#define ERR_RETURN(...) do { print_err(__func__, __LINE__, __VA_ARGS__); return -1; } while (0)
+
+#define COPY_LEN 1024
+
+static struct rte_mempool *pool;
+static uint16_t id_count;
+
+static void
+__rte_format_printf(3, 4)
+print_err(const char *func, int lineno, const char *format, ...)
+{
+ va_list ap;
+
+ fprintf(stderr, "In %s:%d - ", func, lineno);
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ va_end(ap);
+}
+
+static int
+runtest(const char *printable, int (*test_fn)(int16_t dev_id, uint16_t vchan), int iterations,
+ int16_t dev_id, uint16_t vchan, bool check_err_stats)
+{
+ struct rte_dma_stats stats;
+ int i;
+
+ rte_dma_stats_reset(dev_id, vchan);
+ printf("DMA Dev %d: Running %s Tests %s\n", dev_id, printable,
+ check_err_stats ? " " : "(errors expected)");
+ for (i = 0; i < iterations; i++) {
+ if (test_fn(dev_id, vchan) < 0)
+ return -1;
+
+ rte_dma_stats_get(dev_id, 0, &stats);
+ printf("Ops submitted: %"PRIu64"\t", stats.submitted);
+ printf("Ops completed: %"PRIu64"\t", stats.completed);
+ printf("Errors: %"PRIu64"\r", stats.errors);
+
+ if (stats.completed != stats.submitted)
+ ERR_RETURN("\nError, not all submitted jobs are reported as completed\n");
+ if (check_err_stats && stats.errors != 0)
+ ERR_RETURN("\nErrors reported during op processing, aborting tests\n");
+ }
+ printf("\n");
+ return 0;
+}
+
+static void
+await_hw(int16_t dev_id, uint16_t vchan)
+{
+ enum rte_dma_vchan_status st;
+
+ if (rte_dma_vchan_status(dev_id, vchan, &st) < 0) {
+ /* for drivers that don't support this op, just sleep for 1 millisecond */
+ rte_delay_us_sleep(1000);
+ return;
+ }
+
+ /* for those that do, *max* end time is one second from now, but all should be faster */
+ const uint64_t end_cycles = rte_get_timer_cycles() + rte_get_timer_hz();
+ while (st == RTE_DMA_VCHAN_ACTIVE && rte_get_timer_cycles() < end_cycles) {
+ rte_pause();
+ rte_dma_vchan_status(dev_id, vchan, &st);
+ }
+}
+
+/* run a series of copy tests just using some different options for enqueues and completions */
+static int
+do_multi_copies(int16_t dev_id, uint16_t vchan,
+ int split_batches, /* submit 2 x 16 or 1 x 32 burst */
+ int split_completions, /* gather 2 x 16 or 1 x 32 completions */
+ int use_completed_status) /* use completed or completed_status function */
+{
+ struct rte_mbuf *srcs[32], *dsts[32];
+ enum rte_dma_status_code sc[32];
+ unsigned int i, j;
+ bool dma_err = false;
+
+ /* Enqueue burst of copies and hit doorbell */
+ for (i = 0; i < RTE_DIM(srcs); i++) {
+ uint64_t *src_data;
+
+ if (split_batches && i == RTE_DIM(srcs) / 2)
+ rte_dma_submit(dev_id, vchan);
+
+ srcs[i] = rte_pktmbuf_alloc(pool);
+ dsts[i] = rte_pktmbuf_alloc(pool);
+ if (srcs[i] == NULL || dsts[i] == NULL)
+ ERR_RETURN("Error allocating buffers\n");
+
+ src_data = rte_pktmbuf_mtod(srcs[i], uint64_t *);
+ for (j = 0; j < COPY_LEN/sizeof(uint64_t); j++)
+ src_data[j] = rte_rand();
+
+ if (rte_dma_copy(dev_id, vchan, srcs[i]->buf_iova + srcs[i]->data_off,
+ dsts[i]->buf_iova + dsts[i]->data_off, COPY_LEN, 0) != id_count++)
+ ERR_RETURN("Error with rte_dma_copy for buffer %u\n", i);
+ }
+ rte_dma_submit(dev_id, vchan);
+
+ await_hw(dev_id, vchan);
+
+ if (split_completions) {
+ /* gather completions in two halves */
+ uint16_t half_len = RTE_DIM(srcs) / 2;
+ int ret = rte_dma_completed(dev_id, vchan, half_len, NULL, &dma_err);
+ if (ret != half_len || dma_err)
+ ERR_RETURN("Error with rte_dma_completed - first half. ret = %d, expected ret = %u, dma_err = %d\n",
+ ret, half_len, dma_err);
+
+ ret = rte_dma_completed(dev_id, vchan, half_len, NULL, &dma_err);
+ if (ret != half_len || dma_err)
+ ERR_RETURN("Error with rte_dma_completed - second half. ret = %d, expected ret = %u, dma_err = %d\n",
+ ret, half_len, dma_err);
+ } else {
+ /* gather all completions in one go, using either
+ * completed or completed_status fns
+ */
+ if (!use_completed_status) {
+ int n = rte_dma_completed(dev_id, vchan, RTE_DIM(srcs), NULL, &dma_err);
+ if (n != RTE_DIM(srcs) || dma_err)
+ ERR_RETURN("Error with rte_dma_completed, %u [expected: %zu], dma_err = %d\n",
+ n, RTE_DIM(srcs), dma_err);
+ } else {
+ int n = rte_dma_completed_status(dev_id, vchan, RTE_DIM(srcs), NULL, sc);
+ if (n != RTE_DIM(srcs))
+ ERR_RETURN("Error with rte_dma_completed_status, %u [expected: %zu]\n",
+ n, RTE_DIM(srcs));
+
+ for (j = 0; j < (uint16_t)n; j++)
+ if (sc[j] != RTE_DMA_STATUS_SUCCESSFUL)
+ ERR_RETURN("Error with rte_dma_completed_status, job %u reports failure [code %u]\n",
+ j, sc[j]);
+ }
+ }
+
+ /* check for empty */
+ int ret = use_completed_status ?
+ rte_dma_completed_status(dev_id, vchan, RTE_DIM(srcs), NULL, sc) :
+ rte_dma_completed(dev_id, vchan, RTE_DIM(srcs), NULL, &dma_err);
+ if (ret != 0)
+ ERR_RETURN("Error with completion check - ops unexpectedly returned\n");
+
+ for (i = 0; i < RTE_DIM(srcs); i++) {
+ char *src_data, *dst_data;
+
+ src_data = rte_pktmbuf_mtod(srcs[i], char *);
+ dst_data = rte_pktmbuf_mtod(dsts[i], char *);
+ for (j = 0; j < COPY_LEN; j++)
+ if (src_data[j] != dst_data[j])
+ ERR_RETURN("Error with copy of packet %u, byte %u\n", i, j);
+
+ rte_pktmbuf_free(srcs[i]);
+ rte_pktmbuf_free(dsts[i]);
+ }
+ return 0;
+}
+
+static int
+test_enqueue_copies(int16_t dev_id, uint16_t vchan)
+{
+ unsigned int i;
+ uint16_t id;
+
+ /* test doing a single copy */
+ do {
+ struct rte_mbuf *src, *dst;
+ char *src_data, *dst_data;
+
+ src = rte_pktmbuf_alloc(pool);
+ dst = rte_pktmbuf_alloc(pool);
+ src_data = rte_pktmbuf_mtod(src, char *);
+ dst_data = rte_pktmbuf_mtod(dst, char *);
+
+ for (i = 0; i < COPY_LEN; i++)
+ src_data[i] = rte_rand() & 0xFF;
+
+ id = rte_dma_copy(dev_id, vchan, rte_pktmbuf_iova(src), rte_pktmbuf_iova(dst),
+ COPY_LEN, RTE_DMA_OP_FLAG_SUBMIT);
+ if (id != id_count)
+ ERR_RETURN("Error with rte_dma_copy, got %u, expected %u\n",
+ id, id_count);
+
+ /* give time for copy to finish, then check it was done */
+ await_hw(dev_id, vchan);
+
+ for (i = 0; i < COPY_LEN; i++)
+ if (dst_data[i] != src_data[i])
+ ERR_RETURN("Data mismatch at char %u [Got %02x not %02x]\n", i,
+ dst_data[i], src_data[i]);
+
+ /* now check completion works */
+ if (rte_dma_completed(dev_id, vchan, 1, &id, NULL) != 1)
+ ERR_RETURN("Error with rte_dma_completed\n");
+
+ if (id != id_count)
+ ERR_RETURN("Error:incorrect job id received, %u [expected %u]\n",
+ id, id_count);
+
+ rte_pktmbuf_free(src);
+ rte_pktmbuf_free(dst);
+
+ /* now check completion returns nothing more */
+ if (rte_dma_completed(dev_id, 0, 1, NULL, NULL) != 0)
+ ERR_RETURN("Error with rte_dma_completed in empty check\n");
+
+ id_count++;
+
+ } while (0);
+
+ /* test doing multiple single copies */
+ do {
+ const uint16_t max_ops = 4;
+ struct rte_mbuf *src, *dst;
+ char *src_data, *dst_data;
+ uint16_t count;
+
+ src = rte_pktmbuf_alloc(pool);
+ dst = rte_pktmbuf_alloc(pool);
+ src_data = rte_pktmbuf_mtod(src, char *);
+ dst_data = rte_pktmbuf_mtod(dst, char *);
+
+ for (i = 0; i < COPY_LEN; i++)
+ src_data[i] = rte_rand() & 0xFF;
+
+ /* perform the same copy <max_ops> times */
+ for (i = 0; i < max_ops; i++)
+ if (rte_dma_copy(dev_id, vchan,
+ rte_pktmbuf_iova(src),
+ rte_pktmbuf_iova(dst),
+ COPY_LEN, RTE_DMA_OP_FLAG_SUBMIT) != id_count++)
+ ERR_RETURN("Error with rte_dma_copy\n");
+
+ await_hw(dev_id, vchan);
+
+ count = rte_dma_completed(dev_id, vchan, max_ops * 2, &id, NULL);
+ if (count != max_ops)
+ ERR_RETURN("Error with rte_dma_completed, got %u not %u\n",
+ count, max_ops);
+
+ if (id != id_count - 1)
+ ERR_RETURN("Error, incorrect job id returned: got %u not %u\n",
+ id, id_count - 1);
+
+ for (i = 0; i < COPY_LEN; i++)
+ if (dst_data[i] != src_data[i])
+ ERR_RETURN("Data mismatch at char %u\n", i);
+
+ rte_pktmbuf_free(src);
+ rte_pktmbuf_free(dst);
+ } while (0);
+
+ /* test doing multiple copies */
+ return do_multi_copies(dev_id, vchan, 0, 0, 0) /* enqueue and complete 1 batch at a time */
+ /* enqueue 2 batches and then complete both */
+ || do_multi_copies(dev_id, vchan, 1, 0, 0)
+ /* enqueue 1 batch, then complete in two halves */
+ || do_multi_copies(dev_id, vchan, 0, 1, 0)
+ /* test using completed_status in place of regular completed API */
+ || do_multi_copies(dev_id, vchan, 0, 0, 1);
+}
+
+/* Failure handling test cases - global macros and variables for those tests*/
+#define COMP_BURST_SZ 16
+#define OPT_FENCE(idx) ((fence && idx == 8) ? RTE_DMA_OP_FLAG_FENCE : 0)
+
+static int
+test_failure_in_full_burst(int16_t dev_id, uint16_t vchan, bool fence,
+ struct rte_mbuf **srcs, struct rte_mbuf **dsts, unsigned int fail_idx)
+{
+ /* Test single full batch statuses with failures */
+ enum rte_dma_status_code status[COMP_BURST_SZ];
+ struct rte_dma_stats baseline, stats;
+ uint16_t invalid_addr_id = 0;
+ uint16_t idx;
+ uint16_t count, status_count;
+ unsigned int i;
+ bool error = false;
+ int err_count = 0;
+
+ rte_dma_stats_get(dev_id, vchan, &baseline); /* get a baseline set of stats */
+ for (i = 0; i < COMP_BURST_SZ; i++) {
+ int id = rte_dma_copy(dev_id, vchan,
+ (i == fail_idx ? 0 : (srcs[i]->buf_iova + srcs[i]->data_off)),
+ dsts[i]->buf_iova + dsts[i]->data_off,
+ COPY_LEN, OPT_FENCE(i));
+ if (id < 0)
+ ERR_RETURN("Error with rte_dma_copy for buffer %u\n", i);
+ if (i == fail_idx)
+ invalid_addr_id = id;
+ }
+ rte_dma_submit(dev_id, vchan);
+ rte_dma_stats_get(dev_id, vchan, &stats);
+ if (stats.submitted != baseline.submitted + COMP_BURST_SZ)
+ ERR_RETURN("Submitted stats value not as expected, %"PRIu64" not %"PRIu64"\n",
+ stats.submitted, baseline.submitted + COMP_BURST_SZ);
+
+ await_hw(dev_id, vchan);
+
+ count = rte_dma_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error);
+ if (count != fail_idx)
+ ERR_RETURN("Error with rte_dma_completed for failure test. Got returned %u not %u.\n",
+ count, fail_idx);
+ if (!error)
+ ERR_RETURN("Error, missing expected failed copy, %u. has_error is not set\n",
+ fail_idx);
+ if (idx != invalid_addr_id - 1)
+ ERR_RETURN("Error, missing expected failed copy, %u. Got last idx %u, not %u\n",
+ fail_idx, idx, invalid_addr_id - 1);
+
+ /* all checks ok, now verify calling completed() again always returns 0 */
+ for (i = 0; i < 10; i++)
+ if (rte_dma_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error) != 0
+ || error == false || idx != (invalid_addr_id - 1))
+ ERR_RETURN("Error with follow-up completed calls for fail idx %u\n",
+ fail_idx);
+
+ status_count = rte_dma_completed_status(dev_id, vchan, COMP_BURST_SZ,
+ &idx, status);
+ /* some HW may stop on error and be restarted after getting error status for a single value.
+ * To handle this case, if we get just one error back, wait for more completions and get
+ * status for the rest of the burst
+ */
+ if (status_count == 1) {
+ await_hw(dev_id, vchan);
+ status_count += rte_dma_completed_status(dev_id, vchan, COMP_BURST_SZ - 1,
+ &idx, &status[1]);
+ }
+ /* check that at this point we have all status values */
+ if (status_count != COMP_BURST_SZ - count)
+ ERR_RETURN("Error with completed_status calls for fail idx %u. Got %u not %u\n",
+ fail_idx, status_count, COMP_BURST_SZ - count);
+ /* now verify just one failure followed by multiple successful or skipped entries */
+ if (status[0] == RTE_DMA_STATUS_SUCCESSFUL)
+ ERR_RETURN("Error with status returned for fail idx %u. First status was not failure\n",
+ fail_idx);
+ for (i = 1; i < status_count; i++)
+ /* after a failure in a burst, depending on ordering/fencing,
+ * operations may be successful or skipped because of previous error.
+ */
+ if (status[i] != RTE_DMA_STATUS_SUCCESSFUL
+ && status[i] != RTE_DMA_STATUS_NOT_ATTEMPTED)
+ ERR_RETURN("Error with status calls for fail idx %u. Status for job %u (of %u) is not successful\n",
+ fail_idx, count + i, COMP_BURST_SZ);
+
+ /* check the completed + errors stats are as expected */
+ rte_dma_stats_get(dev_id, vchan, &stats);
+ if (stats.completed != baseline.completed + COMP_BURST_SZ)
+ ERR_RETURN("Completed stats value not as expected, %"PRIu64" not %"PRIu64"\n",
+ stats.completed, baseline.completed + COMP_BURST_SZ);
+ for (i = 0; i < status_count; i++)
+ err_count += (status[i] != RTE_DMA_STATUS_SUCCESSFUL);
+ if (stats.errors != baseline.errors + err_count)
+ ERR_RETURN("'Errors' stats value not as expected, %"PRIu64" not %"PRIu64"\n",
+ stats.errors, baseline.errors + err_count);
+
+ return 0;
+}
+
+static int
+test_individual_status_query_with_failure(int16_t dev_id, uint16_t vchan, bool fence,
+ struct rte_mbuf **srcs, struct rte_mbuf **dsts, unsigned int fail_idx)
+{
+ /* Test gathering batch statuses one at a time */
+ enum rte_dma_status_code status[COMP_BURST_SZ];
+ uint16_t invalid_addr_id = 0;
+ uint16_t idx;
+ uint16_t count = 0, status_count = 0;
+ unsigned int j;
+ bool error = false;
+
+ for (j = 0; j < COMP_BURST_SZ; j++) {
+ int id = rte_dma_copy(dev_id, vchan,
+ (j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
+ dsts[j]->buf_iova + dsts[j]->data_off,
+ COPY_LEN, OPT_FENCE(j));
+ if (id < 0)
+ ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
+ if (j == fail_idx)
+ invalid_addr_id = id;
+ }
+ rte_dma_submit(dev_id, vchan);
+ await_hw(dev_id, vchan);
+
+ /* use regular "completed" until we hit error */
+ while (!error) {
+ uint16_t n = rte_dma_completed(dev_id, vchan, 1, &idx, &error);
+ count += n;
+ if (n > 1 || count >= COMP_BURST_SZ)
+ ERR_RETURN("Error - too many completions got\n");
+ if (n == 0 && !error)
+ ERR_RETURN("Error, unexpectedly got zero completions after %u completed\n",
+ count);
+ }
+ if (idx != invalid_addr_id - 1)
+ ERR_RETURN("Error, last successful index not as expected, got %u, expected %u\n",
+ idx, invalid_addr_id - 1);
+
+ /* use completed_status until we hit end of burst */
+ while (count + status_count < COMP_BURST_SZ) {
+ uint16_t n = rte_dma_completed_status(dev_id, vchan, 1, &idx,
+ &status[status_count]);
+ await_hw(dev_id, vchan); /* allow delay to ensure jobs are completed */
+ status_count += n;
+ if (n != 1)
+ ERR_RETURN("Error: unexpected number of completions received, %u, not 1\n",
+ n);
+ }
+
+ /* check for single failure */
+ if (status[0] == RTE_DMA_STATUS_SUCCESSFUL)
+ ERR_RETURN("Error, unexpected successful DMA transaction\n");
+ for (j = 1; j < status_count; j++)
+ if (status[j] != RTE_DMA_STATUS_SUCCESSFUL
+ && status[j] != RTE_DMA_STATUS_NOT_ATTEMPTED)
+ ERR_RETURN("Error, unexpected DMA error reported\n");
+
+ return 0;
+}
+
+static int
+test_single_item_status_query_with_failure(int16_t dev_id, uint16_t vchan,
+ struct rte_mbuf **srcs, struct rte_mbuf **dsts, unsigned int fail_idx)
+{
+ /* When an error occurs, just collect a single error using "completed_status()"
+ * before going back to completed() calls
+ */
+ enum rte_dma_status_code status;
+ uint16_t invalid_addr_id = 0;
+ uint16_t idx;
+ uint16_t count, status_count, count2;
+ unsigned int j;
+ bool error = false;
+
+ for (j = 0; j < COMP_BURST_SZ; j++) {
+ int id = rte_dma_copy(dev_id, vchan,
+ (j == fail_idx ? 0 : (srcs[j]->buf_iova + srcs[j]->data_off)),
+ dsts[j]->buf_iova + dsts[j]->data_off,
+ COPY_LEN, 0);
+ if (id < 0)
+ ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
+ if (j == fail_idx)
+ invalid_addr_id = id;
+ }
+ rte_dma_submit(dev_id, vchan);
+ await_hw(dev_id, vchan);
+
+ /* get up to the error point */
+ count = rte_dma_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error);
+ if (count != fail_idx)
+ ERR_RETURN("Error with rte_dma_completed for failure test. Got returned %u not %u.\n",
+ count, fail_idx);
+ if (!error)
+ ERR_RETURN("Error, missing expected failed copy, %u. has_error is not set\n",
+ fail_idx);
+ if (idx != invalid_addr_id - 1)
+ ERR_RETURN("Error, missing expected failed copy, %u. Got last idx %u, not %u\n",
+ fail_idx, idx, invalid_addr_id - 1);
+
+ /* get the error code */
+ status_count = rte_dma_completed_status(dev_id, vchan, 1, &idx, &status);
+ if (status_count != 1)
+ ERR_RETURN("Error with completed_status calls for fail idx %u. Got %u not %u\n",
+ fail_idx, status_count, COMP_BURST_SZ - count);
+ if (status == RTE_DMA_STATUS_SUCCESSFUL)
+ ERR_RETURN("Error with status returned for fail idx %u. First status was not failure\n",
+ fail_idx);
+
+ /* delay in case time is needed after the error is handled to complete other jobs */
+ await_hw(dev_id, vchan);
+
+ /* get the rest of the completions without status */
+ count2 = rte_dma_completed(dev_id, vchan, COMP_BURST_SZ, &idx, &error);
+ if (error == true)
+ ERR_RETURN("Error, got further errors post completed_status() call, for failure case %u.\n",
+ fail_idx);
+ if (count + status_count + count2 != COMP_BURST_SZ)
+ ERR_RETURN("Error, incorrect number of completions received, got %u not %u\n",
+ count + status_count + count2, COMP_BURST_SZ);
+
+ return 0;
+}
+
+static int
+test_multi_failure(int16_t dev_id, uint16_t vchan, struct rte_mbuf **srcs, struct rte_mbuf **dsts,
+ const unsigned int *fail, size_t num_fail)
+{
+ /* test having multiple errors in one go */
+ enum rte_dma_status_code status[COMP_BURST_SZ];
+ unsigned int i, j;
+ uint16_t count, err_count = 0;
+ bool error = false;
+
+ /* enqueue and gather completions in one go */
+ for (j = 0; j < COMP_BURST_SZ; j++) {
+ uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
+ /* set up for failure if the current index is anywhere in the fails array */
+ for (i = 0; i < num_fail; i++)
+ if (j == fail[i])
+ src = 0;
+
+ int id = rte_dma_copy(dev_id, vchan,
+ src, dsts[j]->buf_iova + dsts[j]->data_off,
+ COPY_LEN, 0);
+ if (id < 0)
+ ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
+ }
+ rte_dma_submit(dev_id, vchan);
+ await_hw(dev_id, vchan);
+
+ count = rte_dma_completed_status(dev_id, vchan, COMP_BURST_SZ, NULL, status);
+ while (count < COMP_BURST_SZ) {
+ await_hw(dev_id, vchan);
+
+ uint16_t ret = rte_dma_completed_status(dev_id, vchan, COMP_BURST_SZ - count,
+ NULL, &status[count]);
+ if (ret == 0)
+ ERR_RETURN("Error getting all completions for jobs. Got %u of %u\n",
+ count, COMP_BURST_SZ);
+ count += ret;
+ }
+ for (i = 0; i < count; i++)
+ if (status[i] != RTE_DMA_STATUS_SUCCESSFUL)
+ err_count++;
+
+ if (err_count != num_fail)
+ ERR_RETURN("Error: Invalid number of failed completions returned, %u; expected %zu\n",
+ err_count, num_fail);
+
+ /* enqueue and gather completions in bursts, but getting errors one at a time */
+ for (j = 0; j < COMP_BURST_SZ; j++) {
+ uintptr_t src = srcs[j]->buf_iova + srcs[j]->data_off;
+ /* set up for failure if the current index is anywhere in the fails array */
+ for (i = 0; i < num_fail; i++)
+ if (j == fail[i])
+ src = 0;
+
+ int id = rte_dma_copy(dev_id, vchan,
+ src, dsts[j]->buf_iova + dsts[j]->data_off,
+ COPY_LEN, 0);
+ if (id < 0)
+ ERR_RETURN("Error with rte_dma_copy for buffer %u\n", j);
+ }
+ rte_dma_submit(dev_id, vchan);
+ await_hw(dev_id, vchan);
+
+ count = 0;
+ err_count = 0;
+ while (count + err_count < COMP_BURST_SZ) {
+ count += rte_dma_completed(dev_id, vchan, COMP_BURST_SZ, NULL, &error);
+ if (error) {
+ uint16_t ret = rte_dma_completed_status(dev_id, vchan, 1,
+ NULL, status);
+ if (ret != 1)
+ ERR_RETURN("Error getting error-status for completions\n");
+ err_count += ret;
+ await_hw(dev_id, vchan);
+ }
+ }
+ if (err_count != num_fail)
+ ERR_RETURN("Error: Incorrect number of failed completions received, got %u not %zu\n",
+ err_count, num_fail);
+
+ return 0;
+}
+
+static int
+test_completion_status(int16_t dev_id, uint16_t vchan, bool fence)
+{
+ const unsigned int fail[] = {0, 7, 14, 15};
+ struct rte_mbuf *srcs[COMP_BURST_SZ], *dsts[COMP_BURST_SZ];
+ unsigned int i;
+
+ for (i = 0; i < COMP_BURST_SZ; i++) {
+ srcs[i] = rte_pktmbuf_alloc(pool);
+ dsts[i] = rte_pktmbuf_alloc(pool);
+ }
+
+ for (i = 0; i < RTE_DIM(fail); i++) {
+ if (test_failure_in_full_burst(dev_id, vchan, fence, srcs, dsts, fail[i]) < 0)
+ return -1;
+
+ if (test_individual_status_query_with_failure(dev_id, vchan, fence,
+ srcs, dsts, fail[i]) < 0)
+ return -1;
+
+ /* the test runs the same whether fenced or unfenced, but there is no harm in running it twice */
+ if (test_single_item_status_query_with_failure(dev_id, vchan,
+ srcs, dsts, fail[i]) < 0)
+ return -1;
+ }
+
+ if (test_multi_failure(dev_id, vchan, srcs, dsts, fail, RTE_DIM(fail)) < 0)
+ return -1;
+
+ for (i = 0; i < COMP_BURST_SZ; i++) {
+ rte_pktmbuf_free(srcs[i]);
+ rte_pktmbuf_free(dsts[i]);
+ }
+ return 0;
+}
+
+static int
+test_completion_handling(int16_t dev_id, uint16_t vchan)
+{
+ return test_completion_status(dev_id, vchan, false) /* without fences */
+ || test_completion_status(dev_id, vchan, true); /* with fences */
+}
+
+static int
+test_enqueue_fill(int16_t dev_id, uint16_t vchan)
+{
+ const unsigned int lengths[] = {8, 64, 1024, 50, 100, 89};
+ struct rte_mbuf *dst;
+ char *dst_data;
+ uint64_t pattern = 0xfedcba9876543210;
+ unsigned int i, j;
+
+ dst = rte_pktmbuf_alloc(pool);
+ if (dst == NULL)
+ ERR_RETURN("Failed to allocate mbuf\n");
+ dst_data = rte_pktmbuf_mtod(dst, char *);
+
+ for (i = 0; i < RTE_DIM(lengths); i++) {
+ /* reset dst_data */
+ memset(dst_data, 0, rte_pktmbuf_data_len(dst));
+
+ /* perform the fill operation */
+ int id = rte_dma_fill(dev_id, vchan, pattern,
+ rte_pktmbuf_iova(dst), lengths[i], RTE_DMA_OP_FLAG_SUBMIT);
+ if (id < 0)
+ ERR_RETURN("Error with rte_dma_fill\n");
+ await_hw(dev_id, vchan);
+
+ if (rte_dma_completed(dev_id, vchan, 1, NULL, NULL) != 1)
+ ERR_RETURN("Error: fill operation failed (length: %u)\n", lengths[i]);
+ /* check the data from the fill operation is correct */
+ for (j = 0; j < lengths[i]; j++) {
+ char pat_byte = ((char *)&pattern)[j % 8];
+ if (dst_data[j] != pat_byte)
+ ERR_RETURN("Error with fill operation (lengths = %u): got (%x), not (%x)\n",
+ lengths[i], dst_data[j], pat_byte);
+ }
+ /* check that the data after the fill operation was not written to */
+ for (; j < rte_pktmbuf_data_len(dst); j++)
+ if (dst_data[j] != 0)
+ ERR_RETURN("Error, fill operation wrote too far (lengths = %u): got (%x), not (%x)\n",
+ lengths[i], dst_data[j], 0);
+ }
+
+ rte_pktmbuf_free(dst);
+ return 0;
+}
+
+static int
+test_burst_capacity(int16_t dev_id, uint16_t vchan)
+{
+#define CAP_TEST_BURST_SIZE 64
+ const int ring_space = rte_dma_burst_capacity(dev_id, vchan);
+ struct rte_mbuf *src, *dst;
+ int i, j, iter;
+ int cap, ret;
+ bool dma_err;
+
+ src = rte_pktmbuf_alloc(pool);
+ dst = rte_pktmbuf_alloc(pool);
+
+ /* to test capacity, we enqueue elements and check capacity is reduced
+ * by one each time - rebaselining the expected value after each burst
+ * as the capacity is only for a burst. We enqueue multiple bursts to
+ * fill up half the ring, before emptying it again. We do this twice to
+ * ensure that we get to test scenarios where we get ring wrap-around
+ */
+ for (iter = 0; iter < 2; iter++) {
+ for (i = 0; i < (ring_space / (2 * CAP_TEST_BURST_SIZE)) + 1; i++) {
+ cap = rte_dma_burst_capacity(dev_id, vchan);
+
+ for (j = 0; j < CAP_TEST_BURST_SIZE; j++) {
+ ret = rte_dma_copy(dev_id, vchan, rte_pktmbuf_iova(src),
+ rte_pktmbuf_iova(dst), COPY_LEN, 0);
+ if (ret < 0)
+ ERR_RETURN("Error with rte_dmadev_copy\n");
+
+ if (rte_dma_burst_capacity(dev_id, vchan) != cap - (j + 1))
+ ERR_RETURN("Error, ring capacity did not change as expected\n");
+ }
+ if (rte_dma_submit(dev_id, vchan) < 0)
+ ERR_RETURN("Error, failed to submit burst\n");
+
+ if (cap < rte_dma_burst_capacity(dev_id, vchan))
+ ERR_RETURN("Error, avail ring capacity has gone up, not down\n");
+ }
+ await_hw(dev_id, vchan);
+
+ for (i = 0; i < (ring_space / (2 * CAP_TEST_BURST_SIZE)) + 1; i++) {
+ ret = rte_dma_completed(dev_id, vchan,
+ CAP_TEST_BURST_SIZE, NULL, &dma_err);
+ if (ret != CAP_TEST_BURST_SIZE || dma_err) {
+ enum rte_dma_status_code status;
+
+ rte_dma_completed_status(dev_id, vchan, 1, NULL, &status);
+ ERR_RETURN("Error with rte_dmadev_completed, %u [expected: %u], dma_err = %d, i = %u, iter = %u, status = %u\n",
+ ret, CAP_TEST_BURST_SIZE, dma_err, i, iter, status);
+ }
+ }
+ cap = rte_dma_burst_capacity(dev_id, vchan);
+ if (cap != ring_space)
+ ERR_RETURN("Error, ring capacity has not reset to original value, got %u, expected %u\n",
+ cap, ring_space);
+ }
+
+ rte_pktmbuf_free(src);
+ rte_pktmbuf_free(dst);
+
+ return 0;
+}
+
+static int
+test_dmadev_instance(int16_t dev_id)
+{
+#define TEST_RINGSIZE 512
+#define CHECK_ERRS true
+ struct rte_dma_stats stats;
+ struct rte_dma_info info;
+ const struct rte_dma_conf conf = { .nb_vchans = 1};
+ const struct rte_dma_vchan_conf qconf = {
+ .direction = RTE_DMA_DIR_MEM_TO_MEM,
+ .nb_desc = TEST_RINGSIZE,
+ };
+ const int vchan = 0;
+ int ret;
+
+ ret = rte_dma_info_get(dev_id, &info);
+ if (ret != 0)
+ ERR_RETURN("Error with rte_dma_info_get()\n");
+
+ printf("\n### Test dmadev instance %u [%s]\n",
+ dev_id, info.dev_name);
+
+ if (info.max_vchans < 1)
+ ERR_RETURN("Error, no channels available on device id %u\n", dev_id);
+
+ if (rte_dma_configure(dev_id, &conf) != 0)
+ ERR_RETURN("Error with rte_dma_configure()\n");
+
+ if (rte_dma_vchan_setup(dev_id, vchan, &qconf) < 0)
+ ERR_RETURN("Error with queue configuration\n");
+
+ ret = rte_dma_info_get(dev_id, &info);
+ if (ret != 0 || info.nb_vchans != 1)
+ ERR_RETURN("Error, no configured queues reported on device id %u\n", dev_id);
+
+ if (rte_dma_start(dev_id) != 0)
+ ERR_RETURN("Error with rte_dma_start()\n");
+
+ if (rte_dma_stats_get(dev_id, vchan, &stats) != 0)
+ ERR_RETURN("Error with rte_dma_stats_get()\n");
+
+ if (stats.completed != 0 || stats.submitted != 0 || stats.errors != 0)
+ ERR_RETURN("Error device stats are not all zero: completed = %"PRIu64", "
+ "submitted = %"PRIu64", errors = %"PRIu64"\n",
+ stats.completed, stats.submitted, stats.errors);
+ id_count = 0;
+
+ /* create a mempool for running tests */
+ pool = rte_pktmbuf_pool_create("TEST_DMADEV_POOL",
+ TEST_RINGSIZE * 2, /* n == num elements */
+ 32, /* cache size */
+ 0, /* priv size */
+ 2048, /* data room size */
+ info.numa_node);
+ if (pool == NULL)
+ ERR_RETURN("Error with mempool creation\n");
+
+ /* run the test cases, use many iterations to ensure UINT16_MAX id wraparound */
+ if (runtest("copy", test_enqueue_copies, 640, dev_id, vchan, CHECK_ERRS) < 0)
+ goto err;
+
+ /* run some burst capacity tests */
+ if (runtest("burst capacity", test_burst_capacity, 1, dev_id, vchan, CHECK_ERRS) < 0)
+ goto err;
+
+ /* to test error handling we can provide null pointers for source or dest in copies. This
+ * requires VA mode in DPDK, since NULL(0) is a valid physical address.
+ * We also need hardware that can report errors back.
+ */
+ if (rte_eal_iova_mode() != RTE_IOVA_VA)
+ printf("DMA Dev %u: DPDK not in VA mode, skipping error handling tests\n", dev_id);
+ else if ((info.dev_capa & RTE_DMA_CAPA_HANDLES_ERRORS) == 0)
+ printf("DMA Dev %u: device does not report errors, skipping error handling tests\n",
+ dev_id);
+ else if (runtest("error handling", test_completion_handling, 1,
+ dev_id, vchan, !CHECK_ERRS) < 0)
+ goto err;
+
+ if ((info.dev_capa & RTE_DMA_CAPA_OPS_FILL) == 0)
+ printf("DMA Dev %u: No device fill support, skipping fill tests\n", dev_id);
+ else if (runtest("fill", test_enqueue_fill, 1, dev_id, vchan, CHECK_ERRS) < 0)
+ goto err;
+
+ rte_mempool_free(pool);
+ rte_dma_stop(dev_id);
+ rte_dma_stats_reset(dev_id, vchan);
+ return 0;
+
+err:
+ rte_mempool_free(pool);
+ rte_dma_stop(dev_id);
+ return -1;
+}
+
+static int
+test_apis(void)
+{
+ const char *pmd = "dma_skeleton";
+ int id;
+ int ret;
+
+ /* attempt to create skeleton instance - ignore errors due to one being already present */
+ rte_vdev_init(pmd, NULL);
+ id = rte_dma_get_dev_id_by_name(pmd);
+ if (id < 0)
+ return TEST_SKIPPED;
+ printf("\n### Test dmadev infrastructure using skeleton driver\n");
+ ret = test_dma_api(id);
+
+ return ret;
+}
+
+static int
+test_dma(void)
+{
+ int i;
+
+ /* basic sanity on dmadev infrastructure */
+ if (test_apis() < 0)
+ ERR_RETURN("Error performing API tests\n");
+
+ if (rte_dma_count_avail() == 0)
+ return TEST_SKIPPED;
+
+ RTE_DMA_FOREACH_DEV(i)
+ if (test_dmadev_instance(i) < 0)
+ ERR_RETURN("Error, test failure for device %d\n", i);
+
+ return 0;
+}
+
+REGISTER_TEST_COMMAND(dmadev_autotest, test_dma);
diff --git a/app/test/test_dmadev_api.c b/app/test/test_dmadev_api.c
new file mode 100644
index 000000000..4a181af90
--- /dev/null
+++ b/app/test/test_dmadev_api.c
@@ -0,0 +1,574 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited
+ */
+
+#include <string.h>
+
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_test.h>
+#include <rte_dmadev.h>
+
+extern int test_dma_api(uint16_t dev_id);
+
+#define DMA_TEST_API_RUN(test) \
+ testsuite_run_test(test, #test)
+
+#define TEST_MEMCPY_SIZE 1024
+#define TEST_WAIT_US_VAL 50000
+
+#define TEST_SUCCESS 0
+#define TEST_FAILED -1
+
+static int16_t test_dev_id;
+static int16_t invalid_dev_id;
+
+static char *src;
+static char *dst;
+
+static int total;
+static int passed;
+static int failed;
+
+static int
+testsuite_setup(int16_t dev_id)
+{
+ test_dev_id = dev_id;
+ invalid_dev_id = -1;
+
+ src = rte_malloc("dmadev_test_src", TEST_MEMCPY_SIZE, 0);
+ if (src == NULL)
+ return -ENOMEM;
+ dst = rte_malloc("dmadev_test_dst", TEST_MEMCPY_SIZE, 0);
+ if (dst == NULL) {
+ rte_free(src);
+ src = NULL;
+ return -ENOMEM;
+ }
+
+ total = 0;
+ passed = 0;
+ failed = 0;
+
+ /* Set dmadev log level to critical to suppress unnecessary output
+ * during API tests.
+ */
+ rte_log_set_level_pattern("lib.dmadev", RTE_LOG_CRIT);
+
+ return 0;
+}
+
+static void
+testsuite_teardown(void)
+{
+ rte_free(src);
+ src = NULL;
+ rte_free(dst);
+ dst = NULL;
+ /* Ensure the dmadev is stopped. */
+ rte_dma_stop(test_dev_id);
+
+ rte_log_set_level_pattern("lib.dmadev", RTE_LOG_INFO);
+}
+
+static void
+testsuite_run_test(int (*test)(void), const char *name)
+{
+ int ret = 0;
+
+ if (test) {
+ ret = test();
+ if (ret < 0) {
+ failed++;
+ printf("%s Failed\n", name);
+ } else {
+ passed++;
+ printf("%s Passed\n", name);
+ }
+ }
+
+ total++;
+}
+
+static int
+test_dma_get_dev_id_by_name(void)
+{
+ int ret = rte_dma_get_dev_id_by_name("invalid_dmadev_device");
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ return TEST_SUCCESS;
+}
+
+static int
+test_dma_is_valid_dev(void)
+{
+ int ret;
+ ret = rte_dma_is_valid(invalid_dev_id);
+ RTE_TEST_ASSERT(ret == false, "Expected false for invalid dev id");
+ ret = rte_dma_is_valid(test_dev_id);
+ RTE_TEST_ASSERT(ret == true, "Expected true for valid dev id");
+ return TEST_SUCCESS;
+}
+
+static int
+test_dma_count(void)
+{
+ uint16_t count = rte_dma_count_avail();
+ RTE_TEST_ASSERT(count > 0, "Invalid dmadev count %u", count);
+ return TEST_SUCCESS;
+}
+
+static int
+test_dma_info_get(void)
+{
+ struct rte_dma_info info = { 0 };
+ int ret;
+
+ ret = rte_dma_info_get(invalid_dev_id, &info);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ ret = rte_dma_info_get(test_dev_id, NULL);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ ret = rte_dma_info_get(test_dev_id, &info);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to obtain device info");
+
+ return TEST_SUCCESS;
+}
+
+static int
+test_dma_configure(void)
+{
+ struct rte_dma_conf conf = { 0 };
+ struct rte_dma_info info = { 0 };
+ int ret;
+
+ /* Check for invalid parameters */
+ ret = rte_dma_configure(invalid_dev_id, &conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ ret = rte_dma_configure(test_dev_id, NULL);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Check for nb_vchans == 0 */
+ memset(&conf, 0, sizeof(conf));
+ ret = rte_dma_configure(test_dev_id, &conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Check for conf.nb_vchans > info.max_vchans */
+ ret = rte_dma_info_get(test_dev_id, &info);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to obtain device info");
+ memset(&conf, 0, sizeof(conf));
+ conf.nb_vchans = info.max_vchans + 1;
+ ret = rte_dma_configure(test_dev_id, &conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Check enable silent mode */
+ memset(&conf, 0, sizeof(conf));
+ conf.nb_vchans = info.max_vchans;
+ conf.enable_silent = true;
+ ret = rte_dma_configure(test_dev_id, &conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Configure success */
+ memset(&conf, 0, sizeof(conf));
+ conf.nb_vchans = info.max_vchans;
+ ret = rte_dma_configure(test_dev_id, &conf);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to configure dmadev, %d", ret);
+
+ /* Check configure success */
+ ret = rte_dma_info_get(test_dev_id, &info);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to obtain device info");
+ RTE_TEST_ASSERT_EQUAL(conf.nb_vchans, info.nb_vchans,
+ "Configure nb_vchans not match");
+
+ return TEST_SUCCESS;
+}
+
+static int
+check_direction(void)
+{
+ struct rte_dma_vchan_conf vchan_conf;
+ int ret;
+
+ /* Check for direction */
+ memset(&vchan_conf, 0, sizeof(vchan_conf));
+ vchan_conf.direction = RTE_DMA_DIR_DEV_TO_DEV + 1;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ vchan_conf.direction = RTE_DMA_DIR_MEM_TO_MEM - 1;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Check for direction and dev_capa combination */
+ memset(&vchan_conf, 0, sizeof(vchan_conf));
+ vchan_conf.direction = RTE_DMA_DIR_MEM_TO_DEV;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ vchan_conf.direction = RTE_DMA_DIR_DEV_TO_MEM;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ vchan_conf.direction = RTE_DMA_DIR_DEV_TO_DEV;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ return 0;
+}
+
+static int
+check_port_type(struct rte_dma_info *dev_info)
+{
+ struct rte_dma_vchan_conf vchan_conf;
+ int ret;
+
+ /* Check src port type validation */
+ memset(&vchan_conf, 0, sizeof(vchan_conf));
+ vchan_conf.direction = RTE_DMA_DIR_MEM_TO_MEM;
+ vchan_conf.nb_desc = dev_info->min_desc;
+ vchan_conf.src_port.port_type = RTE_DMA_PORT_PCIE;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Check dst port type validation */
+ memset(&vchan_conf, 0, sizeof(vchan_conf));
+ vchan_conf.direction = RTE_DMA_DIR_MEM_TO_MEM;
+ vchan_conf.nb_desc = dev_info->min_desc;
+ vchan_conf.dst_port.port_type = RTE_DMA_PORT_PCIE;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ return 0;
+}
+
+static int
+test_dma_vchan_setup(void)
+{
+ struct rte_dma_vchan_conf vchan_conf = { 0 };
+ struct rte_dma_conf dev_conf = { 0 };
+ struct rte_dma_info dev_info = { 0 };
+ int ret;
+
+ /* Check for invalid parameters */
+ ret = rte_dma_vchan_setup(invalid_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ ret = rte_dma_vchan_setup(test_dev_id, 0, NULL);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Make sure configure success */
+ ret = rte_dma_info_get(test_dev_id, &dev_info);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to obtain device info");
+ dev_conf.nb_vchans = dev_info.max_vchans;
+ ret = rte_dma_configure(test_dev_id, &dev_conf);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to configure dmadev, %d", ret);
+
+ /* Check for invalid vchan */
+ ret = rte_dma_vchan_setup(test_dev_id, dev_conf.nb_vchans, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Check for direction */
+ ret = check_direction();
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to check direction");
+
+ /* Check for nb_desc validation */
+ memset(&vchan_conf, 0, sizeof(vchan_conf));
+ vchan_conf.direction = RTE_DMA_DIR_MEM_TO_MEM;
+ vchan_conf.nb_desc = dev_info.min_desc - 1;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ vchan_conf.nb_desc = dev_info.max_desc + 1;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Check port type */
+ ret = check_port_type(&dev_info);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to check port type");
+
+ /* Check vchan setup success */
+ memset(&vchan_conf, 0, sizeof(vchan_conf));
+ vchan_conf.direction = RTE_DMA_DIR_MEM_TO_MEM;
+ vchan_conf.nb_desc = dev_info.min_desc;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to setup vchan, %d", ret);
+
+ return TEST_SUCCESS;
+}
+
+static int
+setup_one_vchan(void)
+{
+ struct rte_dma_vchan_conf vchan_conf = { 0 };
+ struct rte_dma_info dev_info = { 0 };
+ struct rte_dma_conf dev_conf = { 0 };
+ int ret;
+
+ ret = rte_dma_info_get(test_dev_id, &dev_info);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to obtain device info, %d", ret);
+ dev_conf.nb_vchans = dev_info.max_vchans;
+ ret = rte_dma_configure(test_dev_id, &dev_conf);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to configure, %d", ret);
+ vchan_conf.direction = RTE_DMA_DIR_MEM_TO_MEM;
+ vchan_conf.nb_desc = dev_info.min_desc;
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to setup vchan, %d", ret);
+
+ return TEST_SUCCESS;
+}
+
+static int
+test_dma_start_stop(void)
+{
+ struct rte_dma_vchan_conf vchan_conf = { 0 };
+ struct rte_dma_conf dev_conf = { 0 };
+ int ret;
+
+ /* Check for invalid parameters */
+ ret = rte_dma_start(invalid_dev_id);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ ret = rte_dma_stop(invalid_dev_id);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Setup one vchan for later test */
+ ret = setup_one_vchan();
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to setup one vchan, %d", ret);
+
+ ret = rte_dma_start(test_dev_id);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to start, %d", ret);
+
+ /* Check reconfigure and vchan setup when device started */
+ ret = rte_dma_configure(test_dev_id, &dev_conf);
+ RTE_TEST_ASSERT(ret == -EBUSY, "Failed to configure, %d", ret);
+ ret = rte_dma_vchan_setup(test_dev_id, 0, &vchan_conf);
+ RTE_TEST_ASSERT(ret == -EBUSY, "Failed to setup vchan, %d", ret);
+
+ ret = rte_dma_stop(test_dev_id);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to stop, %d", ret);
+
+ return TEST_SUCCESS;
+}
+
+static int
+test_dma_stats(void)
+{
+ struct rte_dma_info dev_info = { 0 };
+ struct rte_dma_stats stats = { 0 };
+ int ret;
+
+ /* Check for invalid parameters */
+ ret = rte_dma_stats_get(invalid_dev_id, 0, &stats);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ ret = rte_dma_stats_get(invalid_dev_id, 0, NULL);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ ret = rte_dma_stats_reset(invalid_dev_id, 0);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Setup one vchan for later test */
+ ret = setup_one_vchan();
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to setup one vchan, %d", ret);
+
+ /* Check for invalid vchan */
+ ret = rte_dma_info_get(test_dev_id, &dev_info);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to obtain device info, %d", ret);
+ ret = rte_dma_stats_get(test_dev_id, dev_info.max_vchans, &stats);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ ret = rte_dma_stats_reset(test_dev_id, dev_info.max_vchans);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ /* Check for valid vchan */
+ ret = rte_dma_stats_get(test_dev_id, 0, &stats);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to get stats, %d", ret);
+ ret = rte_dma_stats_get(test_dev_id, RTE_DMA_ALL_VCHAN, &stats);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to get all stats, %d", ret);
+ ret = rte_dma_stats_reset(test_dev_id, 0);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to reset stats, %d", ret);
+ ret = rte_dma_stats_reset(test_dev_id, RTE_DMA_ALL_VCHAN);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to reset all stats, %d", ret);
+
+ return TEST_SUCCESS;
+}
+
+static int
+test_dma_dump(void)
+{
+ int ret;
+
+ /* Check for invalid parameters */
+ ret = rte_dma_dump(invalid_dev_id, stderr);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+ ret = rte_dma_dump(test_dev_id, NULL);
+ RTE_TEST_ASSERT(ret == -EINVAL, "Expected -EINVAL, %d", ret);
+
+ return TEST_SUCCESS;
+}
+
+static void
+setup_memory(void)
+{
+ int i;
+
+ for (i = 0; i < TEST_MEMCPY_SIZE; i++)
+ src[i] = (char)i;
+ memset(dst, 0, TEST_MEMCPY_SIZE);
+}
+
+static int
+verify_memory(void)
+{
+ int i;
+
+ for (i = 0; i < TEST_MEMCPY_SIZE; i++) {
+ if (src[i] == dst[i])
+ continue;
+ RTE_TEST_ASSERT_EQUAL(src[i], dst[i],
+ "Failed to copy memory, %d %d", src[i], dst[i]);
+ }
+
+ return 0;
+}
+
+static int
+test_dma_completed(void)
+{
+ uint16_t last_idx = 1;
+ bool has_error = true;
+ uint16_t cpl_ret;
+ int ret;
+
+ /* Setup one vchan for later test */
+ ret = setup_one_vchan();
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to setup one vchan, %d", ret);
+
+ ret = rte_dma_start(test_dev_id);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to start, %d", ret);
+
+ setup_memory();
+
+ /* Check enqueue without submit */
+ ret = rte_dma_copy(test_dev_id, 0, (rte_iova_t)src, (rte_iova_t)dst,
+ TEST_MEMCPY_SIZE, 0);
+ RTE_TEST_ASSERT_EQUAL(ret, 0, "Failed to enqueue copy, %d", ret);
+ rte_delay_us_sleep(TEST_WAIT_US_VAL);
+ cpl_ret = rte_dma_completed(test_dev_id, 0, 1, &last_idx, &has_error);
+ RTE_TEST_ASSERT_EQUAL(cpl_ret, 0, "Failed to get completed");
+
+ /* Check add submit */
+ ret = rte_dma_submit(test_dev_id, 0);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to submit, %d", ret);
+ rte_delay_us_sleep(TEST_WAIT_US_VAL);
+ cpl_ret = rte_dma_completed(test_dev_id, 0, 1, &last_idx, &has_error);
+ RTE_TEST_ASSERT_EQUAL(cpl_ret, 1, "Failed to get completed");
+ RTE_TEST_ASSERT_EQUAL(last_idx, 0, "Last idx should be zero, %u",
+ last_idx);
+ RTE_TEST_ASSERT_EQUAL(has_error, false, "Should have no error");
+ ret = verify_memory();
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to verify memory");
+
+ setup_memory();
+
+ /* Check for enqueue with submit */
+ ret = rte_dma_copy(test_dev_id, 0, (rte_iova_t)src, (rte_iova_t)dst,
+ TEST_MEMCPY_SIZE, RTE_DMA_OP_FLAG_SUBMIT);
+ RTE_TEST_ASSERT_EQUAL(ret, 1, "Failed to enqueue copy, %d", ret);
+ rte_delay_us_sleep(TEST_WAIT_US_VAL);
+ cpl_ret = rte_dma_completed(test_dev_id, 0, 1, &last_idx, &has_error);
+ RTE_TEST_ASSERT_EQUAL(cpl_ret, 1, "Failed to get completed");
+ RTE_TEST_ASSERT_EQUAL(last_idx, 1, "Last idx should be 1, %u",
+ last_idx);
+ RTE_TEST_ASSERT_EQUAL(has_error, false, "Should have no error");
+ ret = verify_memory();
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to verify memory");
+
+ /* Stop dmadev to make sure the dmadev is in a known state */
+ ret = rte_dma_stop(test_dev_id);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to stop, %d", ret);
+
+ return TEST_SUCCESS;
+}
+
+static int
+test_dma_completed_status(void)
+{
+ enum rte_dma_status_code status[1] = { 1 };
+ uint16_t last_idx = 1;
+ uint16_t cpl_ret, i;
+ int ret;
+
+ /* Setup one vchan for later test */
+ ret = setup_one_vchan();
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to setup one vchan, %d", ret);
+
+ ret = rte_dma_start(test_dev_id);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to start, %d", ret);
+
+ /* Check for enqueue with submit */
+ ret = rte_dma_copy(test_dev_id, 0, (rte_iova_t)src, (rte_iova_t)dst,
+ TEST_MEMCPY_SIZE, RTE_DMA_OP_FLAG_SUBMIT);
+ RTE_TEST_ASSERT_EQUAL(ret, 0, "Failed to enqueue copy, %d", ret);
+ rte_delay_us_sleep(TEST_WAIT_US_VAL);
+ cpl_ret = rte_dma_completed_status(test_dev_id, 0, 1, &last_idx,
+ status);
+ RTE_TEST_ASSERT_EQUAL(cpl_ret, 1, "Failed to completed status");
+ RTE_TEST_ASSERT_EQUAL(last_idx, 0, "Last idx should be zero, %u",
+ last_idx);
+ for (i = 0; i < RTE_DIM(status); i++)
+ RTE_TEST_ASSERT_EQUAL(status[i], 0,
+ "Failed to completed status, %d", status[i]);
+
+ /* Check do completed status again */
+ cpl_ret = rte_dma_completed_status(test_dev_id, 0, 1, &last_idx,
+ status);
+ RTE_TEST_ASSERT_EQUAL(cpl_ret, 0, "Failed to completed status");
+
+ /* Check for enqueue with submit again */
+ ret = rte_dma_copy(test_dev_id, 0, (rte_iova_t)src, (rte_iova_t)dst,
+ TEST_MEMCPY_SIZE, RTE_DMA_OP_FLAG_SUBMIT);
+ RTE_TEST_ASSERT_EQUAL(ret, 1, "Failed to enqueue copy, %d", ret);
+ rte_delay_us_sleep(TEST_WAIT_US_VAL);
+ cpl_ret = rte_dma_completed_status(test_dev_id, 0, 1, &last_idx,
+ status);
+ RTE_TEST_ASSERT_EQUAL(cpl_ret, 1, "Failed to completed status");
+ RTE_TEST_ASSERT_EQUAL(last_idx, 1, "Last idx should be 1, %u",
+ last_idx);
+ for (i = 0; i < RTE_DIM(status); i++)
+ RTE_TEST_ASSERT_EQUAL(status[i], 0,
+ "Failed to completed status, %d", status[i]);
+
+ /* Stop dmadev to make sure the dmadev is in a known state */
+ ret = rte_dma_stop(test_dev_id);
+ RTE_TEST_ASSERT_SUCCESS(ret, "Failed to stop, %d", ret);
+
+ return TEST_SUCCESS;
+}
+
+int
+test_dma_api(uint16_t dev_id)
+{
+ int ret = testsuite_setup(dev_id);
+ if (ret) {
+ printf("testsuite setup fail!\n");
+ return -1;
+ }
+
+ /* If the testcases exit successfully, ensure that the test dmadev exists
+ * and that the dmadev is in the stopped state.
+ */
+ DMA_TEST_API_RUN(test_dma_get_dev_id_by_name);
+ DMA_TEST_API_RUN(test_dma_is_valid_dev);
+ DMA_TEST_API_RUN(test_dma_count);
+ DMA_TEST_API_RUN(test_dma_info_get);
+ DMA_TEST_API_RUN(test_dma_configure);
+ DMA_TEST_API_RUN(test_dma_vchan_setup);
+ DMA_TEST_API_RUN(test_dma_start_stop);
+ DMA_TEST_API_RUN(test_dma_stats);
+ DMA_TEST_API_RUN(test_dma_dump);
+ DMA_TEST_API_RUN(test_dma_completed);
+ DMA_TEST_API_RUN(test_dma_completed_status);
+
+ testsuite_teardown();
+
+ printf("Total tests : %d\n", total);
+ printf("Passed : %d\n", passed);
+ printf("Failed : %d\n", failed);
+
+ if (failed)
+ return -1;
+
+ return 0;
+};
diff --git a/app/test/test_dmadev_api.h b/app/test/test_dmadev_api.h
new file mode 100644
index 000000000..33fbc5bd4
--- /dev/null
+++ b/app/test/test_dmadev_api.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited
+ */
+
+int test_dma_api(uint16_t dev_id);
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
index 748514e24..8dc353300 100644
--- a/doc/api/doxy-api-index.md
+++ b/doc/api/doxy-api-index.md
@@ -21,6 +21,7 @@ The public API headers are grouped by topics:
[compressdev] (@ref rte_compressdev.h),
[compress] (@ref rte_comp.h),
[regexdev] (@ref rte_regexdev.h),
+ [dmadev] (@ref rte_dmadev.h),
[eventdev] (@ref rte_eventdev.h),
[event_eth_rx_adapter] (@ref rte_event_eth_rx_adapter.h),
[event_eth_tx_adapter] (@ref rte_event_eth_tx_adapter.h),
diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
index 5c883b613..3b2c53426 100644
--- a/doc/api/doxy-api.conf.in
+++ b/doc/api/doxy-api.conf.in
@@ -35,6 +35,7 @@ INPUT = @TOPDIR@/doc/api/doxy-api-index.md \
@TOPDIR@/lib/librte_compressdev \
@TOPDIR@/lib/librte_cryptodev \
@TOPDIR@/lib/librte_distributor \
+ @TOPDIR@/lib/librte_dmadev \
@TOPDIR@/lib/librte_efd \
@TOPDIR@/lib/librte_ethdev \
@TOPDIR@/lib/librte_eventdev \
diff --git a/doc/guides/dmadevs/index.rst b/doc/guides/dmadevs/index.rst
new file mode 100644
index 000000000..0bce29d76
--- /dev/null
+++ b/doc/guides/dmadevs/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+ Copyright 2021 HiSilicon Limited
+
+DMA Device Drivers
+==================
+
+The following is a list of DMA device drivers, which can be used from
+an application through the DMA API.
+
+.. toctree::
+ :maxdepth: 2
+ :numbered:
diff --git a/doc/guides/index.rst b/doc/guides/index.rst
index 857f0363d..919825992 100644
--- a/doc/guides/index.rst
+++ b/doc/guides/index.rst
@@ -21,6 +21,7 @@ DPDK documentation
compressdevs/index
vdpadevs/index
regexdevs/index
+ dmadevs/index
eventdevs/index
rawdevs/index
mempool/index
diff --git a/doc/guides/prog_guide/dmadev.rst b/doc/guides/prog_guide/dmadev.rst
new file mode 100644
index 000000000..32f714786
--- /dev/null
+++ b/doc/guides/prog_guide/dmadev.rst
@@ -0,0 +1,90 @@
+.. SPDX-License-Identifier: BSD-3-Clause
+ Copyright 2021 HiSilicon Limited
+
+DMA Device Library
+==================
+
+The DMA library provides a DMA device framework for management and provisioning
+of hardware and software DMA poll mode drivers, defining a generic API which
+supports a number of different DMA operations.
+
+
+Design Principles
+-----------------
+
+The DMA framework provides a generic DMA device framework which supports both
+physical (hardware) and virtual (software) DMA devices, as well as a generic DMA
+API which allows DMA devices to be managed and configured, and DMA operations
+to be provisioned on a DMA poll mode driver.
+
+.. _figure_dmadev:
+
+.. figure:: img/dmadev.*
+
+The above figure shows the model on which the DMA framework is built:
+
+ * The DMA controller can have multiple hardware DMA channels (a.k.a. hardware
+ DMA queues); each hardware DMA channel should be represented by a dmadev.
+ * The dmadev can create multiple virtual DMA channels, where each virtual DMA
+ channel represents a different transfer context.
+ * DMA operation requests must be submitted to a virtual DMA channel.
+
+
+Device Management
+-----------------
+
+Device Creation
+~~~~~~~~~~~~~~~
+
+Physical DMA controllers are discovered during the PCI probe/enumeration phase
+performed by the EAL at DPDK initialization, based on their PCI BDF (bus/bridge,
+device, function). Specific physical DMA controllers, like other physical
+devices in DPDK, can be listed using the EAL command line options.
+
+The dmadevs are dynamically allocated by using the function
+``rte_dma_pmd_allocate`` based on the number of hardware DMA channels.
+
+
+Device Identification
+~~~~~~~~~~~~~~~~~~~~~
+
+Each DMA device, whether physical or virtual, is uniquely designated by two
+identifiers:
+
+- A unique device index used to designate the DMA device in all functions
+ exported by the DMA API.
+
+- A device name used to designate the DMA device in console messages, for
+ administration or debugging purposes.
+
+
+Device Features and Capabilities
+--------------------------------
+
+DMA devices may support different feature sets. The ``rte_dma_info_get`` API
+can be used to get the device info and supported features.
+
+Silent mode is a special device capability which does not require the
+application to invoke dequeue APIs.
+
+
+Enqueue / Dequeue APIs
+~~~~~~~~~~~~~~~~~~~~~~
+
+Enqueue APIs such as ``rte_dma_copy`` and ``rte_dma_fill`` can be used to
+enqueue operations to hardware. If an enqueue is successful, a ``ring_idx`` is
+returned. This ``ring_idx`` can be used by applications to track per operation
+metadata in an application-defined circular ring.
+
+The ``rte_dma_submit`` API is used to issue a doorbell to the hardware.
+Alternatively, the ``RTE_DMA_OP_FLAG_SUBMIT`` flag can be passed to the enqueue
+APIs to also issue the doorbell to hardware.
+
+There are two dequeue APIs, ``rte_dma_completed`` and
+``rte_dma_completed_status``, which are used to obtain the results of
+enqueue requests. ``rte_dma_completed`` will return the number of successfully
+completed operations. ``rte_dma_completed_status`` will return the number of
+completed operations along with the status of each operation (filled into the
+``status`` array passed by the user). Both APIs can also return the last
+completed operation's ``ring_idx``, which helps the user track operations within
+their own application-defined rings.
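
To make the Enqueue / Dequeue description above concrete, here is a minimal
data-path sketch. It is not part of the patch: dma_copy_sync() is a
hypothetical helper, built only from APIs introduced here, and it assumes the
device and vchan were already configured and started:

#include <errno.h>
#include <stdbool.h>

#include <rte_dmadev.h>
#include <rte_pause.h>

/* Hypothetical helper: enqueue one mem-to-mem copy and poll until it is
 * reported complete.
 */
static int
dma_copy_sync(int16_t dev_id, uint16_t vchan,
		rte_iova_t src, rte_iova_t dst, uint32_t length)
{
	uint16_t last_idx;
	bool has_error = false;
	int ring_idx;

	/* enqueue the copy and ring the doorbell in one call */
	ring_idx = rte_dma_copy(dev_id, vchan, src, dst, length,
			RTE_DMA_OP_FLAG_SUBMIT);
	if (ring_idx < 0)
		return ring_idx;	/* e.g. the descriptor ring is full */

	/* poll for the completion; a real application would bound this
	 * loop with a timeout
	 */
	while (rte_dma_completed(dev_id, vchan, 1, &last_idx, &has_error) == 0)
		rte_pause();

	return has_error ? -EIO : 0;
}

In a real application the ring_idx returned by the enqueue would typically be
used to index an application-defined ring of per-operation metadata, rather
than polling each job synchronously as above.
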
diff --git a/doc/guides/prog_guide/img/dmadev.svg b/doc/guides/prog_guide/img/dmadev.svg
new file mode 100644
index 000000000..157d7eb7d
--- /dev/null
+++ b/doc/guides/prog_guide/img/dmadev.svg
@@ -0,0 +1,283 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<!-- SPDX-License-Identifier: BSD-3-Clause -->
+<!-- Copyright(c) 2021 HiSilicon Limited -->
+
+<svg
+ width="128.64288mm"
+ height="95.477707mm"
+ viewBox="0 0 192.96433 143.21656"
+ version="1.1"
+ id="svg934"
+ inkscape:version="1.1 (c68e22c387, 2021-05-23)"
+ sodipodi:docname="dmadev.svg"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:svg="http://www.w3.org/2000/svg">
+ <sodipodi:namedview
+ id="namedview936"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageshadow="2"
+ inkscape:pageopacity="0.0"
+ inkscape:pagecheckerboard="0"
+ inkscape:document-units="mm"
+ showgrid="false"
+ fit-margin-top="0"
+ fit-margin-left="0"
+ fit-margin-right="0"
+ fit-margin-bottom="0"
+ inkscape:showpageshadow="false"
+ inkscape:zoom="1.332716"
+ inkscape:cx="335.03011"
+ inkscape:cy="143.69152"
+ inkscape:window-width="1920"
+ inkscape:window-height="976"
+ inkscape:window-x="-8"
+ inkscape:window-y="-8"
+ inkscape:window-maximized="1"
+ inkscape:current-layer="layer1"
+ scale-x="1.5"
+ units="mm" />
+ <defs
+ id="defs931">
+ <rect
+ x="342.43954"
+ y="106.56832"
+ width="58.257381"
+ height="137.82834"
+ id="rect17873" />
+ </defs>
+ <g
+ inkscape:label="Layer 1"
+ inkscape:groupmode="layer"
+ id="layer1"
+ transform="translate(-0.13857517,-21.527306)">
+ <rect
+ style="fill:#c9c9ff;fill-opacity:1;stroke-width:0.296755"
+ id="rect31-9"
+ width="50"
+ height="28"
+ x="0.13857517"
+ y="21.527306"
+ ry="0" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-weight:normal;font-size:7.05556px;line-height:1.25;font-family:sans-serif;white-space:pre;inline-size:70.1114;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+ x="54.136707"
+ y="18.045568"
+ id="text803-1"
+ transform="translate(-49.110795,15.205683)"><tspan
+ x="54.136707"
+ y="18.045568"
+ id="tspan1045">virtual DMA </tspan><tspan
+ x="54.136707"
+ y="26.865018"
+ id="tspan1047">channel</tspan></text>
+ <rect
+ style="fill:#c9c9ff;fill-opacity:1;stroke-width:0.296755"
+ id="rect31-9-5"
+ width="50"
+ height="28"
+ x="60.138577"
+ y="21.527306"
+ ry="0" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-weight:normal;font-size:7.05556px;line-height:1.25;font-family:sans-serif;white-space:pre;inline-size:70.1114;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+ x="54.136707"
+ y="18.045568"
+ id="text803-1-4"
+ transform="translate(10.512565,15.373298)"><tspan
+ x="54.136707"
+ y="18.045568"
+ id="tspan1049">virtual DMA </tspan><tspan
+ x="54.136707"
+ y="26.865018"
+ id="tspan1051">channel</tspan></text>
+ <rect
+ style="fill:#c9c9ff;fill-opacity:1;stroke-width:0.296755"
+ id="rect31-9-5-3"
+ width="50"
+ height="28"
+ x="137.43863"
+ y="21.527306"
+ ry="0" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-weight:normal;font-size:7.05556px;line-height:1.25;font-family:sans-serif;white-space:pre;inline-size:70.1114;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+ x="54.136707"
+ y="18.045568"
+ id="text803-1-4-8"
+ transform="translate(88.79231,15.373299)"><tspan
+ x="54.136707"
+ y="18.045568"
+ id="tspan1053">virtual DMA </tspan><tspan
+ x="54.136707"
+ y="26.865018"
+ id="tspan1055">channel</tspan></text>
+ <text
+ xml:space="preserve"
+ transform="matrix(0.26458333,0,0,0.26458333,-0.04940429,21.408845)"
+ id="text17871"
+ style="font-style:normal;font-weight:normal;font-size:40px;line-height:1.25;font-family:sans-serif;white-space:pre;shape-inside:url(#rect17873);fill:#000000;fill-opacity:1;stroke:none" />
+ <rect
+ style="fill:#c9c9ff;fill-opacity:1;stroke-width:0.218145"
+ id="rect31-9-5-8"
+ width="38.34557"
+ height="19.729115"
+ x="36.138577"
+ y="64.827354"
+ ry="0" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-weight:normal;font-size:7.05556px;line-height:1.25;font-family:sans-serif;white-space:pre;inline-size:70.1114;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+ x="54.136707"
+ y="18.045568"
+ id="text803-1-4-3"
+ transform="translate(-13.394978,59.135217)"><tspan
+ x="54.136707"
+ y="18.045568"
+ id="tspan1057">dmadev</tspan></text>
+ <rect
+ style="fill:#c9c9ff;fill-opacity:1;stroke-width:0.307089"
+ id="rect31-9-5-8-0"
+ width="60.902534"
+ height="24.616455"
+ x="25.196909"
+ y="98.47744"
+ ry="0" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-weight:normal;font-size:7.05556px;line-height:1.25;font-family:sans-serif;white-space:pre;inline-size:70.1114;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+ x="54.136707"
+ y="18.045568"
+ id="text803-1-4-3-76"
+ transform="translate(-24.485484,90.97883)"><tspan
+ x="54.136707"
+ y="18.045568"
+ id="tspan1059">hardware DMA </tspan><tspan
+ x="54.136707"
+ y="26.865018"
+ id="tspan1061">channel</tspan></text>
+ <rect
+ style="fill:#c9c9ff;fill-opacity:1;stroke-width:0.307089"
+ id="rect31-9-5-8-0-6"
+ width="60.902534"
+ height="24.616455"
+ x="132.20036"
+ y="98.47744"
+ ry="0" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-weight:normal;font-size:7.05556px;line-height:1.25;font-family:sans-serif;white-space:pre;inline-size:70.1114;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+ x="54.136707"
+ y="18.045568"
+ id="text803-1-4-3-76-7"
+ transform="translate(82.950904,90.79085)"><tspan
+ x="54.136707"
+ y="18.045568"
+ id="tspan1063">hardware DMA </tspan><tspan
+ x="54.136707"
+ y="26.865018"
+ id="tspan1065">channel</tspan></text>
+ <rect
+ style="fill:#c9c9ff;fill-opacity:1;stroke-width:0.307089"
+ id="rect31-9-5-8-0-4"
+ width="60.902534"
+ height="24.616455"
+ x="76.810928"
+ y="140.12741"
+ ry="0" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-weight:normal;font-size:7.05556px;line-height:1.25;font-family:sans-serif;white-space:pre;inline-size:70.1114;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+ x="54.136707"
+ y="18.045568"
+ id="text803-1-4-3-76-4"
+ transform="translate(27.032341,133.10574)"><tspan
+ x="54.136707"
+ y="18.045568"
+ id="tspan1067">hardware DMA </tspan><tspan
+ x="54.136707"
+ y="26.865018"
+ id="tspan1069">controller</tspan></text>
+ <rect
+ style="fill:#c9c9ff;fill-opacity:1;stroke-width:0.218145"
+ id="rect31-9-5-8-5"
+ width="38.34557"
+ height="19.729115"
+ x="143.43863"
+ y="64.827354"
+ ry="0" />
+ <text
+ xml:space="preserve"
+ style="font-style:normal;font-weight:normal;font-size:7.05556px;line-height:1.25;font-family:sans-serif;white-space:pre;inline-size:70.1114;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583"
+ x="54.136707"
+ y="18.045568"
+ id="text803-1-4-3-7"
+ transform="translate(94.92597,59.664385)"><tspan
+ x="54.136707"
+ y="18.045568"
+ id="tspan1071">dmadev</tspan></text>
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="M 74.476373,49.527306 62.82407,64.827354"
+ id="path45308"
+ inkscape:connector-type="polyline"
+ inkscape:connector-curvature="0"
+ inkscape:connection-start="#rect31-9-5"
+ inkscape:connection-end="#rect31-9-5-8" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="M 35.924309,49.527306 47.711612,64.827354"
+ id="path45310"
+ inkscape:connector-type="polyline"
+ inkscape:connector-curvature="0"
+ inkscape:connection-start="#rect31-9"
+ inkscape:connection-end="#rect31-9-5-8" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="M 55.403414,84.556469 55.53332,98.47744"
+ id="path45312"
+ inkscape:connector-type="polyline"
+ inkscape:connector-curvature="0"
+ inkscape:connection-start="#rect31-9-5-8"
+ inkscape:connection-end="#rect31-9-5-8-0" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 162.62241,84.556469 0.0155,13.920971"
+ id="path45320"
+ inkscape:connector-type="polyline"
+ inkscape:connector-curvature="0"
+ inkscape:connection-start="#rect31-9-5-8-5"
+ inkscape:connection-end="#rect31-9-5-8-0-6" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 146.28317,123.09389 -22.65252,17.03352"
+ id="path45586"
+ inkscape:connector-type="polyline"
+ inkscape:connector-curvature="0"
+ inkscape:connection-start="#rect31-9-5-8-0-6"
+ inkscape:connection-end="#rect31-9-5-8-0-4" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 70.900938,123.09389 21.108496,17.03352"
+ id="path45588"
+ inkscape:connector-type="polyline"
+ inkscape:connector-curvature="0"
+ inkscape:connection-start="#rect31-9-5-8-0"
+ inkscape:connection-end="#rect31-9-5-8-0-4" />
+ <path
+ style="fill:none;fill-rule:evenodd;stroke:#000000;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
+ d="m 162.50039,49.527306 0.0675,15.300048"
+ id="path45956"
+ inkscape:connector-type="polyline"
+ inkscape:connector-curvature="0"
+ inkscape:connection-start="#rect31-9-5-3"
+ inkscape:connection-end="#rect31-9-5-8-5" />
+ </g>
+</svg>
diff --git a/doc/guides/prog_guide/index.rst b/doc/guides/prog_guide/index.rst
index 45c7dec88..dc60db9be 100644
--- a/doc/guides/prog_guide/index.rst
+++ b/doc/guides/prog_guide/index.rst
@@ -27,6 +27,7 @@ Programmer's Guide
cryptodev_lib
compressdev
regexdev
+ dmadev
rte_security
rawdev
link_bonding_poll_mode_drv_lib
diff --git a/drivers/dma/hisilicon/hisi_dmadev.c b/drivers/dma/hisilicon/hisi_dmadev.c
new file mode 100644
index 000000000..cf5bc6dc9
--- /dev/null
+++ b/drivers/dma/hisilicon/hisi_dmadev.c
@@ -0,0 +1,925 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2021 HiSilicon Limited
+ */
+
+#include <inttypes.h>
+#include <string.h>
+
+#include <rte_bus_pci.h>
+#include <rte_cycles.h>
+#include <rte_eal.h>
+#include <rte_io.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_memzone.h>
+#include <rte_pci.h>
+#include <rte_dmadev_pmd.h>
+
+#include "hisi_dmadev.h"
+
+RTE_LOG_REGISTER(hisi_dma_logtype, pmd.dma.hisi, INFO);
+#define HISI_DMA_LOG(level, fmt, args...) \
+ rte_log(RTE_LOG_ ## level, hisi_dma_logtype, \
+ "%s(): " fmt "\n", __func__, ##args)
+#define HISI_DMA_LOG_RAW(hw, level, fmt, args...) \
+ rte_log(RTE_LOG_ ## level, hisi_dma_logtype, \
+ "%s %s(): " fmt "\n", (hw)->data->dev_name, \
+ __func__, ##args)
+#define HISI_DMA_DEBUG(hw, fmt, args...) \
+ HISI_DMA_LOG_RAW(hw, DEBUG, fmt, ## args)
+#define HISI_DMA_INFO(hw, fmt, args...) \
+ HISI_DMA_LOG_RAW(hw, INFO, fmt, ## args)
+#define HISI_DMA_WARN(hw, fmt, args...) \
+ HISI_DMA_LOG_RAW(hw, WARNING, fmt, ## args)
+#define HISI_DMA_ERR(hw, fmt, args...) \
+ HISI_DMA_LOG_RAW(hw, ERR, fmt, ## args)
+
+static uint32_t
+hisi_dma_queue_base(struct hisi_dma_dev *hw)
+{
+ if (hw->reg_layout == HISI_DMA_REG_LAYOUT_HIP08)
+ return HISI_DMA_HIP08_QUEUE_BASE;
+ else
+ return 0;
+}
+
+static volatile void *
+hisi_dma_queue_regaddr(struct hisi_dma_dev *hw, uint32_t qoff)
+{
+ uint32_t off = hisi_dma_queue_base(hw) +
+ hw->queue_id * HISI_DMA_QUEUE_REGION_SIZE + qoff;
+ return (volatile void *)((char *)hw->io_base + off);
+}
+
+static void
+hisi_dma_write_reg(void *base, uint32_t off, uint32_t val)
+{
+ rte_write32(rte_cpu_to_le_32(val),
+ (volatile void *)((char *)base + off));
+}
+
+static void
+hisi_dma_write_dev(struct hisi_dma_dev *hw, uint32_t off, uint32_t val)
+{
+ hisi_dma_write_reg(hw->io_base, off, val);
+}
+
+static void
+hisi_dma_write_queue(struct hisi_dma_dev *hw, uint32_t qoff, uint32_t val)
+{
+ uint32_t off = hisi_dma_queue_base(hw) +
+ hw->queue_id * HISI_DMA_QUEUE_REGION_SIZE + qoff;
+ hisi_dma_write_dev(hw, off, val);
+}
+
+static uint32_t
+hisi_dma_read_reg(void *base, uint32_t off)
+{
+ uint32_t val = rte_read32((volatile void *)((char *)base + off));
+ return rte_le_to_cpu_32(val);
+}
+
+static uint32_t
+hisi_dma_read_dev(struct hisi_dma_dev *hw, uint32_t off)
+{
+ return hisi_dma_read_reg(hw->io_base, off);
+}
+
+static uint32_t
+hisi_dma_read_queue(struct hisi_dma_dev *hw, uint32_t qoff)
+{
+ uint32_t off = hisi_dma_queue_base(hw) +
+ hw->queue_id * HISI_DMA_QUEUE_REGION_SIZE + qoff;
+ return hisi_dma_read_dev(hw, off);
+}
+
+static void
+hisi_dma_update_bit(struct hisi_dma_dev *hw, uint32_t off, uint32_t pos,
+ bool set)
+{
+ uint32_t tmp = hisi_dma_read_dev(hw, off);
+ uint32_t mask = 1u << pos;
+ tmp = set ? tmp | mask : tmp & ~mask;
+ hisi_dma_write_dev(hw, off, tmp);
+}
+
+static void
+hisi_dma_update_queue_bit(struct hisi_dma_dev *hw, uint32_t qoff, uint32_t pos,
+ bool set)
+{
+ uint32_t tmp = hisi_dma_read_queue(hw, qoff);
+ uint32_t mask = 1u << pos;
+ tmp = set ? tmp | mask : tmp & ~mask;
+ hisi_dma_write_queue(hw, qoff, tmp);
+}
+
+static void
+hisi_dma_update_queue_mbit(struct hisi_dma_dev *hw, uint32_t qoff,
+ uint32_t mask, bool set)
+{
+ uint32_t tmp = hisi_dma_read_queue(hw, qoff);
+ tmp = set ? tmp | mask : tmp & ~mask;
+ hisi_dma_write_queue(hw, qoff, tmp);
+}
+
+#define hisi_dma_poll_hw_state(hw, val, cond, sleep_us, timeout_us) ({ \
+ uint32_t timeout = 0; \
+ while (timeout++ <= (timeout_us)) { \
+ (val) = hisi_dma_read_queue(hw, HISI_DMA_QUEUE_FSM_REG); \
+ if (cond) \
+ break; \
+ rte_delay_us(sleep_us); \
+ } \
+ (cond) ? 0 : -ETIME; \
+})
+
+static int
+hisi_dma_reset_hw(struct hisi_dma_dev *hw)
+{
+#define POLL_SLEEP_US 100
+#define POLL_TIMEOUT_US 10000
+
+ uint32_t tmp;
+ int ret;
+
+ hisi_dma_update_queue_bit(hw, HISI_DMA_QUEUE_CTRL0_REG,
+ HISI_DMA_QUEUE_CTRL0_PAUSE_B, true);
+ hisi_dma_update_queue_bit(hw, HISI_DMA_QUEUE_CTRL0_REG,
+ HISI_DMA_QUEUE_CTRL0_EN_B, false);
+
+ ret = hisi_dma_poll_hw_state(hw, tmp,
+ FIELD_GET(HISI_DMA_QUEUE_FSM_STS_M, tmp) != HISI_DMA_STATE_RUN,
+ POLL_SLEEP_US, POLL_TIMEOUT_US);
+ if (ret) {
+ HISI_DMA_ERR(hw, "disable dma timeout!");
+ return ret;
+ }
+
+ hisi_dma_update_queue_bit(hw, HISI_DMA_QUEUE_CTRL1_REG,
+ HISI_DMA_QUEUE_CTRL1_RESET_B, true);
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_SQ_TAIL_REG, 0);
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_CQ_HEAD_REG, 0);
+ hisi_dma_update_queue_bit(hw, HISI_DMA_QUEUE_CTRL0_REG,
+ HISI_DMA_QUEUE_CTRL0_PAUSE_B, false);
+
+ ret = hisi_dma_poll_hw_state(hw, tmp,
+ FIELD_GET(HISI_DMA_QUEUE_FSM_STS_M, tmp) == HISI_DMA_STATE_IDLE,
+ POLL_SLEEP_US, POLL_TIMEOUT_US);
+ if (ret) {
+ HISI_DMA_ERR(hw, "reset dma timeout!");
+ return ret;
+ }
+
+ return 0;
+}
+
+static void
+hisi_dma_init_hw(struct hisi_dma_dev *hw)
+{
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_SQ_BASE_L_REG,
+ lower_32_bits(hw->sqe_iova));
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_SQ_BASE_H_REG,
+ upper_32_bits(hw->sqe_iova));
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_CQ_BASE_L_REG,
+ lower_32_bits(hw->cqe_iova));
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_CQ_BASE_H_REG,
+ upper_32_bits(hw->cqe_iova));
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_SQ_DEPTH_REG,
+ hw->sq_depth_mask);
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_CQ_DEPTH_REG, hw->cq_depth - 1);
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_SQ_TAIL_REG, 0);
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_CQ_HEAD_REG, 0);
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_ERR_INT_NUM0_REG, 0);
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_ERR_INT_NUM1_REG, 0);
+ hisi_dma_write_queue(hw, HISI_DMA_QUEUE_ERR_INT_NUM2_REG, 0);
+
+ if (hw->reg_layout == HISI_DMA_REG_LAYOUT_HIP08) {
+ hisi_dma_write_queue(hw, HISI_DMA_HIP08_QUEUE_ERR_INT_NUM3_REG,
+ 0);
+ hisi_dma_write_queue(hw, HISI_DMA_HIP08_QUEUE_ERR_INT_NUM4_REG,
+ 0);
+ hisi_dma_write_queue(hw, HISI_DMA_HIP08_QUEUE_ERR_INT_NUM5_REG,
+ 0);
+ hisi_dma_write_queue(hw, HISI_DMA_HIP08_QUEUE_ERR_INT_NUM6_REG,
+ 0);
+ hisi_dma_update_queue_bit(hw, HISI_DMA_QUEUE_CTRL0_REG,
+ HISI_DMA_HIP08_QUEUE_CTRL0_ERR_ABORT_B, false);
+ hisi_dma_update_queue_mbit(hw, HISI_DMA_QUEUE_INT_STATUS_REG,
+ HISI_DMA_HIP08_QUEUE_INT_MASK_M, true);
+ hisi_dma_update_queue_mbit(hw,
+ HISI_DMA_HIP08_QUEUE_INT_MASK_REG,
+ HISI_DMA_HIP08_QUEUE_INT_MASK_M, true);
+ }
+}
+
+static void
+hisi_dma_init_gbl(void *pci_bar, uint8_t revision)
+{
+ struct hisi_dma_dev hw;
+
+ memset(&hw, 0, sizeof(hw));
+ hw.io_base = pci_bar;
+
+ if (revision == HISI_DMA_REVISION_HIP08B)
+ hisi_dma_update_bit(&hw, HISI_DMA_HIP08_MODE_REG,
+ HISI_DMA_HIP08_MODE_SEL_B, true);
+}
+
+static uint8_t
+hisi_dma_reg_layout(uint8_t revision)
+{
+ if (revision == HISI_DMA_REVISION_HIP08B)
+ return HISI_DMA_REG_LAYOUT_HIP08;
+ else
+ return HISI_DMA_REG_LAYOUT_INVALID;
+}
+
+static void
+hisi_dma_zero_iomem(struct hisi_dma_dev *hw)
+{
+ memset(hw->iomz->addr, 0, hw->iomz_sz);
+}
+
+static int
+hisi_dma_alloc_iomem(struct hisi_dma_dev *hw, uint16_t ring_size,
+ const char *dev_name)
+{
+ uint32_t sq_size = sizeof(struct hisi_dma_sqe) * ring_size;
+ uint32_t cq_size = sizeof(struct hisi_dma_cqe) *
+ (ring_size + HISI_DMA_CQ_RESERVED);
+ uint32_t status_size = sizeof(uint16_t) * ring_size;
+ char mz_name[RTE_MEMZONE_NAMESIZE];
+ const struct rte_memzone *iomz;
+ uint32_t total_size;
+
+ sq_size = RTE_CACHE_LINE_ROUNDUP(sq_size);
+ cq_size = RTE_CACHE_LINE_ROUNDUP(cq_size);
+ status_size = RTE_CACHE_LINE_ROUNDUP(status_size);
+ total_size = sq_size + cq_size + status_size;
+
+ (void)snprintf(mz_name, sizeof(mz_name), "hisi_dma:%s", dev_name);
+ iomz = rte_memzone_reserve(mz_name, total_size, hw->data->numa_node,
+ RTE_MEMZONE_IOVA_CONTIG);
+ if (iomz == NULL) {
+ HISI_DMA_ERR(hw, "malloc %s iomem fail!", mz_name);
+ return -ENOMEM;
+ }
+
+ hw->iomz = iomz;
+ hw->iomz_sz = total_size;
+ hw->sqe = iomz->addr;
+ hw->cqe = (void *)((char *)iomz->addr + sq_size);
+ hw->status = (void *)((char *)iomz->addr + sq_size + cq_size);
+ hw->sqe_iova = iomz->iova;
+ hw->cqe_iova = iomz->iova + sq_size;
+ hw->sq_depth_mask = ring_size - 1;
+ hw->cq_depth = ring_size + HISI_DMA_CQ_RESERVED;
+ hisi_dma_zero_iomem(hw);
+
+ return 0;
+}
+
+static void
+hisi_dma_free_iomem(struct hisi_dma_dev *hw)
+{
+ if (hw->iomz != NULL)
+ rte_memzone_free(hw->iomz);
+
+ hw->iomz = NULL;
+ hw->sqe = NULL;
+ hw->cqe = NULL;
+ hw->status = NULL;
+ hw->sqe_iova = 0;
+ hw->cqe_iova = 0;
+ hw->sq_depth_mask = 0;
+ hw->cq_depth = 0;
+}
+
+static int
+hisi_dma_info_get(const struct rte_dma_dev *dev,
+ struct rte_dma_info *dev_info,
+ uint32_t info_sz)
+{
+ RTE_SET_USED(dev);
+ RTE_SET_USED(info_sz);
+
+ dev_info->dev_capa = RTE_DMA_CAPA_MEM_TO_MEM |
+ RTE_DMA_CAPA_OPS_COPY;
+ dev_info->max_vchans = 1;
+ dev_info->max_desc = HISI_DMA_MAX_DESC_NUM;
+ dev_info->min_desc = HISI_DMA_MIN_DESC_NUM;
+
+ return 0;
+}
+
+static int
+hisi_dma_configure(struct rte_dma_dev *dev,
+ const struct rte_dma_conf *conf,
+ uint32_t conf_sz)
+{
+ RTE_SET_USED(dev);
+ RTE_SET_USED(conf);
+ RTE_SET_USED(conf_sz);
+ return 0;
+}
+
+static int
+hisi_dma_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+ const struct rte_dma_vchan_conf *conf,
+ uint32_t conf_sz)
+{
+ struct hisi_dma_dev *hw = dev->data->dev_private;
+ int ret;
+
+ RTE_SET_USED(vchan);
+ RTE_SET_USED(conf_sz);
+
+ if (!rte_is_power_of_2(conf->nb_desc)) {
+ HISI_DMA_ERR(hw, "Number of desc must be power of 2!");
+ return -EINVAL;
+ }
+
+ hisi_dma_free_iomem(hw);
+ ret = hisi_dma_alloc_iomem(hw, conf->nb_desc, dev->data->dev_name);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int
+hisi_dma_start(struct rte_dma_dev *dev)
+{
+ struct hisi_dma_dev *hw = dev->data->dev_private;
+
+ if (hw->iomz == NULL) {
+ HISI_DMA_ERR(hw, "Vchan was not setup, start fail!\n");
+ return -EINVAL;
+ }
+
+ /* Reset the dmadev to a known state, including:
+ * 1) zero the iomem, which also covers the status fields.
+ * 2) init the hardware registers.
+ * 3) init the index values to zero.
+ * 4) init the running statistics.
+ */
+ hisi_dma_zero_iomem(hw);
+ hisi_dma_init_hw(hw);
+ hw->ridx = 0;
+ hw->cridx = 0;
+ hw->sq_head = 0;
+ hw->sq_tail = 0;
+ hw->cq_sq_head = 0;
+ hw->cq_head = 0;
+ hw->cqs_completed = 0;
+ hw->cqe_vld = 1;
+ hw->submitted = 0;
+ hw->completed = 0;
+ hw->errors = 0;
+
+ hisi_dma_update_queue_bit(hw, HISI_DMA_QUEUE_CTRL0_REG,
+ HISI_DMA_QUEUE_CTRL0_EN_B, true);
+
+ return 0;
+}
+
+static int
+hisi_dma_stop(struct rte_dma_dev *dev)
+{
+ return hisi_dma_reset_hw(dev->data->dev_private);
+}
+
+static int
+hisi_dma_close(struct rte_dma_dev *dev)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* The dmadev already stopped */
+ hisi_dma_free_iomem(dev->data->dev_private);
+ }
+ return 0;
+}
+
+static int
+hisi_dma_stats_get(const struct rte_dma_dev *dev, uint16_t vchan,
+ struct rte_dma_stats *stats,
+ uint32_t stats_sz)
+{
+ struct hisi_dma_dev *hw = dev->data->dev_private;
+
+ RTE_SET_USED(vchan);
+ RTE_SET_USED(stats_sz);
+ stats->submitted = hw->submitted;
+ stats->completed = hw->completed;
+ stats->errors = hw->errors;
+
+ return 0;
+}
+
+static int
+hisi_dma_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
+{
+ struct hisi_dma_dev *hw = dev->data->dev_private;
+
+ RTE_SET_USED(vchan);
+ hw->submitted = 0;
+ hw->completed = 0;
+ hw->errors = 0;
+
+ return 0;
+}
+
+static void
+hisi_dma_get_dump_range(struct hisi_dma_dev *hw, uint32_t *start, uint32_t *end)
+{
+ if (hw->reg_layout == HISI_DMA_REG_LAYOUT_HIP08) {
+ *start = HISI_DMA_HIP08_DUMP_START_REG;
+ *end = HISI_DMA_HIP08_DUMP_END_REG;
+ } else {
+ *start = 0;
+ *end = 0;
+ }
+}
+
+static void
+hisi_dma_dump_common(struct hisi_dma_dev *hw, FILE *f)
+{
+#define DUMP_REGNUM_PER_LINE 4
+
+ uint32_t start, end;
+ uint32_t cnt, i;
+
+ hisi_dma_get_dump_range(hw, &start, &end);
+
+ (void)fprintf(f, " common-register:\n");
+
+ cnt = 0;
+ for (i = start; i <= end; i += sizeof(uint32_t)) {
+ if (cnt % DUMP_REGNUM_PER_LINE == 0)
+ (void)fprintf(f, " [%4x]:", i);
+ (void)fprintf(f, " 0x%08x", hisi_dma_read_dev(hw, i));
+ cnt++;
+ if (cnt % DUMP_REGNUM_PER_LINE == 0)
+ (void)fprintf(f, "\n");
+ }
+ if (cnt % DUMP_REGNUM_PER_LINE)
+ (void)fprintf(f, "\n");
+}
+
+static void
+hisi_dma_dump_read_queue(struct hisi_dma_dev *hw, uint32_t qoff,
+ char *buffer, int max_sz)
+{
+ memset(buffer, 0, max_sz);
+
+ /* Address-related registers are not printed for security reasons. */
+ if (qoff == HISI_DMA_QUEUE_SQ_BASE_L_REG ||
+ qoff == HISI_DMA_QUEUE_SQ_BASE_H_REG ||
+ qoff == HISI_DMA_QUEUE_CQ_BASE_L_REG ||
+ qoff == HISI_DMA_QUEUE_CQ_BASE_H_REG) {
+ (void)snprintf(buffer, max_sz, "**********");
+ return;
+ }
+
+ (void)snprintf(buffer, max_sz, "0x%08x", hisi_dma_read_queue(hw, qoff));
+}
+
+static void
+hisi_dma_dump_queue(struct hisi_dma_dev *hw, FILE *f)
+{
+#define REG_FMT_LEN 32
+ char buf[REG_FMT_LEN] = { 0 };
+ uint32_t i;
+
+ (void)fprintf(f, " queue-register:\n");
+ for (i = 0; i < HISI_DMA_QUEUE_REGION_SIZE; ) {
+ hisi_dma_dump_read_queue(hw, i, buf, sizeof(buf));
+ (void)fprintf(f, " [%2x]: %s", i, buf);
+ i += sizeof(uint32_t);
+ hisi_dma_dump_read_queue(hw, i, buf, sizeof(buf));
+ (void)fprintf(f, " %s", buf);
+ i += sizeof(uint32_t);
+ hisi_dma_dump_read_queue(hw, i, buf, sizeof(buf));
+ (void)fprintf(f, " %s", buf);
+ i += sizeof(uint32_t);
+ hisi_dma_dump_read_queue(hw, i, buf, sizeof(buf));
+ (void)fprintf(f, " %s\n", buf);
+ i += sizeof(uint32_t);
+ }
+}
+
+static int
+hisi_dma_dump(const struct rte_dma_dev *dev, FILE *f)
+{
+ struct hisi_dma_dev *hw = dev->data->dev_private;
+
+ (void)fprintf(f,
+ " revision: 0x%x queue_id: %u ring_size: %u\n"
+ " ridx: %u cridx: %u\n"
+ " sq_head: %u sq_tail: %u cq_sq_head: %u\n"
+ " cq_head: %u cqs_completed: %u cqe_vld: %u\n"
+ " submitted: %" PRIu64 " completed: %" PRIu64 " errors %"
+ PRIu64"\n",
+ hw->revision, hw->queue_id,
+ hw->sq_depth_mask > 0 ? hw->sq_depth_mask + 1 : 0,
+ hw->ridx, hw->cridx,
+ hw->sq_head, hw->sq_tail, hw->cq_sq_head,
+ hw->cq_head, hw->cqs_completed, hw->cqe_vld,
+ hw->submitted, hw->completed, hw->errors);
+ hisi_dma_dump_queue(hw, f);
+ hisi_dma_dump_common(hw, f);
+
+ return 0;
+}
+
+static int
+hisi_dma_copy(void *dev_private, uint16_t vchan,
+ rte_iova_t src, rte_iova_t dst,
+ uint32_t length, uint64_t flags)
+{
+ struct hisi_dma_dev *hw = dev_private;
+ struct hisi_dma_sqe *sqe = &hw->sqe[hw->sq_tail];
+
+ RTE_SET_USED(vchan);
+
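+ /* The SQ is full when advancing sq_tail by one slot would catch up
+ * with sq_head; one slot is kept unused to tell full from empty.
+ */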
+ if (((hw->sq_tail + 1) & hw->sq_depth_mask) == hw->sq_head)
+ return -ENOSPC;
+
+ sqe->dw0 = rte_cpu_to_le_32(SQE_OPCODE_M2M);
+ sqe->dw1 = 0;
+ sqe->dw2 = 0;
+ sqe->length = rte_cpu_to_le_32(length);
+ sqe->src_addr = rte_cpu_to_le_64(src);
+ sqe->dst_addr = rte_cpu_to_le_64(dst);
+ hw->sq_tail = (hw->sq_tail + 1) & hw->sq_depth_mask;
+ hw->submitted++;
+
+ if (flags & RTE_DMA_OP_FLAG_FENCE)
+ sqe->dw0 |= rte_cpu_to_le_32(SQE_FENCE_FLAG);
+ if (flags & RTE_DMA_OP_FLAG_SUBMIT)
+ rte_write32(rte_cpu_to_le_32(hw->sq_tail), hw->sq_tail_reg);
+
+ return hw->ridx++;
+}
+
+static int
+hisi_dma_submit(void *dev_private, uint16_t vchan)
+{
+ struct hisi_dma_dev *hw = dev_private;
+
+ RTE_SET_USED(vchan);
+ rte_write32(rte_cpu_to_le_32(hw->sq_tail), hw->sq_tail_reg);
+
+ return 0;
+}
+
+static inline void
+hisi_dma_scan_cq(struct hisi_dma_dev *hw)
+{
+ volatile struct hisi_dma_cqe *cqe;
+ uint16_t csq_head = hw->cq_sq_head;
+ uint16_t cq_head = hw->cq_head;
+ uint16_t count = 0;
+ uint64_t misc;
+
+ while (true) {
+ cqe = &hw->cqe[cq_head];
+ misc = cqe->misc;
+ misc = rte_le_to_cpu_64(misc);
+ if (FIELD_GET(CQE_VALID_B, misc) != hw->cqe_vld)
+ break;
+
+ csq_head = FIELD_GET(CQE_SQ_HEAD_MASK, misc);
+ if (unlikely(misc & CQE_STATUS_MASK))
+ hw->status[csq_head] = FIELD_GET(CQE_STATUS_MASK,
+ misc);
+
+ count++;
+ cq_head++;
+ if (cq_head == hw->cq_depth) {
+ hw->cqe_vld = !hw->cqe_vld;
+ cq_head = 0;
+ }
+ }
+
+ if (count == 0)
+ return;
+
+ hw->cq_head = cq_head;
+ hw->cq_sq_head = (csq_head + 1) & hw->sq_depth_mask;
+ hw->cqs_completed += count;
+ if (hw->cqs_completed >= HISI_DMA_CQ_RESERVED) {
+ rte_write32(rte_cpu_to_le_32(cq_head), hw->cq_head_reg);
+ hw->cqs_completed = 0;
+ }
+}
+
+static inline uint16_t
+hisi_dma_calc_cpls(struct hisi_dma_dev *hw, const uint16_t nb_cpls)
+{
+ uint16_t cpl_num;
+
+ if (hw->cq_sq_head >= hw->sq_head)
+ cpl_num = hw->cq_sq_head - hw->sq_head;
+ else
+ cpl_num = hw->sq_depth_mask + 1 - hw->sq_head + hw->cq_sq_head;
+
+ if (cpl_num > nb_cpls)
+ cpl_num = nb_cpls;
+
+ return cpl_num;
+}
+
+static uint16_t
+hisi_dma_completed(void *dev_private,
+ uint16_t vchan, const uint16_t nb_cpls,
+ uint16_t *last_idx, bool *has_error)
+{
+ struct hisi_dma_dev *hw = dev_private;
+ uint16_t sq_head = hw->sq_head;
+ uint16_t cpl_num, i;
+
+ RTE_SET_USED(vchan);
+ hisi_dma_scan_cq(hw);
+
+ cpl_num = hisi_dma_calc_cpls(hw, nb_cpls);
+ for (i = 0; i < cpl_num; i++) {
+ if (hw->status[sq_head]) {
+ *has_error = true;
+ break;
+ }
+ sq_head = (sq_head + 1) & hw->sq_depth_mask;
+ }
+ if (i > 0) {
+ hw->cridx += i;
+ *last_idx = hw->cridx - 1;
+ hw->sq_head = sq_head;
+ }
+ hw->completed += i;
+
+ return i;
+}
+
+static enum rte_dma_status_code
+hisi_dma_convert_status(uint16_t status)
+{
+ switch (status) {
+ case HISI_DMA_STATUS_SUCCESS:
+ return RTE_DMA_STATUS_SUCCESSFUL;
+ case HISI_DMA_STATUS_INVALID_OPCODE:
+ return RTE_DMA_STATUS_INVALID_OPCODE;
+ case HISI_DMA_STATUS_INVALID_LENGTH:
+ return RTE_DMA_STATUS_INVALID_LENGTH;
+ case HISI_DMA_STATUS_USER_ABORT:
+ return RTE_DMA_STATUS_USER_ABORT;
+ case HISI_DMA_STATUS_REMOTE_READ_ERROR:
+ case HISI_DMA_STATUS_AXI_READ_ERROR:
+ return RTE_DMA_STATUS_BUS_READ_ERROR;
+ case HISI_DMA_STATUS_AXI_WRITE_ERROR:
+ return RTE_DMA_STATUS_BUS_WRITE_ERROR;
+ case HISI_DMA_STATUS_DATA_POISON:
+ case HISI_DMA_STATUS_REMOTE_DATA_POISION:
+ return RTE_DMA_STATUS_DATA_POISION;
+ case HISI_DMA_STATUS_SQE_READ_ERROR:
+ case HISI_DMA_STATUS_SQE_READ_POISION:
+ return RTE_DMA_STATUS_DESCRIPTOR_READ_ERROR;
+ case HISI_DMA_STATUS_LINK_DOWN_ERROR:
+ return RTE_DMA_STATUS_DEV_LINK_ERROR;
+ default:
+ return RTE_DMA_STATUS_ERROR_UNKNOWN;
+ }
+}
+
+static uint16_t
+hisi_dma_completed_status(void *dev_private,
+ uint16_t vchan, const uint16_t nb_cpls,
+ uint16_t *last_idx, enum rte_dma_status_code *status)
+{
+ struct hisi_dma_dev *hw = dev_private;
+ uint16_t sq_head = hw->sq_head;
+ uint16_t cpl_num, i;
+
+ RTE_SET_USED(vchan);
+ hisi_dma_scan_cq(hw);
+
+ cpl_num = hisi_dma_calc_cpls(hw, nb_cpls);
+ for (i = 0; i < cpl_num; i++) {
+ status[i] = hisi_dma_convert_status(hw->status[sq_head]);
+ hw->errors += !!status[i];
+ hw->status[sq_head] = HISI_DMA_STATUS_SUCCESS;
+ sq_head = (sq_head + 1) & hw->sq_depth_mask;
+ }
+ if (likely(cpl_num > 0)) {
+ hw->cridx += cpl_num;
+ *last_idx = hw->cridx - 1;
+ hw->sq_head = sq_head;
+ }
+ hw->completed += cpl_num;
+
+ return cpl_num;
+}
+
+static uint16_t
+hisi_dma_burst_capacity(const void *dev_private, uint16_t vchan)
+{
+ const struct hisi_dma_dev *hw = dev_private;
+ uint16_t sq_head = hw->sq_head;
+ uint16_t sq_tail = hw->sq_tail;
+
+ RTE_SET_USED(vchan);
+
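+ /* Return the number of free SQ slots; one slot is reserved so that a
+ * full ring can be distinguished from an empty one.
+ */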
+ return (sq_tail >= sq_head) ? hw->sq_depth_mask - sq_tail + sq_head :
+ sq_head - 1 - sq_tail;
+}
+
+static void
+hisi_dma_gen_pci_device_name(const struct rte_pci_device *pci_dev,
+ char *name, size_t size)
+{
+ memset(name, 0, size);
+ (void)snprintf(name, size, "%x:%x.%x",
+ pci_dev->addr.bus, pci_dev->addr.devid,
+ pci_dev->addr.function);
+}
+
+static void
+hisi_dma_gen_dev_name(const struct rte_pci_device *pci_dev,
+ uint8_t queue_id, char *name, size_t size)
+{
+ memset(name, 0, size);
+ (void)snprintf(name, size, "%x:%x.%x-ch%u",
+ pci_dev->addr.bus, pci_dev->addr.devid,
+ pci_dev->addr.function, queue_id);
+}
+
+/**
+ * Hardware queue state machine:
+ *
+ * ----------- dmadev_create ------------------
+ * | Unknown | ---------------> | IDLE |
+ * ----------- ------------------
+ * ^ |
+ * | |dev_start
+ * dev_stop| |
+ * | v
+ * ------------------
+ * | RUN |
+ * ------------------
+ *
+ */
+static const struct rte_dma_dev_ops hisi_dmadev_ops = {
+ .dev_info_get = hisi_dma_info_get,
+ .dev_configure = hisi_dma_configure,
+ .dev_start = hisi_dma_start,
+ .dev_stop = hisi_dma_stop,
+ .dev_close = hisi_dma_close,
+ .vchan_setup = hisi_dma_vchan_setup,
+ .stats_get = hisi_dma_stats_get,
+ .stats_reset = hisi_dma_stats_reset,
+ .dev_dump = hisi_dma_dump,
+};
+
+static int
+hisi_dma_create(struct rte_pci_device *pci_dev, uint8_t queue_id,
+ uint8_t revision)
+{
+#define REG_PCI_BAR_INDEX 2
+
+ char name[RTE_DEV_NAME_MAX_LEN];
+ struct rte_dma_dev *dev;
+ struct hisi_dma_dev *hw;
+ int ret;
+
+ hisi_dma_gen_dev_name(pci_dev, queue_id, name, sizeof(name));
+ dev = rte_dma_pmd_allocate(name, pci_dev->device.numa_node,
+ sizeof(*hw));
+ if (dev == NULL) {
+ HISI_DMA_LOG(ERR, "%s allocate dmadev fail!", name);
+ return -EINVAL;
+ }
+
+ dev->device = &pci_dev->device;
+ dev->dev_ops = &hisi_dmadev_ops;
+ dev->fp_obj->dev_private = dev->data->dev_private;
+ dev->fp_obj->copy = hisi_dma_copy;
+ dev->fp_obj->submit = hisi_dma_submit;
+ dev->fp_obj->completed = hisi_dma_completed;
+ dev->fp_obj->completed_status = hisi_dma_completed_status;
+ dev->fp_obj->burst_capacity = hisi_dma_burst_capacity;
+
+ hw = dev->data->dev_private;
+ hw->data = dev->data;
+ hw->revision = revision;
+ hw->reg_layout = hisi_dma_reg_layout(revision);
+ hw->io_base = pci_dev->mem_resource[REG_PCI_BAR_INDEX].addr;
+ hw->queue_id = queue_id;
+ hw->sq_tail_reg = hisi_dma_queue_regaddr(hw,
+ HISI_DMA_QUEUE_SQ_TAIL_REG);
+ hw->cq_head_reg = hisi_dma_queue_regaddr(hw,
+ HISI_DMA_QUEUE_CQ_HEAD_REG);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ ret = hisi_dma_reset_hw(hw);
+ if (ret) {
+ HISI_DMA_LOG(ERR, "%s init device fail!", name);
+ (void)rte_dma_pmd_release(name);
+ return -EIO;
+ }
+ }
+
+ dev->state = RTE_DMA_DEV_READY;
+ HISI_DMA_LOG(DEBUG, "%s create dmadev success!", name);
+
+ return 0;
+}
+
+static int
+hisi_dma_check_revision(struct rte_pci_device *pci_dev, const char *name,
+ uint8_t *out_revision)
+{
+ uint8_t revision;
+ int ret;
+
+ ret = rte_pci_read_config(pci_dev, &revision, 1,
+ HISI_DMA_PCI_REVISION_ID_REG);
+ if (ret != 1) {
+ HISI_DMA_LOG(ERR, "%s read PCI revision failed!", name);
+ return -EINVAL;
+ }
+ if (hisi_dma_reg_layout(revision) == HISI_DMA_REG_LAYOUT_INVALID) {
+ HISI_DMA_LOG(ERR, "%s revision: 0x%x not supported!",
+ name, revision);
+ return -EINVAL;
+ }
+
+ *out_revision = revision;
+ return 0;
+}
+
+static int
+hisi_dma_probe(struct rte_pci_driver *pci_drv __rte_unused,
+ struct rte_pci_device *pci_dev)
+{
+ char name[RTE_DEV_NAME_MAX_LEN] = { 0 };
+ uint8_t revision;
+ uint8_t i;
+ int ret;
+
+ hisi_dma_gen_pci_device_name(pci_dev, name, sizeof(name));
+
+ if (pci_dev->mem_resource[2].addr == NULL) {
+ HISI_DMA_LOG(ERR, "%s BAR2 is NULL!", name);
+ return -ENODEV;
+ }
+
+ ret = hisi_dma_check_revision(pci_dev, name, &revision);
+ if (ret)
+ return ret;
+ HISI_DMA_LOG(DEBUG, "%s read PCI revision: 0x%x", name, revision);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ hisi_dma_init_gbl(pci_dev->mem_resource[2].addr, revision);
+
+ for (i = 0; i < HISI_DMA_MAX_HW_QUEUES; i++) {
+ ret = hisi_dma_create(pci_dev, i, revision);
+ if (ret) {
+ HISI_DMA_LOG(ERR, "%s create dmadev %u failed!",
+ name, i);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int
+hisi_dma_remove(struct rte_pci_device *pci_dev)
+{
+ char name[RTE_DEV_NAME_MAX_LEN];
+ uint8_t i;
+ int ret;
+
+ for (i = 0; i < HISI_DMA_MAX_HW_QUEUES; i++) {
+ hisi_dma_gen_dev_name(pci_dev, i, name, sizeof(name));
+ ret = rte_dma_pmd_release(name);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static const struct rte_pci_id pci_id_hisi_dma_map[] = {
+ { RTE_PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, HISI_DMA_DEVICE_ID) },
+ { .vendor_id = 0, }, /* sentinel */
+};
+
+static struct rte_pci_driver hisi_dma_pmd_drv = {
+ .id_table = pci_id_hisi_dma_map,
+ .drv_flags = RTE_PCI_DRV_NEED_MAPPING,
+ .probe = hisi_dma_probe,
+ .remove = hisi_dma_remove,
+};
+
+RTE_PMD_REGISTER_PCI(dma_hisilicon, hisi_dma_pmd_drv);
+RTE_PMD_REGISTER_PCI_TABLE(dma_hisilicon, pci_id_hisi_dma_map);
+RTE_PMD_REGISTER_KMOD_DEP(dma_hisilicon, "vfio-pci");
diff --git a/drivers/dma/hisilicon/hisi_dmadev.h b/drivers/dma/hisilicon/hisi_dmadev.h
new file mode 100644
index 000000000..12e209c86
--- /dev/null
+++ b/drivers/dma/hisilicon/hisi_dmadev.h
@@ -0,0 +1,236 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited
+ */
+
+#ifndef HISI_DMADEV_H
+#define HISI_DMADEV_H
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+
+#define BIT(x) (1ul << (x))
+#define BITS_PER_LONG (__SIZEOF_LONG__ * 8)
+#define GENMASK(h, l) \
+ (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
+#define BF_SHF(x) (__builtin_ffsll(x) - 1)
+#define FIELD_GET(mask, reg) \
+ ((typeof(mask))(((reg) & (mask)) >> BF_SHF(mask)))
+
+#define lower_32_bits(x) ((uint32_t)(x))
+#define upper_32_bits(x) ((uint32_t)(((x) >> 16) >> 16))
+
+#define PCI_VENDOR_ID_HUAWEI 0x19e5
+#define HISI_DMA_DEVICE_ID 0xA122
+#define HISI_DMA_PCI_REVISION_ID_REG 0x08
+#define HISI_DMA_REVISION_HIP08B 0x21
+
+#define HISI_DMA_MAX_HW_QUEUES 4
+#define HISI_DMA_MAX_DESC_NUM 8192
+#define HISI_DMA_MIN_DESC_NUM 32
+
+/**
+ * The HIP08B (HiSilicon IP08) and later chips (e.g. HiSilicon IP09) are DMA
+ * iEPs. They have the same PCI device id but different PCI revisions.
+ * Unfortunately, they have different register layouts, so the layout
+ * enumerations are defined.
+ */
+enum {
+ HISI_DMA_REG_LAYOUT_INVALID = 0,
+ HISI_DMA_REG_LAYOUT_HIP08
+};
+
+/**
+ * Hardware PCI bar register MAP:
+ *
+ * --------------
+ * | Misc-reg-0 |
+ * | |
+ * -------------- -> Queue base
+ * | |
+ * | Queue-0 |
+ * | |
+ * -------------- ---
+ * | | ^
+ * | Queue-1 | Queue region
+ * | | v
+ * -------------- ---
+ * | ... |
+ * | Queue-x |
+ * | ... |
+ * --------------
+ * | Misc-reg-1 |
+ * --------------
+ *
+ * As described above, the registers of a single queue are contiguous and
+ * occupy one queue-region. The global offset of a single queue register is
+ * calculated by:
+ *     offset = queue-base + (queue-id * queue-region) + reg-offset-in-region.
+ *
+ * The first part of the queue region is basically the same for the HIP08 and
+ * later chip register layouts, therefore the HISI_DMA_QUEUE_* registers are
+ * defined for it.
+ */
+#define HISI_DMA_QUEUE_SQ_BASE_L_REG 0x0
+#define HISI_DMA_QUEUE_SQ_BASE_H_REG 0x4
+#define HISI_DMA_QUEUE_SQ_DEPTH_REG 0x8
+#define HISI_DMA_QUEUE_SQ_TAIL_REG 0xC
+#define HISI_DMA_QUEUE_CQ_BASE_L_REG 0x10
+#define HISI_DMA_QUEUE_CQ_BASE_H_REG 0x14
+#define HISI_DMA_QUEUE_CQ_DEPTH_REG 0x18
+#define HISI_DMA_QUEUE_CQ_HEAD_REG 0x1C
+#define HISI_DMA_QUEUE_CTRL0_REG 0x20
+#define HISI_DMA_QUEUE_CTRL0_EN_B 0
+#define HISI_DMA_QUEUE_CTRL0_PAUSE_B 4
+#define HISI_DMA_QUEUE_CTRL1_REG 0x24
+#define HISI_DMA_QUEUE_CTRL1_RESET_B 0
+#define HISI_DMA_QUEUE_FSM_REG 0x30
+#define HISI_DMA_QUEUE_FSM_STS_M GENMASK(3, 0)
+#define HISI_DMA_QUEUE_INT_STATUS_REG 0x40
+#define HISI_DMA_QUEUE_ERR_INT_NUM0_REG 0x84
+#define HISI_DMA_QUEUE_ERR_INT_NUM1_REG 0x88
+#define HISI_DMA_QUEUE_ERR_INT_NUM2_REG 0x8C
+#define HISI_DMA_QUEUE_REGION_SIZE 0x100
+
+/**
+ * HiSilicon IP08 DMA register and field define:
+ */
+#define HISI_DMA_HIP08_QUEUE_BASE 0x0
+#define HISI_DMA_HIP08_QUEUE_CTRL0_ERR_ABORT_B 2
+#define HISI_DMA_HIP08_QUEUE_INT_MASK_REG 0x44
+#define HISI_DMA_HIP08_QUEUE_INT_MASK_M GENMASK(14, 0)
+#define HISI_DMA_HIP08_QUEUE_ERR_INT_NUM3_REG 0x90
+#define HISI_DMA_HIP08_QUEUE_ERR_INT_NUM4_REG 0x94
+#define HISI_DMA_HIP08_QUEUE_ERR_INT_NUM5_REG 0x98
+#define HISI_DMA_HIP08_QUEUE_ERR_INT_NUM6_REG 0x48
+#define HISI_DMA_HIP08_MODE_REG 0x217C
+#define HISI_DMA_HIP08_MODE_SEL_B 0
+#define HISI_DMA_HIP08_DUMP_START_REG 0x2000
+#define HISI_DMA_HIP08_DUMP_END_REG 0x2280
+
+/**
+ * In fact, there are multiple states, but the driver only needs to pay
+ * attention to the following two:
+ */
+enum {
+ HISI_DMA_STATE_IDLE = 0,
+ HISI_DMA_STATE_RUN,
+};
+
+/**
+ * Hardware complete status define:
+ */
+#define HISI_DMA_STATUS_SUCCESS 0x0
+#define HISI_DMA_STATUS_INVALID_OPCODE 0x1
+#define HISI_DMA_STATUS_INVALID_LENGTH 0x2
+#define HISI_DMA_STATUS_USER_ABORT 0x4
+#define HISI_DMA_STATUS_REMOTE_READ_ERROR 0x10
+#define HISI_DMA_STATUS_AXI_READ_ERROR 0x20
+#define HISI_DMA_STATUS_AXI_WRITE_ERROR 0x40
+#define HISI_DMA_STATUS_DATA_POISON 0x80
+#define HISI_DMA_STATUS_SQE_READ_ERROR 0x100
+#define HISI_DMA_STATUS_SQE_READ_POISION 0x200
+#define HISI_DMA_STATUS_REMOTE_DATA_POISION 0x400
+#define HISI_DMA_STATUS_LINK_DOWN_ERROR 0x800
+
+/**
+ * After scanning the CQ array, the CQ head register needs to be updated.
+ * Updating the register involves write memory barrier operations.
+ * The following method is used here to reduce WMB operations:
+ * a) allocate HISI_DMA_CQ_RESERVED extra CQEs.
+ * b) update the CQ head register only after the accumulated number of
+ *    completed CQEs is greater than or equal to HISI_DMA_CQ_RESERVED.
+ */
+#define HISI_DMA_CQ_RESERVED 64
+
+struct hisi_dma_sqe {
+ uint32_t dw0;
+#define SQE_FENCE_FLAG BIT(10)
+#define SQE_OPCODE_M2M 0x4
+ uint32_t dw1;
+ uint32_t dw2;
+ uint32_t length;
+ uint64_t src_addr;
+ uint64_t dst_addr;
+};
+
+struct hisi_dma_cqe {
+ uint64_t rsv;
+ uint64_t misc;
+#define CQE_SQ_HEAD_MASK GENMASK(15, 0)
+#define CQE_VALID_B BIT(48)
+#define CQE_STATUS_MASK GENMASK(63, 49)
+};
+
+struct hisi_dma_dev {
+ struct hisi_dma_sqe *sqe;
+ volatile struct hisi_dma_cqe *cqe;
+ uint16_t *status; /* the completion status array of SQEs. */
+
+ volatile void *sq_tail_reg; /**< register address for doorbell. */
+ volatile void *cq_head_reg; /**< register address for acknowledging the CQ. */
+
+ uint16_t sq_depth_mask; /**< SQ depth - 1, the SQ depth is a power of 2. */
+ uint16_t cq_depth; /* CQ depth */
+
+ uint16_t ridx; /**< ring index which will be assigned to the next request. */
+ /** ring index returned by the hisi_dma_completed* APIs. */
+ uint16_t cridx;
+
+ /**
+ * SQE array management fields:
+ *
+ * -----------------------------------------------------
+ * | SQE0 | SQE1 | SQE2 | ... | SQEx | ... | SQEn-1 |
+ * -----------------------------------------------------
+ * ^ ^ ^
+ * | | |
+ * sq_head cq_sq_head sq_tail
+ *
+ * sq_head: index of the oldest completed request; this field is updated
+ *          by the hisi_dma_completed* APIs.
+ * sq_tail: index of the next new request; this field is updated by the
+ *          hisi_dma_copy API.
+ * cq_sq_head: the index following the last request completed by hardware;
+ *             this field is updated by the hisi_dma_completed* APIs.
+ *
+ * [sq_head, cq_sq_head): the SQEs that hardware has already completed.
+ * [cq_sq_head, sq_tail): the SQEs that hardware is still processing.
+ */
+ uint16_t sq_head;
+ uint16_t sq_tail;
+ uint16_t cq_sq_head;
+ /**
+ * The driver scans the CQE array; if the valid bit changes, the CQE is
+ * considered valid.
+ * Note: One CQE corresponds to one or several SQEs, e.g. if the app
+ *       submits two copy requests, the hardware processes the two SQEs,
+ *       but it may write back only one CQE whose sq_head field
+ *       indicates the index of the second copy request in the SQE
+ *       array.
+ */
+ uint16_t cq_head; /**< CQ index for next scans. */
+ /** accumulated number of completed CQs
+ * @see HISI_DMA_CQ_RESERVED
+ */
+ uint16_t cqs_completed;
+ uint8_t cqe_vld; /**< valid bit for CQE, will change for every round. */
+
+ uint64_t submitted;
+ uint64_t completed;
+ uint64_t errors;
+
+ /**
+ * The following fields are not accessed in the I/O path, so they are
+ * placed at the end.
+ */
+ struct rte_dma_dev_data *data;
+ uint8_t revision; /**< PCI revision. */
+ uint8_t reg_layout; /**< hardware register layout. */
+ void *io_base;
+ uint8_t queue_id; /**< hardware DMA queue index. */
+ const struct rte_memzone *iomz;
+ uint32_t iomz_sz;
+ rte_iova_t sqe_iova;
+ rte_iova_t cqe_iova;
+};
+
+#endif /* HISI_DMADEV_H */
diff --git a/drivers/dma/hisilicon/meson.build b/drivers/dma/hisilicon/meson.build
new file mode 100644
index 000000000..c11dc352d
--- /dev/null
+++ b/drivers/dma/hisilicon/meson.build
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2021 HiSilicon Limited
+
+if not is_linux
+ build = false
+ reason = 'only supported on Linux'
+ subdir_done()
+endif
+
+if (arch_subdir != 'x86' and arch_subdir != 'arm') or (not dpdk_conf.get('RTE_ARCH_64'))
+ build = false
+ reason = 'only supported on x86_64 and aarch64'
+ subdir_done()
+endif
+
+deps += ['bus_pci', 'dmadev']
+sources = files(
+ 'hisi_dmadev.c'
+)
diff --git a/drivers/dma/hisilicon/version.map b/drivers/dma/hisilicon/version.map
new file mode 100644
index 000000000..c2e0723b4
--- /dev/null
+++ b/drivers/dma/hisilicon/version.map
@@ -0,0 +1,3 @@
+DPDK_22 {
+ local: *;
+};
diff --git a/drivers/dma/meson.build b/drivers/dma/meson.build
new file mode 100644
index 000000000..d030069fd
--- /dev/null
+++ b/drivers/dma/meson.build
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2021 HiSilicon Limited
+
+if is_windows
+ subdir_done()
+endif
+
+drivers = [
+ 'hisilicon',
+ 'skeleton'
+]
+std_deps = ['dmadev']
+config_flag_fmt = 'RTE_LIBRTE_PMD_DMADEV_@0@'
diff --git a/drivers/dma/skeleton/meson.build b/drivers/dma/skeleton/meson.build
new file mode 100644
index 000000000..defe905e4
--- /dev/null
+++ b/drivers/dma/skeleton/meson.build
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2021 HiSilicon Limited
+
+if is_windows
+ subdir_done()
+endif
+
+deps += ['dmadev', 'kvargs', 'ring', 'bus_vdev']
+sources = files(
+ 'skeleton_dmadev.c',
+)
diff --git a/drivers/dma/skeleton/skeleton_dmadev.c b/drivers/dma/skeleton/skeleton_dmadev.c
new file mode 100644
index 000000000..8bed41f8b
--- /dev/null
+++ b/drivers/dma/skeleton/skeleton_dmadev.c
@@ -0,0 +1,596 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited
+ */
+
+#include <inttypes.h>
+
+#include <rte_bus_vdev.h>
+#include <rte_cycles.h>
+#include <rte_eal.h>
+#include <rte_kvargs.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_memcpy.h>
+
+#include <rte_dmadev_pmd.h>
+
+#include "skeleton_dmadev.h"
+
+RTE_LOG_REGISTER(skeldma_logtype, pmd.dma.skeleton, INFO);
+#define SKELDMA_LOG(level, fmt, args...) \
+ rte_log(RTE_LOG_ ## level, skeldma_logtype, "%s(): " fmt "\n", \
+ __func__, ##args)
+
+/* Count of instances, currently only 1 is supported. */
+static uint16_t skeldma_count;
+
+static int
+skeldma_info_get(const struct rte_dma_dev *dev, struct rte_dma_info *dev_info,
+ uint32_t info_sz)
+{
+#define SKELDMA_MAX_DESC 8192
+#define SKELDMA_MIN_DESC 32
+
+ RTE_SET_USED(dev);
+ RTE_SET_USED(info_sz);
+
+ dev_info->dev_capa = RTE_DMA_CAPA_MEM_TO_MEM |
+ RTE_DMA_CAPA_SVA |
+ RTE_DMA_CAPA_OPS_COPY;
+ dev_info->max_vchans = 1;
+ dev_info->max_desc = SKELDMA_MAX_DESC;
+ dev_info->min_desc = SKELDMA_MIN_DESC;
+
+ return 0;
+}
+
+static int
+skeldma_configure(struct rte_dma_dev *dev, const struct rte_dma_conf *conf,
+ uint32_t conf_sz)
+{
+ RTE_SET_USED(dev);
+ RTE_SET_USED(conf);
+ RTE_SET_USED(conf_sz);
+ return 0;
+}
+
+static void *
+cpucopy_thread(void *param)
+{
+#define SLEEP_THRESHOLD 10000
+#define SLEEP_US_VAL 10
+
+ struct rte_dma_dev *dev = param;
+ struct skeldma_hw *hw = dev->data->dev_private;
+ struct skeldma_desc *desc = NULL;
+ int ret;
+
+ while (!hw->exit_flag) {
+ ret = rte_ring_dequeue(hw->desc_running, (void **)&desc);
+ if (ret) {
+ hw->zero_req_count++;
+ if (hw->zero_req_count == 0)
+ hw->zero_req_count = SLEEP_THRESHOLD;
+ if (hw->zero_req_count >= SLEEP_THRESHOLD)
+ rte_delay_us_sleep(SLEEP_US_VAL);
+ continue;
+ }
+
+ hw->zero_req_count = 0;
+ rte_memcpy(desc->dst, desc->src, desc->len);
+ __atomic_add_fetch(&hw->completed_count, 1, __ATOMIC_RELEASE);
+ (void)rte_ring_enqueue(hw->desc_completed, (void *)desc);
+ }
+
+ return NULL;
+}
+
+static void
+fflush_ring(struct skeldma_hw *hw, struct rte_ring *ring)
+{
+ struct skeldma_desc *desc = NULL;
+ while (rte_ring_count(ring) > 0) {
+ (void)rte_ring_dequeue(ring, (void **)&desc);
+ (void)rte_ring_enqueue(hw->desc_empty, (void *)desc);
+ }
+}
+
+static int
+skeldma_start(struct rte_dma_dev *dev)
+{
+ struct skeldma_hw *hw = dev->data->dev_private;
+ rte_cpuset_t cpuset;
+ int ret;
+
+ if (hw->desc_mem == NULL) {
+ SKELDMA_LOG(ERR, "Vchan was not setup, start fail!");
+ return -EINVAL;
+ }
+
+ /* Reset the dmadev to a known state, including:
+ * 1) flush the pending/running/completed rings back to the empty ring.
+ * 2) init the ring idx to zero.
+ * 3) init the running statistics.
+ * 4) set the cpucopy task exit_flag to false.
+ */
+ fflush_ring(hw, hw->desc_pending);
+ fflush_ring(hw, hw->desc_running);
+ fflush_ring(hw, hw->desc_completed);
+ hw->ridx = 0;
+ hw->submitted_count = 0;
+ hw->zero_req_count = 0;
+ hw->completed_count = 0;
+ hw->exit_flag = false;
+
+ rte_mb();
+
+ ret = rte_ctrl_thread_create(&hw->thread, "dma_skeleton", NULL,
+ cpucopy_thread, dev);
+ if (ret) {
+ SKELDMA_LOG(ERR, "Start cpucopy thread fail!");
+ return -EINVAL;
+ }
+
+ if (hw->lcore_id != -1) {
+ cpuset = rte_lcore_cpuset(hw->lcore_id);
+ ret = pthread_setaffinity_np(hw->thread, sizeof(cpuset),
+ &cpuset);
+ if (ret)
+ SKELDMA_LOG(WARNING,
+ "Set thread affinity lcore = %d fail!",
+ hw->lcore_id);
+ }
+
+ return 0;
+}
+
+static int
+skeldma_stop(struct rte_dma_dev *dev)
+{
+ struct skeldma_hw *hw = dev->data->dev_private;
+
+ hw->exit_flag = true;
+ rte_delay_ms(1);
+
+ (void)pthread_cancel(hw->thread);
+ pthread_join(hw->thread, NULL);
+
+ return 0;
+}
+
+static int
+vchan_setup(struct skeldma_hw *hw, uint16_t nb_desc)
+{
+ struct skeldma_desc *desc;
+ struct rte_ring *empty;
+ struct rte_ring *pending;
+ struct rte_ring *running;
+ struct rte_ring *completed;
+ uint16_t i;
+
+ desc = rte_zmalloc_socket("dma_skeleton_desc",
+ nb_desc * sizeof(struct skeldma_desc),
+ RTE_CACHE_LINE_SIZE, hw->socket_id);
+ if (desc == NULL) {
+ SKELDMA_LOG(ERR, "Malloc dma skeleton desc fail!");
+ return -ENOMEM;
+ }
+
+ empty = rte_ring_create("dma_skeleton_desc_empty", nb_desc,
+ hw->socket_id, RING_F_SP_ENQ | RING_F_SC_DEQ);
+ pending = rte_ring_create("dma_skeleton_desc_pending", nb_desc,
+ hw->socket_id, RING_F_SP_ENQ | RING_F_SC_DEQ);
+ running = rte_ring_create("dma_skeleton_desc_running", nb_desc,
+ hw->socket_id, RING_F_SP_ENQ | RING_F_SC_DEQ);
+ completed = rte_ring_create("dma_skeleton_desc_completed", nb_desc,
+ hw->socket_id, RING_F_SP_ENQ | RING_F_SC_DEQ);
+ if (empty == NULL || pending == NULL || running == NULL ||
+ completed == NULL) {
+ SKELDMA_LOG(ERR, "Create dma skeleton desc ring fail!");
+ rte_ring_free(empty);
+ rte_ring_free(pending);
+ rte_ring_free(running);
+ rte_ring_free(completed);
+ rte_free(desc);
+ return -ENOMEM;
+ }
+
+ /* The real usable ring size is *count-1* instead of *count* to
+ * differentiate a full ring from an empty ring.
+ * @see rte_ring_create
+ */
+ for (i = 0; i < nb_desc - 1; i++)
+ (void)rte_ring_enqueue(empty, (void *)(desc + i));
+
+ hw->desc_mem = desc;
+ hw->desc_empty = empty;
+ hw->desc_pending = pending;
+ hw->desc_running = running;
+ hw->desc_completed = completed;
+
+ return 0;
+}
+
+static void
+vchan_release(struct skeldma_hw *hw)
+{
+ if (hw->desc_mem == NULL)
+ return;
+
+ rte_free(hw->desc_mem);
+ hw->desc_mem = NULL;
+ rte_ring_free(hw->desc_empty);
+ hw->desc_empty = NULL;
+ rte_ring_free(hw->desc_pending);
+ hw->desc_pending = NULL;
+ rte_ring_free(hw->desc_running);
+ hw->desc_running = NULL;
+ rte_ring_free(hw->desc_completed);
+ hw->desc_completed = NULL;
+}
+
+static int
+skeldma_close(struct rte_dma_dev *dev)
+{
+ /* The device already stopped */
+ vchan_release(dev->data->dev_private);
+ return 0;
+}
+
+static int
+skeldma_vchan_setup(struct rte_dma_dev *dev, uint16_t vchan,
+ const struct rte_dma_vchan_conf *conf,
+ uint32_t conf_sz)
+{
+ struct skeldma_hw *hw = dev->data->dev_private;
+
+ RTE_SET_USED(vchan);
+ RTE_SET_USED(conf_sz);
+
+ if (!rte_is_power_of_2(conf->nb_desc)) {
+ SKELDMA_LOG(ERR, "Number of desc must be power of 2!");
+ return -EINVAL;
+ }
+
+ vchan_release(hw);
+ return vchan_setup(hw, conf->nb_desc);
+}
+
+static int
+skeldma_vchan_status(const struct rte_dma_dev *dev,
+ uint16_t vchan, enum rte_dma_vchan_status *status)
+{
+ struct skeldma_hw *hw = dev->data->dev_private;
+
+ RTE_SET_USED(vchan);
+
+ *status = RTE_DMA_VCHAN_IDLE;
+ if (hw->submitted_count != __atomic_load_n(&hw->completed_count, __ATOMIC_ACQUIRE)
+ || hw->zero_req_count == 0)
+ *status = RTE_DMA_VCHAN_ACTIVE;
+ return 0;
+}
+
+static int
+skeldma_stats_get(const struct rte_dma_dev *dev, uint16_t vchan,
+ struct rte_dma_stats *stats, uint32_t stats_sz)
+{
+ struct skeldma_hw *hw = dev->data->dev_private;
+
+ RTE_SET_USED(vchan);
+ RTE_SET_USED(stats_sz);
+
+ stats->submitted = hw->submitted_count;
+ stats->completed = hw->completed_count;
+ stats->errors = 0;
+
+ return 0;
+}
+
+static int
+skeldma_stats_reset(struct rte_dma_dev *dev, uint16_t vchan)
+{
+ struct skeldma_hw *hw = dev->data->dev_private;
+
+ RTE_SET_USED(vchan);
+
+ hw->submitted_count = 0;
+ hw->completed_count = 0;
+
+ return 0;
+}
+
+static int
+skeldma_dump(const struct rte_dma_dev *dev, FILE *f)
+{
+#define GET_RING_COUNT(ring) ((ring) ? (rte_ring_count(ring)) : 0)
+
+ struct skeldma_hw *hw = dev->data->dev_private;
+
+ (void)fprintf(f,
+ " lcore_id: %d\n"
+ " socket_id: %d\n"
+ " desc_empty_ring_count: %u\n"
+ " desc_pending_ring_count: %u\n"
+ " desc_running_ring_count: %u\n"
+ " desc_completed_ring_count: %u\n",
+ hw->lcore_id, hw->socket_id,
+ GET_RING_COUNT(hw->desc_empty),
+ GET_RING_COUNT(hw->desc_pending),
+ GET_RING_COUNT(hw->desc_running),
+ GET_RING_COUNT(hw->desc_completed));
+ (void)fprintf(f,
+ " next_ring_idx: %u\n"
+ " submitted_count: %" PRIu64 "\n"
+ " completed_count: %" PRIu64 "\n",
+ hw->ridx, hw->submitted_count, hw->completed_count);
+
+ return 0;
+}
+
+static inline void
+submit(struct skeldma_hw *hw, struct skeldma_desc *desc)
+{
+ uint16_t count = rte_ring_count(hw->desc_pending);
+ struct skeldma_desc *pend_desc = NULL;
+
+ while (count > 0) {
+ (void)rte_ring_dequeue(hw->desc_pending, (void **)&pend_desc);
+ (void)rte_ring_enqueue(hw->desc_running, (void *)pend_desc);
+ count--;
+ }
+
+ if (desc)
+ (void)rte_ring_enqueue(hw->desc_running, (void *)desc);
+}
+
+static int
+skeldma_copy(void *dev_private, uint16_t vchan,
+ rte_iova_t src, rte_iova_t dst,
+ uint32_t length, uint64_t flags)
+{
+ struct skeldma_hw *hw = dev_private;
+ struct skeldma_desc *desc;
+ int ret;
+
+ RTE_SET_USED(vchan);
+ RTE_SET_USED(flags);
+
+ ret = rte_ring_dequeue(hw->desc_empty, (void **)&desc);
+ if (ret)
+ return -ENOSPC;
+ desc->src = (void *)(uintptr_t)src;
+ desc->dst = (void *)(uintptr_t)dst;
+ desc->len = length;
+ desc->ridx = hw->ridx;
+ if (flags & RTE_DMA_OP_FLAG_SUBMIT)
+ submit(hw, desc);
+ else
+ (void)rte_ring_enqueue(hw->desc_pending, (void *)desc);
+ hw->submitted_count++;
+
+ return hw->ridx++;
+}
+
+static int
+skeldma_submit(void *dev_private, uint16_t vchan)
+{
+ struct skeldma_hw *hw = dev_private;
+ RTE_SET_USED(vchan);
+ submit(hw, NULL);
+ return 0;
+}
+
+static uint16_t
+skeldma_completed(void *dev_private,
+ uint16_t vchan, const uint16_t nb_cpls,
+ uint16_t *last_idx, bool *has_error)
+{
+ struct skeldma_hw *hw = dev_private;
+ struct skeldma_desc *desc = NULL;
+ uint16_t index = 0;
+ uint16_t count;
+
+ RTE_SET_USED(vchan);
+ RTE_SET_USED(has_error);
+
+ count = RTE_MIN(nb_cpls, rte_ring_count(hw->desc_completed));
+ while (index < count) {
+ (void)rte_ring_dequeue(hw->desc_completed, (void **)&desc);
+ if (index == count - 1)
+ *last_idx = desc->ridx;
+ index++;
+ (void)rte_ring_enqueue(hw->desc_empty, (void *)desc);
+ }
+
+ return count;
+}
+
+static uint16_t
+skeldma_completed_status(void *dev_private,
+ uint16_t vchan, const uint16_t nb_cpls,
+ uint16_t *last_idx, enum rte_dma_status_code *status)
+{
+ struct skeldma_hw *hw = dev_private;
+ struct skeldma_desc *desc = NULL;
+ uint16_t index = 0;
+ uint16_t count;
+
+ RTE_SET_USED(vchan);
+
+ count = RTE_MIN(nb_cpls, rte_ring_count(hw->desc_completed));
+ while (index < count) {
+ (void)rte_ring_dequeue(hw->desc_completed, (void **)&desc);
+ if (index == count - 1)
+ *last_idx = desc->ridx;
+ status[index++] = RTE_DMA_STATUS_SUCCESSFUL;
+ (void)rte_ring_enqueue(hw->desc_empty, (void *)desc);
+ }
+
+ return count;
+}
+
+static uint16_t
+skeldma_burst_capacity(const void *dev_private, uint16_t vchan)
+{
+ const struct skeldma_hw *hw = dev_private;
+
+ RTE_SET_USED(vchan);
+ return rte_ring_count(hw->desc_empty);
+}
+
+static const struct rte_dma_dev_ops skeldma_ops = {
+ .dev_info_get = skeldma_info_get,
+ .dev_configure = skeldma_configure,
+ .dev_start = skeldma_start,
+ .dev_stop = skeldma_stop,
+ .dev_close = skeldma_close,
+
+ .vchan_setup = skeldma_vchan_setup,
+ .vchan_status = skeldma_vchan_status,
+
+ .stats_get = skeldma_stats_get,
+ .stats_reset = skeldma_stats_reset,
+
+ .dev_dump = skeldma_dump,
+};
+
+static int
+skeldma_create(const char *name, struct rte_vdev_device *vdev, int lcore_id)
+{
+ struct rte_dma_dev *dev;
+ struct skeldma_hw *hw;
+ int socket_id;
+
+ socket_id = (lcore_id < 0) ? rte_socket_id() :
+ rte_lcore_to_socket_id(lcore_id);
+ dev = rte_dma_pmd_allocate(name, socket_id, sizeof(struct skeldma_hw));
+ if (dev == NULL) {
+ SKELDMA_LOG(ERR, "Unable to allocate dmadev: %s", name);
+ return -EINVAL;
+ }
+
+ dev->device = &vdev->device;
+ dev->dev_ops = &skeldma_ops;
+ dev->fp_obj->dev_private = dev->data->dev_private;
+ dev->fp_obj->copy = skeldma_copy;
+ dev->fp_obj->submit = skeldma_submit;
+ dev->fp_obj->completed = skeldma_completed;
+ dev->fp_obj->completed_status = skeldma_completed_status;
+ dev->fp_obj->burst_capacity = skeldma_burst_capacity;
+
+ hw = dev->data->dev_private;
+ hw->lcore_id = lcore_id;
+ hw->socket_id = socket_id;
+
+ dev->state = RTE_DMA_DEV_READY;
+
+ return dev->data->dev_id;
+}
+
+static int
+skeldma_destroy(const char *name)
+{
+ return rte_dma_pmd_release(name);
+}
+
+static int
+skeldma_parse_lcore(const char *key __rte_unused,
+ const char *value,
+ void *opaque)
+{
+ int lcore_id = atoi(value);
+ if (lcore_id >= 0 && lcore_id < RTE_MAX_LCORE)
+ *(int *)opaque = lcore_id;
+ return 0;
+}
+
+static void
+skeldma_parse_vdev_args(struct rte_vdev_device *vdev, int *lcore_id)
+{
+ static const char *const args[] = {
+ SKELDMA_ARG_LCORE,
+ NULL
+ };
+
+ struct rte_kvargs *kvlist;
+ const char *params;
+
+ params = rte_vdev_device_args(vdev);
+ if (params == NULL || params[0] == '\0')
+ return;
+
+ kvlist = rte_kvargs_parse(params, args);
+ if (!kvlist)
+ return;
+
+ (void)rte_kvargs_process(kvlist, SKELDMA_ARG_LCORE,
+ skeldma_parse_lcore, lcore_id);
+ SKELDMA_LOG(INFO, "Parse lcore_id = %d", *lcore_id);
+
+ rte_kvargs_free(kvlist);
+}
+
+static int
+skeldma_probe(struct rte_vdev_device *vdev)
+{
+ const char *name;
+ int lcore_id = -1;
+ int ret;
+
+ name = rte_vdev_device_name(vdev);
+ if (name == NULL)
+ return -EINVAL;
+
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
+ SKELDMA_LOG(ERR, "Multiple process not supported for %s", name);
+ return -EINVAL;
+ }
+
+ /* More than one instance is not supported */
+ if (skeldma_count > 0) {
+ SKELDMA_LOG(ERR, "Multiple instance not supported for %s",
+ name);
+ return -EINVAL;
+ }
+
+ skeldma_parse_vdev_args(vdev, &lcore_id);
+
+ ret = skeldma_create(name, vdev, lcore_id);
+ if (ret >= 0) {
+ SKELDMA_LOG(INFO, "Create %s dmadev with lcore-id %d",
+ name, lcore_id);
+ skeldma_count = 1;
+ }
+
+ return ret < 0 ? ret : 0;
+}
+
+static int
+skeldma_remove(struct rte_vdev_device *vdev)
+{
+ const char *name;
+ int ret;
+
+ name = rte_vdev_device_name(vdev);
+ if (name == NULL)
+ return -1;
+
+ ret = skeldma_destroy(name);
+ if (!ret) {
+ skeldma_count = 0;
+ SKELDMA_LOG(INFO, "Remove %s dmadev", name);
+ }
+
+ return ret;
+}
+
+static struct rte_vdev_driver skeldma_pmd_drv = {
+ .probe = skeldma_probe,
+ .remove = skeldma_remove,
+};
+
+RTE_PMD_REGISTER_VDEV(dma_skeleton, skeldma_pmd_drv);
+RTE_PMD_REGISTER_PARAM_STRING(dma_skeleton,
+ SKELDMA_ARG_LCORE "=<uint16> ");
diff --git a/drivers/dma/skeleton/skeleton_dmadev.h b/drivers/dma/skeleton/skeleton_dmadev.h
new file mode 100644
index 000000000..91eb5460f
--- /dev/null
+++ b/drivers/dma/skeleton/skeleton_dmadev.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited
+ */
+
+#ifndef SKELETON_DMADEV_H
+#define SKELETON_DMADEV_H
+
+#include <pthread.h>
+
+#include <rte_ring.h>
+
+#define SKELDMA_ARG_LCORE "lcore"
+
+struct skeldma_desc {
+ void *src;
+ void *dst;
+ uint32_t len;
+ uint16_t ridx; /* ring idx */
+};
+
+struct skeldma_hw {
+ int lcore_id; /* cpucopy task affinity core */
+ int socket_id;
+ pthread_t thread; /* cpucopy task thread */
+ volatile int exit_flag; /* cpucopy task exit flag */
+
+ struct skeldma_desc *desc_mem;
+
+ /* Descriptor ring state machine:
+ *
+ * ----------- enqueue without submit -----------
+ * | empty |------------------------------->| pending |
+ * -----------\ -----------
+ * ^ \------------ |
+ * | | |submit doorbell
+ * | | |
+ * | |enqueue with submit |
+ * |get completed |------------------| |
+ * | | |
+ * | v v
+ * ----------- cpucopy thread working -----------
+ * |completed|<-------------------------------| running |
+ * ----------- -----------
+ */
+ struct rte_ring *desc_empty;
+ struct rte_ring *desc_pending;
+ struct rte_ring *desc_running;
+ struct rte_ring *desc_completed;
+
+ /* Cache delimiter for dataplane API's operation data */
+ char cache1 __rte_cache_aligned;
+ uint16_t ridx; /* ring idx */
+ uint64_t submitted_count;
+
+ /* Cache delimiter for cpucopy thread's operation data */
+ char cache2 __rte_cache_aligned;
+ volatile uint32_t zero_req_count;
+ uint64_t completed_count;
+};
+
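+/*
+ * Informal sketch (not the driver's actual code) of how one descriptor is
+ * intended to move through the four rings in the state machine above; "hw"
+ * and "desc" are hypothetical locals and all error handling is omitted:
+ *
+ *     struct skeldma_desc *desc;
+ *
+ *     // enqueue without submit: take a free slot, park it as pending
+ *     rte_ring_dequeue(hw->desc_empty, (void **)&desc);
+ *     // ... fill desc->src, desc->dst, desc->len ...
+ *     rte_ring_enqueue(hw->desc_pending, desc);
+ *
+ *     // submit doorbell: hand pending work to the cpucopy thread
+ *     rte_ring_dequeue(hw->desc_pending, (void **)&desc);
+ *     rte_ring_enqueue(hw->desc_running, desc);
+ *
+ *     // cpucopy thread: perform the copy, then mark it completed
+ *     rte_ring_dequeue(hw->desc_running, (void **)&desc);
+ *     rte_memcpy(desc->dst, desc->src, desc->len);
+ *     rte_ring_enqueue(hw->desc_completed, desc);
+ *
+ *     // get completed: report to the application, recycle the slot
+ *     rte_ring_dequeue(hw->desc_completed, (void **)&desc);
+ *     rte_ring_enqueue(hw->desc_empty, desc);
+ */
+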
+#endif /* SKELETON_DMADEV_H */
diff --git a/drivers/dma/skeleton/version.map b/drivers/dma/skeleton/version.map
new file mode 100644
index 000000000..c2e0723b4
--- /dev/null
+++ b/drivers/dma/skeleton/version.map
@@ -0,0 +1,3 @@
+DPDK_22 {
+ local: *;
+};
diff --git a/drivers/meson.build b/drivers/meson.build
index f9febc579..996df2210 100644
--- a/drivers/meson.build
+++ b/drivers/meson.build
@@ -16,6 +16,7 @@ subdirs = [
'vdpa', # depends on common, bus and mempool.
'event', # depends on common, bus, mempool and net.
'baseband', # depends on common and bus.
+ 'dma'
]
disabled_drivers = run_command(list_dir_globs, get_option('disable_drivers'),
diff --git a/examples/dma/Makefile b/examples/dma/Makefile
new file mode 100644
index 000000000..59af6478b
--- /dev/null
+++ b/examples/dma/Makefile
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019 Intel Corporation
+
+# binary name
+APP = dmafwd
+
+# all source are stored in SRCS-y
+SRCS-y := dmafwd.c
+
+PKGCONF ?= pkg-config
+
+# Build using pkg-config variables if possible
+ifneq ($(shell $(PKGCONF) --exists libdpdk && echo 0),0)
+$(error "no installation of DPDK found")
+endif
+
+all: shared
+.PHONY: shared static
+shared: build/$(APP)-shared
+ ln -sf $(APP)-shared build/$(APP)
+static: build/$(APP)-static
+ ln -sf $(APP)-static build/$(APP)
+
+PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)
+CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
+LDFLAGS_SHARED = $(shell $(PKGCONF) --libs libdpdk)
+LDFLAGS_STATIC = $(shell $(PKGCONF) --static --libs libdpdk)
+
+ifeq ($(MAKECMDGOALS),static)
+# check for broken pkg-config
+ifeq ($(shell echo $(LDFLAGS_STATIC) | grep 'whole-archive.*l:lib.*no-whole-archive'),)
+$(warning "pkg-config output list does not contain drivers between 'whole-archive'/'no-whole-archive' flags.")
+$(error "Cannot generate statically-linked binaries with this version of pkg-config")
+endif
+endif
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
+build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
+ $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
+
+build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
+ $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
+
+build:
+ @mkdir -p $@
+
+.PHONY: clean
+clean:
+ rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared
+ test -d build && rmdir -p build || true
diff --git a/examples/dma/dmafwd.c b/examples/dma/dmafwd.c
new file mode 100644
index 000000000..9ff2593bb
--- /dev/null
+++ b/examples/dma/dmafwd.c
@@ -0,0 +1,1105 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019-2021 Intel Corporation
+ */
+
+#include <stdint.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <unistd.h>
+
+#include <rte_malloc.h>
+#include <rte_ethdev.h>
+#include <rte_dmadev.h>
+
+/* size of ring used for software copying between rx and tx. */
+#define RTE_LOGTYPE_DMA RTE_LOGTYPE_USER1
+#define MAX_PKT_BURST 32
+#define MEMPOOL_CACHE_SIZE 512
+#define MIN_POOL_SIZE 65536U
+#define CMD_LINE_OPT_MAC_UPDATING "mac-updating"
+#define CMD_LINE_OPT_NO_MAC_UPDATING "no-mac-updating"
+#define CMD_LINE_OPT_PORTMASK "portmask"
+#define CMD_LINE_OPT_NB_QUEUE "nb-queue"
+#define CMD_LINE_OPT_COPY_TYPE "copy-type"
+#define CMD_LINE_OPT_RING_SIZE "ring-size"
+#define CMD_LINE_OPT_BATCH_SIZE "dma-batch-size"
+#define CMD_LINE_OPT_FRAME_SIZE "max-frame-size"
+#define CMD_LINE_OPT_STATS_INTERVAL "stats-interval"
+
+/* configurable number of RX/TX ring descriptors */
+#define RX_DEFAULT_RINGSIZE 1024
+#define TX_DEFAULT_RINGSIZE 1024
+
+/* max number of RX queues per port */
+#define MAX_RX_QUEUES_COUNT 8
+
+struct rxtx_port_config {
+ /* common config */
+ uint16_t rxtx_port;
+ uint16_t nb_queues;
+ /* for software copy mode */
+ struct rte_ring *rx_to_tx_ring;
+ /* for dmadev HW copy mode */
+ uint16_t dmadev_ids[MAX_RX_QUEUES_COUNT];
+};
+
+/* Configuring ports and number of assigned lcores in struct. 8< */
+struct rxtx_transmission_config {
+ struct rxtx_port_config ports[RTE_MAX_ETHPORTS];
+ uint16_t nb_ports;
+ uint16_t nb_lcores;
+};
+/* >8 End of configuration of ports and number of assigned lcores. */
+
+/* per-port statistics struct */
+struct dma_port_statistics {
+ uint64_t rx[RTE_MAX_ETHPORTS];
+ uint64_t tx[RTE_MAX_ETHPORTS];
+ uint64_t tx_dropped[RTE_MAX_ETHPORTS];
+ uint64_t copy_dropped[RTE_MAX_ETHPORTS];
+};
+struct dma_port_statistics port_statistics;
+struct total_statistics {
+ uint64_t total_packets_dropped;
+ uint64_t total_packets_tx;
+ uint64_t total_packets_rx;
+ uint64_t total_submitted;
+ uint64_t total_completed;
+ uint64_t total_failed;
+};
+
+typedef enum copy_mode_t {
+#define COPY_MODE_SW "sw"
+ COPY_MODE_SW_NUM,
+#define COPY_MODE_DMA "hw"
+ COPY_MODE_DMA_NUM,
+ COPY_MODE_INVALID_NUM,
+ COPY_MODE_SIZE_NUM = COPY_MODE_INVALID_NUM
+} copy_mode_t;
+
+/* mask of enabled ports */
+static uint32_t dma_enabled_port_mask;
+
+/* number of RX queues per port */
+static uint16_t nb_queues = 1;
+
+/* MAC updating enabled by default. */
+static int mac_updating = 1;
+
+/* hardware copy mode enabled by default. */
+static copy_mode_t copy_mode = COPY_MODE_DMA_NUM;
+
+/* size of descriptor ring for hardware copy mode or
+ * rte_ring for software copy mode
+ */
+static unsigned short ring_size = 2048;
+
+/* interval, in seconds, between stats prints */
+static unsigned short stats_interval = 1;
+/* global mbuf arrays for tracking DMA bufs */
+#define MBUF_RING_SIZE 2048
+#define MBUF_RING_MASK (MBUF_RING_SIZE - 1)
+struct dma_bufs {
+ struct rte_mbuf *bufs[MBUF_RING_SIZE];
+ struct rte_mbuf *copies[MBUF_RING_SIZE];
+ uint16_t sent;
+};
+static struct dma_bufs dma_bufs[RTE_DMADEV_DEFAULT_MAX];
+
+/* global transmission config */
+struct rxtx_transmission_config cfg;
+
+/* configurable number of RX/TX ring descriptors */
+static uint16_t nb_rxd = RX_DEFAULT_RINGSIZE;
+static uint16_t nb_txd = TX_DEFAULT_RINGSIZE;
+
+static volatile bool force_quit;
+
+static uint32_t dma_batch_sz = MAX_PKT_BURST;
+static uint32_t max_frame_size = RTE_ETHER_MAX_LEN;
+
+/* ethernet addresses of ports */
+static struct rte_ether_addr dma_ports_eth_addr[RTE_MAX_ETHPORTS];
+
+static struct rte_eth_dev_tx_buffer *tx_buffer[RTE_MAX_ETHPORTS];
+struct rte_mempool *dma_pktmbuf_pool;
+
+/* Print out statistics for one port. */
+static void
+print_port_stats(uint16_t port_id)
+{
+ printf("\nStatistics for port %u ------------------------------"
+ "\nPackets sent: %34"PRIu64
+ "\nPackets received: %30"PRIu64
+ "\nPackets dropped on tx: %25"PRIu64
+ "\nPackets dropped on copy: %23"PRIu64,
+ port_id,
+ port_statistics.tx[port_id],
+ port_statistics.rx[port_id],
+ port_statistics.tx_dropped[port_id],
+ port_statistics.copy_dropped[port_id]);
+}
+
+/* Print out statistics for one dmadev device. */
+static void
+print_dmadev_stats(uint32_t dev_id, struct rte_dma_stats stats)
+{
+ printf("\nDMA channel %u", dev_id);
+ printf("\n\t Total submitted ops: %"PRIu64"", stats.submitted);
+ printf("\n\t Total completed ops: %"PRIu64"", stats.completed);
+ printf("\n\t Total failed ops: %"PRIu64"", stats.errors);
+}
+
+static void
+print_total_stats(struct total_statistics *ts)
+{
+ printf("\nAggregate statistics ==============================="
+ "\nTotal packets Tx: %22"PRIu64" [pkt/s]"
+ "\nTotal packets Rx: %22"PRIu64" [pkt/s]"
+ "\nTotal packets dropped: %17"PRIu64" [pkt/s]",
+ ts->total_packets_tx / stats_interval,
+ ts->total_packets_rx / stats_interval,
+ ts->total_packets_dropped / stats_interval);
+
+ if (copy_mode == COPY_MODE_DMA_NUM) {
+ printf("\nTotal submitted ops: %19"PRIu64" [ops/s]"
+ "\nTotal completed ops: %19"PRIu64" [ops/s]"
+ "\nTotal failed ops: %22"PRIu64" [ops/s]",
+ ts->total_submitted / stats_interval,
+ ts->total_completed / stats_interval,
+ ts->total_failed / stats_interval);
+ }
+
+ printf("\n====================================================\n");
+}
+
+/* Print out statistics on packets dropped. */
+static void
+print_stats(char *prgname)
+{
+ struct total_statistics ts, delta_ts;
+ struct rte_dma_stats stats = {0};
+ uint32_t i, port_id, dev_id;
+ char status_string[255]; /* to print at the top of the output */
+ int status_strlen;
+
+ const char clr[] = { 27, '[', '2', 'J', '\0' };
+ const char topLeft[] = { 27, '[', '1', ';', '1', 'H', '\0' };
+
+ status_strlen = snprintf(status_string, sizeof(status_string),
+ "%s, ", prgname);
+ status_strlen += snprintf(status_string + status_strlen,
+ sizeof(status_string) - status_strlen,
+ "Worker Threads = %d, ",
+ rte_lcore_count() > 2 ? 2 : 1);
+ status_strlen += snprintf(status_string + status_strlen,
+ sizeof(status_string) - status_strlen,
+ "Copy Mode = %s,\n", copy_mode == COPY_MODE_SW_NUM ?
+ COPY_MODE_SW : COPY_MODE_DMA);
+ status_strlen += snprintf(status_string + status_strlen,
+ sizeof(status_string) - status_strlen,
+ "Updating MAC = %s, ", mac_updating ?
+ "enabled" : "disabled");
+ status_strlen += snprintf(status_string + status_strlen,
+ sizeof(status_string) - status_strlen,
+ "Rx Queues = %d, ", nb_queues);
+ status_strlen += snprintf(status_string + status_strlen,
+ sizeof(status_string) - status_strlen,
+ "Ring Size = %d", ring_size);
+
+ memset(&ts, 0, sizeof(struct total_statistics));
+
+ while (!force_quit) {
+ /* Sleep for "stats_interval" seconds each round - init sleep allows reading
+ * messages from app startup.
+ */
+ sleep(stats_interval);
+
+ /* Clear screen and move to top left */
+ printf("%s%s", clr, topLeft);
+
+ memset(&delta_ts, 0, sizeof(struct total_statistics));
+
+ printf("%s\n", status_string);
+
+ for (i = 0; i < cfg.nb_ports; i++) {
+ port_id = cfg.ports[i].rxtx_port;
+ print_port_stats(port_id);
+
+ delta_ts.total_packets_dropped +=
+ port_statistics.tx_dropped[port_id]
+ + port_statistics.copy_dropped[port_id];
+ delta_ts.total_packets_tx +=
+ port_statistics.tx[port_id];
+ delta_ts.total_packets_rx +=
+ port_statistics.rx[port_id];
+
+ if (copy_mode == COPY_MODE_DMA_NUM) {
+ uint32_t j;
+
+ for (j = 0; j < cfg.ports[i].nb_queues; j++) {
+ dev_id = cfg.ports[i].dmadev_ids[j];
+ rte_dma_stats_get(dev_id, 0, &stats);
+ print_dmadev_stats(dev_id, stats);
+
+ delta_ts.total_submitted += stats.submitted;
+ delta_ts.total_completed += stats.completed;
+ delta_ts.total_failed += stats.errors;
+ }
+ }
+ }
+
+ delta_ts.total_packets_tx -= ts.total_packets_tx;
+ delta_ts.total_packets_rx -= ts.total_packets_rx;
+ delta_ts.total_packets_dropped -= ts.total_packets_dropped;
+ delta_ts.total_submitted -= ts.total_submitted;
+ delta_ts.total_completed -= ts.total_completed;
+ delta_ts.total_failed -= ts.total_failed;
+
+ printf("\n");
+ print_total_stats(&delta_ts);
+
+ fflush(stdout);
+
+ ts.total_packets_tx += delta_ts.total_packets_tx;
+ ts.total_packets_rx += delta_ts.total_packets_rx;
+ ts.total_packets_dropped += delta_ts.total_packets_dropped;
+ ts.total_submitted += delta_ts.total_submitted;
+ ts.total_completed += delta_ts.total_completed;
+ ts.total_failed += delta_ts.total_failed;
+ }
+}
+
+static void
+update_mac_addrs(struct rte_mbuf *m, uint32_t dest_portid)
+{
+ struct rte_ether_hdr *eth;
+ void *tmp;
+
+ eth = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
+
+ /* 02:00:00:00:00:xx - overwriting 2 bytes of source address but
+	 * it's acceptable because it gets overwritten by rte_ether_addr_copy
+ */
+ tmp = &eth->d_addr.addr_bytes[0];
+ *((uint64_t *)tmp) = 0x000000000002 + ((uint64_t)dest_portid << 40);
+
+ /* src addr */
+ rte_ether_addr_copy(&dma_ports_eth_addr[dest_portid], &eth->s_addr);
+}
+
+/* Perform packet copy there is a user-defined function. 8< */
+static inline void
+pktmbuf_metadata_copy(const struct rte_mbuf *src, struct rte_mbuf *dst)
+{
+ dst->data_off = src->data_off;
+ memcpy(&dst->rx_descriptor_fields1, &src->rx_descriptor_fields1,
+ offsetof(struct rte_mbuf, buf_len) -
+ offsetof(struct rte_mbuf, rx_descriptor_fields1));
+}
+
+/* Copy packet data */
+static inline void
+pktmbuf_sw_copy(struct rte_mbuf *src, struct rte_mbuf *dst)
+{
+ rte_memcpy(rte_pktmbuf_mtod(dst, char *),
+ rte_pktmbuf_mtod(src, char *), src->data_len);
+}
+/* >8 End of perform packet copy there is a user-defined function. */
+
+static uint32_t
+dma_enqueue_packets(struct rte_mbuf *pkts[], struct rte_mbuf *pkts_copy[],
+ uint32_t nb_rx, uint16_t dev_id)
+{
+ struct dma_bufs *dma = &dma_bufs[dev_id];
+ int ret;
+ uint32_t i;
+
+ for (i = 0; i < nb_rx; i++) {
+ /* Perform data copy */
+ ret = rte_dma_copy(dev_id, 0,
+ rte_pktmbuf_iova(pkts[i]),
+ rte_pktmbuf_iova(pkts_copy[i]),
+ rte_pktmbuf_data_len(pkts[i]), 0);
+
+ if (ret < 0)
+ break;
+
+ dma->bufs[ret & MBUF_RING_MASK] = pkts[i];
+ dma->copies[ret & MBUF_RING_MASK] = pkts_copy[i];
+ }
+
+ ret = i;
+ return ret;
+}
+
+static inline uint32_t
+dma_enqueue(struct rte_mbuf *pkts[], struct rte_mbuf *pkts_copy[],
+ uint32_t num, uint32_t step, uint16_t dev_id)
+{
+ uint32_t i, k, m, n;
+
+ k = 0;
+ for (i = 0; i < num; i += m) {
+
+ m = RTE_MIN(step, num - i);
+ n = dma_enqueue_packets(pkts + i, pkts_copy + i, m, dev_id);
+ k += n;
+ if (n > 0)
+ rte_dma_submit(dev_id, 0);
+
+ /* don't try to enqueue more if HW queue is full */
+ if (n != m)
+ break;
+ }
+
+ return k;
+}
+
+static inline uint32_t
+dma_dequeue(struct rte_mbuf *src[], struct rte_mbuf *dst[], uint32_t num,
+ uint16_t dev_id)
+{
+ struct dma_bufs *dma = &dma_bufs[dev_id];
+ uint16_t nb_dq, filled;
+ /* Dequeue the mbufs from DMA device. Since all memory
+ * is DPDK pinned memory and therefore all addresses should
+ * be valid, we don't check for copy errors
+ */
+ nb_dq = rte_dma_completed(dev_id, 0, num, NULL, NULL);
+
+ /* Return early if no work to do */
+ if (unlikely(nb_dq == 0))
+ return nb_dq;
+
+	/* Fill src with the original bufs and dst with their copies for tx */
+ for (filled = 0; filled < nb_dq; filled++) {
+ src[filled] = dma->bufs[(dma->sent + filled) & MBUF_RING_MASK];
+ dst[filled] = dma->copies[(dma->sent + filled) & MBUF_RING_MASK];
+ }
+ dma->sent += nb_dq;
+
+ return filled;
+
+}
+
+/* Receive packets on one port and enqueue to dmadev or rte_ring. 8< */
+static void
+dma_rx_port(struct rxtx_port_config *rx_config)
+{
+ int32_t ret;
+ uint32_t nb_rx, nb_enq, i, j;
+ struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
+ struct rte_mbuf *pkts_burst_copy[MAX_PKT_BURST];
+
+ for (i = 0; i < rx_config->nb_queues; i++) {
+
+ nb_rx = rte_eth_rx_burst(rx_config->rxtx_port, i,
+ pkts_burst, MAX_PKT_BURST);
+
+ if (nb_rx == 0)
+ continue;
+
+ port_statistics.rx[rx_config->rxtx_port] += nb_rx;
+
+ ret = rte_mempool_get_bulk(dma_pktmbuf_pool,
+ (void *)pkts_burst_copy, nb_rx);
+
+ if (unlikely(ret < 0))
+ rte_exit(EXIT_FAILURE,
+ "Unable to allocate memory.\n");
+
+ for (j = 0; j < nb_rx; j++)
+ pktmbuf_metadata_copy(pkts_burst[j],
+ pkts_burst_copy[j]);
+
+ if (copy_mode == COPY_MODE_DMA_NUM) {
+ /* enqueue packets for hardware copy */
+ nb_enq = dma_enqueue(pkts_burst, pkts_burst_copy,
+ nb_rx, dma_batch_sz, rx_config->dmadev_ids[i]);
+
+ /* free any not enqueued packets. */
+ rte_mempool_put_bulk(dma_pktmbuf_pool,
+ (void *)&pkts_burst[nb_enq],
+ nb_rx - nb_enq);
+ rte_mempool_put_bulk(dma_pktmbuf_pool,
+ (void *)&pkts_burst_copy[nb_enq],
+ nb_rx - nb_enq);
+
+ port_statistics.copy_dropped[rx_config->rxtx_port] +=
+ (nb_rx - nb_enq);
+
+ /* get completed copies */
+ nb_rx = dma_dequeue(pkts_burst, pkts_burst_copy,
+ MAX_PKT_BURST, rx_config->dmadev_ids[i]);
+ } else {
+ /* Perform packet software copy, free source packets */
+ for (j = 0; j < nb_rx; j++)
+ pktmbuf_sw_copy(pkts_burst[j],
+ pkts_burst_copy[j]);
+ }
+
+ rte_mempool_put_bulk(dma_pktmbuf_pool,
+ (void *)pkts_burst, nb_rx);
+
+ nb_enq = rte_ring_enqueue_burst(rx_config->rx_to_tx_ring,
+ (void *)pkts_burst_copy, nb_rx, NULL);
+
+ /* Free any not enqueued packets. */
+ rte_mempool_put_bulk(dma_pktmbuf_pool,
+ (void *)&pkts_burst_copy[nb_enq],
+ nb_rx - nb_enq);
+
+ port_statistics.copy_dropped[rx_config->rxtx_port] +=
+ (nb_rx - nb_enq);
+ }
+}
+/* >8 End of receive packets on one port and enqueue to dmadev or rte_ring. */
+
+/* Transmit packets from dmadev/rte_ring for one port. 8< */
+static void
+dma_tx_port(struct rxtx_port_config *tx_config)
+{
+ uint32_t i, j, nb_dq, nb_tx;
+ struct rte_mbuf *mbufs[MAX_PKT_BURST];
+
+ for (i = 0; i < tx_config->nb_queues; i++) {
+
+ /* Dequeue the mbufs from rx_to_tx_ring. */
+ nb_dq = rte_ring_dequeue_burst(tx_config->rx_to_tx_ring,
+ (void *)mbufs, MAX_PKT_BURST, NULL);
+ if (nb_dq == 0)
+ continue;
+
+ /* Update macs if enabled */
+ if (mac_updating) {
+ for (j = 0; j < nb_dq; j++)
+ update_mac_addrs(mbufs[j],
+ tx_config->rxtx_port);
+ }
+
+ nb_tx = rte_eth_tx_burst(tx_config->rxtx_port, 0,
+ (void *)mbufs, nb_dq);
+
+ port_statistics.tx[tx_config->rxtx_port] += nb_tx;
+
+ /* Free any unsent packets. */
+ if (unlikely(nb_tx < nb_dq))
+ rte_mempool_put_bulk(dma_pktmbuf_pool,
+ (void *)&mbufs[nb_tx], nb_dq - nb_tx);
+ }
+}
+/* >8 End of transmitting packets from dmadev. */
+
+/* Main rx processing loop for dmadev. */
+static void
+rx_main_loop(void)
+{
+ uint16_t i;
+ uint16_t nb_ports = cfg.nb_ports;
+
+ RTE_LOG(INFO, DMA, "Entering main rx loop for copy on lcore %u\n",
+ rte_lcore_id());
+
+ while (!force_quit)
+ for (i = 0; i < nb_ports; i++)
+ dma_rx_port(&cfg.ports[i]);
+}
+
+/* Main tx processing loop for hardware copy. */
+static void
+tx_main_loop(void)
+{
+ uint16_t i;
+ uint16_t nb_ports = cfg.nb_ports;
+
+ RTE_LOG(INFO, DMA, "Entering main tx loop for copy on lcore %u\n",
+ rte_lcore_id());
+
+ while (!force_quit)
+ for (i = 0; i < nb_ports; i++)
+ dma_tx_port(&cfg.ports[i]);
+}
+
+/* Main rx and tx loop if only one worker lcore available */
+static void
+rxtx_main_loop(void)
+{
+ uint16_t i;
+ uint16_t nb_ports = cfg.nb_ports;
+
+ RTE_LOG(INFO, DMA, "Entering main rx and tx loop for copy on"
+ " lcore %u\n", rte_lcore_id());
+
+ while (!force_quit)
+ for (i = 0; i < nb_ports; i++) {
+ dma_rx_port(&cfg.ports[i]);
+ dma_tx_port(&cfg.ports[i]);
+ }
+}
+
+/* Start processing for each lcore. 8< */
+static void start_forwarding_cores(void)
+{
+ uint32_t lcore_id = rte_lcore_id();
+
+ RTE_LOG(INFO, DMA, "Entering %s on lcore %u\n",
+ __func__, rte_lcore_id());
+
+ if (cfg.nb_lcores == 1) {
+ lcore_id = rte_get_next_lcore(lcore_id, true, true);
+ rte_eal_remote_launch((lcore_function_t *)rxtx_main_loop,
+ NULL, lcore_id);
+ } else if (cfg.nb_lcores > 1) {
+ lcore_id = rte_get_next_lcore(lcore_id, true, true);
+ rte_eal_remote_launch((lcore_function_t *)rx_main_loop,
+ NULL, lcore_id);
+
+ lcore_id = rte_get_next_lcore(lcore_id, true, true);
+ rte_eal_remote_launch((lcore_function_t *)tx_main_loop, NULL,
+ lcore_id);
+ }
+}
+/* >8 End of starting to processfor each lcore. */
+
+/* Display usage */
+static void
+dma_usage(const char *prgname)
+{
+ printf("%s [EAL options] -- -p PORTMASK [-q NQ]\n"
+ " -b --dma-batch-size: number of requests per DMA batch\n"
+ " -f --max-frame-size: max frame size\n"
+ " -p --portmask: hexadecimal bitmask of ports to configure\n"
+ " -q NQ: number of RX queues per port (default is 1)\n"
+ " --[no-]mac-updating: Enable or disable MAC addresses updating (enabled by default)\n"
+ " When enabled:\n"
+ " - The source MAC address is replaced by the TX port MAC address\n"
+ " - The destination MAC address is replaced by 02:00:00:00:00:TX_PORT_ID\n"
+ " -c --copy-type CT: type of copy: sw|hw\n"
+ " -s --ring-size RS: size of dmadev descriptor ring for hardware copy mode or rte_ring for software copy mode\n"
+ " -i --stats-interval SI: interval, in seconds, between stats prints (default is 1)\n",
+ prgname);
+}
+
+static int
+dma_parse_portmask(const char *portmask)
+{
+ char *end = NULL;
+ unsigned long pm;
+
+ /* Parse hexadecimal string */
+ pm = strtoul(portmask, &end, 16);
+ if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0'))
+ return 0;
+
+ return pm;
+}
+
+static copy_mode_t
+dma_parse_copy_mode(const char *copy_mode)
+{
+ if (strcmp(copy_mode, COPY_MODE_SW) == 0)
+ return COPY_MODE_SW_NUM;
+ else if (strcmp(copy_mode, COPY_MODE_DMA) == 0)
+ return COPY_MODE_DMA_NUM;
+
+ return COPY_MODE_INVALID_NUM;
+}
+
+/* Parse the argument given in the command line of the application */
+static int
+dma_parse_args(int argc, char **argv, unsigned int nb_ports)
+{
+ static const char short_options[] =
+ "b:" /* dma batch size */
+ "c:" /* copy type (sw|hw) */
+ "f:" /* max frame size */
+ "p:" /* portmask */
+ "q:" /* number of RX queues per port */
+ "s:" /* ring size */
+ "i:" /* interval, in seconds, between stats prints */
+ ;
+
+ static const struct option lgopts[] = {
+ {CMD_LINE_OPT_MAC_UPDATING, no_argument, &mac_updating, 1},
+ {CMD_LINE_OPT_NO_MAC_UPDATING, no_argument, &mac_updating, 0},
+ {CMD_LINE_OPT_PORTMASK, required_argument, NULL, 'p'},
+ {CMD_LINE_OPT_NB_QUEUE, required_argument, NULL, 'q'},
+ {CMD_LINE_OPT_COPY_TYPE, required_argument, NULL, 'c'},
+ {CMD_LINE_OPT_RING_SIZE, required_argument, NULL, 's'},
+ {CMD_LINE_OPT_BATCH_SIZE, required_argument, NULL, 'b'},
+ {CMD_LINE_OPT_FRAME_SIZE, required_argument, NULL, 'f'},
+ {CMD_LINE_OPT_STATS_INTERVAL, required_argument, NULL, 'i'},
+ {NULL, 0, 0, 0}
+ };
+
+ const unsigned int default_port_mask = (1 << nb_ports) - 1;
+ int opt, ret;
+ char **argvopt;
+ int option_index;
+ char *prgname = argv[0];
+
+ dma_enabled_port_mask = default_port_mask;
+ argvopt = argv;
+
+ while ((opt = getopt_long(argc, argvopt, short_options,
+ lgopts, &option_index)) != EOF) {
+
+ switch (opt) {
+ case 'b':
+ dma_batch_sz = atoi(optarg);
+ if (dma_batch_sz > MAX_PKT_BURST) {
+ printf("Invalid dma batch size, %s.\n", optarg);
+ dma_usage(prgname);
+ return -1;
+ }
+ break;
+ case 'f':
+ max_frame_size = atoi(optarg);
+ if (max_frame_size > RTE_ETHER_MAX_JUMBO_FRAME_LEN) {
+ printf("Invalid max frame size, %s.\n", optarg);
+ dma_usage(prgname);
+ return -1;
+ }
+ break;
+
+ /* portmask */
+ case 'p':
+ dma_enabled_port_mask = dma_parse_portmask(optarg);
+ if (dma_enabled_port_mask & ~default_port_mask ||
+ dma_enabled_port_mask <= 0) {
+ printf("Invalid portmask, %s, suggest 0x%x\n",
+ optarg, default_port_mask);
+ dma_usage(prgname);
+ return -1;
+ }
+ break;
+
+ case 'q':
+ nb_queues = atoi(optarg);
+ if (nb_queues == 0 || nb_queues > MAX_RX_QUEUES_COUNT) {
+ printf("Invalid RX queues number %s. Max %u\n",
+ optarg, MAX_RX_QUEUES_COUNT);
+ dma_usage(prgname);
+ return -1;
+ }
+ break;
+
+ case 'c':
+ copy_mode = dma_parse_copy_mode(optarg);
+ if (copy_mode == COPY_MODE_INVALID_NUM) {
+ printf("Invalid copy type. Use: sw, hw\n");
+ dma_usage(prgname);
+ return -1;
+ }
+ break;
+
+ case 's':
+ ring_size = atoi(optarg);
+ if (ring_size == 0) {
+ printf("Invalid ring size, %s.\n", optarg);
+ dma_usage(prgname);
+ return -1;
+ }
+ /* ring_size must be less-than or equal to MBUF_RING_SIZE
+ * to avoid overwriting bufs
+ */
+ if (ring_size > MBUF_RING_SIZE) {
+				printf("Max ring_size is %d, setting ring_size to max\n",
+ MBUF_RING_SIZE);
+ ring_size = MBUF_RING_SIZE;
+ }
+ break;
+
+ case 'i':
+ stats_interval = atoi(optarg);
+ if (stats_interval == 0) {
+ printf("Invalid stats interval, setting to 1\n");
+ stats_interval = 1; /* set to default */
+ }
+ break;
+
+ /* long options */
+ case 0:
+ break;
+
+ default:
+ dma_usage(prgname);
+ return -1;
+ }
+ }
+
+ printf("MAC updating %s\n", mac_updating ? "enabled" : "disabled");
+ if (optind >= 0)
+ argv[optind - 1] = prgname;
+
+ ret = optind - 1;
+ optind = 1; /* reset getopt lib */
+ return ret;
+}
+
+/* check link status, return true if at least one port is up */
+static int
+check_link_status(uint32_t port_mask)
+{
+ uint16_t portid;
+ struct rte_eth_link link;
+ int ret, link_status = 0;
+ char link_status_text[RTE_ETH_LINK_MAX_STR_LEN];
+
+ printf("\nChecking link status\n");
+ RTE_ETH_FOREACH_DEV(portid) {
+ if ((port_mask & (1 << portid)) == 0)
+ continue;
+
+ memset(&link, 0, sizeof(link));
+ ret = rte_eth_link_get(portid, &link);
+ if (ret < 0) {
+ printf("Port %u link get failed: err=%d\n",
+ portid, ret);
+ continue;
+ }
+
+ /* Print link status */
+ rte_eth_link_to_str(link_status_text,
+ sizeof(link_status_text), &link);
+ printf("Port %d %s\n", portid, link_status_text);
+
+ if (link.link_status)
+ link_status = 1;
+ }
+ return link_status;
+}
+
+/* Configuration of device. 8< */
+static void
+configure_dmadev_queue(uint32_t dev_id)
+{
+ struct rte_dma_info info;
+ struct rte_dma_conf dev_config = { .nb_vchans = 1 };
+ struct rte_dma_vchan_conf qconf = {
+ .direction = RTE_DMA_DIR_MEM_TO_MEM,
+ .nb_desc = ring_size
+ };
+ uint16_t vchan = 0;
+
+ if (rte_dma_configure(dev_id, &dev_config) != 0)
+ rte_exit(EXIT_FAILURE, "Error with rte_dma_configure()\n");
+
+ if (rte_dma_vchan_setup(dev_id, vchan, &qconf) != 0) {
+ printf("Error with queue configuration\n");
+ rte_panic();
+ }
+ rte_dma_info_get(dev_id, &info);
+ if (info.nb_vchans != 1) {
+ printf("Error, no configured queues reported on device id %u\n", dev_id);
+ rte_panic();
+ }
+ if (rte_dma_start(dev_id) != 0)
+ rte_exit(EXIT_FAILURE, "Error with rte_dma_start()\n");
+}
+/* >8 End of configuration of device. */
+
+/* Using dmadev API functions. 8< */
+static void
+assign_dmadevs(void)
+{
+ uint16_t nb_dmadev = 0;
+ int16_t dev_id = rte_dma_next_dev(0);
+ uint32_t i, j;
+
+ for (i = 0; i < cfg.nb_ports; i++) {
+ for (j = 0; j < cfg.ports[i].nb_queues; j++) {
+ if (dev_id == -1)
+ goto end;
+
+ cfg.ports[i].dmadev_ids[j] = dev_id;
+ configure_dmadev_queue(cfg.ports[i].dmadev_ids[j]);
+ dev_id = rte_dma_next_dev(dev_id + 1);
+ ++nb_dmadev;
+ }
+ }
+end:
+ if (nb_dmadev < cfg.nb_ports * cfg.ports[0].nb_queues)
+ rte_exit(EXIT_FAILURE,
+ "Not enough dmadevs (%u) for all queues (%u).\n",
+ nb_dmadev, cfg.nb_ports * cfg.ports[0].nb_queues);
+ RTE_LOG(INFO, DMA, "Number of used dmadevs: %u.\n", nb_dmadev);
+}
+/* >8 End of using dmadev API functions. */
+
+/* Assign ring structures for packet exchanging. 8< */
+static void
+assign_rings(void)
+{
+ uint32_t i;
+
+ for (i = 0; i < cfg.nb_ports; i++) {
+ char ring_name[RTE_RING_NAMESIZE];
+
+ snprintf(ring_name, sizeof(ring_name), "rx_to_tx_ring_%u", i);
+ /* Create ring for inter core communication */
+ cfg.ports[i].rx_to_tx_ring = rte_ring_create(
+ ring_name, ring_size,
+ rte_socket_id(), RING_F_SP_ENQ | RING_F_SC_DEQ);
+
+ if (cfg.ports[i].rx_to_tx_ring == NULL)
+ rte_exit(EXIT_FAILURE, "Ring create failed: %s\n",
+ rte_strerror(rte_errno));
+ }
+}
+/* >8 End of assigning ring structures for packet exchanging. */
+
+/*
+ * Initializes a given port using global settings and with the RX buffers
+ * coming from the mbuf_pool passed as a parameter.
+ */
+static inline void
+port_init(uint16_t portid, struct rte_mempool *mbuf_pool, uint16_t nb_queues)
+{
+ /* Configuring port to use RSS for multiple RX queues. 8< */
+ static const struct rte_eth_conf port_conf = {
+ .rxmode = {
+ .mq_mode = RTE_ETH_MQ_RX_RSS,
+ },
+ .rx_adv_conf = {
+ .rss_conf = {
+ .rss_key = NULL,
+ .rss_hf = RTE_ETH_RSS_PROTO_MASK,
+ }
+ }
+ };
+ /* >8 End of configuring port to use RSS for multiple RX queues. */
+
+ struct rte_eth_rxconf rxq_conf;
+ struct rte_eth_txconf txq_conf;
+ struct rte_eth_conf local_port_conf = port_conf;
+ struct rte_eth_dev_info dev_info;
+ int ret, i;
+
+ if (max_frame_size > local_port_conf.rxmode.mtu)
+ local_port_conf.rxmode.mtu = max_frame_size;
+
+ /* Skip ports that are not enabled */
+ if ((dma_enabled_port_mask & (1 << portid)) == 0) {
+ printf("Skipping disabled port %u\n", portid);
+ return;
+ }
+
+ /* Init port */
+ printf("Initializing port %u... ", portid);
+ fflush(stdout);
+ ret = rte_eth_dev_info_get(portid, &dev_info);
+ if (ret < 0)
+ rte_exit(EXIT_FAILURE, "Cannot get device info: %s, port=%u\n",
+ rte_strerror(-ret), portid);
+
+ local_port_conf.rx_adv_conf.rss_conf.rss_hf &=
+ dev_info.flow_type_rss_offloads;
+ ret = rte_eth_dev_configure(portid, nb_queues, 1, &local_port_conf);
+ if (ret < 0)
+ rte_exit(EXIT_FAILURE, "Cannot configure device:"
+ " err=%d, port=%u\n", ret, portid);
+
+ ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd,
+ &nb_txd);
+ if (ret < 0)
+ rte_exit(EXIT_FAILURE,
+ "Cannot adjust number of descriptors: err=%d, port=%u\n",
+ ret, portid);
+
+ rte_eth_macaddr_get(portid, &dma_ports_eth_addr[portid]);
+
+ /* Init RX queues */
+ rxq_conf = dev_info.default_rxconf;
+ rxq_conf.offloads = local_port_conf.rxmode.offloads;
+ for (i = 0; i < nb_queues; i++) {
+ ret = rte_eth_rx_queue_setup(portid, i, nb_rxd,
+ rte_eth_dev_socket_id(portid), &rxq_conf,
+ mbuf_pool);
+ if (ret < 0)
+ rte_exit(EXIT_FAILURE,
+ "rte_eth_rx_queue_setup:err=%d,port=%u, queue_id=%u\n",
+ ret, portid, i);
+ }
+
+ /* Init one TX queue on each port */
+ txq_conf = dev_info.default_txconf;
+ txq_conf.offloads = local_port_conf.txmode.offloads;
+ ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
+ rte_eth_dev_socket_id(portid),
+ &txq_conf);
+ if (ret < 0)
+ rte_exit(EXIT_FAILURE,
+ "rte_eth_tx_queue_setup:err=%d,port=%u\n",
+ ret, portid);
+
+ /* Initialize TX buffers */
+ tx_buffer[portid] = rte_zmalloc_socket("tx_buffer",
+ RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0,
+ rte_eth_dev_socket_id(portid));
+ if (tx_buffer[portid] == NULL)
+ rte_exit(EXIT_FAILURE,
+ "Cannot allocate buffer for tx on port %u\n",
+ portid);
+
+ rte_eth_tx_buffer_init(tx_buffer[portid], MAX_PKT_BURST);
+
+ ret = rte_eth_tx_buffer_set_err_callback(tx_buffer[portid],
+ rte_eth_tx_buffer_count_callback,
+ &port_statistics.tx_dropped[portid]);
+ if (ret < 0)
+ rte_exit(EXIT_FAILURE,
+ "Cannot set error callback for tx buffer on port %u\n",
+ portid);
+
+ /* Start device. 8< */
+ ret = rte_eth_dev_start(portid);
+ if (ret < 0)
+ rte_exit(EXIT_FAILURE,
+ "rte_eth_dev_start:err=%d, port=%u\n",
+ ret, portid);
+ /* >8 End of starting device. */
+
+ /* RX port is set in promiscuous mode. 8< */
+ rte_eth_promiscuous_enable(portid);
+ /* >8 End of RX port is set in promiscuous mode. */
+
+ printf("Port %u, MAC address: " RTE_ETHER_ADDR_PRT_FMT "\n\n",
+ portid,
+ RTE_ETHER_ADDR_BYTES(&dma_ports_eth_addr[portid]));
+
+ cfg.ports[cfg.nb_ports].rxtx_port = portid;
+ cfg.ports[cfg.nb_ports++].nb_queues = nb_queues;
+}
+
+/* Get a device dump for each device being used by the application */
+static void
+dmadev_dump(void)
+{
+ uint32_t i, j;
+
+ if (copy_mode != COPY_MODE_DMA_NUM)
+ return;
+
+ for (i = 0; i < cfg.nb_ports; i++)
+ for (j = 0; j < cfg.ports[i].nb_queues; j++)
+ rte_dma_dump(cfg.ports[i].dmadev_ids[j], stdout);
+}
+
+static void
+signal_handler(int signum)
+{
+ if (signum == SIGINT || signum == SIGTERM) {
+ printf("\n\nSignal %d received, preparing to exit...\n",
+ signum);
+ force_quit = true;
+ } else if (signum == SIGUSR1) {
+ dmadev_dump();
+ }
+}
+
+int
+main(int argc, char **argv)
+{
+ int ret;
+ uint16_t nb_ports, portid;
+ uint32_t i;
+ unsigned int nb_mbufs;
+ size_t sz;
+
+ /* Init EAL. 8< */
+ ret = rte_eal_init(argc, argv);
+ if (ret < 0)
+ rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
+ /* >8 End of init EAL. */
+ argc -= ret;
+ argv += ret;
+
+ force_quit = false;
+ signal(SIGINT, signal_handler);
+ signal(SIGTERM, signal_handler);
+ signal(SIGUSR1, signal_handler);
+
+ nb_ports = rte_eth_dev_count_avail();
+ if (nb_ports == 0)
+ rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
+
+ /* Parse application arguments (after the EAL ones) */
+ ret = dma_parse_args(argc, argv, nb_ports);
+ if (ret < 0)
+ rte_exit(EXIT_FAILURE, "Invalid DMA arguments\n");
+
+ /* Allocates mempool to hold the mbufs. 8< */
+ nb_mbufs = RTE_MAX(nb_ports * (nb_queues * (nb_rxd + nb_txd +
+ 4 * MAX_PKT_BURST + ring_size) + ring_size +
+ rte_lcore_count() * MEMPOOL_CACHE_SIZE),
+ MIN_POOL_SIZE);
+
+ /* Create the mbuf pool */
+ sz = max_frame_size + RTE_PKTMBUF_HEADROOM;
+ sz = RTE_MAX(sz, (size_t)RTE_MBUF_DEFAULT_BUF_SIZE);
+ dma_pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", nb_mbufs,
+ MEMPOOL_CACHE_SIZE, 0, sz, rte_socket_id());
+ if (dma_pktmbuf_pool == NULL)
+ rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n");
+ /* >8 End of allocates mempool to hold the mbufs. */
+
+ /* Initialize each port. 8< */
+ cfg.nb_ports = 0;
+ RTE_ETH_FOREACH_DEV(portid)
+ port_init(portid, dma_pktmbuf_pool, nb_queues);
+ /* >8 End of initializing each port. */
+
+ /* Initialize port xstats */
+ memset(&port_statistics, 0, sizeof(port_statistics));
+
+ /* Assigning each port resources. 8< */
+ while (!check_link_status(dma_enabled_port_mask) && !force_quit)
+ sleep(1);
+
+ /* Check if there is enough lcores for all ports. */
+ cfg.nb_lcores = rte_lcore_count() - 1;
+ if (cfg.nb_lcores < 1)
+ rte_exit(EXIT_FAILURE,
+ "There should be at least one worker lcore.\n");
+
+ if (copy_mode == COPY_MODE_DMA_NUM)
+ assign_dmadevs();
+
+ assign_rings();
+ /* >8 End of assigning each port resources. */
+
+ start_forwarding_cores();
+ /* main core prints stats while other cores forward */
+ print_stats(argv[0]);
+
+ /* force_quit is true when we get here */
+ rte_eal_mp_wait_lcore();
+
+ uint32_t j;
+ for (i = 0; i < cfg.nb_ports; i++) {
+ printf("Closing port %d\n", cfg.ports[i].rxtx_port);
+ ret = rte_eth_dev_stop(cfg.ports[i].rxtx_port);
+ if (ret != 0)
+ RTE_LOG(ERR, DMA, "rte_eth_dev_stop: err=%s, port=%u\n",
+ rte_strerror(-ret), cfg.ports[i].rxtx_port);
+
+ rte_eth_dev_close(cfg.ports[i].rxtx_port);
+ if (copy_mode == COPY_MODE_DMA_NUM) {
+ for (j = 0; j < cfg.ports[i].nb_queues; j++) {
+ printf("Stopping dmadev %d\n",
+ cfg.ports[i].dmadev_ids[j]);
+ rte_dma_stop(cfg.ports[i].dmadev_ids[j]);
+ }
+ } else /* copy_mode == COPY_MODE_SW_NUM */
+ rte_ring_free(cfg.ports[i].rx_to_tx_ring);
+ }
+
+ /* clean up the EAL */
+ rte_eal_cleanup();
+
+ printf("Bye...\n");
+ return 0;
+}
diff --git a/examples/dma/meson.build b/examples/dma/meson.build
new file mode 100644
index 000000000..f70b5d349
--- /dev/null
+++ b/examples/dma/meson.build
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019-2021 Intel Corporation
+
+# meson file, for building this example as part of a main DPDK build.
+#
+# To build this example as a standalone application with an already-installed
+# DPDK instance, use 'make'
+
+allow_experimental_apis = true
+
+deps += 'dmadev'
+
+sources = files(
+ 'dmafwd.c'
+)
diff --git a/examples/meson.build b/examples/meson.build
index 46ec80919..6c57db163 100644
--- a/examples/meson.build
+++ b/examples/meson.build
@@ -12,7 +12,7 @@ execinfo = cc.find_library('execinfo', required: false)
all_examples = [
'bbdev_app', 'bond',
'cmdline',
- 'distributor', 'ethtool',
+ 'distributor', 'dma', 'ethtool',
'eventdev_pipeline',
'fips_validation', 'flow_classify',
'flow_filtering', 'helloworld',
diff --git a/lib/librte_dmadev/meson.build b/lib/librte_dmadev/meson.build
new file mode 100644
index 000000000..8d2ed5261
--- /dev/null
+++ b/lib/librte_dmadev/meson.build
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2021 HiSilicon Limited.
+
+sources = files('rte_dmadev.c')
+headers = files('rte_dmadev.h', 'rte_dmadev_core.h', 'rte_dmadev_pmd.h')
diff --git a/lib/librte_dmadev/rte_dmadev.c b/lib/librte_dmadev/rte_dmadev.c
new file mode 100644
index 000000000..7097fe41a
--- /dev/null
+++ b/lib/librte_dmadev/rte_dmadev.c
@@ -0,0 +1,866 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#include <inttypes.h>
+
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_memzone.h>
+#include <rte_string_fns.h>
+
+#include "rte_dmadev.h"
+#include "rte_dmadev_pmd.h"
+
+static int16_t dma_devices_max;
+
+struct rte_dma_fp_object *rte_dma_fp_objs;
+static struct rte_dma_dev *rte_dma_devices;
+static struct {
+ /* Hold the dev_max information of the primary process. This field is
+ * set by the primary process and is read by the secondary process.
+ */
+ int16_t dev_max;
+ struct rte_dma_dev_data data[0];
+} *dma_devices_shared_data;
+
+RTE_LOG_REGISTER(rte_dma_logtype, lib.dma, INFO);
+#define RTE_DMA_LOG(level, ...) \
+ rte_log(RTE_LOG_ ## level, rte_dma_logtype, RTE_FMT("dma: " \
+ RTE_FMT_HEAD(__VA_ARGS__,) "\n", RTE_FMT_TAIL(__VA_ARGS__,)))
+
+int
+rte_dma_dev_max(size_t dev_max)
+{
+ /* This function may be called before rte_eal_init(), so no rte library
+ * function can be called in this function.
+ */
+ if (dev_max == 0 || dev_max > INT16_MAX)
+ return -EINVAL;
+
+ if (dma_devices_max > 0)
+ return -EINVAL;
+
+ dma_devices_max = dev_max;
+
+ return 0;
+}
+
+int16_t
+rte_dma_next_dev(int16_t start_dev_id)
+{
+ int16_t dev_id = start_dev_id;
+ while (dev_id < dma_devices_max && rte_dma_devices[dev_id].state == RTE_DMA_DEV_UNUSED)
+ dev_id++;
+
+ if (dev_id < dma_devices_max)
+ return dev_id;
+
+ return -1;
+}
+
+static int
+dma_check_name(const char *name)
+{
+ size_t name_len;
+
+ if (name == NULL) {
+ RTE_DMA_LOG(ERR, "Name can't be NULL");
+ return -EINVAL;
+ }
+
+ name_len = strnlen(name, RTE_DEV_NAME_MAX_LEN);
+ if (name_len == 0) {
+ RTE_DMA_LOG(ERR, "Zero length DMA device name");
+ return -EINVAL;
+ }
+ if (name_len >= RTE_DEV_NAME_MAX_LEN) {
+ RTE_DMA_LOG(ERR, "DMA device name is too long");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int16_t
+dma_find_free_id(void)
+{
+ int16_t i;
+
+ if (rte_dma_devices == NULL || dma_devices_shared_data == NULL)
+ return -1;
+
+ for (i = 0; i < dma_devices_max; i++) {
+ if (dma_devices_shared_data->data[i].dev_name[0] == '\0')
+ return i;
+ }
+
+ return -1;
+}
+
+static struct rte_dma_dev*
+dma_find_by_name(const char *name)
+{
+ int16_t i;
+
+ if (rte_dma_devices == NULL)
+ return NULL;
+
+ for (i = 0; i < dma_devices_max; i++) {
+ if ((rte_dma_devices[i].state != RTE_DMA_DEV_UNUSED) &&
+ (!strcmp(name, rte_dma_devices[i].data->dev_name)))
+ return &rte_dma_devices[i];
+ }
+
+ return NULL;
+}
+
+static void dma_fp_object_dummy(struct rte_dma_fp_object *obj);
+
+static int
+dma_fp_data_prepare(void)
+{
+ size_t size;
+ void *ptr;
+ int i;
+
+ if (rte_dma_fp_objs != NULL)
+ return 0;
+
+	/* Fast-path objects must be cache-line aligned, but the pointer
+	 * returned by malloc may not be. Therefore, extra memory is allocated
+	 * so that the pointer can be realigned.
+	 * Note: posix_memalign/aligned_alloc are not called here because their
+	 * availability depends on the libc version.
+	 */
+ size = dma_devices_max * sizeof(struct rte_dma_fp_object) +
+ RTE_CACHE_LINE_SIZE;
+ ptr = malloc(size);
+ if (ptr == NULL)
+ return -ENOMEM;
+ memset(ptr, 0, size);
+
+ rte_dma_fp_objs = RTE_PTR_ALIGN(ptr, RTE_CACHE_LINE_SIZE);
+ for (i = 0; i < dma_devices_max; i++)
+ dma_fp_object_dummy(&rte_dma_fp_objs[i]);
+
+ return 0;
+}
+
+static int
+dma_dev_data_prepare(void)
+{
+ size_t size;
+
+ if (rte_dma_devices != NULL)
+ return 0;
+
+ size = dma_devices_max * sizeof(struct rte_dma_dev);
+ rte_dma_devices = malloc(size);
+ if (rte_dma_devices == NULL)
+ return -ENOMEM;
+ memset(rte_dma_devices, 0, size);
+
+ return 0;
+}
+
+static int
+dma_shared_data_prepare(void)
+{
+ const char *mz_name = "rte_dma_dev_data";
+ const struct rte_memzone *mz;
+ size_t size;
+
+ if (dma_devices_shared_data != NULL)
+ return 0;
+
+ size = sizeof(*dma_devices_shared_data) +
+ sizeof(struct rte_dma_dev_data) * dma_devices_max;
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ mz = rte_memzone_reserve(mz_name, size, rte_socket_id(), 0);
+ else
+ mz = rte_memzone_lookup(mz_name);
+ if (mz == NULL)
+ return -ENOMEM;
+
+ dma_devices_shared_data = mz->addr;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ memset(dma_devices_shared_data, 0, size);
+ dma_devices_shared_data->dev_max = dma_devices_max;
+ } else {
+ dma_devices_max = dma_devices_shared_data->dev_max;
+ }
+
+ return 0;
+}
+
+static int
+dma_data_prepare(void)
+{
+ int ret;
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ if (dma_devices_max == 0)
+ dma_devices_max = RTE_DMADEV_DEFAULT_MAX;
+ ret = dma_fp_data_prepare();
+ if (ret)
+ return ret;
+ ret = dma_dev_data_prepare();
+ if (ret)
+ return ret;
+ ret = dma_shared_data_prepare();
+ if (ret)
+ return ret;
+ } else {
+ ret = dma_shared_data_prepare();
+ if (ret)
+ return ret;
+ ret = dma_fp_data_prepare();
+ if (ret)
+ return ret;
+ ret = dma_dev_data_prepare();
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static struct rte_dma_dev *
+dma_allocate_primary(const char *name, int numa_node, size_t private_data_size)
+{
+ struct rte_dma_dev *dev;
+ void *dev_private;
+ int16_t dev_id;
+ int ret;
+
+ ret = dma_data_prepare();
+ if (ret < 0) {
+ RTE_DMA_LOG(ERR, "Cannot initialize dmadevs data");
+ return NULL;
+ }
+
+ dev = dma_find_by_name(name);
+ if (dev != NULL) {
+ RTE_DMA_LOG(ERR, "DMA device already allocated");
+ return NULL;
+ }
+
+ dev_private = rte_zmalloc_socket(name, private_data_size,
+ RTE_CACHE_LINE_SIZE, numa_node);
+ if (dev_private == NULL) {
+ RTE_DMA_LOG(ERR, "Cannot allocate private data");
+ return NULL;
+ }
+
+ dev_id = dma_find_free_id();
+ if (dev_id < 0) {
+ RTE_DMA_LOG(ERR, "Reached maximum number of DMA devices");
+ rte_free(dev_private);
+ return NULL;
+ }
+
+ dev = &rte_dma_devices[dev_id];
+ dev->data = &dma_devices_shared_data->data[dev_id];
+ rte_strscpy(dev->data->dev_name, name, sizeof(dev->data->dev_name));
+ dev->data->dev_id = dev_id;
+ dev->data->numa_node = numa_node;
+ dev->data->dev_private = dev_private;
+
+ return dev;
+}
+
+static struct rte_dma_dev *
+dma_attach_secondary(const char *name)
+{
+ struct rte_dma_dev *dev;
+ int16_t i;
+ int ret;
+
+ ret = dma_data_prepare();
+ if (ret < 0) {
+ RTE_DMA_LOG(ERR, "Cannot initialize dmadevs data");
+ return NULL;
+ }
+
+ for (i = 0; i < dma_devices_max; i++) {
+ if (!strcmp(dma_devices_shared_data->data[i].dev_name, name))
+ break;
+ }
+ if (i == dma_devices_max) {
+ RTE_DMA_LOG(ERR,
+ "Device %s is not driven by the primary process",
+ name);
+ return NULL;
+ }
+
+ dev = &rte_dma_devices[i];
+ dev->data = &dma_devices_shared_data->data[i];
+
+ return dev;
+}
+
+static struct rte_dma_dev *
+dma_allocate(const char *name, int numa_node, size_t private_data_size)
+{
+ struct rte_dma_dev *dev;
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ dev = dma_allocate_primary(name, numa_node, private_data_size);
+ else
+ dev = dma_attach_secondary(name);
+
+ if (dev) {
+ dev->fp_obj = &rte_dma_fp_objs[dev->data->dev_id];
+ dma_fp_object_dummy(dev->fp_obj);
+ }
+
+ return dev;
+}
+
+static void
+dma_release(struct rte_dma_dev *dev)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_free(dev->data->dev_private);
+ memset(dev->data, 0, sizeof(struct rte_dma_dev_data));
+ }
+
+ dma_fp_object_dummy(dev->fp_obj);
+ memset(dev, 0, sizeof(struct rte_dma_dev));
+}
+
+struct rte_dma_dev *
+rte_dma_pmd_allocate(const char *name, int numa_node, size_t private_data_size)
+{
+ struct rte_dma_dev *dev;
+
+ if (dma_check_name(name) != 0 || private_data_size == 0)
+ return NULL;
+
+ dev = dma_allocate(name, numa_node, private_data_size);
+ if (dev == NULL)
+ return NULL;
+
+ dev->state = RTE_DMA_DEV_REGISTERED;
+
+ return dev;
+}
+
+int
+rte_dma_pmd_release(const char *name)
+{
+ struct rte_dma_dev *dev;
+
+ if (dma_check_name(name) != 0)
+ return -EINVAL;
+
+ dev = dma_find_by_name(name);
+ if (dev == NULL)
+ return -EINVAL;
+
+ if (dev->state == RTE_DMA_DEV_READY)
+ return rte_dma_close(dev->data->dev_id);
+
+ dma_release(dev);
+ return 0;
+}
+
+int
+rte_dma_get_dev_id_by_name(const char *name)
+{
+ struct rte_dma_dev *dev;
+
+ if (dma_check_name(name) != 0)
+ return -EINVAL;
+
+ dev = dma_find_by_name(name);
+ if (dev == NULL)
+ return -EINVAL;
+
+ return dev->data->dev_id;
+}
+
+bool
+rte_dma_is_valid(int16_t dev_id)
+{
+ return (dev_id >= 0) && (dev_id < dma_devices_max) &&
+ rte_dma_devices != NULL &&
+ rte_dma_devices[dev_id].state != RTE_DMA_DEV_UNUSED;
+}
+
+uint16_t
+rte_dma_count_avail(void)
+{
+ uint16_t count = 0;
+ uint16_t i;
+
+ if (rte_dma_devices == NULL)
+ return count;
+
+ for (i = 0; i < dma_devices_max; i++) {
+ if (rte_dma_devices[i].state != RTE_DMA_DEV_UNUSED)
+ count++;
+ }
+
+ return count;
+}
+
+int
+rte_dma_info_get(int16_t dev_id, struct rte_dma_info *dev_info)
+{
+ const struct rte_dma_dev *dev = &rte_dma_devices[dev_id];
+ int ret;
+
+ if (!rte_dma_is_valid(dev_id) || dev_info == NULL)
+ return -EINVAL;
+
+ RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_info_get, -ENOTSUP);
+ memset(dev_info, 0, sizeof(struct rte_dma_info));
+ ret = (*dev->dev_ops->dev_info_get)(dev, dev_info,
+ sizeof(struct rte_dma_info));
+ if (ret != 0)
+ return ret;
+
+ dev_info->dev_name = dev->data->dev_name;
+ dev_info->numa_node = dev->device->numa_node;
+ dev_info->nb_vchans = dev->data->dev_conf.nb_vchans;
+
+ return 0;
+}
+
+int
+rte_dma_configure(int16_t dev_id, const struct rte_dma_conf *dev_conf)
+{
+ struct rte_dma_dev *dev = &rte_dma_devices[dev_id];
+ struct rte_dma_info dev_info;
+ int ret;
+
+ if (!rte_dma_is_valid(dev_id) || dev_conf == NULL)
+ return -EINVAL;
+
+ if (dev->data->dev_started != 0) {
+ RTE_DMA_LOG(ERR,
+ "Device %d must be stopped to allow configuration",
+ dev_id);
+ return -EBUSY;
+ }
+
+ ret = rte_dma_info_get(dev_id, &dev_info);
+ if (ret != 0) {
+ RTE_DMA_LOG(ERR, "Device %d get device info fail", dev_id);
+ return -EINVAL;
+ }
+ if (dev_conf->nb_vchans == 0) {
+ RTE_DMA_LOG(ERR,
+ "Device %d configure zero vchans", dev_id);
+ return -EINVAL;
+ }
+ if (dev_conf->nb_vchans > dev_info.max_vchans) {
+ RTE_DMA_LOG(ERR,
+ "Device %d configure too many vchans", dev_id);
+ return -EINVAL;
+ }
+ if (dev_conf->enable_silent &&
+ !(dev_info.dev_capa & RTE_DMA_CAPA_SILENT)) {
+		RTE_DMA_LOG(ERR, "Device %d doesn't support silent", dev_id);
+ return -EINVAL;
+ }
+
+ RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_configure, -ENOTSUP);
+ ret = (*dev->dev_ops->dev_configure)(dev, dev_conf,
+ sizeof(struct rte_dma_conf));
+ if (ret == 0)
+ memcpy(&dev->data->dev_conf, dev_conf,
+ sizeof(struct rte_dma_conf));
+
+ return ret;
+}
+
+int
+rte_dma_start(int16_t dev_id)
+{
+ struct rte_dma_dev *dev = &rte_dma_devices[dev_id];
+ int ret;
+
+ if (!rte_dma_is_valid(dev_id))
+ return -EINVAL;
+
+ if (dev->data->dev_conf.nb_vchans == 0) {
+ RTE_DMA_LOG(ERR, "Device %d must be configured first", dev_id);
+ return -EINVAL;
+ }
+
+ if (dev->data->dev_started != 0) {
+ RTE_DMA_LOG(WARNING, "Device %d already started", dev_id);
+ return 0;
+ }
+
+ if (dev->dev_ops->dev_start == NULL)
+ goto mark_started;
+
+ ret = (*dev->dev_ops->dev_start)(dev);
+ if (ret != 0)
+ return ret;
+
+mark_started:
+ dev->data->dev_started = 1;
+ return 0;
+}
+
+int
+rte_dma_stop(int16_t dev_id)
+{
+ struct rte_dma_dev *dev = &rte_dma_devices[dev_id];
+ int ret;
+
+ if (!rte_dma_is_valid(dev_id))
+ return -EINVAL;
+
+ if (dev->data->dev_started == 0) {
+ RTE_DMA_LOG(WARNING, "Device %d already stopped", dev_id);
+ return 0;
+ }
+
+ if (dev->dev_ops->dev_stop == NULL)
+ goto mark_stopped;
+
+ ret = (*dev->dev_ops->dev_stop)(dev);
+ if (ret != 0)
+ return ret;
+
+mark_stopped:
+ dev->data->dev_started = 0;
+ return 0;
+}
+
+int
+rte_dma_close(int16_t dev_id)
+{
+ struct rte_dma_dev *dev = &rte_dma_devices[dev_id];
+ int ret;
+
+ if (!rte_dma_is_valid(dev_id))
+ return -EINVAL;
+
+ /* Device must be stopped before it can be closed */
+ if (dev->data->dev_started == 1) {
+ RTE_DMA_LOG(ERR,
+ "Device %d must be stopped before closing", dev_id);
+ return -EBUSY;
+ }
+
+ RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->dev_close, -ENOTSUP);
+ ret = (*dev->dev_ops->dev_close)(dev);
+ if (ret == 0)
+ dma_release(dev);
+
+ return ret;
+}
+
+int
+rte_dma_vchan_setup(int16_t dev_id, uint16_t vchan,
+ const struct rte_dma_vchan_conf *conf)
+{
+ struct rte_dma_dev *dev = &rte_dma_devices[dev_id];
+ struct rte_dma_info dev_info;
+ bool src_is_dev, dst_is_dev;
+ int ret;
+
+ if (!rte_dma_is_valid(dev_id) || conf == NULL)
+ return -EINVAL;
+
+ if (dev->data->dev_started != 0) {
+ RTE_DMA_LOG(ERR,
+ "Device %d must be stopped to allow configuration",
+ dev_id);
+ return -EBUSY;
+ }
+
+ ret = rte_dma_info_get(dev_id, &dev_info);
+ if (ret != 0) {
+ RTE_DMA_LOG(ERR, "Device %d get device info fail", dev_id);
+ return -EINVAL;
+ }
+ if (dev->data->dev_conf.nb_vchans == 0) {
+ RTE_DMA_LOG(ERR, "Device %d must be configured first", dev_id);
+ return -EINVAL;
+ }
+ if (vchan >= dev_info.nb_vchans) {
+		RTE_DMA_LOG(ERR, "Device %d vchan out of range!", dev_id);
+ return -EINVAL;
+ }
+ if (conf->direction != RTE_DMA_DIR_MEM_TO_MEM &&
+ conf->direction != RTE_DMA_DIR_MEM_TO_DEV &&
+ conf->direction != RTE_DMA_DIR_DEV_TO_MEM &&
+ conf->direction != RTE_DMA_DIR_DEV_TO_DEV) {
+ RTE_DMA_LOG(ERR, "Device %d direction invalid!", dev_id);
+ return -EINVAL;
+ }
+ if (conf->direction == RTE_DMA_DIR_MEM_TO_MEM &&
+ !(dev_info.dev_capa & RTE_DMA_CAPA_MEM_TO_MEM)) {
+ RTE_DMA_LOG(ERR,
+			"Device %d doesn't support mem2mem transfer", dev_id);
+ return -EINVAL;
+ }
+ if (conf->direction == RTE_DMA_DIR_MEM_TO_DEV &&
+ !(dev_info.dev_capa & RTE_DMA_CAPA_MEM_TO_DEV)) {
+ RTE_DMA_LOG(ERR,
+			"Device %d doesn't support mem2dev transfer", dev_id);
+ return -EINVAL;
+ }
+ if (conf->direction == RTE_DMA_DIR_DEV_TO_MEM &&
+ !(dev_info.dev_capa & RTE_DMA_CAPA_DEV_TO_MEM)) {
+ RTE_DMA_LOG(ERR,
+			"Device %d doesn't support dev2mem transfer", dev_id);
+ return -EINVAL;
+ }
+ if (conf->direction == RTE_DMA_DIR_DEV_TO_DEV &&
+ !(dev_info.dev_capa & RTE_DMA_CAPA_DEV_TO_DEV)) {
+ RTE_DMA_LOG(ERR,
+			"Device %d doesn't support dev2dev transfer", dev_id);
+ return -EINVAL;
+ }
+ if (conf->nb_desc < dev_info.min_desc ||
+ conf->nb_desc > dev_info.max_desc) {
+ RTE_DMA_LOG(ERR,
+ "Device %d number of descriptors invalid", dev_id);
+ return -EINVAL;
+ }
+ src_is_dev = conf->direction == RTE_DMA_DIR_DEV_TO_MEM ||
+ conf->direction == RTE_DMA_DIR_DEV_TO_DEV;
+ if ((conf->src_port.port_type == RTE_DMA_PORT_NONE && src_is_dev) ||
+ (conf->src_port.port_type != RTE_DMA_PORT_NONE && !src_is_dev)) {
+ RTE_DMA_LOG(ERR, "Device %d source port type invalid", dev_id);
+ return -EINVAL;
+ }
+ dst_is_dev = conf->direction == RTE_DMA_DIR_MEM_TO_DEV ||
+ conf->direction == RTE_DMA_DIR_DEV_TO_DEV;
+ if ((conf->dst_port.port_type == RTE_DMA_PORT_NONE && dst_is_dev) ||
+ (conf->dst_port.port_type != RTE_DMA_PORT_NONE && !dst_is_dev)) {
+ RTE_DMA_LOG(ERR,
+ "Device %d destination port type invalid", dev_id);
+ return -EINVAL;
+ }
+
+ RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vchan_setup, -ENOTSUP);
+ return (*dev->dev_ops->vchan_setup)(dev, vchan, conf,
+ sizeof(struct rte_dma_vchan_conf));
+}
+
+int
+rte_dma_stats_get(int16_t dev_id, uint16_t vchan, struct rte_dma_stats *stats)
+{
+ const struct rte_dma_dev *dev = &rte_dma_devices[dev_id];
+
+ if (!rte_dma_is_valid(dev_id) || stats == NULL)
+ return -EINVAL;
+
+ if (vchan >= dev->data->dev_conf.nb_vchans &&
+ vchan != RTE_DMA_ALL_VCHAN) {
+ RTE_DMA_LOG(ERR,
+ "Device %d vchan %u out of range", dev_id, vchan);
+ return -EINVAL;
+ }
+
+ RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_get, -ENOTSUP);
+ memset(stats, 0, sizeof(struct rte_dma_stats));
+ return (*dev->dev_ops->stats_get)(dev, vchan, stats,
+ sizeof(struct rte_dma_stats));
+}
+
+int
+rte_dma_stats_reset(int16_t dev_id, uint16_t vchan)
+{
+ struct rte_dma_dev *dev = &rte_dma_devices[dev_id];
+
+ if (!rte_dma_is_valid(dev_id))
+ return -EINVAL;
+
+ if (vchan >= dev->data->dev_conf.nb_vchans &&
+ vchan != RTE_DMA_ALL_VCHAN) {
+ RTE_DMA_LOG(ERR,
+ "Device %d vchan %u out of range", dev_id, vchan);
+ return -EINVAL;
+ }
+
+ RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->stats_reset, -ENOTSUP);
+ return (*dev->dev_ops->stats_reset)(dev, vchan);
+}
+
+int
+rte_dma_vchan_status(int16_t dev_id, uint16_t vchan, enum rte_dma_vchan_status *status)
+{
+ struct rte_dma_dev *dev = &rte_dma_devices[dev_id];
+
+ if (!rte_dma_is_valid(dev_id))
+ return -EINVAL;
+
+ if (vchan >= dev->data->dev_conf.nb_vchans) {
+		RTE_DMA_LOG(ERR, "Device %d vchan %u out of range", dev_id, vchan);
+ return -EINVAL;
+ }
+
+ RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vchan_status, -ENOTSUP);
+ return (*dev->dev_ops->vchan_status)(dev, vchan, status);
+}
+
+static const char *
+dma_capability_name(uint64_t capability)
+{
+ static const struct {
+ uint64_t capability;
+ const char *name;
+ } capa_names[] = {
+ { RTE_DMA_CAPA_MEM_TO_MEM, "mem2mem" },
+ { RTE_DMA_CAPA_MEM_TO_DEV, "mem2dev" },
+ { RTE_DMA_CAPA_DEV_TO_MEM, "dev2mem" },
+ { RTE_DMA_CAPA_DEV_TO_DEV, "dev2dev" },
+ { RTE_DMA_CAPA_SVA, "sva" },
+ { RTE_DMA_CAPA_SILENT, "silent" },
+ { RTE_DMA_CAPA_HANDLES_ERRORS, "handles_errors" },
+ { RTE_DMA_CAPA_OPS_COPY, "copy" },
+ { RTE_DMA_CAPA_OPS_COPY_SG, "copy_sg" },
+ { RTE_DMA_CAPA_OPS_FILL, "fill" },
+ };
+
+ const char *name = "unknown";
+ uint32_t i;
+
+ for (i = 0; i < RTE_DIM(capa_names); i++) {
+ if (capability == capa_names[i].capability) {
+ name = capa_names[i].name;
+ break;
+ }
+ }
+
+ return name;
+}
+
+static void
+dma_dump_capability(FILE *f, uint64_t dev_capa)
+{
+ uint64_t capa;
+
+ (void)fprintf(f, " dev_capa: 0x%" PRIx64 " -", dev_capa);
+ while (dev_capa > 0) {
+ capa = 1ull << __builtin_ctzll(dev_capa);
+ (void)fprintf(f, " %s", dma_capability_name(capa));
+ dev_capa &= ~capa;
+ }
+ (void)fprintf(f, "\n");
+}
+
+int
+rte_dma_dump(int16_t dev_id, FILE *f)
+{
+ const struct rte_dma_dev *dev = &rte_dma_devices[dev_id];
+ struct rte_dma_info dev_info;
+ int ret;
+
+ if (!rte_dma_is_valid(dev_id) || f == NULL)
+ return -EINVAL;
+
+ ret = rte_dma_info_get(dev_id, &dev_info);
+ if (ret != 0) {
+ RTE_DMA_LOG(ERR, "Device %d get device info fail", dev_id);
+ return -EINVAL;
+ }
+
+ (void)fprintf(f, "DMA Dev %d, '%s' [%s]\n",
+ dev->data->dev_id,
+ dev->data->dev_name,
+ dev->data->dev_started ? "started" : "stopped");
+ dma_dump_capability(f, dev_info.dev_capa);
+ (void)fprintf(f, " max_vchans_supported: %u\n", dev_info.max_vchans);
+ (void)fprintf(f, " nb_vchans_configured: %u\n", dev_info.nb_vchans);
+ (void)fprintf(f, " silent_mode: %s\n",
+ dev->data->dev_conf.enable_silent ? "on" : "off");
+
+ if (dev->dev_ops->dev_dump != NULL)
+ return (*dev->dev_ops->dev_dump)(dev, f);
+
+ return 0;
+}
+
+static int
+dummy_copy(__rte_unused void *dev_private, __rte_unused uint16_t vchan,
+ __rte_unused rte_iova_t src, __rte_unused rte_iova_t dst,
+ __rte_unused uint32_t length, __rte_unused uint64_t flags)
+{
+ RTE_DMA_LOG(ERR, "copy is not configured or not supported.");
+ return -EINVAL;
+}
+
+static int
+dummy_copy_sg(__rte_unused void *dev_private, __rte_unused uint16_t vchan,
+ __rte_unused const struct rte_dma_sge *src,
+ __rte_unused const struct rte_dma_sge *dst,
+ __rte_unused uint16_t nb_src, __rte_unused uint16_t nb_dst,
+ __rte_unused uint64_t flags)
+{
+ RTE_DMA_LOG(ERR, "copy_sg is not configured or not supported.");
+ return -EINVAL;
+}
+
+static int
+dummy_fill(__rte_unused void *dev_private, __rte_unused uint16_t vchan,
+ __rte_unused uint64_t pattern, __rte_unused rte_iova_t dst,
+ __rte_unused uint32_t length, __rte_unused uint64_t flags)
+{
+ RTE_DMA_LOG(ERR, "fill is not configured or not supported.");
+ return -EINVAL;
+}
+
+static int
+dummy_submit(__rte_unused void *dev_private, __rte_unused uint16_t vchan)
+{
+ RTE_DMA_LOG(ERR, "submit is not configured or not supported.");
+ return -EINVAL;
+}
+
+static uint16_t
+dummy_completed(__rte_unused void *dev_private, __rte_unused uint16_t vchan,
+ __rte_unused const uint16_t nb_cpls,
+ __rte_unused uint16_t *last_idx, __rte_unused bool *has_error)
+{
+ RTE_DMA_LOG(ERR, "completed is not configured or not supported.");
+ return 0;
+}
+
+static uint16_t
+dummy_completed_status(__rte_unused void *dev_private,
+ __rte_unused uint16_t vchan,
+ __rte_unused const uint16_t nb_cpls,
+ __rte_unused uint16_t *last_idx,
+ __rte_unused enum rte_dma_status_code *status)
+{
+ RTE_DMA_LOG(ERR,
+ "completed_status is not configured or not supported.");
+ return 0;
+}
+
+static uint16_t
+dummy_burst_capacity(__rte_unused const void *dev_private,
+ __rte_unused uint16_t vchan)
+{
+ RTE_DMA_LOG(ERR, "burst_capacity is not configured or not supported.");
+ return 0;
+}
+
+static void
+dma_fp_object_dummy(struct rte_dma_fp_object *obj)
+{
+ obj->dev_private = NULL;
+ obj->copy = dummy_copy;
+ obj->copy_sg = dummy_copy_sg;
+ obj->fill = dummy_fill;
+ obj->submit = dummy_submit;
+ obj->completed = dummy_completed;
+ obj->completed_status = dummy_completed_status;
+ obj->burst_capacity = dummy_burst_capacity;
+}
diff --git a/lib/librte_dmadev/rte_dmadev.h b/lib/librte_dmadev/rte_dmadev.h
new file mode 100644
index 000000000..9942c6ec2
--- /dev/null
+++ b/lib/librte_dmadev/rte_dmadev.h
@@ -0,0 +1,1138 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited
+ * Copyright(c) 2021 Intel Corporation
+ * Copyright(c) 2021 Marvell International Ltd
+ * Copyright(c) 2021 SmartShare Systems
+ */
+
+#ifndef RTE_DMADEV_H
+#define RTE_DMADEV_H
+
+/**
+ * @file rte_dmadev.h
+ *
+ * DMA (Direct Memory Access) device API.
+ *
+ * The DMA framework is built on the following model:
+ *
+ * --------------- --------------- ---------------
+ * | virtual DMA | | virtual DMA | | virtual DMA |
+ * | channel | | channel | | channel |
+ * --------------- --------------- ---------------
+ * | | |
+ * ------------------ |
+ * | |
+ * ------------ ------------
+ * | dmadev | | dmadev |
+ * ------------ ------------
+ * | |
+ * ------------------ ------------------
+ * | HW DMA channel | | HW DMA channel |
+ * ------------------ ------------------
+ * | |
+ * --------------------------------
+ * |
+ * ---------------------
+ * | HW DMA Controller |
+ * ---------------------
+ *
+ * The DMA controller may have multiple HW-DMA-channels (a.k.a. HW-DMA-queues),
+ * and each HW-DMA-channel should be represented by a dmadev.
+ *
+ * The dmadev can create multiple virtual DMA channels, where each virtual DMA
+ * channel represents a different transfer context. DMA operation requests
+ * must be submitted to a virtual DMA channel. For example, an application
+ * could create virtual DMA channel 0 for the memory-to-memory transfer
+ * scenario and virtual DMA channel 1 for the memory-to-device transfer
+ * scenario.
+ *
+ * This framework uses 'int16_t dev_id' as the device identifier of a dmadev,
+ * and 'uint16_t vchan' as the virtual DMA channel identifier in one dmadev.
+ *
+ * The functions exported by the dmadev API to setup a device designated by its
+ * device identifier must be invoked in the following order:
+ * - rte_dma_configure()
+ * - rte_dma_vchan_setup()
+ * - rte_dma_start()
+ *
+ * Then, the application can invoke dataplane functions to process jobs.
+ *
+ * If the application wants to change the configuration (i.e. invoke
+ * rte_dma_configure() or rte_dma_vchan_setup()), it must invoke
+ * rte_dma_stop() first to stop the device and then do the reconfiguration
+ * before invoking rte_dma_start() again. The dataplane functions should not
+ * be invoked when the device is stopped.
+ *
+ * Finally, an application can close a dmadev by invoking the rte_dma_close()
+ * function.
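+ *
+ * A minimal control-path sketch following the rules above; the device id,
+ * vchan count, descriptor count and error handling below are illustrative
+ * assumptions rather than requirements of this API:
+ *
+ * \code{.c}
+ *    struct rte_dma_conf dev_conf = { .nb_vchans = 1 };
+ *    struct rte_dma_vchan_conf qconf = {
+ *            .direction = RTE_DMA_DIR_MEM_TO_MEM,
+ *            .nb_desc = 1024,
+ *    };
+ *
+ *    if (rte_dma_configure(dev_id, &dev_conf) < 0 ||
+ *        rte_dma_vchan_setup(dev_id, 0, &qconf) < 0 ||
+ *        rte_dma_start(dev_id) < 0)
+ *        return -1;
+ * \endcode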
+ *
+ * The dataplane APIs include two parts:
+ * The first part is the submission of operation requests:
+ * - rte_dma_copy()
+ * - rte_dma_copy_sg()
+ * - rte_dma_fill()
+ * - rte_dma_submit()
+ *
+ * These APIs could work with different virtual DMA channels which have
+ * different contexts.
+ *
+ * The first three APIs are used to submit an operation request to a virtual
+ * DMA channel. If the submission is successful, a non-negative
+ * ring_idx <= UINT16_MAX is returned; otherwise a negative number is returned.
+ *
+ * The last API is used to issue a doorbell to the hardware. Alternatively, the
+ * flags parameter of the first three APIs (@see RTE_DMA_OP_FLAG_SUBMIT) can be
+ * used to do the same work.
+ * @note When enqueuing a set of jobs to the device, having a separate submit
+ * outside a loop makes for clearer code than having a check for the last
+ * iteration inside the loop to set a special submit flag. However, for cases
+ * where one item alone is to be submitted or there is a small set of jobs to
+ * be submitted sequentially, having a submit flag provides a lower-overhead
+ * way of doing the submission while still keeping the code clean.
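+ *
+ * A short sketch of both submission styles; the src/dst arrays, job count and
+ * length are illustrative assumptions. First a batch is enqueued and the
+ * doorbell is rung once via rte_dma_submit(), then a single job is enqueued
+ * using the RTE_DMA_OP_FLAG_SUBMIT flag instead:
+ *
+ * \code{.c}
+ *    uint16_t i;
+ *
+ *    for (i = 0; i < nb_jobs; i++)
+ *        rte_dma_copy(dev_id, vchan, src[i], dst[i], len, 0);
+ *    rte_dma_submit(dev_id, vchan);
+ *
+ *    rte_dma_copy(dev_id, vchan, src[0], dst[0], len, RTE_DMA_OP_FLAG_SUBMIT);
+ * \endcode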
+ *
+ * The second part is to obtain the result of requests:
+ * - rte_dma_completed()
+ * - return the number of operation requests completed successfully.
+ * - rte_dma_completed_status()
+ * - return the number of operation requests completed.
+ *
+ * @note If the dmadev works in silent mode (@see RTE_DMA_CAPA_SILENT), the
+ * application does not need to invoke the above two completed APIs.
+ *
+ * The ring_idx returned by the enqueue APIs (e.g. rte_dma_copy(),
+ * rte_dma_fill()) obeys the following rules:
+ * - The ring_idx values of each virtual DMA channel are independent.
+ * - For a virtual DMA channel, the ring_idx is monotonically incremented;
+ *   when it reaches UINT16_MAX, it wraps back to zero.
+ * - This ring_idx can be used by applications to track per-operation
+ *   metadata in an application-defined circular ring.
+ * - The initial ring_idx of a virtual DMA channel is zero; after the
+ *   device is stopped, the ring_idx is reset to zero.
+ *
+ * One example:
+ * - step-1: start one dmadev
+ * - step-2: enqueue a copy operation, the ring_idx returned is 0
+ * - step-3: enqueue a copy operation again, the ring_idx returned is 1
+ * - ...
+ * - step-101: stop the dmadev
+ * - step-102: start the dmadev
+ * - step-103: enqueue a copy operation, the ring_idx returned is 0
+ * - ...
+ * - step-x+0: enqueue a fill operation, the ring_idx returned is 65535
+ * - step-x+1: enqueue a copy operation, the ring_idx returned is 0
+ * - ...
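+ *
+ * A sketch of reaping completions and using the ring_idx rules above to
+ * release per-job context from an application-defined ring; ctx[], prev_idx,
+ * release_job_context() and handle_error() are hypothetical application
+ * helpers, not part of this API:
+ *
+ * \code{.c}
+ *    uint16_t last_idx, nb_done;
+ *    bool has_error;
+ *
+ *    nb_done = rte_dma_completed(dev_id, vchan, 32, &last_idx, &has_error);
+ *    while (nb_done-- > 0) {
+ *        release_job_context(ctx[prev_idx]);
+ *        prev_idx = (prev_idx + 1) & 0xFFFF;
+ *    }
+ *    if (has_error)
+ *        handle_error(dev_id, vchan);
+ * \endcode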
+ *
+ * The DMA operation address used in enqueue APIs (i.e. rte_dma_copy(),
+ * rte_dma_copy_sg(), rte_dma_fill()) is defined as rte_iova_t type.
+ *
+ * The dmadev supports two types of address: memory address and device address.
+ *
+ * - memory address: the source and destination address of the memory-to-memory
+ * transfer type, or the source address of the memory-to-device transfer type,
+ * or the destination address of the device-to-memory transfer type.
+ * @note If the device supports SVA (@see RTE_DMA_CAPA_SVA), the memory address
+ * can be any VA address; otherwise it must be an IOVA address.
+ *
+ * - device address: the source and destination address of the device-to-device
+ * transfer type, or the source address of the device-to-memory transfer type,
+ * or the destination address of the memory-to-device transfer type.
+ *
+ * Regarding MT-safety: all the functions of the dmadev API implemented by a
+ * PMD are lock-free functions which are assumed not to be invoked in parallel
+ * on different logical cores on the same target dmadev object.
+ * @note Different virtual DMA channels on the same dmadev *DO NOT* support
+ * parallel invocation because these virtual DMA channels share the same
+ * HW-DMA-channel.
+ */
+
+#include <stdint.h>
+
+#include <rte_bitops.h>
+#include <rte_common.h>
+#include <rte_compat.h>
+#include <rte_dev.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Maximum number of devices if rte_dma_dev_max() is not called. */
+#define RTE_DMADEV_DEFAULT_MAX 64
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Configure the maximum number of dmadevs.
+ * @note This function can be invoked before the primary process rte_eal_init()
+ * to change the maximum number of dmadevs. If not invoked, the maximum number
+ * of dmadevs defaults to RTE_DMADEV_DEFAULT_MAX.
+ *
+ * @param dev_max
+ * maximum number of dmadevs.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+int rte_dma_dev_max(size_t dev_max);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get the device identifier for the named DMA device.
+ *
+ * @param name
+ * DMA device name.
+ *
+ * @return
+ * Returns DMA device identifier on success.
+ * - <0: Failure to find named DMA device.
+ */
+__rte_experimental
+int rte_dma_get_dev_id_by_name(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Check whether the dev_id is valid.
+ *
+ * @param dev_id
+ * DMA device index.
+ *
+ * @return
+ *   - true if the device index is valid, false otherwise.
+ */
+__rte_experimental
+bool rte_dma_is_valid(int16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get the total number of DMA devices that have been successfully
+ * initialised.
+ *
+ * @return
+ * The total number of usable DMA devices.
+ */
+__rte_experimental
+uint16_t rte_dma_count_avail(void);
+
+/**
+ * Iterates over valid dmadev instances.
+ *
+ * @param start_dev_id
+ * The id of the next possible dmadev.
+ * @return
+ *   The id of the next valid dmadev, -1 if there is none.
+ */
+__rte_experimental
+int16_t rte_dma_next_dev(int16_t start_dev_id);
+
+/** Utility macro to iterate over all available dmadevs */
+#define RTE_DMA_FOREACH_DEV(p) \
+ for (p = rte_dma_next_dev(0); \
+ p != -1; \
+ p = rte_dma_next_dev(p + 1))
+
+
+/**@{@name DMA capability
+ * @see struct rte_dma_info::dev_capa
+ */
+/** Support memory-to-memory transfer */
+#define RTE_DMA_CAPA_MEM_TO_MEM RTE_BIT64(0)
+/** Support memory-to-device transfer. */
+#define RTE_DMA_CAPA_MEM_TO_DEV RTE_BIT64(1)
+/** Support device-to-memory transfer. */
+#define RTE_DMA_CAPA_DEV_TO_MEM RTE_BIT64(2)
+/** Support device-to-device transfer. */
+#define RTE_DMA_CAPA_DEV_TO_DEV RTE_BIT64(3)
+/** Support SVA, which allows a VA to be used as the DMA address.
+ * If the device supports SVA, the application can pass any VA address, such as
+ * memory from rte_malloc(), rte_memzone(), malloc() or the stack.
+ * If the device does not support SVA, the application must pass an IOVA
+ * address obtained from rte_malloc() or rte_memzone().
+ */
+#define RTE_DMA_CAPA_SVA RTE_BIT64(4)
+/** Support working in silent mode.
+ * In this mode, the application is not required to invoke the
+ * rte_dma_completed*() APIs.
+ * @see struct rte_dma_conf::enable_silent
+ */
+#define RTE_DMA_CAPA_SILENT RTE_BIT64(5)
+/** Supports error handling
+ *
+ * With this bit set, invalid input addresses will be reported as operation failures
+ * to the user but other operations can continue.
+ * Without this bit set, invalid data is not handled by either HW or driver, so the
+ * user must ensure that all memory addresses are valid and accessible by HW.
+ */
+#define RTE_DMA_CAPA_HANDLES_ERRORS RTE_BIT64(6)
+/** Support copy operation.
+ * This capability starts at bit index 32, leaving a gap between the normal
+ * capability bits and the operation capability bits.
+ */
+#define RTE_DMA_CAPA_OPS_COPY RTE_BIT64(32)
+/** Support scatter-gather list copy operation. */
+#define RTE_DMA_CAPA_OPS_COPY_SG RTE_BIT64(33)
+/** Support fill operation. */
+#define RTE_DMA_CAPA_OPS_FILL RTE_BIT64(34)
+/**@}*/
+
+/**
+ * A structure used to retrieve the information of a DMA device.
+ *
+ * @see rte_dma_info_get
+ */
+struct rte_dma_info {
+ const char *dev_name; /**< Unique device name. */
+ /** Device capabilities (RTE_DMA_CAPA_*). */
+ uint64_t dev_capa;
+ /** Maximum number of virtual DMA channels supported. */
+ uint16_t max_vchans;
+ /** Maximum allowed number of virtual DMA channel descriptors. */
+ uint16_t max_desc;
+ /** Minimum allowed number of virtual DMA channel descriptors. */
+ uint16_t min_desc;
+ /** Maximum number of source or destination scatter-gather entries
+ * supported.
+ * If the device does not support COPY_SG capability, this value can be
+ * zero.
+ * If the device supports COPY_SG capability, then rte_dma_copy_sg()
+ * parameter nb_src/nb_dst should not exceed this value.
+ */
+ uint16_t max_sges;
+ /** NUMA node connection, -1 if unknown. */
+ int16_t numa_node;
+ /** Number of virtual DMA channels configured. */
+ uint16_t nb_vchans;
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve information of a DMA device.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param[out] dev_info
+ * A pointer to a structure of type *rte_dma_info* to be filled with the
+ * information of the device.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
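+ *
+ * A sketch of checking device capabilities before configuring it; the
+ * required capability set below is an illustrative assumption:
+ *
+ * \code{.c}
+ *    struct rte_dma_info info;
+ *    uint64_t need = RTE_DMA_CAPA_MEM_TO_MEM | RTE_DMA_CAPA_OPS_COPY;
+ *
+ *    if (rte_dma_info_get(dev_id, &info) < 0 ||
+ *        (info.dev_capa & need) != need)
+ *        return -1;
+ * \endcode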
+ */
+__rte_experimental
+int rte_dma_info_get(int16_t dev_id, struct rte_dma_info *dev_info);
+
+/**
+ * A structure used to configure a DMA device.
+ *
+ * @see rte_dma_configure
+ */
+struct rte_dma_conf {
+ /** The number of virtual DMA channels to set up for the DMA device.
+ * This value cannot be greater than the field 'max_vchans' of struct
+ * rte_dma_info obtained from rte_dma_info_get().
+ */
+ uint16_t nb_vchans;
+ /** Indicates whether to enable silent mode.
+ * false-default mode, true-silent mode.
+ * This value can be set to true only when the SILENT capability is
+ * supported.
+ *
+ * @see RTE_DMA_CAPA_SILENT
+ */
+ bool enable_silent;
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Configure a DMA device.
+ *
+ * This function must be invoked first, before any other function in the
+ * API. This function can also be re-invoked when a device is in the
+ * stopped state.
+ *
+ * @param dev_id
+ * The identifier of the device to configure.
+ * @param dev_conf
+ * The DMA device configuration structure encapsulated into rte_dma_conf
+ * object.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+int rte_dma_configure(int16_t dev_id, const struct rte_dma_conf *dev_conf);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Start a DMA device.
+ *
+ * The device start step is the last one and consists of setting the DMA
+ * to start accepting jobs.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+int rte_dma_start(int16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Stop a DMA device.
+ *
+ * The device can be restarted with a call to rte_dma_start().
+ *
+ * @param dev_id
+ * The identifier of the device.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+int rte_dma_stop(int16_t dev_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Close a DMA device.
+ *
+ * The device cannot be restarted after this call.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+int rte_dma_close(int16_t dev_id);
+
+/**
+ * DMA transfer direction defines.
+ *
+ * @see struct rte_dma_vchan_conf::direction
+ */
+enum rte_dma_direction {
+ /** DMA transfer direction - from memory to memory.
+ *
+ * @see struct rte_dma_vchan_conf::direction
+ */
+ RTE_DMA_DIR_MEM_TO_MEM,
+ /** DMA transfer direction - from memory to device.
+ * In a typical scenario, the SoCs are installed on host servers as
+ * iNICs through the PCIe interface. In this case, the SoC works in
+ * EP (endpoint) mode and can initiate a DMA move request from memory
+ * (which is SoC memory) to the device (which is host memory).
+ *
+ * @see struct rte_dma_vchan_conf::direction
+ */
+ RTE_DMA_DIR_MEM_TO_DEV,
+ /** DMA transfer direction - from device to memory.
+ * In a typical scenario, the SoCs are installed on host servers as
+ * iNICs through the PCIe interface. In this case, the SoC works in
+ * EP (endpoint) mode and can initiate a DMA move request from the device
+ * (which is host memory) to memory (which is SoC memory).
+ *
+ * @see struct rte_dma_vchan_conf::direction
+ */
+ RTE_DMA_DIR_DEV_TO_MEM,
+ /** DMA transfer direction - from device to device.
+ * In a typical scenario, the SoCs are installed on host servers as
+ * iNICs through the PCIe interface. In this case, the SoC works in
+ * EP (endpoint) mode and can initiate a DMA move request from one device
+ * (which is host memory) to another device (which is another host's memory).
+ *
+ * @see struct rte_dma_vchan_conf::direction
+ */
+ RTE_DMA_DIR_DEV_TO_DEV,
+};
+
+/**
+ * DMA access port type defines.
+ *
+ * @see struct rte_dma_port_param::port_type
+ */
+enum rte_dma_port_type {
+ RTE_DMA_PORT_NONE,
+ RTE_DMA_PORT_PCIE, /**< The DMA access port is PCIe. */
+};
+
+/**
+ * A structure used to describe DMA access port parameters.
+ *
+ * @see struct rte_dma_vchan_conf::src_port
+ * @see struct rte_dma_vchan_conf::dst_port
+ */
+struct rte_dma_port_param {
+ /** The device access port type.
+ *
+ * @see enum rte_dma_port_type
+ */
+ enum rte_dma_port_type port_type;
+ RTE_STD_C11
+ union {
+ /** PCIe access port parameters.
+ *
+ * The following model shows SoC's PCIe module connects to
+ * multiple PCIe hosts and multiple endpoints. The PCIe module
+ * has an integrated DMA controller.
+ *
+ * If the DMA wants to access the memory of host A, it can be
+ * initiated by PF1 in core0, or by VF0 of PF0 in core0.
+ *
+ * \code{.unparsed}
+ * System Bus
+ * | ----------PCIe module----------
+ * | Bus
+ * | Interface
+ * | ----- ------------------
+ * | | | | PCIe Core0 |
+ * | | | | | -----------
+ * | | | | PF-0 -- VF-0 | | Host A |
+ * | | |--------| |- VF-1 |--------| Root |
+ * | | | | PF-1 | | Complex |
+ * | | | | PF-2 | -----------
+ * | | | ------------------
+ * | | |
+ * | | | ------------------
+ * | | | | PCIe Core1 |
+ * | | | | | -----------
+ * | | | | PF-0 -- VF-0 | | Host B |
+ * |-----| |--------| PF-1 -- VF-0 |--------| Root |
+ * | | | | |- VF-1 | | Complex |
+ * | | | | PF-2 | -----------
+ * | | | ------------------
+ * | | |
+ * | | | ------------------
+ * | |DMA| | | ------
+ * | | | | |--------| EP |
+ * | | |--------| PCIe Core2 | ------
+ * | | | | | ------
+ * | | | | |--------| EP |
+ * | | | | | ------
+ * | ----- ------------------
+ *
+ * \endcode
+ *
+ * @note If some fields cannot be supported by the
+ * hardware/driver, then the driver ignores those fields.
+ * Please check the driver-specific documentation for limitations
+ * and capabilities.
+ */
+ __extension__
+ struct {
+ uint64_t coreid : 4; /**< PCIe core id used. */
+ uint64_t pfid : 8; /**< PF id used. */
+ uint64_t vfen : 1; /**< VF enable bit. */
+ uint64_t vfid : 16; /**< VF id used. */
+ /** The pasid field in the TLP packet. */
+ uint64_t pasid : 20;
+ /** The attributes field in the TLP packet. */
+ uint64_t attr : 3;
+ /** The processing hint field in the TLP packet. */
+ uint64_t ph : 2;
+ /** The steering tag field in the TLP packet. */
+ uint64_t st : 16;
+ } pcie;
+ };
+ uint64_t reserved[2]; /**< Reserved for future fields. */
+};
+
+/**
+ * A structure used to configure a virtual DMA channel.
+ *
+ * @see rte_dma_vchan_setup
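+ *
+ * A sketch of a memory-to-device channel using a PCIe access port; the PF id
+ * and descriptor count are illustrative assumptions, and port fields not
+ * supported by a driver may be ignored:
+ *
+ * \code{.c}
+ *    struct rte_dma_vchan_conf conf = {
+ *            .direction = RTE_DMA_DIR_MEM_TO_DEV,
+ *            .nb_desc = 512,
+ *            .dst_port = {
+ *                    .port_type = RTE_DMA_PORT_PCIE,
+ *                    .pcie = { .coreid = 0, .pfid = 1, .vfen = 0 },
+ *            },
+ *    };
+ *
+ *    int ret = rte_dma_vchan_setup(dev_id, 0, &conf);
+ * \endcode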
+ */
+struct rte_dma_vchan_conf {
+ /** Transfer direction
+ *
+ * @see enum rte_dma_direction
+ */
+ enum rte_dma_direction direction;
+ /** Number of descriptors for the virtual DMA channel. */
+ uint16_t nb_desc;
+ /** 1) Used to describe the device access port parameter in the
+ * device-to-memory transfer scenario.
+ * 2) Used to describe the source device access port parameter in the
+ * device-to-device transfer scenario.
+ *
+ * @see struct rte_dma_port_param
+ */
+ struct rte_dma_port_param src_port;
+ /** 1) Used to describe the device access port parameter in the
+ * memory-to-device transfer scenario.
+ * 2) Used to describe the destination device access port parameter in
+ * the device-to-device transfer scenario.
+ *
+ * @see struct rte_dma_port_param
+ */
+ struct rte_dma_port_param dst_port;
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Allocate and set up a virtual DMA channel.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel. The value must be in the range
+ * [0, nb_vchans - 1] previously supplied to rte_dma_configure().
+ * @param conf
+ * The virtual DMA channel configuration structure encapsulated into
+ * rte_dma_vchan_conf object.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+int rte_dma_vchan_setup(int16_t dev_id, uint16_t vchan,
+ const struct rte_dma_vchan_conf *conf);
+
+/**
+ * A structure used to retrieve statistics.
+ *
+ * @see rte_dma_stats_get
+ */
+struct rte_dma_stats {
+ /** Count of operations which were submitted to hardware. */
+ uint64_t submitted;
+ /** Count of operations which were completed, including successful and
+ * failed completions.
+ */
+ uint64_t completed;
+ /** Count of operations which failed to complete. */
+ uint64_t errors;
+};
+
+/**
+ * Special ID, which is used to represent all virtual DMA channels.
+ *
+ * @see rte_dma_stats_get
+ * @see rte_dma_stats_reset
+ */
+#define RTE_DMA_ALL_VCHAN 0xFFFFu
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Retrieve basic statistics of one or all virtual DMA channel(s).
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel.
+ *   If equal to RTE_DMA_ALL_VCHAN, it means all channels.
+ * @param[out] stats
+ * The basic statistics structure encapsulated into rte_dma_stats
+ * object.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+int rte_dma_stats_get(int16_t dev_id, uint16_t vchan,
+ struct rte_dma_stats *stats);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Reset basic statistics of one or all virtual DMA channel(s).
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel.
+ *   If equal to RTE_DMA_ALL_VCHAN, it means all channels.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+int rte_dma_stats_reset(int16_t dev_id, uint16_t vchan);
+
+/**
+ * DMA virtual channel status.
+ *
+ * Enum with the options for the channel status: idle, active, or halted due to an error.
+ * @see rte_dma_vchan_status
+ */
+enum rte_dma_vchan_status {
+ RTE_DMA_VCHAN_IDLE, /**< not processing, awaiting ops */
+ RTE_DMA_VCHAN_ACTIVE, /**< currently processing jobs */
+ RTE_DMA_VCHAN_HALTED_ERROR, /**< not processing due to error, cannot accept new ops */
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Determine if all jobs have completed on a device channel.
+ * This function is primarily designed for testing use, as it allows a process to check if
+ * all jobs are completed, without actually gathering completions from those jobs.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel.
+ * @param[out] status
+ * The vchan status
+ * @return
+ * 0 - call completed successfully
+ * < 0 - error code indicating there was a problem calling the API
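+ *
+ * A sketch of quiescing a channel before stopping the device; the polling
+ * loop below is an illustrative assumption and omits any timeout handling:
+ *
+ * \code{.c}
+ *    enum rte_dma_vchan_status st;
+ *
+ *    do {
+ *        if (rte_dma_vchan_status(dev_id, vchan, &st) < 0)
+ *            break;
+ *    } while (st == RTE_DMA_VCHAN_ACTIVE);
+ * \endcode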
+ */
+__rte_experimental
+int
+rte_dma_vchan_status(int16_t dev_id, uint16_t vchan, enum rte_dma_vchan_status *status);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Dump DMA device info.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param f
+ * The file to write the output to.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+int rte_dma_dump(int16_t dev_id, FILE *f);
+
+/**
+ * DMA transfer result status code defines.
+ *
+ * @see rte_dma_completed_status
+ */
+enum rte_dma_status_code {
+ /** The operation completed successfully. */
+ RTE_DMA_STATUS_SUCCESSFUL,
+ /** The operation failed to complete because it was aborted by the user.
+ * This is mainly used when processing dev_stop: the user may modify the
+ * descriptors (e.g. change one bit to tell the hardware to abort this job),
+ * which allows outstanding requests to be completed as far as possible and
+ * so reduces the time needed to stop the device.
+ */
+ RTE_DMA_STATUS_USER_ABORT,
+ /** The operation failed to complete due to the following scenario:
+ * the jobs in a particular batch are not attempted because they
+ * appeared after a fence where a previous job failed. In some HW
+ * implementations it is possible for jobs from later batches to be
+ * completed, though, so report the status of the not-attempted jobs
+ * before reporting those newer completed jobs.
+ */
+ RTE_DMA_STATUS_NOT_ATTEMPTED,
+ /** The operation failed to complete due to an invalid source address. */
+ RTE_DMA_STATUS_INVALID_SRC_ADDR,
+ /** The operation failed to complete due to an invalid destination address. */
+ RTE_DMA_STATUS_INVALID_DST_ADDR,
+ /** The operation failed to complete due to an invalid source or destination
+ * address; this covers the case where an address error is known, but it is
+ * not known which address was in error.
+ */
+ RTE_DMA_STATUS_INVALID_ADDR,
+ /** The operation failed to complete due to an invalid length. */
+ RTE_DMA_STATUS_INVALID_LENGTH,
+ /** The operation failed to complete due to an invalid opcode.
+ * The DMA descriptor could have multiple formats, which are
+ * distinguished by the opcode field.
+ */
+ RTE_DMA_STATUS_INVALID_OPCODE,
+ /** The operation failed to complete due to a bus read error. */
+ RTE_DMA_STATUS_BUS_READ_ERROR,
+ /** The operation failed to complete due to a bus write error. */
+ RTE_DMA_STATUS_BUS_WRITE_ERROR,
+ /** The operation failed to complete due to a bus error; this covers the
+ * case where a bus error is known, but not in which direction it occurred.
+ */
+ RTE_DMA_STATUS_BUS_ERROR,
+ /** The operation failed to complete due to data poisoning. */
+ RTE_DMA_STATUS_DATA_POISION,
+ /** The operation failed to complete due to a descriptor read error. */
+ RTE_DMA_STATUS_DESCRIPTOR_READ_ERROR,
+ /** The operation failed to complete due to a device link error.
+ * Used to indicate a link error in the memory-to-device/
+ * device-to-memory/device-to-device transfer scenario.
+ */
+ RTE_DMA_STATUS_DEV_LINK_ERROR,
+ /** The operation failed to complete due to a page fault during lookup. */
+ RTE_DMA_STATUS_PAGE_FAULT,
+ /** The operation failed to complete due to an unknown reason.
+ * The initial value is 256, which reserves space for future errors.
+ */
+ RTE_DMA_STATUS_ERROR_UNKNOWN = 0x100,
+};
+
+/**
+ * A structure used to hold scatter-gather DMA operation request entry.
+ *
+ * @see rte_dma_copy_sg
+ */
+struct rte_dma_sge {
+ rte_iova_t addr; /**< The DMA operation address. */
+ uint32_t length; /**< The DMA operation length. */
+};
+
+#include "rte_dmadev_core.h"
+
+/**@{@name DMA operation flag
+ * @see rte_dma_copy()
+ * @see rte_dma_copy_sg()
+ * @see rte_dma_fill()
+ */
+/** Fence flag.
+ * It means the operation with this flag must be processed only after all
+ * previous operations are completed.
+ * If the specific DMA HW works in-order (i.e. it has an implicit fence between
+ * operations), this flag could be a no-op.
+ */
+#define RTE_DMA_OP_FLAG_FENCE RTE_BIT64(0)
+/** Submit flag.
+ * It means the operation with this flag must issue a doorbell to the hardware
+ * after the jobs are enqueued.
+ */
+#define RTE_DMA_OP_FLAG_SUBMIT RTE_BIT64(1)
+/** Hint to write data to the low-level cache.
+ * Used for performance optimization; this is just a hint, there is no
+ * capability bit for it, and the driver should not return an error if this
+ * flag is set.
+ */
+#define RTE_DMA_OP_FLAG_LLC RTE_BIT64(2)
+/**@}*/
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a copy operation onto the virtual DMA channel.
+ *
+ * This queues up a copy operation to be performed by hardware. If the 'flags'
+ * parameter contains RTE_DMA_OP_FLAG_SUBMIT, the doorbell is triggered to
+ * begin this operation; otherwise the doorbell is not triggered.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel.
+ * @param src
+ * The address of the source buffer.
+ * @param dst
+ * The address of the destination buffer.
+ * @param length
+ * The length of the data to be copied.
+ * @param flags
+ *   Flags for this operation.
+ * @see RTE_DMA_OP_FLAG_*
+ *
+ * @return
+ * - 0..UINT16_MAX: index of enqueued job.
+ * - -ENOSPC: if no space left to enqueue.
+ * - other values < 0 on failure.
+ */
+__rte_experimental
+static inline int
+rte_dma_copy(int16_t dev_id, uint16_t vchan, rte_iova_t src, rte_iova_t dst,
+ uint32_t length, uint64_t flags)
+{
+ struct rte_dma_fp_object *obj = &rte_dma_fp_objs[dev_id];
+
+#ifdef RTE_DMADEV_DEBUG
+ if (!rte_dma_is_valid(dev_id) || length == 0)
+ return -EINVAL;
+ RTE_FUNC_PTR_OR_ERR_RET(*obj->copy, -ENOTSUP);
+#endif
+
+ return (*obj->copy)(obj->dev_private, vchan, src, dst, length, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a scatter-gather list copy operation onto the virtual DMA channel.
+ *
+ * This queues up a scatter-gather list copy operation to be performed by
+ * hardware. If the 'flags' parameter contains RTE_DMA_OP_FLAG_SUBMIT, the
+ * doorbell is triggered to begin this operation; otherwise the doorbell is
+ * not triggered.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel.
+ * @param src
+ *   The pointer to the array of source scatter-gather entries.
+ * @param dst
+ *   The pointer to the array of destination scatter-gather entries.
+ * @param nb_src
+ *   The number of source scatter-gather entries.
+ * @see struct rte_dma_info::max_sges
+ * @param nb_dst
+ *   The number of destination scatter-gather entries.
+ * @see struct rte_dma_info::max_sges
+ * @param flags
+ *   Flags for this operation.
+ * @see RTE_DMA_OP_FLAG_*
+ *
+ * @return
+ * - 0..UINT16_MAX: index of enqueued job.
+ * - -ENOSPC: if no space left to enqueue.
+ * - other values < 0 on failure.
+ */
+__rte_experimental
+static inline int
+rte_dma_copy_sg(int16_t dev_id, uint16_t vchan, struct rte_dma_sge *src,
+ struct rte_dma_sge *dst, uint16_t nb_src, uint16_t nb_dst,
+ uint64_t flags)
+{
+ struct rte_dma_fp_object *obj = &rte_dma_fp_objs[dev_id];
+
+#ifdef RTE_DMADEV_DEBUG
+ if (!rte_dma_is_valid(dev_id) || src == NULL || dst == NULL ||
+ nb_src == 0 || nb_dst == 0)
+ return -EINVAL;
+ RTE_FUNC_PTR_OR_ERR_RET(*obj->copy_sg, -ENOTSUP);
+#endif
+
+ return (*obj->copy_sg)(obj->dev_private, vchan, src, dst, nb_src,
+ nb_dst, flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Enqueue a fill operation onto the virtual DMA channel.
+ *
+ * This queues up a fill operation to be performed by hardware. If the 'flags'
+ * parameter contains RTE_DMA_OP_FLAG_SUBMIT, the doorbell is triggered to
+ * begin this operation; otherwise the doorbell is not triggered.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel.
+ * @param pattern
+ * The pattern to populate the destination buffer with.
+ * @param dst
+ * The address of the destination buffer.
+ * @param length
+ * The length of the destination buffer.
+ * @param flags
+ *   Flags for this operation.
+ * @see RTE_DMA_OP_FLAG_*
+ *
+ * @return
+ * - 0..UINT16_MAX: index of enqueued job.
+ * - -ENOSPC: if no space left to enqueue.
+ * - other values < 0 on failure.
+ */
+__rte_experimental
+static inline int
+rte_dma_fill(int16_t dev_id, uint16_t vchan, uint64_t pattern,
+ rte_iova_t dst, uint32_t length, uint64_t flags)
+{
+ struct rte_dma_fp_object *obj = &rte_dma_fp_objs[dev_id];
+
+#ifdef RTE_DMADEV_DEBUG
+ if (!rte_dma_is_valid(dev_id) || length == 0)
+ return -EINVAL;
+ RTE_FUNC_PTR_OR_ERR_RET(*obj->fill, -ENOTSUP);
+#endif
+
+ return (*obj->fill)(obj->dev_private, vchan, pattern, dst, length,
+ flags);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Trigger hardware to begin performing enqueued operations.
+ *
+ * This API is used to write the "doorbell" to the hardware to trigger it
+ * to begin the operations previously enqueued by rte_dma_copy/fill().
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel.
+ *
+ * @return
+ * 0 on success. Otherwise negative value is returned.
+ */
+__rte_experimental
+static inline int
+rte_dma_submit(int16_t dev_id, uint16_t vchan)
+{
+ struct rte_dma_fp_object *obj = &rte_dma_fp_objs[dev_id];
+
+#ifdef RTE_DMADEV_DEBUG
+ if (!rte_dma_is_valid(dev_id))
+ return -EINVAL;
+ RTE_FUNC_PTR_OR_ERR_RET(*obj->submit, -ENOTSUP);
+#endif
+
+ return (*obj->submit)(obj->dev_private, vchan);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Return the number of operations that have been successfully completed.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel.
+ * @param nb_cpls
+ * The maximum number of completed operations that can be processed.
+ * @param[out] last_idx
+ * The last completed operation's ring_idx.
+ * If not required, NULL can be passed in.
+ * @param[out] has_error
+ *   Indicates if there was a transfer error.
+ * If not required, NULL can be passed in.
+ *
+ * @return
+ * The number of operations that successfully completed. This return value
+ * must be less than or equal to the value of nb_cpls.
+ */
+__rte_experimental
+static inline uint16_t
+rte_dma_completed(int16_t dev_id, uint16_t vchan, const uint16_t nb_cpls,
+ uint16_t *last_idx, bool *has_error)
+{
+ struct rte_dma_fp_object *obj = &rte_dma_fp_objs[dev_id];
+ uint16_t idx;
+ bool err;
+
+#ifdef RTE_DMADEV_DEBUG
+ if (!rte_dma_is_valid(dev_id) || nb_cpls == 0)
+ return 0;
+ RTE_FUNC_PTR_OR_ERR_RET(*obj->completed, 0);
+#endif
+
+ /* Ensure the pointer values are non-null to simplify drivers.
+ * In most cases these should be compile time evaluated, since this is
+ * an inline function.
+ * - If NULL is explicitly passed as parameter, then compiler knows the
+ * value is NULL
+ * - If address of local variable is passed as parameter, then compiler
+ * can know it's non-NULL.
+ */
+ if (last_idx == NULL)
+ last_idx = &idx;
+ if (has_error == NULL)
+ has_error = &err;
+
+ *has_error = false;
+ return (*obj->completed)(obj->dev_private, vchan, nb_cpls, last_idx,
+ has_error);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Return the number of operations that have been completed; the result of each
+ * operation may be either success or failure.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel.
+ * @param nb_cpls
+ * Indicates the size of status array.
+ * @param[out] last_idx
+ * The last completed operation's ring_idx.
+ * If not required, NULL can be passed in.
+ * @param[out] status
+ * This is a pointer to an array of length 'nb_cpls' that holds the completion
+ * status code of each operation.
+ * @see enum rte_dma_status_code
+ *
+ * @return
+ * The number of operations that completed. This return value must be less
+ * than or equal to the value of nb_cpls.
+ *   If this number is greater than zero (say n), then the first n values in
+ *   the status array are also set.
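+ *
+ * A sketch of classifying per-job results; the batch size of 32 and the
+ * nb_errors counter are illustrative assumptions:
+ *
+ * \code{.c}
+ *    enum rte_dma_status_code status[32];
+ *    uint16_t last_idx, n, i;
+ *
+ *    n = rte_dma_completed_status(dev_id, vchan, 32, &last_idx, status);
+ *    for (i = 0; i < n; i++)
+ *        if (status[i] != RTE_DMA_STATUS_SUCCESSFUL)
+ *            nb_errors++;
+ * \endcode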
+ */
+__rte_experimental
+static inline uint16_t
+rte_dma_completed_status(int16_t dev_id, uint16_t vchan,
+ const uint16_t nb_cpls, uint16_t *last_idx,
+ enum rte_dma_status_code *status)
+{
+ struct rte_dma_fp_object *obj = &rte_dma_fp_objs[dev_id];
+ uint16_t idx;
+
+#ifdef RTE_DMADEV_DEBUG
+ if (!rte_dma_is_valid(dev_id) || nb_cpls == 0 || status == NULL)
+ return 0;
+ RTE_FUNC_PTR_OR_ERR_RET(*obj->completed_status, 0);
+#endif
+
+ if (last_idx == NULL)
+ last_idx = &idx;
+
+ return (*obj->completed_status)(obj->dev_private, vchan, nb_cpls,
+ last_idx, status);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Check remaining capacity in descriptor ring for the current burst.
+ *
+ * @param dev_id
+ * The identifier of the device.
+ * @param vchan
+ * The identifier of virtual DMA channel.
+ *
+ * @return
+ * - Remaining space in the descriptor ring for the current burst.
+ * - 0 on error
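+ *
+ * A sketch of gating an enqueue burst on the available capacity; nb_jobs,
+ * src, dst and len are illustrative assumptions of the application:
+ *
+ * \code{.c}
+ *    uint16_t i, room = rte_dma_burst_capacity(dev_id, vchan);
+ *
+ *    for (i = 0; i < RTE_MIN(room, nb_jobs); i++)
+ *        rte_dma_copy(dev_id, vchan, src[i], dst[i], len, 0);
+ *    if (i > 0)
+ *        rte_dma_submit(dev_id, vchan);
+ * \endcode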
+ */
+__rte_experimental
+static inline uint16_t
+rte_dma_burst_capacity(int16_t dev_id, uint16_t vchan)
+{
+ struct rte_dma_fp_object *obj = &rte_dma_fp_objs[dev_id];
+
+#ifdef RTE_DMADEV_DEBUG
+ if (!rte_dma_is_valid(dev_id))
+ return 0;
+ RTE_FUNC_PTR_OR_ERR_RET(*obj->burst_capacity, 0);
+#endif
+ return (*obj->burst_capacity)(obj->dev_private, vchan);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_DMADEV_H */
diff --git a/lib/librte_dmadev/rte_dmadev_core.h b/lib/librte_dmadev/rte_dmadev_core.h
new file mode 100644
index 000000000..e42d8739a
--- /dev/null
+++ b/lib/librte_dmadev/rte_dmadev_core.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited
+ * Copyright(c) 2021 Intel Corporation
+ */
+
+#ifndef RTE_DMADEV_CORE_H
+#define RTE_DMADEV_CORE_H
+
+/**
+ * @file
+ *
+ * DMA Device internal header.
+ *
+ * This header contains internal data types which are used by the dataplane
+ * inline functions.
+ *
+ * Applications should not use these functions directly.
+ */
+
+/** @internal Used to enqueue a copy operation. */
+typedef int (*rte_dma_copy_t)(void *dev_private, uint16_t vchan,
+ rte_iova_t src, rte_iova_t dst,
+ uint32_t length, uint64_t flags);
+
+/** @internal Used to enqueue a scatter-gather list copy operation. */
+typedef int (*rte_dma_copy_sg_t)(void *dev_private, uint16_t vchan,
+ const struct rte_dma_sge *src,
+ const struct rte_dma_sge *dst,
+ uint16_t nb_src, uint16_t nb_dst,
+ uint64_t flags);
+
+/** @internal Used to enqueue a fill operation. */
+typedef int (*rte_dma_fill_t)(void *dev_private, uint16_t vchan,
+ uint64_t pattern, rte_iova_t dst,
+ uint32_t length, uint64_t flags);
+
+/** @internal Used to trigger hardware to begin working. */
+typedef int (*rte_dma_submit_t)(void *dev_private, uint16_t vchan);
+
+/** @internal Used to return the number of successfully completed operations. */
+typedef uint16_t (*rte_dma_completed_t)(void *dev_private,
+ uint16_t vchan, const uint16_t nb_cpls,
+ uint16_t *last_idx, bool *has_error);
+
+/** @internal Used to return number of completed operations. */
+typedef uint16_t (*rte_dma_completed_status_t)(void *dev_private,
+ uint16_t vchan, const uint16_t nb_cpls,
+ uint16_t *last_idx, enum rte_dma_status_code *status);
+
+/** @internal Used to check the remaining space in descriptor ring. */
+typedef uint16_t (*rte_dma_burst_capacity_t)(const void *dev_private, uint16_t vchan);
+
+/**
+ * @internal
+ * Fast-path dmadev functions and related data are held in a flat array.
+ * One entry per dmadev.
+ *
+ * This structure occupies exactly 128B, which reserves space for future IO
+ * functions.
+ *
+ * The 'dev_private' field is placed in the first cache line to optimize
+ * performance because the PMD driver mainly depends on this field.
+ */
+struct rte_dma_fp_object {
+ /** PMD-specific private data. The driver should copy
+ * rte_dma_dev.data->dev_private to this field during initialization.
+ */
+ void *dev_private;
+ rte_dma_copy_t copy;
+ rte_dma_copy_sg_t copy_sg;
+ rte_dma_fill_t fill;
+ rte_dma_submit_t submit;
+ rte_dma_completed_t completed;
+ rte_dma_completed_status_t completed_status;
+ rte_dma_burst_capacity_t burst_capacity;
+} __rte_aligned(128);
+
+extern struct rte_dma_fp_object *rte_dma_fp_objs;
+
+#endif /* RTE_DMADEV_CORE_H */
diff --git a/lib/librte_dmadev/rte_dmadev_pmd.h b/lib/librte_dmadev/rte_dmadev_pmd.h
new file mode 100644
index 000000000..5316ad5b5
--- /dev/null
+++ b/lib/librte_dmadev/rte_dmadev_pmd.h
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2021 HiSilicon Limited
+ */
+
+#ifndef RTE_DMADEV_PMD_H
+#define RTE_DMADEV_PMD_H
+
+/**
+ * @file
+ *
+ * DMA Device PMD interface
+ *
+ * Driver-facing interface for a DMA device. These are not to be called directly
+ * by any application.
+ */
+
+#include "rte_dmadev.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct rte_dma_dev;
+
+/** @internal Used to get device information of a device. */
+typedef int (*rte_dma_info_get_t)(const struct rte_dma_dev *dev,
+ struct rte_dma_info *dev_info,
+ uint32_t info_sz);
+
+/** @internal Used to configure a device. */
+typedef int (*rte_dma_configure_t)(struct rte_dma_dev *dev,
+ const struct rte_dma_conf *dev_conf,
+ uint32_t conf_sz);
+
+/** @internal Used to start a configured device. */
+typedef int (*rte_dma_start_t)(struct rte_dma_dev *dev);
+
+/** @internal Used to stop a configured device. */
+typedef int (*rte_dma_stop_t)(struct rte_dma_dev *dev);
+
+/** @internal Used to close a configured device. */
+typedef int (*rte_dma_close_t)(struct rte_dma_dev *dev);
+
+/** @internal Used to allocate and set up a virtual DMA channel. */
+typedef int (*rte_dma_vchan_setup_t)(struct rte_dma_dev *dev, uint16_t vchan,
+ const struct rte_dma_vchan_conf *conf,
+ uint32_t conf_sz);
+
+/** @internal Used to retrieve basic statistics. */
+typedef int (*rte_dma_stats_get_t)(const struct rte_dma_dev *dev,
+ uint16_t vchan, struct rte_dma_stats *stats,
+ uint32_t stats_sz);
+
+/** @internal Used to reset basic statistics. */
+typedef int (*rte_dma_stats_reset_t)(struct rte_dma_dev *dev, uint16_t vchan);
+
+/** @internal Used to check if a virtual channel has finished all jobs. */
+typedef int (*rte_dma_vchan_status_t)(const struct rte_dma_dev *dev, uint16_t vchan,
+ enum rte_dma_vchan_status *status);
+
+/** @internal Used to dump internal information. */
+typedef int (*rte_dma_dump_t)(const struct rte_dma_dev *dev, FILE *f);
+
+/**
+ * DMA device operations function pointer table.
+ *
+ * @see struct rte_dma_dev:dev_ops
+ */
+struct rte_dma_dev_ops {
+ rte_dma_info_get_t dev_info_get;
+ rte_dma_configure_t dev_configure;
+ rte_dma_start_t dev_start;
+ rte_dma_stop_t dev_stop;
+ rte_dma_close_t dev_close;
+
+ rte_dma_vchan_setup_t vchan_setup;
+
+ rte_dma_stats_get_t stats_get;
+ rte_dma_stats_reset_t stats_reset;
+
+ rte_dma_vchan_status_t vchan_status;
+ rte_dma_dump_t dev_dump;
+};
+
+/**
+ * @internal
+ * The data part, with no function pointers, associated with each DMA device.
+ *
+ * This structure is safe to place in shared memory to be common among different
+ * processes in a multi-process configuration.
+ *
+ * @see struct rte_dma_dev::data
+ */
+struct rte_dma_dev_data {
+ char dev_name[RTE_DEV_NAME_MAX_LEN]; /**< Unique identifier name */
+ int16_t dev_id; /**< Device [external] identifier. */
+ int16_t numa_node; /**< Local NUMA memory ID. -1 if unknown. */
+ void *dev_private; /**< PMD-specific private data. */
+ struct rte_dma_conf dev_conf; /**< DMA device configuration. */
+ __extension__
+ uint8_t dev_started : 1; /**< Device state: STARTED(1)/STOPPED(0). */
+ uint64_t reserved[2]; /**< Reserved for future fields */
+} __rte_cache_aligned;
+
+/**
+ * Possible states of a DMA device.
+ *
+ * @see struct rte_dma_dev::state
+ */
+enum rte_dma_dev_state {
+ RTE_DMA_DEV_UNUSED = 0, /**< Device is unused. */
+ /** Device is registered, but not ready to be used. */
+ RTE_DMA_DEV_REGISTERED,
+ /** Device is ready for use. This is set by the PMD. */
+ RTE_DMA_DEV_READY,
+};
+
+/**
+ * @internal
+ * The generic data structure associated with each DMA device.
+ */
+struct rte_dma_dev {
+ /** Device info supplied during device initialization. */
+ struct rte_device *device;
+ struct rte_dma_dev_data *data; /**< Pointer to shared device data. */
+ /** Fast-path functions and related data. */
+ struct rte_dma_fp_object *fp_obj;
+ /** Functions implemented by PMD. */
+ const struct rte_dma_dev_ops *dev_ops;
+ enum rte_dma_dev_state state; /**< Flag indicating the device state. */
+ uint64_t reserved[2]; /**< Reserved for future fields. */
+} __rte_cache_aligned;
+
+/**
+ * @internal
+ * Allocate a new dmadev slot for a DMA device and return the pointer to that
+ * slot for the driver to use.
+ *
+ * @param name
+ * DMA device name.
+ * @param numa_node
+ * Driver's private data's NUMA node.
+ * @param private_data_size
+ * Driver's private data size.
+ *
+ * @return
+ *   A pointer to the DMA device slot in case of success,
+ * NULL otherwise.
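+ *
+ * A sketch of typical probe-time usage by a driver; the my_* names are
+ * illustrative driver-local symbols, not part of this interface:
+ *
+ * \code{.c}
+ *    struct rte_dma_dev *dev;
+ *
+ *    dev = rte_dma_pmd_allocate(name, numa_node, sizeof(struct my_priv));
+ *    if (dev == NULL)
+ *        return -ENOMEM;
+ *    dev->dev_ops = &my_dev_ops;
+ *    dev->fp_obj->dev_private = dev->data->dev_private;
+ *    dev->fp_obj->copy = my_copy;
+ *    dev->fp_obj->submit = my_submit;
+ *    dev->fp_obj->completed = my_completed;
+ *    dev->state = RTE_DMA_DEV_READY;
+ * \endcode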
+ */
+__rte_internal
+struct rte_dma_dev *rte_dma_pmd_allocate(const char *name, int numa_node,
+ size_t private_data_size);
+
+/**
+ * @internal
+ * Release the specified dmadev.
+ *
+ * @param name
+ * DMA device name.
+ *
+ * @return
+ * - 0 on success, negative on error.
+ */
+__rte_internal
+int rte_dma_pmd_release(const char *name);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RTE_DMADEV_PMD_H */
diff --git a/lib/librte_dmadev/version.map b/lib/librte_dmadev/version.map
new file mode 100644
index 000000000..7031d6b33
--- /dev/null
+++ b/lib/librte_dmadev/version.map
@@ -0,0 +1,31 @@
+EXPERIMENTAL {
+ global:
+
+ rte_dma_close;
+ rte_dma_configure;
+ rte_dma_count_avail;
+ rte_dma_dev_max;
+ rte_dma_dump;
+ rte_dma_get_dev_id_by_name;
+ rte_dma_info_get;
+ rte_dma_is_valid;
+ rte_dma_next_dev;
+ rte_dma_start;
+ rte_dma_stats_get;
+ rte_dma_stats_reset;
+ rte_dma_stop;
+ rte_dma_vchan_setup;
+ rte_dma_vchan_status;
+
+ local: *;
+};
+
+INTERNAL {
+ global:
+
+ rte_dma_fp_objs;
+ rte_dma_pmd_allocate;
+ rte_dma_pmd_release;
+
+ local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index ed00f8914..86106240d 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -25,7 +25,7 @@ libraries = [
'gro', 'gso', 'ip_frag', 'jobstats',
'kni', 'latencystats', 'lpm', 'member',
'power', 'pdump', 'rawdev', 'regexdev',
- 'rib', 'reorder', 'sched', 'security', 'stack', 'vhost',
+ 'rib', 'reorder', 'sched', 'security', 'stack', 'vhost', 'dmadev',
# ipsec lib depends on net, crypto and security
'ipsec',
#fib lib depends on rib
--
2.23.0