summaryrefslogtreecommitdiff
path: root/arch/arm64
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-05-18 13:04:15 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-05-18 13:04:15 -0700
commit25f4874662fb0d43fc1d934dd7802b740ed2ab5f (patch)
tree03922245c2f078c36789ecbc15fe9e4497ec96b2 /arch/arm64
parent56172ac1024d2cb9194ea42fe76d05c0748863f4 (diff)
parent9c0731832d3b7420cbadba6a7f334363bc8dfb15 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe: "Aside from the usual things this has an arch update for __iowrite64_copy() used by the RDMA drivers. This API was intended to generate large 64 byte MemWr TLPs on PCI. These days most processors had done this by just repeating writel() in a loop. S390 and some new ARM64 designs require a special helper to get this to generate. - Small improvements and fixes for erdma, efa, hfi1, bnxt_re - Fix a UAF crash after module unload on leaking restrack entry - Continue adding full RDMA support in mana with support for EQs, GID's and CQs - Improvements to the mkey cache in mlx5 - DSCP traffic class support in hns and several bug fixes - Cap the maximum number of MADs in the receive queue to avoid OOM - Another batch of rxe bug fixes from large scale testing - __iowrite64_copy() optimizations for write combining MMIO memory - Remove NULL checks before dev_put/hold() - EFA support for receive with immediate - Fix a recent memleaking regression in a cma error path" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (70 commits) RDMA/cma: Fix kmemleak in rdma_core observed during blktests nvme/rdma use siw RDMA/IPoIB: Fix format truncation compilation errors bnxt_re: avoid shift undefined behavior in bnxt_qplib_alloc_init_hwq RDMA/efa: Support QP with unsolicited write w/ imm. receive IB/hfi1: Remove generic .ndo_get_stats64 IB/hfi1: Do not use custom stat allocator RDMA/hfi1: Use RMW accessors for changing LNKCTL2 RDMA/mana_ib: implement uapi for creation of rnic cq RDMA/mana_ib: boundary check before installing cq callbacks RDMA/mana_ib: introduce a helper to remove cq callbacks RDMA/mana_ib: create and destroy RNIC cqs RDMA/mana_ib: create EQs for RNIC CQs RDMA/core: Remove NULL check before dev_{put, hold} RDMA/ipoib: Remove NULL check before dev_{put, hold} RDMA/mlx5: Remove NULL check before dev_{put, hold} RDMA/mlx5: Track DCT, DCI and REG_UMR QPs as diver_detail resources. 
RDMA/core: Add an option to display driver-specific QPs in the rdmatool RDMA/efa: Add shutdown notifier RDMA/mana_ib: Fix missing ret value IB/mlx5: Use __iowrite64_copy() for write combining stores ...
Diffstat (limited to 'arch/arm64')
-rw-r--r--arch/arm64/include/asm/io.h132
-rw-r--r--arch/arm64/kernel/io.c42
2 files changed, 174 insertions, 0 deletions
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 8d825522c55c..4ff0ae3f6d66 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -140,6 +140,138 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
#define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l))
/*
+ * The ARM64 iowrite implementation is intended to support drivers that want to
+ * use write combining. For instance PCI drivers using write combining with a 64
+ * byte __iowrite64_copy() expect to get a 64 byte MemWr TLP on the PCIe bus.
+ *
+ * Newer ARM cores have sensitive write combining buffers, so it is important
+ * that the stores be contiguous blocks of store instructions. Normal memcpy
+ * approaches have a very low chance of generating write combining.
+ *
+ * Since this is the only API on ARM64 that should be used with write combining
+ * it also integrates the DGH hint which is supposed to lower the latency to
+ * emit the large TLP from the CPU.
+ */
+
+/*
+ * Store @count 32-bit words from @from to the MMIO address @to.
+ *
+ * For a count of 8, 4 or 2 all stores are emitted from a single asm
+ * statement as one contiguous group of STR instructions; a count of 1 is a
+ * plain __raw_writel().
+ *
+ * @count must be a compile-time constant equal to 1, 2, 4 or 8; anything
+ * else reaches BUILD_BUG().  The function is __always_inline for that
+ * reason: with plain "inline" the compiler may emit an out-of-line copy in
+ * which @count is no longer a constant, and the BUILD_BUG() in the default
+ * case then breaks the build.
+ */
+static __always_inline void
+__const_memcpy_toio_aligned32(volatile u32 __iomem *to, const u32 *from,
+			      size_t count)
+{
+	switch (count) {
+	case 8:
+		asm volatile("str %w0, [%8, #4 * 0]\n"
+			     "str %w1, [%8, #4 * 1]\n"
+			     "str %w2, [%8, #4 * 2]\n"
+			     "str %w3, [%8, #4 * 3]\n"
+			     "str %w4, [%8, #4 * 4]\n"
+			     "str %w5, [%8, #4 * 5]\n"
+			     "str %w6, [%8, #4 * 6]\n"
+			     "str %w7, [%8, #4 * 7]\n"
+			     :
+			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
+			       "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
+			       "rZ"(from[6]), "rZ"(from[7]), "r"(to));
+		break;
+	case 4:
+		asm volatile("str %w0, [%4, #4 * 0]\n"
+			     "str %w1, [%4, #4 * 1]\n"
+			     "str %w2, [%4, #4 * 2]\n"
+			     "str %w3, [%4, #4 * 3]\n"
+			     :
+			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
+			       "rZ"(from[3]), "r"(to));
+		break;
+	case 2:
+		asm volatile("str %w0, [%2, #4 * 0]\n"
+			     "str %w1, [%2, #4 * 1]\n"
+			     :
+			     : "rZ"(from[0]), "rZ"(from[1]), "r"(to));
+		break;
+	case 1:
+		__raw_writel(*from, to);
+		break;
+	default:
+		/* Only reachable if @count was not one of the constants above. */
+		BUILD_BUG();
+	}
+}
+
+void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count);
+
+/*
+ * Inline front end used by __iowrite32_copy().
+ *
+ * When @count is a compile-time constant equal to 8, 4, 2 or 1 the copy is
+ * expanded in place through __const_memcpy_toio_aligned32() and followed by
+ * dgh(); every other count is handed to the out-of-line
+ * __iowrite32_copy_full().
+ *
+ * The __builtin_constant_p() test and __always_inline together make sure the
+ * constant-only helper is never reached with a run-time @count, which would
+ * otherwise trip its BUILD_BUG() whenever the compiler decided not to inline
+ * this function.
+ */
+static __always_inline void
+__const_iowrite32_copy(void __iomem *to, const void *from, size_t count)
+{
+	if (__builtin_constant_p(count) &&
+	    (count == 8 || count == 4 || count == 2 || count == 1)) {
+		__const_memcpy_toio_aligned32(to, from, count);
+		dgh();
+	} else {
+		__iowrite32_copy_full(to, from, count);
+	}
+}
+
+/*
+ * Copy @count 32-bit words to MMIO.  Constant counts go through the inline
+ * front end above, run-time counts go straight to __iowrite32_copy_full().
+ */
+#define __iowrite32_copy(to, from, count) \
+	(__builtin_constant_p(count) ? \
+	 __const_iowrite32_copy(to, from, count) : \
+	 __iowrite32_copy_full(to, from, count))
+
+/*
+ * Store @count 64-bit words from @from to the MMIO address @to.
+ *
+ * For a count of 8, 4 or 2 all stores are emitted from a single asm
+ * statement as one contiguous group of STR instructions; a count of 1 is a
+ * plain __raw_writeq().
+ *
+ * @count must be a compile-time constant equal to 1, 2, 4 or 8; anything
+ * else reaches BUILD_BUG().  The function is __always_inline for that
+ * reason: with plain "inline" the compiler may emit an out-of-line copy in
+ * which @count is no longer a constant, and the BUILD_BUG() in the default
+ * case then breaks the build.
+ */
+static __always_inline void
+__const_memcpy_toio_aligned64(volatile u64 __iomem *to, const u64 *from,
+			      size_t count)
+{
+	switch (count) {
+	case 8:
+		asm volatile("str %x0, [%8, #8 * 0]\n"
+			     "str %x1, [%8, #8 * 1]\n"
+			     "str %x2, [%8, #8 * 2]\n"
+			     "str %x3, [%8, #8 * 3]\n"
+			     "str %x4, [%8, #8 * 4]\n"
+			     "str %x5, [%8, #8 * 5]\n"
+			     "str %x6, [%8, #8 * 6]\n"
+			     "str %x7, [%8, #8 * 7]\n"
+			     :
+			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
+			       "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]),
+			       "rZ"(from[6]), "rZ"(from[7]), "r"(to));
+		break;
+	case 4:
+		asm volatile("str %x0, [%4, #8 * 0]\n"
+			     "str %x1, [%4, #8 * 1]\n"
+			     "str %x2, [%4, #8 * 2]\n"
+			     "str %x3, [%4, #8 * 3]\n"
+			     :
+			     : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]),
+			       "rZ"(from[3]), "r"(to));
+		break;
+	case 2:
+		asm volatile("str %x0, [%2, #8 * 0]\n"
+			     "str %x1, [%2, #8 * 1]\n"
+			     :
+			     : "rZ"(from[0]), "rZ"(from[1]), "r"(to));
+		break;
+	case 1:
+		__raw_writeq(*from, to);
+		break;
+	default:
+		/* Only reachable if @count was not one of the constants above. */
+		BUILD_BUG();
+	}
+}
+
+void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count);
+
+/*
+ * Inline front end used by __iowrite64_copy().
+ *
+ * When @count is a compile-time constant equal to 8, 4, 2 or 1 the copy is
+ * expanded in place through __const_memcpy_toio_aligned64() and followed by
+ * dgh(); every other count is handed to the out-of-line
+ * __iowrite64_copy_full().
+ *
+ * The __builtin_constant_p() test and __always_inline together make sure the
+ * constant-only helper is never reached with a run-time @count, which would
+ * otherwise trip its BUILD_BUG() whenever the compiler decided not to inline
+ * this function.
+ */
+static __always_inline void
+__const_iowrite64_copy(void __iomem *to, const void *from, size_t count)
+{
+	if (__builtin_constant_p(count) &&
+	    (count == 8 || count == 4 || count == 2 || count == 1)) {
+		__const_memcpy_toio_aligned64(to, from, count);
+		dgh();
+	} else {
+		__iowrite64_copy_full(to, from, count);
+	}
+}
+
+/*
+ * Copy @count 64-bit words to MMIO.  Constant counts go through the inline
+ * front end above, run-time counts go straight to __iowrite64_copy_full().
+ */
+#define __iowrite64_copy(to, from, count) \
+	(__builtin_constant_p(count) ? \
+	 __const_iowrite64_copy(to, from, count) : \
+	 __iowrite64_copy_full(to, from, count))
+
+/*
* I/O memory mapping functions.
*/
diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c
index aa7a4ec6a3ae..ef48089fbfe1 100644
--- a/arch/arm64/kernel/io.c
+++ b/arch/arm64/kernel/io.c
@@ -38,6 +38,48 @@ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
EXPORT_SYMBOL(__memcpy_fromio);
/*
+ * This generates a memcpy that works on from/to addresses which are aligned to
+ * bits. Count is the number of bits-sized quantities to copy. It
+ * optimizes to use the STR groupings when possible so that it is WC friendly.
+ */
+#define memcpy_toio_aligned(to, from, count, bits) \
+	({ \
+		volatile u##bits __iomem *_dst = to; \
+		const u##bits *_src = from; \
+		size_t _left = count; \
+		\
+		/* Whole groups of eight elements first. */ \
+		while (_left >= 8) { \
+			__const_memcpy_toio_aligned##bits(_dst, _src, 8); \
+			_src += 8; \
+			_dst += 8; \
+			_left -= 8; \
+		} \
+		/* 0..7 elements remain: emit one group per set bit. */ \
+		if (_left & 4) { \
+			__const_memcpy_toio_aligned##bits(_dst, _src, 4); \
+			_src += 4; \
+			_dst += 4; \
+		} \
+		if (_left & 2) { \
+			__const_memcpy_toio_aligned##bits(_dst, _src, 2); \
+			_src += 2; \
+			_dst += 2; \
+		} \
+		if (_left & 1) \
+			__const_memcpy_toio_aligned##bits(_dst, _src, 1); \
+	})
+
+/*
+ * Out-of-line 64-bit MMIO copy: writes count 64-bit quantities from "from"
+ * to "to" through memcpy_toio_aligned(), then issues dgh() once after all
+ * of the stores.
+ */
+void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count)
+{
+ memcpy_toio_aligned(to, from, count, 64);
+ dgh();
+}
+EXPORT_SYMBOL(__iowrite64_copy_full);
+
+/*
+ * Out-of-line 32-bit MMIO copy: writes count 32-bit quantities from "from"
+ * to "to" through memcpy_toio_aligned(), then issues dgh() once after all
+ * of the stores.
+ */
+void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count)
+{
+ memcpy_toio_aligned(to, from, count, 32);
+ dgh();
+}
+EXPORT_SYMBOL(__iowrite32_copy_full);
+
+/*
* Copy data from "real" memory space to IO memory space.
*/
void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)