From 335ebe3035b6fcb83c3f225bc5135300fc24c827 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 26 Feb 2019 12:36:18 +0100
Subject: lib/raid6: arm: optimize away a mask operation in NEON recovery
 routine

The NEON recovery code was modeled after the x86 SIMD code, and for
some reason, that code uses a 16 bit wide signed shift and a mask to
perform what amounts to a 8 bit unsigned shift. So fold the ops
together.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 lib/raid6/recov_neon_inner.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c
index 7d00c31a6547..f13c07f82297 100644
--- a/lib/raid6/recov_neon_inner.c
+++ b/lib/raid6/recov_neon_inner.c
@@ -56,14 +56,14 @@ void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
 		px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
 		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
 
-		vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+		vy = vshrq_n_u8(vx, 4);
 		vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
-		vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+		vy = vqtbl1q_u8(qm1, vy);
 		qx = veorq_u8(vx, vy);
 
-		vy = (uint8x16_t)vshrq_n_s16((int16x8_t)px, 4);
+		vy = vshrq_n_u8(px, 4);
 		vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
-		vy = vqtbl1q_u8(pm1, vandq_u8(vy, x0f));
+		vy = vqtbl1q_u8(pm1, vy);
 		vx = veorq_u8(vx, vy);
 		db = veorq_u8(vx, qx);
 
@@ -97,9 +97,9 @@ void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
 
 		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
 
-		vy = (uint8x16_t)vshrq_n_s16((int16x8_t)vx, 4);
+		vy = vshrq_n_u8(vx, 4);
 		vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
-		vy = vqtbl1q_u8(qm1, vandq_u8(vy, x0f));
+		vy = vqtbl1q_u8(qm1, vy);
 		vx = veorq_u8(vx, vy);
 		vy = veorq_u8(vx, vld1q_u8(p));
 
-- 
cgit v1.2.3-58-ga151