mirror of
git://git.openembedded.org/meta-openembedded
synced 2026-04-02 02:49:12 +00:00
106 lines
3.6 KiB
Diff
106 lines
3.6 KiB
Diff
From 1fba7790367d7b726d05a33bbbcebe10b9280a31 Mon Sep 17 00:00:00 2001
|
|
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
|
|
Date: Mon, 29 Nov 2010 02:10:22 +0200
|
|
Subject: [PATCH 12/24] ARM: better NEON instructions scheduling for add_8888_8888_8888
|
|
|
|
Provides a minor performance improvement by using pipelining and hiding
|
|
instruction latencies. Also do not clobber d0-d3 registers (source
|
|
image pixels) while doing calculations in order to allow the use of
|
|
the same macro for add_n_8_8888 fast path later.
|
|
|
|
Benchmark from ARM Cortex-A8 @500MHz:
|
|
|
|
== before ==
|
|
|
|
add_8888_8888_8888 = L1: 95.94 L2: 42.27 M: 25.60 (121.09%)
|
|
HT: 14.54 VT: 13.13 R: 12.77 RT: 4.49 (48Kops/s)
|
|
add_8888_8_8888 = L1: 104.51 L2: 57.81 M: 36.06 (106.62%)
|
|
HT: 19.24 VT: 16.45 R: 14.71 RT: 4.80 (51Kops/s)
|
|
|
|
== after ==
|
|
|
|
add_8888_8888_8888 = L1: 106.66 L2: 47.82 M: 27.32 (129.30%)
|
|
HT: 15.44 VT: 13.96 R: 12.86 RT: 4.48 (48Kops/s)
|
|
add_8888_8_8888 = L1: 107.72 L2: 61.02 M: 38.26 (113.16%)
|
|
HT: 19.48 VT: 16.72 R: 14.82 RT: 4.80 (51Kops/s)
|
|
---
|
|
pixman/pixman-arm-neon-asm.S | 52 +++++++++++++++++++++++++++--------------
|
|
1 files changed, 34 insertions(+), 18 deletions(-)
|
|
|
|
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
|
|
index 11ef166..829ef84 100644
|
|
--- a/pixman/pixman-arm-neon-asm.S
|
|
+++ b/pixman/pixman-arm-neon-asm.S
|
|
@@ -1542,34 +1542,50 @@ generate_composite_function \
|
|
/* expecting source data in {d0, d1, d2, d3} */
|
|
/* destination data in {d4, d5, d6, d7} */
|
|
/* mask in {d24, d25, d26, d27} */
|
|
- vmull.u8 q8, d27, d0
|
|
- vmull.u8 q9, d27, d1
|
|
+ vmull.u8 q8, d27, d0
|
|
+ vmull.u8 q9, d27, d1
|
|
vmull.u8 q10, d27, d2
|
|
vmull.u8 q11, d27, d3
|
|
- vrshr.u16 q0, q8, #8
|
|
- vrshr.u16 q1, q9, #8
|
|
- vrshr.u16 q12, q10, #8
|
|
- vrshr.u16 q13, q11, #8
|
|
- vraddhn.u16 d0, q0, q8
|
|
- vraddhn.u16 d1, q1, q9
|
|
- vraddhn.u16 d2, q12, q10
|
|
- vraddhn.u16 d3, q13, q11
|
|
- vqadd.u8 q14, q0, q2
|
|
- vqadd.u8 q15, q1, q3
|
|
+ /* 1 cycle bubble */
|
|
+ vrsra.u16 q8, q8, #8
|
|
+ vrsra.u16 q9, q9, #8
|
|
+ vrsra.u16 q10, q10, #8
|
|
+ vrsra.u16 q11, q11, #8
|
|
.endm
|
|
|
|
.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
|
|
+ /* 2 cycle bubble */
|
|
+ vrshrn.u16 d28, q8, #8
|
|
+ vrshrn.u16 d29, q9, #8
|
|
+ vrshrn.u16 d30, q10, #8
|
|
+ vrshrn.u16 d31, q11, #8
|
|
+ vqadd.u8 q14, q2, q14
|
|
+ /* 1 cycle bubble */
|
|
+ vqadd.u8 q15, q3, q15
|
|
.endm
|
|
|
|
-/* TODO: expand macros and do better instructions scheduling */
|
|
.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
|
|
- pixman_composite_add_8888_8888_8888_process_pixblock_tail
|
|
- vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
|
- vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
|
|
- fetch_mask_pixblock
|
|
fetch_src_pixblock
|
|
+ vrshrn.u16 d28, q8, #8
|
|
+ fetch_mask_pixblock
|
|
+ vrshrn.u16 d29, q9, #8
|
|
+ vmull.u8 q8, d27, d0
|
|
+ vrshrn.u16 d30, q10, #8
|
|
+ vmull.u8 q9, d27, d1
|
|
+ vrshrn.u16 d31, q11, #8
|
|
+ vmull.u8 q10, d27, d2
|
|
+ vqadd.u8 q14, q2, q14
|
|
+ vmull.u8 q11, d27, d3
|
|
+ vqadd.u8 q15, q3, q15
|
|
+ vrsra.u16 q8, q8, #8
|
|
+ vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
|
|
+ vrsra.u16 q9, q9, #8
|
|
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
|
|
+ vrsra.u16 q10, q10, #8
|
|
+
|
|
cache_preload 8, 8
|
|
- pixman_composite_add_8888_8888_8888_process_pixblock_head
|
|
+
|
|
+ vrsra.u16 q11, q11, #8
|
|
.endm
|
|
|
|
generate_composite_function \
|
|
--
|
|
1.6.6.1
|
|
|