mirror of
git://git.openembedded.org/meta-openembedded
synced 2026-04-02 02:49:12 +00:00
140 lines
5.5 KiB
Diff
140 lines
5.5 KiB
Diff
From 3990931bf6197eff1cec06cf24bce53ddf9a539a Mon Sep 17 00:00:00 2001
|
|
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
|
|
Date: Sat, 27 Nov 2010 04:47:39 +0200
|
|
Subject: [PATCH 09/24] ARM: reuse common NEON code for over_{n_8|8888_n|8888_8}_0565
|
|
|
|
Renamed supplementary macros from 'over_n_8_0565' to 'over_8888_8_0565',
|
|
because they can actually support all variants of this operation:
|
|
over_8888_8_0565/over_n_8_0565/over_8888_n_0565.
|
|
|
|
Also 'over_8888_8_0565' now uses more optimized common code instead of its
|
|
own variant, improving performance a bit. Even though this operation is
|
|
still memory bandwidth limited, scaled variants of these fast paths may
|
|
put more stress on the CPU later.
|
|
|
|
Benchmarked on ARM Cortex-A8 @500MHz:
|
|
|
|
== before ==
|
|
|
|
over_8888_8_0565 = L1: 67.10 L2: 53.82 M: 44.70 (105.17%)
|
|
HT: 18.73 VT: 16.91 R: 14.25 RT: 4.80 (52Kops/s)
|
|
|
|
== after ==
|
|
|
|
over_8888_8_0565 = L1: 77.83 L2: 58.14 M: 44.82 (105.52%)
|
|
HT: 20.58 VT: 17.44 R: 15.05 RT: 4.88 (52Kops/s)
|
|
---
|
|
pixman/pixman-arm-neon-asm.S | 61 +++++++++++++++++------------------------
|
|
1 files changed, 25 insertions(+), 36 deletions(-)
|
|
|
|
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
|
|
index 3e52a49..4175144 100644
|
|
--- a/pixman/pixman-arm-neon-asm.S
|
|
+++ b/pixman/pixman-arm-neon-asm.S
|
|
@@ -791,7 +791,7 @@ generate_composite_function \
|
|
|
|
/******************************************************************************/
|
|
|
|
-.macro pixman_composite_over_n_8_0565_process_pixblock_head
|
|
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
|
|
vmull.u8 q0, d24, d8 /* IN for SRC pixels (part1) */
|
|
vmull.u8 q1, d24, d9
|
|
vmull.u8 q6, d24, d10
|
|
@@ -816,7 +816,7 @@ generate_composite_function \
|
|
vmull.u8 q10, d3, d30
|
|
.endm
|
|
|
|
-.macro pixman_composite_over_n_8_0565_process_pixblock_tail
|
|
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
|
|
/* 3 cycle bubble (after vmull.u8) */
|
|
vrshr.u16 q13, q8, #8
|
|
vrshr.u16 q11, q9, #8
|
|
@@ -835,7 +835,7 @@ generate_composite_function \
|
|
vsri.u16 q14, q9, #11
|
|
.endm
|
|
|
|
-.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
|
|
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
|
|
vld1.16 {d4, d5}, [DST_R, :128]!
|
|
vshrn.u16 d6, q2, #8
|
|
fetch_mask_pixblock
|
|
@@ -880,6 +880,23 @@ generate_composite_function \
|
|
vmull.u8 q10, d3, d30
|
|
.endm
|
|
|
|
+generate_composite_function \
|
|
+ pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
|
|
+ FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
|
|
+ 8, /* number of pixels, processed in a single block */ \
|
|
+ 5, /* prefetch distance */ \
|
|
+ default_init_need_all_regs, \
|
|
+ default_cleanup_need_all_regs, \
|
|
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
|
|
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
|
|
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
|
|
+ 28, /* dst_w_basereg */ \
|
|
+ 4, /* dst_r_basereg */ \
|
|
+ 8, /* src_basereg */ \
|
|
+ 24 /* mask_basereg */
|
|
+
|
|
+/******************************************************************************/
|
|
+
|
|
/*
|
|
* This function needs a special initialization of solid mask.
|
|
* Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
|
|
@@ -911,9 +928,9 @@ generate_composite_function \
|
|
5, /* prefetch distance */ \
|
|
pixman_composite_over_n_8_0565_init, \
|
|
pixman_composite_over_n_8_0565_cleanup, \
|
|
- pixman_composite_over_n_8_0565_process_pixblock_head, \
|
|
- pixman_composite_over_n_8_0565_process_pixblock_tail, \
|
|
- pixman_composite_over_n_8_0565_process_pixblock_tail_head
|
|
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
|
|
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
|
|
+ pixman_composite_over_8888_8_0565_process_pixblock_tail_head
|
|
|
|
/******************************************************************************/
|
|
|
|
@@ -935,36 +952,8 @@ generate_composite_function \
|
|
5, /* prefetch distance */ \
|
|
pixman_composite_over_8888_n_0565_init, \
|
|
pixman_composite_over_8888_n_0565_cleanup, \
|
|
- pixman_composite_over_n_8_0565_process_pixblock_head, \
|
|
- pixman_composite_over_n_8_0565_process_pixblock_tail, \
|
|
- pixman_composite_over_n_8_0565_process_pixblock_tail_head, \
|
|
- 28, /* dst_w_basereg */ \
|
|
- 4, /* dst_r_basereg */ \
|
|
- 8, /* src_basereg */ \
|
|
- 24 /* mask_basereg */
|
|
-
|
|
-/******************************************************************************/
|
|
-
|
|
-/* TODO: expand macros and do better instructions scheduling */
|
|
-.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
|
|
- vld1.16 {d4, d5}, [DST_R, :128]!
|
|
- pixman_composite_over_n_8_0565_process_pixblock_tail
|
|
- fetch_src_pixblock
|
|
- cache_preload 8, 8
|
|
- fetch_mask_pixblock
|
|
- pixman_composite_over_n_8_0565_process_pixblock_head
|
|
- vst1.16 {d28, d29}, [DST_W, :128]!
|
|
-.endm
|
|
-
|
|
-generate_composite_function \
|
|
- pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
|
|
- FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
|
|
- 8, /* number of pixels, processed in a single block */ \
|
|
- 5, /* prefetch distance */ \
|
|
- default_init_need_all_regs, \
|
|
- default_cleanup_need_all_regs, \
|
|
- pixman_composite_over_n_8_0565_process_pixblock_head, \
|
|
- pixman_composite_over_n_8_0565_process_pixblock_tail, \
|
|
+ pixman_composite_over_8888_8_0565_process_pixblock_head, \
|
|
+ pixman_composite_over_8888_8_0565_process_pixblock_tail, \
|
|
pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
|
|
28, /* dst_w_basereg */ \
|
|
4, /* dst_r_basereg */ \
|
|
--
|
|
1.6.6.1
|
|
|