Cemu/dependencies/ih264d/common/armv8/ih264_ihadamard_scaling_av8.s

251 lines
8.6 KiB
ArmAsm

//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// * ih264_ihadamard_scaling_av8.s
// *
// * @brief
// * Contains function definitions for inverse hadamard transform on 4x4 DC outputs
// * of 16x16 intra-prediction
// *
// * @author
// * Mohit
// *
// * @par List of Functions:
// * - ih264_ihadamard_scaling_4x4_av8()
// *
// * @remarks
// * None
// *
.include "ih264_neon_macros.s"
// *******************************************************************************
// */
// * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
// * of a 16x16 intra prediction macroblock, and then performs scaling.
// * The scaled result is written to the output buffer.
// *
// * @par Description:
// * The DC coefficients pass through a 2-stage inverse hadamard transform.
// * The inverse-transformed content is then scaled based on the Qp value.
// *
// * @param[in] pi2_src
// * input 4x4 block of DC coefficients
// *
// * @param[out] pi2_out
// * output 4x4 block
// *
// * @param[in] pu2_iscal_mat
// * pointer to scaling list
// *
// * @param[in] pu2_weigh_mat
// * pointer to weight matrix
// *
// * @param[in] u4_qp_div_6
// * Floor (qp/6)
// *
// * @param[in] pi4_tmp
// * temporary buffer of size 1*16
// *
// * @returns none
// *
// * @remarks none
// *
// *******************************************************************************
// */
// *
// *******************************************************************************
// */
// void ih264_ihadamard_scaling_4x4(word16* pi2_src,
// word16* pi2_out,
// const uword16 *pu2_iscal_mat,
// const uword16 *pu2_weigh_mat,
// uword32 u4_qp_div_6,
// word32* pi4_tmp)
//**************variables vs registers*****************************************
//x0 => *pi2_src
//x1 => *pi2_out
//x2 => *pu2_iscal_mat
//x3 => *pu2_weigh_mat
//x4=> u4_qp_div_6
.text
.p2align 2
.global ih264_ihadamard_scaling_4x4_av8
ih264_ihadamard_scaling_4x4_av8:
    // 4x4 inverse Hadamard transform of the luma DC block followed by scaling.
    //
    // Arguments as used by this routine:
    //   x0 : pi2_src        - 16 x int16 input coefficients
    //   x1 : pi2_out        - 16 x int16 output
    //   x2 : pu2_iscal_mat  - only element [0] is read
    //   x3 : pu2_weigh_mat  - only element [0] is read
    //   w4 : u4_qp_div_6    - used as a per-lane left-shift count
    //   (the sixth C argument, pi4_tmp, is never referenced here)
    //
    // Scaling is done in two steps: every transformed value is multiplied by
    // weigh[0] * iscal[0], shifted left by u4_qp_div_6 (SSHL), and finally
    // narrowed with a saturating, rounding right shift by 6 (SQRSHRN), i.e.
    //   out = sat16(((t * scale) << qp_div_6) + 32) >> 6)
    //
    // Vector registers written: v0-v7, v14-v16.
    push_v_regs                             // macro from the included ih264_neon_macros.s (not visible
                                            // here); presumably saves the SIMD registers that
                                            // pop_v_regs restores - confirm

    //---------------- scale factor and shift count -------------------------------
    ld1       {v15.h}[0], [x3]              // lane 0 = pu2_weigh_mat[0]
    ld1       {v16.h}[0], [x2]              // lane 0 = pu2_iscal_mat[0]
    umull     v15.4s, v15.4h, v16.4h        // only lane 0 of the product is meaningful
    dup       v15.4s, v15.s[0]              // all lanes = weigh[0] * iscal[0]
    dup       v14.4s, w4                    // all lanes = u4_qp_div_6

    //---------------- first butterfly pass (along each row) ----------------------
    // LD4 de-interleaves with stride 4, so vN holds element N of every row
    // (one lane per row).
    ld4       {v0.4h, v1.4h, v2.4h, v3.4h}, [x0]

    ssubl     v7.4s, v0.4h, v3.4h           // x3 = x4 - x7
    ssubl     v6.4s, v1.4h, v2.4h           // x2 = x5 - x6
    saddl     v5.4s, v1.4h, v2.4h           // x1 = x5 + x6
    saddl     v4.4s, v0.4h, v3.4h           // x0 = x4 + x7

    sub       v3.4s, v7.4s, v6.4s           // tmp[3] = x3 - x2
    sub       v2.4s, v4.4s, v5.4s           // tmp[2] = x0 - x1
    add       v1.4s, v7.4s, v6.4s           // tmp[1] = x3 + x2
    add       v0.4s, v4.4s, v5.4s           // tmp[0] = x0 + x1

    //---------------- 4x4 transpose of 32-bit elements ---------------------------
    trn1      v4.4s, v0.4s, v1.4s
    trn2      v5.4s, v0.4s, v1.4s
    trn1      v6.4s, v2.4s, v3.4s
    trn2      v7.4s, v2.4s, v3.4s

    trn1      v0.2d, v4.2d, v6.2d           // v0 = row 0
    trn1      v1.2d, v5.2d, v7.2d           // v1 = row 1
    trn2      v2.2d, v4.2d, v6.2d           // v2 = row 2
    trn2      v3.2d, v5.2d, v7.2d           // v3 = row 3

    //---------------- second butterfly pass (across the rows) --------------------
    sub       v7.4s, v0.4s, v3.4s           // x3 = x4 - x7
    sub       v6.4s, v1.4s, v2.4s           // x2 = x5 - x6
    add       v5.4s, v1.4s, v2.4s           // x1 = x5 + x6
    add       v4.4s, v0.4s, v3.4s           // x0 = x4 + x7

    sub       v3.4s, v7.4s, v6.4s           // row 3 = x3 - x2
    sub       v2.4s, v4.4s, v5.4s           // row 2 = x0 - x1
    add       v1.4s, v7.4s, v6.4s           // row 1 = x3 + x2
    add       v0.4s, v4.4s, v5.4s           // row 0 = x0 + x1

    //---------------- scaling ----------------------------------------------------
    mul       v0.4s, v0.4s, v15.4s          // p = t * (weigh[0] * iscal[0]), rows 0..3
    mul       v1.4s, v1.4s, v15.4s
    mul       v2.4s, v2.4s, v15.4s
    mul       v3.4s, v3.4s, v15.4s

    sshl      v0.4s, v0.4s, v14.4s          // q = p << u4_qp_div_6, rows 0..3
    sshl      v1.4s, v1.4s, v14.4s
    sshl      v2.4s, v2.4s, v14.4s
    sshl      v3.4s, v3.4s, v14.4s

    sqrshrn   v0.4h, v0.4s, #6              // c = sat16((q + 32) >> 6), rows 0..3
    sqrshrn   v1.4h, v1.4s, #6
    sqrshrn   v2.4h, v2.4s, #6
    sqrshrn   v3.4h, v3.4s, #6

    st1       {v0.4h, v1.4h, v2.4h, v3.4h}, [x1]   // 16 results, rows stored back to back

    pop_v_regs
    ret
// *******************************************************************************
// */
// * @brief This function performs a 2x2 inverse hadamard transform for chroma block
// *
// * @par Description:
// * The DC coefficients pass through a 2-stage inverse hadamard transform.
// * The inverse-transformed content is then scaled based on the Qp value.
// * The DC blocks of both the U and V planes are processed.
// *
// * @param[in] pi2_src
// * input 1x8 block of coeffs. The first 4 are from U and the next 4 from V
// *
// * @param[out] pi2_out
// * output 1x8 block
// *
// * @param[in] pu2_iscal_mat
// * pointer to scaling list
// *
// * @param[in] pu2_weigh_mat
// * pointer to weight matrix
// *
// * @param[in] u4_qp_div_6
// * Floor (qp/6)
// *
// * @returns none
// *
// * @remarks none
// *
// *******************************************************************************
// */
// *
// *******************************************************************************
// */
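// void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
// WORD16* pi2_out,
// const UWORD16 *pu2_iscal_mat,
// const UWORD16 *pu2_weigh_mat,
// UWORD32 u4_qp_div_6,
// ...)
// NOTE: this prototype comment was truncated after u4_qp_div_6. The remaining
// parameter is presumably WORD32* pi4_tmp, by analogy with the 4x4 prototype
// earlier in this file - confirm against the C header.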
.global ih264_ihadamard_scaling_2x2_uv_av8
ih264_ihadamard_scaling_2x2_uv_av8:
    // 2x2 inverse Hadamard transform + scaling of the chroma DC coefficients.
    // The U and V blocks are processed together in one pass.
    //
    // Registers used
    //   x0 : *pi2_src        - 8 x int16: 4 U coefficients followed by 4 V coefficients
    //   x1 : *pi2_out        - 8 x int16 result (16 bytes are written)
    //   x2 : *pu2_iscal_mat  - only element [0] is read
    //   x3 : *pu2_weigh_mat  - only element [0] is read
    //   w4 : u4_qp_div_6     - overwritten with (u4_qp_div_6 - 5)
    //
    // Per coefficient:
    //   out = (int16)((t * iscal[0] * weigh[0]) << (u4_qp_div_6 - 5))
    // SSHL treats a negative count as a truncating arithmetic right shift,
    // and XTN keeps the low 16 bits without saturation.
    //
    // Vector registers written: v0-v4, v26-v28, v30.
    push_v_regs                             // macro from ih264_neon_macros.s (not visible here);
                                            // presumably saves the SIMD registers that
                                            // pop_v_regs restores - confirm

    ld1       {v26.h}[0], [x2]              // lane 0 = pu2_iscal_mat[0]
    ld1       {v27.h}[0], [x3]              // lane 0 = pu2_weigh_mat[0]

    sub       w4, w4, #5                    // shift count = qp/6 - 5
    dup       v28.4s, w4                    // broadcast the (possibly negative) shift count

    ld2       {v0.4h, v1.4h}, [x0]          // load 8 dc coeffs, de-interleaved:
                                            //   v0 = i2_x4, i2_x6, i2_y4, i2_y6
                                            //   v1 = i2_x5, i2_x7, i2_y5, i2_y7

    saddl     v2.4s, v0.4h, v1.4h           // i4_x0 = i4_x4 + i4_x5; i4_x2 = i4_x6 + i4_x7 (same for y)
    ssubl     v4.4s, v0.4h, v1.4h           // i4_x1 = i4_x4 - i4_x5; i4_x3 = i4_x6 - i4_x7 (same for y)

    umull     v30.4s, v26.4h, v27.4h        // lane 0 = pu2_iscal_mat[0] * pu2_weigh_mat[0]
    dup       v30.4s, v30.s[0]              // broadcast the scale factor to all lanes

    trn1      v0.4s, v2.4s, v4.4s           // v0 = i4_x0, i4_x1, i4_y0, i4_y1
    trn2      v1.4s, v2.4s, v4.4s           // v1 = i4_x2, i4_x3, i4_y2, i4_y3

    add       v2.4s, v0.4s, v1.4s           // i4_x4 = i4_x0 + i4_x2; i4_x5 = i4_x1 + i4_x3 (same for y)
    sub       v3.4s, v0.4s, v1.4s           // i4_x6 = i4_x0 - i4_x2; i4_x7 = i4_x1 - i4_x3 (same for y)

    mul       v2.4s, v2.4s, v30.4s          // apply iscal[0] * weigh[0]
    mul       v3.4s, v3.4s, v30.4s

    sshl      v2.4s, v2.4s, v28.4s          // shift by (qp/6 - 5); negative count shifts right
    sshl      v3.4s, v3.4s, v28.4s

    xtn       v0.4h, v2.4s                  // v0.4h = i4_x4 i4_x5 i4_y4 i4_y5 (upper 64 bits zeroed)
    xtn       v1.4h, v3.4s                  // v1.4h = i4_x6 i4_x7 i4_y6 i4_y7 (upper 64 bits zeroed)

    // Only the low 64 bits of v0/v1 hold results. Interleave them as 32-bit
    // pairs so memory receives x4 x5 x6 x7 y4 y5 y6 y7 (16 bytes). Using the
    // .4s arrangement here would store 32 bytes and write 16 bytes of zeros
    // past the 1x8 output block.
    st2       {v0.2s, v1.2s}, [x1]

    pop_v_regs
    ret