libyuv 代碼結構分析,借用其NEON/ARM64優化代碼

I 入口

格式轉換入口的函數都在convert_xx之類的文件中。在個人android程序中主要用的是xx格式轉成NV12。其入口在convert_from.cc中。

函數爲:

// Declaration of I420ToNV12, the conversion entry point this article uses
// (the article locates it in convert_from.cc).
//
// Inputs are three separate planes (src_y, src_u, src_v), each with its own
// stride. Outputs are a Y plane (dst_y) and one combined UV plane (dst_uv),
// each with its own stride.
//
// NOTE(review): strides are presumably in bytes and width/height in pixels,
// and the meaning of the int return value is not visible in this excerpt -
// confirm against the libyuv headers.
int I420ToNV12(const uint8* src_y,
               int src_stride_y,
               const uint8* src_u,
               int src_stride_u,
               const uint8* src_v,
               int src_stride_v,
               uint8* dst_y,
               int dst_stride_y,
               uint8* dst_uv,
               int dst_stride_uv,
               int width,
               int height);

這裏涉及到兩個函數:

// Y plane: copied from the source to the destination as-is.
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
// U and V planes: merged into the single dst_uv plane.
// NOTE(review): halfwidth/halfheight are computed outside this excerpt -
// presumably the chroma plane dimensions; confirm in convert_from.cc.
MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
               halfwidth, halfheight);

其入口都在planar_functions.cc中:

// Copies a plane of bytes from src_y to dst_y one row at a time, using the
// row-copy routine selected for the current CPU.
//
//   src_y, src_stride_y  source plane and the offset between its rows.
//   dst_y, dst_stride_y  destination plane and the offset between its rows.
//   width                number of bytes to copy per row.
//   height               number of rows; a negative value writes the rows
//                        bottom-up, i.e. the destination is flipped
//                        vertically.
//
// Does nothing when width <= 0 or height == 0.
void CopyPlane(const uint8* src_y,
               int src_stride_y,
               uint8* dst_y,
               int dst_stride_y,
               int width,
               int height) {
  int y;
  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
  // Reject empty or invalid sizes up front so a non-positive width is never
  // handed to a row function as a byte count.
  if (width <= 0 || height == 0) {
    return;
  }
  // Negative height means invert the image: start at the last destination
  // row and walk upwards.
  if (height < 0) {
    height = -height;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_stride_y = -dst_stride_y;
  }
  // Coalesce rows: when both planes are tightly packed the whole plane is
  // one contiguous run, so copy it as a single long row.
  if (src_stride_y == width && dst_stride_y == width) {
    width *= height;
    height = 1;
    src_stride_y = dst_stride_y = 0;
  }
  // Nothing to do: source and destination are the same memory with the same
  // layout.
  if (src_y == dst_y && src_stride_y == dst_stride_y) {
    return;
  }
  // Pick the row function. Later blocks override earlier ones; the plain
  // variants are used only when width is a multiple of the stated alignment,
  // otherwise the _Any_ variants are used.
#if defined(HAS_COPYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
  }
#endif
#if defined(HAS_COPYROW_AVX)
  if (TestCpuFlag(kCpuHasAVX)) {
    CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
  }
#endif
#if defined(HAS_COPYROW_ERMS)
  if (TestCpuFlag(kCpuHasERMS)) {
    CopyRow = CopyRow_ERMS;
  }
#endif
#if defined(HAS_COPYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
  }
#endif
#if defined(HAS_COPYROW_MIPS)
  if (TestCpuFlag(kCpuHasMIPS)) {
    CopyRow = CopyRow_MIPS;
  }
#endif

  // Copy plane
  for (y = 0; y < height; ++y) {
    CopyRow(src_y, dst_y, width);
    src_y += src_stride_y;
    dst_y += dst_stride_y;
  }
}

// Merges a U plane and a V plane into one UV plane, one row at a time, using
// the row-merge routine selected for the current CPU.
//
//   src_u, src_stride_u    source U plane and the offset between its rows.
//   src_v, src_stride_v    source V plane and the offset between its rows.
//   dst_uv, dst_stride_uv  destination UV plane and the offset between its
//                          rows; a tightly packed row is width * 2 bytes.
//   width                  number of U (and V) samples per row.
//   height                 number of rows; a negative value writes the rows
//                          bottom-up, i.e. the destination is flipped
//                          vertically.
//
// Does nothing when width <= 0 or height == 0.
void MergeUVPlane(const uint8* src_u,
                  int src_stride_u,
                  const uint8* src_v,
                  int src_stride_v,
                  uint8* dst_uv,
                  int dst_stride_uv,
                  int width,
                  int height) {
  int y;
  void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) = MergeUVRow_C;
  // Reject empty or invalid sizes up front so a non-positive width is never
  // handed to a row function.
  if (width <= 0 || height == 0) {
    return;
  }
  // Negative height means invert the image: start at the last destination
  // row and walk upwards.
  if (height < 0) {
    height = -height;
    dst_uv = dst_uv + (height - 1) * dst_stride_uv;
    dst_stride_uv = -dst_stride_uv;
  }
  // Coalesce rows: when all three planes are tightly packed the whole plane
  // can be merged as a single long row.
  if (src_stride_u == width && src_stride_v == width &&
      dst_stride_uv == width * 2) {
    width *= height;
    height = 1;
    src_stride_u = src_stride_v = dst_stride_uv = 0;
  }
  // Pick the row function. Later blocks override earlier ones; the plain
  // variants are used only when width is a multiple of the stated alignment,
  // otherwise the _Any_ variants are used.
#if defined(HAS_MERGEUVROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    MergeUVRow = MergeUVRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      MergeUVRow = MergeUVRow_SSE2;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    MergeUVRow = MergeUVRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      MergeUVRow = MergeUVRow_AVX2;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeUVRow = MergeUVRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      MergeUVRow = MergeUVRow_NEON;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    MergeUVRow = MergeUVRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      MergeUVRow = MergeUVRow_MSA;
    }
  }
#endif

  for (y = 0; y < height; ++y) {
    // Merge a row of U and V into a row of UV.
    MergeUVRow(src_u, src_v, dst_uv, width);
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_uv += dst_stride_uv;
  }
}
View Code

到目前爲止的代碼都是平臺無關的。也很好看懂。

 

II 平臺相關代碼

平臺相關代碼都在xx_neon.cc xx_neon64.cc中。具體的CopyRow_NEON和MergeUVRow_NEON相關的代碼,都在row_neon64.cc/row_neon.cc中。前者是arm64的代碼,後者是armeabi-v7a的neon代碼。

// Copies count bytes from src to dst, 32 bytes per loop iteration, using
// NEON loads and stores with post-incremented addresses.
//
// count is expected to be a positive multiple of 32. The loop body runs
// before count is tested, so one 32-byte chunk is copied even when count is
// less than or equal to 32.
//
// The original comment mentions vld4.8, but the instruction actually used
// below is vld1.8.
// NOTE(review): the original comment also states that this form allows
// unaligned addresses and is fastest on a15 - not verifiable from this
// snippet.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
    "subs       %2, %2, #32                    \n"  // 32 processed per loop
    MEMACCESS(1)
    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
    "bgt        1b                             \n"  // repeat while count > 0
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2  // Output registers
  :                     // Input registers
  : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

CopyRow_Any_NEON和MergeUVRow_Any_NEON的代碼都在row_any.cc中,any的代碼不分64和32:

// Any 1 to 1.
// Defines a function NAMEANY(src_ptr, dst_ptr, width) that accepts any width
// by wrapping ANY_SIMD, a row function called here only with widths that are
// multiples of MASK + 1.
//
//   NAMEANY   name of the generated function.
//   ANY_SIMD  row function to wrap.
//   UVSHIFT   shift applied to the element count when computing the source
//             offset and remainder size.
//   SBPP      multiplier for source offsets and sizes.
//   BPP       multiplier for destination offsets and sizes.
//   MASK      block size minus one (for example 31 for blocks of 32).
//
// How the generated function works:
//   1. n = width rounded down to a multiple of MASK + 1; r = the remainder.
//   2. If n > 0, ANY_SIMD processes the first n elements directly.
//   3. The remainder is copied into the first half of a 256-byte temp
//      buffer, ANY_SIMD runs on one full block (MASK + 1 elements) writing
//      into the second half (temp + 128), and only r * BPP bytes of that
//      result are copied to the destination.
// The first 128 bytes of temp are zeroed beforehand so the padding that the
// SIMD function reads is initialized.
// NOTE(review): SS() is defined outside this excerpt - presumably it scales
// r by UVSHIFT with rounding up; confirm in row_any.cc.
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                \
  void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {         \
    SIMD_ALIGNED(uint8 temp[128 * 2]);                                    \
    memset(temp, 0, 128); /* for YUY2 and msan */                         \
    int r = width & MASK;                                                 \
    int n = width & ~MASK;                                                \
    if (n > 0) {                                                          \
      ANY_SIMD(src_ptr, dst_ptr, n);                                      \
    }                                                                     \
    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
    ANY_SIMD(temp, temp + 128, MASK + 1);                                 \
    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
  }

// Generate CopyRow_Any_NEON on top of CopyRow_NEON: UVSHIFT = 0,
// SBPP = BPP = 1, and MASK = 31, so CopyRow_NEON handles the largest
// multiple of 32 directly and the remaining 0..31 bytes go through the
// macro's temp buffer.
#ifdef HAS_COPYROW_NEON
ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31)
#endif

代碼中用到的聲明都在libyuv/row.h中。

// Evaluates to 1 when the integer or pointer value p is a multiple of a,
// and to 0 otherwise. Only the low bits selected by (a - 1) are tested, so
// a must be a power of two.
#define IS_ALIGNED(p, a) (((uintptr_t)(p) & ((a)-1)) == 0)

// SIMD_ALIGNED(var) wraps a variable declaration with an alignment request,
// and the typedefs below name 16-byte (vec*/uvec*) and 32-byte
// (lvec*/ulvec*) groups of 8-, 16- and 32-bit integers. Three variants are
// provided, selected by compiler.

// Variant 1: Microsoft Visual C++ (not C++/CLI, not clang-cl).
// Alignment is requested with __declspec(align(N)); the typedefs are aligned
// arrays.
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
// SIMD_ALIGNED uses 32-byte alignment when VISUALC_HAS_AVX2 is defined,
// otherwise 16-byte alignment.
#if defined(VISUALC_HAS_AVX2)
#define SIMD_ALIGNED(var) __declspec(align(32)) var
#else
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#endif
// 16-byte aligned arrays.
typedef __declspec(align(16)) int16 vec16[8];
typedef __declspec(align(16)) int32 vec32[4];
typedef __declspec(align(16)) int8 vec8[16];
typedef __declspec(align(16)) uint16 uvec16[8];
typedef __declspec(align(16)) uint32 uvec32[4];
typedef __declspec(align(16)) uint8 uvec8[16];
// 32-byte aligned arrays.
typedef __declspec(align(32)) int16 lvec16[16];
typedef __declspec(align(32)) int32 lvec32[8];
typedef __declspec(align(32)) int8 lvec8[32];
typedef __declspec(align(32)) uint16 ulvec16[16];
typedef __declspec(align(32)) uint32 ulvec32[8];
typedef __declspec(align(32)) uint8 ulvec8[32];
// Variant 2: GCC or Clang, excluding PNaCl.
// Alignment is requested with __attribute__((aligned(N))); the typedefs use
// the vector_size attribute instead of arrays.
#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
// SIMD_ALIGNED uses 32-byte alignment when CLANG_HAS_AVX2 or GCC_HAS_AVX2 is
// defined, otherwise 16-byte alignment.
#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
#define SIMD_ALIGNED(var) var __attribute__((aligned(32)))
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#endif
// 16-byte vector types.
typedef int16 __attribute__((vector_size(16))) vec16;
typedef int32 __attribute__((vector_size(16))) vec32;
typedef int8 __attribute__((vector_size(16))) vec8;
typedef uint16 __attribute__((vector_size(16))) uvec16;
typedef uint32 __attribute__((vector_size(16))) uvec32;
typedef uint8 __attribute__((vector_size(16))) uvec8;
// 32-byte vector types.
typedef int16 __attribute__((vector_size(32))) lvec16;
typedef int32 __attribute__((vector_size(32))) lvec32;
typedef int8 __attribute__((vector_size(32))) lvec8;
typedef uint16 __attribute__((vector_size(32))) ulvec16;
typedef uint32 __attribute__((vector_size(32))) ulvec32;
typedef uint8 ulvec8 __attribute__((vector_size(32)));
// Variant 3: any other compiler.
// SIMD_ALIGNED adds no alignment request and the typedefs are plain arrays.
#else
#define SIMD_ALIGNED(var) var
typedef int16 vec16[8];
typedef int32 vec32[4];
typedef int8 vec8[16];
typedef uint16 uvec16[8];
typedef uint32 uvec32[4];
typedef uint8 uvec8[16];
typedef int16 lvec16[16];
typedef int32 lvec32[8];
typedef int8 lvec8[32];
typedef uint16 ulvec16[16];
typedef uint32 ulvec32[8];
typedef uint8 ulvec8[32];
#endif

....

以後只要把這些代碼摳出來就可以借用了。比起交叉編譯兩個庫來講,方便許多。

相關文章
相關標籤/搜索