如何優化此代碼？

探查說，總時間的50％，這個函數內部花費。你會如何優化它？它BMP配色方案轉換爲YUV。謝謝！如何優化此代碼？

更新：平臺是ARMv6的（寫IPhone）

#define Y_FROM_RGB(_r_,_g_,_b_) (( 66 * _b_ + 129 * _g_ + 25 * _r_ + 128) >> 8) + 16 
#define V_FROM_RGB(_r_,_g_,_b_) ((112 * _b_ - 94 * _g_ - 18 * _r_ + 128) >> 10) + 128 
#define U_FROM_RGB(_r_,_g_,_b_) ((-38 * _b_ - 74 * _g_ + 112 * _r_ + 128) >> 10) + 128 

    /*! 
* \brief 
* Converts 24 bit image to YCrCb image channels 
* 
* \param source 
* Source 24bit image pointer 
* 
* \param source_width 
* Source image width 
* 
* \param dest_Y 
* destination image Y component pointer 
* 
* \param dest_scan_size_Y 
* destination image Y component line size 
* 
* \param dest_U 
* destination image U component pointer 
* 
* \param dest_scan_size_U 
* destination image U component line size 
* 
* \param dest_V 
* destination image V component pointer 
* 
* \param dest_scan_size_V 
* destination image V component line size 
* 
* \param dest_width 
* Destination image width = source_width 
* 
* \param dest_height 
* Destination image height = source image height 
* 
* Convert 24 bit image (source) with width (source_width) 
* to YCrCb image channels (dest_Y, dest_U, dest_V) with size (dest_width)x(dest_height), and line size 
* (dest_scan_size_Y, dest_scan_size_U, dest_scan_size_V) (in bytes) 
* 
*/ 
void ImageConvert_24_YUV420P(unsigned char * source, int source_width, 
          unsigned char * dest_Y, int dest_scan_size_Y, 
          unsigned char * dest_U, int dest_scan_size_U, 
          unsigned char * dest_V, int dest_scan_size_V, 
          int dest_width, int dest_height) 
{ 
    int source_scan_size = source_width*3; 

    int half_width = dest_width/2; 

    //Y loop 
    for (int y = 0; y < dest_height/2; y ++) 
    { 
    //Start of line 
    unsigned char * source_scan = source; 
    unsigned char * source_scan_next = source+source_scan_size; 
    unsigned char * dest_scan_Y = dest_Y; 
    unsigned char * dest_scan_U = dest_U; 
    unsigned char * dest_scan_V = dest_V; 

    //Do all pixels 
    for (int x = 0; x < half_width; x++) 
    { 
     int R = source_scan[0]; 
     int G = source_scan[1]; 
     int B = source_scan[2]; 

     //Y 
     int Y = Y_FROM_RGB(B, G, R); 

     *dest_scan_Y = Y; 
     source_scan += 3; 
     dest_scan_Y += 1; 

     int R1 = source_scan[0]; 
     int G1 = source_scan[1]; 
     int B1 = source_scan[2]; 

     //Y 
     Y = Y_FROM_RGB(B1, G1, R1); 

     R += (R1 + source_scan_next[0] + source_scan_next[3]); 
     G += (G1 + source_scan_next[1] + source_scan_next[4]); 
     B += (B1 + source_scan_next[2] + source_scan_next[5]); 


     //YCrCb 
     *dest_scan_Y = Y; 
     *dest_scan_V = V_FROM_RGB(B, G, R); 
     *dest_scan_U = U_FROM_RGB(B, G, R); 

     source_scan += 3; 
     dest_scan_Y += 1; 
     dest_scan_U += 1; 
     dest_scan_V += 1; 
     source_scan_next += 6; 
    }; 

    //scroll to next line 
    source += source_scan_size; 
    dest_Y += dest_scan_size_Y; 
    dest_U += dest_scan_size_U; 
    dest_V += dest_scan_size_V; 

    //Start of line 
    source_scan = source; 
    dest_scan_Y = dest_Y; 

    //Do all pixels 
    for (int x = 0; x < half_width; x ++) 
    { 
     int R = source_scan[0]; 
     int G = source_scan[1]; 
     int B = source_scan[2]; 

     //Y 
     int Y = Y_FROM_RGB(B, G, R); 

     *dest_scan_Y = Y; 
     source_scan += 3; 
     dest_scan_Y += 1; 

     R = source_scan[0]; 
     G = source_scan[1]; 
     B = source_scan[2]; 

     //Y 
     Y = Y_FROM_RGB(B, G, R); 
     *dest_scan_Y = Y; 
     source_scan += 3; 
     dest_scan_Y += 1; 
    }; 

    source += source_scan_size; 
    dest_Y += dest_scan_size_Y; 
    }; 
};

來源

2010-08-09 artur_i_am

是內存的問題嗎？如果沒有，可以在數據字大小（整數）來表示，而不是字節大小？ – Simon 2010-08-09 11:48:06

內存在這裏不是問題。數據可以用整數表示。 – 2010-08-09 11:51:01

除非我失去了一些東西后續的代碼似乎在兩個循環被重複，那麼，爲什麼不通過這個循環去一次？這可能需要對算法進行一些更改，但會提高性能。

for (int x = 0; x < half_width; x ++) 
{ 
    int R = source_scan[0]; 
    int G = source_scan[1]; 
    int B = source_scan[2]; 

    //Y 
    int Y = Y_FROM_RGB(B, G, R); 

    *dest_scan_Y = Y; 
    source_scan += 3; 
    dest_scan_Y += 1; 

    R = source_scan[0]; 
    G = source_scan[1]; 
    B = source_scan[2];

但是，做任何事情之前，移動兩個環路內成單獨的功能，然後運行分析器，看看你是否在其中一個比另一個函數花費更多的時間。

你有這個功能三個環路，而你不知道哪個部分實際上是你在哪裏花你的時間。因此，請確定在進行任何優化之前，否則您可能會發現您正在修復錯誤的部分。

來源

2010-08-09 11:49:13

我不知道你使用的是什麼平臺，但你可能想看看SIMD

臂Cotext-A8具有Neon技術，不支持SIMD。您應該能夠在ARM網站上找到更多信息。

來源

2010-08-09 11:54:31 doron

平臺是ARMv6的 – 2010-08-09 11:57:46

。假定它們指向不重疊的內存，你應該用restrict預選賽聲明你source，dest_Y，dest_U和dest_V指針，來告訴編譯器這一點，並允許它優化更好。

來源

2010-08-09 13:17:40 caf

後，我與__restrict預選賽聲明變量 - 代碼變得更慢:) – 2010-08-09 13:56:44

@artur_i_am：聽起來像是你有一個bug報告給您的編譯器供應商！ – caf 2010-08-09 22:14:35

如何優化此代碼？

回答

相關問題