2010-08-09 128 views
4

探查說,總時間的50%,這個函數內部花費。你會如何優化它? 它BMP配色方案轉換爲YUV。謝謝!如何優化此代碼?

更新:平臺是ARMv6的(寫IPhone)

#define Y_FROM_RGB(_r_,_g_,_b_) (( 66 * _b_ + 129 * _g_ + 25 * _r_ + 128) >> 8) + 16 
#define V_FROM_RGB(_r_,_g_,_b_) ((112 * _b_ - 94 * _g_ - 18 * _r_ + 128) >> 10) + 128 
#define U_FROM_RGB(_r_,_g_,_b_) ((-38 * _b_ - 74 * _g_ + 112 * _r_ + 128) >> 10) + 128 

    /*! 
* \brief 
* Converts 24 bit image to YCrCb image channels 
* 
* \param source 
* Source 24bit image pointer 
* 
* \param source_width 
* Source image width 
* 
* \param dest_Y 
* destination image Y component pointer 
* 
* \param dest_scan_size_Y 
* destination image Y component line size 
* 
* \param dest_U 
* destination image U component pointer 
* 
* \param dest_scan_size_U 
* destination image U component line size 
* 
* \param dest_V 
* destination image V component pointer 
* 
* \param dest_scan_size_V 
* destination image V component line size 
* 
* \param dest_width 
* Destination image width = source_width 
* 
* \param dest_height 
* Destination image height = source image height 
* 
* Convert 24 bit image (source) with width (source_width) 
* to YCrCb image channels (dest_Y, dest_U, dest_V) with size (dest_width)x(dest_height), and line size 
* (dest_scan_size_Y, dest_scan_size_U, dest_scan_size_V) (in bytes) 
* 
*/ 
void ImageConvert_24_YUV420P(unsigned char * source, int source_width, 
          unsigned char * dest_Y, int dest_scan_size_Y, 
          unsigned char * dest_U, int dest_scan_size_U, 
          unsigned char * dest_V, int dest_scan_size_V, 
          int dest_width, int dest_height) 
{ 
    int source_scan_size = source_width*3; 

    int half_width = dest_width/2; 

    //Y loop 
    for (int y = 0; y < dest_height/2; y ++) 
    { 
    //Start of line 
    unsigned char * source_scan = source; 
    unsigned char * source_scan_next = source+source_scan_size; 
    unsigned char * dest_scan_Y = dest_Y; 
    unsigned char * dest_scan_U = dest_U; 
    unsigned char * dest_scan_V = dest_V; 

    //Do all pixels 
    for (int x = 0; x < half_width; x++) 
    { 
     int R = source_scan[0]; 
     int G = source_scan[1]; 
     int B = source_scan[2]; 

     //Y 
     int Y = Y_FROM_RGB(B, G, R); 

     *dest_scan_Y = Y; 
     source_scan += 3; 
     dest_scan_Y += 1; 

     int R1 = source_scan[0]; 
     int G1 = source_scan[1]; 
     int B1 = source_scan[2]; 

     //Y 
     Y = Y_FROM_RGB(B1, G1, R1); 

     R += (R1 + source_scan_next[0] + source_scan_next[3]); 
     G += (G1 + source_scan_next[1] + source_scan_next[4]); 
     B += (B1 + source_scan_next[2] + source_scan_next[5]); 


     //YCrCb 
     *dest_scan_Y = Y; 
     *dest_scan_V = V_FROM_RGB(B, G, R); 
     *dest_scan_U = U_FROM_RGB(B, G, R); 

     source_scan += 3; 
     dest_scan_Y += 1; 
     dest_scan_U += 1; 
     dest_scan_V += 1; 
     source_scan_next += 6; 
    }; 

    //scroll to next line 
    source += source_scan_size; 
    dest_Y += dest_scan_size_Y; 
    dest_U += dest_scan_size_U; 
    dest_V += dest_scan_size_V; 

    //Start of line 
    source_scan = source; 
    dest_scan_Y = dest_Y; 

    //Do all pixels 
    for (int x = 0; x < half_width; x ++) 
    { 
     int R = source_scan[0]; 
     int G = source_scan[1]; 
     int B = source_scan[2]; 

     //Y 
     int Y = Y_FROM_RGB(B, G, R); 

     *dest_scan_Y = Y; 
     source_scan += 3; 
     dest_scan_Y += 1; 

     R = source_scan[0]; 
     G = source_scan[1]; 
     B = source_scan[2]; 

     //Y 
     Y = Y_FROM_RGB(B, G, R); 
     *dest_scan_Y = Y; 
     source_scan += 3; 
     dest_scan_Y += 1; 
    }; 

    source += source_scan_size; 
    dest_Y += dest_scan_size_Y; 
    }; 
}; 
+0

是內存的問題嗎?如果沒有,可以在數據字大小(整數)來表示,而不是字節大小? – Simon 2010-08-09 11:48:06

+0

內存在這裏不是問題。數據可以用整數表示。 – 2010-08-09 11:51:01

回答

5

除非我失去了一些東西后續的代碼似乎在兩個循環被重複,那麼,爲什麼不通過這個循環去一次?這可能需要對算法進行一些更改,但會提高性能。

for (int x = 0; x < half_width; x ++) 
{ 
    int R = source_scan[0]; 
    int G = source_scan[1]; 
    int B = source_scan[2]; 

    //Y 
    int Y = Y_FROM_RGB(B, G, R); 

    *dest_scan_Y = Y; 
    source_scan += 3; 
    dest_scan_Y += 1; 

    R = source_scan[0]; 
    G = source_scan[1]; 
    B = source_scan[2]; 

但是,做任何事情之前,移動兩個環路內成單獨的功能,然後運行分析器,看看你是否在其中一個比另一個函數花費更多的時間。

你有這個功能三個環路,而你不知道哪個部分實際上是你在哪裏花你的時間。因此,請確定在進行任何優化之前,否則您可能會發現您正在修復錯誤的部分。

0

我不知道你使用的是什麼平臺,但你可能想看看SIMD

臂Cotext-A8具有Neon技術,不支持SIMD。您應該能夠在ARM網站上找到更多信息。

+0

平臺是ARMv6的 – 2010-08-09 11:57:46

0

。假定它們指向不重疊的內存,你應該用restrict預選賽聲明你sourcedest_Ydest_Udest_V指針,來告訴編譯器這一點,並允許它優化更好。

+0

後,我與__restrict預選賽聲明變量 - 代碼變得更慢:) – 2010-08-09 13:56:44

+0

@artur_i_am:聽起來像是你有一個bug報告給您的編譯器供應商! – caf 2010-08-09 22:14:35