2015-02-10 330 views
1

我正在嘗試編寫一個快速的 MD5 暴力破解程序。我寫了一個最簡單的 MD5 實現,它只能對長度小於 56 字節的字符串正常工作。(標題:MD5 暴力破解加速)

/*
 * Apply the MD5 compression function (RFC 1321) to one 64-byte block.
 *
 * state: four-word chaining value; updated in place.
 * block: 64 input bytes, interpreted as sixteen little-endian 32-bit words.
 *
 * The words are assembled byte by byte instead of reading the buffer through
 * a casted u32 pointer, so the code does not depend on host endianness,
 * pointer alignment, or the strict-aliasing rules.
 *
 * NOTE(review): the rotations below assume u32 is an unsigned type exactly
 * 32 bits wide -- confirm against the project's typedef.
 */
static void md5_process_block(u32 *state, const u8 *block)
{
    u32 x[16];
    u32 h[4];
    u32 i;

    for (i = 0; i < 16; i++) {
        x[i] = ((u32)block[4 * i] & 0xff)
             | (((u32)block[4 * i + 1] & 0xff) << 8)
             | (((u32)block[4 * i + 2] & 0xff) << 16)
             | (((u32)block[4 * i + 3] & 0xff) << 24);
    }

    h[0] = state[0];
    h[1] = state[1];
    h[2] = state[2];
    h[3] = state[3];

    /* Round 1 */
    h[0] += ((h[1] & h[2]) | (~h[1] & h[3])) + x[ 0] + 0xd76aa478; h[0] = h[1] + ((h[0] << 7) | (h[0] >> 25));
    h[3] += ((h[0] & h[1]) | (~h[0] & h[2])) + x[ 1] + 0xe8c7b756; h[3] = h[0] + ((h[3] << 12) | (h[3] >> 20));
    h[2] += ((h[3] & h[0]) | (~h[3] & h[1])) + x[ 2] + 0x242070db; h[2] = h[3] + ((h[2] << 17) | (h[2] >> 15));
    h[1] += ((h[2] & h[3]) | (~h[2] & h[0])) + x[ 3] + 0xc1bdceee; h[1] = h[2] + ((h[1] << 22) | (h[1] >> 10));
    h[0] += ((h[1] & h[2]) | (~h[1] & h[3])) + x[ 4] + 0xf57c0faf; h[0] = h[1] + ((h[0] << 7) | (h[0] >> 25));
    h[3] += ((h[0] & h[1]) | (~h[0] & h[2])) + x[ 5] + 0x4787c62a; h[3] = h[0] + ((h[3] << 12) | (h[3] >> 20));
    h[2] += ((h[3] & h[0]) | (~h[3] & h[1])) + x[ 6] + 0xa8304613; h[2] = h[3] + ((h[2] << 17) | (h[2] >> 15));
    h[1] += ((h[2] & h[3]) | (~h[2] & h[0])) + x[ 7] + 0xfd469501; h[1] = h[2] + ((h[1] << 22) | (h[1] >> 10));
    h[0] += ((h[1] & h[2]) | (~h[1] & h[3])) + x[ 8] + 0x698098d8; h[0] = h[1] + ((h[0] << 7) | (h[0] >> 25));
    h[3] += ((h[0] & h[1]) | (~h[0] & h[2])) + x[ 9] + 0x8b44f7af; h[3] = h[0] + ((h[3] << 12) | (h[3] >> 20));
    h[2] += ((h[3] & h[0]) | (~h[3] & h[1])) + x[10] + 0xffff5bb1; h[2] = h[3] + ((h[2] << 17) | (h[2] >> 15));
    h[1] += ((h[2] & h[3]) | (~h[2] & h[0])) + x[11] + 0x895cd7be; h[1] = h[2] + ((h[1] << 22) | (h[1] >> 10));
    h[0] += ((h[1] & h[2]) | (~h[1] & h[3])) + x[12] + 0x6b901122; h[0] = h[1] + ((h[0] << 7) | (h[0] >> 25));
    h[3] += ((h[0] & h[1]) | (~h[0] & h[2])) + x[13] + 0xfd987193; h[3] = h[0] + ((h[3] << 12) | (h[3] >> 20));
    h[2] += ((h[3] & h[0]) | (~h[3] & h[1])) + x[14] + 0xa679438e; h[2] = h[3] + ((h[2] << 17) | (h[2] >> 15));
    h[1] += ((h[2] & h[3]) | (~h[2] & h[0])) + x[15] + 0x49b40821; h[1] = h[2] + ((h[1] << 22) | (h[1] >> 10));

    /* Round 2 */
    h[0] += ((h[1] & h[3]) | (h[2] & ~h[3])) + x[ 1] + 0xf61e2562; h[0] = h[1] + ((h[0] << 5) | (h[0] >> 27));
    h[3] += ((h[0] & h[2]) | (h[1] & ~h[2])) + x[ 6] + 0xc040b340; h[3] = h[0] + ((h[3] << 9) | (h[3] >> 23));
    h[2] += ((h[3] & h[1]) | (h[0] & ~h[1])) + x[11] + 0x265e5a51; h[2] = h[3] + ((h[2] << 14) | (h[2] >> 18));
    h[1] += ((h[2] & h[0]) | (h[3] & ~h[0])) + x[ 0] + 0xe9b6c7aa; h[1] = h[2] + ((h[1] << 20) | (h[1] >> 12));
    h[0] += ((h[1] & h[3]) | (h[2] & ~h[3])) + x[ 5] + 0xd62f105d; h[0] = h[1] + ((h[0] << 5) | (h[0] >> 27));
    h[3] += ((h[0] & h[2]) | (h[1] & ~h[2])) + x[10] + 0x02441453; h[3] = h[0] + ((h[3] << 9) | (h[3] >> 23));
    h[2] += ((h[3] & h[1]) | (h[0] & ~h[1])) + x[15] + 0xd8a1e681; h[2] = h[3] + ((h[2] << 14) | (h[2] >> 18));
    h[1] += ((h[2] & h[0]) | (h[3] & ~h[0])) + x[ 4] + 0xe7d3fbc8; h[1] = h[2] + ((h[1] << 20) | (h[1] >> 12));
    h[0] += ((h[1] & h[3]) | (h[2] & ~h[3])) + x[ 9] + 0x21e1cde6; h[0] = h[1] + ((h[0] << 5) | (h[0] >> 27));
    h[3] += ((h[0] & h[2]) | (h[1] & ~h[2])) + x[14] + 0xc33707d6; h[3] = h[0] + ((h[3] << 9) | (h[3] >> 23));
    h[2] += ((h[3] & h[1]) | (h[0] & ~h[1])) + x[ 3] + 0xf4d50d87; h[2] = h[3] + ((h[2] << 14) | (h[2] >> 18));
    h[1] += ((h[2] & h[0]) | (h[3] & ~h[0])) + x[ 8] + 0x455a14ed; h[1] = h[2] + ((h[1] << 20) | (h[1] >> 12));
    h[0] += ((h[1] & h[3]) | (h[2] & ~h[3])) + x[13] + 0xa9e3e905; h[0] = h[1] + ((h[0] << 5) | (h[0] >> 27));
    h[3] += ((h[0] & h[2]) | (h[1] & ~h[2])) + x[ 2] + 0xfcefa3f8; h[3] = h[0] + ((h[3] << 9) | (h[3] >> 23));
    h[2] += ((h[3] & h[1]) | (h[0] & ~h[1])) + x[ 7] + 0x676f02d9; h[2] = h[3] + ((h[2] << 14) | (h[2] >> 18));
    h[1] += ((h[2] & h[0]) | (h[3] & ~h[0])) + x[12] + 0x8d2a4c8a; h[1] = h[2] + ((h[1] << 20) | (h[1] >> 12));

    /* Round 3 */
    h[0] += (h[1]^h[2]^h[3]) + x[ 5] + 0xfffa3942; h[0] = h[1] + ((h[0] << 4) | (h[0] >> 28));
    h[3] += (h[0]^h[1]^h[2]) + x[ 8] + 0x8771f681; h[3] = h[0] + ((h[3] << 11) | (h[3] >> 21));
    h[2] += (h[3]^h[0]^h[1]) + x[11] + 0x6d9d6122; h[2] = h[3] + ((h[2] << 16) | (h[2] >> 16));
    h[1] += (h[2]^h[3]^h[0]) + x[14] + 0xfde5380c; h[1] = h[2] + ((h[1] << 23) | (h[1] >> 9));
    h[0] += (h[1]^h[2]^h[3]) + x[ 1] + 0xa4beea44; h[0] = h[1] + ((h[0] << 4) | (h[0] >> 28));
    h[3] += (h[0]^h[1]^h[2]) + x[ 4] + 0x4bdecfa9; h[3] = h[0] + ((h[3] << 11) | (h[3] >> 21));
    h[2] += (h[3]^h[0]^h[1]) + x[ 7] + 0xf6bb4b60; h[2] = h[3] + ((h[2] << 16) | (h[2] >> 16));
    h[1] += (h[2]^h[3]^h[0]) + x[10] + 0xbebfbc70; h[1] = h[2] + ((h[1] << 23) | (h[1] >> 9));
    h[0] += (h[1]^h[2]^h[3]) + x[13] + 0x289b7ec6; h[0] = h[1] + ((h[0] << 4) | (h[0] >> 28));
    h[3] += (h[0]^h[1]^h[2]) + x[ 0] + 0xeaa127fa; h[3] = h[0] + ((h[3] << 11) | (h[3] >> 21));
    h[2] += (h[3]^h[0]^h[1]) + x[ 3] + 0xd4ef3085; h[2] = h[3] + ((h[2] << 16) | (h[2] >> 16));
    h[1] += (h[2]^h[3]^h[0]) + x[ 6] + 0x04881d05; h[1] = h[2] + ((h[1] << 23) | (h[1] >> 9));
    h[0] += (h[1]^h[2]^h[3]) + x[ 9] + 0xd9d4d039; h[0] = h[1] + ((h[0] << 4) | (h[0] >> 28));
    h[3] += (h[0]^h[1]^h[2]) + x[12] + 0xe6db99e5; h[3] = h[0] + ((h[3] << 11) | (h[3] >> 21));
    h[2] += (h[3]^h[0]^h[1]) + x[15] + 0x1fa27cf8; h[2] = h[3] + ((h[2] << 16) | (h[2] >> 16));
    h[1] += (h[2]^h[3]^h[0]) + x[ 2] + 0xc4ac5665; h[1] = h[2] + ((h[1] << 23) | (h[1] >> 9));

    /* Round 4 */
    h[0] += (h[2]^(h[1] | ~h[3])) + x[ 0] + 0xf4292244; h[0] = h[1] + ((h[0] << 6) | (h[0] >> 26));
    h[3] += (h[1]^(h[0] | ~h[2])) + x[ 7] + 0x432aff97; h[3] = h[0] + ((h[3] << 10) | (h[3] >> 22));
    h[2] += (h[0]^(h[3] | ~h[1])) + x[14] + 0xab9423a7; h[2] = h[3] + ((h[2] << 15) | (h[2] >> 17));
    h[1] += (h[3]^(h[2] | ~h[0])) + x[ 5] + 0xfc93a039; h[1] = h[2] + ((h[1] << 21) | (h[1] >> 11));
    h[0] += (h[2]^(h[1] | ~h[3])) + x[12] + 0x655b59c3; h[0] = h[1] + ((h[0] << 6) | (h[0] >> 26));
    h[3] += (h[1]^(h[0] | ~h[2])) + x[ 3] + 0x8f0ccc92; h[3] = h[0] + ((h[3] << 10) | (h[3] >> 22));
    h[2] += (h[0]^(h[3] | ~h[1])) + x[10] + 0xffeff47d; h[2] = h[3] + ((h[2] << 15) | (h[2] >> 17));
    h[1] += (h[3]^(h[2] | ~h[0])) + x[ 1] + 0x85845dd1; h[1] = h[2] + ((h[1] << 21) | (h[1] >> 11));
    h[0] += (h[2]^(h[1] | ~h[3])) + x[ 8] + 0x6fa87e4f; h[0] = h[1] + ((h[0] << 6) | (h[0] >> 26));
    h[3] += (h[1]^(h[0] | ~h[2])) + x[15] + 0xfe2ce6e0; h[3] = h[0] + ((h[3] << 10) | (h[3] >> 22));
    h[2] += (h[0]^(h[3] | ~h[1])) + x[ 6] + 0xa3014314; h[2] = h[3] + ((h[2] << 15) | (h[2] >> 17));
    h[1] += (h[3]^(h[2] | ~h[0])) + x[13] + 0x4e0811a1; h[1] = h[2] + ((h[1] << 21) | (h[1] >> 11));
    h[0] += (h[2]^(h[1] | ~h[3])) + x[ 4] + 0xf7537e82; h[0] = h[1] + ((h[0] << 6) | (h[0] >> 26));
    h[3] += (h[1]^(h[0] | ~h[2])) + x[11] + 0xbd3af235; h[3] = h[0] + ((h[3] << 10) | (h[3] >> 22));
    h[2] += (h[0]^(h[3] | ~h[1])) + x[ 2] + 0x2ad7d2bb; h[2] = h[3] + ((h[2] << 15) | (h[2] >> 17));
    h[1] += (h[3]^(h[2] | ~h[0])) + x[ 9] + 0xeb86d391; h[1] = h[2] + ((h[1] << 21) | (h[1] >> 11));

    /* Feed-forward: add the block's result onto the incoming chaining value. */
    state[0] += h[0];
    state[1] += h[1];
    state[2] += h[2];
    state[3] += h[3];
}

/*
 * Compute the MD5 digest of a byte string.
 *
 * message: input bytes; may be NULL only when length is 0.
 * length:  number of input bytes. Any value is accepted: whole 64-byte blocks
 *          are consumed straight from the input and the remaining tail is
 *          padded into one or two final blocks. Inputs shorter than 56 bytes
 *          still cost exactly one compression.
 * result:  receives the 16 digest bytes.
 */
void md5(u8* message, u32 length, u8* result)
{
    u32 h[4] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476};
    u8 buffer[128];
    const u8 *p = message;
    u32 remaining = length;
    u32 padded;
    u32 bits_lo;
    u32 bits_hi;
    u32 i;

    /* Consume all complete blocks directly from the caller's buffer. */
    while (remaining >= 64) {
        md5_process_block(h, p);
        p += 64;
        remaining -= 64;
    }

    /* Build the padded tail: leftover bytes, a 0x80 marker, zero fill. */
    memset(buffer, 0, sizeof buffer);
    if (remaining != 0) {
        memcpy(buffer, p, remaining);
    }
    buffer[remaining] = 0x80;

    /*
     * The 8-byte length field must fit after the 0x80 marker; a tail of
     * 56..63 bytes therefore spills into a second padding block.
     */
    padded = (remaining < 56) ? 64 : 128;

    /* Message length in bits as a 64-bit little-endian value, split in two
       32-bit halves so no 64-bit integer type is required. */
    bits_lo = length << 3;
    bits_hi = length >> 29;
    for (i = 0; i < 4; i++) {
        buffer[padded - 8 + i] = (u8)((bits_lo >> (8 * i)) & 0xff);
        buffer[padded - 4 + i] = (u8)((bits_hi >> (8 * i)) & 0xff);
    }

    md5_process_block(h, buffer);
    if (padded == 128) {
        md5_process_block(h, buffer + 64);
    }

    /* Serialize the four state words little-endian, independent of host. */
    for (i = 0; i < 4; i++) {
        result[4 * i]     = (u8)(h[i] & 0xff);
        result[4 * i + 1] = (u8)((h[i] >> 8) & 0xff);
        result[4 * i + 2] = (u8)((h[i] >> 16) & 0xff);
        result[4 * i + 3] = (u8)((h[i] >> 24) & 0xff);
    }
}

目前我的結果是:在酷睿 i7 上單線程每秒約 500 萬次散列,在 GeForce 540M 上每秒約 5000 萬次。但是像 BarsWF 或 Ignashgpu 這樣的程序在相同的硬件上分別可以達到每秒約 4000 萬次和 2 億次。我試圖找出能做到這一點的技巧,但沒有找到。我查看了 BarsWF 的源代碼,但只找到非常相似的代碼,我確信它具有相同的複雜度。

請給我一些建議:我應該去哪裡查找對我有幫助的資料?

+2

如果代碼與 BarsWF 幾乎相同,那麼我猜差異在於你的項目的構建方式,例如編譯器優化級別。也許值得補充一些這方面的細節。 – 2015-02-10 14:49:33

+0

我不擅長這個。我使用 VS2010,並且只在 Release 模式下設置了 /O2 標誌。 – 2015-02-10 14:59:30

+0

哇哇,這就是我需要的! – ForceBru 2015-02-10 16:34:05

回答

1

您可以創建一個函數 md5x4(u8* message[4], u32 length[4], u8* result[4]),使用 SSE 同時計算 4 個哈希值;如果您有合適的支持 AVX 的 CPU,甚至可以用 md5x8 同時計算 8 個哈希值。

您也可以在這裏找到一些想法:http://www.zorinaq.com/papers/md5-amd64.html 。不過這些優化也許已經由編譯器完成了。