2016-08-22 189 views
2

實施例的字節最大序列:查找在兩個字節數組

{54,87,23,87,45,67,7,85,65,65,3,4,55,76, 65,64,5,6,4,54,45,6,4};

{76,57,65,3,4,55,76,65,64,5,6,4,54,45,8,65,66,57,6,7,7,56 ,6,7,44,57,8,76,54,67};

基本上,我有兩個字節[],並且需要在兩個字節中找到最大的相同字節序列。

我已經嘗試了明顯的事情,寫一些代碼,bruteforces結果:

var bestIndex = 0; 
var bestCount = 0; 
for (var i1 = 0; i1 + bestCount < data1.Length; i1++) 
{ 
    var currentCount = 0; 
    for (var i2 = 0; i2 < data2.Length; i2++) 
    { 
     if (data1[i1 + currentCount] == data2[i2]) 
     { 
      currentCount++; 
      if (i1 + currentCount == data1.Length) 
      { 
       bestCount = currentCount; 
       bestIndex = i1; 
       break; 
      } 
     } 
     else 
     { 
      if (currentCount > bestCount) 
      { 
       bestCount = currentCount; 
       bestIndex = i1; 
      } 
      currentCount = 0; 
     } 
    } 
    if (currentCount > bestCount) 
    { 
     bestCount = currentCount; 
     bestIndex = i1; 
    } 
} 

然而,在我的應用程序中的字節數組會大很多,高達GB甚至。所以基本上我需要一個關於如何更有效的提示/代碼。

+1

看起來像np完成我 – Steve

+0

啊,所以你有什麼問題? –

+0

@ rory.ap too slow ofc – Steve

回答

0

我對此有一些想法。我不確定這是否有幫助或傷害,但是您是否考慮過首先通過最大的可能性向後進行工作,以便在您找到匹配時立即終止。

 byte[] b1 = { 54, 87, 23, 87, 45, 67, 7, 85, 65, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 6, 4 }; 
     byte[] b2 = { 76, 57, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 8, 65, 66, 57, 6, 7, 7, 56, 6, 7, 44, 57, 8, 76, 54, 67 }; 

     //figure out which one is smaller, since that one will limit the range options 
     byte[] smaller; 
     byte[] bigger; 

     if (b1.Count() > b2.Count()) 
     { 
      bigger = b1; 
      smaller = b2; 
     } 
     else 
     { 
      bigger = b2; 
      smaller = b1; 
     } 


     // doesn't matter what order we put these in, since they will be ordered later by length 
     List<Tuple<int, int>> ranges = new List<Tuple<int, int>>(); 
     Parallel.For(0, smaller.Count(), (i1) => { 
      Parallel.For(i1 + 1, smaller.Count(), (i2) => 
      { 
       ranges.Add(new Tuple<int, int>(i1, i2)); 
      }); 
     }); 

     // order by length of slice produced by range in descending order 
     // this way, once we get an answer, we know nothing else can be longer 
     ranges = ranges.OrderByDescending(x => x.Item2 - x.Item1).ToList(); 

     Tuple<int, int> largestMatchingRange = new Tuple<int, int>(0, 0); 

     foreach (Tuple<int, int> range in ranges) 
     { 
      bool match = true; // set in outer loop to allow for break 

      for (int i1 = 0; i1 < bigger.Count(); i1++) 
      { 
       if (bigger.Count() <= i1 + (range.Item2 - range.Item1)) 
       { 
        //short cut if the available slice from the bigger array is shorter than the range length 
        match = false; 
        continue; 
       } 

       match = true; // reset to true to allow for new attempt for each larger array slice 

       for (int i2 = range.Item1, i1Temp = i1; i2 < range.Item2; i2++, i1Temp++) 
       { 
        if (bigger[i1Temp] != smaller[i2]) 
        { 
         match = false; 
         break; 
        } 
       } 
       if (match) 
       { 
        largestMatchingRange = range; 
        break; 
       } 
      } 
      if (match) 
      { 
       break; 
      } 
     } 

     byte[] largestMatchingBytes = smaller.Skip(largestMatchingRange.Item1).Take(largestMatchingRange.Item2 - largestMatchingRange.Item1).ToArray(); 
+0

PS。如果沒有匹配,這可能會更慢,並且需要更多的前期處理,所以在所有情況下您都不會看到積極的結果。 PPS。如果您正在處理更大的陣列,您可能需要長時間交換整數。 – Broom

+0

絕妙的主意!我認爲這將有助於我的情況,因爲我會一直這樣做,只有一個相當小,一個非常大的數組 – Whosdatdev

+0

另外,有沒有人知道如何獲得長度順序的範圍沒有得到它們,然後排序?我無法弄清楚這個模式,找出一個嵌套的循環來切斷前面的範圍選擇和排序 – Broom

0

我想出了循環,這應該是更快。

byte[] data1 = { 54, 87, 23, 87, 45, 67, 7, 85, 65, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 6, 4 }; 
byte[] data2 = { 76, 57, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 8, 65, 66, 57, 6, 7, 7, 56, 6, 7, 44, 57, 8, 76, 54, 67 }; 


//figure out which one is smaller, since that one will limit the range options 
byte[] smaller; 
byte[] bigger; 

if (data1.Count() > data2.Count()) 
{ 
    bigger = data1; 
    smaller = data2; 
} 
else 
{ 
    bigger = data2; 
    smaller = data1; 
} 

Tuple<int, int> largestMatchingRange = new Tuple<int, int>(0, 0); 

//iterate over slices in reverse length order 
for (int length = smaller.Count() - 1; length > 0; length--) 
{ 
    int numberOfSlicesForLength = smaller.Count() - length; 

    bool match = true; // set in outer loop to allow for break 

    for (int start = 0; start < numberOfSlicesForLength; start++) 
    { 
     //within a collection of similarly sized slices, we start with the slice found first within the array 
     Tuple<int, int> range = new Tuple<int, int>(start, start + length); 

     for (int i1 = 0; i1 < bigger.Count(); i1++) 
     { 
      if (bigger.Count() <= i1 + (range.Item2 - range.Item1)) 
      { 
       //short cut if the available slice from the bigger array is shorter than the range length 
       match = false; 
       continue; 
      } 

      match = true; // reset to true to allow for new attempt for each larger array slice 

      for (int i2 = range.Item1, i1Temp = i1; i2 < range.Item2; i2++, i1Temp++) 
      { 
       if (bigger[i1Temp] != smaller[i2]) 
       { 
        match = false; 
        break; 
       } 
      } 
      if (match) 
      { 
       largestMatchingRange = range; 
       break; 
      } 
     } 
     if (match) 
     { 
      break; 
     } 
    } 

    if (match) 
    { 
     break; 
    } 
} 

byte[] largestMatchingBytes = smaller.Skip(largestMatchingRange.Item1).Take(largestMatchingRange.Item2 - largestMatchingRange.Item1).ToArray(); 
0

不是逐個檢查字節,而是可以將每個字節值的索引位置保存在列表字典中。在你的情況下,256列表的數組可能會更好。

List<int>[] index(byte[] a) {  // List<long> if the array can be more than 2GB 
    var lists = new List<int>[256]; 
    for(int i = 0; i < a.Length; i++) { 
     var b = a[i]; 
     if (lists[b] == null) lists[b] = new List<int>(); 
     lists[b].Add(i); 
    } 
    return lists; 
} 

那麼你可以遍歷所有的256種可能的字節值

byte[] data1 = { 54, 87, 23, 87, 45, 67, 7, 85, 65, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 6, 4 }; 
byte[] data2 = { 76, 57, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 8, 65, 66, 57, 6, 7, 7, 56, 6, 7, 44, 57, 8, 76, 54, 67 }; 

var indexes1 = index(data1); 
var indexes2 = index(data2); 

var bestIndex = 0; 
var bestCount = 0; 

for (var b = 0; b < 256; b++) 
{ 
    var list1 = indexes1[b]; if (list1 == null) continue; 
    var list2 = indexes1[b]; if (list2 == null) continue; 

    foreach(var index1 in list1) 
    { 
     foreach (var index2 in list2) 
     { 
      // your code here 
      for (var i1 = index1; i1 < data1.Length - bestCount; i1++) 
      { 
       var currentCount = 0; 
       for (var i2 = index2; i2 < data2.Length; i2++) 
       { 
        if (data1[i1 + currentCount] == data2[i2]) 
        { 
         currentCount++; 
         if (i1 + currentCount == data1.Length) 
         { 
          bestCount = currentCount; 
          bestIndex = i1; 
          break; 
         } 
        } 
        else 
        { 
         if (currentCount > bestCount) 
         { 
          bestCount = currentCount; 
          bestIndex = i1; 
         } 
         currentCount = 0; 
        } 
       } 
       if (currentCount > bestCount) 
       { 
        bestCount = currentCount; 
        bestIndex = i1; 
       } 
      } 
     } 
    } 
} 

var best = data1.Skip(bestIndex).Take(bestCount); 
Debug.Print(bestIndex + ", " + bestCount + ": " + string.Join(", ", best)); 

理論上,這感覺就像它會採取更大的陣列更少的對比,但在實踐中,將有更多的內存高速緩存未命中,所以我不確定它是否會比其他答案中更線性的並行版本更快。我沒有想太多,但希望它能給你一些想法,以防萬一我弄錯了。

更新

我才意識到這種想法是多麼糟糕的正規機少於32 GB的內存索引列表將字節數組的4倍以上的內存。