2011-05-13 46 views
2

我們將boost路徑映射到字符串對,如name:location(絕對位置路徑a la usr/myfolder/)。我們給了一些位置la usr/myfolder/mysubfolder/myfile。如何找到最適合給定網址的地圖位置?有路徑的地圖如何將tham與給定路徑進行比較?

例子,我們有一個地圖就像它,如果我們需要,我們可以訴諸:

service1:myfolder/ 
service2:myfolder/mysubfolder/ 
service3:myfolder/myothersubfolder/ 
service4:myfolder/mysubfolder/myfile 

我們給定值myfolder/mysubfolder/myfile/blablabla/(路徑)。 我們想知道它與我們地圖中哪個項目最相關。 搜索結果爲service4爲最相關內容的地圖項。

那麼如何通過給定的字符串值來查找它最相關的地圖元素呢?

所以original question是關於一般字符串的情況下,但我有一些重新配置,所以沒有,我只是在升壓路徑上工作。

+0

當你說這是在地圖中,你是指「std :: map」還是一個向量?他們排序? – 2012-02-14 22:24:07

回答

1

您可以使用Levenshtein distance

編輯

因爲我最終需要類似的東西我自己,這個問題仍然開放。這是我玩過的一些代碼。既提高了字符串距離,也將Levenshtein算法應用於路徑標記。

C++代碼

/* 
----- String based Levenshtein ---- 
Matching : this/is/a/strange/path 

0 : this/is/a/strange/path 
2 : this/is/a/strong/path 
2 : that/is/a/strange/path 
4 : is/this/a/strange/path 
5 : this/is/a/strange 
13 : this/is/a 
15 : this/is 
16 : that/was 
18 : this 
24 : completely/different/folder 

----- Path based Levenshtein ---- 
Matching : this/is/a/strange/path 

0 : this/is/a/strange/path 
1 : this/is/a/strange 
2 : this/is/a/strong/path 
2 : that/is/a/strange/path 
2 : this/is/a 
2 : is/this/a/strange/path 
3 : this/is 
4 : this 
7 : that/was 
8 : completely/different/folder 
*/ 

#include <string> 
#include <vector> 
#include <algorithm> 
#include <stdio.h> 

/// returns the smaller of two parameters 
template< typename T > 
    T levmin(T v1, T v2) 
    { return (v1 < v2) ? v1 : v2; } 

/// Returns the Levenshtein distance between the specified strings 
template < typename T, typename T_STR > 
    typename T_STR::size_type levstr(const T_STR &s1, const T_STR &s2) 
{ 
    typename T_STR::size_type l1 = s1.length(), l2 = s2.length(); 
    std::vector< typename T_STR::size_type > d((l1 + 1) * (l2 + 1)); 

    typename T_STR::size_type i, j; 
    for (i = 0; i <= l1; i++) 
     d[ i * l2 ] = i; 

    for (i = 0; i <= l2; i++) 
     d[ i ] = i; 

    for (i = 1; i <= l1; i++) 
     for (j = 1; j <= l2; j++) 
      d[ i * l2 + j ] = levmin(levmin(d[ (i - 1) * l2 + j ] + 1, d[ i * l2 + (j - 1) ] + 1), 
             d[ (i - 1) * l2 + (j - 1) ] + (s1[ i - 1 ] == s2[ j - 1 ] ? 0 : 1) 
            ); 

    return d[ (l1 * l2) + l2 ];  
} 

/// Returns the Levenshtein distance between the specified split paths 
template < typename T, typename T_STR, typename T_LST > 
    typename T_STR::size_type levpath(const T_LST &lst1, const T_LST &lst2) 
{ 
    typename T_STR::size_type l1 = lst1.size(), l2 = lst2.size(); 
    std::vector< typename T_STR::size_type > d((l1 + 1) * (l2 + 1)); 

    typename T_STR::size_type i, j; 
    for (i = 0; i <= l1; i++) 
     d[ i * l2 ] = i; 

    for (i = 0; i <= l2; i++) 
     d[ i ] = i; 

    for (i = 1; i <= l1; i++) 
     for (j = 1; j <= l2; j++) 
      d[ i * l2 + j ] = levmin(levmin(d[ (i - 1) * l2 + j ] + 1, d[ i * l2 + (j - 1) ] + 1), 
             d[ (i - 1) * l2 + (j - 1) ] + levstr< T, T_STR>(lst1[ i - 1 ], lst2[ j - 1 ]) 
            ); 

    return d[ (l1 * l2) + l2 ];  
} 

/// Generic split path function 
/* 
    Returns a vector of path tokens 
*/ 
template < typename T, typename T_STR, typename T_LST > 
    T_LST splitpath(const T_STR &s, const T sep) 
    { T_LST lst; 
     typename T_STR::size_type i = 0, l = 0; 
     while(T_STR::npos != (i = s.find_first_of(sep, i))) 
     { if (l < i) 
       lst.push_back(T_STR(s, l, i - l)); 
      l = ++i; 
     } // end while 
     if (l < s.length()) 
      lst.push_back(T_STR(s, l, s.length() - l)); 
     return lst; 
    } 

/// Generic join path function 
/* 
    Returns a string of joined vector tokens 
*/ 
template < typename T, typename T_STR, typename T_LST > 
    T_STR joinpath(const T_LST &p, const T sep) 
    { T_STR s; 
     for (typename T_LST::const_iterator it = p.begin(); it != p.end(); it++) 
     { if (s.length()) s += sep; s += *it; } 
     return s; 
    } 


// String types 
typedef char t_levchar; 
typedef std::basic_string<t_levchar> t_levstr; 
typedef std::vector<t_levstr> t_levpath; 
typedef std::vector<t_levpath> t_levpathlist; 

// Sort compare for split paths 
template< typename T, typename T_STR, typename T_LST > struct levcmp 
{ levcmp(const T_LST &p) { m_p = p; } 
    bool operator() (const T_LST &i, const T_LST &j) 
    { return levpath< T, T_STR, T_LST >(i, m_p) < levpath< T, T_STR, T_LST >(j, m_p); } 
    T_LST m_p; 
}; 

// Sort compare for strings 
template< typename T, typename T_STR > struct levcmp_str 
{ levcmp_str(const T_STR &s) { m_s = s; } 
    bool operator() (const T_STR &i, const T_STR &j) 
    { return levstr< T, T_STR >(i, m_s) < levstr< T, T_STR >(j, m_s); } 
    T_STR m_s; 
}; 

int main(int argc, char* argv[]) 
{ 
    // Path to compare with 
    const t_levchar* compare_path = "this/is/a/strange/path"; 

    // Paths to sort 
    const t_levchar* path_list[] = 
    { 
     "this/is/a/strong/path", 
     "that/is/a/strange/path", 
     "this/is/a/strange", 
     "this/is", 
     "this/is/a", 
     "this", 
     "this/is/a/strange/path", 
     "is/this/a/strange/path", 
     "that/was", 
     "completely/different/folder", 
     0 
    }; 

    printf("\n----- String based Levenshtein ----\n" 
      "Matching : %s\n\n", compare_path); 

    // Create vector from paths   
    std::vector<t_levstr> paths; 
    for(int i = 0; path_list[ i ]; i++) 
     paths.push_back(path_list[ i ]); 

    // Sort the paths 
    std::sort(paths.begin(), paths.end(), levcmp_str< t_levchar, t_levstr >(compare_path)); 

    // Show the result 
    for (std::vector<t_levstr>::iterator it = paths.begin(); it != paths.end(); it++) 
     printf("%d : %s\n", 
       (int)levstr< t_levchar, t_levstr >(*it, compare_path), 
       (const char*)it->c_str()); 

    printf("\n----- Path based Levenshtein ----\n" 
      "Matching : %s\n\n", compare_path); 

    // Create vector from split paths 
    t_levpath splitcompare = splitpath< t_levchar, t_levstr, t_levpath >(compare_path, '/'); 
    t_levpathlist splitpaths; 
    for (int i = 0; path_list[ i ]; i++) 
     splitpaths.push_back(splitpath< t_levchar, t_levstr, t_levpath >(path_list[ i ], '/')); 

    // Sort the paths 
    std::sort(splitpaths.begin(), splitpaths.end(), 
       levcmp< t_levchar, t_levstr, t_levpath >(splitcompare)); 

    // Show the results 
    for (t_levpathlist::iterator it = splitpaths.begin(); it != splitpaths.end(); it++) 
     printf("%d : %s\n", 
       (int)levpath< t_levchar, t_levstr, t_levpath >(*it, splitcompare), 
       (const char*)joinpath< t_levchar, t_levstr, t_levpath >(*it, '/').c_str()); 

    return 0; 
} 
+0

Levenshtein似乎有點過分。或者,也許這個問題已經改變。 – 2012-02-14 22:22:15

+0

你可能是對的,這個問題在我看來似乎暗示路徑中可能沒有完全匹配。 – bob2 2012-02-14 22:57:57

+0

我沒有考慮過。這是一個確定的可能性。或者也許他需要處理路徑中的鏈接。我的回答並沒有處理。 – 2012-02-14 23:04:25

1

我真的沒有一個現成的C++的答案,但我最近做的在C#中類似的東西,並用以下想出了:通過

循環整個矢量,檢查有趣的路徑,看它是否以元素開始。 最長的這樣的比賽是勝利者。這將是O(n)操作,具體取決於比較集中路徑的數量。

我上面的改進版本變得有點不同了,因爲我將要檢查一些我之前已經檢查過的條目。因此,我按照路徑的長度降序對向量進行排序,這樣我遇到的第一個匹配也是最好的(給我平均的O(n/2)操作,我認爲),並且將結果存儲到數組中一本字典,所以我不需要再強制搜索。

希望這會有所幫助!

+0

如果你想更新你的答案,問題就改變了。 – 2012-02-14 22:23:00

0
//I don't know what path is, I'll pretend it's a std::string 
//algorithm is the same, just functions change 
const std::vector<boost::path>& services; 
const std::string& findme = ??? 

std::vector<boost::path>::iterator result = services.end(); 
for(auto i = services.begin(); i != services.end(); ++i) { 
    //if this path is shorter than the best so far, skip it 
    if (result == services.end() || i->second.size()>result->second.size()) { 
     const size_t colonpos = i->second.find(':', 8); //find colon 
     //if this path is longer than findme, skip it 
     if (findme.size() >= i->second.size()-colonpos) { 
      //make sure they match, in _reverse_ order (faster in this case) 
      int o=i->second.size()-1; 
      for(; o>=colonpos; --o) { 
       if (i->second[o] != findme[o-colonpos]) 
        break; 
      } 
      //if it was a match, this is the new best result 
      if (o < colonpos) 
       result = i; 
     } 
    } 
} 

//either result is services.end() meaning none matched 
//or result is the _longest_ path that matched 

顯然,boost::path不是std::string,但可能有一個得到std::string或類似物體,所以你只需要該成員添加到i->secondresult->second