有路径的地图如何将tham与给定路径进行比较?

问题描述:

我们将boost路径映射到字符串对,如name:location(绝对位置路径a la usr/myfolder/)。我们给了一些位置la usr/myfolder/mysubfolder/myfile。如何找到最适合给定网址的地图位置?有路径的地图如何将tham与给定路径进行比较?

例子,我们有一个地图就像它,如果我们需要,我们可以诉诸:

service1:myfolder/ 
service2:myfolder/mysubfolder/ 
service3:myfolder/myothersubfolder/ 
service4:myfolder/mysubfolder/myfile 

我们给定值myfolder/mysubfolder/myfile/blablabla/(路径)。 我们想知道它与我们地图中哪个项目最相关。 搜索结果为service4为最相关内容的地图项。

那么如何通过给定的字符串值来查找它最相关的地图元素呢?

所以original question是关于一般字符串的情况下,但我有一些重新配置,所以没有,我只是在升压路径上工作。

+0

当你说这是在地图中,你是指“std :: map”还是一个向量?他们排序? – 2012-02-14 22:24:07

您可以使用Levenshtein distance

编辑

因为我最终需要类似的东西我自己,这个问题仍然开放。这是我玩过的一些代码。既提高了字符串距离,也将Levenshtein算法应用于路径标记。

C++代码

/* 
----- String based Levenshtein ---- 
Matching : this/is/a/strange/path 

0 : this/is/a/strange/path 
2 : this/is/a/strong/path 
2 : that/is/a/strange/path 
4 : is/this/a/strange/path 
5 : this/is/a/strange 
13 : this/is/a 
15 : this/is 
16 : that/was 
18 : this 
24 : completely/different/folder 

----- Path based Levenshtein ---- 
Matching : this/is/a/strange/path 

0 : this/is/a/strange/path 
1 : this/is/a/strange 
2 : this/is/a/strong/path 
2 : that/is/a/strange/path 
2 : this/is/a 
2 : is/this/a/strange/path 
3 : this/is 
4 : this 
7 : that/was 
8 : completely/different/folder 
*/ 

#include <string> 
#include <vector> 
#include <algorithm> 
#include <stdio.h> 

/// returns the smaller of two parameters 
template< typename T > 
    T levmin(T v1, T v2) 
    { return (v1 < v2) ? v1 : v2; } 

/// Returns the Levenshtein distance between the specified strings 
template < typename T, typename T_STR > 
    typename T_STR::size_type levstr(const T_STR &s1, const T_STR &s2) 
{ 
    typename T_STR::size_type l1 = s1.length(), l2 = s2.length(); 
    std::vector< typename T_STR::size_type > d((l1 + 1) * (l2 + 1)); 

    typename T_STR::size_type i, j; 
    for (i = 0; i <= l1; i++) 
     d[ i * l2 ] = i; 

    for (i = 0; i <= l2; i++) 
     d[ i ] = i; 

    for (i = 1; i <= l1; i++) 
     for (j = 1; j <= l2; j++) 
      d[ i * l2 + j ] = levmin(levmin(d[ (i - 1) * l2 + j ] + 1, d[ i * l2 + (j - 1) ] + 1), 
             d[ (i - 1) * l2 + (j - 1) ] + (s1[ i - 1 ] == s2[ j - 1 ] ? 0 : 1) 
            ); 

    return d[ (l1 * l2) + l2 ];  
} 

/// Returns the Levenshtein distance between the specified split paths 
template < typename T, typename T_STR, typename T_LST > 
    typename T_STR::size_type levpath(const T_LST &lst1, const T_LST &lst2) 
{ 
    typename T_STR::size_type l1 = lst1.size(), l2 = lst2.size(); 
    std::vector< typename T_STR::size_type > d((l1 + 1) * (l2 + 1)); 

    typename T_STR::size_type i, j; 
    for (i = 0; i <= l1; i++) 
     d[ i * l2 ] = i; 

    for (i = 0; i <= l2; i++) 
     d[ i ] = i; 

    for (i = 1; i <= l1; i++) 
     for (j = 1; j <= l2; j++) 
      d[ i * l2 + j ] = levmin(levmin(d[ (i - 1) * l2 + j ] + 1, d[ i * l2 + (j - 1) ] + 1), 
             d[ (i - 1) * l2 + (j - 1) ] + levstr< T, T_STR>(lst1[ i - 1 ], lst2[ j - 1 ]) 
            ); 

    return d[ (l1 * l2) + l2 ];  
} 

/// Generic split path function 
/* 
    Returns a vector of path tokens 
*/ 
template < typename T, typename T_STR, typename T_LST > 
    T_LST splitpath(const T_STR &s, const T sep) 
    { T_LST lst; 
     typename T_STR::size_type i = 0, l = 0; 
     while(T_STR::npos != (i = s.find_first_of(sep, i))) 
     { if (l < i) 
       lst.push_back(T_STR(s, l, i - l)); 
      l = ++i; 
     } // end while 
     if (l < s.length()) 
      lst.push_back(T_STR(s, l, s.length() - l)); 
     return lst; 
    } 

/// Generic join path function 
/* 
    Returns a string of joined vector tokens 
*/ 
template < typename T, typename T_STR, typename T_LST > 
    T_STR joinpath(const T_LST &p, const T sep) 
    { T_STR s; 
     for (typename T_LST::const_iterator it = p.begin(); it != p.end(); it++) 
     { if (s.length()) s += sep; s += *it; } 
     return s; 
    } 


// String types 
typedef char t_levchar; 
typedef std::basic_string<t_levchar> t_levstr; 
typedef std::vector<t_levstr> t_levpath; 
typedef std::vector<t_levpath> t_levpathlist; 

// Sort compare for split paths 
template< typename T, typename T_STR, typename T_LST > struct levcmp 
{ levcmp(const T_LST &p) { m_p = p; } 
    bool operator() (const T_LST &i, const T_LST &j) 
    { return levpath< T, T_STR, T_LST >(i, m_p) < levpath< T, T_STR, T_LST >(j, m_p); } 
    T_LST m_p; 
}; 

// Sort compare for strings 
template< typename T, typename T_STR > struct levcmp_str 
{ levcmp_str(const T_STR &s) { m_s = s; } 
    bool operator() (const T_STR &i, const T_STR &j) 
    { return levstr< T, T_STR >(i, m_s) < levstr< T, T_STR >(j, m_s); } 
    T_STR m_s; 
}; 

int main(int argc, char* argv[]) 
{ 
    // Path to compare with 
    const t_levchar* compare_path = "this/is/a/strange/path"; 

    // Paths to sort 
    const t_levchar* path_list[] = 
    { 
     "this/is/a/strong/path", 
     "that/is/a/strange/path", 
     "this/is/a/strange", 
     "this/is", 
     "this/is/a", 
     "this", 
     "this/is/a/strange/path", 
     "is/this/a/strange/path", 
     "that/was", 
     "completely/different/folder", 
     0 
    }; 

    printf("\n----- String based Levenshtein ----\n" 
      "Matching : %s\n\n", compare_path); 

    // Create vector from paths   
    std::vector<t_levstr> paths; 
    for(int i = 0; path_list[ i ]; i++) 
     paths.push_back(path_list[ i ]); 

    // Sort the paths 
    std::sort(paths.begin(), paths.end(), levcmp_str< t_levchar, t_levstr >(compare_path)); 

    // Show the result 
    for (std::vector<t_levstr>::iterator it = paths.begin(); it != paths.end(); it++) 
     printf("%d : %s\n", 
       (int)levstr< t_levchar, t_levstr >(*it, compare_path), 
       (const char*)it->c_str()); 

    printf("\n----- Path based Levenshtein ----\n" 
      "Matching : %s\n\n", compare_path); 

    // Create vector from split paths 
    t_levpath splitcompare = splitpath< t_levchar, t_levstr, t_levpath >(compare_path, '/'); 
    t_levpathlist splitpaths; 
    for (int i = 0; path_list[ i ]; i++) 
     splitpaths.push_back(splitpath< t_levchar, t_levstr, t_levpath >(path_list[ i ], '/')); 

    // Sort the paths 
    std::sort(splitpaths.begin(), splitpaths.end(), 
       levcmp< t_levchar, t_levstr, t_levpath >(splitcompare)); 

    // Show the results 
    for (t_levpathlist::iterator it = splitpaths.begin(); it != splitpaths.end(); it++) 
     printf("%d : %s\n", 
       (int)levpath< t_levchar, t_levstr, t_levpath >(*it, splitcompare), 
       (const char*)joinpath< t_levchar, t_levstr, t_levpath >(*it, '/').c_str()); 

    return 0; 
} 
+0

Levenshtein似乎有点过分。或者,也许这个问题已经改变。 – 2012-02-14 22:22:15

+0

你可能是对的,这个问题在我看来似乎暗示路径中可能没有完全匹配。 – bob2 2012-02-14 22:57:57

+0

我没有考虑过。这是一个确定的可能性。或者也许他需要处理路径中的链接。我的回答并没有处理。 – 2012-02-14 23:04:25

我真的没有一个现成的C++的答案,但我最近做的在C#中类似的东西,并用以下想出了:通过

循环整个矢量,检查有趣的路径,看它是否以元素开始。 最长的这样的比赛是胜利者。这将是O(n)操作,具体取决于比较集中路径的数量。

我上面的改进版本变得有点不同了,因为我将要检查一些我之前已经检查过的条目。因此,我按照路径的长度降序对向量进行排序,这样我遇到的第一个匹配也是最好的(给我平均的O(n/2)操作,我认为),并且将结果存储到数组中一本字典,所以我不需要再强制搜索。

希望这会有所帮助!

+0

如果你想更新你的答案,问题就改变了。 – 2012-02-14 22:23:00

//I don't know what path is, I'll pretend it's a std::string 
//algorithm is the same, just functions change 
const std::vector<boost::path>& services; 
const std::string& findme = ??? 

std::vector<boost::path>::iterator result = services.end(); 
for(auto i = services.begin(); i != services.end(); ++i) { 
    //if this path is shorter than the best so far, skip it 
    if (result == services.end() || i->second.size()>result->second.size()) { 
     const size_t colonpos = i->second.find(':', 8); //find colon 
     //if this path is longer than findme, skip it 
     if (findme.size() >= i->second.size()-colonpos) { 
      //make sure they match, in _reverse_ order (faster in this case) 
      int o=i->second.size()-1; 
      for(; o>=colonpos; --o) { 
       if (i->second[o] != findme[o-colonpos]) 
        break; 
      } 
      //if it was a match, this is the new best result 
      if (o < colonpos) 
       result = i; 
     } 
    } 
} 

//either result is services.end() meaning none matched 
//or result is the _longest_ path that matched 

显然,boost::path不是std::string,但可能有一个得到std::string或类似物体,所以你只需要该成员添加到i->secondresult->second