請試試下面的查詢 - 它應該適用於你的情況。
注意查詢結尾的 WHERE similarity > -1。
通過設定 -1 以外的值,
你可以控制相似性閾值。越接近 1,捕捉到的配對就越相似;越接近 0,捕捉到的配對就越多!
// Self-join YourTable on ID and score every pair of distinct names that share
// an ID with a Levenshtein-based similarity (BigQuery legacy SQL + inline JS UDF).
// The score is 1 - distance / length(Name1), computed on the lowercased names:
// 1 means the normalised names are identical, and the score can drop below 0
// when Name2 is much longer than Name1, which is why the default threshold
// in the final WHERE clause is -1. Raise it towards 1 to keep closer matches only.
SELECT ID, Name1, Name2, similarity FROM
JS(// input table
(
  SELECT one.ID AS ID, one.Name AS Name1, two.Name AS Name2
  FROM YourTable AS one
  INNER JOIN YourTable AS two ON one.ID = two.ID
  // strict ordering keeps each unordered pair once and drops self-pairs
  WHERE one.Name < two.Name
) ,
// input columns
ID, Name1, Name2,
// output schema
"[{name: 'ID', type:'string'},
  {name: 'Name1', type:'string'},
  {name: 'Name2', type:'string'},
  {name: 'similarity', type:'float'}]
",
// function
"function(r, emit) {
  var Levenshtein = {
    /**
     * Calculate levenshtein distance of the two strings.
     *
     * @param str1 String the first string.
     * @param str2 String the second string.
     * @return Integer the levenshtein distance (0 and above).
     */
    get: function(str1, str2) {
      // base cases
      if (str1 === str2) return 0;
      if (str1.length === 0) return str2.length;
      if (str2.length === 0) return str1.length;

      // only one row of the distance matrix is stored; the row being
      // computed is written back into it one column behind the cursor
      var prevRow = new Array(str2.length + 1),
          curCol, nextCol, i, j, tmp;

      // initialise previous row
      for (i = 0; i < prevRow.length; ++i) {
        prevRow[i] = i;
      }

      // calculate current row distance from previous row
      for (i = 0; i < str1.length; ++i) {
        nextCol = i + 1;

        for (j = 0; j < str2.length; ++j) {
          curCol = nextCol;

          // substitution
          nextCol = prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1);

          // insertion
          tmp = curCol + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }

          // deletion
          tmp = prevRow[j + 1] + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }

          // copy current col value into previous (in preparation for next iteration)
          prevRow[j] = curCol;
        }

        // copy last col value into previous (in preparation for next iteration)
        prevRow[j] = nextCol;
      }

      return nextCol;
    }
  };

  // Normalise a name for comparison: URI-decode it when possible, then lowercase.
  // decodeURI throws on malformed escape sequences, so fall back to the raw value.
  var normalise = function(name) {
    try {
      return decodeURI(name).toLowerCase();
    } catch (ex) {
      return name.toLowerCase();
    }
  };

  // both declared with var so neither leaks into the global scope of the UDF
  var the_Name1 = normalise(r.Name1);
  var the_Name2 = normalise(r.Name2);

  // the distance is scaled by the length of Name1 only, so the score is not
  // symmetric and is negative whenever the distance exceeds that length
  emit({ID: r.ID, Name1: the_Name1, Name2: the_Name2,
        similarity: 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length});
}"
)
// similarity threshold: -1 keeps practically every pair
WHERE similarity > -1
ORDER BY similarity DESC
你可以用下面的例子來測試:
// Self-contained demo of the name-similarity query (BigQuery legacy SQL + inline
// JS UDF): the sample rows are listed inline instead of being read from a table.
// In legacy SQL a comma between subselects unions them, so each list in the
// inner FROM clauses is one ten-row table; it is repeated for both join sides.
// The score is 1 - distance / length(Name1), computed on the lowercased names:
// 1 means the normalised names are identical, and the score can drop below 0
// when Name2 is much longer than Name1, which is why the default threshold
// in the final WHERE clause is -1. Raise it towards 1 to keep closer matches only.
SELECT ID, Name1, Name2, similarity FROM
JS(// input table
(
  SELECT one.ID AS ID, one.Name AS Name1, two.Name AS Name2
  FROM (
    SELECT ID, Name FROM
      (SELECT '123ABC' AS ID, 'Joe Smith' AS Name),
      (SELECT '123ABC' AS ID, 'Joseph Smith' AS Name),
      (SELECT '345XYZ' AS ID, 'Michael Johnson' AS Name),
      (SELECT '345XYZ' AS ID, 'MikeJohnson' AS Name),
      (SELECT '678LMN' AS ID, 'Suzyjones' AS Name),
      (SELECT '678LMN' AS ID, 'Suzanne Mary Jones' AS Name),
      (SELECT 'AAA' AS ID, 'Jordan Tigani' AS Name),
      (SELECT 'AAA' AS ID, 'Felipe Hoffa' AS Name),
      (SELECT 'BBB' AS ID, 'Mikhail Berlyant' AS Name),
      (SELECT 'BBB' AS ID, 'Michael Sheldon' AS Name)
  ) AS one
  INNER JOIN (
    SELECT ID, Name FROM
      (SELECT '123ABC' AS ID, 'Joe Smith' AS Name),
      (SELECT '123ABC' AS ID, 'Joseph Smith' AS Name),
      (SELECT '345XYZ' AS ID, 'Michael Johnson' AS Name),
      (SELECT '345XYZ' AS ID, 'MikeJohnson' AS Name),
      (SELECT '678LMN' AS ID, 'Suzyjones' AS Name),
      (SELECT '678LMN' AS ID, 'Suzanne Mary Jones' AS Name),
      (SELECT 'AAA' AS ID, 'Jordan Tigani' AS Name),
      (SELECT 'AAA' AS ID, 'Felipe Hoffa' AS Name),
      (SELECT 'BBB' AS ID, 'Mikhail Berlyant' AS Name),
      (SELECT 'BBB' AS ID, 'Michael Sheldon' AS Name)
  ) AS two
  ON one.ID = two.ID
  // strict ordering keeps each unordered pair once and drops self-pairs
  WHERE one.Name < two.Name
) ,
// input columns
ID, Name1, Name2,
// output schema
"[{name: 'ID', type:'string'},
  {name: 'Name1', type:'string'},
  {name: 'Name2', type:'string'},
  {name: 'similarity', type:'float'}]
",
// function
"function(r, emit) {
  var Levenshtein = {
    /**
     * Calculate levenshtein distance of the two strings.
     *
     * @param str1 String the first string.
     * @param str2 String the second string.
     * @return Integer the levenshtein distance (0 and above).
     */
    get: function(str1, str2) {
      // base cases
      if (str1 === str2) return 0;
      if (str1.length === 0) return str2.length;
      if (str2.length === 0) return str1.length;

      // only one row of the distance matrix is stored; the row being
      // computed is written back into it one column behind the cursor
      var prevRow = new Array(str2.length + 1),
          curCol, nextCol, i, j, tmp;

      // initialise previous row
      for (i = 0; i < prevRow.length; ++i) {
        prevRow[i] = i;
      }

      // calculate current row distance from previous row
      for (i = 0; i < str1.length; ++i) {
        nextCol = i + 1;

        for (j = 0; j < str2.length; ++j) {
          curCol = nextCol;

          // substitution
          nextCol = prevRow[j] + ((str1.charAt(i) === str2.charAt(j)) ? 0 : 1);

          // insertion
          tmp = curCol + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }

          // deletion
          tmp = prevRow[j + 1] + 1;
          if (nextCol > tmp) {
            nextCol = tmp;
          }

          // copy current col value into previous (in preparation for next iteration)
          prevRow[j] = curCol;
        }

        // copy last col value into previous (in preparation for next iteration)
        prevRow[j] = nextCol;
      }

      return nextCol;
    }
  };

  // Normalise a name for comparison: URI-decode it when possible, then lowercase.
  // decodeURI throws on malformed escape sequences, so fall back to the raw value.
  var normalise = function(name) {
    try {
      return decodeURI(name).toLowerCase();
    } catch (ex) {
      return name.toLowerCase();
    }
  };

  // both declared with var so neither leaks into the global scope of the UDF
  var the_Name1 = normalise(r.Name1);
  var the_Name2 = normalise(r.Name2);

  // the distance is scaled by the length of Name1 only, so the score is not
  // symmetric and is negative whenever the distance exceeds that length
  emit({ID: r.ID, Name1: the_Name1, Name2: the_Name2,
        similarity: 1 - Levenshtein.get(the_Name1, the_Name2) / the_Name1.length});
}"
)
// similarity threshold: -1 keeps practically every pair
WHERE similarity > -1
ORDER BY similarity DESC
它產生以下結果
ID Name1 Name2 similarity
123ABC joe smith joseph smith 0.6666666666666667
345XYZ michael johnson mikejohnson 0.6666666666666667
678LMN suzanne mary jones suzyjones 0.5
BBB michael sheldon mikhail berlyant 0.4666666666666667
AAA felipe hoffa jordan tigani 0.0
修改標籤爲實際的數據庫 – dbmitch
你看過並測試過 http://stackoverflow.com/a/38513900/5221944 嗎?這個答案解決了這些細微差別(適用於 BigQuery),並且可以輕鬆移植到你的新示例中。順便說一句 - 你在之前的問題中成功實現它了嗎? –
@dbmitch - 你是什麼意思? – wizkids121