SELECT fuzzy_match_first_word('catema', `word`, 80) FROM `dictionary` WHERE (`word` LIKE 'c%') AND (fuzzy_match_first_word('catema', `word`, 80)>=80)
另一个版本将对更大的字符串进行单词比较,而不仅仅是比较单个单词。
SELECT fuzzy_match('catema', `subject`, 80) FROM `dictionary` WHERE (fuzzy_match('catema', `subject`, 80)>=80)
预存程序:
DELIMITER //
CREATE OR REPLACE FUNCTION `fuzzy_match_first_word`(`str_needle` VARCHAR(64), `str_haystack` VARCHAR(4096), `minimum_quality` INT(11)) RETURNS INT(11)
DETERMINISTIC
BEGIN
DECLARE needleLen, haystackLen, iIdx, cLen, mLen, penalty, checkSpan, shiftAmount INT DEFAULT 0;
DECLARE sChar, subCharNeedle CHAR(1) DEFAULT ' ';
DECLARE res INT DEFAULT 100;
DECLARE n INT DEFAULT 2; -- assume first letter to be ok, needs to be checked by outer like on indexed field
DECLARE shifted INT DEFAULT 4; -- how often we allow letters being moved
SET needleLen = CHAR_LENGTH(str_needle);
SET haystackLen = CHAR_LENGTH(str_haystack);
SET checkSpan = 2; -- Check_span decides how wide to check - Min: 1, Max: Not sensible beyond 5.
IF (needleLen < 1) OR (haystackLen < 1) THEN SET res = 0; ELSE
SET sChar= LEFT(str_needle,1);
IF (haystackLen <= needleLen) THEN
SET cLen = haystackLen;
SET res = res-(20*(needleLen-haystackLen)); -- 30 penalty for each missing letter
if(res < minimum_quality) THEN RETURN 0; END IF;
SET mLen = cLen;
ELSE
SET cLen = needleLen;
SET mLen = haystackLen;
END IF;
WHILE n <= cLen DO
SET subCharNeedle = SUBSTRING(str_needle, n, 1);
IF(SUBSTRING(str_haystack, n + shiftAmount, 1) <> subCharNeedle) THEN
`fail_check`:
BEGIN -- check if not correct
SET penalty = 20; -- 20% reduction for each missed letter, 5% for closeness a close hit
FOR i IN 1..checkSpan DO
-- positive (assume missing letter more likely than a added letter)
SET iIdx = (n + i);
IF (iIdx > 0) AND (iIdx <= mLen) THEN
IF (SUBSTRING(str_haystack, iIdx + shiftAmount, 1) = subCharNeedle) THEN
SET penalty = 5*i;
IF shifted > 0 THEN
SET shifted = shifted-1;
SET shiftAmount = i + shiftAmount;
END IF;
LEAVE `fail_check`;
END IF;
END IF;
-- negative
SET iIdx = (n - i);
IF (iIdx > 0) AND (iIdx <= mLen) THEN
IF (SUBSTRING(str_haystack, iIdx + shiftAmount, 1) = subCharNeedle) THEN
SET penalty = 5*i;
IF shifted > 0 THEN
SET shifted = shifted-1;
SET shiftAmount = -i + shiftAmount;
END IF;
LEAVE `fail_check`;
END IF;
END IF;
END FOR;
END; -- end of fail_check
SET res = res - penalty;
if(res < minimum_quality) THEN RETURN 0; END IF;
END IF;
SET n = n + 1;
END WHILE;
END IF;
RETURN res;
END //
DELIMITER ;
DELIMITER //
CREATE OR REPLACE FUNCTION fuzzy_match(str_needle VARCHAR(64), str_haystack VARCHAR(4096), minimum_quality INT)
RETURNS INT DETERMINISTIC CONTAINS SQL
BEGIN
DECLARE needle_len, haystack_len, cIdx, iIdx, cLen, loop_abort, n INT DEFAULT 0;
DECLARE sub_len, check_span INT;
DECLARE sSub VARCHAR(4096);
DECLARE sChar, subChar_needle, subChar_tmp CHAR(1) DEFAULT ' ';
DECLARE res, rmatch_score, minq FLOAT DEFAULT 0;
SET str_needle = UPPER(REPLACE(TRIM(str_needle),' ',''));
SET str_haystack = UPPER(REPLACE(TRIM(str_haystack),' ',''));
SET needle_len = CHAR_LENGTH(str_needle);
SET haystack_len = CHAR_LENGTH(str_haystack);
SET minq = (minimum_quality / 100.0);
SET check_span = 2; -- Check_span decides how wide to check - Min: 1, Max: Not sensible beyond 5.
SET sChar= LEFT(str_needle,1);
IF (needle_len > 0) AND (haystack_len > 0) THEN
REPEAT
SET cIdx = IFNULL(LOCATE(sChar, str_haystack, cIdx+1), 0);
IF (cIdx > 0) THEN
SET sSub = SUBSTRING(str_haystack, cIdx, needle_len+1);
SET cLen = CHAR_LENGTH(sSub);
SET sub_len = CHAR_LENGTH(sSub);
SET cLen = (sub_len * (sub_len < needle_len)) + (needle_len * (sub_len >= needle_len));
SET rmatch_score = 0;
WHILE (loop_abort = 0) AND (n < cLen) DO
SET n = n + 1;
SET subChar_needle = SUBSTRING(str_needle, n, 1);
IF (subChar_tmp <> subChar_needle) THEN
SET subChar_tmp = subChar_needle;
FOR i IN -check_span..check_span DO
SET iIdx = (n + i - 1);
IF (iIdx >= 0) AND (iIdx < cLen) THEN
IF (subChar_needle = SUBSTRING(sSub, iIdx + 1, 1)) THEN
SET rmatch_score = rmatch_score + (check_span + 1 - ABS(i));
END IF;
END IF;
END FOR;
SET loop_abort = ((rmatch_score / (check_span * n)) < minq);
ELSE
SET rmatch_score = rmatch_score + check_span;
END IF;
END WHILE;
SET res = (rmatch_score / ((check_span + 1) * needle_len));
END IF;
UNTIL (cIdx <= 0) OR (res >= 1) END REPEAT;
END IF;
RETURN (res >= minq) * ROUND(res * 100);
END //
DELIMITER ;
4条答案
按热度按时间ryevplcw1#
我认为您应该使用
SOUNDS LIKE
或SOUNDEX()
由于您的数据集很小,一种解决方案可能是创建一个新表来存储每个文本字段中包含的各个单词或soundex值,并在该表上使用SOUNDS LIKE。
例如:
请参阅:http://dev.mysql.com/doc/refman/5.0/en/string-functions.html
我相信这是不可能的通配符seach字符串:(
efzxgjgh2#
MySQL不支持SOUNDEX全文搜索。
如果你想实现一个类似lucene的框架,这意味着你必须把所有的文档,把它们分成单词,然后为每个单词建立一个索引。
当有人搜索“right shlder”时,您必须对worlds表中的每个单词进行SOUNDEX搜索:
然后进行全文搜索:
其中,$result是包含第一个查询结果的数组。
ftf50wuq3#
您要查找的内容的技术术语是Levenshtein distance,用于计算两个序列(在本例中是字符串的字符序列)之间的差异。
PHP实际上有两个内置的函数,第一个是similar_text,另一个叫做levenshtein,它应该可以帮助你解决你的问题。如果它足够快,你必须进行基准测试。
elcex8rz4#
我们也遇到了同样的问题,我们做了两个相对较快的存储过程。Levenshtein距离很好,但对于自动完成搜索来说不是很好。“catema”将与“catamaran”匹配得相当好,尽管它的Levenshtein距离很差。使我们的函数非常适合于随你搜索的类型。
我们有2个版本,1是优化工作与一个大的索引单词表,并使用第一个字母,以缩小搜索的显着性能增益。
另一个版本将对更大的字符串进行单词比较,而不仅仅是比较单个单词。
预存程序: