feat(util): add fuzzy string search procedure

This commit is contained in:
Leonardo de Moura 2014-09-05 14:42:13 -07:00
parent f958e534bd
commit a31a25798c
5 changed files with 125 additions and 2 deletions

View file

@ -88,3 +88,6 @@ add_executable(thread thread.cpp)
target_link_libraries(thread "util" ${EXTRA_LIBS})
add_test(thread ${CMAKE_CURRENT_BINARY_DIR}/thread)
add_dependencies(thread import_test)
add_executable(bitap_fuzzy_search bitap_fuzzy_search.cpp)
target_link_libraries(bitap_fuzzy_search "util" ${EXTRA_LIBS})
add_test(bitap_fuzzy_search ${CMAKE_CURRENT_BINARY_DIR}/bitap_fuzzy_search)

View file

@ -0,0 +1,20 @@
/*
Copyright (c) 2014 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Leonardo de Moura
*/
#include "util/test.h"
#include "util/bitap_fuzzy_search.h"
using namespace lean;
static void tst1() {
bitap_fuzzy_search matcher("dicr", 1);
lean_assert(matcher.match("discriminate"));
}
int main() {
save_stack_info();
tst1();
return has_violations() ? 1 : 0;
}

View file

@ -9,6 +9,7 @@ add_library(util trace.cpp debug.cpp name.cpp name_set.cpp
bit_tricks.cpp safe_arith.cpp ascii.cpp memory.cpp shared_mutex.cpp
realpath.cpp script_state.cpp script_exception.cpp rb_map.cpp
lua.cpp luaref.cpp lua_named_param.cpp stackinfo.cpp lean_path.cpp
serializer.cpp lbool.cpp thread_script_state.cpp ${THREAD_CPP})
serializer.cpp lbool.cpp thread_script_state.cpp bitap_fuzzy_search.cpp
${THREAD_CPP})
target_link_libraries(util ${LEAN_LIBS})

View file

@ -0,0 +1,63 @@
/*
Copyright (c) 2014 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Leonardo de Moura
*/
#include <string>
#include <iostream>
#include "util/exception.h"
#include "util/bitap_fuzzy_search.h"
namespace lean {
bitap_fuzzy_search::bitap_fuzzy_search(std::string const & pattern, unsigned k):
m_R(k+1) {
if (pattern.size() > 63)
throw exception("pattern is too long");
m_k = k;
m_pattern_size = pattern.size();
for (unsigned i = 0; i < mask_size; i++)
m_pattern_mask[i] = ~static_cast<uint64>(0);
for (unsigned i = 0; i < m_pattern_size; i++) {
unsigned u = static_cast<unsigned char>(pattern[i]);
m_pattern_mask[u] &= ~(static_cast<uint64>(1) << i);
}
}
size_t bitap_fuzzy_search::operator()(std::string const & text) {
if (m_pattern_size == 0)
return 0;
for (unsigned i = 0; i < m_k+1; i++)
m_R[i] = ~static_cast<uint64>(1);
unsigned text_sz = text.size();
for (unsigned i = 0; i < text_sz; i++) {
uint64 old_Rd1 = m_R[0];
unsigned u = static_cast<unsigned char>(text[i]);
uint64 Sc = m_pattern_mask[u];
m_R[0] = (m_R[0] | Sc) << 1;
for (unsigned d = 1; d < m_k+1; d++) {
uint64 tmp = m_R[d];
m_R[d] =
// Case 1. there is a match with <= d errors upto this point, and
// current character is matching
((m_R[d] | Sc) << 1) &
// Case 2. there is a match with <= d-1 errors upto this point.
// This case corresponds to substitution.
(old_Rd1 << 1) &
// Case 3. there is a match with <= d-1 errors upto this point.
// This case corresponds to deletion.
(m_R[d-1] << 1) &
// Case 3. there is a match with <= d-1 errors upto this point.
// This case corresponds to insertion.
old_Rd1;
old_Rd1 = tmp;
}
if ((m_R[m_k] & (static_cast<uint64>(1) << m_pattern_size)) == 0)
return i - m_pattern_size + 1;
}
return std::string::npos;
}
}

View file

@ -0,0 +1,36 @@
/*
Copyright (c) 2014 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Leonardo de Moura
*/
#pragma once
#include <string>
#include <vector>
#include <limits>
#include "util/int64.h"
namespace lean {
/** \brief The bitap algorithm (aka Baeza-YatesGonnet algorithm) for approximate string matching.
Reference: http://dl.acm.org/citation.cfm?doid=135239.135244
*/
class bitap_fuzzy_search {
static constexpr unsigned mask_size = std::numeric_limits<unsigned char>::max()+1;
unsigned m_pattern_size;
uint64 m_pattern_mask[mask_size];
unsigned m_k;
std::vector<uint64> m_R;
public:
/** \brief Create a searcher for the given pattern (upto k errors).
\pre pattern.size() < 63
*/
bitap_fuzzy_search(std::string const & pattern, unsigned k);
/** \brief Return the position of pattern in text with upto k "errors"
Return std::string::npos if the pattern does not occur in text.
*/
size_t operator()(std::string const & text);
bool match(std::string const & text) { return operator()(text) != std::string::npos; }
};
}