feat(util): add fuzzy string search procedure
This commit is contained in:
parent
f958e534bd
commit
a31a25798c
5 changed files with 125 additions and 2 deletions
|
@ -88,3 +88,6 @@ add_executable(thread thread.cpp)
|
|||
target_link_libraries(thread "util" ${EXTRA_LIBS})
|
||||
add_test(thread ${CMAKE_CURRENT_BINARY_DIR}/thread)
|
||||
add_dependencies(thread import_test)
|
||||
add_executable(bitap_fuzzy_search bitap_fuzzy_search.cpp)
|
||||
target_link_libraries(bitap_fuzzy_search "util" ${EXTRA_LIBS})
|
||||
add_test(bitap_fuzzy_search ${CMAKE_CURRENT_BINARY_DIR}/bitap_fuzzy_search)
|
||||
|
|
20
src/tests/util/bitap_fuzzy_search.cpp
Normal file
20
src/tests/util/bitap_fuzzy_search.cpp
Normal file
|
@ -0,0 +1,20 @@
|
|||
/*
|
||||
Copyright (c) 2014 Microsoft Corporation. All rights reserved.
|
||||
Released under Apache 2.0 license as described in the file LICENSE.
|
||||
|
||||
Author: Leonardo de Moura
|
||||
*/
|
||||
#include "util/test.h"
|
||||
#include "util/bitap_fuzzy_search.h"
|
||||
using namespace lean;
|
||||
|
||||
static void tst1() {
|
||||
bitap_fuzzy_search matcher("dicr", 1);
|
||||
lean_assert(matcher.match("discriminate"));
|
||||
}
|
||||
|
||||
int main() {
|
||||
save_stack_info();
|
||||
tst1();
|
||||
return has_violations() ? 1 : 0;
|
||||
}
|
|
@ -9,6 +9,7 @@ add_library(util trace.cpp debug.cpp name.cpp name_set.cpp
|
|||
bit_tricks.cpp safe_arith.cpp ascii.cpp memory.cpp shared_mutex.cpp
|
||||
realpath.cpp script_state.cpp script_exception.cpp rb_map.cpp
|
||||
lua.cpp luaref.cpp lua_named_param.cpp stackinfo.cpp lean_path.cpp
|
||||
serializer.cpp lbool.cpp thread_script_state.cpp ${THREAD_CPP})
|
||||
serializer.cpp lbool.cpp thread_script_state.cpp bitap_fuzzy_search.cpp
|
||||
${THREAD_CPP})
|
||||
|
||||
target_link_libraries(util ${LEAN_LIBS})
|
||||
|
|
63
src/util/bitap_fuzzy_search.cpp
Normal file
63
src/util/bitap_fuzzy_search.cpp
Normal file
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
Copyright (c) 2014 Microsoft Corporation. All rights reserved.
|
||||
Released under Apache 2.0 license as described in the file LICENSE.
|
||||
|
||||
Author: Leonardo de Moura
|
||||
*/
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include "util/exception.h"
|
||||
#include "util/bitap_fuzzy_search.h"
|
||||
|
||||
namespace lean {
|
||||
bitap_fuzzy_search::bitap_fuzzy_search(std::string const & pattern, unsigned k):
|
||||
m_R(k+1) {
|
||||
if (pattern.size() > 63)
|
||||
throw exception("pattern is too long");
|
||||
m_k = k;
|
||||
m_pattern_size = pattern.size();
|
||||
for (unsigned i = 0; i < mask_size; i++)
|
||||
m_pattern_mask[i] = ~static_cast<uint64>(0);
|
||||
for (unsigned i = 0; i < m_pattern_size; i++) {
|
||||
unsigned u = static_cast<unsigned char>(pattern[i]);
|
||||
m_pattern_mask[u] &= ~(static_cast<uint64>(1) << i);
|
||||
}
|
||||
}
|
||||
|
||||
size_t bitap_fuzzy_search::operator()(std::string const & text) {
|
||||
if (m_pattern_size == 0)
|
||||
return 0;
|
||||
for (unsigned i = 0; i < m_k+1; i++)
|
||||
m_R[i] = ~static_cast<uint64>(1);
|
||||
|
||||
unsigned text_sz = text.size();
|
||||
for (unsigned i = 0; i < text_sz; i++) {
|
||||
uint64 old_Rd1 = m_R[0];
|
||||
unsigned u = static_cast<unsigned char>(text[i]);
|
||||
uint64 Sc = m_pattern_mask[u];
|
||||
m_R[0] = (m_R[0] | Sc) << 1;
|
||||
|
||||
for (unsigned d = 1; d < m_k+1; d++) {
|
||||
uint64 tmp = m_R[d];
|
||||
m_R[d] =
|
||||
// Case 1. there is a match with <= d errors upto this point, and
|
||||
// current character is matching
|
||||
((m_R[d] | Sc) << 1) &
|
||||
// Case 2. there is a match with <= d-1 errors upto this point.
|
||||
// This case corresponds to substitution.
|
||||
(old_Rd1 << 1) &
|
||||
// Case 3. there is a match with <= d-1 errors upto this point.
|
||||
// This case corresponds to deletion.
|
||||
(m_R[d-1] << 1) &
|
||||
// Case 3. there is a match with <= d-1 errors upto this point.
|
||||
// This case corresponds to insertion.
|
||||
old_Rd1;
|
||||
old_Rd1 = tmp;
|
||||
}
|
||||
|
||||
if ((m_R[m_k] & (static_cast<uint64>(1) << m_pattern_size)) == 0)
|
||||
return i - m_pattern_size + 1;
|
||||
}
|
||||
return std::string::npos;
|
||||
}
|
||||
}
|
36
src/util/bitap_fuzzy_search.h
Normal file
36
src/util/bitap_fuzzy_search.h
Normal file
|
@ -0,0 +1,36 @@
|
|||
/*
|
||||
Copyright (c) 2014 Microsoft Corporation. All rights reserved.
|
||||
Released under Apache 2.0 license as described in the file LICENSE.
|
||||
|
||||
Author: Leonardo de Moura
|
||||
*/
|
||||
#pragma once
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include "util/int64.h"
|
||||
|
||||
namespace lean {
|
||||
/** \brief The bitap algorithm (aka Baeza-Yates–Gonnet algorithm) for approximate string matching.
|
||||
Reference: http://dl.acm.org/citation.cfm?doid=135239.135244
|
||||
*/
|
||||
class bitap_fuzzy_search {
|
||||
static constexpr unsigned mask_size = std::numeric_limits<unsigned char>::max()+1;
|
||||
unsigned m_pattern_size;
|
||||
uint64 m_pattern_mask[mask_size];
|
||||
unsigned m_k;
|
||||
std::vector<uint64> m_R;
|
||||
public:
|
||||
/** \brief Create a searcher for the given pattern (upto k errors).
|
||||
|
||||
\pre pattern.size() < 63
|
||||
*/
|
||||
bitap_fuzzy_search(std::string const & pattern, unsigned k);
|
||||
/** \brief Return the position of pattern in text with upto k "errors"
|
||||
Return std::string::npos if the pattern does not occur in text.
|
||||
*/
|
||||
size_t operator()(std::string const & text);
|
||||
|
||||
bool match(std::string const & text) { return operator()(text) != std::string::npos; }
|
||||
};
|
||||
}
|
Loading…
Reference in a new issue