lean2/src/util/bitap_fuzzy_search.h
2014-09-05 18:01:09 -07:00

36 lines
1.2 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
Copyright (c) 2014 Microsoft Corporation. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Author: Leonardo de Moura
*/
#pragma once
#include <string>
#include <vector>
#include <limits>
#include "util/int64.h"
namespace lean {
/** \brief The bitap algorithm (aka Baeza-Yates-Gonnet algorithm) for approximate string matching.
Reference: http://dl.acm.org/citation.cfm?doid=135239.135244
*/
class bitap_fuzzy_search {
private:
    /** \brief Number of entries in \c m_pattern_mask: one slot per distinct <tt>unsigned char</tt> value. */
    static constexpr unsigned mask_size = std::numeric_limits<unsigned char>::max() + 1;

    /* NOTE(review): the members below are initialized and used in the .cpp file, which is
       not visible from this header. Judging by their names they hold the pattern length,
       a per-character bit mask table, the error budget k, and the per-search state
       vectors of the algorithm -- confirm against the implementation.
       Their declaration order is kept as is, since it fixes the initialization order. */
    unsigned            m_pattern_size;
    uint64              m_pattern_mask[mask_size];
    unsigned            m_k;
    std::vector<uint64> m_R;

public:
    /** \brief Build a searcher for \c pattern that tolerates up to \c k errors.

        \pre pattern.size() < 63
    */
    bitap_fuzzy_search(std::string const & pattern, unsigned k);

    /** \brief Search \c text for the pattern, allowing up to k "errors".

        \return the position of the pattern in \c text, or \c std::string::npos
        when the pattern does not occur in \c text.

        NOTE(review): this is a non-const member; presumably it updates \c m_R
        while scanning -- confirm in the .cpp file before sharing one searcher
        between threads.
    */
    size_t operator()(std::string const & text);

    /** \brief Convenience predicate: \c true iff operator()(text) finds an occurrence,
        i.e., its result differs from \c std::string::npos.
    */
    bool match(std::string const & text) {
        size_t const position = (*this)(text);
        return position != std::string::npos;
    }
};
}