-
Notifications
You must be signed in to change notification settings - Fork 97
/
simhashsearchindex.hpp
133 lines (119 loc) · 5.17 KB
/
simhashsearchindex.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SIMHASHSEARCHINDEX_HPP
#define SIMHASHSEARCHINDEX_HPP
#include <cstdint>
#include <mutex>
#include "util/persistentmap.hpp"
// Pretend uint128_t was a standard type already.
typedef __uint128_t uint128_t;
// A simple memory-mapping backed index for querying a 128-bit SimHash.
// Permutations of the full 128-bit value are used for turning the SimHash
// into a family of hashes.
//
// Determining the concrete number of buckets is a somewhat tricky trade-
// off between memory consumption, query performance, and desired accuracy, so
// I will describe an example calculation in the following:
//
// There is a trade-off where the similarity search deteriorates quickly
// if you include things that are too dissimilar with your target.
// So the first thing that we need to decide "for what maximum hamming dist-
// ance of the SimHash vector do we want the search to work well".
//
// First, let us gather all the data we need to perform our calculation:
//
// A: Approximate number of functions in our index: 50 million
// B: Maximum "distance" from the query that we wish to find: 20% ~ 0.2
// C: Maximum acceptable odds of missing an item when querying: 5% ~ 0.05
// D: Maximum number of items we wish to "touch" on each query: ~5 million
// (this will determine per-item query latency)
//
// The value B leads us to 0.2 * 128 ~= 26 bits difference -- so we need to
// make sure we can efficiently find SimHash fingerprints that differ in up
// to 26 places.
//
// The probability of a given item to fall inside a randomly picked bucket
// extracted from N bits of the SimHash is the same as the probability of
// obtaining a N-bit run of white balls when drawing IID from an urn with
// 102 white and 26 black balls.
//
// The probability for a run of a given length can be calculated using the
// following Python function (warning, likely to contain subtle errors):
//
// def probability_for_run(N, B, W, n):
// """ Calculates the probability of obtaining a run of n white balls
// when iid drawing from an urn with N balls of which B are black """
// result = 1.0
// B = float(B)
// n = float(n)
// N = float(N)
// for i in range(int(n)):
// result *= ((N - i - B)/(N - i))
// result *= (B / (N - n - 1))
// return result
//
// Summing up the probabilities for white runs of 9 or more balls, we obtain
// that the odds of getting 9 or more white balls in a row is about ~15.47%
//
// So we want to achieve (1.0 - 0.1547)^buckets < 0.05
// We find that (1.0 - 0.1547)^18 ~= 0.049 < 0.05, so we see that with 18
// buckets the odds of missing one item goes below 5%. If we wanted the odds
// of missing one item to go below 1%, we will need 28 buckets.
//
// The 50 million entries will be split using the 8-bit bucket IDs, leading to
// expected 50m / 256.0 elements per bucket (about 200k). If we look at 18
// buckets per query, we expect the code to have to examine about 3.6m
// elements per query; if we look at 28 buckets, the code will have to ex-
// amine about 5.5m elements per query.
//
// Examining an element should be comparatively cheap, so we should still
// be able to get acceptable latency out of this.
class SimHashSearchIndex {
public:
// Uniquely identifies a bitwise permutation of the 128-bit value.
typedef uint8_t PermutationIndex;
// Upper 64 bits of the hash.
typedef uint64_t HashValueA;
// Lower 64 bits of the hash.
typedef uint64_t HashValueB;
// Unique ID for the function.
typedef uint64_t FunctionID;
typedef std::tuple<PermutationIndex, HashValueA, HashValueB, FunctionID>
IndexEntry;
typedef uint64_t FileID;
typedef uint64_t Address;
typedef std::pair<FileID, Address> FileAndAddress;
SimHashSearchIndex(const std::string& indexname,
bool create, uint8_t buckets = 50);
uint64_t QueryTopN(uint64_t hash_A, uint64_t hash_B, uint32_t how_many,
std::vector<std::pair<float, FileAndAddress>>* results);
uint64_t AddFunction(uint64_t hash_A, uint64_t hash_B, FileID file_id,
Address address);
uint64_t GetIndexFileSize();
uint64_t GetIndexFileFreeSpace();
uint64_t GetIndexSetSize() const;
uint64_t GetNumberOfIndexedFunctions() const;
uint8_t GetNumberOfBuckets() const;
double GetOddsOfRandomHit(uint32_t count) const;
void DumpIndexToStdout(bool all) const;
private:
// TODO(thomasdullien): As soon as the codebase is ported to C++14,
// replace the following mutex with a shared_mutex to allow concurrent
// reading from the index.
mutable std::mutex mutex_;
PersistentMap<FunctionID, FileAndAddress> id_to_file_and_address_;
PersistentSet<IndexEntry> search_index_;
uint8_t buckets_;
};
#endif // SIMHASHSEARCHINDEX_HPP