1 | /* |
---|---|

2 | * Copyright (C) 2005, 2006, 2008, 2010, 2013 Apple Inc. All rights reserved. |

3 | * Copyright (C) 2010 Patrick Gansterer <paroga@paroga.com> |

4 | * |

5 | * This library is free software; you can redistribute it and/or |

6 | * modify it under the terms of the GNU Library General Public |

7 | * License as published by the Free Software Foundation; either |

8 | * version 2 of the License, or (at your option) any later version. |

9 | * |

10 | * This library is distributed in the hope that it will be useful, |

11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |

12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |

13 | * Library General Public License for more details. |

14 | * |

15 | * You should have received a copy of the GNU Library General Public License |

16 | * along with this library; see the file COPYING.LIB. If not, write to |

17 | * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, |

18 | * Boston, MA 02110-1301, USA. |

19 | * |

20 | */ |

21 | |

22 | #ifndef WTF_Hasher_h |

23 | #define WTF_Hasher_h |

24 | |

25 | #include <unicode/utypes.h> |

26 | #include <wtf/text/LChar.h> |

27 | |

28 | namespace WTF { |

29 | |

30 | // Paul Hsieh's SuperFastHash |

31 | // http://www.azillionmonkeys.com/qed/hash.html |

32 | |

33 | // LChar data is interpreted as Latin-1-encoded (zero extended to 16 bits). |

34 | |

35 | // NOTE: The hash computation here must stay in sync with the create_hash_table script in |

36 | // JavaScriptCore and the CodeGeneratorJS.pm script in WebCore. |

37 | |

38 | // Golden ratio. Arbitrary start value to avoid mapping all zeros to a hash value of zero. |

39 | static const unsigned stringHashingStartValue = 0x9E3779B9U; |

40 | |

41 | class StringHasher { |

42 | public: |

43 | static const unsigned flagCount = 8; // Save 8 bits for StringImpl to use as flags. |

44 | |

45 | StringHasher() |

46 | : m_hash(stringHashingStartValue) |

47 | , m_hasPendingCharacter(false) |

48 | , m_pendingCharacter(0) |

49 | { |

50 | } |

51 | |

52 | // The hasher hashes two characters at a time, and thus an "aligned" hasher is one |

53 | // where an even number of characters have been added. Callers that always add |

54 | // characters two at a time can use the "assuming aligned" functions. |

55 | void addCharactersAssumingAligned(UChar a, UChar b) |

56 | { |

57 | ASSERT(!m_hasPendingCharacter); |

58 | m_hash += a; |

59 | m_hash = (m_hash << 16) ^ ((b << 11) ^ m_hash); |

60 | m_hash += m_hash >> 11; |

61 | } |

62 | |

63 | void addCharacter(UChar character) |

64 | { |

65 | if (m_hasPendingCharacter) { |

66 | m_hasPendingCharacter = false; |

67 | addCharactersAssumingAligned(m_pendingCharacter, character); |

68 | return; |

69 | } |

70 | |

71 | m_pendingCharacter = character; |

72 | m_hasPendingCharacter = true; |

73 | } |

74 | |

75 | void addCharacters(UChar a, UChar b) |

76 | { |

77 | if (m_hasPendingCharacter) { |

78 | #if !ASSERT_DISABLED |

79 | m_hasPendingCharacter = false; |

80 | #endif |

81 | addCharactersAssumingAligned(m_pendingCharacter, a); |

82 | m_pendingCharacter = b; |

83 | #if !ASSERT_DISABLED |

84 | m_hasPendingCharacter = true; |

85 | #endif |

86 | return; |

87 | } |

88 | |

89 | addCharactersAssumingAligned(a, b); |

90 | } |

91 | |

92 | template<typename T, UChar Converter(T)> void addCharactersAssumingAligned(const T* data, unsigned length) |

93 | { |

94 | ASSERT(!m_hasPendingCharacter); |

95 | |

96 | bool remainder = length & 1; |

97 | length >>= 1; |

98 | |

99 | while (length--) { |

100 | addCharactersAssumingAligned(Converter(data[0]), Converter(data[1])); |

101 | data += 2; |

102 | } |

103 | |

104 | if (remainder) |

105 | addCharacter(Converter(*data)); |

106 | } |

107 | |

108 | template<typename T> void addCharactersAssumingAligned(const T* data, unsigned length) |

109 | { |

110 | addCharactersAssumingAligned<T, defaultConverter>(data, length); |

111 | } |

112 | |

113 | template<typename T, UChar Converter(T)> void addCharactersAssumingAligned(const T* data) |

114 | { |

115 | ASSERT(!m_hasPendingCharacter); |

116 | |

117 | while (T a = *data++) { |

118 | T b = *data++; |

119 | if (!b) { |

120 | addCharacter(Converter(a)); |

121 | break; |

122 | } |

123 | addCharactersAssumingAligned(Converter(a), Converter(b)); |

124 | } |

125 | } |

126 | |

127 | template<typename T> void addCharactersAssumingAligned(const T* data) |

128 | { |

129 | addCharactersAssumingAligned<T, defaultConverter>(data); |

130 | } |

131 | |

132 | template<typename T, UChar Converter(T)> void addCharacters(const T* data, unsigned length) |

133 | { |

134 | if (m_hasPendingCharacter && length) { |

135 | m_hasPendingCharacter = false; |

136 | addCharactersAssumingAligned(m_pendingCharacter, Converter(*data++)); |

137 | --length; |

138 | } |

139 | addCharactersAssumingAligned<T, Converter>(data, length); |

140 | } |

141 | |

142 | template<typename T> void addCharacters(const T* data, unsigned length) |

143 | { |

144 | addCharacters<T, defaultConverter>(data, length); |

145 | } |

146 | |

147 | template<typename T, UChar Converter(T)> void addCharacters(const T* data) |

148 | { |

149 | if (m_hasPendingCharacter && *data) { |

150 | m_hasPendingCharacter = false; |

151 | addCharactersAssumingAligned(m_pendingCharacter, Converter(*data++)); |

152 | } |

153 | addCharactersAssumingAligned<T, Converter>(data); |

154 | } |

155 | |

156 | template<typename T> void addCharacters(const T* data) |

157 | { |

158 | addCharacters<T, defaultConverter>(data); |

159 | } |

160 | |

161 | unsigned hashWithTop8BitsMasked() const |

162 | { |

163 | unsigned result = avalancheBits(); |

164 | |

165 | // Reserving space from the high bits for flags preserves most of the hash's |

166 | // value, since hash lookup typically masks out the high bits anyway. |

167 | result &= (1U << (sizeof(result) * 8 - flagCount)) - 1; |

168 | |

169 | // This avoids ever returning a hash code of 0, since that is used to |

170 | // signal "hash not computed yet". Setting the high bit maintains |

171 | // reasonable fidelity to a hash code of 0 because it is likely to yield |

172 | // exactly 0 when hash lookup masks out the high bits. |

173 | if (!result) |

174 | result = 0x80000000 >> flagCount; |

175 | |

176 | return result; |

177 | } |

178 | |

179 | unsigned hash() const |

180 | { |

181 | unsigned result = avalancheBits(); |

182 | |

183 | // This avoids ever returning a hash code of 0, since that is used to |

184 | // signal "hash not computed yet". Setting the high bit maintains |

185 | // reasonable fidelity to a hash code of 0 because it is likely to yield |

186 | // exactly 0 when hash lookup masks out the high bits. |

187 | if (!result) |

188 | result = 0x80000000; |

189 | |

190 | return result; |

191 | } |

192 | |

193 | template<typename T, UChar Converter(T)> static unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length) |

194 | { |

195 | StringHasher hasher; |

196 | hasher.addCharactersAssumingAligned<T, Converter>(data, length); |

197 | return hasher.hashWithTop8BitsMasked(); |

198 | } |

199 | |

200 | template<typename T, UChar Converter(T)> static unsigned computeHashAndMaskTop8Bits(const T* data) |

201 | { |

202 | StringHasher hasher; |

203 | hasher.addCharactersAssumingAligned<T, Converter>(data); |

204 | return hasher.hashWithTop8BitsMasked(); |

205 | } |

206 | |

207 | template<typename T> static unsigned computeHashAndMaskTop8Bits(const T* data, unsigned length) |

208 | { |

209 | return computeHashAndMaskTop8Bits<T, defaultConverter>(data, length); |

210 | } |

211 | |

212 | template<typename T> static unsigned computeHashAndMaskTop8Bits(const T* data) |

213 | { |

214 | return computeHashAndMaskTop8Bits<T, defaultConverter>(data); |

215 | } |

216 | |

217 | template<typename T, UChar Converter(T)> static unsigned computeHash(const T* data, unsigned length) |

218 | { |

219 | StringHasher hasher; |

220 | hasher.addCharactersAssumingAligned<T, Converter>(data, length); |

221 | return hasher.hash(); |

222 | } |

223 | |

224 | template<typename T, UChar Converter(T)> static unsigned computeHash(const T* data) |

225 | { |

226 | StringHasher hasher; |

227 | hasher.addCharactersAssumingAligned<T, Converter>(data); |

228 | return hasher.hash(); |

229 | } |

230 | |

231 | template<typename T> static unsigned computeHash(const T* data, unsigned length) |

232 | { |

233 | return computeHash<T, defaultConverter>(data, length); |

234 | } |

235 | |

236 | template<typename T> static unsigned computeHash(const T* data) |

237 | { |

238 | return computeHash<T, defaultConverter>(data); |

239 | } |

240 | |

241 | static unsigned hashMemory(const void* data, unsigned length) |

242 | { |

243 | // FIXME: Why does this function use the version of the hash that drops the top 8 bits? |

244 | // We want that for all string hashing so we can use those bits in StringImpl and hash |

245 | // strings consistently, but I don't see why we'd want that for general memory hashing. |

246 | ASSERT(!(length % 2)); |

247 | return computeHashAndMaskTop8Bits<UChar>(static_cast<const UChar*>(data), length / sizeof(UChar)); |

248 | } |

249 | |

250 | template<size_t length> static unsigned hashMemory(const void* data) |

251 | { |

252 | static_assert(!(length % 2), "length must be a multiple of two!"); |

253 | return hashMemory(data, length); |

254 | } |

255 | |

256 | private: |

257 | static UChar defaultConverter(UChar character) |

258 | { |

259 | return character; |

260 | } |

261 | |

262 | static UChar defaultConverter(LChar character) |

263 | { |

264 | return character; |

265 | } |

266 | |

267 | unsigned avalancheBits() const |

268 | { |

269 | unsigned result = m_hash; |

270 | |

271 | // Handle end case. |

272 | if (m_hasPendingCharacter) { |

273 | result += m_pendingCharacter; |

274 | result ^= result << 11; |

275 | result += result >> 17; |

276 | } |

277 | |

278 | // Force "avalanching" of final 31 bits. |

279 | result ^= result << 3; |

280 | result += result >> 5; |

281 | result ^= result << 2; |

282 | result += result >> 15; |

283 | result ^= result << 10; |

284 | |

285 | return result; |

286 | } |

287 | |

288 | unsigned m_hash; |

289 | bool m_hasPendingCharacter; |

290 | UChar m_pendingCharacter; |

291 | }; |

292 | |

293 | class IntegerHasher { |

294 | public: |

295 | void add(unsigned integer) |

296 | { |

297 | m_underlyingHasher.addCharactersAssumingAligned(integer, integer >> 16); |

298 | } |

299 | |

300 | unsigned hash() const |

301 | { |

302 | return m_underlyingHasher.hash(); |

303 | } |

304 | |

305 | private: |

306 | StringHasher m_underlyingHasher; |

307 | }; |

308 | |

309 | } // namespace WTF |

310 | |

311 | using WTF::IntegerHasher; |

312 | using WTF::StringHasher; |

313 | |

314 | #endif // WTF_Hasher_h |

315 |