39 | |

40 | #ifndef QDRAWINGPRIMITIVE_SSE2_P_H |

41 | #define QDRAWINGPRIMITIVE_SSE2_P_H |

42 | |

43 | #include <QtGui/private/qtguiglobal_p.h> |

44 | #include <private/qsimd_p.h> |

45 | #include "qdrawhelper_x86_p.h" |

46 | #include "qrgba64_p.h" |

47 | |

48 | #ifdef __SSE2__ |

49 | |

50 | // |

51 | // W A R N I N G |

52 | // ------------- |

53 | // |

54 | // This file is not part of the Qt API. It exists purely as an |

55 | // implementation detail. This header file may change from version to |

56 | // version without notice, or even be removed. |

57 | // |

58 | // We mean it. |

59 | // |

60 | |

61 | QT_BEGIN_NAMESPACE |

62 | |

/*
 * BYTE_MUL_SSE2: scale every 8-bit component of pixelVector by alphaChannel / 255.
 *
 *  result        receives the scaled pixels. It is only written by the last
 *                statement, so it may be the same variable as pixelVector.
 *  pixelVector   the packed pixels to scale
 *  alphaChannel  each 32-bit component must be in the form 0x00AA00AA
 *  colorMask     must hold 0x00ff00ff in each 32-bit component
 *  half          must hold 128 (0x80) in every 16-bit lane; it is the rounding
 *                term added with _mm_add_epi16
 */
#define BYTE_MUL_SSE2(result, pixelVector, alphaChannel, colorMask, half) \
{ \
    /* 1 + 2. Spread the components over two vectors so that each one owns a \
       16-bit lane (RB lanes look like 0x00RR00BB, AG lanes like 0x00AA00GG) \
       and multiply both vectors by the alpha channel */ \
    __m128i pixelVectorRB = _mm_mullo_epi16(_mm_and_si128(pixelVector, colorMask), alphaChannel); \
    __m128i pixelVectorAG = _mm_mullo_epi16(_mm_srli_epi16(pixelVector, 8), alphaChannel); \
    \
    /* 3. Division by 255, done as in BYTE_MUL() with shifts only: \
       X/255 ~= (X + X/256 + rounding)/256. \
       Build the numerator (X + X/256 + rounding) first */ \
    pixelVectorRB = _mm_add_epi16(_mm_add_epi16(pixelVectorRB, _mm_srli_epi16(pixelVectorRB, 8)), half); \
    pixelVectorAG = _mm_add_epi16(_mm_add_epi16(pixelVectorAG, _mm_srli_epi16(pixelVectorAG, 8)), half); \
    \
    /* 4. Divide by 256 and merge the two halves. \
       RB is shifted down by 8. AG would need >> 8 followed by << 8 to land \
       in the upper byte of each lane; masking with ~colorMask gives the \
       same bytes with a single instruction */ \
    result = _mm_or_si128(_mm_andnot_si128(colorMask, pixelVectorAG), _mm_srli_epi16(pixelVectorRB, 8)); \
}

100 | |

/*
 * INTERPOLATE_PIXEL_255_SSE2: per 8-bit component, compute
 *     (src * alpha + dst * oneMinusAlpha) / 255
 * using the same shift-based division by 255 as BYTE_MUL_SSE2.
 *
 *  alphaChannel          each 32-bit component must be in the form 0x00AA00AA
 *  oneMinusAlphaChannel  must be 255 - alpha, in the same layout as alphaChannel
 *  colorMask             must hold 0x00ff00ff in each 32-bit component
 *  half                  must hold 128 (0x80) in every 16-bit lane
 */
#define INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, alphaChannel, oneMinusAlphaChannel, colorMask, half) { \
    /* AG half: move alpha/green into the low byte of each 16-bit lane, \
       weight source and destination, then sum */ \
    __m128i finalAG = _mm_add_epi16(_mm_mullo_epi16(_mm_srli_epi16(srcVector, 8), alphaChannel), \
                                    _mm_mullo_epi16(_mm_srli_epi16(dstVector, 8), oneMinusAlphaChannel)); \
    /* X/255 ~= (X + X/256 + rounding)/256; the final /256 is done by keeping \
       only the upper byte of each lane, which is also where A and G belong */ \
    finalAG = _mm_add_epi16(_mm_add_epi16(finalAG, _mm_srli_epi16(finalAG, 8)), half); \
    finalAG = _mm_andnot_si128(colorMask, finalAG); \
    \
    /* RB half: red/blue are already in the low byte of each lane */ \
    __m128i finalRB = _mm_add_epi16(_mm_mullo_epi16(_mm_and_si128(srcVector, colorMask), alphaChannel), \
                                    _mm_mullo_epi16(_mm_and_si128(dstVector, colorMask), oneMinusAlphaChannel)); \
    finalRB = _mm_add_epi16(_mm_add_epi16(finalRB, _mm_srli_epi16(finalRB, 8)), half); \
    finalRB = _mm_srli_epi16(finalRB, 8); \
    \
    /* merge both halves back into packed pixels */ \
    result = _mm_or_si128(finalAG, finalRB); \
}

131 | |

// Same operation as BLEND_SOURCE_OVER_ARGB32_SSE2, but for the single vector
// srcVector, blended over the pixels starting at dst[x].
// Note that x is not a macro parameter: it is picked up from the scope in
// which the macro is expanded. &dst[x] is accessed with aligned loads/stores.
#define BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask) { \
    const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) != 0xffff) { \
        /* at least one pixel is not opaque; nothing to do at all when \
           every pixel is fully transparent */ \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            /* Spread the alpha over 2 x 16 bits so there is room for the \
               multiplication: each 32 bits becomes 0x00AA00AA, where A is \
               1 - alpha (i.e. one - source alpha) */ \
            __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); \
            alphaChannel = _mm_sub_epi16(one, _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16))); \
            \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            \
            /* result = s + d * (1-alpha) */ \
            _mm_store_si128((__m128i *)&dst[x], _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha)); \
        } \
    } else { \
        /* all pixels opaque: the source simply replaces the destination */ \
        _mm_store_si128((__m128i *)&dst[x], srcVector); \
    } \
}

157 | |

158 | |

// Blend src over dst, using the alpha of each source pixel.
// nullVector, half, one, colorMask and alphaMask are constant across the whole
// image/texture, and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if fully opaque or fully transparent.
//
// blend_pixel(), ALIGNMENT_PROLOGUE_16BYTES and SIMD_EPILOGUE are not defined
// here; they must be available where the macro is expanded.
#define BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask) { \
    int x = 0; \
    \
    /* Pixel-by-pixel until dst is aligned */ \
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { \
        blend_pixel(dst[x], src[x]); \
    } \
    \
    /* Main part: one vector per iteration; src is read unaligned */ \
    while (x < length-3) { \
        const __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[x]); \
        BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask) \
        x += 4; \
    } \
    /* Pixel-by-pixel for whatever the vector loop left over */ \
    SIMD_EPILOGUE(x, length, 3) { \
        blend_pixel(dst[x], src[x]); \
    } \
}

186 | |

// Blend src over dst with an additional constant alpha, given as constAlphaVector.
// nullVector, half, one, colorMask are constant across the whole image/texture,
// and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//
// constAlphaVector is handed to BYTE_MUL_SSE2 as its alphaChannel argument.
//
// The computation being done is:
// dest = (s + d * sia) * ca + d * cia
//      = s * ca + d * (sia * ca + cia)
//      = s * ca + d * (1 - sa*ca)
//
// The scalar prologue/epilogue call blend_pixel() with a variable named
// const_alpha, which is not a macro parameter: it has to exist in the scope
// in which the macro is expanded.
#define BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector) \
{ \
    int x = 0; \
    \
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { \
        blend_pixel(dst[x], src[x], const_alpha); \
    } \
    \
    for (; x < length-3; x += 4) { \
        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[x]); \
        /* leave dst untouched when all the loaded source pixels equal nullVector */ \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) == 0xffff) \
            continue; \
        \
        /* s * ca */ \
        BYTE_MUL_SSE2(srcVector, srcVector, constAlphaVector, colorMask, half); \
        \
        /* (1 - sa*ca), spread as 0x00AA00AA over each 32 bits */ \
        __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); \
        alphaChannel = _mm_sub_epi16(one, _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16))); \
        \
        /* d * (1 - sa*ca) */ \
        const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
        __m128i destMultipliedByOneMinusAlpha; \
        BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
        \
        _mm_store_si128((__m128i *)&dst[x], _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha)); \
    } \
    SIMD_EPILOGUE(x, length, 3) { \
        blend_pixel(dst[x], src[x], const_alpha); \
    } \
}

227 | |

228 | QT_END_NAMESPACE |

229 | |

230 | #endif // __SSE2__ |

231 | |

232 | QT_BEGIN_NAMESPACE |

233 | #if QT_COMPILER_SUPPORTS_HERE(SSE4_1) |

234 | QT_FUNCTION_TARGET(SSE2) |

235 | Q_ALWAYS_INLINE void Q_DECL_VECTORCALL reciprocal_mul_ss(__m128 &ia, const __m128 a, float mul) |

236 | { |

237 | ia = _mm_rcp_ss(a); // Approximate 1/a |

238 | // Improve precision of ia using Newton-Raphson |

239 | ia = _mm_sub_ss(_mm_add_ss(ia, ia), _mm_mul_ss(ia, _mm_mul_ss(ia, a))); |

240 | ia = _mm_mul_ss(ia, _mm_set_ss(mul)); |

241 | ia = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0,0,0,0)); |

242 | } |

243 | |

244 | QT_FUNCTION_TARGET(SSE4_1) |

245 | inline QRgb qUnpremultiply_sse4(QRgb p) |

246 | { |

247 | const uint alpha = qAlpha(p); |

248 | if (alpha == 255) |

249 | return p; |

250 | if (alpha == 0) |

251 | return 0; |

252 | const __m128 va = _mm_set1_ps(alpha); |

253 | __m128 via; |

254 | reciprocal_mul_ss(via, va, 255.0f); // Approximate 1/a |

255 | __m128i vl = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p)); |

256 | vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl), via)); |

257 | vl = _mm_packus_epi32(vl, vl); |

258 | vl = _mm_insert_epi16(vl, alpha, 3); |

259 | vl = _mm_packus_epi16(vl, vl); |

260 | return _mm_cvtsi128_si32(vl); |

261 | } |

262 | |

263 | template<enum QtPixelOrder PixelOrder> |

264 | QT_FUNCTION_TARGET(SSE4_1) |

265 | inline uint qConvertArgb32ToA2rgb30_sse4(QRgb p) |

266 | { |

267 | const uint alpha = qAlpha(p); |

268 | if (alpha == 255) |

269 | return qConvertRgb32ToRgb30<PixelOrder>(p); |

270 | if (alpha == 0) |

271 | return 0; |

272 | Q_CONSTEXPR float mult = 1023.0f / (255 >> 6); |

273 | const uint newalpha = (alpha >> 6); |

274 | const __m128 va = _mm_set1_ps(alpha); |

275 | __m128 via; |

276 | reciprocal_mul_ss(via, va, mult * newalpha); |

277 | __m128i vl = _mm_cvtsi32_si128(p); |

278 | vl = _mm_cvtepu8_epi32(vl); |

279 | vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl), via)); |

280 | vl = _mm_packus_epi32(vl, vl); |

281 | uint rgb30 = (newalpha << 30); |

282 | rgb30 |= ((uint)_mm_extract_epi16(vl, 1)) << 10; |

283 | if (PixelOrder == PixelOrderRGB) { |

284 | rgb30 |= ((uint)_mm_extract_epi16(vl, 2)) << 20; |

285 | rgb30 |= ((uint)_mm_extract_epi16(vl, 0)); |

286 | } else { |

287 | rgb30 |= ((uint)_mm_extract_epi16(vl, 0)) << 20; |

288 | rgb30 |= ((uint)_mm_extract_epi16(vl, 2)); |

289 | } |

290 | return rgb30; |

291 | } |

292 | |

293 | template<enum QtPixelOrder PixelOrder> |

294 | QT_FUNCTION_TARGET(SSE4_1) |

295 | inline uint qConvertRgba64ToRgb32_sse4(QRgba64 p) |

296 | { |

297 | if (p.isTransparent()) |

298 | return 0; |

299 | __m128i vl = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&p)); |

300 | if (!p.isOpaque()) { |

301 | const __m128 va = _mm_set1_ps(p.alpha()); |

302 | __m128 via; |

303 | reciprocal_mul_ss(via, va, 65535.0f); |

304 | vl = _mm_unpacklo_epi16(vl, _mm_setzero_si128()); |

305 | vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl) , via)); |

306 | vl = _mm_packus_epi32(vl, vl); |

307 | vl = _mm_insert_epi16(vl, p.alpha(), 3); |

308 | } |

309 | if (PixelOrder == PixelOrderBGR) |

310 | vl = _mm_shufflelo_epi16(vl, _MM_SHUFFLE(3, 0, 1, 2)); |

311 | return toArgb32(vl); |

312 | } |

313 | #endif |

314 | QT_END_NAMESPACE |

315 | |

316 | #endif // QDRAWINGPRIMITIVE_SSE2_P_H |

317 |