Developer Documentation
GPUCacheOptimizer.cc
1/*===========================================================================*\
2 * *
3 * OpenFlipper *
4 * Copyright (c) 2001-2015, RWTH-Aachen University *
5 * Department of Computer Graphics and Multimedia *
6 * All rights reserved. *
7 * www.openflipper.org *
8 * *
9 *---------------------------------------------------------------------------*
10 * This file is part of OpenFlipper. *
11 *---------------------------------------------------------------------------*
12 * *
13 * Redistribution and use in source and binary forms, with or without *
14 * modification, are permitted provided that the following conditions *
15 * are met: *
16 * *
17 * 1. Redistributions of source code must retain the above copyright notice, *
18 * this list of conditions and the following disclaimer. *
19 * *
20 * 2. Redistributions in binary form must reproduce the above copyright *
21 * notice, this list of conditions and the following disclaimer in the *
22 * documentation and/or other materials provided with the distribution. *
23 * *
24 * 3. Neither the name of the copyright holder nor the names of its *
25 * contributors may be used to endorse or promote products derived from *
26 * this software without specific prior written permission. *
27 * *
28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
29 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
30 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A *
31 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER *
32 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
33 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
34 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
35 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
36 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
37 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
38 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
39 * *
40\*===========================================================================*/
41
42
43#include "GPUCacheOptimizer.hh"
44#include <cassert>
45#include <cmath>
46#include <vector>
47#include <cstring>
48
49//=============================================================================
50
51namespace ACG
52{
53
54//=============================================================================
55
56GPUCacheOptimizer::GPUCacheOptimizer( unsigned int NumTris, unsigned int NumVerts, unsigned int IndexSize, const void* pIndices) :
57 m_NumVerts(NumVerts),
58 m_NumTris(NumTris),
59 m_IndexSize(IndexSize),
60 m_pIndices(pIndices),
61 m_NumTransformations(0)
62{
63 m_pTriMap = new unsigned int[m_NumTris];
64}
65
66GPUCacheOptimizer::~GPUCacheOptimizer(void)
67{
68 delete [] m_pTriMap;
69}
70
71//=============================================================================
72
73unsigned int GPUCacheOptimizer::GetIndex(unsigned int i) const
74{
75 assert(i < m_NumTris * 3);
76
77 return GetIndex(i, m_IndexSize, m_pIndices);
78}
79
80unsigned int GPUCacheOptimizer::GetIndex(unsigned int i, unsigned int IndexSize, const void* pIB)
81{
82 switch (IndexSize)
83 {
84 case 4: return ((const unsigned int*)pIB)[i]; break;
85 case 2: return ((const unsigned short*)pIB)[i]; break;
86 case 1: return ((const unsigned char*)pIB)[i]; break;
87 default:
88 assert(i == 1 || i == 2 || i == 4); // throw error
89 }
90 return 0xFFFFFFFF;
91}
92
93void GPUCacheOptimizer::SetIndex(unsigned int i, unsigned int val, unsigned int IndexSize, void* pIB)
94{
95 switch (IndexSize)
96 {
97 case 4: ((unsigned int*)pIB)[i] = val; break;
98 case 2: ((unsigned short*)pIB)[i] = val; break;
99 case 1: ((unsigned char*)pIB)[i] = val; break;
100 default:
101 assert(i == 1 || i == 2 || i == 4); // throw error
102 }
103}
104
105//=============================================================================
106
107void GPUCacheOptimizer::WriteIndexBuffer(unsigned int DstIndexSize, void* pDst)
108{
109 assert(DstIndexSize == 1 ||DstIndexSize == 2 || DstIndexSize == 4);
110 // TODO: warning log, if DstIndexSize < m_IndexSize
111
112 // support for 'in-place' operation via tmpbuf copy
113 char* pSrc = (char*)m_pIndices;
114
115 int bTmpCopy = 0;
116 if (pDst == pSrc)
117 {
118 pSrc = new char[m_IndexSize * m_NumTris * 3];
119 memcpy(pSrc, m_pIndices, m_IndexSize * m_NumTris * 3);
120
121 bTmpCopy = 1;
122 }
123
124 for (unsigned int i = 0; i < m_NumTris; ++i)
125 {
126 for (int k = 0; k < 3; ++k)
127 {
128 unsigned int TriVertex = GetIndex(m_pTriMap[i] * 3 + k, m_IndexSize, pSrc);
129
130 // copy remapped tri indices
131 SetIndex(i * 3 + k, TriVertex, DstIndexSize, pDst);
132 }
133 }
134
135 if (bTmpCopy) delete [] pSrc;
136}
137
138//=============================================================================
139
140void GPUCacheOptimizer::RemapVertices(unsigned int NumTris, unsigned int NumVerts,
141 const unsigned int* pVertMap,
142 unsigned int IndexSize, void* pInOutIndices,
143 unsigned int VertexStride, void* pInOutVertices)
144{
145 if (pVertMap && pInOutIndices && pInOutVertices && VertexStride)
146 {
147 // make tmp vertex buffer copy
148 char* pTmpBuf = new char[VertexStride * NumVerts];
149 memcpy(pTmpBuf, pInOutVertices, VertexStride * NumVerts);
150
151 char* pVertexOut = (char*)pInOutVertices;
152
153 // apply on vertex buffer
154
155 for (unsigned int i = 0; i < NumVerts; ++i)
156 {
157 // some mapping destinations might be invalid
158 // this vertex is unused, ignore then
159 if (pVertMap[i] < NumVerts)
160 memcpy(pVertexOut + pVertMap[i] * VertexStride,
161 pTmpBuf + i * VertexStride, VertexStride);
162 }
163
164 // apply on index buffer
165
166 for (unsigned int i = 0; i < NumTris * 3; ++i)
167 {
168 // IndexBuffer[i] = VertMap[IndexBuffer[i]]
169
170 unsigned int v = GetIndex(i, IndexSize, pInOutIndices);
171 SetIndex(i, pVertMap[v], IndexSize, pInOutIndices);
172 }
173
174 delete [] pTmpBuf;
175 }
176}
177
178//=============================================================================
179
180void GPUCacheOptimizer::OptimizeVertices(unsigned int NumTris, unsigned int NumVerts,
181 unsigned int IndexSize, const void* pIndices,
182 unsigned int* pVertMap)
183{
184 // straight forward algorithm
185 // simply iterate over indices and increment vertex location if unvisited vertex found
186 unsigned int uCounter = 0; // vertex counter
187
188 memset(pVertMap, 0xFF, NumVerts * sizeof(unsigned int));
189
190 for (unsigned int i = 0; i < NumTris * 3; ++i)
191 {
192 unsigned int vertex;
193
194 if (IndexSize == 2) vertex = ((const unsigned short*)pIndices)[i];
195 else vertex = ((const unsigned int*)pIndices)[i];
196
197 if (pVertMap[vertex] == 0xFFFFFFFF)
198 pVertMap[vertex] = uCounter++;
199 }
200}
201
202//=============================================================================
203
204unsigned int GPUCacheOptimizer::ComputeNumberOfVertexTransformations(unsigned int VertexCacheSize)
205{
206 if (m_NumTransformations) return m_NumTransformations;
207
208 unsigned int NumIndices = 3 * m_NumTris;
209 if (!NumIndices) return 0;
210
211 unsigned int* Cache = new unsigned int[VertexCacheSize];
212 unsigned int NumSlotsInUse = 0;
213 unsigned int LastSlot = 0;
214 m_NumTransformations = 0;
215
216 for (unsigned int i = 0; i < m_NumTris; ++i)
217 {
218 unsigned int t = m_pTriMap[i];
219
220 // for each vertex of triangle t:
221 for (int k = 0; k < 3; ++k)
222 {
223 unsigned int Idx = GetIndex(t * 3 + k); // vertex index
224
225 int bInCache = 0;
226 for (unsigned int l = 0; l < NumSlotsInUse && !bInCache; ++l)
227 {
228 if (Cache[l] == Idx) bInCache = 1;
229 }
230
231 if (!bInCache)
232 {
233 ++m_NumTransformations;
234 if (NumSlotsInUse < VertexCacheSize)
235 {
236 Cache[NumSlotsInUse++] = Idx;
237 ++LastSlot;
238 }
239 else
240 {
241 if (LastSlot == VertexCacheSize) LastSlot = 0;
242 Cache[LastSlot++] = Idx;
243 }
244 }
245 }
246
247 }
248
249 delete [] Cache;
250
251 return m_NumTransformations;
252}
253
254//=============================================================================
255
256float GPUCacheOptimizer::ComputeACMR(unsigned int VertexCacheSize)
257{
258 unsigned int NumT = ComputeNumberOfVertexTransformations(VertexCacheSize);
259 return float(NumT) / float(m_NumTris);
260}
261
262//=============================================================================
263
264float GPUCacheOptimizer::ComputeATVR(unsigned int VertexCacheSize)
265{
266 unsigned int NumT = ComputeNumberOfVertexTransformations(VertexCacheSize);
267 return float(NumT) / float(m_NumVerts);
268}
269
270//=============================================================================
271
272// forsyth's score function
273void GPUCacheOptimizer::Opt_Vertex::FindScore(unsigned int MaxSizeVertexCache)
274{
275
276
277
278 float fNewScore = -1.0f; // -1 : vertex unused
279 if (iNumTrisLeft > 0)
280 {
281 if (iCachePos < 0) fNewScore = 0.0f; // not in FIFO
282 else
283 {
284
285 if (iCachePos < 3){ // last tri => fixed score
286
287 const float LastTriScore = 0.75f;
288 fNewScore = LastTriScore;
289
290 } else
291 {
292 const float CacheDecayPower = 1.5f;
293
294 // check for cache_pos < MaxSizeCachePos here..
295 // Points for being high in the cache.
296 const float Scaler = 1.0f / (MaxSizeVertexCache - 3);
297 fNewScore = 1.0f - ( iCachePos - 3 ) * Scaler;
298 fNewScore = powf( fNewScore, CacheDecayPower);
299 }
300 }
301
302 // Bonus points for having a low number of tris still to
303 // use the vert, so we get rid of lone verts quickly.
304
305 const float ValenceBoostScale = 2.0f;
306 const float ValenceBoostPower = 0.5f;
307
308 float ValenceBoost = powf( float(iNumTrisLeft), -float(ValenceBoostPower));
309 fNewScore += ValenceBoostScale * ValenceBoost;
310 }
311
312 fScore = fNewScore;
313}
314
315void GPUCacheOptimizer::Opt_Vertex::RemoveTriFromList(unsigned int tri)
316{
317 for (int k = 0; k < iNumTrisLeft; ++k)
318 {
319 // replace tri with last tri in this list
320 if (pTris[k] == tri)
321 {
322 pTris[k] = pTris[iNumTrisLeft-1];
323 break;
324 }
325 }
326 --iNumTrisLeft;
327}
328
329//=============================================================================
330// tipsify
331
332GPUCacheOptimizerTipsify::GPUCacheOptimizerTipsify(unsigned int CacheSize, unsigned int NumTris, unsigned int NumVerts, unsigned int IndexSize, const void *pIndices)
333: GPUCacheOptimizer(NumTris, NumVerts, IndexSize, pIndices)
334{
335 // optimization notes:
336 // - cache unfriendly layout of Opt_Vertex: high read/write access cost to Opt_vertex data
337
338 if (NumVerts < 3 || !NumTris) return;
339
340 Opt_Vertex* pVerts = new Opt_Vertex[NumVerts];
341 Opt_Tris* pTris = new Opt_Tris[NumTris];
342
343 // OPTIMIZATION: memset vs constructor initialization: memset - 400 ms, constructor - 770 ms (at 10 mil vertices)
344 memset(pVerts, 0, sizeof(Opt_Vertex) * NumVerts);
345
346 // build adjacency, same start as in forsyth class
347 for (unsigned int i = 0; i < NumTris; ++i)
348 {
349 // copy vertex indices of this tri
350 Opt_Tris* pThisTri = pTris + i;
351
352 // copy vertex indices of this tri
353 for (unsigned int k = 0; k < 3; ++k)
354 {
355
356 const unsigned int idx = GetIndex(i * 3 + k);
357 pThisTri->v[k] = idx;
358
359 // count # tris per vertex
360 ++pVerts[idx].iNumTrisTotal;
361 }
362 }
363
364 // OPTIMIZATION: allocate one buffer for the complete vertex adjacency list (instead of allocating a new buffer for each vertex)
365 const unsigned int vertexTriAdjBufSize = NumTris * 3;
366 unsigned int vertexTriAdjBufOffset = 0;
367 unsigned int* vertexTriAdjBuf = new unsigned int[vertexTriAdjBufSize];
368
369 // create list of tris per vertex
370 for (unsigned int i = 0; i < NumTris; ++i)
371 {
372 // add this tri to per vertex tri list
373 for (int k = 0; k < 3; ++k)
374 {
375 Opt_Vertex* pV = pVerts + pTris[i].v[k];
376 if (!pV->pTris) {
377 pV->pTris = vertexTriAdjBuf + vertexTriAdjBufOffset;
378 vertexTriAdjBufOffset += pV->iNumTrisTotal;
379 }
380
381 // abuse <numTrisLeft> as temporal up counter
382 // (automatically sums to numTris, exactly what we want)
383 pV->pTris[pV->iNumTrisLeft++] = i;
384
385 pV->iCachePos = 0;
386 }
387 }
388
389 // use the cache_pos of the OptFaces_Vertex as the time stamp
390
391 //=============================================================================
392 // OPTIMIZATION:
393 // push and pop on DeadEndVertexStack greatly increases processing time
394 // -> replace with fixed size ring stack
395 // std::vector<unsigned int> DeadEndVertexStack;
396 // DeadEndVertexStack.reserve(2048);
397 RingStack DeadEndVertexStack(128);
398
399
400 int f = 0; // arbitrary starting index (vertex)
401 int iTimeStamp = CacheSize + 1;
402 unsigned int i = 1; // cursor
403
404 unsigned int numTrisAdded = 0;
405
406 std::vector<unsigned int> N; // 1-ring of next candidates
407 N.reserve(2048);
408
409 while (f >= 0)
410 {
411 N.clear();
412
413 // this vertex
414 Opt_Vertex* pV = pVerts + f;
415
416 // for each adjacent tri of this vertex
417 for (int m = 0; m < pV->iNumTrisTotal; ++m)
418 {
419 Opt_Tris* pT = pTris + pV->pTris[m];
420
421 if (!pT->bAdded)
422 {
423 // append
424 m_pTriMap[numTrisAdded++] = pV->pTris[m];
425
426 for (int k = 0; k < 3; ++k)
427 {
428 // push to cache
429 const unsigned int v = pT->v[k];
430 // DeadEndVertexStack.push_back(pT->v[k]);
431 DeadEndVertexStack.push(v);
432
433 // insert
434 N.push_back(v);
435
436 Opt_Vertex* adjV = pVerts + v;
437
438 --adjV->iNumTrisLeft;
439
440 if (iTimeStamp - adjV->iCachePos > (int)CacheSize)
441 adjV->iCachePos = iTimeStamp++;
442 }
443 pT->bAdded = 1;
444 }
445 }
446
447
448 // select next fanning vertex
449 // Get-Next-Vertex
450 {
451 int n = -1, p = -1; // best candidate and priority
452 for (unsigned int k = 0; k < N.size(); ++k)
453 {
454 // for each vertex in N
455 Opt_Vertex* pV = pVerts + N[k];
456
457 if (pV->iNumTrisLeft > 0)
458 {
459 // error here in pseudo code:
460 // literal p should be named m here
461 // to find the best vertex
462 int m = 0;
463 if (iTimeStamp - pV->iCachePos + 2 * pV->iNumTrisLeft <= (int)CacheSize)
464 m = iTimeStamp - pV->iCachePos;
465
466 if (m > p)
467 {
468 p = m;
469 n = N[k];
470 }
471 }
472 }
473
474 if (n == -1)
475 {
476 // Skip-Dead-End
477 while (DeadEndVertexStack.length() && (n == -1))
478 {
479 // unsigned int d = DeadEndVertexStack.back();
480 // DeadEndVertexStack.pop_back();
481 const unsigned int d = DeadEndVertexStack.pop();
482
483 if (pVerts[d].iNumTrisLeft > 0)
484 n = d;
485 }
486
487 while (i+1 < NumVerts && (n == -1))
488 {
489 ++i;
490 if (pVerts[i].iNumTrisLeft > 0)
491 n = i;
492 }
493 }
494
495 f = n;
496 }
497 }
498
499 // debugging purpose
500 // int capac = N.capacity();
501 // capac = DeadEndVertexStack.capacity();
502
503 delete [] vertexTriAdjBuf;
504
505 delete [] pVerts;
506 delete [] pTris;
507}
508
509//=============================================================================
510
511GPUCacheEfficiencyTester::GPUCacheEfficiencyTester(unsigned int NumTris, unsigned int NumVerts,
512 unsigned int IndexSize, const void* pIndices)
513: GPUCacheOptimizer(NumTris, NumVerts, IndexSize, pIndices)
514{
515 for (unsigned int i = 0; i < NumTris; ++i) m_pTriMap[i] = i;
516}
517
518//=============================================================================
519
520
521}
GPUCacheOptimizerTipsify(unsigned int CacheSize, unsigned int NumTris, unsigned int NumVerts, unsigned int IndexSize, const void *pIndices)
The actual computation happens here in this constructor.
void WriteIndexBuffer(unsigned int DstIndexSize, void *pDst)
Applies the remapping on the initial pIndices constructor's param and stores the result in the given ...
unsigned int ComputeNumberOfVertexTransformations(unsigned int VertexCacheSize=16)
Returns the total number of vertex transforms performed with a certain VertexCache.
float ComputeACMR(unsigned int VertexCacheSize=16)
Measures the efficiency use of the vertex cache. ACMR: Average Cache Miss Ratio.
static void RemapVertices(unsigned int NumTris, unsigned int NumVerts, const unsigned int *pVertMap, unsigned int IndexSize, void *pInOutIndices, unsigned int VertexStride, void *pInOutVertices)
Applies the remap table of OptimizeVertices to a vertex and index buffer.
float ComputeATVR(unsigned int VertexCacheSize=16)
Measures the efficiency use of the vertex cache. ATVR: Average Transform to Vertex Ratio similar to A...
static void OptimizeVertices(unsigned int NumTris, unsigned int NumVerts, unsigned int IndexSize, const void *pIndices, unsigned int *pVertMap)
Reorders vertex buffer to minimize memory address jumps.
Namespace providing different geometric functions concerning angles.
Simple and fast fixed size stack used in tipsify implementation.
unsigned int length() const
current stack length
unsigned int v[3]
vertices of this triangle
void FindScore(unsigned int MaxSizeVertexCache)
forsyth's score function