Developer Documentation
GPUCacheOptimizer.cc
/*===========================================================================*\
 * *
 * OpenFlipper *
 * Copyright (c) 2001-2015, RWTH-Aachen University *
 * Department of Computer Graphics and Multimedia *
 * All rights reserved. *
 * www.openflipper.org *
 * *
 *---------------------------------------------------------------------------*
 * This file is part of OpenFlipper. *
 *---------------------------------------------------------------------------*
 * *
 * Redistribution and use in source and binary forms, with or without *
 * modification, are permitted provided that the following conditions *
 * are met: *
 * *
 * 1. Redistributions of source code must retain the above copyright notice, *
 * this list of conditions and the following disclaimer. *
 * *
 * 2. Redistributions in binary form must reproduce the above copyright *
 * notice, this list of conditions and the following disclaimer in the *
 * documentation and/or other materials provided with the distribution. *
 * *
 * 3. Neither the name of the copyright holder nor the names of its *
 * contributors may be used to endorse or promote products derived from *
 * this software without specific prior written permission. *
 * *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A *
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER *
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
 * *
\*===========================================================================*/


#include "GPUCacheOptimizer.hh"
#include <cassert>
#include <cmath>
#include <vector>
#include <cstring>

//=============================================================================

namespace ACG
{

//=============================================================================

GPUCacheOptimizer::GPUCacheOptimizer( unsigned int NumTris, unsigned int NumVerts,
                                      unsigned int IndexSize, const void* pIndices) :
  m_NumVerts(NumVerts),
  m_NumTris(NumTris),
  m_IndexSize(IndexSize),
  m_pIndices(pIndices),
  m_NumTransformations(0)
{
  m_pTriMap = new unsigned int[m_NumTris];
}

GPUCacheOptimizer::~GPUCacheOptimizer(void)
{
  delete [] m_pTriMap;
}

//=============================================================================

unsigned int GPUCacheOptimizer::GetIndex(unsigned int i) const
{
  assert(i < m_NumTris * 3);

  return GetIndex(i, m_IndexSize, m_pIndices);
}

unsigned int GPUCacheOptimizer::GetIndex(unsigned int i, unsigned int IndexSize, const void* pIB)
{
  switch (IndexSize)
  {
    case 4: return ((const unsigned int*)pIB)[i];
    case 2: return ((const unsigned short*)pIB)[i];
    case 1: return ((const unsigned char*)pIB)[i];
    default:
      assert(IndexSize == 1 || IndexSize == 2 || IndexSize == 4); // unsupported index size
  }
  return 0xFFFFFFFF;
}

void GPUCacheOptimizer::SetIndex(unsigned int i, unsigned int val, unsigned int IndexSize, void* pIB)
{
  switch (IndexSize)
  {
    case 4: ((unsigned int*)pIB)[i] = val; break;
    case 2: ((unsigned short*)pIB)[i] = val; break;
    case 1: ((unsigned char*)pIB)[i] = val; break;
    default:
      assert(IndexSize == 1 || IndexSize == 2 || IndexSize == 4); // unsupported index size
  }
}

//=============================================================================

void GPUCacheOptimizer::WriteIndexBuffer(unsigned int DstIndexSize, void* pDst)
{
  assert(DstIndexSize == 1 || DstIndexSize == 2 || DstIndexSize == 4);
  // TODO: warning log, if DstIndexSize < m_IndexSize

  // support 'in-place' operation via a temporary copy of the source indices
  char* pSrc = (char*)m_pIndices;

  int bTmpCopy = 0;
  if (pDst == pSrc)
  {
    pSrc = new char[m_IndexSize * m_NumTris * 3];
    memcpy(pSrc, m_pIndices, m_IndexSize * m_NumTris * 3);

    bTmpCopy = 1;
  }

  for (unsigned int i = 0; i < m_NumTris; ++i)
  {
    for (int k = 0; k < 3; ++k)
    {
      unsigned int TriVertex = GetIndex(m_pTriMap[i] * 3 + k, m_IndexSize, pSrc);

      // copy remapped tri indices
      SetIndex(i * 3 + k, TriVertex, DstIndexSize, pDst);
    }
  }

  if (bTmpCopy) delete [] pSrc;
}
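
//=============================================================================
// Illustrative usage sketch (not part of the original file): reorder a 32-bit
// index buffer in place with the tipsify optimizer and write the result back.
// The cache size (32), the buffer parameters and the wrapper function are
// assumptions made for this example only.

static void Example_ReorderIndicesInPlace(unsigned int* indices,
                                          unsigned int numTris,
                                          unsigned int numVerts)
{
  // the constructor performs the actual triangle reordering
  GPUCacheOptimizerTipsify opt(32, numTris, numVerts, 4, indices);

  // in-place operation is supported: source and destination may be the same buffer
  opt.WriteIndexBuffer(4, indices);
}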

//=============================================================================

void GPUCacheOptimizer::RemapVertices(unsigned int NumTris, unsigned int NumVerts,
                                      const unsigned int* pVertMap,
                                      unsigned int IndexSize, void* pInOutIndices,
                                      unsigned int VertexStride, void* pInOutVertices)
{
  if (pVertMap && pInOutIndices && pInOutVertices && VertexStride)
  {
    // make a temporary copy of the vertex buffer
    char* pTmpBuf = new char[VertexStride * NumVerts];
    memcpy(pTmpBuf, pInOutVertices, VertexStride * NumVerts);

    char* pVertexOut = (char*)pInOutVertices;

    // apply the mapping to the vertex buffer

    for (unsigned int i = 0; i < NumVerts; ++i)
    {
      // some mapping destinations might be invalid;
      // such a vertex is unused, so skip it
      if (pVertMap[i] < NumVerts)
        memcpy(pVertexOut + pVertMap[i] * VertexStride,
               pTmpBuf + i * VertexStride, VertexStride);
    }

    // apply the mapping to the index buffer

    for (unsigned int i = 0; i < NumTris * 3; ++i)
    {
      // IndexBuffer[i] = VertMap[IndexBuffer[i]]

      unsigned int v = GetIndex(i, IndexSize, pInOutIndices);
      SetIndex(i, pVertMap[v], IndexSize, pInOutIndices);
    }

    delete [] pTmpBuf;
  }
}

//=============================================================================

void GPUCacheOptimizer::OptimizeVertices(unsigned int NumTris, unsigned int NumVerts,
                                         unsigned int IndexSize, const void* pIndices,
                                         unsigned int* pVertMap)
{
  // straightforward algorithm:
  // iterate over the indices and assign the next free vertex location
  // whenever an unvisited vertex is encountered
  unsigned int uCounter = 0; // vertex counter

  memset(pVertMap, 0xFF, NumVerts * sizeof(unsigned int));

  for (unsigned int i = 0; i < NumTris * 3; ++i)
  {
    // read the index with the correct element size (1, 2 or 4 bytes)
    unsigned int vertex = GetIndex(i, IndexSize, pIndices);

    if (pVertMap[vertex] == 0xFFFFFFFF)
      pVertMap[vertex] = uCounter++;
  }
}
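
//=============================================================================
// Illustrative sketch (not part of the original file): after the triangle
// order has been optimized, derive the vertex remap table and apply it to an
// interleaved vertex buffer and its 32-bit index buffer. The buffer layout,
// the element sizes and the wrapper function are assumptions for this example.

static void Example_ReorderVertexBuffer(unsigned int* indices, unsigned int numTris,
                                        void* vertices, unsigned int numVerts,
                                        unsigned int vertexStride)
{
  // old-to-new vertex location mapping, based on first use in the index buffer
  std::vector<unsigned int> vertMap(numVerts);
  GPUCacheOptimizer::OptimizeVertices(numTris, numVerts, 4, indices, &vertMap[0]);

  // move the vertex data to its new locations and rewrite the indices accordingly
  GPUCacheOptimizer::RemapVertices(numTris, numVerts, &vertMap[0],
                                   4, indices, vertexStride, vertices);
}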

//=============================================================================

unsigned int GPUCacheOptimizer::ComputeNumberOfVertexTransformations(unsigned int VertexCacheSize)
{
  if (m_NumTransformations) return m_NumTransformations;

  unsigned int NumIndices = 3 * m_NumTris;
  if (!NumIndices) return 0;

  // simulate a FIFO vertex cache of the given size
  unsigned int* Cache = new unsigned int[VertexCacheSize];
  unsigned int NumSlotsInUse = 0;
  unsigned int LastSlot = 0;
  m_NumTransformations = 0;

  for (unsigned int i = 0; i < m_NumTris; ++i)
  {
    unsigned int t = m_pTriMap[i];

    // for each vertex of triangle t:
    for (int k = 0; k < 3; ++k)
    {
      unsigned int Idx = GetIndex(t * 3 + k); // vertex index

      // check whether the vertex is already in the simulated cache
      int bInCache = 0;
      for (unsigned int c = 0; c < NumSlotsInUse && !bInCache; ++c)
      {
        if (Cache[c] == Idx) bInCache = 1;
      }

      if (!bInCache)
      {
        // cache miss: the vertex has to be transformed (again)
        ++m_NumTransformations;
        if (NumSlotsInUse < VertexCacheSize)
        {
          Cache[NumSlotsInUse++] = Idx;
          ++LastSlot;
        }
        else
        {
          // cache full: evict in FIFO order
          if (LastSlot == VertexCacheSize) LastSlot = 0;
          Cache[LastSlot++] = Idx;
        }
      }
    }
  }

  delete [] Cache;

  return m_NumTransformations;
}

//=============================================================================

float GPUCacheOptimizer::ComputeACMR(unsigned int VertexCacheSize)
{
  unsigned int NumT = ComputeNumberOfVertexTransformations(VertexCacheSize);
  return float(NumT) / float(m_NumTris);
}

//=============================================================================

float GPUCacheOptimizer::ComputeATVR(unsigned int VertexCacheSize)
{
  unsigned int NumT = ComputeNumberOfVertexTransformations(VertexCacheSize);
  return float(NumT) / float(m_NumVerts);
}
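
//=============================================================================
// Illustrative sketch (not part of the original file): compare the cache
// efficiency of the input triangle order with the tipsify result. The cache
// size of 16, the 32-bit indices and the wrapper function are assumptions.

static void Example_CompareCacheEfficiency(const unsigned int* indices,
                                           unsigned int numTris,
                                           unsigned int numVerts)
{
  // identity triangle map: measures the layout as it is
  GPUCacheEfficiencyTester before(numTris, numVerts, 4, indices);

  // tipsify reordering targeting a 16 entry vertex cache
  GPUCacheOptimizerTipsify after(16, numTris, numVerts, 4, indices);

  // ACMR: vertex transformations per triangle (lower is better)
  float acmrBefore = before.ComputeACMR(16);
  float acmrAfter  = after.ComputeACMR(16);

  // ATVR: vertex transformations per vertex (the optimum is 1.0)
  float atvrAfter  = after.ComputeATVR(16);

  (void)acmrBefore; (void)acmrAfter; (void)atvrAfter;
}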

//=============================================================================

// Forsyth's score function
void GPUCacheOptimizer::Opt_Vertex::FindScore(unsigned int MaxSizeVertexCache)
{
  float fNewScore = -1.0f; // -1 : vertex unused
  if (iNumTrisLeft > 0)
  {
    if (iCachePos < 0) fNewScore = 0.0f; // not in FIFO
    else
    {
      if (iCachePos < 3)
      {
        // the vertex was used in the last triangle => fixed score
        const float LastTriScore = 0.75f;
        fNewScore = LastTriScore;
      }
      else
      {
        const float CacheDecayPower = 1.5f;

        // check for cache_pos < MaxSizeCachePos here..
        // Points for being high in the cache.
        const float Scaler = 1.0f / (MaxSizeVertexCache - 3);
        fNewScore = 1.0f - ( iCachePos - 3 ) * Scaler;
        fNewScore = powf( fNewScore, CacheDecayPower);
      }
    }

    // Bonus points for having a low number of tris still to
    // use the vert, so we get rid of lone verts quickly.

    const float ValenceBoostScale = 2.0f;
    const float ValenceBoostPower = 0.5f;

    float ValenceBoost = powf( float(iNumTrisLeft), -float(ValenceBoostPower));
    fNewScore += ValenceBoostScale * ValenceBoost;
  }

  fScore = fNewScore;
}
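
// Worked example (for orientation, not part of the original file): with
// MaxSizeVertexCache = 32, a vertex at iCachePos = 10 with iNumTrisLeft = 2
// scores powf(1 - 7/29, 1.5) + 2 * powf(2, -0.5) ~ 0.66 + 1.41 ~ 2.07,
// while a vertex outside the cache with one triangle left scores 0 + 2 * 1 = 2.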

void GPUCacheOptimizer::Opt_Vertex::RemoveTriFromList(unsigned int tri)
{
  for (int k = 0; k < iNumTrisLeft; ++k)
  {
    // replace tri with the last tri in this list
    if (pTris[k] == tri)
    {
      pTris[k] = pTris[iNumTrisLeft-1];
      break;
    }
  }
  // the list is one entry shorter now (tri is assumed to be in the list)
  --iNumTrisLeft;
}

//=============================================================================
// tipsify

GPUCacheOptimizerTipsify::GPUCacheOptimizerTipsify(unsigned int CacheSize, unsigned int NumTris,
                                                   unsigned int NumVerts, unsigned int IndexSize,
                                                   const void *pIndices)
  : GPUCacheOptimizer(NumTris, NumVerts, IndexSize, pIndices)
{
  // optimization notes:
  // - cache unfriendly layout of Opt_Vertex: high read/write access cost to Opt_Vertex data

  if (NumVerts < 3 || !NumTris) return;

  Opt_Vertex* pVerts = new Opt_Vertex[NumVerts];
  Opt_Tris* pTris = new Opt_Tris[NumTris];

  // OPTIMIZATION: memset vs constructor initialization: memset - 400 ms, constructor - 770 ms (at 10 mil vertices)
  memset(pVerts, 0, sizeof(Opt_Vertex) * NumVerts);

  // build adjacency, same start as in the Forsyth optimizer
  for (unsigned int i = 0; i < NumTris; ++i)
  {
    Opt_Tris* pThisTri = pTris + i;

    // copy vertex indices of this tri
    for (unsigned int k = 0; k < 3; ++k)
    {
      const unsigned int idx = GetIndex(i * 3 + k);
      pThisTri->v[k] = idx;

      // count # tris per vertex
      ++pVerts[idx].iNumTrisTotal;
    }
  }

  // OPTIMIZATION: allocate one buffer for the complete vertex adjacency list
  // (instead of allocating a new buffer for each vertex)
  const unsigned int vertexTriAdjBufSize = NumTris * 3;
  unsigned int vertexTriAdjBufOffset = 0;
  unsigned int* vertexTriAdjBuf = new unsigned int[vertexTriAdjBufSize];

  // create list of tris per vertex
  for (unsigned int i = 0; i < NumTris; ++i)
  {
    // add this tri to the per vertex tri list
    for (int k = 0; k < 3; ++k)
    {
      Opt_Vertex* pV = pVerts + pTris[i].v[k];
      if (!pV->pTris)
      {
        pV->pTris = vertexTriAdjBuf + vertexTriAdjBufOffset;
        vertexTriAdjBufOffset += pV->iNumTrisTotal;
      }

      // abuse <iNumTrisLeft> as a temporary up-counter
      // (it automatically sums to iNumTrisTotal, which is exactly what we want)
      pV->pTris[pV->iNumTrisLeft++] = i;

      pV->iCachePos = 0;
    }
  }

  // use the cache_pos of the OptFaces_Vertex as the time stamp

  //===========================================================================
  // OPTIMIZATION:
  // push and pop on DeadEndVertexStack greatly increases processing time
  // -> replace with fixed size ring stack
  // std::vector<unsigned int> DeadEndVertexStack;
  // DeadEndVertexStack.reserve(2048);
  RingStack DeadEndVertexStack(128);

  int f = 0; // arbitrary starting index (vertex)
  int iTimeStamp = CacheSize + 1;
  unsigned int i = 1; // cursor

  unsigned int numTrisAdded = 0;

  std::vector<unsigned int> N; // 1-ring of next candidates
  N.reserve(2048);

  while (f >= 0)
  {
    N.clear();

    // this vertex
    Opt_Vertex* pV = pVerts + f;

    // for each adjacent tri of this vertex
    for (int m = 0; m < pV->iNumTrisTotal; ++m)
    {
      Opt_Tris* pT = pTris + pV->pTris[m];

      if (!pT->bAdded)
      {
        // append to the output triangle order
        m_pTriMap[numTrisAdded++] = pV->pTris[m];

        for (int k = 0; k < 3; ++k)
        {
          // push to cache
          const unsigned int v = pT->v[k];
          // DeadEndVertexStack.push_back(pT->v[k]);
          DeadEndVertexStack.push(v);

          // insert into the 1-ring candidate set
          N.push_back(v);

          Opt_Vertex* adjV = pVerts + v;

          --adjV->iNumTrisLeft;

          if (iTimeStamp - adjV->iCachePos > (int)CacheSize)
            adjV->iCachePos = iTimeStamp++;
        }
        pT->bAdded = 1;
      }
    }

    // select next fanning vertex
    // Get-Next-Vertex
    {
      int n = -1, p = -1; // best candidate and its priority
      for (unsigned int k = 0; k < N.size(); ++k)
      {
        // for each vertex in N
        Opt_Vertex* pCand = pVerts + N[k];

        if (pCand->iNumTrisLeft > 0)
        {
          // note: the paper's pseudo code computes the priority into p directly;
          // a separate variable m is needed here to find the best vertex
          int m = 0;
          if (iTimeStamp - pCand->iCachePos + 2 * pCand->iNumTrisLeft <= (int)CacheSize)
            m = iTimeStamp - pCand->iCachePos;

          if (m > p)
          {
            p = m;
            n = N[k];
          }
        }
      }

      if (n == -1)
      {
        // Skip-Dead-End
        while (DeadEndVertexStack.length() && (n == -1))
        {
          // unsigned int d = DeadEndVertexStack.back();
          // DeadEndVertexStack.pop_back();
          const unsigned int d = DeadEndVertexStack.pop();

          if (pVerts[d].iNumTrisLeft > 0)
            n = d;
        }

        while (i+1 < NumVerts && (n == -1))
        {
          ++i;
          if (pVerts[i].iNumTrisLeft > 0)
            n = i;
        }
      }

      f = n;
    }
  }

  // debugging purpose
  // int capac = N.capacity();
  // capac = DeadEndVertexStack.capacity();

  delete [] vertexTriAdjBuf;

  delete [] pVerts;
  delete [] pTris;
}

//=============================================================================

GPUCacheEfficiencyTester::GPUCacheEfficiencyTester(unsigned int NumTris, unsigned int NumVerts,
                                                   unsigned int IndexSize, const void* pIndices)
  : GPUCacheOptimizer(NumTris, NumVerts, IndexSize, pIndices)
{
  // identity mapping: keep the input triangle order
  for (unsigned int i = 0; i < NumTris; ++i) m_pTriMap[i] = i;
}

//=============================================================================


} // namespace ACG