Commit 50508b42 authored by Philip Trettner's avatar Philip Trettner

first version of new tracing API in aion

parent 4e26b73b
#include "Tracer.hh"
#include <chrono>
#include <mutex>
#include <vector>
namespace
{
thread_local size_t profile_chunk_size = 1000 * TRACER_TRACE_SIZE;
thread_local size_t profile_chunk_size_max = 10 * (1 << 20); // 10 MB
thread_local float profile_growth_factor = 1.6f;
struct chunk
{
uint32_t const* data_begin;
uint32_t const* data_end;
chunk* next_chunk;
// TODO: for time sync
// std::chrono::steady_clock::time_point creation_time;
// uint64_t creation_cycles;
// uint32_t creation_cpu;
};
struct thread_chunks
{
std::thread::id thread;
chunk* chunks_start = nullptr;
chunk* chunks_end = nullptr;
};
std::mutex profile_chunks_mutex;
std::vector<thread_chunks*> profile_threads;
thread_local thread_chunks* profile_local_thread = nullptr;
thread_local std::vector<chunk*> profile_local_chunks;
thread_local std::string profile_thread_name;
}
namespace aion
{
namespace tracing
{
void set_thread_chunk_size(size_t size, float growth_factor, size_t max_size)
{
profile_chunk_size = size;
profile_growth_factor = growth_factor;
profile_chunk_size_max = max_size;
}
void set_thread_name(const std::string& name) { profile_thread_name = name; }
uint32_t* detail::alloc_chunk()
{
// register thread
if (profile_local_thread == nullptr)
{
profile_local_thread = new thread_chunks;
profile_local_thread->thread = std::this_thread::get_id();
// LOCKED: add to list of threads
profile_chunks_mutex.lock();
profile_threads.push_back(profile_local_thread);
profile_chunks_mutex.unlock();
}
auto data_end = detail::tdata().curr;
auto d = new uint32_t[profile_chunk_size + TRACER_TRACE_SIZE](); // zero-init
detail::tdata().curr = d;
detail::tdata().end = d + profile_chunk_size;
auto c = new chunk;
c->data_begin = d;
c->data_end = d + profile_chunk_size + TRACER_TRACE_SIZE;
c->next_chunk = nullptr;
profile_local_chunks.push_back(c);
// growth
profile_chunk_size *= profile_growth_factor;
if (profile_chunk_size > profile_chunk_size_max)
profile_chunk_size = profile_chunk_size_max;
// LOCKED: add chunk
{
profile_chunks_mutex.lock();
// add list of chunks
if (profile_local_thread->chunks_end == nullptr)
{
profile_local_thread->chunks_start = c;
profile_local_thread->chunks_end = c;
}
else
{
profile_local_thread->chunks_end->data_end = data_end;
profile_local_thread->chunks_end->next_chunk = c;
profile_local_thread->chunks_end = c;
}
profile_chunks_mutex.unlock();
}
return d;
}
void visit_thread(visitor& v)
{
v.on_thread(std::this_thread::get_id());
auto curr_chunk = 0u;
uint32_t const* curr_datum = profile_local_chunks[0]->data_begin;
// get next word
auto get = [&]() -> uint32_t {
if (curr_datum == profile_local_chunks[curr_chunk]->data_end)
{
++curr_chunk;
curr_datum = profile_local_chunks[curr_chunk]->data_begin;
}
auto val = *curr_datum;
++curr_datum;
return val;
};
while (true)
{
auto v0 = get();
if (v0 == 0x0)
return; // rest is not done
if (v0 != TRACER_END_VALUE)
{
auto v1 = get();
auto loc = (location*)(((uint64_t)v1 << 32uLL) | v0);
auto lo = get();
auto hi = get();
auto cpu = get();
auto cycles = ((uint64_t)hi << 32) | lo;
v.on_trace_start(loc, cycles, cpu);
}
else
{
auto lo = get();
auto hi = get();
auto cpu = get();
auto cycles = ((uint64_t)hi << 32) | lo;
v.on_trace_end(cycles, cpu);
}
}
}
}
}
#pragma once
#include <cstdint>
#include <thread>
#ifdef _MSC_VER
#include <intrin.h>
#endif
/**
* Main Macro: TRACE(...)
*
* See https://godbolt.org/z/qs33MN
*
* Usage:
* int foo() {
* TRACE();
*
* return do_stuff();
* }
*
* Overhead: ~70-105 cycles
*
* TODO: low-overhead version without alloc_chunk check (~80 cycles)
*/
#define TRACE(...) \
(void)__VA_ARGS__ " has to be a string literal"; \
static aion::tracing::location TRACER_MACRO_JOIN(_action_label, __LINE__) = {__FILE__, TRACER_PRETTY_FUNC, "" __VA_ARGS__, __LINE__}; \
aion::tracing::detail::raii_profiler TRACER_MACRO_JOIN(_action_, __LINE__)(&TRACER_MACRO_JOIN(_action_label, __LINE__))
// Implementation:
#define TRACER_TRACE_SIZE 9
#define TRACER_LABEL_MASK 0x80000000
#define TRACER_END_VALUE 0xFFFFFFFF
#define TRACER_MACRO_JOIN_IMPL(arg1, arg2) arg1##arg2
#define TRACER_MACRO_JOIN(arg1, arg2) TRACER_MACRO_JOIN_IMPL(arg1, arg2)
#ifndef _MSC_VER
#define TRACER_FORCEINLINE __attribute__((always_inline))
#define TRACER_NOINLINE __attribute__((noinline))
#define TRACER_PRETTY_FUNC __PRETTY_FUNCTION__
#define TRACER_LIKELY(x) __builtin_expect((x), 1)
#define TRACER_UNLIKELY(x) __builtin_expect((x), 0)
#define TRACER_COLD __attribute__((cold))
#else
#define TRACER_FORCEINLINE __forceinline
#define TRACER_NOINLINE __declspec(noinline)
#define TRACER_PRETTY_FUNC __FUNCTION__
#define TRACER_LIKELY(x) x
#define TRACER_UNLIKELY(x) x
#define TRACER_COLD
#endif
namespace aion
{
namespace tracing
{
/// sets the size of newly allocated chunks
/// is a per-thread setting
void set_thread_chunk_size(size_t size, float growth_factor = 1.6f, size_t max_size = 10 * (1 << 20));
/// user-defined name for this thread
void set_thread_name(std::string const& name);
struct location
{
char const* file;
char const* function;
char const* name;
int line;
};
/// visitor base class, call order is:
/// on_thread
/// -> nested on_trace_start .. on_trace_end
/// traces might not have _end if they are still running
struct visitor
{
virtual void on_thread(std::thread::id thread) {}
virtual void on_trace_start(location* loc, uint64_t cycles, uint32_t cpu) {}
virtual void on_trace_end(uint64_t cycles, uint32_t cpu) {}
};
void visit_thread(visitor& v);
/// writes all trace points to a json file
/// Format:
/// {
/// "locations": [
/// {
/// "file": "...",
/// "function": "...",
/// "name": "...",
/// "line": 42
/// },
/// ...
/// ],
/// "threads": [
/// {
/// "name": "...",
/// "id": 123456,
/// "trace": [
/// {
/// "loc": 0, <-- index into location array
/// "start": 123456789,
/// "end": 123456789,
/// "cpu_start": 7,
/// "cpu_end": 7,
/// "trace": [
/// ... <-- nested trace array
/// ]
/// },
/// ...
/// ]
/// },
/// ...
/// ]
/// }
void write_json(std::string const& filename);
/// writes a csv where all trace points are summarized per-location
void write_summary_csv(std::string const& filename);
/// writes various output formats to a given directory
/// NOTE: does nothing if directory does not exist
void write_dir(std::string const& path);
namespace detail
{
struct thread_data
{
uint32_t* curr;
uint32_t* end; ///< not actually end, has a TRACER_TRACE_SIZE buffer at the end
};
TRACER_COLD TRACER_NOINLINE uint32_t* alloc_chunk();
inline thread_data& tdata()
{
static thread_local thread_data data = {nullptr, nullptr};
return data;
}
struct raii_profiler
{
raii_profiler(location* loc)
{
auto pd = tdata().curr;
if (TRACER_UNLIKELY(pd >= tdata().end)) // alloc new chunk
pd = alloc_chunk();
tdata().curr = pd + 5;
*(location**)pd = loc;
unsigned int core;
#ifdef _MSC_VER
int64_t cc = __rdtscp(&core);
*(int64_t*)(pd + 2) = cc;
#else
unsigned int lo, hi;
__asm__ __volatile__("rdtscp" : "=a"(lo), "=d"(hi), "=c"(core));
pd[2] = lo;
pd[3] = hi;
#endif
pd[4] = core;
}
~raii_profiler()
{
auto pd = tdata().curr;
if (TRACER_UNLIKELY(pd >= tdata().end)) // alloc new chunk
pd = alloc_chunk();
tdata().curr = pd + 4;
unsigned int core;
#ifdef _MSC_VER
int64_t cc = __rdtscp(&core);
pd[0] = TRACER_END_VALUE;
*(int64_t*)(pd + 1) = cc;
#else
unsigned int lo, hi;
__asm__ __volatile__("rdtscp" : "=a"(lo), "=d"(hi), "=c"(core));
pd[0] = TRACER_END_VALUE;
pd[1] = lo;
pd[2] = hi;
#endif
pd[3] = core;
}
};
}
}
}
#include "Tracer.hh"
#include <map>
#include <vector>
#include <fstream>
namespace aion
{
namespace tracing
{
void write_dir(const std::string &path)
{
write_json(path + "/trace.json");
write_summary_csv(path + "/trace.csv");
}
void write_json(const std::string &filename)
{
std::ofstream out(filename);
if (!out.good())
return;
// TODO:
out << "implement me";
}
void write_summary_csv(const std::string &filename)
{
std::ofstream out(filename);
if (!out.good())
return;
struct entry
{
int count = 0;
uint64_t cycles_total = 0;
uint64_t cycles_children = 0;
uint64_t cycles_min = std::numeric_limits<uint64_t>::max();
uint64_t cycles_max = 0;
};
struct stack_entry
{
location *loc;
uint64_t cycles;
uint64_t cycles_children;
};
struct visitor : tracing::visitor
{
std::map<location *, entry> entries;
std::vector<stack_entry> stack;
int depth = 1;
std::thread::id thread;
virtual void on_thread(std::thread::id thread) override
{
this->thread = thread;
stack.clear();
// TODO: close pending actions
}
virtual void on_trace_start(tracing::location *loc, uint64_t cycles, uint32_t cpu) override { stack.push_back({loc, cycles, 0}); }
virtual void on_trace_end(uint64_t cycles, uint32_t cpu) override
{
auto se = stack.back();
stack.pop_back();
auto dt = cycles - se.cycles;
auto &e = entries[se.loc];
e.count++;
e.cycles_total += dt;
e.cycles_children += se.cycles_children;
e.cycles_min = std::min(e.cycles_min, dt);
e.cycles_max = std::max(e.cycles_max, dt);
if (!stack.empty())
stack.back().cycles_children += dt;
}
};
visitor v;
tracing::visit_thread(v);
out << "name,file,function,count,total,avg,min,max,total_body,avg_body\n";
for (auto const &kvp : v.entries)
{
auto l = kvp.first;
auto e = kvp.second;
out << '"' << l->name << '"' << ",";
out << '"' << l->file << ":" << l->line << '"' << ",";
out << '"' << l->function << '"' << ",";
out << e.count << ",";
out << e.cycles_total << ",";
out << e.cycles_total / double(e.count) << ",";
out << e.cycles_min << ",";
out << e.cycles_max << ",";
out << e.cycles_total - e.cycles_children << ",";
out << (e.cycles_total - e.cycles_children) / double(e.count);
out << "\n";
}
}
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment