...
 
Commits (3)
......@@ -39,14 +39,4 @@
* *
\*===========================================================================*/
#include "Histogram.hh"
namespace ACG {
// Specialization for double-valued histograms: formats the boundary stored
// at position idx using the 'g' format with a precision of 2.
template<>
QString HistogramT<double>::getBoundaryLabel(size_t idx) const
{
    // TODO: choose accuracy based on avg_bin_size_
    const double boundary = bin_boundaries_[idx];
    return QString::number(boundary, 'g', 2);
}
} // namespace ACG
//#include "Histogram.hh"
......@@ -39,8 +39,7 @@
* *
\*===========================================================================*/
#ifndef ACG_HISTOGRAM_HH
#define ACG_HISTOGRAM_HH
#pragma once
#include <vector>
#include <cassert>
......@@ -48,9 +47,11 @@
#include <exception>
#include <algorithm>
#include <type_traits>
#include <map>
#include <QString>
#include "SmartPointer.hh"
#include "../Config/ACGDefines.hh"
namespace ACG {
......@@ -61,6 +62,13 @@ public:
PerBoundary,
};
Histogram() = default;
// Constructs a histogram from already computed data. Both vectors are
// moved into the object: bins becomes bins_ and bin_widths becomes
// bin_widths_. No consistency check between the two is done here.
Histogram(std::vector<size_t> &&bins, std::vector<double> &&bin_widths)
    : bins_(std::move(bins))
    , bin_widths_(std::move(bin_widths))
{
}
virtual ~Histogram() = default;
// Read-only access to the per-bin counts (bins_).
const std::vector<size_t> &getBins() const
{
    return bins_;
}
// Read-only access to the per-bin widths (bin_widths_).
const std::vector<double> &getBinWidths() const
{
    return bin_widths_;
}
......@@ -76,83 +84,50 @@ protected:
};
// We need to be careful with ranges: some differences (e.g. INT_MAX - INT_MIN) do not fit into a signed int,
// so we store bin sizes as doubles. With specialization or some tricks we
// could probably use the next-biggest integer type, but if we are already using
// the biggest integer type, we would have to fall back to double anyway.
// Label text for a signed integer value (QString::number with its default arguments).
inline QString formatValue(int val)
{
    QString text = QString::number(val);
    return text;
}
// Label text for an unsigned integer value (QString::number with its default arguments).
inline QString formatValue(unsigned int val)
{
    QString text = QString::number(val);
    return text;
}
// Label text for a floating point value: 'g' format with a precision of 3.
inline QString formatValue(double val)
{
    const char format = 'g';
    const int precision = 3;
    return QString::number(val, format, precision);
}
// Histogram whose bins are not ranges but individual values: entry i of the
// count vector is paired with entry i of the value vector, and the value is
// what gets shown as the label of that bin. Every bin is given width 1.
template<typename T>
class UnbinnedHistogram : public Histogram
{
public:
    // bin_counts: one count per bin, moved into the Histogram base.
    // bin_values: the value each bin is labelled with, moved into bin_values_.
    // The two vectors are not checked for equal length here.
    UnbinnedHistogram(std::vector<size_t> &&bin_counts,
                      std::vector<T> &&bin_values)
        : Histogram(std::move(bin_counts),
                    // std::move above is only a cast and the base constructor
                    // takes rvalue references, so bin_counts has not been
                    // emptied yet when its size is read here.
                    std::vector<double>(bin_counts.size(), 1.))
        , bin_values_(std::move(bin_values))
    {
    }

    // With unit-width bins the total width equals the number of bins.
    double getTotalWidth() const override
    {
        return static_cast<double>(bins_.size());
    }

    // Labels belong to the bins themselves, not to the boundaries between them.
    LabelType getLabelType() const override
    {
        return LabelType::PerBin;
    }

    // Label of bin idx: the formatted value associated with that bin.
    QString getBinLabel(size_t idx) const override
    {
        const T &value = bin_values_[idx];
        return formatValue(value);
    }

private:
    std::vector<T> bin_values_;
};
template<typename T>
class HistogramT : public Histogram {
public:
HistogramT(const std::vector<int> &histogram,
const std::vector<T> &bin_boundaries,
const std::vector<double> &bin_widths)
HistogramT() = default;
// Builds a histogram from pre-computed data, taking ownership of all three
// vectors.
//
// histogram:      number of samples per bin (becomes bins_).
// bin_boundaries: boundaries of the bins; must hold exactly one element more
//                 than histogram.
// bin_widths:     width of each bin; must have the same size as histogram.
//
// Throws std::runtime_error if the sizes are inconsistent.
HistogramT(std::vector<size_t> &&histogram,
           std::vector<T> &&bin_boundaries,
           std::vector<double> &&bin_widths)
    : Histogram(std::move(histogram), std::move(bin_widths)),
      bin_boundaries_(std::move(bin_boundaries))
{
    if (bins_.size() != bin_widths_.size()
        || bins_.size() + 1 != bin_boundaries_.size()) {
        throw std::runtime_error("Histogram constructor sizes don't match.");
    }

    // The parameters were moved from in the initializer list, so only the
    // members may be read from here on. Re-assigning the members from the
    // parameters would overwrite them with moved-from (typically empty) vectors.
    //
    // The size check above guarantees bin_boundaries_ has at least one element.
    // Convert to double before subtracting so that wide integral ranges
    // cannot overflow T.
    const double range = static_cast<double>(bin_boundaries_.back())
                       - static_cast<double>(bin_boundaries_.front());

    // A histogram without bins keeps an average bin size of 0 instead of 0/0.
    avg_bin_size_ = bins_.empty()
            ? 0.0
            : range / static_cast<double>(bins_.size());
}
// Builds the histogram directly from the values in [begin, end).
//
// The value range [min, max] of the input is split into at most
// min(max_bins, number of values) bins; afterwards every value is counted
// in the bin it falls into. For an empty input the constructor returns
// early without adding any bins or boundaries.
//
// begin, end: iterators over the input values; the iterator's value_type must
//             be a floating point type assignable to T (enforced at compile time).
// max_bins:   upper limit for the number of bins; asserted to be > 0.
template<typename IterT>
HistogramT(IterT begin, IterT end, size_t max_bins)
{
static_assert(std::is_assignable<T&, typename IterT::value_type>::value, "IterT incompatible with T.");
static_assert(std::is_floating_point<typename IterT::value_type>::value, "HistogramT currently only supports floating point values.");
assert(max_bins > 0);
const size_t n = std::distance(begin, end);
// Nothing to bin.
if (n == 0) return;
const auto minmax = std::minmax_element(begin, end);
const T min = *minmax.first;
const T max = *minmax.second;
// The extent of the data is computed in double, independent of T.
const double min_dbl = static_cast<double>(min);
const double range = static_cast<double>(max) - min_dbl;
// Never create more bins than there are values.
const size_t n_bins_max = std::min(max_bins, n);
bin_boundaries_.reserve(n_bins_max + 1);
// The first boundary is the minimum; the loop below adds the inner boundaries.
T last_boundary = min;
bin_boundaries_.push_back(min);
for (size_t i = 1; i < n_bins_max; ++i) {
// Adding range/n_bins to an accumulator might seem more efficient/elegant,
// but might cause numeric issues.
// This multiplication order is bad for huge ranges that cause overflows,
// however I assume tiny ranges are more common than huge values and more
// important to get right. If you disagree, add a case distinction or something better.
T boundary = static_cast<T>(min + (i * range) / n_bins_max);
// avoid zero-sized bins (happens for many ints with values in a small range)
// (the `i == 0` part can never be true here because the loop starts at i = 1)
if (boundary != last_boundary || i == 0) {
bin_boundaries_.push_back(boundary);
bin_widths_.push_back(boundary - last_boundary);
}
last_boundary = boundary;
}
bin_boundaries_.push_back(max); // avoid rounding issues etc by explicitly picking max.
bin_widths_.push_back(max - last_boundary);
// Skipped boundaries may have left unused capacity behind.
bin_boundaries_.shrink_to_fit();
// k + 1 boundaries delimit k bins.
size_t n_bins = bin_boundaries_.size() - 1;
bins_.resize(n_bins);
// note that due to rounding, our bins may have differing sizes, which matters
// if we handle integral types (relative size difference worst case: bin width 1 vs 2).
// Be careful to select the right bin.
std::for_each(begin, end, [&](const T &val) {
// upper_bound yields the first boundary greater than val, i.e. the upper
// boundary of the bin that contains val.
auto it = std::upper_bound(bin_boundaries_.begin(), bin_boundaries_.end(), val);
if (it == bin_boundaries_.end()) --it; // the last value is exactly max!
size_t idx = std::distance(bin_boundaries_.begin(), it);
// idx == 0 would mean val lies below the first boundary (the minimum).
assert(idx > 0);
++bins_[idx - 1];
});
avg_bin_size_ = range / n_bins;
}
const std::vector<T> &getBinBoundaries() const {
......@@ -169,20 +144,125 @@ public:
return LabelType::PerBoundary;
}
QString getBoundaryLabel (size_t idx) const override;
// Label for the bin boundary at position idx, produced by the formatValue()
// overload matching T.
QString getBoundaryLabel(size_t idx) const override
{
    // TODO: for floating point types, choose accuracy depending on bin size
    const T &boundary = bin_boundaries_[idx];
    return formatValue(boundary);
}
private:
std::vector<T> bin_boundaries_;
double avg_bin_size_ = 0.0;
};
template<typename T>
QString HistogramT<T>::getBoundaryLabel(size_t idx) const {
return QString::number(bin_boundaries_[idx]);
std::unique_ptr<Histogram> create_histogram_unbinned(const std::map<T, size_t> &counts)
{
std::vector<T> values;
std::vector<size_t> histogram;
values.reserve(counts.size());
histogram.reserve(counts.size());
for (const auto &entry: counts)
{
values.push_back(entry.first);
histogram.push_back(entry.second);
}
return ptr::make_unique<UnbinnedHistogram<T>>(std::move(histogram), std::move(values));
}
namespace histogram_detail {

// Width (hi - lo) of a bin with integral boundaries, requires lo <= hi.
// The subtraction is done in the corresponding unsigned type, where it
// cannot overflow and yields the exact difference even for cases such as
// INT_MAX - INT_MIN.
template<typename T>
inline double bin_width(T lo, T hi, std::true_type)
{
    using U = typename std::make_unsigned<T>::type;
    return static_cast<double>(static_cast<U>(static_cast<U>(hi) - static_cast<U>(lo)));
}

// Width (hi - lo) for all other boundary types (floating point, bool, ...).
template<typename T>
inline double bin_width(T lo, T hi, std::false_type)
{
    return static_cast<double>(hi - lo);
}

// Picks the overflow-safe variant for integer types. bool is excluded
// because std::make_unsigned is not defined for it.
template<typename T>
inline double bin_width(T lo, T hi)
{
    return bin_width(lo, hi,
                     std::integral_constant<bool,
                             std::is_integral<T>::value
                             && !std::is_same<T, bool>::value>());
}

} // namespace histogram_detail

// Creates a HistogramT<T> whose bins cover the value range [min, max] of the
// given elements.
//
// range:    iterable collection of values; begin(range)/end(range) must be
//           callable on it.
// max_bins: upper limit for the number of bins. At most
//           min(max_bins, number of elements) bins are created; fewer if
//           neighbouring boundaries coincide.
//
// Returns a null pointer for an empty range.
template<typename T, typename Iterable>
std::unique_ptr<Histogram> create_histogram_autorange(const Iterable &range, size_t max_bins = 50)
{
    // We need to be careful with ranges: some differences (e.g. INT_MAX - INT_MIN)
    // do not fit into a signed int, so bin sizes are stored as doubles and are
    // never computed by subtracting two signed T values directly.

    const size_t n = std::distance(begin(range), end(range));
    if (n == 0) return {};

    const auto minmax = std::minmax_element(begin(range), end(range));
    const T min = *minmax.first;
    const T max = *minmax.second;
    const double min_dbl = static_cast<double>(min);
    const double val_range = static_cast<double>(max) - min_dbl;

    // Never create more bins than there are values.
    const size_t n_bins_max = std::min(max_bins, n);

    std::vector<T> bin_boundaries;
    std::vector<double> bin_widths;
    bin_boundaries.reserve(n_bins_max + 1);

    T last_boundary = min;
    bin_boundaries.push_back(min);
    for (size_t i = 1; i < n_bins_max; ++i) {
        // Adding val_range/n_bins to an accumulator might seem more efficient/elegant,
        // but might cause numeric issues.
        // This multiplication order is bad for huge ranges that cause overflows,
        // however I assume tiny ranges are more common than huge values and more
        // important to get right. If you disagree, add a case distinction or something better.
        T boundary = static_cast<T>(min + (i * val_range) / n_bins_max);
        // avoid zero-sized bins (happens for many ints with values in a small range)
        if (boundary != last_boundary) {
            bin_boundaries.push_back(boundary);
            bin_widths.push_back(histogram_detail::bin_width<T>(last_boundary, boundary));
        }
        last_boundary = boundary;
    }
    bin_boundaries.push_back(max); // avoid rounding issues etc by explicitly picking max.
    bin_widths.push_back(histogram_detail::bin_width<T>(last_boundary, max));

    // Skipped boundaries may have left unused capacity behind.
    bin_boundaries.shrink_to_fit();

    // k + 1 boundaries delimit k bins.
    const size_t n_bins = bin_boundaries.size() - 1;
    std::vector<size_t> bins(n_bins);

    // note that due to rounding, our bins may have differing sizes, which matters
    // if we handle integral types (relative size difference worst case: bin width 1 vs 2).
    // Be careful to select the right bin.
    std::for_each(begin(range), end(range), [&](const T &val) {
        // upper_bound yields the first boundary greater than val, i.e. the
        // upper boundary of the bin that contains val.
        auto it = std::upper_bound(bin_boundaries.begin(), bin_boundaries.end(), val);
        if (it == bin_boundaries.end()) --it; // the last value is exactly max!
        const size_t idx = std::distance(bin_boundaries.begin(), it);
        assert(idx > 0);
        ++bins[idx - 1];
    });

    return ptr::make_unique<HistogramT<T>>(std::move(bins), std::move(bin_boundaries), std::move(bin_widths));
}
// Chooses the histogram flavour that fits the data:
//  - if the input contains at most max_bins distinct values, every distinct
//    value gets a bin of its own (create_histogram_unbinned);
//  - otherwise the values are binned by range (create_histogram_autorange).
// The element type is derived from dereferencing begin(range).
//
// Returns a null pointer for an empty range.
template<typename Iterable>
std::unique_ptr<Histogram> create_histogram_auto(const Iterable &range, size_t max_bins = 50)
{
    using Elem = typename std::remove_cv<
            typename std::remove_reference<decltype(*begin(range))>::type
        >::type;

    if (std::distance(begin(range), end(range)) == 0) {
        return {};
    }

    // Count occurrences per distinct value, giving up as soon as there are
    // more distinct values than bins allowed.
    std::map<Elem, size_t> occurrences;
    for (const auto &value : range) {
        ++occurrences[value];
        if (occurrences.size() > max_bins) {
            return create_histogram_autorange<Elem>(range, max_bins);
        }
    }

    return create_histogram_unbinned(occurrences);
}
} // namespace ACG
#endif // ACG_HISTOGRAM_HH