// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package serializeprofiles // import "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/elasticsearchexporter/internal/serializer/otelserializer/serializeprofiles"

import (
	"fmt"
	"math/rand/v2"
)

// ## Why do we need downsampling?
// For every (virtual) CPU core, the host agent (HA) retrieves 20 stacktrace events per second,
// which are stored as a timeseries in an ES index (name 'profiling-events'). With an increasing
// number of hosts and/or an increasing number of cores, the number of stored events per second
// becomes high quickly. E.g. data from 10000 cores generates 846 million events per day.
// Since users want to drill down into e.g. single hosts and/or single applications, we can't
// reduce the amount of data in advance. Querying such amounts of data is costly, even when using
// highly specialised database backends - costly in terms of I/O and CPU. And this results in
// increased latency - the user may have to wait a long time for query results.
// In order to reduce the costs and to keep the latency as low as possible, we add 'helper'
// indexes with downsampled subsets of the stacktrace events.
//
// ## How does our downsampling approach work?
// The idea is to create downsampled indexes with factors of 5^N (5, 25, 125, 625, ...).
// In the 5^1 index we store 1/5th of the original events, in the 5^2 index we store
// 1/25th of the original events, and so on.
// So each event has a probability of p=1/5=0.2 to also be stored in the next downsampled index.
// Since we aggregate identical stacktrace events by timestamp when reported and stored, we have
// a 'Count' value for each. To be statistically correct, we have to apply p=0.2 to each single
// event independently and not just to the aggregate. We can do so by looping over 'Count' and
// applying p=0.2 on every iteration to generate a new 'Count' value for the next downsampled index.
// We only store aggregates with 'Count' > 0.
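// For example, an aggregate reported with Count=10 undergoes 10 independent trials with p=0.2,
// i.e. its downsampled Count is effectively a Binomial(10, 0.2) draw with an expected value
// of 2; the next level then draws from Binomial(newCount, 0.2), and so on until the Count
// reaches 0.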
//
// At some point we decided that 20k events per query is good enough. With 5^N it means that we
// can possibly end up with up to 5x more events (100k) from an index. As of this writing,
// retrieving and processing 100k events is still fast enough. While in ClickHouse we could
// further downsample on-the-fly to get to 20k, ES currently doesn't allow this (this may change
// in the future).
//
// At query time we have to find the index that has enough data to be statistically sound,
// without having so much data that costs and latency suffer. The code for that is implemented
// on the read path (Kibana profiler plugin) and described there in detail. A simplified sketch
// of this index selection follows the constants below.
//
// ## Example of a query / calculation
// Let's imagine a given query spans a time range of 7 days and would result in 100 million
// events without downsampling. But we only really need 20k events for a good enough result.
// In the 5^1 downsampled index we have 5x less data - this still results in 20 million events.
// Going deeper, we end up in the 5^5 downsampled index with 32k results - 5^4 would give us
// 160k events (too many) and 5^6 would give us 6.4k events (not enough).
// We now read and process all 32k events from the 5^5 index. The counts for any aggregation
// (TopN, Flamegraph, ...) need to be multiplied by 5^5, which gives an estimate of what we
// would have found in the full events index (the non-downsampled index).
//
// ## How deep do we have to downsample?
// The current code assumes an arbitrary upper limit of 100k CPU cores and a query time range
// of 7 days. (Please be aware that we get 20 events per core per second only if the core is
// 100% busy.)
//
// The condition is
//
// (100k * 86400 * 7 * 20) / 5^N in [20k, 100k-1]
//                      ^-- max number of events per second
//                  ^------ number of days
//          ^-------------- seconds per day
//   ^--------------------- number of cores
//
// For N=11 the condition is satisfied with a value of 24772.
// In numbers, the 5^11 downsampled index holds 48828125x fewer entries than the full events table.
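// (As a cross-check of the numbers above: 100000 * 86400 * 7 * 20 = 1.2096e12 events, and
// 1.2096e12 / 5^11 = 1.2096e12 / 48828125 ≈ 24772, which falls into [20k, 100k-1].)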
//
// ## What is the cost of downsampling?
// The additional cost in terms of storage size is
//
// 1/5^1 + 1/5^2 + ... + 1/5^11 = 25%
//
// (a geometric series converging to 1/4). The same goes for the additional CPU cost on the
// write path.
//
// The average benefit on the read/query path depends on the query. But it seems that on average
// a factor of a few hundred to a few thousand in terms of I/O, CPU and latency can be achieved.
const (
	maxEventsIndexes = 11
	samplingFactor   = 5
	samplingRatio    = 1.0 / float64(samplingFactor)

	eventsIndexPrefix = "profiling-events"
)
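
// The sketch below illustrates the read-path index selection described in the comment above:
// given an estimate of how many events the full (non-downsampled) index holds for the queried
// time range, walk down the downsampled indexes while the next level still yields at least the
// wanted number of events. The real selection logic lives in the Kibana profiler plugin; this
// function, its name and its parameters are illustrative assumptions only and are not used
// elsewhere in this package.
func chooseDownsampledIndex(estimatedFullCount, minEventsWanted uint64) (index string, multiplier uint64) {
	index = eventsIndexPrefix // fall back to the full events index
	multiplier = 1
	count := estimatedFullCount

	for _, idx := range eventIndices {
		if count/samplingFactor < minEventsWanted {
			break
		}
		count /= samplingFactor
		multiplier *= samplingFactor
		index = idx
	}

	// Counts read from the chosen index have to be scaled by 'multiplier' to estimate the
	// values of the full events index (e.g. 5^5 = 3125 for the example above).
	return index, multiplier
}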

var eventIndices = initEventIndexes(maxEventsIndexes)

// A fixed seed is used for deterministic tests and development.
// There is no downside in using a fixed seed in production.
var rnd = rand.New(rand.NewPCG(0, 0))

// initEventIndexes initializes eventIndices to avoid calculations for every TraceEvent later.
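// The generated names look like 'profiling-events-5pow01', 'profiling-events-5pow02', ...,
// 'profiling-events-5pow11'.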
func initEventIndexes(count int) []string {
	indices := make([]string, 0, count)

	for i := range count {
		indices = append(indices, fmt.Sprintf("%s-%dpow%02d",
			eventsIndexPrefix, samplingFactor, i+1))
	}

	return indices
}

// IndexDownsampledEvent takes a stacktrace event and pushes it into the downsampled indexes,
// reducing its Count at each level according to samplingRatio.
//
// Each event has a probability of p=1/5=0.2 to go from one index into the next downsampled
// index. Since we aggregate identical stacktrace events by timestamp when reported and stored,
// we have a 'Count' value for each. To be statistically correct, we have to apply p=0.2 to
// each single stacktrace event independently and not just to the aggregate. We can do so by
// looping over 'Count' and applying p=0.2 on every iteration to generate a new 'Count' value
// for the next downsampled index.
// We only store aggregates with 'Count' > 0. If 'Count' becomes 0, we are done and can
// continue with the next stacktrace event.
func IndexDownsampledEvent(event StackTraceEvent, pushData func(any, string, string) error) error {
	for _, index := range eventIndices {
		var count uint16
		for range event.Count {
			// samplingRatio is the probability p=0.2 for an event to be copied into the next
			// downsampled index.
			if rnd.Float64() < samplingRatio {
				count++
			}
		}
		if count == 0 {
			return nil
		}

		// Store the event with its new downsampled count in the downsampled index.
		event.Count = count

		if err := pushData(event, "", index); err != nil {
			return err
		}
	}

	return nil
}
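
// The wiring sketch below shows how IndexDownsampledEvent might be driven for a batch of
// aggregated stacktrace events. The pushData callback is a stand-in assumption for the
// exporter's bulk-indexing hook (only its signature is taken from the function above); both
// the callback and this helper are illustrative and not used elsewhere in this package.
func exampleDownsampleBatch(events []StackTraceEvent) error {
	pushData := func(data any, docID, index string) error {
		// A real implementation would enqueue 'data' for bulk indexing into the
		// Elasticsearch index named by 'index'.
		_, _, _ = data, docID, index
		return nil
	}

	for _, event := range events {
		// Each event is independently downsampled into the 5^1 ... 5^11 indexes.
		if err := IndexDownsampledEvent(event, pushData); err != nil {
			return err
		}
	}

	return nil
}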