1// SPDX-License-Identifier: GPL-2.0 2// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> 3 4#include <linux/kernel.h> 5#include <linux/percpu.h> 6#include <linux/slab.h> 7#include <linux/static_key.h> 8#include <linux/interrupt.h> 9#include <linux/idr.h> 10#include <linux/irq.h> 11#include <linux/math64.h> 12 13#include <trace/events/irq.h> 14 15#include "internals.h" 16 17DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); 18 19DEFINE_PER_CPU(struct irq_timings, irq_timings); 20 21struct irqt_stat { 22 u64 next_evt; 23 u64 last_ts; 24 u64 variance; 25 u32 avg; 26 u32 nr_samples; 27 int anomalies; 28 int valid; 29}; 30 31static DEFINE_IDR(irqt_stats); 32 33void irq_timings_enable(void) 34{ 35 static_branch_enable(&irq_timing_enabled); 36} 37 38void irq_timings_disable(void) 39{ 40 static_branch_disable(&irq_timing_enabled); 41} 42 43/** 44 * irqs_update - update the irq timing statistics with a new timestamp 45 * 46 * @irqs: an irqt_stat struct pointer 47 * @ts: the new timestamp 48 * 49 * The statistics are computed online, in other words, the code is 50 * designed to compute the statistics on a stream of values rather 51 * than doing multiple passes on the values to compute the average, 52 * then the variance. The integer division introduces a loss of 53 * precision but with an acceptable error margin regarding the results 54 * we would have with the double floating precision: we are dealing 55 * with nanosec, so big numbers, consequently the mantisse is 56 * negligeable, especially when converting the time in usec 57 * afterwards. 58 * 59 * The computation happens at idle time. When the CPU is not idle, the 60 * interrupts' timestamps are stored in the circular buffer, when the 61 * CPU goes idle and this routine is called, all the buffer's values 62 * are injected in the statistical model continuying to extend the 63 * statistics from the previous busy-idle cycle. 64 * 65 * The observations showed a device will trigger a burst of periodic 66 * interrupts followed by one or two peaks of longer time, for 67 * instance when a SD card device flushes its cache, then the periodic 68 * intervals occur again. A one second inactivity period resets the 69 * stats, that gives us the certitude the statistical values won't 70 * exceed 1x10^9, thus the computation won't overflow. 71 * 72 * Basically, the purpose of the algorithm is to watch the periodic 73 * interrupts and eliminate the peaks. 74 * 75 * An interrupt is considered periodically stable if the interval of 76 * its occurences follow the normal distribution, thus the values 77 * comply with: 78 * 79 * avg - 3 x stddev < value < avg + 3 x stddev 80 * 81 * Which can be simplified to: 82 * 83 * -3 x stddev < value - avg < 3 x stddev 84 * 85 * abs(value - avg) < 3 x stddev 86 * 87 * In order to save a costly square root computation, we use the 88 * variance. For the record, stddev = sqrt(variance). The equation 89 * above becomes: 90 * 91 * abs(value - avg) < 3 x sqrt(variance) 92 * 93 * And finally we square it: 94 * 95 * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 96 * 97 * (value - avg) x (value - avg) < 9 x variance 98 * 99 * Statistically speaking, any values out of this interval is 100 * considered as an anomaly and is discarded. However, a normal 101 * distribution appears when the number of samples is 30 (it is the 102 * rule of thumb in statistics, cf. "30 samples" on Internet). When 103 * there are three consecutive anomalies, the statistics are resetted. 104 * 105 */ 106static void irqs_update(struct irqt_stat *irqs, u64 ts) 107{ 108 u64 old_ts = irqs->last_ts; 109 u64 variance = 0; 110 u64 interval; 111 s64 diff; 112 113 /* 114 * The timestamps are absolute time values, we need to compute 115 * the timing interval between two interrupts. 116 */ 117 irqs->last_ts = ts; 118 119 /* 120 * The interval type is u64 in order to deal with the same 121 * type in our computation, that prevent mindfuck issues with 122 * overflow, sign and division. 123 */ 124 interval = ts - old_ts; 125 126 /* 127 * The interrupt triggered more than one second apart, that 128 * ends the sequence as predictible for our purpose. In this 129 * case, assume we have the beginning of a sequence and the 130 * timestamp is the first value. As it is impossible to 131 * predict anything at this point, return. 132 * 133 * Note the first timestamp of the sequence will always fall 134 * in this test because the old_ts is zero. That is what we 135 * want as we need another timestamp to compute an interval. 136 */ 137 if (interval >= NSEC_PER_SEC) { 138 memset(irqs, 0, sizeof(*irqs)); 139 irqs->last_ts = ts; 140 return; 141 } 142 143 /* 144 * Pre-compute the delta with the average as the result is 145 * used several times in this function. 146 */ 147 diff = interval - irqs->avg; 148 149 /* 150 * Increment the number of samples. 151 */ 152 irqs->nr_samples++; 153 154 /* 155 * Online variance divided by the number of elements if there 156 * is more than one sample. Normally the formula is division 157 * by nr_samples - 1 but we assume the number of element will be 158 * more than 32 and dividing by 32 instead of 31 is enough 159 * precise. 160 */ 161 if (likely(irqs->nr_samples > 1)) 162 variance = irqs->variance >> IRQ_TIMINGS_SHIFT; 163 164 /* 165 * The rule of thumb in statistics for the normal distribution 166 * is having at least 30 samples in order to have the model to 167 * apply. Values outside the interval are considered as an 168 * anomaly. 169 */ 170 if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { 171 /* 172 * After three consecutive anomalies, we reset the 173 * stats as it is no longer stable enough. 174 */ 175 if (irqs->anomalies++ >= 3) { 176 memset(irqs, 0, sizeof(*irqs)); 177 irqs->last_ts = ts; 178 return; 179 } 180 } else { 181 /* 182 * The anomalies must be consecutives, so at this 183 * point, we reset the anomalies counter. 184 */ 185 irqs->anomalies = 0; 186 } 187 188 /* 189 * The interrupt is considered stable enough to try to predict 190 * the next event on it. 191 */ 192 irqs->valid = 1; 193 194 /* 195 * Online average algorithm: 196 * 197 * new_average = average + ((value - average) / count) 198 * 199 * The variance computation depends on the new average 200 * to be computed here first. 201 * 202 */ 203 irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); 204 205 /* 206 * Online variance algorithm: 207 * 208 * new_variance = variance + (value - average) x (value - new_average) 209 * 210 * Warning: irqs->avg is updated with the line above, hence 211 * 'interval - irqs->avg' is no longer equal to 'diff' 212 */ 213 irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); 214 215 /* 216 * Update the next event 217 */ 218 irqs->next_evt = ts + irqs->avg; 219} 220 221/** 222 * irq_timings_next_event - Return when the next event is supposed to arrive 223 * 224 * During the last busy cycle, the number of interrupts is incremented 225 * and stored in the irq_timings structure. This information is 226 * necessary to: 227 * 228 * - know if the index in the table wrapped up: 229 * 230 * If more than the array size interrupts happened during the 231 * last busy/idle cycle, the index wrapped up and we have to 232 * begin with the next element in the array which is the last one 233 * in the sequence, otherwise it is a the index 0. 234 * 235 * - have an indication of the interrupts activity on this CPU 236 * (eg. irq/sec) 237 * 238 * The values are 'consumed' after inserting in the statistical model, 239 * thus the count is reinitialized. 240 * 241 * The array of values **must** be browsed in the time direction, the 242 * timestamp must increase between an element and the next one. 243 * 244 * Returns a nanosec time based estimation of the earliest interrupt, 245 * U64_MAX otherwise. 246 */ 247u64 irq_timings_next_event(u64 now) 248{ 249 struct irq_timings *irqts = this_cpu_ptr(&irq_timings); 250 struct irqt_stat *irqs; 251 struct irqt_stat __percpu *s; 252 u64 ts, next_evt = U64_MAX; 253 int i, irq = 0; 254 255 /* 256 * This function must be called with the local irq disabled in 257 * order to prevent the timings circular buffer to be updated 258 * while we are reading it. 259 */ 260 lockdep_assert_irqs_disabled(); 261 262 /* 263 * Number of elements in the circular buffer: If it happens it 264 * was flushed before, then the number of elements could be 265 * smaller than IRQ_TIMINGS_SIZE, so the count is used, 266 * otherwise the array size is used as we wrapped. The index 267 * begins from zero when we did not wrap. That could be done 268 * in a nicer way with the proper circular array structure 269 * type but with the cost of extra computation in the 270 * interrupt handler hot path. We choose efficiency. 271 * 272 * Inject measured irq/timestamp to the statistical model 273 * while decrementing the counter because we consume the data 274 * from our circular buffer. 275 */ 276 for (i = irqts->count & IRQ_TIMINGS_MASK, 277 irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); 278 irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { 279 280 irq = irq_timing_decode(irqts->values[i], &ts); 281 282 s = idr_find(&irqt_stats, irq); 283 if (s) { 284 irqs = this_cpu_ptr(s); 285 irqs_update(irqs, ts); 286 } 287 } 288 289 /* 290 * Look in the list of interrupts' statistics, the earliest 291 * next event. 292 */ 293 idr_for_each_entry(&irqt_stats, s, i) { 294 295 irqs = this_cpu_ptr(s); 296 297 if (!irqs->valid) 298 continue; 299 300 if (irqs->next_evt <= now) { 301 irq = i; 302 next_evt = now; 303 304 /* 305 * This interrupt mustn't use in the future 306 * until new events occur and update the 307 * statistics. 308 */ 309 irqs->valid = 0; 310 break; 311 } 312 313 if (irqs->next_evt < next_evt) { 314 irq = i; 315 next_evt = irqs->next_evt; 316 } 317 } 318 319 return next_evt; 320} 321 322void irq_timings_free(int irq) 323{ 324 struct irqt_stat __percpu *s; 325 326 s = idr_find(&irqt_stats, irq); 327 if (s) { 328 free_percpu(s); 329 idr_remove(&irqt_stats, irq); 330 } 331} 332 333int irq_timings_alloc(int irq) 334{ 335 struct irqt_stat __percpu *s; 336 int id; 337 338 /* 339 * Some platforms can have the same private interrupt per cpu, 340 * so this function may be be called several times with the 341 * same interrupt number. Just bail out in case the per cpu 342 * stat structure is already allocated. 343 */ 344 s = idr_find(&irqt_stats, irq); 345 if (s) 346 return 0; 347 348 s = alloc_percpu(*s); 349 if (!s) 350 return -ENOMEM; 351 352 idr_preload(GFP_KERNEL); 353 id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT); 354 idr_preload_end(); 355 356 if (id < 0) { 357 free_percpu(s); 358 return id; 359 } 360 361 return 0; 362} 363