1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.statistics.descriptive;
18
19 import java.util.Arrays;
20 import java.util.Objects;
21 import java.util.function.IntToDoubleFunction;
22 import java.util.function.IntToLongFunction;
23 import org.apache.commons.numbers.arrays.Selection;
24
25 /**
26 * Provides quantile computation.
27 *
28 * <p>For values of length {@code n}:
29 * <ul>
30 * <li>The result is {@code NaN} if {@code n = 0}.</li>
31 * <li>The result is {@code values[0]} if {@code n = 1}.</li>
32 * <li>Otherwise the result is computed using the {@link EstimationMethod}.</li>
33 * </ul>
34 *
35 * <p>Computation of multiple quantiles will handle duplicate and unordered
36 * probabilities. Passing ordered probabilities is recommended if the order is already
37 * known as this can improve efficiency; for example using uniform spacing through the
38 * array data, or to identify extreme values from the data such as {@code [0.001, 0.999]}.
39 *
40 * <p>This implementation respects the ordering imposed by
41 * {@link Double#compare(double, double)} for {@code NaN} values. If a {@code NaN} occurs
42 * in the selected positions in the fully sorted values then the result is {@code NaN}.
43 *
44 * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values.
45 *
46 * <p>Instances of this class are immutable and thread-safe.
47 *
48 * <p><strong>Support for {@code long} arrays</strong>
49 *
50 * <p>The result on {@code long} values can be returned as a {@code double} or
51 * a {@code long} using a {@link StatisticResult}.
52 *
53 * <p>The {@code double} result is computed within 1 ULP of the exact result. In some
54 * cases this may be outside the range defined by the minimum and maximum of the input
55 * array following rounding to a 53-bit floating point representation. For example a
56 * quantile of an array containing only {@link Long#MAX_VALUE} as a {@code double} is
57 * 2<sup>63</sup>, which is the closest representation of 2<sup>63</sup> - 1.
58 *
59 * <p>The {@code long} result is returned using the nearest whole number.
60 * In the event of ties the result is rounded towards positive infinity.
61 * This value will always be within the range defined by the minimum and maximum
62 * of the input array. Due to interpolation it may be a value not observed in
63 * the input values.
64 *
65 * <p>Interpolation between two {@code long} values requires extended precision
66 * floating-point arithmetic. This can be avoided using a discontinuous {@link EstimationMethod}.
67 * In this case the {@code long} quantile will be a value observed in the input values.
68 *
69 * <p>If the array length {@code n} is zero the result as a {@code double} is
70 * {@code NaN} and the result as a {@code long} will raise an {@link ArithmeticException}.
71 *
72 * <p>Multiple quantile results required as only one of the primitive values can be converted
73 * to a primitive array using a stream, for example:
74 *
75 * <pre>{@code
76 * long[] values = ...
77 * double[] p = Quantile.probabilities(10);
78 * Quantile q = Quantile.withDefaults();
79 * long[] result = Arrays.stream(q.evaluate(values, p))
80 * .mapToLong(StatisticResult::getAsLong)
81 * .toArray();
82 * }</pre>
83 *
84 * @see #with(NaNPolicy)
85 * @see <a href="https://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a>
86 * @since 1.1
87 */
88 public final class Quantile {
89 /** Message when the probability is not in the range {@code [0, 1]}. */
90 private static final String INVALID_PROBABILITY = "Invalid probability: ";
91 /** Message when no probabilities are provided for the varargs method. */
92 private static final String NO_PROBABILITIES_SPECIFIED = "No probabilities specified";
93 /** Message when the size is not valid. */
94 private static final String INVALID_SIZE = "Invalid size: ";
95 /** Message when the number of probabilities in a range is not valid. */
96 private static final String INVALID_NUMBER_OF_PROBABILITIES = "Invalid number of probabilities: ";
97
98 /** Default instance. Method 8 is recommended by Hyndman and Fan. */
99 private static final Quantile DEFAULT = new Quantile(false, NaNPolicy.INCLUDE, EstimationMethod.HF8);
100
101 /** Flag to indicate if the data should be copied. */
102 private final boolean copy;
103 /** NaN policy for floating point data. */
104 private final NaNPolicy nanPolicy;
105 /** Transformer for NaN data. */
106 private final NaNTransformer nanTransformer;
107 /** Estimation type used to determine the value from the quantile. */
108 private final EstimationMethod estimationType;
109
110 /**
111 * @param copy Flag to indicate if the data should be copied.
112 * @param nanPolicy NaN policy.
113 * @param estimationType Estimation type used to determine the value from the quantile.
114 */
115 private Quantile(boolean copy, NaNPolicy nanPolicy, EstimationMethod estimationType) {
116 this.copy = copy;
117 this.nanPolicy = nanPolicy;
118 this.estimationType = estimationType;
119 nanTransformer = NaNTransformers.createNaNTransformer(nanPolicy, copy);
120 }
121
122 /**
123 * Return a new instance with the default options.
124 *
125 * <ul>
126 * <li>{@linkplain #withCopy(boolean) Copy = false}</li>
127 * <li>{@linkplain #with(NaNPolicy) NaN policy = include}</li>
128 * <li>{@linkplain #with(EstimationMethod) Estimation method = HF8}</li>
129 * </ul>
130 *
131 * <p>Note: The default options configure for processing in-place and including
132 * {@code NaN} values in the data. This is the most efficient mode and has the
133 * smallest memory consumption.
134 *
135 * @return the quantile implementation
136 * @see #withCopy(boolean)
137 * @see #with(NaNPolicy)
138 * @see #with(EstimationMethod)
139 */
140 public static Quantile withDefaults() {
141 return DEFAULT;
142 }
143
144 /**
145 * Return an instance with the configured copy behaviour. If {@code false} then
146 * the input array will be modified by the call to evaluate the quantiles; otherwise
147 * the computation uses a copy of the data.
148 *
149 * @param v Value.
150 * @return an instance
151 */
152 public Quantile withCopy(boolean v) {
153 return new Quantile(v, nanPolicy, estimationType);
154 }
155
156 /**
157 * Return an instance with the configured {@link NaNPolicy}.
158 *
159 * <p>Note: This implementation respects the ordering imposed by
160 * {@link Double#compare(double, double)} for {@code NaN} values: {@code NaN} is
161 * considered greater than all other values, and all {@code NaN} values are equal. The
162 * {@link NaNPolicy} changes the computation of the statistic in the presence of
163 * {@code NaN} values.
164 *
165 * <ul>
166 * <li>{@link NaNPolicy#INCLUDE}: {@code NaN} values are moved to the end of the data;
167 * the size of the data <em>includes</em> the {@code NaN} values and the quantile will be
168 * {@code NaN} if any value used for quantile interpolation is {@code NaN}.</li>
169 * <li>{@link NaNPolicy#EXCLUDE}: {@code NaN} values are moved to the end of the data;
170 * the size of the data <em>excludes</em> the {@code NaN} values and the quantile will
171 * never be {@code NaN} for non-zero size. If all data are {@code NaN} then the size is zero
172 * and the result is {@code NaN}.</li>
173 * <li>{@link NaNPolicy#ERROR}: An exception is raised if the data contains {@code NaN}
174 * values.</li>
175 * </ul>
176 *
177 * <p>Note that the result is identical for all policies if no {@code NaN} values are present.
178 *
179 * @param v Value.
180 * @return an instance
181 */
182 public Quantile with(NaNPolicy v) {
183 return new Quantile(copy, Objects.requireNonNull(v), estimationType);
184 }
185
186 /**
187 * Return an instance with the configured {@link EstimationMethod}.
188 *
189 * @param v Value.
190 * @return an instance
191 */
192 public Quantile with(EstimationMethod v) {
193 return new Quantile(copy, nanPolicy, Objects.requireNonNull(v));
194 }
195
196 /**
197 * Generate {@code n} evenly spaced probabilities in the range {@code [0, 1]}.
198 *
199 * <pre>
200 * 1/(n + 1), 2/(n + 1), ..., n/(n + 1)
201 * </pre>
202 *
203 * @param n Number of probabilities.
204 * @return the probabilities
205 * @throws IllegalArgumentException if {@code n < 1}
206 */
207 public static double[] probabilities(int n) {
208 checkNumberOfProbabilities(n);
209 final double c1 = n + 1.0;
210 final double[] p = new double[n];
211 for (int i = 0; i < n; i++) {
212 p[i] = (i + 1.0) / c1;
213 }
214 return p;
215 }
216
217 /**
218 * Generate {@code n} evenly spaced probabilities in the range {@code [p1, p2]}.
219 *
220 * <pre>
221 * w = p2 - p1
222 * p1 + w/(n + 1), p1 + 2w/(n + 1), ..., p1 + nw/(n + 1)
223 * </pre>
224 *
225 * @param n Number of probabilities.
226 * @param p1 Lower probability.
227 * @param p2 Upper probability.
228 * @return the probabilities
229 * @throws IllegalArgumentException if {@code n < 1}; if the probabilities are not in the
230 * range {@code [0, 1]}; or {@code p2 <= p1}.
231 */
232 public static double[] probabilities(int n, double p1, double p2) {
233 checkProbability(p1);
234 checkProbability(p2);
235 if (p2 <= p1) {
236 throw new IllegalArgumentException("Invalid range: [" + p1 + ", " + p2 + "]");
237 }
238 final double[] p = probabilities(n);
239 for (int i = 0; i < n; i++) {
240 p[i] = (1 - p[i]) * p1 + p[i] * p2;
241 }
242 return p;
243 }
244
245 /**
246 * Evaluate the {@code p}-th quantile of the values.
247 *
248 * <p>Note: This method may partially sort the input values if not configured to
249 * {@link #withCopy(boolean) copy} the input data.
250 *
251 * <p><strong>Performance</strong>
252 *
253 * <p>It is not recommended to use this method for repeat calls for different quantiles
254 * within the same values. The {@link #evaluate(double[], double...)} method should be used
255 * which provides better performance.
256 *
257 * @param values Values.
258 * @param p Probability for the quantile to compute.
259 * @return the quantile
260 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]};
261 * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
262 * @see #evaluate(double[], double...)
263 * @see #with(NaNPolicy)
264 */
265 public double evaluate(double[] values, double p) {
266 return compute(values, 0, values.length, p);
267 }
268
269 /**
270 * Evaluate the {@code p}-th quantile of the specified range of values.
271 *
272 * <p>Note: This method may partially sort the input values if not configured to
273 * {@link #withCopy(boolean) copy} the input data.
274 *
275 * <p><strong>Performance</strong>
276 *
277 * <p>It is not recommended to use this method for repeat calls for different quantiles
278 * within the same values. The {@link #evaluateRange(double[], int, int, double...)} method should be used
279 * which provides better performance.
280 *
281 * @param values Values.
282 * @param from Inclusive start of the range.
283 * @param to Exclusive end of the range.
284 * @param p Probability for the quantile to compute.
285 * @return the quantile
286 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]};
287 * or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
288 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
289 * @see #evaluateRange(double[], int, int, double...)
290 * @see #with(NaNPolicy)
291 * @since 1.2
292 */
293 public double evaluateRange(double[] values, int from, int to, double p) {
294 Statistics.checkFromToIndex(from, to, values.length);
295 return compute(values, from, to, p);
296 }
297
298 /**
299 * Compute the {@code p}-th quantile of the specified range of values.
300 *
301 * @param values Values.
302 * @param from Inclusive start of the range.
303 * @param to Exclusive end of the range.
304 * @param p Probability for the quantile to compute.
305 * @return the quantile
306 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
307 */
308 private double compute(double[] values, int from, int to, double p) {
309 checkProbability(p);
310 // Floating-point data handling
311 final int[] bounds = new int[2];
312 final double[] x = nanTransformer.apply(values, from, to, bounds);
313 final int start = bounds[0];
314 final int end = bounds[1];
315 final int n = end - start;
316 // Special cases
317 if (n <= 1) {
318 return n == 0 ? Double.NaN : x[start];
319 }
320
321 final double pos = estimationType.index(p, n);
322 final int ip = (int) pos;
323 final int i = start + ip;
324
325 // Partition and compute
326 if (pos > ip) {
327 Selection.select(x, start, end, new int[] {i, i + 1});
328 return Interpolation.interpolate(x[i], x[i + 1], pos - ip);
329 }
330 Selection.select(x, start, end, i);
331 return x[i];
332 }
333
334 /**
335 * Evaluate the {@code p}-th quantiles of the values.
336 *
337 * <p>Note: This method may partially sort the input values if not configured to
338 * {@link #withCopy(boolean) copy} the input data.
339 *
340 * @param values Values.
341 * @param p Probabilities for the quantiles to compute.
342 * @return the quantiles
343 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
344 * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
345 * @see #with(NaNPolicy)
346 */
347 public double[] evaluate(double[] values, double... p) {
348 return compute(values, 0, values.length, p);
349 }
350
351 /**
352 * Evaluate the {@code p}-th quantiles of the specified range of values.
353 *
354 * <p>Note: This method may partially sort the input values if not configured to
355 * {@link #withCopy(boolean) copy} the input data.
356 *
357 * @param values Values.
358 * @param from Inclusive start of the range.
359 * @param to Exclusive end of the range.
360 * @param p Probabilities for the quantiles to compute.
361 * @return the quantiles
362 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
363 * no probabilities are specified; or if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
364 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
365 * @see #with(NaNPolicy)
366 * @since 1.2
367 */
368 public double[] evaluateRange(double[] values, int from, int to, double... p) {
369 Statistics.checkFromToIndex(from, to, values.length);
370 return compute(values, from, to, p);
371 }
372
373 /**
374 * Compute the {@code p}-th quantiles of the specified range of values.
375 *
376 * @param values Values.
377 * @param from Inclusive start of the range.
378 * @param to Exclusive end of the range.
379 * @param p Probabilities for the quantiles to compute.
380 * @return the quantiles
381 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
382 * or no probabilities are specified.
383 */
384 private double[] compute(double[] values, int from, int to, double... p) {
385 checkProbabilities(p);
386 // Floating-point data handling
387 final int[] bounds = new int[2];
388 final double[] x = nanTransformer.apply(values, from, to, bounds);
389 final int start = bounds[0];
390 final int end = bounds[1];
391 final int n = end - start;
392 // Special cases
393 final double[] q = new double[p.length];
394 if (n <= 1) {
395 Arrays.fill(q, n == 0 ? Double.NaN : x[start]);
396 return q;
397 }
398
399 // Collect interpolation positions. We use the output q as storage.
400 final int[] indices = computeIndices(n, p, q, start);
401
402 // Partition
403 Selection.select(x, start, end, indices);
404
405 // Compute
406 for (int k = 0; k < p.length; k++) {
407 // ip in [0, n); i in [start, end)
408 final int ip = (int) q[k];
409 final int i = start + ip;
410 if (q[k] > ip) {
411 q[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip);
412 } else {
413 q[k] = x[i];
414 }
415 }
416 return q;
417 }
418
419 /**
420 * Evaluate the {@code p}-th quantile of the values.
421 *
422 * <p>Note: This method may partially sort the input values if not configured to
423 * {@link #withCopy(boolean) copy} the input data.
424 *
425 * <p><strong>Performance</strong>
426 *
427 * <p>It is not recommended to use this method for repeat calls for different quantiles
428 * within the same values. The {@link #evaluate(int[], double...)} method should be used
429 * which provides better performance.
430 *
431 * @param values Values.
432 * @param p Probability for the quantile to compute.
433 * @return the quantile
434 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
435 * @see #evaluate(int[], double...)
436 */
437 public double evaluate(int[] values, double p) {
438 return compute(values, 0, values.length, p);
439 }
440
441 /**
442 * Evaluate the {@code p}-th quantile of the specified range of values.
443 *
444 * <p>Note: This method may partially sort the input values if not configured to
445 * {@link #withCopy(boolean) copy} the input data.
446 *
447 * <p><strong>Performance</strong>
448 *
449 * <p>It is not recommended to use this method for repeat calls for different quantiles
450 * within the same values. The {@link #evaluateRange(int[], int, int, double...)} method should be used
451 * which provides better performance.
452 *
453 * @param values Values.
454 * @param from Inclusive start of the range.
455 * @param to Exclusive end of the range.
456 * @param p Probability for the quantile to compute.
457 * @return the quantile
458 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
459 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
460 * @see #evaluateRange(int[], int, int, double...)
461 * @since 1.2
462 */
463 public double evaluateRange(int[] values, int from, int to, double p) {
464 Statistics.checkFromToIndex(from, to, values.length);
465 return compute(values, from, to, p);
466 }
467
468 /**
469 * Compute the {@code p}-th quantile of the specified range of values.
470 *
471 * @param values Values.
472 * @param from Inclusive start of the range.
473 * @param to Exclusive end of the range.
474 * @param p Probability for the quantile to compute.
475 * @return the quantile
476 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
477 */
478 private double compute(int[] values, int from, int to, double p) {
479 checkProbability(p);
480 final int n = to - from;
481 // Special cases
482 if (n <= 1) {
483 return n == 0 ? Double.NaN : values[from];
484 }
485
486 // Create the range
487 final int[] x;
488 final int start;
489 final int end;
490 if (copy) {
491 x = Statistics.copy(values, from, to);
492 start = 0;
493 end = n;
494 } else {
495 x = values;
496 start = from;
497 end = to;
498 }
499
500 final double pos = estimationType.index(p, n);
501 final int ip = (int) pos;
502 final int i = start + ip;
503
504 // Partition and compute
505 if (pos > ip) {
506 Selection.select(x, start, end, new int[] {i, i + 1});
507 return Interpolation.interpolate((double) x[i], (double) x[i + 1], pos - ip);
508 }
509 Selection.select(x, start, end, i);
510 return x[i];
511 }
512
513 /**
514 * Evaluate the {@code p}-th quantiles of the values.
515 *
516 * <p>Note: This method may partially sort the input values if not configured to
517 * {@link #withCopy(boolean) copy} the input data.
518 *
519 * @param values Values.
520 * @param p Probabilities for the quantiles to compute.
521 * @return the quantiles
522 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
523 * or no probabilities are specified.
524 */
525 public double[] evaluate(int[] values, double... p) {
526 return compute(values, 0, values.length, p);
527 }
528
529 /**
530 * Evaluate the {@code p}-th quantiles of the specified range of values..
531 *
532 * <p>Note: This method may partially sort the input values if not configured to
533 * {@link #withCopy(boolean) copy} the input data.
534 *
535 * @param values Values.
536 * @param from Inclusive start of the range.
537 * @param to Exclusive end of the range.
538 * @param p Probabilities for the quantiles to compute.
539 * @return the quantiles
540 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
541 * or no probabilities are specified.
542 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
543 * @since 1.2
544 */
545 public double[] evaluateRange(int[] values, int from, int to, double... p) {
546 Statistics.checkFromToIndex(from, to, values.length);
547 return compute(values, from, to, p);
548 }
549
550 /**
551 * Evaluate the {@code p}-th quantiles of the specified range of values..
552 *
553 * <p>Note: This method may partially sort the input values if not configured to
554 * {@link #withCopy(boolean) copy} the input data.
555 *
556 * @param values Values.
557 * @param from Inclusive start of the range.
558 * @param to Exclusive end of the range.
559 * @param p Probabilities for the quantiles to compute.
560 * @return the quantiles
561 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
562 * or no probabilities are specified.
563 */
564 private double[] compute(int[] values, int from, int to, double... p) {
565 checkProbabilities(p);
566 final int n = to - from;
567 // Special cases
568 final double[] q = new double[p.length];
569 if (n <= 1) {
570 Arrays.fill(q, n == 0 ? Double.NaN : values[from]);
571 return q;
572 }
573
574 // Create the range
575 final int[] x;
576 final int start;
577 final int end;
578 if (copy) {
579 x = Statistics.copy(values, from, to);
580 start = 0;
581 end = n;
582 } else {
583 x = values;
584 start = from;
585 end = to;
586 }
587
588 // Collect interpolation positions. We use the output q as storage.
589 final int[] indices = computeIndices(n, p, q, start);
590
591 // Partition
592 Selection.select(x, start, end, indices);
593
594 // Compute
595 for (int k = 0; k < p.length; k++) {
596 // ip in [0, n); i in [start, end)
597 final int ip = (int) q[k];
598 final int i = start + ip;
599 if (q[k] > ip) {
600 q[k] = Interpolation.interpolate((double) x[i], (double) x[i + 1], q[k] - ip);
601 } else {
602 q[k] = x[i];
603 }
604 }
605 return q;
606 }
607
608 /**
609 * Evaluate the {@code p}-th quantile of the values.
610 *
611 * <p>Note: This method may partially sort the input values if not configured to
612 * {@link #withCopy(boolean) copy} the input data.
613 *
614 * <p><strong>Performance</strong>
615 *
616 * <p>It is not recommended to use this method for repeat calls for different
617 * quantiles within the same values. The {@link #evaluate(long[], double...)} method
618 * should be used which provides better performance.
619 *
620 * @param values Values.
621 * @param p Probability for the quantile to compute.
622 * @return the quantile
623 * @throws IllegalArgumentException if the probability {@code p} is not in the range
624 * {@code [0, 1]}
625 * @see #evaluate(long[], double...)
626 * @since 1.3
627 */
628 public StatisticResult evaluate(long[] values, double p) {
629 return compute(values, 0, values.length, p);
630 }
631
632 /**
633 * Evaluate the {@code p}-th quantile of the specified range of values.
634 *
635 * <p>Note: This method may partially sort the input values if not configured to
636 * {@link #withCopy(boolean) copy} the input data.
637 *
638 * <p><strong>Performance</strong>
639 *
640 * <p>It is not recommended to use this method for repeat calls for different quantiles
641 * within the same values. The {@link #evaluateRange(long[], int, int, double...)} method should be used
642 * which provides better performance.
643 *
644 * @param values Values.
645 * @param from Inclusive start of the range.
646 * @param to Exclusive end of the range.
647 * @param p Probability for the quantile to compute.
648 * @return the quantile
649 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
650 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
651 * @see #evaluateRange(long[], int, int, double...)
652 * @since 1.3
653 */
654 public StatisticResult evaluateRange(long[] values, int from, int to, double p) {
655 Statistics.checkFromToIndex(from, to, values.length);
656 return compute(values, from, to, p);
657 }
658
659 /**
660 * Compute the {@code p}-th quantile of the specified range of values.
661 *
662 * @param values Values.
663 * @param from Inclusive start of the range.
664 * @param to Exclusive end of the range.
665 * @param p Probability for the quantile to compute.
666 * @return the quantile
667 * @throws IllegalArgumentException if the probability {@code p} is not in the range {@code [0, 1]}
668 * @since 1.3
669 */
670 private StatisticResult compute(long[] values, int from, int to, double p) {
671 checkProbability(p);
672 final int n = to - from;
673 // Special cases
674 if (n <= 1) {
675 return n == 0 ?
676 () -> Double.NaN :
677 Statistics.createStatisticResult(values[from]);
678 }
679
680 // Create the range
681 final long[] x;
682 final int start;
683 final int end;
684 if (copy) {
685 x = Statistics.copy(values, from, to);
686 start = 0;
687 end = n;
688 } else {
689 x = values;
690 start = from;
691 end = to;
692 }
693
694 final double pos = estimationType.index(p, n);
695 final int ip = (int) pos;
696 final int i = start + ip;
697
698 // Partition and compute
699 if (pos > ip) {
700 Selection.select(x, start, end, new int[] {i, i + 1});
701 return Interpolation.interpolate(x[i], x[i + 1], pos - ip);
702 }
703 Selection.select(x, start, end, i);
704 return Statistics.createStatisticResult(x[i]);
705 }
706
707 /**
708 * Evaluate the {@code p}-th quantiles of the values.
709 *
710 * <p>Note: This method may partially sort the input values if not configured to
711 * {@link #withCopy(boolean) copy} the input data.
712 *
713 * @param values Values.
714 * @param p Probabilities for the quantiles to compute.
715 * @return the quantiles
716 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
717 * or no probabilities are specified.
718 * @since 1.3
719 */
720 public StatisticResult[] evaluate(long[] values, double... p) {
721 return compute(values, 0, values.length, p);
722 }
723
724 /**
725 * Evaluate the {@code p}-th quantiles of the specified range of values..
726 *
727 * <p>Note: This method may partially sort the input values if not configured to
728 * {@link #withCopy(boolean) copy} the input data.
729 *
730 * @param values Values.
731 * @param from Inclusive start of the range.
732 * @param to Exclusive end of the range.
733 * @param p Probabilities for the quantiles to compute.
734 * @return the quantiles
735 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
736 * or no probabilities are specified.
737 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
738 * @since 1.3
739 */
740 public StatisticResult[] evaluateRange(long[] values, int from, int to, double... p) {
741 Statistics.checkFromToIndex(from, to, values.length);
742 return compute(values, from, to, p);
743 }
744
745 /**
746 * Evaluate the {@code p}-th quantiles of the specified range of values..
747 *
748 * <p>Note: This method may partially sort the input values if not configured to
749 * {@link #withCopy(boolean) copy} the input data.
750 *
751 * @param values Values.
752 * @param from Inclusive start of the range.
753 * @param to Exclusive end of the range.
754 * @param p Probabilities for the quantiles to compute.
755 * @return the quantiles
756 * @throws IllegalArgumentException if any probability {@code p} is not in the range {@code [0, 1]};
757 * or no probabilities are specified.
758 */
759 private StatisticResult[] compute(long[] values, int from, int to, double... p) {
760 checkProbabilities(p);
761 final int n = to - from;
762 // Special cases
763 final StatisticResult[] result = new StatisticResult[p.length];
764 if (n <= 1) {
765 final StatisticResult r = n == 0 ?
766 () -> Double.NaN :
767 Statistics.createStatisticResult(values[from]);
768 Arrays.fill(result, r);
769 return result;
770 }
771
772 // Create the range
773 final long[] x;
774 final int start;
775 final int end;
776 if (copy) {
777 x = Statistics.copy(values, from, to);
778 start = 0;
779 end = n;
780 } else {
781 x = values;
782 start = from;
783 end = to;
784 }
785
786 // Collect interpolation positions
787 final double[] q = new double[p.length];
788 final int[] indices = computeIndices(n, p, q, start);
789
790 // Partition
791 Selection.select(x, start, end, indices);
792
793 // Compute
794 for (int k = 0; k < p.length; k++) {
795 // ip in [0, n); i in [start, end)
796 final int ip = (int) q[k];
797 final int i = start + ip;
798 if (q[k] > ip) {
799 result[k] = Interpolation.interpolate(x[i], x[i + 1], q[k] - ip);
800 } else {
801 result[k] = Statistics.createStatisticResult(x[i]);
802 }
803 }
804 return result;
805 }
806
807 /**
808 * Evaluate the {@code p}-th quantile of the sorted values provided as a {@code double}.
809 *
810 * <p>This method can be used when the values of known size are already sorted.
811 * It can be used for primitive types not supported by other evaluation methods.
812 * Numeric types {@code byte}, {@code short} and {@code float} can be converted to
813 * type {@code double} without loss of precision.
814 *
815 * <pre>{@code
816 * short[] x = ...
817 * Arrays.sort(x);
818 * double q = Quantile.withDefaults().evaluate(x.length, i -> x[i], 0.05);
819 * }</pre>
820 *
821 * <p>If the sorted array is a {@code long} datatype this method can lose information
822 * about the precision of the quantiles due to primitive type conversion. Use
823 * the method {@link #evaluateAsLong(int, IntToLongFunction, double)} to compute
824 * the {@code long} quantile result.
825 *
826 * @param n Size of the values.
827 * @param values Values function.
828 * @param p Probability for the quantile to compute.
829 * @return the quantile
830 * @throws IllegalArgumentException if {@code size < 0}; or if the probability {@code p} is
831 * not in the range {@code [0, 1]}.
832 * @see #evaluateAsLong(int, IntToLongFunction, double)
833 */
834 public double evaluate(int n, IntToDoubleFunction values, double p) {
835 checkSize(n);
836 checkProbability(p);
837 // Special case
838 if (n <= 1) {
839 return n == 0 ? Double.NaN : values.applyAsDouble(0);
840 }
841 final double pos = estimationType.index(p, n);
842 final int i = (int) pos;
843 final double v1 = values.applyAsDouble(i);
844 if (pos > i) {
845 final double v2 = values.applyAsDouble(i + 1);
846 return Interpolation.interpolate(v1, v2, pos - i);
847 }
848 return v1;
849 }
850
851 /**
852 * Evaluate the {@code p}-th quantiles of the sorted values provided as a {@code double}.
853 *
854 * <p>This method can be used when the values of known size are already sorted.
855 * It can be used for primitive types not supported by other evaluation methods.
856 * Numeric types {@code byte}, {@code short} and {@code float} can be converted to
857 * type {@code double} without loss of precision.
858 *
859 * <pre>{@code
860 * short[] x = ...
861 * Arrays.sort(x);
862 * double[] q = Quantile.withDefaults().evaluate(x.length, i -> x[i], 0.25, 0.5, 0.75);
863 * }</pre>
864 *
865 * <p>If the sorted array is a {@code long} datatype this method can lose information
866 * about the precision of the quantiles due to primitive type conversion. Use
867 * the method {@link #evaluateAsLong(int, IntToLongFunction, double...)} to compute
868 * the {@code long} quantile result.
869 *
870 * @param n Size of the values.
871 * @param values Values function.
872 * @param p Probabilities for the quantiles to compute.
873 * @return the quantiles
874 * @throws IllegalArgumentException if {@code size < 0}; if any probability {@code p} is
875 * not in the range {@code [0, 1]}; or no probabilities are specified.
876 * @see #evaluateAsLong(int, IntToLongFunction, double...)
877 */
878 public double[] evaluate(int n, IntToDoubleFunction values, double... p) {
879 checkSize(n);
880 checkProbabilities(p);
881 // Special case
882 final double[] q = new double[p.length];
883 if (n <= 1) {
884 Arrays.fill(q, n == 0 ? Double.NaN : values.applyAsDouble(0));
885 return q;
886 }
887 for (int k = 0; k < p.length; k++) {
888 final double pos = estimationType.index(p[k], n);
889 final int i = (int) pos;
890 final double v1 = values.applyAsDouble(i);
891 if (pos > i) {
892 final double v2 = values.applyAsDouble(i + 1);
893 q[k] = Interpolation.interpolate(v1, v2, pos - i);
894 } else {
895 q[k] = v1;
896 }
897 }
898 return q;
899 }
900
901 /**
902 * Evaluate the {@code p}-th quantile of the sorted values provided as a {@code long}.
903 *
904 * <p>This method can be used when the values of known size are already sorted.
905 *
906 * <pre>{@code
907 * long[] x = ...
908 * Arrays.sort(x);
909 * StatisticResult q = Quantile.withDefaults()
910 * .evaluateAsLong(x.length, i -> x[i], 0.05);
911 * }</pre>
912 *
913 * <p>Note: It is not recommended to sort data for use only in the quantile computation.
914 * The {@link #evaluate(long[], double)} method will partially sort the data as required
915 * and in most cases will be more efficient.
916 *
917 * @param n Size of the values.
918 * @param values Values function.
919 * @param p Probability for the quantile to compute.
920 * @return the quantile
921 * @throws IllegalArgumentException if {@code size < 0}; or if the probability {@code p} is
922 * not in the range {@code [0, 1]}.
923 * @since 1.3
924 */
925 public StatisticResult evaluateAsLong(int n, IntToLongFunction values, double p) {
926 checkSize(n);
927 checkProbability(p);
928 // Special case
929 if (n <= 1) {
930 return n == 0 ?
931 () -> Double.NaN :
932 Statistics.createStatisticResult(values.applyAsLong(0));
933 }
934 final double pos = estimationType.index(p, n);
935 final int i = (int) pos;
936 final long v1 = values.applyAsLong(i);
937 if (pos > i) {
938 final long v2 = values.applyAsLong(i + 1);
939 return Interpolation.interpolate(v1, v2, pos - i);
940 }
941 return Statistics.createStatisticResult(v1);
942 }
943
944 /**
945 * Evaluate the {@code p}-th quantiles of the sorted values provided as a {@code long}.
946 *
947 * <p>This method can be used when the values of known size are already sorted.
948 *
949 * <pre>{@code
950 * long[] x = ...
951 * Arrays.sort(x);
952 * StatisticResult[] q = Quantile.withDefaults()
953 * .evaluateAsLong(x.length, i -> x[i], 0.25, 0.5, 0.75);
954 * }</pre>
955 *
956 * <p>Note: It is not recommended to sort data for use only in the quantile computation.
957 * The {@link #evaluate(long[], double...)} method will partially sort the data as required
958 * and in most cases will be more efficient.
959 *
960 * @param n Size of the values.
961 * @param values Values function.
962 * @param p Probabilities for the quantiles to compute.
963 * @return the quantiles
964 * @throws IllegalArgumentException if {@code size < 0}; if any probability {@code p} is
965 * not in the range {@code [0, 1]}; or no probabilities are specified.
966 * @since 1.3
967 */
968 public StatisticResult[] evaluateAsLong(int n, IntToLongFunction values, double... p) {
969 checkSize(n);
970 checkProbabilities(p);
971 // Special case
972 final StatisticResult[] result = new StatisticResult[p.length];
973 if (n <= 1) {
974 final StatisticResult r = n == 0 ?
975 () -> Double.NaN :
976 Statistics.createStatisticResult(values.applyAsLong(0));
977 Arrays.fill(result, r);
978 return result;
979 }
980 for (int k = 0; k < p.length; k++) {
981 final double pos = estimationType.index(p[k], n);
982 final int i = (int) pos;
983 final long v1 = values.applyAsLong(i);
984 if (pos > i) {
985 final long v2 = values.applyAsLong(i + 1);
986 result[k] = Interpolation.interpolate(v1, v2, pos - i);
987 } else {
988 result[k] = Statistics.createStatisticResult(v1);
989 }
990 }
991 return result;
992 }
993
994 /**
995 * Check the probability {@code p} is in the range {@code [0, 1]}.
996 *
997 * @param p Probability for the quantile to compute.
998 * @throws IllegalArgumentException if the probability is not in the range {@code [0, 1]}
999 */
1000 private static void checkProbability(double p) {
1001 // Logic negation will detect NaN
1002 if (!(p >= 0 && p <= 1)) {
1003 throw new IllegalArgumentException(INVALID_PROBABILITY + p);
1004 }
1005 }
1006
1007 /**
1008 * Check the probabilities {@code p} are in the range {@code [0, 1]}.
1009 *
1010 * @param p Probabilities for the quantiles to compute.
1011 * @throws IllegalArgumentException if any probabilities {@code p} is not in the range {@code [0, 1]};
1012 * or no probabilities are specified.
1013 */
1014 private static void checkProbabilities(double... p) {
1015 if (p.length == 0) {
1016 throw new IllegalArgumentException(NO_PROBABILITIES_SPECIFIED);
1017 }
1018 for (final double pp : p) {
1019 checkProbability(pp);
1020 }
1021 }
1022
1023 /**
1024 * Check the {@code size} is positive.
1025 *
1026 * @param n Size of the values.
1027 * @throws IllegalArgumentException if {@code size < 0}
1028 */
1029 private static void checkSize(int n) {
1030 if (n < 0) {
1031 throw new IllegalArgumentException(INVALID_SIZE + n);
1032 }
1033 }
1034
1035 /**
1036 * Check the number of probabilities {@code n} is strictly positive.
1037 *
1038 * @param n Number of probabilities.
1039 * @throws IllegalArgumentException if {@code c < 1}
1040 */
1041 private static void checkNumberOfProbabilities(int n) {
1042 if (n < 1) {
1043 throw new IllegalArgumentException(INVALID_NUMBER_OF_PROBABILITIES + n);
1044 }
1045 }
1046
1047 /**
1048 * Compute the indices required for quantile interpolation.
1049 *
1050 * <p>The zero-based interpolation index in {@code [0, n)} is
1051 * saved into the working array {@code q} for each {@code p}.
1052 *
1053 * <p>The indices are incremented by the provided {@code offset} to allow
1054 * addressing sub-ranges of a larger array.
1055 *
1056 * @param n Size of the data.
1057 * @param p Probabilities for the quantiles to compute.
1058 * @param q Working array for quantiles in {@code [0, n)}.
1059 * @param offset Array offset.
1060 * @return the indices in {@code [offset, offset + n)}
1061 */
1062 private int[] computeIndices(int n, double[] p, double[] q, int offset) {
1063 final int[] indices = new int[p.length << 1];
1064 int count = 0;
1065 for (int k = 0; k < p.length; k++) {
1066 final double pos = estimationType.index(p[k], n);
1067 q[k] = pos;
1068 final int i = (int) pos;
1069 indices[count++] = offset + i;
1070 if (pos > i) {
1071 // Require the next index for interpolation
1072 indices[count++] = offset + i + 1;
1073 }
1074 }
1075 if (count < indices.length) {
1076 return Arrays.copyOf(indices, count);
1077 }
1078 return indices;
1079 }
1080
1081 /**
1082 * Enumerates estimation methods for a quantile. Provides the nine quantile algorithms
1083 * defined in Hyndman and Fan (1996)[1] as {@code HF1 - HF9}.
1084 *
1085 * <p>Samples quantiles are defined by:
1086 *
1087 * <p>\[ Q(p) = (1 - \gamma) x_j + \gamma x_{j+1} \]
1088 *
1089 * <p>where \( \frac{j-m}{n} \leq p \le \frac{j-m+1}{n} \), \( x_j \) is the \( j \)th
1090 * order statistic, \( n \) is the sample size, the value of \( \gamma \) is a function
1091 * of \( j = \lfloor np+m \rfloor \) and \( g = np + m - j \), and \( m \) is a constant
1092 * determined by the sample quantile type.
1093 *
1094 * <p>Note that the real-valued position \( np + m \) is a 1-based index and
1095 * \( j \in [1, n] \). If the real valued position is computed as beyond the lowest or
1096 * highest values in the sample, this implementation will return the minimum or maximum
1097 * observation respectively.
1098 *
1099 * <p>Types 1, 2, and 3 are discontinuous functions of \( p \); types 4 to 9 are continuous
1100 * functions of \( p \).
1101 *
1102 * <p>For the continuous functions, the probability \( p_k \) is provided for the \( k \)-th order
1103 * statistic in size \( n \). Samples quantiles are equivalently obtained to \( Q(p) \) by
1104 * linear interpolation between points \( (p_k, x_k) \) and \( (p_{k+1}, x_{k+1}) \) for
1105 * any \( p_k \leq p \leq p_{k+1} \).
1106 *
1107 * <ol>
1108 * <li>Hyndman and Fan (1996)
1109 * <i>Sample Quantiles in Statistical Packages.</i>
1110 * The American Statistician, 50, 361-365.
1111 * <a href="https://www.jstor.org/stable/2684934">doi.org/10.2307/2684934</a></li>
1112 * <li><a href="https://en.wikipedia.org/wiki/Quantile">Quantile (Wikipedia)</a></li>
1113 * </ol>
1114 */
1115 public enum EstimationMethod {
1116 /**
1117 * Inverse of the empirical distribution function.
1118 *
1119 * <p>\( m = 0 \). \( \gamma = 0 \) if \( g = 0 \), and 1 otherwise.
1120 */
1121 HF1 {
1122 @Override
1123 double position0(double p, int n) {
1124 // position = np + 0. This is 1-based so adjust to 0-based.
1125 return Math.ceil(n * p) - 1;
1126 }
1127 },
1128 /**
1129 * Similar to {@link #HF1} with averaging at discontinuities.
1130 *
1131 * <p>\( m = 0 \). \( \gamma = 0.5 \) if \( g = 0 \), and 1 otherwise.
1132 */
1133 HF2 {
1134 @Override
1135 double position0(double p, int n) {
1136 final double pos = n * p;
1137 // Average at discontinuities
1138 final int j = (int) pos;
1139 final double g = pos - j;
1140 if (g == 0) {
1141 return j - 0.5;
1142 }
1143 // As HF1 : ceil(j + g) - 1
1144 return j;
1145 }
1146 },
1147 /**
1148 * The observation closest to \( np \). Ties are resolved to the nearest even order statistic.
1149 *
1150 * <p>\( m = -1/2 \). \( \gamma = 0 \) if \( g = 0 \) and \( j \) is even, and 1 otherwise.
1151 */
1152 HF3 {
1153 @Override
1154 double position0(double p, int n) {
1155 // Let rint do the work for ties to even
1156 return Math.rint(n * p) - 1;
1157 }
1158 },
1159 /**
1160 * Linear interpolation of the inverse of the empirical CDF.
1161 *
1162 * <p>\( m = 0 \). \( p_k = \frac{k}{n} \).
1163 */
1164 HF4 {
1165 @Override
1166 double position0(double p, int n) {
1167 // np + 0 - 1
1168 return n * p - 1;
1169 }
1170 },
1171 /**
1172 * A piecewise linear function where the knots are the values midway through the steps of
1173 * the empirical CDF. Proposed by Hazen (1914) and popular amongst hydrologists.
1174 *
1175 * <p>\( m = 1/2 \). \( p_k = \frac{k - 1/2}{n} \).
1176 */
1177 HF5 {
1178 @Override
1179 double position0(double p, int n) {
1180 // np + 0.5 - 1
1181 return n * p - 0.5;
1182 }
1183 },
1184 /**
1185 * Linear interpolation of the expectations for the order statistics for the uniform
1186 * distribution on [0,1]. Proposed by Weibull (1939).
1187 *
1188 * <p>\( m = p \). \( p_k = \frac{k}{n + 1} \).
1189 *
1190 * <p>This method computes the quantile as per the Apache Commons Math Percentile
1191 * legacy implementation.
1192 */
1193 HF6 {
1194 @Override
1195 double position0(double p, int n) {
1196 // np + p - 1
1197 return (n + 1) * p - 1;
1198 }
1199 },
1200 /**
1201 * Linear interpolation of the modes for the order statistics for the uniform
1202 * distribution on [0,1]. Proposed by Gumbull (1939).
1203 *
1204 * <p>\( m = 1 - p \). \( p_k = \frac{k - 1}{n - 1} \).
1205 */
1206 HF7 {
1207 @Override
1208 double position0(double p, int n) {
1209 // np + 1-p - 1
1210 return (n - 1) * p;
1211 }
1212 },
1213 /**
1214 * Linear interpolation of the approximate medians for order statistics.
1215 *
1216 * <p>\( m = (p + 1)/3 \). \( p_k = \frac{k - 1/3}{n + 1/3} \).
1217 *
1218 * <p>As per Hyndman and Fan (1996) this approach is most recommended as it provides
1219 * an approximate median-unbiased estimate regardless of distribution.
1220 */
1221 HF8 {
1222 @Override
1223 double position0(double p, int n) {
1224 return n * p + (p + 1) / 3 - 1;
1225 }
1226 },
1227 /**
1228 * Quantile estimates are approximately unbiased for the expected order statistics if
1229 * \( x \) is normally distributed.
1230 *
1231 * <p>\( m = p/4 + 3/8 \). \( p_k = \frac{k - 3/8}{n + 1/4} \).
1232 */
1233 HF9 {
1234 @Override
1235 double position0(double p, int n) {
1236 // np + p/4 + 3/8 - 1
1237 return (n + 0.25) * p - 0.625;
1238 }
1239 };
1240
1241 /**
1242 * Finds the real-valued position for calculation of the quantile.
1243 *
1244 * <p>Return {@code i + g} such that the quantile value from sorted data is:
1245 *
1246 * <p>value = data[i] + g * (data[i+1] - data[i])
1247 *
1248 * <p>Warning: Interpolation should not use {@code data[i+1]} unless {@code g != 0}.
1249 *
1250 * <p>Note: In contrast to the definition of Hyndman and Fan in the class header
1251 * which uses a 1-based position, this is a zero based index. This change is for
1252 * convenience when addressing array positions.
1253 *
1254 * @param p p<sup>th</sup> quantile.
1255 * @param n Size.
1256 * @return a real-valued position (0-based) into the range {@code [0, n)}
1257 */
1258 abstract double position0(double p, int n);
1259
1260 /**
1261 * Finds the index {@code i} and fractional part {@code g} of a real-valued position
1262 * to interpolate the quantile.
1263 *
1264 * <p>Return {@code i + g} such that the quantile value from sorted data is:
1265 *
1266 * <p>value = data[i] + g * (data[i+1] - data[i])
1267 *
1268 * <p>Note: Interpolation should not use {@code data[i+1]} unless {@code g != 0}.
1269 *
1270 * @param p p<sup>th</sup> quantile.
1271 * @param n Size.
1272 * @return index (in [0, n-1])
1273 */
1274 final double index(double p, int n) {
1275 final double pos = position0(p, n);
1276 // Bounds check in [0, n-1]
1277 if (pos < 0) {
1278 return 0;
1279 }
1280 if (pos > n - 1.0) {
1281 return n - 1.0;
1282 }
1283 return pos;
1284 }
1285 }
1286 }