View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.statistics.descriptive;
19  
20  import java.util.Objects;
21  import org.apache.commons.numbers.arrays.Selection;
22  
23  /**
24   * Returns the median of the available values.
25   *
26   * <p>For values of length {@code n}, let {@code k = n / 2}:
27   * <ul>
28   * <li>The result is {@code NaN} if {@code n = 0}.</li>
29   * <li>The result is {@code values[k]} if {@code n} is odd.</li>
30   * <li>The result is {@code (values[k - 1] + values[k]) / 2} if {@code n} is even.</li>
31   * </ul>
32   *
33   * <p>This implementation respects the ordering imposed by
34   * {@link Double#compare(double, double)} for {@code NaN} values. If a {@code NaN} occurs
35   * in the selected positions in the fully sorted values then the result is {@code NaN}.
36   *
37   * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values.
38   *
39   * <p>Instances of this class are immutable and thread-safe.
40   *
41   * <p><strong>Support for {@code long} arrays</strong>
42   *
43   * <p>The result on {@code long} values can be returned as a {@code double} or
44   * a {@code long} using a {@link StatisticResult}.
45   *
46   * <p>The {@code double} result is the closest representable value
47   * following floating-point rounding rules. This may be outside the
48   * range defined by the minimum and maximum of the input array following
49   * rounding to a 53-bit floating point representation.
50   * For example the median of an array containing only {@link Long#MAX_VALUE}
51   * as a {@code double} is 2<sup>63</sup>, which is the closest representation of
52   * 2<sup>63</sup> - 1.
53   *
54   * <p>The {@code long} result is returned using the nearest whole number.
55   * In the event of ties the result is rounded towards positive infinity.
56   * For example the median of {@code [2, 3]} is 3, and of {@code [-2, -3]} is -2.
57   * This value will always be within the range defined by the minimum and maximum
58   * of the input array. Due to interpolation it may be a value not observed in
59   * the input values.
60   *
61   * <p>If the array length {@code n} is zero the result as a {@code double} is
62   * {@code NaN} and the result as a {@code long} will raise an {@link ArithmeticException}.
63   *
64   * @see #with(NaNPolicy)
65   * @see <a href="https://en.wikipedia.org/wiki/Median">Median (Wikipedia)</a>
66   * @since 1.1
67   */
68  public final class Median {
69      /** Default instance. */
70      private static final Median DEFAULT = new Median(false, NaNPolicy.INCLUDE);
71  
72      /** Flag to indicate if the data should be copied. */
73      private final boolean copy;
74      /** NaN policy for floating point data. */
75      private final NaNPolicy nanPolicy;
76      /** Transformer for NaN data. */
77      private final NaNTransformer nanTransformer;
78  
79      /**
80       * @param copy Flag to indicate if the data should be copied.
81       * @param nanPolicy NaN policy.
82       */
83      private Median(boolean copy, NaNPolicy nanPolicy) {
84          this.copy = copy;
85          this.nanPolicy = nanPolicy;
86          nanTransformer = NaNTransformers.createNaNTransformer(nanPolicy, copy);
87      }
88  
89      /**
90       * Return a new instance with the default options.
91       *
92       * <ul>
93       * <li>{@linkplain #withCopy(boolean) Copy = false}</li>
94       * <li>{@linkplain #with(NaNPolicy) NaN policy = include}</li>
95       * </ul>
96       *
97       * <p>Note: The default options configure for processing in-place and including
98       * {@code NaN} values in the data. This is the most efficient mode and has the
99       * smallest memory consumption.
100      *
101      * @return the median implementation
102      * @see #withCopy(boolean)
103      * @see #with(NaNPolicy)
104      */
105     public static Median withDefaults() {
106         return DEFAULT;
107     }
108 
109     /**
110      * Return an instance with the configured copy behaviour. If {@code false} then
111      * the input array will be modified by the call to evaluate the median; otherwise
112      * the computation uses a copy of the data.
113      *
114      * @param v Value.
115      * @return an instance
116      */
117     public Median withCopy(boolean v) {
118         return new Median(v, nanPolicy);
119     }
120 
121     /**
122      * Return an instance with the configured {@link NaNPolicy}.
123      *
124      * <p>Note: This implementation respects the ordering imposed by
125      * {@link Double#compare(double, double)} for {@code NaN} values: {@code NaN} is
126      * considered greater than all other values, and all {@code NaN} values are equal. The
127      * {@link NaNPolicy} changes the computation of the statistic in the presence of
128      * {@code NaN} values.
129      *
130      * <ul>
131      * <li>{@link NaNPolicy#INCLUDE}: {@code NaN} values are moved to the end of the data;
132      * the size of the data <em>includes</em> the {@code NaN} values and the median will be
133      * {@code NaN} if any value used for median interpolation is {@code NaN}.</li>
134      * <li>{@link NaNPolicy#EXCLUDE}: {@code NaN} values are moved to the end of the data;
135      * the size of the data <em>excludes</em> the {@code NaN} values and the median will
136      * never be {@code NaN} for non-zero size. If all data are {@code NaN} then the size is zero
137      * and the result is {@code NaN}.</li>
138      * <li>{@link NaNPolicy#ERROR}: An exception is raised if the data contains {@code NaN}
139      * values.</li>
140      * </ul>
141      *
142      * <p>Note that the result is identical for all policies if no {@code NaN} values are present.
143      *
144      * @param v Value.
145      * @return an instance
146      */
147     public Median with(NaNPolicy v) {
148         return new Median(copy, Objects.requireNonNull(v));
149     }
150 
151     /**
152      * Evaluate the median.
153      *
154      * <p>Note: This method may partially sort the input values if not configured to
155      * {@link #withCopy(boolean) copy} the input data.
156      *
157      * @param values Values.
158      * @return the median
159      * @throws IllegalArgumentException if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
160      * @see #with(NaNPolicy)
161      */
162     public double evaluate(double[] values) {
163         return compute(values, 0, values.length);
164     }
165 
166     /**
167      * Evaluate the median of the specified range.
168      *
169      * <p>Note: This method may partially sort the input values if not configured to
170      * {@link #withCopy(boolean) copy} the input data.
171      *
172      * @param values Values.
173      * @param from Inclusive start of the range.
174      * @param to Exclusive end of the range.
175      * @return the median
176      * @throws IllegalArgumentException if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
177      * @throws IndexOutOfBoundsException if the sub-range is out of bounds
178      * @see #with(NaNPolicy)
179      * @since 1.2
180      */
181     public double evaluateRange(double[] values, int from, int to) {
182         Statistics.checkFromToIndex(from, to, values.length);
183         return compute(values, from, to);
184     }
185 
186     /**
187      * Compute the median of the specified range.
188      *
189      * @param values Values.
190      * @param from Inclusive start of the range.
191      * @param to Exclusive end of the range.
192      * @return the median
193      */
194     private double compute(double[] values, int from, int to) {
195         // Floating-point data handling
196         final int[] bounds = new int[2];
197         final double[] x = nanTransformer.apply(values, from, to, bounds);
198         final int start = bounds[0];
199         final int end = bounds[1];
200         final int n = end - start;
201         // Special cases
202         if (n <= 2) {
203             switch (n) {
204             case 2:
205                 // Sorting the array matches the behaviour of Quantile for n==2
206                 // Handle NaN and signed zeros
207                 if (Double.compare(x[start + 1], x[start]) < 0) {
208                     final double t = x[start];
209                     x[start] = x[start + 1];
210                     x[start + 1] = t;
211                 }
212                 return Interpolation.mean(x[start], x[start + 1]);
213             case 1:
214                 return x[start];
215             default:
216                 return Double.NaN;
217             }
218         }
219         // Median index (including the offset)
220         final int m = (start + end) >>> 1;
221         // Odd
222         if ((n & 0x1) == 1) {
223             Selection.select(x, start, end, m);
224             return x[m];
225         }
226         // Even: require (m-1, m)
227         Selection.select(x, start, end, new int[] {m - 1, m});
228         return Interpolation.mean(x[m - 1], x[m]);
229     }
230 
231     /**
232      * Evaluate the median.
233      *
234      * <p>Note: This method may partially sort the input values if not configured to
235      * {@link #withCopy(boolean) copy} the input data.
236      *
237      * @param values Values.
238      * @return the median
239      */
240     public double evaluate(int[] values) {
241         return compute(values, 0, values.length);
242     }
243 
244     /**
245      * Evaluate the median of the specified range.
246      *
247      * <p>Note: This method may partially sort the input values if not configured to
248      * {@link #withCopy(boolean) copy} the input data.
249      *
250      * @param values Values.
251      * @param from Inclusive start of the range.
252      * @param to Exclusive end of the range.
253      * @return the median
254      * @throws IndexOutOfBoundsException if the sub-range is out of bounds
255      * @since 1.2
256      */
257     public double evaluateRange(int[] values, int from, int to) {
258         Statistics.checkFromToIndex(from, to, values.length);
259         return compute(values, from, to);
260     }
261 
262     /**
263      * Compute the median of the specified range.
264      *
265      * @param values Values.
266      * @param from Inclusive start of the range.
267      * @param to Exclusive end of the range.
268      * @return the median
269      */
270     private double compute(int[] values, int from, int to) {
271         final int[] x;
272         final int start;
273         final int end;
274         if (copy) {
275             x = Statistics.copy(values, from, to);
276             start = 0;
277             end = x.length;
278         } else {
279             x = values;
280             start = from;
281             end = to;
282         }
283         final int n = end - start;
284         // Special cases
285         if (n <= 2) {
286             switch (n) {
287             case 2:
288                 // Sorting the array matches the behaviour of Quantile for n==2
289                 if (x[start + 1] < x[start]) {
290                     final int t = x[start];
291                     x[start] = x[start + 1];
292                     x[start + 1] = t;
293                 }
294                 return Interpolation.mean(x[start], x[start + 1]);
295             case 1:
296                 return x[start];
297             default:
298                 return Double.NaN;
299             }
300         }
301         // Median index (including the offset)
302         final int m = (start + end) >>> 1;
303         // Odd
304         if ((n & 0x1) == 1) {
305             Selection.select(x, start, end, m);
306             return x[m];
307         }
308         // Even: require (m-1, m)
309         Selection.select(x, start, end, new int[] {m - 1, m});
310         return Interpolation.mean(x[m - 1], x[m]);
311     }
312 
313 
314     /**
315      * Evaluate the median.
316      *
317      * <p>If the input length is even the result requires interpolation of two values.
318      * The returned median will interpolate the {@code double} or {@code long} result
319      * on demand. This is more efficient when only one result is required. Consumers
320      * of the result should store the appropriate evaluated value if repeated use is
321      * required.
322      *
323      * <p>Note: This method may partially sort the input values if not configured to
324      * {@link #withCopy(boolean) copy} the input data.
325      *
326      * @param values Values.
327      * @return the median
328      * @since 1.3
329      */
330     public StatisticResult evaluate(long[] values) {
331         return compute(values, 0, values.length);
332     }
333 
334     /**
335      * Evaluate the median of the specified range.
336      *
337      * <p>If the input sub-range length is even the result requires interpolation of two values.
338      * The returned median will interpolate the {@code double} or {@code long} result
339      * on demand. This is more efficient when only one result is required. Consumers
340      * of the result should store the appropriate evaluated value if repeated use is
341      * required.
342      *
343      * <p>Note: This method may partially sort the input values if not configured to
344      * {@link #withCopy(boolean) copy} the input data.
345      *
346      * @param values Values.
347      * @param from Inclusive start of the range.
348      * @param to Exclusive end of the range.
349      * @return the median
350      * @throws IndexOutOfBoundsException if the sub-range is out of bounds
351      * @since 1.3
352      */
353     public StatisticResult evaluateRange(long[] values, int from, int to) {
354         Statistics.checkFromToIndex(from, to, values.length);
355         return compute(values, from, to);
356     }
357 
358     /**
359      * Compute the median of the specified range.
360      *
361      * @param values Values.
362      * @param from Inclusive start of the range.
363      * @param to Exclusive end of the range.
364      * @return the median
365      */
366     private StatisticResult compute(long[] values, int from, int to) {
367         final long[] x;
368         final int start;
369         final int end;
370         if (copy) {
371             x = Statistics.copy(values, from, to);
372             start = 0;
373             end = x.length;
374         } else {
375             x = values;
376             start = from;
377             end = to;
378         }
379         final int n = end - start;
380         // Special cases
381         if (n <= 2) {
382             switch (n) {
383             case 2:
384                 // Sorting the array matches the behaviour of Quantile for n==2
385                 if (x[start + 1] < x[start]) {
386                     final long t = x[start];
387                     x[start] = x[start + 1];
388                     x[start + 1] = t;
389                 }
390                 return Interpolation.mean(x[start], x[start + 1]);
391             case 1:
392                 return Statistics.createStatisticResult(x[start]);
393             default:
394                 return () -> Double.NaN;
395             }
396         }
397         // Median index (including the offset)
398         final int m = (start + end) >>> 1;
399         // Odd
400         if ((n & 0x1) == 1) {
401             Selection.select(x, start, end, m);
402             return Statistics.createStatisticResult(x[m]);
403         }
404         // Even: require (m-1, m)
405         Selection.select(x, start, end, new int[] {m - 1, m});
406         return Interpolation.mean(x[m - 1], x[m]);
407     }
408 }