1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.statistics.descriptive;
19
20 import java.util.Objects;
21 import org.apache.commons.numbers.arrays.Selection;
22
23 /**
24 * Returns the median of the available values.
25 *
26 * <p>For values of length {@code n}, let {@code k = n / 2}:
27 * <ul>
28 * <li>The result is {@code NaN} if {@code n = 0}.</li>
29 * <li>The result is {@code values[k]} if {@code n} is odd.</li>
30 * <li>The result is {@code (values[k - 1] + values[k]) / 2} if {@code n} is even.</li>
31 * </ul>
32 *
33 * <p>This implementation respects the ordering imposed by
34 * {@link Double#compare(double, double)} for {@code NaN} values. If a {@code NaN} occurs
35 * in the selected positions in the fully sorted values then the result is {@code NaN}.
36 *
37 * <p>The {@link NaNPolicy} can be used to change the behaviour on {@code NaN} values.
38 *
39 * <p>Instances of this class are immutable and thread-safe.
40 *
41 * <p><strong>Support for {@code long} arrays</strong>
42 *
43 * <p>The result on {@code long} values can be returned as a {@code double} or
44 * a {@code long} using a {@link StatisticResult}.
45 *
46 * <p>The {@code double} result is the closest representable value
47 * following floating-point rounding rules. This may be outside the
48 * range defined by the minimum and maximum of the input array following
49 * rounding to a 53-bit floating point representation.
50 * For example the median of an array containing only {@link Long#MAX_VALUE}
51 * as a {@code double} is 2<sup>63</sup>, which is the closest representation of
52 * 2<sup>63</sup> - 1.
53 *
54 * <p>The {@code long} result is returned using the nearest whole number.
55 * In the event of ties the result is rounded towards positive infinity.
56 * For example the median of {@code [2, 3]} is 3, and of {@code [-2, -3]} is -2.
57 * This value will always be within the range defined by the minimum and maximum
58 * of the input array. Due to interpolation it may be a value not observed in
59 * the input values.
60 *
61 * <p>If the array length {@code n} is zero the result as a {@code double} is
62 * {@code NaN} and the result as a {@code long} will raise an {@link ArithmeticException}.
63 *
64 * @see #with(NaNPolicy)
65 * @see <a href="https://en.wikipedia.org/wiki/Median">Median (Wikipedia)</a>
66 * @since 1.1
67 */
68 public final class Median {
69 /** Default instance. */
70 private static final Median DEFAULT = new Median(false, NaNPolicy.INCLUDE);
71
72 /** Flag to indicate if the data should be copied. */
73 private final boolean copy;
74 /** NaN policy for floating point data. */
75 private final NaNPolicy nanPolicy;
76 /** Transformer for NaN data. */
77 private final NaNTransformer nanTransformer;
78
79 /**
80 * @param copy Flag to indicate if the data should be copied.
81 * @param nanPolicy NaN policy.
82 */
83 private Median(boolean copy, NaNPolicy nanPolicy) {
84 this.copy = copy;
85 this.nanPolicy = nanPolicy;
86 nanTransformer = NaNTransformers.createNaNTransformer(nanPolicy, copy);
87 }
88
89 /**
90 * Return a new instance with the default options.
91 *
92 * <ul>
93 * <li>{@linkplain #withCopy(boolean) Copy = false}</li>
94 * <li>{@linkplain #with(NaNPolicy) NaN policy = include}</li>
95 * </ul>
96 *
97 * <p>Note: The default options configure for processing in-place and including
98 * {@code NaN} values in the data. This is the most efficient mode and has the
99 * smallest memory consumption.
100 *
101 * @return the median implementation
102 * @see #withCopy(boolean)
103 * @see #with(NaNPolicy)
104 */
105 public static Median withDefaults() {
106 return DEFAULT;
107 }
108
109 /**
110 * Return an instance with the configured copy behaviour. If {@code false} then
111 * the input array will be modified by the call to evaluate the median; otherwise
112 * the computation uses a copy of the data.
113 *
114 * @param v Value.
115 * @return an instance
116 */
117 public Median withCopy(boolean v) {
118 return new Median(v, nanPolicy);
119 }
120
121 /**
122 * Return an instance with the configured {@link NaNPolicy}.
123 *
124 * <p>Note: This implementation respects the ordering imposed by
125 * {@link Double#compare(double, double)} for {@code NaN} values: {@code NaN} is
126 * considered greater than all other values, and all {@code NaN} values are equal. The
127 * {@link NaNPolicy} changes the computation of the statistic in the presence of
128 * {@code NaN} values.
129 *
130 * <ul>
131 * <li>{@link NaNPolicy#INCLUDE}: {@code NaN} values are moved to the end of the data;
132 * the size of the data <em>includes</em> the {@code NaN} values and the median will be
133 * {@code NaN} if any value used for median interpolation is {@code NaN}.</li>
134 * <li>{@link NaNPolicy#EXCLUDE}: {@code NaN} values are moved to the end of the data;
135 * the size of the data <em>excludes</em> the {@code NaN} values and the median will
136 * never be {@code NaN} for non-zero size. If all data are {@code NaN} then the size is zero
137 * and the result is {@code NaN}.</li>
138 * <li>{@link NaNPolicy#ERROR}: An exception is raised if the data contains {@code NaN}
139 * values.</li>
140 * </ul>
141 *
142 * <p>Note that the result is identical for all policies if no {@code NaN} values are present.
143 *
144 * @param v Value.
145 * @return an instance
146 */
147 public Median with(NaNPolicy v) {
148 return new Median(copy, Objects.requireNonNull(v));
149 }
150
151 /**
152 * Evaluate the median.
153 *
154 * <p>Note: This method may partially sort the input values if not configured to
155 * {@link #withCopy(boolean) copy} the input data.
156 *
157 * @param values Values.
158 * @return the median
159 * @throws IllegalArgumentException if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
160 * @see #with(NaNPolicy)
161 */
162 public double evaluate(double[] values) {
163 return compute(values, 0, values.length);
164 }
165
166 /**
167 * Evaluate the median of the specified range.
168 *
169 * <p>Note: This method may partially sort the input values if not configured to
170 * {@link #withCopy(boolean) copy} the input data.
171 *
172 * @param values Values.
173 * @param from Inclusive start of the range.
174 * @param to Exclusive end of the range.
175 * @return the median
176 * @throws IllegalArgumentException if the values contain NaN and the configuration is {@link NaNPolicy#ERROR}
177 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
178 * @see #with(NaNPolicy)
179 * @since 1.2
180 */
181 public double evaluateRange(double[] values, int from, int to) {
182 Statistics.checkFromToIndex(from, to, values.length);
183 return compute(values, from, to);
184 }
185
186 /**
187 * Compute the median of the specified range.
188 *
189 * @param values Values.
190 * @param from Inclusive start of the range.
191 * @param to Exclusive end of the range.
192 * @return the median
193 */
194 private double compute(double[] values, int from, int to) {
195 // Floating-point data handling
196 final int[] bounds = new int[2];
197 final double[] x = nanTransformer.apply(values, from, to, bounds);
198 final int start = bounds[0];
199 final int end = bounds[1];
200 final int n = end - start;
201 // Special cases
202 if (n <= 2) {
203 switch (n) {
204 case 2:
205 // Sorting the array matches the behaviour of Quantile for n==2
206 // Handle NaN and signed zeros
207 if (Double.compare(x[start + 1], x[start]) < 0) {
208 final double t = x[start];
209 x[start] = x[start + 1];
210 x[start + 1] = t;
211 }
212 return Interpolation.mean(x[start], x[start + 1]);
213 case 1:
214 return x[start];
215 default:
216 return Double.NaN;
217 }
218 }
219 // Median index (including the offset)
220 final int m = (start + end) >>> 1;
221 // Odd
222 if ((n & 0x1) == 1) {
223 Selection.select(x, start, end, m);
224 return x[m];
225 }
226 // Even: require (m-1, m)
227 Selection.select(x, start, end, new int[] {m - 1, m});
228 return Interpolation.mean(x[m - 1], x[m]);
229 }
230
231 /**
232 * Evaluate the median.
233 *
234 * <p>Note: This method may partially sort the input values if not configured to
235 * {@link #withCopy(boolean) copy} the input data.
236 *
237 * @param values Values.
238 * @return the median
239 */
240 public double evaluate(int[] values) {
241 return compute(values, 0, values.length);
242 }
243
244 /**
245 * Evaluate the median of the specified range.
246 *
247 * <p>Note: This method may partially sort the input values if not configured to
248 * {@link #withCopy(boolean) copy} the input data.
249 *
250 * @param values Values.
251 * @param from Inclusive start of the range.
252 * @param to Exclusive end of the range.
253 * @return the median
254 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
255 * @since 1.2
256 */
257 public double evaluateRange(int[] values, int from, int to) {
258 Statistics.checkFromToIndex(from, to, values.length);
259 return compute(values, from, to);
260 }
261
262 /**
263 * Compute the median of the specified range.
264 *
265 * @param values Values.
266 * @param from Inclusive start of the range.
267 * @param to Exclusive end of the range.
268 * @return the median
269 */
270 private double compute(int[] values, int from, int to) {
271 final int[] x;
272 final int start;
273 final int end;
274 if (copy) {
275 x = Statistics.copy(values, from, to);
276 start = 0;
277 end = x.length;
278 } else {
279 x = values;
280 start = from;
281 end = to;
282 }
283 final int n = end - start;
284 // Special cases
285 if (n <= 2) {
286 switch (n) {
287 case 2:
288 // Sorting the array matches the behaviour of Quantile for n==2
289 if (x[start + 1] < x[start]) {
290 final int t = x[start];
291 x[start] = x[start + 1];
292 x[start + 1] = t;
293 }
294 return Interpolation.mean(x[start], x[start + 1]);
295 case 1:
296 return x[start];
297 default:
298 return Double.NaN;
299 }
300 }
301 // Median index (including the offset)
302 final int m = (start + end) >>> 1;
303 // Odd
304 if ((n & 0x1) == 1) {
305 Selection.select(x, start, end, m);
306 return x[m];
307 }
308 // Even: require (m-1, m)
309 Selection.select(x, start, end, new int[] {m - 1, m});
310 return Interpolation.mean(x[m - 1], x[m]);
311 }
312
313
314 /**
315 * Evaluate the median.
316 *
317 * <p>If the input length is even the result requires interpolation of two values.
318 * The returned median will interpolate the {@code double} or {@code long} result
319 * on demand. This is more efficient when only one result is required. Consumers
320 * of the result should store the appropriate evaluated value if repeated use is
321 * required.
322 *
323 * <p>Note: This method may partially sort the input values if not configured to
324 * {@link #withCopy(boolean) copy} the input data.
325 *
326 * @param values Values.
327 * @return the median
328 * @since 1.3
329 */
330 public StatisticResult evaluate(long[] values) {
331 return compute(values, 0, values.length);
332 }
333
334 /**
335 * Evaluate the median of the specified range.
336 *
337 * <p>If the input sub-range length is even the result requires interpolation of two values.
338 * The returned median will interpolate the {@code double} or {@code long} result
339 * on demand. This is more efficient when only one result is required. Consumers
340 * of the result should store the appropriate evaluated value if repeated use is
341 * required.
342 *
343 * <p>Note: This method may partially sort the input values if not configured to
344 * {@link #withCopy(boolean) copy} the input data.
345 *
346 * @param values Values.
347 * @param from Inclusive start of the range.
348 * @param to Exclusive end of the range.
349 * @return the median
350 * @throws IndexOutOfBoundsException if the sub-range is out of bounds
351 * @since 1.3
352 */
353 public StatisticResult evaluateRange(long[] values, int from, int to) {
354 Statistics.checkFromToIndex(from, to, values.length);
355 return compute(values, from, to);
356 }
357
358 /**
359 * Compute the median of the specified range.
360 *
361 * @param values Values.
362 * @param from Inclusive start of the range.
363 * @param to Exclusive end of the range.
364 * @return the median
365 */
366 private StatisticResult compute(long[] values, int from, int to) {
367 final long[] x;
368 final int start;
369 final int end;
370 if (copy) {
371 x = Statistics.copy(values, from, to);
372 start = 0;
373 end = x.length;
374 } else {
375 x = values;
376 start = from;
377 end = to;
378 }
379 final int n = end - start;
380 // Special cases
381 if (n <= 2) {
382 switch (n) {
383 case 2:
384 // Sorting the array matches the behaviour of Quantile for n==2
385 if (x[start + 1] < x[start]) {
386 final long t = x[start];
387 x[start] = x[start + 1];
388 x[start + 1] = t;
389 }
390 return Interpolation.mean(x[start], x[start + 1]);
391 case 1:
392 return Statistics.createStatisticResult(x[start]);
393 default:
394 return () -> Double.NaN;
395 }
396 }
397 // Median index (including the offset)
398 final int m = (start + end) >>> 1;
399 // Odd
400 if ((n & 0x1) == 1) {
401 Selection.select(x, start, end, m);
402 return Statistics.createStatisticResult(x[m]);
403 }
404 // Even: require (m-1, m)
405 Selection.select(x, start, end, new int[] {m - 1, m});
406 return Interpolation.mean(x[m - 1], x[m]);
407 }
408 }