001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017 package org.apache.commons.math.stat.descriptive.moment;
018
019 import java.io.Serializable;
020
021 import org.apache.commons.math.exception.NullArgumentException;
022 import org.apache.commons.math.exception.util.LocalizedFormats;
023 import org.apache.commons.math.stat.descriptive.WeightedEvaluation;
024 import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic;
025
026 /**
027 * Computes the variance of the available values. By default, the unbiased
028 * "sample variance" definitional formula is used:
029 * <p>
030 * variance = sum((x_i - mean)^2) / (n - 1) </p>
031 * <p>
032 * where mean is the {@link Mean} and <code>n</code> is the number
033 * of sample observations.</p>
034 * <p>
035 * The definitional formula does not have good numerical properties, so
036 * this implementation does not compute the statistic using the definitional
037 * formula. <ul>
038 * <li> The <code>getResult</code> method computes the variance using
039 * updating formulas based on West's algorithm, as described in
040 * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
041 * J. G. Lewis 1979, <i>Communications of the ACM</i>,
042 * vol. 22 no. 9, pp. 526-531.</a></li>
043 * <li> The <code>evaluate</code> methods leverage the fact that they have the
044 * full array of values in memory to execute a two-pass algorithm.
045 * Specifically, these methods use the "corrected two-pass algorithm" from
046 * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
047 * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
048 * Note that adding values using <code>increment</code> or
049 * <code>incrementAll</code> and then executing <code>getResult</code> will
050 * sometimes give a different, less accurate, result than executing
051 * <code>evaluate</code> with the full array of values. The former approach
052 * should only be used when the full array of values is not available.</p>
053 * <p>
054 * The "population variance" ( sum((x_i - mean)^2) / n ) can also
055 * be computed using this statistic. The <code>isBiasCorrected</code>
056 * property determines whether the "population" or "sample" value is
057 * returned by the <code>evaluate</code> and <code>getResult</code> methods.
058 * To compute population variances, set this property to <code>false.</code>
059 * </p>
060 * <p>
061 * <strong>Note that this implementation is not synchronized.</strong> If
062 * multiple threads access an instance of this class concurrently, and at least
063 * one of the threads invokes the <code>increment()</code> or
064 * <code>clear()</code> method, it must be synchronized externally.</p>
065 *
066 * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $
067 */
068 public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
069
070 /** Serializable version identifier */
071 private static final long serialVersionUID = -9111962718267217978L;
072
073 /** SecondMoment is used in incremental calculation of Variance*/
074 protected SecondMoment moment = null;
075
076 /**
077 * Boolean test to determine if this Variance should also increment
078 * the second moment, this evaluates to false when this Variance is
079 * constructed with an external SecondMoment as a parameter.
080 */
081 protected boolean incMoment = true;
082
083 /**
084 * Determines whether or not bias correction is applied when computing the
085 * value of the statisic. True means that bias is corrected. See
086 * {@link Variance} for details on the formula.
087 */
088 private boolean isBiasCorrected = true;
089
090 /**
091 * Constructs a Variance with default (true) <code>isBiasCorrected</code>
092 * property.
093 */
094 public Variance() {
095 moment = new SecondMoment();
096 }
097
098 /**
099 * Constructs a Variance based on an external second moment.
100 *
101 * @param m2 the SecondMoment (Third or Fourth moments work
102 * here as well.)
103 */
104 public Variance(final SecondMoment m2) {
105 incMoment = false;
106 this.moment = m2;
107 }
108
109 /**
110 * Constructs a Variance with the specified <code>isBiasCorrected</code>
111 * property
112 *
113 * @param isBiasCorrected setting for bias correction - true means
114 * bias will be corrected and is equivalent to using the argumentless
115 * constructor
116 */
117 public Variance(boolean isBiasCorrected) {
118 moment = new SecondMoment();
119 this.isBiasCorrected = isBiasCorrected;
120 }
121
122 /**
123 * Constructs a Variance with the specified <code>isBiasCorrected</code>
124 * property and the supplied external second moment.
125 *
126 * @param isBiasCorrected setting for bias correction - true means
127 * bias will be corrected
128 * @param m2 the SecondMoment (Third or Fourth moments work
129 * here as well.)
130 */
131 public Variance(boolean isBiasCorrected, SecondMoment m2) {
132 incMoment = false;
133 this.moment = m2;
134 this.isBiasCorrected = isBiasCorrected;
135 }
136
137 /**
138 * Copy constructor, creates a new {@code Variance} identical
139 * to the {@code original}
140 *
141 * @param original the {@code Variance} instance to copy
142 */
143 public Variance(Variance original) {
144 copy(original, this);
145 }
146
147 /**
148 * {@inheritDoc}
149 * <p>If all values are available, it is more accurate to use
150 * {@link #evaluate(double[])} rather than adding values one at a time
151 * using this method and then executing {@link #getResult}, since
152 * <code>evaluate</code> leverages the fact that is has the full
153 * list of values together to execute a two-pass algorithm.
154 * See {@link Variance}.</p>
155 */
156 @Override
157 public void increment(final double d) {
158 if (incMoment) {
159 moment.increment(d);
160 }
161 }
162
163 /**
164 * {@inheritDoc}
165 */
166 @Override
167 public double getResult() {
168 if (moment.n == 0) {
169 return Double.NaN;
170 } else if (moment.n == 1) {
171 return 0d;
172 } else {
173 if (isBiasCorrected) {
174 return moment.m2 / (moment.n - 1d);
175 } else {
176 return moment.m2 / (moment.n);
177 }
178 }
179 }
180
181 /**
182 * {@inheritDoc}
183 */
184 public long getN() {
185 return moment.getN();
186 }
187
188 /**
189 * {@inheritDoc}
190 */
191 @Override
192 public void clear() {
193 if (incMoment) {
194 moment.clear();
195 }
196 }
197
198 /**
199 * Returns the variance of the entries in the input array, or
200 * <code>Double.NaN</code> if the array is empty.
201 * <p>
202 * See {@link Variance} for details on the computing algorithm.</p>
203 * <p>
204 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
205 * <p>
206 * Throws <code>IllegalArgumentException</code> if the array is null.</p>
207 * <p>
208 * Does not change the internal state of the statistic.</p>
209 *
210 * @param values the input array
211 * @return the variance of the values or Double.NaN if length = 0
212 * @throws IllegalArgumentException if the array is null
213 */
214 @Override
215 public double evaluate(final double[] values) {
216 if (values == null) {
217 throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
218 }
219 return evaluate(values, 0, values.length);
220 }
221
222 /**
223 * Returns the variance of the entries in the specified portion of
224 * the input array, or <code>Double.NaN</code> if the designated subarray
225 * is empty.
226 * <p>
227 * See {@link Variance} for details on the computing algorithm.</p>
228 * <p>
229 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
230 * <p>
231 * Does not change the internal state of the statistic.</p>
232 * <p>
233 * Throws <code>IllegalArgumentException</code> if the array is null.</p>
234 *
235 * @param values the input array
236 * @param begin index of the first array element to include
237 * @param length the number of elements to include
238 * @return the variance of the values or Double.NaN if length = 0
239 * @throws IllegalArgumentException if the array is null or the array index
240 * parameters are not valid
241 */
242 @Override
243 public double evaluate(final double[] values, final int begin, final int length) {
244
245 double var = Double.NaN;
246
247 if (test(values, begin, length)) {
248 clear();
249 if (length == 1) {
250 var = 0.0;
251 } else if (length > 1) {
252 Mean mean = new Mean();
253 double m = mean.evaluate(values, begin, length);
254 var = evaluate(values, m, begin, length);
255 }
256 }
257 return var;
258 }
259
260 /**
261 * <p>Returns the weighted variance of the entries in the specified portion of
262 * the input array, or <code>Double.NaN</code> if the designated subarray
263 * is empty.</p>
264 * <p>
265 * Uses the formula <pre>
266 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1)
267 * </pre>
268 * where weightedMean is the weighted mean</p>
269 * <p>
270 * This formula will not return the same result as the unweighted variance when all
271 * weights are equal, unless all weights are equal to 1. The formula assumes that
272 * weights are to be treated as "expansion values," as will be the case if for example
273 * the weights represent frequency counts. To normalize weights so that the denominator
274 * in the variance computation equals the length of the input vector minus one, use <pre>
275 * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code>
276 * </pre>
277 * <p>
278 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
279 * <p>
280 * Throws <code>IllegalArgumentException</code> if any of the following are true:
281 * <ul><li>the values array is null</li>
282 * <li>the weights array is null</li>
283 * <li>the weights array does not have the same length as the values array</li>
284 * <li>the weights array contains one or more infinite values</li>
285 * <li>the weights array contains one or more NaN values</li>
286 * <li>the weights array contains negative values</li>
287 * <li>the start and length arguments do not determine a valid array</li>
288 * </ul></p>
289 * <p>
290 * Does not change the internal state of the statistic.</p>
291 * <p>
292 * Throws <code>IllegalArgumentException</code> if either array is null.</p>
293 *
294 * @param values the input array
295 * @param weights the weights array
296 * @param begin index of the first array element to include
297 * @param length the number of elements to include
298 * @return the weighted variance of the values or Double.NaN if length = 0
299 * @throws IllegalArgumentException if the parameters are not valid
300 * @since 2.1
301 */
302 public double evaluate(final double[] values, final double[] weights,
303 final int begin, final int length) {
304
305 double var = Double.NaN;
306
307 if (test(values, weights,begin, length)) {
308 clear();
309 if (length == 1) {
310 var = 0.0;
311 } else if (length > 1) {
312 Mean mean = new Mean();
313 double m = mean.evaluate(values, weights, begin, length);
314 var = evaluate(values, weights, m, begin, length);
315 }
316 }
317 return var;
318 }
319
320 /**
321 * <p>
322 * Returns the weighted variance of the entries in the the input array.</p>
323 * <p>
324 * Uses the formula <pre>
325 * Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1)
326 * </pre>
327 * where weightedMean is the weighted mean</p>
328 * <p>
329 * This formula will not return the same result as the unweighted variance when all
330 * weights are equal, unless all weights are equal to 1. The formula assumes that
331 * weights are to be treated as "expansion values," as will be the case if for example
332 * the weights represent frequency counts. To normalize weights so that the denominator
333 * in the variance computation equals the length of the input vector minus one, use <pre>
334 * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code>
335 * </pre>
336 * <p>
337 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
338 * <p>
339 * Throws <code>IllegalArgumentException</code> if any of the following are true:
340 * <ul><li>the values array is null</li>
341 * <li>the weights array is null</li>
342 * <li>the weights array does not have the same length as the values array</li>
343 * <li>the weights array contains one or more infinite values</li>
344 * <li>the weights array contains one or more NaN values</li>
345 * <li>the weights array contains negative values</li>
346 * </ul></p>
347 * <p>
348 * Does not change the internal state of the statistic.</p>
349 * <p>
350 * Throws <code>IllegalArgumentException</code> if either array is null.</p>
351 *
352 * @param values the input array
353 * @param weights the weights array
354 * @return the weighted variance of the values
355 * @throws IllegalArgumentException if the parameters are not valid
356 * @since 2.1
357 */
358 public double evaluate(final double[] values, final double[] weights) {
359 return evaluate(values, weights, 0, values.length);
360 }
361
362 /**
363 * Returns the variance of the entries in the specified portion of
364 * the input array, using the precomputed mean value. Returns
365 * <code>Double.NaN</code> if the designated subarray is empty.
366 * <p>
367 * See {@link Variance} for details on the computing algorithm.</p>
368 * <p>
369 * The formula used assumes that the supplied mean value is the arithmetic
370 * mean of the sample data, not a known population parameter. This method
371 * is supplied only to save computation when the mean has already been
372 * computed.</p>
373 * <p>
374 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
375 * <p>
376 * Throws <code>IllegalArgumentException</code> if the array is null.</p>
377 * <p>
378 * Does not change the internal state of the statistic.</p>
379 *
380 * @param values the input array
381 * @param mean the precomputed mean value
382 * @param begin index of the first array element to include
383 * @param length the number of elements to include
384 * @return the variance of the values or Double.NaN if length = 0
385 * @throws IllegalArgumentException if the array is null or the array index
386 * parameters are not valid
387 */
388 public double evaluate(final double[] values, final double mean,
389 final int begin, final int length) {
390
391 double var = Double.NaN;
392
393 if (test(values, begin, length)) {
394 if (length == 1) {
395 var = 0.0;
396 } else if (length > 1) {
397 double accum = 0.0;
398 double dev = 0.0;
399 double accum2 = 0.0;
400 for (int i = begin; i < begin + length; i++) {
401 dev = values[i] - mean;
402 accum += dev * dev;
403 accum2 += dev;
404 }
405 double len = length;
406 if (isBiasCorrected) {
407 var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
408 } else {
409 var = (accum - (accum2 * accum2 / len)) / len;
410 }
411 }
412 }
413 return var;
414 }
415
416 /**
417 * Returns the variance of the entries in the input array, using the
418 * precomputed mean value. Returns <code>Double.NaN</code> if the array
419 * is empty.
420 * <p>
421 * See {@link Variance} for details on the computing algorithm.</p>
422 * <p>
423 * If <code>isBiasCorrected</code> is <code>true</code> the formula used
424 * assumes that the supplied mean value is the arithmetic mean of the
425 * sample data, not a known population parameter. If the mean is a known
426 * population parameter, or if the "population" version of the variance is
427 * desired, set <code>isBiasCorrected</code> to <code>false</code> before
428 * invoking this method.</p>
429 * <p>
430 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
431 * <p>
432 * Throws <code>IllegalArgumentException</code> if the array is null.</p>
433 * <p>
434 * Does not change the internal state of the statistic.</p>
435 *
436 * @param values the input array
437 * @param mean the precomputed mean value
438 * @return the variance of the values or Double.NaN if the array is empty
439 * @throws IllegalArgumentException if the array is null
440 */
441 public double evaluate(final double[] values, final double mean) {
442 return evaluate(values, mean, 0, values.length);
443 }
444
445 /**
446 * Returns the weighted variance of the entries in the specified portion of
447 * the input array, using the precomputed weighted mean value. Returns
448 * <code>Double.NaN</code> if the designated subarray is empty.
449 * <p>
450 * Uses the formula <pre>
451 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1)
452 * </pre></p>
453 * <p>
454 * The formula used assumes that the supplied mean value is the weighted arithmetic
455 * mean of the sample data, not a known population parameter. This method
456 * is supplied only to save computation when the mean has already been
457 * computed.</p>
458 * <p>
459 * This formula will not return the same result as the unweighted variance when all
460 * weights are equal, unless all weights are equal to 1. The formula assumes that
461 * weights are to be treated as "expansion values," as will be the case if for example
462 * the weights represent frequency counts. To normalize weights so that the denominator
463 * in the variance computation equals the length of the input vector minus one, use <pre>
464 * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code>
465 * </pre>
466 * <p>
467 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
468 * <p>
469 * Throws <code>IllegalArgumentException</code> if any of the following are true:
470 * <ul><li>the values array is null</li>
471 * <li>the weights array is null</li>
472 * <li>the weights array does not have the same length as the values array</li>
473 * <li>the weights array contains one or more infinite values</li>
474 * <li>the weights array contains one or more NaN values</li>
475 * <li>the weights array contains negative values</li>
476 * <li>the start and length arguments do not determine a valid array</li>
477 * </ul></p>
478 * <p>
479 * Does not change the internal state of the statistic.</p>
480 *
481 * @param values the input array
482 * @param weights the weights array
483 * @param mean the precomputed weighted mean value
484 * @param begin index of the first array element to include
485 * @param length the number of elements to include
486 * @return the variance of the values or Double.NaN if length = 0
487 * @throws IllegalArgumentException if the parameters are not valid
488 * @since 2.1
489 */
490 public double evaluate(final double[] values, final double[] weights,
491 final double mean, final int begin, final int length) {
492
493 double var = Double.NaN;
494
495 if (test(values, weights, begin, length)) {
496 if (length == 1) {
497 var = 0.0;
498 } else if (length > 1) {
499 double accum = 0.0;
500 double dev = 0.0;
501 double accum2 = 0.0;
502 for (int i = begin; i < begin + length; i++) {
503 dev = values[i] - mean;
504 accum += weights[i] * (dev * dev);
505 accum2 += weights[i] * dev;
506 }
507
508 double sumWts = 0;
509 for (int i = 0; i < weights.length; i++) {
510 sumWts += weights[i];
511 }
512
513 if (isBiasCorrected) {
514 var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
515 } else {
516 var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
517 }
518 }
519 }
520 return var;
521 }
522
523 /**
524 * <p>Returns the weighted variance of the values in the input array, using
525 * the precomputed weighted mean value.</p>
526 * <p>
527 * Uses the formula <pre>
528 * Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1)
529 * </pre></p>
530 * <p>
531 * The formula used assumes that the supplied mean value is the weighted arithmetic
532 * mean of the sample data, not a known population parameter. This method
533 * is supplied only to save computation when the mean has already been
534 * computed.</p>
535 * <p>
536 * This formula will not return the same result as the unweighted variance when all
537 * weights are equal, unless all weights are equal to 1. The formula assumes that
538 * weights are to be treated as "expansion values," as will be the case if for example
539 * the weights represent frequency counts. To normalize weights so that the denominator
540 * in the variance computation equals the length of the input vector minus one, use <pre>
541 * <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code>
542 * </pre>
543 * <p>
544 * Returns 0 for a single-value (i.e. length = 1) sample.</p>
545 * <p>
546 * Throws <code>IllegalArgumentException</code> if any of the following are true:
547 * <ul><li>the values array is null</li>
548 * <li>the weights array is null</li>
549 * <li>the weights array does not have the same length as the values array</li>
550 * <li>the weights array contains one or more infinite values</li>
551 * <li>the weights array contains one or more NaN values</li>
552 * <li>the weights array contains negative values</li>
553 * </ul></p>
554 * <p>
555 * Does not change the internal state of the statistic.</p>
556 *
557 * @param values the input array
558 * @param weights the weights array
559 * @param mean the precomputed weighted mean value
560 * @return the variance of the values or Double.NaN if length = 0
561 * @throws IllegalArgumentException if the parameters are not valid
562 * @since 2.1
563 */
564 public double evaluate(final double[] values, final double[] weights, final double mean) {
565 return evaluate(values, weights, mean, 0, values.length);
566 }
567
568 /**
569 * @return Returns the isBiasCorrected.
570 */
571 public boolean isBiasCorrected() {
572 return isBiasCorrected;
573 }
574
575 /**
576 * @param biasCorrected The isBiasCorrected to set.
577 */
578 public void setBiasCorrected(boolean biasCorrected) {
579 this.isBiasCorrected = biasCorrected;
580 }
581
582 /**
583 * {@inheritDoc}
584 */
585 @Override
586 public Variance copy() {
587 Variance result = new Variance();
588 copy(this, result);
589 return result;
590 }
591
592 /**
593 * Copies source to dest.
594 * <p>Neither source nor dest can be null.</p>
595 *
596 * @param source Variance to copy
597 * @param dest Variance to copy to
598 * @throws NullPointerException if either source or dest is null
599 */
600 public static void copy(Variance source, Variance dest) {
601 if (source == null ||
602 dest == null) {
603 throw new NullArgumentException();
604 }
605 dest.setData(source.getDataRef());
606 dest.moment = source.moment.copy();
607 dest.isBiasCorrected = source.isBiasCorrected;
608 dest.incMoment = source.incMoment;
609 }
610 }