Skip to content

Commit

Permalink
Adjust eweights calculation to avoid precision issues (#509)
Browse files Browse the repository at this point in the history
* Adjust eweights calculation to avoid precision issues.

The modified function is equivalent to dividing all weights by the largest value.

```
julia> x = [
        0.3
        0.42857142857142855
        0.6122448979591837
        0.8746355685131197
        1.249479383590171
        1.7849705479859588
        2.549957925694227
        3.642797036706039
        5.203995766722913
        7.434279666747019
        ]
10-element Array{Float64,1}:
 0.3
 0.42857142857142855
 0.6122448979591837
 0.8746355685131197
 1.249479383590171
 1.7849705479859588
 2.549957925694227
 3.642797036706039
 5.203995766722913
 7.434279666747019

julia> x ./ last(x)
10-element Array{Float64,1}:
 0.04035360699999998
 0.057648009999999965
 0.08235429999999996
 0.11764899999999996
 0.16806999999999994
 0.24009999999999995
 0.34299999999999997
 0.49
 0.7
 1.0

julia> using StatsBase
[ Info: Recompiling stale cache file /Users/rory/.julia/compiled/v1.0/StatsBase/EZjIG.ji for StatsBase [2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91]

julia> eweights(1:10, 0.3)
10-element Weights{Float64,Float64,Array{Float64,1}}:
 0.04035360699999998
 0.05764800999999997
 0.08235429999999996
 0.11764899999999996
 0.16806999999999994
 0.24009999999999995
 0.3429999999999999
 0.48999999999999994
 0.7
 1.0
```

* Fix eweight tests.

* Add a couple links to the docstring.

* Address review comments and properly deprecate unscaled behaviour with a `scaled::DepBool` kwarg.

* Rename scaled => scale.

* Fixup docstring slightly and don't make (t, λ, n) method a public method.

* More docstring updates.

* Reword `n` explanation

* Bump patch release
  • Loading branch information
rofinn committed Nov 19, 2021
1 parent 179c533 commit 1678fd1
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 64 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "StatsBase"
uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
authors = ["JuliaStats"]
version = "0.33.12"
version = "0.33.13"

[deps]
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
Expand Down
6 changes: 3 additions & 3 deletions src/common.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ const RealFP = Union{Float32, Float64}
# A convenient typealias for deprecating default corrected Bool
const DepBool = Union{Bool, Nothing}

function depcheck(fname::Symbol, b::DepBool)
if b == nothing
msg = "$fname will default to corrected=true in the future. Use corrected=false for previous behaviour."
function depcheck(fname::Symbol, varname::Symbol, b::DepBool)
if b === nothing
msg = "$fname will default to $varname=true in the future. Use $varname=false for previous behaviour."
Base.depwarn(msg, fname)
false
else
Expand Down
6 changes: 3 additions & 3 deletions src/cov.jl
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,11 @@ _scattermatm(x::DenseMatrix, wv::AbstractWeights, mean, dims::Int) =
## weighted cov
covm(x::DenseMatrix, mean, w::AbstractWeights, dims::Int=1;
corrected::DepBool=nothing) =
rmul!(scattermat(x, w, mean=mean, dims=dims), varcorrection(w, depcheck(:covm, corrected)))
rmul!(scattermat(x, w, mean=mean, dims=dims), varcorrection(w, depcheck(:covm, :corrected, corrected)))


cov(x::DenseMatrix, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) =
covm(x, mean(x, w, dims=dims), w, dims; corrected=depcheck(:cov, corrected))
covm(x, mean(x, w, dims=dims), w, dims; corrected=depcheck(:cov, :corrected, corrected))

function corm(x::DenseMatrix, mean, w::AbstractWeights, vardim::Int=1)
c = covm(x, mean, w, vardim; corrected=false)
Expand All @@ -120,7 +120,7 @@ end
function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, dims::Int=1;
corrected::DepBool=nothing)
m = mean(x, wv, dims=dims)
return m, cov(x, wv, dims; corrected=depcheck(:mean_and_cov, corrected))
return m, cov(x, wv, dims; corrected=depcheck(:mean_and_cov, :corrected, corrected))
end

"""
Expand Down
30 changes: 15 additions & 15 deletions src/moments.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ the population variance is computed by replacing
* `Weights`: `ArgumentError` (bias correction not supported)
"""
varm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) =
_moment2(v, w, m; corrected=depcheck(:varm, corrected))
_moment2(v, w, m; corrected=depcheck(:varm, :corrected, corrected))

"""
var(x::AbstractArray, w::AbstractWeights, [dim]; mean=nothing, corrected=false)
Expand All @@ -40,7 +40,7 @@ replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights
"""
function var(v::RealArray, w::AbstractWeights; mean=nothing,
corrected::DepBool=nothing)
corrected = depcheck(:var, corrected)
corrected = depcheck(:var, :corrected, corrected)

if mean == nothing
varm(v, w, Statistics.mean(v, w); corrected=corrected)
Expand All @@ -53,14 +53,14 @@ end

function varm!(R::AbstractArray, A::RealArray, w::AbstractWeights, M::RealArray,
dim::Int; corrected::DepBool=nothing)
corrected = depcheck(:varm!, corrected)
corrected = depcheck(:varm!, :corrected, corrected)
rmul!(_wsum_centralize!(R, abs2, A, convert(Vector, w), M, dim, true),
varcorrection(w, corrected))
end

function var!(R::AbstractArray, A::RealArray, w::AbstractWeights, dims::Int;
mean=nothing, corrected::DepBool=nothing)
corrected = depcheck(:var!, corrected)
corrected = depcheck(:var!, :corrected, corrected)

if mean == 0
varm!(R, A, w, Base.reducedim_initarray(A, dims, 0, eltype(R)), dims;
Expand All @@ -84,14 +84,14 @@ end

function varm(A::RealArray, w::AbstractWeights, M::RealArray, dim::Int;
corrected::DepBool=nothing)
corrected = depcheck(:varm, corrected)
corrected = depcheck(:varm, :corrected, corrected)
varm!(similar(A, Float64, Base.reduced_indices(axes(A), dim)), A, w, M,
dim; corrected=corrected)
end

function var(A::RealArray, w::AbstractWeights, dim::Int; mean=nothing,
corrected::DepBool=nothing)
corrected = depcheck(:var, corrected)
corrected = depcheck(:var, :corrected, corrected)
var!(similar(A, Float64, Base.reduced_indices(axes(A), dim)), A, w, dim;
mean=mean, corrected=corrected)
end
Expand All @@ -115,7 +115,7 @@ dependent on the type of weights used:
* `Weights`: `ArgumentError` (bias correction not supported)
"""
stdm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) =
sqrt(varm(v, w, m, corrected=depcheck(:stdm, corrected)))
sqrt(varm(v, w, m, corrected=depcheck(:stdm, :corrected, corrected)))

"""
std(x::AbstractArray, w::AbstractWeights, [dim]; mean=nothing, corrected=false)
Expand All @@ -136,18 +136,18 @@ weights used:
* `Weights`: `ArgumentError` (bias correction not supported)
"""
std(v::RealArray, w::AbstractWeights; mean=nothing, corrected::DepBool=nothing) =
sqrt.(var(v, w; mean=mean, corrected=depcheck(:std, corrected)))
sqrt.(var(v, w; mean=mean, corrected=depcheck(:std, :corrected, corrected)))

stdm(v::RealArray, m::RealArray, dim::Int; corrected::DepBool=nothing) =
sqrt!(varm(v, m, dims=dim, corrected=depcheck(:stdm, corrected)))
sqrt!(varm(v, m, dims=dim, corrected=depcheck(:stdm, :corrected, corrected)))

stdm(v::RealArray, w::AbstractWeights, m::RealArray, dim::Int;
corrected::DepBool=nothing) =
sqrt.(varm(v, w, m, dim; corrected=depcheck(:stdm, corrected)))
sqrt.(varm(v, w, m, dim; corrected=depcheck(:stdm, :corrected, corrected)))

std(v::RealArray, w::AbstractWeights, dim::Int; mean=nothing,
corrected::DepBool=nothing) =
sqrt.(var(v, w, dim; mean=mean, corrected=depcheck(:std, corrected)))
sqrt.(var(v, w, dim; mean=mean, corrected=depcheck(:std, :corrected, corrected)))

##### Fused statistics
"""
Expand Down Expand Up @@ -183,12 +183,12 @@ end

function mean_and_var(x::RealArray, w::AbstractWeights; corrected::DepBool=nothing)
m = mean(x, w)
v = varm(x, w, m; corrected=depcheck(:mean_and_var, corrected))
v = varm(x, w, m; corrected=depcheck(:mean_and_var, :corrected, corrected))
m, v
end
function mean_and_std(x::RealArray, w::AbstractWeights; corrected::DepBool=nothing)
m = mean(x, w)
s = stdm(x, w, m; corrected=depcheck(:mean_and_std, corrected))
s = stdm(x, w, m; corrected=depcheck(:mean_and_std, :corrected, corrected))
m, s
end

Expand All @@ -208,13 +208,13 @@ end
function mean_and_var(x::RealArray, w::AbstractWeights, dims::Int;
corrected::DepBool=nothing)
m = mean(x, w, dims=dims)
v = varm(x, w, m, dims; corrected=depcheck(:mean_and_var, corrected))
v = varm(x, w, m, dims; corrected=depcheck(:mean_and_var, :corrected, corrected))
m, v
end
function mean_and_std(x::RealArray, w::AbstractWeights, dims::Int;
corrected::DepBool=nothing)
m = mean(x, w, dims=dims)
s = stdm(x, w, m, dims; corrected=depcheck(:mean_and_std, corrected))
s = stdm(x, w, m, dims; corrected=depcheck(:mean_and_std, :corrected, corrected))
m, s
end

Expand Down
75 changes: 49 additions & 26 deletions src/weights.jl
Original file line number Diff line number Diff line change
Expand Up @@ -205,59 +205,82 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs))
end

"""
eweights(t::AbstractVector{<:Integer}, λ::Real)
eweights(t::AbstractVector{T}, r::StepRange{T}, λ::Real) where T
eweights(n::Integer, λ::Real)
eweights(t::AbstractVector{<:Integer}, λ::Real; scale=false)
eweights(t::AbstractVector{T}, r::StepRange{T}, λ::Real; scale=false) where T
eweights(n::Integer, λ::Real; scale=false)
Construct a [`Weights`](@ref) vector which assigns exponentially decreasing weights to past
observations, which in this case corresponds to larger integer values `i` in `t`.
If an integer `n` is provided, weights are generated for values from 1 to `n`
(equivalent to `t = 1:n`).
observations (larger integer values `i` in `t`).
The integer value `n` represents the number of past observations to consider.
`n` defaults to `maximum(t) - minimum(t) + 1` if only `t` is passed in
and the elements are integers, and to `length(r)` if a superset range `r` is also passed in.
If `n` is explicitly passed instead of `t`, `t` defaults to `1:n`.
For each element `i` in `t` the weight value is computed as:
If `scale` is `true` then for each element `i` in `t` the weight value is computed as:
``(1 - λ)^{n - i}``
If `scale` is `false` then each value is computed as:
``λ (1 - λ)^{1 - i}``
# Arguments
- `t::AbstractVector`: temporal indices or timestamps
- `r::StepRange`: a larger range to use when constructing weights from a subset of timestamps
- `n::Integer`: if provided instead of `t`, temporal indices are taken to be `1:n`
- `n::Integer`: the number of past events to consider
- `λ::Real`: a smoothing factor or rate parameter such that ``0 < λ ≤ 1``.
As this value approaches 0, the resulting weights will be almost equal,
while values closer to 1 will put greater weight on the tail elements of the vector.
# Keyword arguments
- `scale::Bool`: Return the weights scaled to between 0 and 1 (default: false)
# Examples
```julia-repl
julia> eweights(1:10, 0.3)
julia> eweights(1:10, 0.3; scale=true)
10-element Weights{Float64,Float64,Array{Float64,1}}:
0.3
0.42857142857142855
0.6122448979591837
0.8746355685131197
1.249479383590171
1.7849705479859588
2.549957925694227
3.642797036706039
5.203995766722913
7.434279666747019
0.04035360699999998
0.05764800999999997
0.08235429999999996
0.11764899999999996
0.16806999999999994
0.24009999999999995
0.3429999999999999
0.48999999999999994
0.7
1.0
```
"""
function eweights(t::AbstractVector{T}, λ::Real) where T<:Integer
# Links
- https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average
- https://en.wikipedia.org/wiki/Exponential_smoothing
"""
function eweights(t::AbstractVector{<:Integer}, λ::Real; kwargs...)
isempty(t) && return Weights(copy(t), 0)
(lo, hi) = extrema(t)
return _eweights(t, λ, hi - lo + 1; kwargs...)
end

eweights(n::Integer, λ::Real; kwargs...) = _eweights(1:n, λ, n; kwargs...)
eweights(t::AbstractVector, r::AbstractRange, λ::Real; kwargs...) =
_eweights(something.(indexin(t, r)), λ, length(r); kwargs...)

function _eweights(t::AbstractVector{<:Integer}, λ::Real, n::Integer; scale::DepBool=nothing)
0 < λ <= 1 || throw(ArgumentError("Smoothing factor must be between 0 and 1"))
f = depcheck(:eweights, :scale, scale) ? _scaled_eweight : _unscaled_eweight

w0 = map(t) do i
i > 0 || throw(ArgumentError("Time indices must be non-zero positive integers"))
λ * (1 - λ)^(1 - i)
f(i, λ, n)
end

s = sum(w0)
Weights(w0, s)
end

eweights(n::Integer, λ::Real) = eweights(1:n, λ)
eweights(t::AbstractVector, r::AbstractRange, λ::Real) =
eweights(something.(indexin(t, r)), λ)
_unscaled_eweight(i, λ, n) = λ * (1 - λ)^(1 - i)
_scaled_eweight(i, λ, n) = (1 - λ)^(n - i)

# NOTE: no variance correction is implemented for exponential weights

Expand Down Expand Up @@ -310,7 +333,7 @@ julia> uweights(3)
1
1
1
julia> uweights(Float64, 3)
3-element UnitWeights{Float64}:
1.0
Expand Down
41 changes: 25 additions & 16 deletions test/weights.jl
Original file line number Diff line number Diff line change
Expand Up @@ -502,53 +502,62 @@ end
end

@testset "Exponential Weights" begin
λ = 0.2
@testset "Usage" begin
θ = 5.25
λ = 1 - exp(-1 / θ) # simple conversion for the more common/readable method
v =*(1-λ)^(1-i) for i = 1:4]
v = [(1 - λ) ^ (4 - i) for i = 1:4]
w = Weights(v)

@test round.(w, digits=4) == [0.1734, 0.2098, 0.2539, 0.3071]
@test round.(w, digits=4) == [0.512, 0.64, 0.8, 1.0]

@testset "basic" begin
@test eweights(1:4, λ) w
@test eweights(1:4, λ; scale=true) w
end

@testset "1:n" begin
@test eweights(4, λ) w
@test eweights(4, λ; scale=true) w
end

@testset "indexin" begin
v = [λ*(1-λ)^(1-i) for i = 1:10]
v = [(1 - λ) ^ (10 - i) for i = 1:10]

# Test that we should be able to skip indices easily
@test eweights([1, 3, 5, 7], 1:10, λ) Weights(v[[1, 3, 5, 7]])
@test eweights([1, 3, 5, 7], 1:10, λ; scale=true) Weights(v[[1, 3, 5, 7]])

# This should also work with actual time types
t1 = DateTime(2019, 1, 1, 1)
tx = t1 + Hour(7)
tn = DateTime(2019, 1, 2, 1)
tn = DateTime(2019, 1, 1, 10)

@test eweights(t1:Hour(2):tx, t1:Hour(1):tn, λ) Weights(v[[1, 3, 5, 7]])
@test eweights(t1:Hour(2):tx, t1:Hour(1):tn, λ; scale=true) Weights(v[[1, 3, 5, 7]])
end
end

@testset "Empty" begin
@test eweights(0, 0.3) == Weights(Float64[])
@test eweights(1:0, 0.3) == Weights(Float64[])
@test eweights(Int[], 1:10, 0.4) == Weights(Float64[])
@test eweights(0, 0.3; scale=true) == Weights(Float64[])
@test eweights(1:0, 0.3; scale=true) == Weights(Float64[])
@test eweights(Int[], 1:10, 0.4; scale=true) == Weights(Float64[])
end

@testset "Failure Conditions" begin
# λ > 1.0
@test_throws ArgumentError eweights(1, 1.1)
@test_throws ArgumentError eweights(1, 1.1; scale=true)

# time indices are not all positive non-zero integers
@test_throws ArgumentError eweights([0, 1, 2, 3], 0.3)
@test_throws ArgumentError eweights([0, 1, 2, 3], 0.3; scale=true)

# Passing in an array of bools will work because Bool <: Integer,
# but any `false` values will trigger the same argument error as 0.0
@test_throws ArgumentError eweights([true, false, true, true], 0.3)
@test_throws ArgumentError eweights([true, false, true, true], 0.3; scale=true)
end

@testset "scale=false" begin
v =* (1 - λ)^(1 - i) for i = 1:4]
w = Weights(v)

@test round.(w, digits=4) == [0.2, 0.25, 0.3125, 0.3906]

wv = eweights(1:10, λ; scale=false)
@test eweights(1:10, λ; scale=true) wv / maximum(wv)
end
end

Expand Down

2 comments on commit 1678fd1

@rofinn
Copy link
Member Author

@rofinn rofinn commented on 1678fd1 Nov 19, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/49065

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.33.13 -m "<description of version>" 1678fd1d12f6fe1ddc3ff8a6b9b7b4679e490818
git push origin v0.33.13

Please sign in to comment.