Merge pull request #134 from IBM/pipelines
Add new pipelines (linear, concat)
ppalmes committed Feb 26, 2020
2 parents aee39ce + 68e34fc commit 2bfc767
Showing 13 changed files with 539 additions and 129 deletions.
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "TSML"
uuid = "198dc43e-9e51-5cd7-9d40-d9794d335912"
authors = ["Paulito Palmes <ppalmes@gmail.com>"]
version = "2.4.7"
version = "2.4.8"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
20 changes: 15 additions & 5 deletions src/TSML.jl
@@ -1,6 +1,6 @@
module TSML

export fit!, transform!
export fit!, transform!,fit_transform!

# reexport common functions to Main
include("pkgdeps.jl")
@@ -27,7 +27,7 @@ export Baseline,Identity

include("basefilters.jl")
using .BaseFilters
export Imputer,Pipeline,OneHotEncoder,Wrapper
export Imputer,OneHotEncoder,Wrapper

include("valdatefilters.jl")
using .ValDateFilters
@@ -77,14 +77,21 @@ include("timescaledb.jl")
using .TimescaleDBs
export TimescaleDB

include("demo.jl")
using .TSMLDemo
export tsml_demo

include("ensemble.jl")
using .EnsembleMethods
export VoteEnsemble, StackEnsemble, BestLearner

include("featureselector.jl")
using .FeatureSelectors
export FeatureSelector, CatFeatureSelector, NumFeatureSelector, CatNumDiscriminator

include("pipeline.jl")
using .Pipelines
export @pipeline, @pipelinex
export Pipeline, ComboPipeline


include("schema.jl")
using .Schemalizers
export Schemalizer, ML, table
@@ -97,4 +104,7 @@ include("argparse.jl")
using .ArgumentParsers
export tsmlmain

include("demo.jl")
using .TSMLDemo
export tsml_demo
end # module
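Net effect of this hunk: the old BaseFilters Pipeline export is dropped, the new Pipelines module is wired in with Pipeline and ComboPipeline, FeatureSelectors is included, and fit_transform! joins the top-level exports. Below is a minimal sketch of the linear pipeline, adapted from the docstring being removed from src/basefilters.jl further down; pipeline.jl itself is not shown in this excerpt, so the assumption is that the new Pipeline keeps the same Dict-of-transformers constructor and that ComboPipeline takes the same form.

using TSML

inputfile = joinpath(dirname(pathof(TSML)), "../data/testdata.csv")
csvreader = CSVDateValReader(Dict(:filename => inputfile, :dateformat => "d/m/y H:M"))
valgator  = DateValgator()
valnner   = DateValNNer(Dict(:nnsize => 1))

# linear pipeline: run the transformers in sequence
lpipe = Pipeline(Dict(:transformers => [csvreader, valgator, valnner]))
fit!(lpipe)
res = transform!(lpipe)

# the newly exported fit_transform! presumably collapses the two calls above
res = fit_transform!(lpipe)

# "concat" pipeline, assumed analogous: each branch sees the same input
# and the branch outputs are joined column-wise
cpipe = ComboPipeline(Dict(:transformers => [NumFeatureSelector(), CatFeatureSelector()]))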
2 changes: 1 addition & 1 deletion src/argparse.jl
@@ -29,7 +29,7 @@ const DATEINTERVAL = Dict(
function parse_commandline()
s = ArgParseSettings()

@add_arg_table s begin
@add_arg_table! s begin
"--aggregate"
help = "aggregate interval such as: minutely, hourly, weekly, monthly, Dates.Minute(30),Dates.Hour(2)"
arg_type = String
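The lone change in this file tracks ArgParse.jl's rename of @add_arg_table to @add_arg_table! (the mutating-macro naming adopted in ArgParse 1.x). A self-contained sketch of the updated macro with one of the options above; the default value is illustrative:

using ArgParse

s = ArgParseSettings()
@add_arg_table! s begin
    "--aggregate"
        help = "aggregate interval such as: minutely, hourly, weekly, monthly"
        arg_type = String
        default = "hourly"
end

parsed = parse_args(["--aggregate", "daily"], s)
parsed["aggregate"]   # "daily"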
81 changes: 0 additions & 81 deletions src/basefilters.jl
@@ -176,87 +176,6 @@ function transform!(imp::Imputer, instances::DataFrame)
return new_instances |> DataFrame
end

"""
Pipeline(
Dict(
# Transformers as list to chain in sequence.
:transformers => [OneHotEncoder(), Imputer()],
# Transformer args as list applied to same index transformer.
:transformer_args => nothing
)
)
Chains multiple transformers in sequence.
Examples:
inputfile =joinpath(dirname(pathof(TSML)),"../data/testdata.csv")
csvreader = CSVDateValReader(Dict(:filename=>inputfile,:dateformat=>"d/m/y H:M"))
filter1 = DateValgator()
filter2 = DateValNNer(Dict(:nnsize=>1))
mypipeline = Pipeline(Dict(
:transformers => [csvreader,filter1,filter2]
)
)
fit!(mypipeline)
res=transform!(mypipeline)
Implements: `fit!`, `transform!`
"""
mutable struct Pipeline <: Transformer
model
args

function Pipeline(args=Dict())
default_args = Dict(
# Transformers as list to chain in sequence.
:transformers => [OneHotEncoder(), Imputer()],
# Transformer args as list applied to same index transformer.
:transformer_args => nothing
)
new(nothing, mergedict(default_args, args))
end
end

function fit!(pipe::Pipeline, features::DataFrame=DataFrame(), labels::Vector=[])
instances=deepcopy(features)
transformers = pipe.args[:transformers]
transformer_args = pipe.args[:transformer_args]

current_instances = instances
new_transformers = Transformer[]

# fit-transform all except last element
# last element calls fit only
trlength = length(transformers)
for t_index in 1:(trlength - 1)
transformer = createtransformer(transformers[t_index], transformer_args)
push!(new_transformers, transformer)
fit!(transformer, current_instances, labels)
current_instances = transform!(transformer, current_instances)
end
transformer = createtransformer(transformers[trlength], transformer_args)
push!(new_transformers, transformer)
fit!(transformer, current_instances, labels)

pipe.model = Dict(
:transformers => new_transformers,
:transformer_args => transformer_args
)
end

function transform!(pipe::Pipeline, instances::DataFrame=DataFrame())
transformers = pipe.model[:transformers]

current_instances = deepcopy(instances)
for t_index in 1:length(transformers)
transformer = transformers[t_index]
current_instances = transform!(transformer, current_instances)
end

return current_instances
end

"""
Wrapper(
default_args = Dict(
2 changes: 1 addition & 1 deletion src/demo.jl
@@ -7,7 +7,7 @@ using Statistics
using Plots

using TSML: Statifier
using TSML.BaseFilters: Pipeline
using TSML.Pipelines: Pipeline
using TSML.ValDateFilters: DateValgator, DateValNNer
using TSML.TSMLTypes
import TSML.TSMLTypes.fit! # to overload
184 changes: 184 additions & 0 deletions src/featureselector.jl
@@ -0,0 +1,184 @@
module FeatureSelectors

using DataFrames
using Random

using TSML.TSMLTypes
using TSML.BaseFilters
using TSML.Utils

import TSML.TSMLTypes: fit!, transform!
export fit!, transform!
export FeatureSelector, CatFeatureSelector, NumFeatureSelector, CatNumDiscriminator

export feature_test

# generic way to extract num/cat features by specifying their columns
mutable struct FeatureSelector <: Transformer
name::String
model::Dict
args::Dict

function FeatureSelector(args::Dict = Dict())
default_args = Dict(
:name => "featureselector",
:columns => Int[],
)
cargs=nested_dict_merge(default_args,args)
cargs[:name] = cargs[:name]*"_"*randstring(3)
new(cargs[:name],Dict(),cargs)
end
end

function FeatureSelector(cols::Vector{Int})
FeatureSelector(Dict(:columns => cols))
end

function fit!(ft::FeatureSelector, features::DataFrame, labels::Vector=[])
if features == DataFrame()
error("empty dataframe")
end
ft.model = ft.args
end

function transform!(ft::FeatureSelector, features::DataFrame)
if features == DataFrame()
error("empty dataframe")
end
return features[:,ft.model[:columns]]
end

# ----------
# automatically extracts categorical features based on their inferred non-numeric element types
mutable struct CatFeatureSelector <: Transformer
name::String
model::Dict
args::Dict

function CatFeatureSelector(args::Dict = Dict())
default_args = Dict(
:name => "catfeatsel",
)
cargs=nested_dict_merge(default_args,args)
cargs[:name] = cargs[:name]*"_"*randstring(3)
new(cargs[:name],Dict(),cargs)
end
end

function fit!(ft::CatFeatureSelector, features::DataFrame, labels::Vector=[])
if features == DataFrame()
error("empty dataframe")
end
catcols,_ = find_catnum_columns(features)

# create model
ft.model = Dict(
:nominal_columns => catcols
)
end

function transform!(ft::CatFeatureSelector, features::DataFrame)
catcols = ft.model[:nominal_columns]
return features[:,catcols]
end

# ---------
# automatically extracts numeric features based on their inferred element types
mutable struct NumFeatureSelector <: Transformer
name::String
model::Dict
args::Dict

function NumFeatureSelector(args::Dict = Dict())
default_args = Dict(
:name => "numfeatsel"
)
cargs=nested_dict_merge(default_args,args)
cargs[:name] = cargs[:name]*"_"*randstring(3)
new(cargs[:name],Dict(),cargs)
end
end

function fit!(ft::NumFeatureSelector, features::DataFrame, labels::Vector=[])
if features == DataFrame()
error("empty dataframe")
end
_,realcols = find_catnum_columns(features)

# create model
ft.model = Dict(
:real_columns => realcols
)
end

function transform!(ft::NumFeatureSelector, features::DataFrame)
realcols = ft.model[:real_columns]
return features[:,realcols]
end


# ---------
# convert numeric categories to string based on count of unique elements
mutable struct CatNumDiscriminator <: Transformer
name::String
model::Dict
args::Dict

function CatNumDiscriminator(args::Dict = Dict())
default_args = Dict(
:name => "catnumdisc",
# default max categories for numeric-encoded categories
:maxcategories => 24,
)
cargs=nested_dict_merge(default_args,args)
cargs[:name] = cargs[:name]*"_"*randstring(3)
new(cargs[:name],Dict(),cargs)
end
end

function CatNumDiscriminator(maxcat::Int)
CatNumDiscriminator(Dict(:maxcategories=>maxcat))
end

function fit!(ft::CatNumDiscriminator, features::DataFrame, labels::Vector=[])
if features == DataFrame()
error("empty dataframe")
end
catcols,realcols = find_catnum_columns(features,ft.args[:maxcategories])

# create model
ft.model = Dict(
:real_columns => realcols,
:nominal_columns => catcols
)
end

function transform!(ft::CatNumDiscriminator, features::DataFrame)
catcols = ft.model[:nominal_columns]
features[!,catcols] .= features[!,catcols] .|> string
return features
end

function feature_test()
data = getiris()
X = data[:,1:5]
X[!,5] = X[!,5] .|> string
catfeat = FeatureSelector([5])
numfeat = FeatureSelector([1,2,3,4])
autocat = CatFeatureSelector()
autonum = NumFeatureSelector()
@assert (fit_transform!(catfeat,X) .== X[:,5]) |> Matrix |> sum == 150
@assert (fit_transform!(numfeat,X) .== X[:,1:4]) |> Matrix |> sum == 600
@assert (fit_transform!(autocat,X) .== X[:,5]) |> Matrix |> sum == 150
@assert (fit_transform!(autonum,X) .== X[:,1:4]) |> Matrix |> sum == 600
catnumdata = hcat(X,repeat([1,2,3,4,5],30))
catnum = CatNumDiscriminator()
fit_transform!(catnum,catnumdata)
@assert eltype(catnumdata[:,6]) <: String
catnumdata = hcat(X,repeat([1,2,3,4,5],30))
catnum = CatNumDiscriminator(0)
fit_transform!(catnum,catnumdata)
@assert eltype(catnumdata[:,6]) <: Int
end

end
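feature_test above doubles as the usage pattern for the new selectors. A trimmed sketch on a toy frame (the DataFrame below is illustrative, not the iris data used in the test):

using DataFrames
using TSML

df = DataFrame(age = [10, 20, 30, 40], gender = ["m", "f", "m", "f"], code = [1, 2, 1, 2])

fit_transform!(NumFeatureSelector(), df)      # keeps :age and :code (numeric columns)
fit_transform!(CatFeatureSelector(), df)      # keeps :gender (non-numeric column)
fit_transform!(FeatureSelector([1, 3]), df)   # explicit column indices

# stringify numeric columns that look categorical; the exact cutoff applied by
# :maxcategories lives in find_catnum_columns, which is not part of this diff
disc = CatNumDiscriminator(3)
fit_transform!(disc, df)                      # :code becomes a String column, :age stays numeric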
17 changes: 17 additions & 0 deletions src/normalizer.jl
@@ -65,6 +65,10 @@ mutable struct Normalizer <: Transformer
end
end

function Normalizer(st::Symbol)
Normalizer(Dict(:method=>st))
end

"""
fit!(st::Statifier, features::T, labels::Vector=[]) where {T<:Union{Vector,Matrix,DataFrame}}
@@ -106,6 +110,8 @@ function processnumeric(norm::Normalizer,features::Matrix)
pca(features)
elseif norm.args[:method] == :ppca
ppca(features)
elseif norm.args[:method] == :ica
ica(features)
elseif norm.args[:method] == :fa
fa(features)
elseif norm.args[:method] == :sqrt
@@ -148,6 +154,17 @@ function pca(X)
end


function ica(X,kk::Int=0)
k = kk
if k == 0
k = size(X)[2]
end
xp = X' |> collect |> Matrix{Float64}
m = fit(ICA,xp,k)
transform(m,xp)' |> collect
end


# ppca
function ppca(X)
xp = X' |> collect |> Matrix{Float64}
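The new :ica branch mirrors the existing :pca and :ppca paths: transpose so MultivariateStats sees variables in rows, fit an ICA model with one component per input column by default, then transpose the result back to observations-in-rows. A quick standalone check of the helper's logic on synthetic non-Gaussian data (illustrative, not part of the package tests):

using MultivariateStats
using Random

X  = rand(MersenneTwister(1), 200, 3)    # 200 observations of 3 independent uniform signals
xp = X' |> collect |> Matrix{Float64}    # 3x200, the variables-in-rows layout fit(ICA, ...) expects
m  = fit(ICA, xp, 3)                     # extract 3 independent components
S  = transform(m, xp)' |> collect        # back to 200x3, one column per component

The Normalizer(:ica) convenience constructor added at the top of this file is just sugar for Normalizer(Dict(:method => :ica)).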
