Skip to content

Commit

Permalink
Merge pull request #3 from IBM/data_pipeline
Browse files Browse the repository at this point in the history
Data pipeline
  • Loading branch information
ppalmes committed Apr 1, 2019
2 parents 620e592 + 3b1e0c4 commit dbd2c20
Show file tree
Hide file tree
Showing 13 changed files with 48,959 additions and 62 deletions.
32 changes: 19 additions & 13 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ version = "0.5.2"

[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "195a3ffcb8b0762684b6821de18f83a16455c6ea"
git-tree-sha1 = "84aa74986c5b9b898b0d1acaf3258741ee64754f"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "2.0.0"
version = "2.1.0"

[[Conda]]
deps = ["Compat", "JSON", "VersionParsing"]
Expand Down Expand Up @@ -65,6 +65,12 @@ version = "0.15.0"
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"

[[DecisionTree]]
deps = ["DelimitedFiles", "Distributed", "LinearAlgebra", "Random", "ScikitLearnBase", "Statistics", "Test"]
git-tree-sha1 = "a0a1b9f70f9c57819aed52b2ce570de77bf0a6cf"
uuid = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
version = "0.8.1"

[[DelimitedFiles]]
deps = ["Mmap"]
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Expand All @@ -81,9 +87,9 @@ version = "0.9.1"

[[FileIO]]
deps = ["Pkg", "Random", "Test"]
git-tree-sha1 = "c94b0787956629036fb2b20fccde9e52b89d079a"
git-tree-sha1 = "da32159d4a2e526338506685e280e39ed2f18961"
uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
version = "1.0.5"
version = "1.0.6"

[[FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
Expand Down Expand Up @@ -188,12 +194,6 @@ git-tree-sha1 = "4bf69aaf823b119b034e091e16b18311aa191663"
uuid = "78c3b35d-d492-501b-9361-3d52fe80e533"
version = "0.5.7"

[[Nullables]]
deps = ["Compat"]
git-tree-sha1 = "ae1a63457e14554df2159b0b028f48536125092d"
uuid = "4d1e1d77-625e-5b40-9113-a560ec7a8ecd"
version = "0.0.8"

[[OrderedCollections]]
deps = ["Random", "Serialization", "Test"]
git-tree-sha1 = "85619a3f3e17bb4761fe1b1fd47f0e979f964d5b"
Expand Down Expand Up @@ -271,6 +271,12 @@ version = "2.0.1"
[[SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"

[[ScikitLearnBase]]
deps = ["Compat", "LinearAlgebra", "Random", "Statistics", "Test"]
git-tree-sha1 = "0e27caac9a456b531193117c538d739d2d6e91c2"
uuid = "6e75b9c4-186b-50bd-896f-2d2496a4843e"
version = "0.4.1"

[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

Expand Down Expand Up @@ -324,10 +330,10 @@ deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[TimeZones]]
deps = ["Compat", "EzXML", "Mocking", "Nullables"]
git-tree-sha1 = "5437144a2bbb5b661783ad34b0d19d5696845b25"
deps = ["Dates", "EzXML", "Mocking", "Printf", "Serialization", "Test", "Unicode"]
git-tree-sha1 = "fdf5d2136d16498cb67d648cedd33b83c599e0c5"
uuid = "f269a46b-ccf7-5d73-abea-4c690281aa53"
version = "0.8.5"
version = "0.9.0"

[[TranscodingStreams]]
deps = ["Pkg", "Random", "Test"]
Expand Down
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ authors = ["Paulito Palmes <ppalmes@gmail.com>"]
version = "0.1.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DecisionTree = "7806a523-6efd-50cb-b5f6-3fa6f1930dbb"
MLDataUtils = "cc2ba9b6-d476-5e6d-8eaf-a92d5412d41d"
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
RCall = "6f49c342-dc21-5d91-9882-a32aef131414"
Expand Down
73 changes: 73 additions & 0 deletions data/process.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
using Pkg
Pkg.activate("..")
using TSML
using TSML.TSMLTransformers
using DataFrames
using Dates
using CSV
using Plots

# --- Manual walk-through: load the CSV and apply the two filters step by step ---
fname ="testdata.csv"
dat = CSV.read(fname)
# Normalise the first two column names to :Date and :Value.
rename!(dat,names(dat)[1]=>:Date,names(dat)[2]=>:Value)
# Parse the date column into DateTime values using day/month/year hour:minute.
dat[:Date] = DateTime.(dat[:Date],"d/m/y H:M")
# Keep an independent copy of the parsed input.
orig = deepcopy(dat)
# The two TSML transformers applied below; the second is configured with :nnsize => 1.
filter1 = DateValgator()
filter2 = DateValNNer(Dict(:nnsize=>1))

# Fit and apply the first filter on the raw data.
fit!(filter1,dat,[])
res1=transform!(filter1,dat)

# Fit and apply the second filter on the output of the first.
fit!(filter2,res1,[])
res2=transform!(filter2,res1)

# --- Same steps expressed as a single pipeline: reader -> filter1 -> filter2 ---
# NOTE(review): `csvreader` is only assigned near the bottom of this file, so in a
# fresh session this line is reached before the name exists — confirm the intended
# execution order (or move the reader definition above this point).
mypipeline = Pipeline(Dict(
:transformers => [csvreader,filter1,filter2]
)
)

# The pipeline is fitted and applied without explicit input data.
fit!(mypipeline)
res = transform!(mypipeline)

# Plot the last 3001 entries of the :Value column (needs at least that many rows).
Plots.plot(res[:Value][end-3000:end])

# Write the pipeline result next to the input, e.g. testdata.csv -> testdata-result.csv.
rfname = replace(fname,".csv"=>"-result.csv")
res |> CSV.write(rfname)

using TSML.TSMLTypes
import TSML.TSMLTypes.fit!
import TSML.TSMLTypes.transform!

"""
    CSVDateValReader(args=Dict())

Transformer configured through an `args` dictionary with the keys `:filename` and
`:dateformat`, both defaulting to `""`. The supplied `args` are combined with these
defaults via `mergedict`. The `model` field starts out as `nothing`.
"""
mutable struct CSVDateValReader <: Transformer
    model
    args

    function CSVDateValReader(args=Dict())
        defaults = Dict(:filename => "", :dateformat => "")
        return new(nothing, mergedict(defaults, args))
    end
end

"""
    fit!(csvrdr::CSVDateValReader, x=[], y=[])

Check the reader's configuration and record it as the fitted model.

`x` and `y` are not used by this method.

# Returns
The configuration dictionary that was stored in `csvrdr.model`.

# Throws
- `ErrorException` if `:filename` or `:dateformat` in `csvrdr.args` is an empty string.
"""
function fit!(
    csvrdr::CSVDateValReader, x::T=[], y::Vector=[]
) where {T<:Union{DataFrame,Vector,Matrix}}
    fname = csvrdr.args[:filename]
    fmt = csvrdr.args[:dateformat]
    (fname != "" && fmt != "") || error("missing filename or date format")
    # Assign to the field: a bare `model = ...` would only bind a local variable and
    # leave `csvrdr.model` as `nothing` after fitting.
    csvrdr.model = csvrdr.args
    return csvrdr.model
end

"""
    transform!(csvrdr::CSVDateValReader, x=[])

Read the CSV file named by `csvrdr.args[:filename]` and return it as a table whose two
columns are renamed to `:Date` and `:Value`, with `:Date` converted to `DateTime` using
the format in `csvrdr.args[:dateformat]`. The `x` argument is not used.

Raises an error when the file does not have exactly two columns.
"""
function transform!(
    csvrdr::CSVDateValReader, x::T=[]
) where {T<:Union{DataFrame,Vector,Matrix}}
    path = csvrdr.args[:filename]
    datefmt = csvrdr.args[:dateformat]
    table = CSV.read(path)
    if ncol(table) != 2
        error("dataframe should have only two columns: Date,Value")
    end
    # Exactly two columns are guaranteed here, so the names destructure cleanly.
    datecol, valcol = names(table)
    rename!(table, datecol => :Date, valcol => :Value)
    table[:Date] = DateTime.(table[:Date], datefmt)
    return table
end

# Reader for the sample file; the date format is the same one used when
# `testdata.csv` is parsed by hand earlier in this script.
csvreader = CSVDateValReader(Dict(:filename=>"testdata.csv",:dateformat=>"d/m/y H:M"))

# Run the reader on its own: check its settings, then load the file.
fit!(csvreader)
transform!(csvreader)

0 comments on commit dbd2c20

Please sign in to comment.