Skip to content

Commit

Permalink
Merge pull request #559 from zinggAI/0.3.5
Browse files Browse the repository at this point in the history
issue #473 and #285
  • Loading branch information
sonalgoyal committed Apr 1, 2023
2 parents dd0ab0e + 9c7b6ce commit 75970fb
Show file tree
Hide file tree
Showing 407 changed files with 85 additions and 79 deletions.
2 changes: 1 addition & 1 deletion examples/amazon-google/AmazonGoogle.py
Expand Up @@ -33,7 +33,7 @@

args.setOutput(outputPipe)

options = ClientOptions([ClientOptions.PHASE,"link"])
options = ClientOptions([ClientOptions.PHASE,"match"])

#Zingg execution for the given phase
zingg = Zingg(args, options)
Expand Down
22 changes: 2 additions & 20 deletions examples/amazon-google/configWithStopWords.json
Expand Up @@ -48,16 +48,7 @@
"delimiter": ",",
"header":true
},
"schema":
"{\"type\" : \"struct\",
\"fields\" : [
{\"name\":\"id\", \"type\":\"string\", \"nullable\":false},
{\"name\":\"title\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"description\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"manufacturer\",\"type\":\"string\",\"nullable\":true} ,
{\"name\":\"price\", \"type\":\"double\", \"nullable\":true}
]
}"
"schema": "id string, title string, description string, manufacturer string, price double"
},
{
"name":"google",
Expand All @@ -67,16 +58,7 @@
"delimiter": ",",
"header":true
},
"schema":
"{\"type\" : \"struct\",
\"fields\" : [
{\"name\":\"id\", \"type\":\"string\", \"nullable\":false},
{\"name\":\"title\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"description\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"manufacturer\",\"type\":\"string\",\"nullable\":true} ,
{\"name\":\"price\", \"type\":\"double\", \"nullable\":true}
]
}"
"schema": "id string, title string, description string, manufacturer string, price double"
}
],
"labelDataSampleSize" : 0.4,
Expand Down
2 changes: 1 addition & 1 deletion examples/febrl/FebrlExample.py
Expand Up @@ -37,7 +37,7 @@

args.setOutput(outputPipe)

options = ClientOptions([ClientOptions.PHASE,"label"])
options = ClientOptions([ClientOptions.PHASE,"match"])

#Zingg execution for the given phase
zingg = Zingg(args, options)
Expand Down
2 changes: 1 addition & 1 deletion examples/iTunes-amazon/iTunesAmazon.py
Expand Up @@ -37,7 +37,7 @@
outputPipe = CsvPipe("iTunesAmazonresult", "/tmp/iTunesAmazonOutput")
args.setOutput(outputPipe)

options = ClientOptions([ClientOptions.PHASE,"link"])
options = ClientOptions([ClientOptions.PHASE,"match"])

#Zingg execution for the given phase
zingg = Zingg(args, options)
Expand Down
6 changes: 3 additions & 3 deletions examples/ncVoters5M/ncVoters.py
Expand Up @@ -8,7 +8,7 @@
givenname = FieldDefinition("givenname", "string", MatchType.FUZZY)
surname = FieldDefinition("surname", "string", MatchType.EXACT)
suburb = FieldDefinition("suburb","string", MatchType.FUZZY)
postcode = FieldDefinition("postcode", "double", MatchType.EXACT)
postcode = FieldDefinition("postcode", "string", MatchType.EXACT)

fieldDefs = [recid, givenname, surname, suburb, postcode]
args.setFieldDefinition(fieldDefs)
Expand All @@ -21,7 +21,7 @@
#reading dataset into inputPipe and setting it up in 'args'
#the line below should not be required if you are reading from an in-memory dataset
#in that case, replace df with input df
schema = "recid string, givenname string, surname string, suburb string, postcode double "
schema = "recid string, givenname string, surname string, suburb string, postcode string "
inputPipe = CsvPipe("ncVotersTest", "examples/ncVoters5M/5Party-ocp20/", schema)
args.setData(inputPipe)

Expand All @@ -30,7 +30,7 @@

args.setOutput(outputPipe)

options = ClientOptions([ClientOptions.PHASE,"trainMatch"])
options = ClientOptions([ClientOptions.PHASE,"match"])

#Zingg execution for the given phase
zingg = Zingg(args, options)
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1 +1 @@
{"class":"org.apache.spark.ml.PipelineModel","timestamp":1679653593791,"sparkVersion":"3.1.2","uid":"pipeline_37bd53de1842","paramMap":{"stageUids":["vecAssembler_ab6a72b9ff3a","poly_5441bacea924","logreg_8bb21326b817"]},"defaultParamMap":{}}
{"class":"org.apache.spark.ml.PipelineModel","timestamp":1680262758849,"sparkVersion":"3.1.2","uid":"pipeline_7af7179ee2c9","paramMap":{"stageUids":["vecAssembler_d9c5b06776d6","poly_9e0e03752d9e","logreg_c64f720bac8d"]},"defaultParamMap":{}}
Binary file not shown.
Binary file not shown.
@@ -1 +1 @@
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1679653593336,"sparkVersion":"3.1.2","uid":"vecAssembler_ab6a72b9ff3a","paramMap":{"outputCol":"z_featurevector","inputCols":["z_sim0","z_sim1","z_sim2","z_sim3","z_sim4","z_sim5","z_sim6","z_sim7","z_sim8","z_sim9","z_sim10","z_sim11","z_sim12","z_sim13","z_sim14","z_sim15","z_sim16","z_sim17","z_sim18","z_sim19"]},"defaultParamMap":{"outputCol":"vecAssembler_ab6a72b9ff3a__output","handleInvalid":"error"}}
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1680262758941,"sparkVersion":"3.1.2","uid":"vecAssembler_d9c5b06776d6","paramMap":{"outputCol":"z_featurevector","inputCols":["z_sim0","z_sim1","z_sim2","z_sim3","z_sim4","z_sim5","z_sim6","z_sim7","z_sim8","z_sim9","z_sim10","z_sim11","z_sim12","z_sim13","z_sim14","z_sim15","z_sim16","z_sim17","z_sim18","z_sim19"]},"defaultParamMap":{"outputCol":"vecAssembler_d9c5b06776d6__output","handleInvalid":"error"}}
Binary file not shown.

This file was deleted.

Binary file not shown.
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.PolynomialExpansion","timestamp":1680262759042,"sparkVersion":"3.1.2","uid":"poly_9e0e03752d9e","paramMap":{"inputCol":"z_featurevector","degree":3,"outputCol":"z_feature"},"defaultParamMap":{"degree":2,"outputCol":"poly_9e0e03752d9e__output"}}
Binary file not shown.
Binary file not shown.
Binary file not shown.

This file was deleted.

Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.classification.LogisticRegressionModel","timestamp":1680262759138,"sparkVersion":"3.1.2","uid":"logreg_c64f720bac8d","paramMap":{"maxIter":100,"featuresCol":"z_feature","labelCol":"z_isMatch","fitIntercept":true,"regParam":1.0E-4,"probabilityCol":"z_probability","threshold":0.4,"predictionCol":"z_prediction"},"defaultParamMap":{"maxIter":100,"featuresCol":"features","elasticNetParam":0.0,"labelCol":"label","fitIntercept":true,"regParam":0.0,"maxBlockSizeInMB":0.0,"rawPredictionCol":"rawPrediction","probabilityCol":"probability","aggregationDepth":2,"tol":1.0E-6,"family":"auto","threshold":0.5,"predictionCol":"prediction","standardization":true}}
Binary file not shown.
@@ -1 +1 @@
{"class":"org.apache.spark.ml.Pipeline","timestamp":1679653593142,"sparkVersion":"3.1.2","uid":"pipeline_37bd53de1842","paramMap":{"stageUids":["vecAssembler_ab6a72b9ff3a","poly_5441bacea924","logreg_8bb21326b817"]},"defaultParamMap":{}}
{"class":"org.apache.spark.ml.Pipeline","timestamp":1680262758290,"sparkVersion":"3.1.2","uid":"pipeline_7af7179ee2c9","paramMap":{"stageUids":["vecAssembler_d9c5b06776d6","poly_9e0e03752d9e","logreg_c64f720bac8d"]},"defaultParamMap":{}}
Binary file not shown.
Binary file not shown.
@@ -1 +1 @@
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1679653593881,"sparkVersion":"3.1.2","uid":"vecAssembler_ab6a72b9ff3a","paramMap":{"outputCol":"z_featurevector","inputCols":["z_sim0","z_sim1","z_sim2","z_sim3","z_sim4","z_sim5","z_sim6","z_sim7","z_sim8","z_sim9","z_sim10","z_sim11","z_sim12","z_sim13","z_sim14","z_sim15","z_sim16","z_sim17","z_sim18","z_sim19"]},"defaultParamMap":{"outputCol":"vecAssembler_ab6a72b9ff3a__output","handleInvalid":"error"}}
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1680262758441,"sparkVersion":"3.1.2","uid":"vecAssembler_d9c5b06776d6","paramMap":{"outputCol":"z_featurevector","inputCols":["z_sim0","z_sim1","z_sim2","z_sim3","z_sim4","z_sim5","z_sim6","z_sim7","z_sim8","z_sim9","z_sim10","z_sim11","z_sim12","z_sim13","z_sim14","z_sim15","z_sim16","z_sim17","z_sim18","z_sim19"]},"defaultParamMap":{"outputCol":"vecAssembler_d9c5b06776d6__output","handleInvalid":"error"}}
Binary file not shown.

This file was deleted.

Binary file not shown.
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.PolynomialExpansion","timestamp":1680262758623,"sparkVersion":"3.1.2","uid":"poly_9e0e03752d9e","paramMap":{"inputCol":"z_featurevector","degree":3,"outputCol":"z_feature"},"defaultParamMap":{"degree":2,"outputCol":"poly_9e0e03752d9e__output"}}
Binary file not shown.

This file was deleted.

Binary file not shown.
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.classification.LogisticRegression","timestamp":1680262758752,"sparkVersion":"3.1.2","uid":"logreg_c64f720bac8d","paramMap":{"maxIter":100,"featuresCol":"z_feature","labelCol":"z_isMatch","fitIntercept":true,"probabilityCol":"z_probability","predictionCol":"z_prediction"},"defaultParamMap":{"maxIter":100,"featuresCol":"features","elasticNetParam":0.0,"labelCol":"label","fitIntercept":true,"regParam":0.0,"maxBlockSizeInMB":0.0,"rawPredictionCol":"rawPrediction","probabilityCol":"probability","aggregationDepth":2,"tol":1.0E-6,"family":"auto","threshold":0.5,"predictionCol":"prediction","standardization":true}}
Binary file not shown.
@@ -1 +1 @@
{"class":"org.apache.spark.ml.evaluation.BinaryClassificationEvaluator","timestamp":1679653592846,"sparkVersion":"3.1.2","uid":"binEval_193e68f2fb20","paramMap":{"labelCol":"z_isMatch"},"defaultParamMap":{"metricName":"areaUnderROC","rawPredictionCol":"rawPrediction","labelCol":"label","numBins":1000}}
{"class":"org.apache.spark.ml.evaluation.BinaryClassificationEvaluator","timestamp":1680262758064,"sparkVersion":"3.1.2","uid":"binEval_d85e7822d500","paramMap":{"labelCol":"z_isMatch"},"defaultParamMap":{"metricName":"areaUnderROC","labelCol":"label","numBins":1000,"rawPredictionCol":"rawPrediction"}}
Binary file modified models/100/model/classifier/best.model/metadata/.part-00000.crc
Binary file not shown.
2 changes: 1 addition & 1 deletion models/100/model/classifier/best.model/metadata/part-00000
@@ -1 +1 @@
{"class":"org.apache.spark.ml.tuning.CrossValidatorModel","timestamp":1679653592157,"sparkVersion":"3.1.2","uid":"cv_9205a151ad83","paramMap":{"seed":-1191137437,"numFolds":2,"foldCol":"","estimatorParamMaps":[[{"parent":"logreg_8bb21326b817","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_8bb21326b817","name"
:"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_8bb21326b817","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_8bb21326b817","name":"threshold","value":"0.55","isJson":"true"}]]},"defaultParamMap":{"seed":-1191137437,"numFolds":3,"foldCol":""},"avgMetrics":[0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.982142857142
8571],"persistSubModels":false}
{"class":"org.apache.spark.ml.tuning.CrossValidatorModel","timestamp":1680262757721,"sparkVersion":"3.1.2","uid":"cv_b6b3e46792cc","paramMap":{"numFolds":2,"seed":-1191137437,"foldCol":"","estimatorParamMaps":[[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.4","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.45","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_c64f720bac8d","name"
:"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.5","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"1.0E-4","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.001","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.01","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"0.1","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.55","isJson":"true"}],[{"parent":"logreg_c64f720bac8d","name":"regParam","value":"1.0","isJson":"true"},{"parent":"logreg_c64f720bac8d","name":"threshold","value":"0.55","isJson":"true"}]]},"defaultParamMap":{"numFolds":3,"seed":-1191137437,"foldCol":""},"avgMetrics":[0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.9821428571428571,0.982142857142
8571],"persistSubModels":false}
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1 +1 @@
{"class":"org.apache.spark.ml.PipelineModel","timestamp":1680245294860,"sparkVersion":"3.1.2","uid":"pipeline_f729a6b01032","paramMap":{"stageUids":["vecAssembler_5a4a15106c47","poly_993555b1f9db","logreg_d93ca38a2205"]},"defaultParamMap":{}}
{"class":"org.apache.spark.ml.PipelineModel","timestamp":1680265841416,"sparkVersion":"3.1.2","uid":"pipeline_f47d307ed35b","paramMap":{"stageUids":["vecAssembler_7e2cbf38f293","poly_747cf01a21ce","logreg_549169031548"]},"defaultParamMap":{}}
Binary file not shown.
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1680265841509,"sparkVersion":"3.1.2","uid":"vecAssembler_7e2cbf38f293","paramMap":{"inputCols":["z_sim0","z_sim1","z_sim2","z_sim3","z_sim4","z_sim5","z_sim6","z_sim7","z_sim8","z_sim9","z_sim10","z_sim11","z_sim12","z_sim13","z_sim14","z_sim15","z_sim16","z_sim17"],"outputCol":"z_featurevector"},"defaultParamMap":{"handleInvalid":"error","outputCol":"vecAssembler_7e2cbf38f293__output"}}
Binary file not shown.
@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.PolynomialExpansion","timestamp":1680265841610,"sparkVersion":"3.1.2","uid":"poly_747cf01a21ce","paramMap":{"outputCol":"z_feature","inputCol":"z_featurevector","degree":3},"defaultParamMap":{"outputCol":"poly_747cf01a21ce__output","degree":2}}
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 75970fb

Please sign in to comment.