Skip to content

Commit

Permalink
Merge pull request #823 from zinggAI/enterprise
Browse files Browse the repository at this point in the history
canopy changes
  • Loading branch information
vikasgupta78 committed Apr 22, 2024
2 parents 1c8b1c2 + bd48ac9 commit bb109e1
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 21 deletions.
38 changes: 25 additions & 13 deletions common/core/src/main/java/zingg/common/core/block/Block.java
Expand Up @@ -66,16 +66,13 @@ public void setDupes(ZFrame<D,R,C> dupes) {
/**
* @return the types
*
* public Class[] getTypes() { return types; }
*/

/**
* @param types
* the types to set
* the types to set
*
* public void setTypes(Class[] types) { this.types = types; }
*
* /**
*
* @return the maxSize
*/
public long getMaxSize() {
Expand All @@ -84,7 +81,7 @@ public long getMaxSize() {

/**
* @param maxSize
* the maxSize to set
* the maxSize to set
*/
public void setMaxSize(long maxSize) {
this.maxSize = maxSize;
Expand All @@ -102,17 +99,24 @@ protected void setFunctionsMap(ListMap<T, HashFunction<D,R,C,T>> m) {
this.functionsMap = m;
}

/**
 * Factory method for the canopy instances this block works with.
 *
 * <p>This implementation returns a plain {@code Canopy<R>} built with its
 * no-argument constructor. Presumably subclasses override it to supply a
 * specialised canopy - TODO confirm against subclasses.
 *
 * @return a newly constructed canopy
 */
protected Canopy<R> getCanopy() {
    Canopy<R> fresh = new Canopy<R>();
    return fresh;
}

/**
 * Builds a candidate canopy from {@code node} for one hash function / field pair.
 *
 * <p>A canopy obtained from {@link #getCanopy()} is handed to
 * {@code node.copyTo(...)}; whatever that call returns is then tagged with the
 * function and field to evaluate.
 *
 * @param node     the canopy whose state is copied into the candidate
 * @param function the hash function to assign to the candidate
 * @param context  the field definition to assign to the candidate
 * @return the canopy returned by {@code node.copyTo}, with {@code function}
 *         and {@code context} set
 */
public Canopy<R> getNodeFromCurrent(Canopy<R> node, HashFunction<D,R,C,T> function,
        FieldDefinition context) {
    // Obtain the instance through the factory method instead of constructing
    // Canopy directly, and declare the local exactly once.
    Canopy<R> trial = node.copyTo(getCanopy());
    trial.function = function;
    trial.context = context;
    return trial;
}

/**
 * Asks the given canopy to estimate its elimination count.
 *
 * <p>This implementation simply delegates to {@code c.estimateElimCount()} and
 * does not read {@code elimCount}. NOTE(review): the extra parameter looks like
 * it exists for overriding implementations - confirm against subclasses.
 *
 * @param c         the canopy whose elimination count is to be estimated
 * @param elimCount not used by this implementation
 */
public void estimateElimCount(final Canopy<R> c, final long elimCount) {
    c.estimateElimCount();
}

/**
 * Converts the string form of a data type into this block's type token {@code T}.
 *
 * <p>{@code getBestNode} passes {@code field.getDataType()} to this method and
 * uses the result as the key when looking up hash functions in {@code functionsMap}.
 *
 * @param t the data type as a string
 * @return the corresponding {@code T} value
 */
public abstract T getDataTypeFromString(String t);

public Canopy<R>getBestNode(Tree<Canopy<R>> tree, Canopy<R>parent, Canopy<R>node,
Expand All @@ -122,14 +126,18 @@ protected void setFunctionsMap(ListMap<T, HashFunction<D,R,C,T>> m) {
Canopy<R>best = null;

for (FieldDefinition field : fieldsOfInterest) {
LOG.debug("Trying for " + field + " with data type " + field.getDataType() + " and real dt "
+ getDataTypeFromString(field.getDataType()));
if (LOG.isDebugEnabled()){
LOG.debug("Trying for " + field + " with data type " + field.getDataType() + " and real dt "
+ getDataTypeFromString(field.getDataType()));
}
//Class type = FieldClass.getFieldClassClass(field.getFieldClass());
FieldDefinition context = field;
if (least ==0) break;//how much better can it get?
// applicable functions
List<HashFunction<D,R,C,T>> functions = functionsMap.get(getDataTypeFromString(field.getDataType()));
LOG.debug("functions are " + functions);
if (LOG.isDebugEnabled()){
LOG.debug("functions are " + functions);
}

if (functions != null) {

Expand All @@ -140,11 +148,13 @@ protected void setFunctionsMap(ListMap<T, HashFunction<D,R,C,T>> m) {
//!childless.contains(function, field.fieldName)
)
{
LOG.debug("Evaluating field " + field.fieldName
if (LOG.isDebugEnabled()){
LOG.debug("Evaluating field " + field.fieldName
+ " and function " + function + " for " + field.dataType);
}
Canopy<R>trial = getNodeFromCurrent(node, function,
context);
trial.estimateElimCount();
estimateElimCount(trial, least);
long elimCount = trial.getElimCount();


Expand Down Expand Up @@ -178,7 +188,9 @@ protected void setFunctionsMap(ListMap<T, HashFunction<D,R,C,T>> m) {
}*/
}
else {
LOG.debug("No child " + function);
if (LOG.isDebugEnabled()){
LOG.debug("No child " + function);
}
//childless.add(function, field.fieldName);
}

Expand Down
14 changes: 7 additions & 7 deletions common/core/src/main/java/zingg/common/core/block/Canopy.java
Expand Up @@ -20,19 +20,19 @@ public class Canopy<R> implements Serializable {
public static final Log LOG = LogFactory.getLog(Canopy.class);

// created by function edge leading from parent to this node
HashFunction function;
protected HashFunction function;
// applied on field
FieldDefinition context;
protected FieldDefinition context;
// list of duplicates passed from parent
List<R> dupeN;
protected List<R> dupeN;
// number of duplicates eliminated after function applied on fn context
long elimCount;
protected long elimCount;
// hash of canopy
Object hash;
protected Object hash;
// training set
List<R> training;
protected List<R> training;
// duplicates remaining after function is applied
List<R> dupeRemaining;
protected List<R> dupeRemaining;

public Canopy() {
}
Expand Down
Expand Up @@ -32,7 +32,7 @@ public String call(String field) {
r = field.trim().substring(0, endIndex);
}
}
LOG.debug("Applying " + this.getName() + " on " + field + " and returning " + r);
//LOG.debug("Applying " + this.getName() + " on " + field + " and returning " + r);
return r;
}

Expand Down
1 change: 1 addition & 0 deletions spark/core/src/test/java/zingg/block/TestBlock.java
Expand Up @@ -50,6 +50,7 @@ public void testTree() throws Throwable {
// primary deciding is unique year so identityInteger should have been picked
Canopy<Row> head = blockingTree.getHead();
assertEquals("identityInteger", head.getFunction().getName());
blockingTree.toString();

}

Expand Down

0 comments on commit bb109e1

Please sign in to comment.