Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

canopy changes #823

Merged
merged 2 commits into from Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
38 changes: 25 additions & 13 deletions common/core/src/main/java/zingg/common/core/block/Block.java
Expand Up @@ -66,16 +66,13 @@ public void setDupes(ZFrame<D,R,C> dupes) {
/**
* @return the types
*
* public Class[] getTypes() { return types; }
*/

/**
* @param types
* the types to set
* the types to set
*
* public void setTypes(Class[] types) { this.types = types; }
*
* /**
*
* @return the maxSize
*/
public long getMaxSize() {
Expand All @@ -84,7 +81,7 @@ public long getMaxSize() {

/**
* @param maxSize
* the maxSize to set
* the maxSize to set
*/
public void setMaxSize(long maxSize) {
this.maxSize = maxSize;
Expand All @@ -102,17 +99,24 @@ protected void setFunctionsMap(ListMap<T, HashFunction<D,R,C,T>> m) {
this.functionsMap = m;
}

/**
 * Creates the empty canopy instance that {@code getNodeFromCurrent} copies the
 * current node into.
 *
 * @return a newly constructed {@code Canopy<R>}
 */
protected Canopy<R> getCanopy() {
    Canopy<R> fresh = new Canopy<>();
    return fresh;
}

/**
 * Builds a trial canopy from an existing node for one hash function / field pair.
 * A new canopy is obtained, the result of {@code node.copyTo(trial)} becomes the
 * trial, and the trial's {@code function} and {@code context} are then overwritten
 * with the supplied arguments.
 *
 * @param node     the canopy whose state is copied via {@code copyTo}
 * @param function the hash function to assign to the trial
 * @param context  the field definition to assign to the trial
 * @return the trial canopy carrying {@code function} and {@code context}
 */
public Canopy<R>getNodeFromCurrent(Canopy<R>node, HashFunction<D,R,C,T> function,
FieldDefinition context) {
Canopy<R>trial = new Canopy<R>();
Canopy<R>trial = getCanopy();
// NOTE(review): copyTo presumably transfers node's state into trial and returns it — confirm in Canopy
trial = node.copyTo(trial);
// node.training, node.dupeN, function, context);
trial.function = function;
trial.context = context;
return trial;
}

/**
 * Runs elimination-count estimation on the given canopy by delegating to
 * {@code c.estimateElimCount()}.
 *
 * @param c         the canopy on which the estimate is computed
 * @param elimCount not read by this implementation.
 *                  NOTE(review): getBestNode passes its current {@code least} value
 *                  here — presumably intended for overriding subclasses; confirm
 */
public void estimateElimCount(Canopy<R> c, long elimCount) {
c.estimateElimCount();
}

public abstract T getDataTypeFromString(String t);

public Canopy<R>getBestNode(Tree<Canopy<R>> tree, Canopy<R>parent, Canopy<R>node,
Expand All @@ -122,14 +126,18 @@ protected void setFunctionsMap(ListMap<T, HashFunction<D,R,C,T>> m) {
Canopy<R>best = null;

for (FieldDefinition field : fieldsOfInterest) {
LOG.debug("Trying for " + field + " with data type " + field.getDataType() + " and real dt "
+ getDataTypeFromString(field.getDataType()));
if (LOG.isDebugEnabled()){
LOG.debug("Trying for " + field + " with data type " + field.getDataType() + " and real dt "
+ getDataTypeFromString(field.getDataType()));
}
//Class type = FieldClass.getFieldClassClass(field.getFieldClass());
FieldDefinition context = field;
if (least ==0) break;//how much better can it get?
// applicable functions
List<HashFunction<D,R,C,T>> functions = functionsMap.get(getDataTypeFromString(field.getDataType()));
LOG.debug("functions are " + functions);
if (LOG.isDebugEnabled()){
LOG.debug("functions are " + functions);
}

if (functions != null) {

Expand All @@ -140,11 +148,13 @@ protected void setFunctionsMap(ListMap<T, HashFunction<D,R,C,T>> m) {
//!childless.contains(function, field.fieldName)
)
{
LOG.debug("Evaluating field " + field.fieldName
if (LOG.isDebugEnabled()){
LOG.debug("Evaluating field " + field.fieldName
+ " and function " + function + " for " + field.dataType);
}
Canopy<R>trial = getNodeFromCurrent(node, function,
context);
trial.estimateElimCount();
estimateElimCount(trial, least);
long elimCount = trial.getElimCount();


Expand Down Expand Up @@ -178,7 +188,9 @@ protected void setFunctionsMap(ListMap<T, HashFunction<D,R,C,T>> m) {
}*/
}
else {
LOG.debug("No child " + function);
if (LOG.isDebugEnabled()){
LOG.debug("No child " + function);
}
//childless.add(function, field.fieldName);
}

Expand Down
14 changes: 7 additions & 7 deletions common/core/src/main/java/zingg/common/core/block/Canopy.java
Expand Up @@ -20,19 +20,19 @@ public class Canopy<R> implements Serializable {
public static final Log LOG = LogFactory.getLog(Canopy.class);

// created by function edge leading from parent to this node
HashFunction function;
protected HashFunction function;
// applied on field
FieldDefinition context;
protected FieldDefinition context;
// list of duplicates passed from parent
List<R> dupeN;
protected List<R> dupeN;
// number of duplicates eliminated after function applied on fn context
long elimCount;
protected long elimCount;
// hash of canopy
Object hash;
protected Object hash;
// training set
List<R> training;
protected List<R> training;
// duplicates remaining after function is applied
List<R> dupeRemaining;
protected List<R> dupeRemaining;

public Canopy() {
}
Expand Down
Expand Up @@ -32,7 +32,7 @@ public String call(String field) {
r = field.trim().substring(0, endIndex);
}
}
LOG.debug("Applying " + this.getName() + " on " + field + " and returning " + r);
//LOG.debug("Applying " + this.getName() + " on " + field + " and returning " + r);
return r;
}

Expand Down
1 change: 1 addition & 0 deletions spark/core/src/test/java/zingg/block/TestBlock.java
Expand Up @@ -50,6 +50,7 @@ public void testTree() throws Throwable {
// primary deciding is unique year so identityInteger should have been picked
Canopy<Row> head = blockingTree.getHead();
assertEquals("identityInteger", head.getFunction().getName());
blockingTree.toString();

}

Expand Down