Skip to content

Commit

Permalink
Merge pull request #824 from zinggAI/issue821
Browse files Browse the repository at this point in the history
added null or blank and exact match type to int, long and date issue#821
  • Loading branch information
vikasgupta78 committed May 1, 2024
2 parents cb52bbd + ce80e07 commit e8e4c06
Show file tree
Hide file tree
Showing 12 changed files with 299 additions and 3 deletions.
Expand Up @@ -4,10 +4,14 @@

import zingg.common.client.FieldDefinition;
import zingg.common.client.MatchType;
import zingg.common.core.similarity.function.CheckNullFunction;
import zingg.common.core.similarity.function.DateSimilarityFunction;
import zingg.common.core.similarity.function.SimilarityFunctionExact;

public class DateFeature extends BaseFeature<Date> {

private static final long serialVersionUID = 1L;

public DateFeature() {

}
Expand All @@ -28,6 +32,12 @@ public void init(FieldDefinition f) {
if (f.getMatchType().contains(MatchType.FUZZY)) {
addSimFunction(new DateSimilarityFunction());
}
if (f.getMatchType().contains(MatchType.EXACT)) {
addSimFunction(new SimilarityFunctionExact<Date>("DateSimilarityFunctionExact"));
}
if (f.getMatchType().contains(MatchType.NULL_OR_BLANK)) {
addSimFunction(new CheckNullFunction<Date>("CheckNullFunctionDate"));
}
}

}
Expand Up @@ -2,9 +2,13 @@

import zingg.common.client.FieldDefinition;
import zingg.common.client.MatchType;
import zingg.common.core.similarity.function.CheckNullFunction;
import zingg.common.core.similarity.function.IntegerSimilarityFunction;
import zingg.common.core.similarity.function.SimilarityFunctionExact;
public class IntFeature extends BaseFeature<Integer> {

private static final long serialVersionUID = 1L;

public IntFeature() {

}
Expand All @@ -14,6 +18,12 @@ public void init(FieldDefinition newParam) {
if (newParam.getMatchType().contains(MatchType.FUZZY)) {
addSimFunction(new IntegerSimilarityFunction());
}
if (newParam.getMatchType().contains(MatchType.EXACT)) {
addSimFunction(new SimilarityFunctionExact<Integer>("IntegerSimilarityFunctionExact"));
}
if (newParam.getMatchType().contains(MatchType.NULL_OR_BLANK)) {
addSimFunction(new CheckNullFunction<Integer>("CheckNullFunctionInt"));
}
}

}
Expand Up @@ -2,7 +2,9 @@

import zingg.common.client.FieldDefinition;
import zingg.common.client.MatchType;
import zingg.common.core.similarity.function.CheckNullFunction;
import zingg.common.core.similarity.function.LongSimilarityFunction;
import zingg.common.core.similarity.function.SimilarityFunctionExact;
public class LongFeature extends BaseFeature<Long> {

private static final long serialVersionUID = 1L;
Expand All @@ -16,6 +18,12 @@ public void init(FieldDefinition newParam) {
if (newParam.getMatchType().contains(MatchType.FUZZY)) {
addSimFunction(new LongSimilarityFunction());
}
if (newParam.getMatchType().contains(MatchType.EXACT)) {
addSimFunction(new SimilarityFunctionExact<Long>("LongSimilarityFunctionExact"));
}
if (newParam.getMatchType().contains(MatchType.NULL_OR_BLANK)) {
addSimFunction(new CheckNullFunction<Long>("CheckNullFunctionLong"));
}
}

}
@@ -0,0 +1,26 @@
package zingg.common.core.similarity.function;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Similarity function that scores a pair purely on whether both values are present.
 *
 * <p>The score is {@code 1.0} when neither value is {@code null} and {@code 0.0}
 * when at least one of them is {@code null}; the actual values are never compared.
 *
 * @param <T> type of the values being checked
 */
public class CheckNullFunction<T> extends SimFunction<T> {

    public static final Log LOG = LogFactory.getLog(CheckNullFunction.class);

    private static final long serialVersionUID = 1L;

    /**
     * Creates the function under the given name.
     *
     * @param name name handed to the {@code SimFunction} superclass constructor
     */
    public CheckNullFunction(String name) {
        super(name);
    }

    /**
     * Scores the presence of both values.
     *
     * @param first  first value of the pair, may be {@code null}
     * @param second second value of the pair, may be {@code null}
     * @return {@code 1.0} if both values are non-null, {@code 0.0} otherwise
     */
    @Override
    public Double call(T first, T second) {
        boolean eitherMissing = first == null || second == null;
        return eitherMissing ? 0d : 1d;
    }

}
@@ -0,0 +1,21 @@
package zingg.common.core.similarity.function;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Exact-match similarity function for arbitrary value types.
 *
 * <p>Two non-null values score {@code 1.0} when {@code first.equals(second)} holds and
 * {@code 0.0} otherwise. If either value is {@code null} the pair scores {@code 1.0}.
 *
 * @param <T> type of the values being compared
 */
public class SimilarityFunctionExact<T> extends SimFunction<T> {

    public static final Log LOG = LogFactory.getLog(SimilarityFunctionExact.class);

    private static final long serialVersionUID = 1L;

    /**
     * Creates the function under the given name.
     *
     * @param name name handed to the {@code SimFunction} superclass constructor
     */
    public SimilarityFunctionExact(String name) {
        super(name);
    }

    /**
     * Compares the two values for equality.
     *
     * @param first  first value of the pair, may be {@code null}
     * @param second second value of the pair, may be {@code null}
     * @return {@code 1.0} if either value is {@code null} or the values are equal,
     *         {@code 0.0} if both are present and differ
     */
    @Override
    public Double call(T first, T second) {
        if (first != null && second != null) {
            return first.equals(second) ? 1d : 0d;
        }
        // A missing value on either side is scored as a match.
        return 1d;
    }
}
@@ -0,0 +1,34 @@
package zingg.common.core.similarity.function;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.util.Date;

import org.junit.jupiter.api.Test;
/**
 * Tests for {@link CheckNullFunction} over {@link Date} values: the score is 1 only
 * when both dates are present.
 */
public class TestCheckNullFunctionDate {

    /** A missing first date scores 0. */
    @Test
    public void testFirstNull() {
        Date present = new Date(2);
        Double score = simFunc().call(null, present);
        assertEquals(0d, score);
    }

    /** A missing second date scores 0. */
    @Test
    public void testSecondNull() {
        Date present = new Date(1);
        Double score = simFunc().call(present, null);
        assertEquals(0d, score);
    }

    /** Two missing dates score 0. */
    @Test
    public void testBothNull() {
        Double score = simFunc().call(null, null);
        assertEquals(0d, score);
    }

    /** Two present dates score 1 even though they differ. */
    @Test
    public void testBothNotNull() {
        Date earlier = new Date(1);
        Date later = new Date(2);
        Double score = simFunc().call(earlier, later);
        assertEquals(1d, score);
    }

    /** Builds the function under test. */
    protected CheckNullFunction<Date> simFunc() {
        return new CheckNullFunction<>("CheckNullFunctionDate");
    }

}
@@ -0,0 +1,34 @@
package zingg.common.core.similarity.function;

import static org.junit.jupiter.api.Assertions.assertEquals;

import org.junit.jupiter.api.Test;

/**
 * Tests for {@link CheckNullFunction} over {@link Integer} values: the score is 1 only
 * when both integers are present.
 */
public class TestCheckNullFunctionInt {

    /** A missing first integer scores 0. */
    @Test
    public void testFirstNull() {
        Integer present = 2;
        Double score = simFunc().call(null, present);
        assertEquals(0d, score);
    }

    /** A missing second integer scores 0. */
    @Test
    public void testSecondNull() {
        Integer present = 1;
        Double score = simFunc().call(present, null);
        assertEquals(0d, score);
    }

    /** Two missing integers score 0. */
    @Test
    public void testBothNull() {
        Double score = simFunc().call(null, null);
        assertEquals(0d, score);
    }

    /** Two present integers score 1 even though they differ. */
    @Test
    public void testBothNotNull() {
        Integer left = 1;
        Integer right = 2;
        Double score = simFunc().call(left, right);
        assertEquals(1d, score);
    }

    /** Builds the function under test. */
    protected CheckNullFunction<Integer> simFunc() {
        return new CheckNullFunction<>("CheckNullFunctionInt");
    }

}
@@ -0,0 +1,34 @@
package zingg.common.core.similarity.function;

import static org.junit.jupiter.api.Assertions.assertEquals;

import org.junit.jupiter.api.Test;

/**
 * Tests for {@link CheckNullFunction} over {@link Long} values: the score is 1 only
 * when both longs are present.
 */
public class TestCheckNullFunctionLong {

    /** A missing first long scores 0. */
    @Test
    public void testFirstNull() {
        Long present = 2L;
        Double score = simFunc().call(null, present);
        assertEquals(0d, score);
    }

    /** A missing second long scores 0. */
    @Test
    public void testSecondNull() {
        Long present = 1L;
        Double score = simFunc().call(present, null);
        assertEquals(0d, score);
    }

    /** Two missing longs score 0. */
    @Test
    public void testBothNull() {
        Double score = simFunc().call(null, null);
        assertEquals(0d, score);
    }

    /** Two present longs score 1 even though they differ. */
    @Test
    public void testBothNotNull() {
        Long left = 1L;
        Long right = 2L;
        Double score = simFunc().call(left, right);
        assertEquals(1d, score);
    }

    /** Builds the function under test. */
    protected CheckNullFunction<Long> simFunc() {
        return new CheckNullFunction<>("CheckNullFunctionLong");
    }

}
@@ -0,0 +1,42 @@
package zingg.common.core.similarity.function;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.util.Date;

import org.junit.jupiter.api.Test;

/**
 * Tests for {@link SimilarityFunctionExact} over {@link Date} values: a null on either
 * side scores 1, otherwise the score reflects equality.
 */
public class TestDateSimilarityFunctionExact {

    /** A missing first date is scored as a match. */
    @Test
    public void testFirstNull() {
        Date present = new Date(2);
        Double score = simFunc().call(null, present);
        assertEquals(1d, score);
    }

    /** A missing second date is scored as a match. */
    @Test
    public void testSecondNull() {
        Date present = new Date(1);
        Double score = simFunc().call(present, null);
        assertEquals(1d, score);
    }

    /** Two missing dates are scored as a match. */
    @Test
    public void testBothNull() {
        Double score = simFunc().call(null, null);
        assertEquals(1d, score);
    }

    /** Two different dates score 0. */
    @Test
    public void testNotEqual() {
        Date left = new Date(101);
        Date right = new Date(102);
        Double score = simFunc().call(left, right);
        assertEquals(0d, score);
    }

    /** Two equal dates score 1. */
    @Test
    public void testEqual() {
        Date left = new Date(101);
        Date right = new Date(101);
        Double score = simFunc().call(left, right);
        assertEquals(1d, score);
    }

    /** Builds the function under test. */
    protected SimilarityFunctionExact<Date> simFunc() {
        return new SimilarityFunctionExact<>("DateSimilarityFunctionExact");
    }

}
@@ -0,0 +1,38 @@
package zingg.common.core.similarity.function;

import static org.junit.jupiter.api.Assertions.assertEquals;

import org.junit.jupiter.api.Test;

/**
 * Tests for {@link SimilarityFunctionExact} over {@link Integer} values: a null on
 * either side scores 1, otherwise the score reflects equality.
 */
public class TestIntegerSimilarityFunctionExact {

    /** A missing first integer is scored as a match. */
    @Test
    public void testFirstNull() {
        Integer present = 2;
        Double score = simFunc().call(null, present);
        assertEquals(1d, score);
    }

    /** A missing second integer is scored as a match. */
    @Test
    public void testSecondNull() {
        Integer present = 1;
        Double score = simFunc().call(present, null);
        assertEquals(1d, score);
    }

    /** Two missing integers are scored as a match. */
    @Test
    public void testBothNull() {
        Double score = simFunc().call(null, null);
        assertEquals(1d, score);
    }

    /** Two different integers score 0. */
    @Test
    public void testNotEqual() {
        Integer left = 101;
        Integer right = 102;
        Double score = simFunc().call(left, right);
        assertEquals(0d, score);
    }

    /** Two equal integers score 1. */
    @Test
    public void testEqual() {
        Integer left = 101;
        Integer right = 101;
        Double score = simFunc().call(left, right);
        assertEquals(1d, score);
    }

    /** Builds the function under test. */
    protected SimilarityFunctionExact<Integer> simFunc() {
        return new SimilarityFunctionExact<>("IntegerSimilarityFunctionExact");
    }

}
@@ -0,0 +1,39 @@
package zingg.common.core.similarity.function;

import static org.junit.jupiter.api.Assertions.assertEquals;

import org.junit.jupiter.api.Test;

/**
 * Tests for {@link SimilarityFunctionExact} over {@link Long} values: a null on either
 * side scores 1, otherwise the score reflects equality.
 */
public class TestLongSimilarityFunctionExact {

    /** A missing first long is scored as a match. */
    @Test
    public void testFirstNull() {
        Long present = 2L;
        Double score = simFunc().call(null, present);
        assertEquals(1d, score);
    }

    /** A missing second long is scored as a match. */
    @Test
    public void testSecondNull() {
        Long present = 1L;
        Double score = simFunc().call(present, null);
        assertEquals(1d, score);
    }

    /** Two missing longs are scored as a match. */
    @Test
    public void testBothNull() {
        Double score = simFunc().call(null, null);
        assertEquals(1d, score);
    }

    /** Two different longs score 0. */
    @Test
    public void testNotEqual() {
        Long left = 101L;
        Long right = 102L;
        Double score = simFunc().call(left, right);
        assertEquals(0d, score);
    }

    /** Two equal longs score 1. */
    @Test
    public void testEqual() {
        Long left = 101L;
        Long right = 101L;
        Double score = simFunc().call(left, right);
        assertEquals(1d, score);
    }

    /** Builds the function under test. */
    protected SimilarityFunctionExact<Long> simFunc() {
        return new SimilarityFunctionExact<>("LongSimilarityFunctionExact");
    }

}
6 changes: 3 additions & 3 deletions docs/stepbystep/configuration/field-definitions.md
Expand Up @@ -32,12 +32,12 @@ Type of the column - string, integer, double, etc.

| Match Type | Description | Can be applied to |
| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------- |
| FUZZY | Broad matches with typos, abbreviations, and other variations. | string, integer, double, date |
| EXACT | No tolerance with variations, Preferable for country codes, pin codes, and other categorical variables where you expect no variations. | string |
| FUZZY | Broad matches with typos, abbreviations, and other variations. | string, integer, long, double, date |
| EXACT | No tolerance for variations. Preferable for country codes, pin codes, and other categorical variables where you expect no variations. | string, integer, long, date |
| DONT\_USE | Appears in the output but no computation is done on these. Helpful for fields like ids that are required in the output. DONT\_USE fields are not shown to the user while labeling, if [showConcise](field-definitions.md#showconcise) is set to true. | any |
| EMAIL | Matches only the id part of the email before the @ character | any |
| PINCODE | Matches pin codes like xxxxx-xxxx with xxxxx | string |
| NULL\_OR\_BLANK | By default Zingg treats nulls as matches, but if we add this to a field which has other match type like FUZZY, Zingg will build a feature for null values and learn | string |
| NULL\_OR\_BLANK | By default Zingg treats nulls as matches, but if we add this to a field that also has another match type such as FUZZY, Zingg will build a feature for null values and learn from it. | string, integer, long, date |
| TEXT | Compares word overlap between two strings. Good for descriptive fields without many typos. | string |
| NUMERIC | Extracts numbers from strings and compares how many of them are the same across both strings, for example apartment numbers. | string |
| NUMERIC\_WITH\_UNITS | Extracts product codes or numbers with units (for example, 16gb) from strings and compares how many are the same across both strings. | string |
Expand Down

0 comments on commit e8e4c06

Please sign in to comment.