From 24d61a62ee09eef2a930322cfbb0d4f13f13c5ca Mon Sep 17 00:00:00 2001 From: Shriprajwal K <40351128+ShriprajwalK@users.noreply.github.com> Date: Sun, 21 Nov 2021 17:26:33 +0530 Subject: [PATCH] feat: add k-gram shingle to Jaccard/Cosine sim (#11) * feat: add Shingle function * test: update unit tests for Cosine/Jaccard with shingle Co-authored-by: Shriprajwal K Co-authored-by: hbollon --- cosine.go | 22 +++++++++++++++++----- cosine_test.go | 27 ++++++++++++++++++++++++++- jaccard.go | 26 +++++++++++++++++++------- jaccard_test.go | 28 +++++++++++++++++++++++++++- shingle.go | 34 ++++++++++++++++++++++++++++++++++ shingle_test.go | 37 +++++++++++++++++++++++++++++++++++++ string-analysis.go | 5 +++-- string-analysis_test.go | 28 ++++++++++++++-------------- 8 files changed, 177 insertions(+), 30 deletions(-) create mode 100644 shingle.go create mode 100644 shingle_test.go diff --git a/cosine.go b/cosine.go index fe63a4d..e837c2e 100644 --- a/cosine.go +++ b/cosine.go @@ -8,12 +8,24 @@ import ( ) // CosineSimilarity use cosine algorithm to return a similarity index between string vectors -// Takes two strings as parameters and return an index. -// This algorithm is only effective between sentences and not unique words. -func CosineSimilarity(str1, str2 string) float32 { +// Takes two strings as parameters, a split length which define the k-gram single length +// (if zero split string on whitespaces) and return an index. 
+func CosineSimilarity(str1, str2 string, splitLength int) float32 { + if str1 == "" || str2 == "" { + return 0 + } + // Split string before rune conversion for cosine calculation - splittedStr1 := strings.Split(str1, " ") - splittedStr2 := strings.Split(str2, " ") + // If splitLength == 0 then split on whitespaces + // Else use shingle algorithm + var splittedStr1, splittedStr2 []string + if splitLength == 0 { + splittedStr1 = strings.Split(str1, " ") + splittedStr2 = strings.Split(str2, " ") + } else { + splittedStr1 = ShingleSlice(str1, splitLength) + splittedStr2 = ShingleSlice(str2, splitLength) + } // Conversion of plitted string into rune array runeStr1 := make([][]rune, len(splittedStr1)) diff --git a/cosine_test.go b/cosine_test.go index 4514d81..48574c2 100644 --- a/cosine_test.go +++ b/cosine_test.go @@ -36,13 +36,38 @@ func TestCosineSimilarity(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := CosineSimilarity(tt.args.str1, tt.args.str2); got != tt.want { + if got := CosineSimilarity(tt.args.str1, tt.args.str2, 0); got != tt.want { t.Errorf("CosineSimilarity() = %v, want %v", got, tt.want) } }) } } +func TestCosineShingleSimilarity(t *testing.T) { + type args struct { + str1 string + str2 string + } + tests := []struct { + name string + args args + want float32 + }{ + {"Cosine shingle sim 1", args{"Radiohead", "Carly Rae Jepsen"}, 0.09759001}, + {"Cosine shingle sim 2", args{"I love horror movies", "Lights out is a horror movie"}, 0.5335784}, + {"Cosine shingle sim 3", args{"love horror movies", "Lights out horror movie"}, 0.61977977}, + {"Cosine shingle sim 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです"}, 0.76980036}, + {"Cosine shingle sim 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂"}, 0.8944272}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := CosineSimilarity(tt.args.str1, tt.args.str2, 2); got != tt.want { + t.Errorf("CosineSimilarity() with shingle 2 = %v, want %v", got, tt.want) + } + 
}) + } +} + func Test_union(t *testing.T) { type args struct { a []string diff --git a/jaccard.go b/jaccard.go index 4e82560..79fff53 100644 --- a/jaccard.go +++ b/jaccard.go @@ -5,12 +5,25 @@ import ( ) // JaccardSimilarity compute the jaccard similarity coeffecient between two strings -// Takes two strings as parameters and return an index. -// This algorithm is only effective between sentences and not unique words. -func JaccardSimilarity(str1, str2 string) float32 { - // Split string before rune conversion for cosine calculation - splittedStr1 := strings.Split(str1, " ") - splittedStr2 := strings.Split(str2, " ") +// Takes two strings as parameters, a split length which define the k-gram single length +// (if zero split string on whitespaces) and return an index. +func JaccardSimilarity(str1, str2 string, splitLength int) float32 { + if str1 == "" || str2 == "" { + return 0 + } + + // Split string before rune conversion for jaccard calculation + // If splitLength == 0 then split on whitespaces + // Else use shingle algorithm + var splittedStr1, splittedStr2 []string + if splitLength == 0 { + splittedStr1 = strings.Split(str1, " ") + splittedStr2 = strings.Split(str2, " ") + } else { + splittedStr1 = ShingleSlice(str1, splitLength) + splittedStr2 = ShingleSlice(str2, splitLength) + } + // Conversion of splitted string into rune array runeStr1 := make([][]rune, len(splittedStr1)) for i, str := range splittedStr1 { @@ -23,7 +36,6 @@ func JaccardSimilarity(str1, str2 string) float32 { // Create union keywords slice between input strings unionStr := union(splittedStr1, splittedStr2) - jacc := float32(len(runeStr1) + len(runeStr2) - len(unionStr)) return jacc / float32(len(unionStr)) diff --git a/jaccard_test.go b/jaccard_test.go index ce737e8..b07a8fb 100644 --- a/jaccard_test.go +++ b/jaccard_test.go @@ -23,9 +23,35 @@ func TestJaccardSimilarity(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - if got := 
// Shingle computes the k-grams (shingles) of s and counts how many times each
// one occurs.
//
// The string is split on rune boundaries, not bytes, so multi-byte characters
// (CJK, emoji, ...) each count as a single position. Every window of k
// consecutive runes becomes a key of the returned map, and the associated
// value is the number of windows that produced that key.
//
// An empty, non-nil map is returned when s is empty, when k is zero or
// negative, or when k is greater than the number of runes in s.
func Shingle(s string, k int) map[string]int {
	m := make(map[string]int)

	// A non-positive k has no meaningful k-gram. A negative k must be rejected
	// explicitly: it would otherwise produce an inverted slice expression
	// (high bound below low bound) and panic at runtime.
	if s == "" || k <= 0 {
		return m
	}

	runeS := []rune(s)
	// i+k <= len(runeS) keeps every window inside the rune slice; when k is
	// larger than the input the loop body simply never runs.
	for i := 0; i+k <= len(runeS); i++ {
		m[string(runeS[i:i+k])]++
	}
	return m
}
// ShingleSlice computes the distinct k-grams (shingles) of s and returns them
// as a slice.
//
// The string is split on rune boundaries, so multi-byte characters each count
// as a single position. Every distinct window of k consecutive runes appears
// exactly once in the result, in the order of its first occurrence in s, which
// makes the output deterministic from one call to the next.
//
// An empty (nil) slice is returned when s is empty, when k is zero or
// negative, or when k is greater than the number of runes in s.
func ShingleSlice(s string, k int) []string {
	// A negative k must be rejected explicitly: it would otherwise produce an
	// inverted slice expression and panic at runtime.
	if s == "" || k <= 0 {
		return nil
	}

	runeS := []rune(s)
	windows := len(runeS) - k + 1
	if windows <= 0 {
		return nil
	}

	// seen is a pure membership set; out preserves first-occurrence order
	// instead of relying on map iteration order, which Go leaves unspecified.
	seen := make(map[string]struct{}, windows)
	out := make([]string, 0, windows)
	for i := 0; i < windows; i++ {
		gram := string(runeS[i : i+k])
		if _, dup := seen[gram]; dup {
			continue
		}
		seen[gram] = struct{}{}
		out = append(out, gram)
	}
	return out
}
index [0..1] between two strings based on given edit distance algorithm in parameter. // Use defined Algorithm type. +// Through this function, Cosine and Jaccard algorithms are used with Shingle split method with a length of 2. func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error) { switch algo { case Levenshtein: @@ -45,9 +46,9 @@ func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error case JaroWinkler: return JaroWinklerSimilarity(str1, str2), nil case Cosine: - return CosineSimilarity(str1, str2), nil + return CosineSimilarity(str1, str2, 2), nil case Jaccard: - return JaccardSimilarity(str1, str2), nil + return JaccardSimilarity(str1, str2, 2), nil default: return 0.0, errors.New("Illegal argument for algorithm method") } diff --git a/string-analysis_test.go b/string-analysis_test.go index c003f4d..46ea4d7 100644 --- a/string-analysis_test.go +++ b/string-analysis_test.go @@ -132,26 +132,26 @@ func TestStringsSimilarity(t *testing.T) { {"Cosine : Second arg empty", args{"abcde", "", Cosine}, 0.0, false}, {"Cosine : Same args", args{"abcde", "abcde", Cosine}, 1.0, false}, {"Cosine : No characters match", args{"abcd", "effgghh", Cosine}, 0.0, false}, - {"Cosine : CRATE/TRACE", args{"CRATE", "TRACE", Cosine}, 0.0, false}, - {"Cosine : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Cosine}, 0.0, false}, - {"Cosine : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Cosine}, 0.0, false}, - {"Cosine : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Cosine}, 0.0, false}, - {"Cosine : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Cosine}, 0.20412414, false}, - {"Cosine : Sentence 3", args{"love horror movies", "Lights out horror movie", Cosine}, 0.28867513, false}, + {"Cosine : CRATE/TRACE", args{"CRATE", "TRACE", Cosine}, 0.25, false}, + {"Cosine : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Cosine}, 0.4, false}, + {"Cosine : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Cosine}, 0.3779645, false}, + 
{"Cosine : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Cosine}, 0.09759001, false}, + {"Cosine : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Cosine}, 0.5335784, false}, + {"Cosine : Sentence 3", args{"love horror movies", "Lights out horror movie", Cosine}, 0.61977977, false}, // Jaccard method {"Jaccard : First arg empty", args{"", "abcde", Jaccard}, 0.0, false}, {"Jaccard : Second arg empty", args{"abcde", "", Jaccard}, 0.0, false}, {"Jaccard : Same args", args{"abcde", "abcde", Jaccard}, 1.0, false}, {"Jaccard : No characters match", args{"abcd", "effgghh", Jaccard}, 0.0, false}, - {"Jaccard : CRATE/TRACE", args{"CRATE", "TRACE", Jaccard}, 0.0, false}, - {"Jaccard : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Jaccard}, 0.0, false}, - {"Jaccard : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Jaccard}, 0.0, false}, - {"Jaccard : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Jaccard}, 0.0, false}, - {"Jaccard : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Jaccard}, 1.0 / 9.0, false}, - {"Jaccard : Sentence 3", args{"love horror movies", "Lights out horror movie", Jaccard}, 1.0 / 6.0, false}, - {"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.0, false}, - {"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 2.0 / 3.0, false}, + {"Jaccard : CRATE/TRACE", args{"CRATE", "TRACE", Jaccard}, 0.14285715, false}, + {"Jaccard : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Jaccard}, 0.25, false}, + {"Jaccard : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Jaccard}, 0.22222222, false}, + {"Jaccard : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Jaccard}, 0.04761905, false}, + {"Jaccard : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Jaccard}, 0.3548387, false}, + {"Jaccard : Sentence 3", args{"love horror movies", "Lights out horror movie", Jaccard}, 0.44, false}, + {"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.61538464, 
false}, + {"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 0.8, false}, // Illegal argument error {"Undefined integer value for method", args{"abc", "abcde", 42}, 0.0, true},