Skip to content

Commit

Permalink
feat: add k-gram shingle to Jaccard/Cosine sim (#11)
Browse files Browse the repository at this point in the history
* feat: add Shingle function

* test: update unit tests for Cosine/Jaccard with shingle

Co-authored-by: Shriprajwal K <the_daemon_lord@Shriprajwals-MacBook-Air.local>
Co-authored-by: hbollon <hugo.bollon@gmail.com>
  • Loading branch information
3 people committed Nov 21, 2021
1 parent 4f1acf0 commit 24d61a6
Show file tree
Hide file tree
Showing 8 changed files with 177 additions and 30 deletions.
22 changes: 17 additions & 5 deletions cosine.go
Expand Up @@ -8,12 +8,24 @@ import (
)

// CosineSimilarity uses the cosine algorithm to return a similarity index between string vectors.
// Takes two strings as parameters and return an index.
// This algorithm is only effective between sentences and not unique words.
func CosineSimilarity(str1, str2 string) float32 {
// Takes two strings and a split length, which defines the k-gram shingle length
// (if zero, the strings are split on whitespace), and returns an index.
func CosineSimilarity(str1, str2 string, splitLength int) float32 {
if str1 == "" || str2 == "" {
return 0
}

// Split string before rune conversion for cosine calculation
splittedStr1 := strings.Split(str1, " ")
splittedStr2 := strings.Split(str2, " ")
// If splitLength == 0 then split on whitespaces
// Else use shingle algorithm
var splittedStr1, splittedStr2 []string
if splitLength == 0 {
splittedStr1 = strings.Split(str1, " ")
splittedStr2 = strings.Split(str2, " ")
} else {
splittedStr1 = ShingleSlice(str1, splitLength)
splittedStr2 = ShingleSlice(str2, splitLength)
}

// Conversion of the split strings into rune arrays
runeStr1 := make([][]rune, len(splittedStr1))
Expand Down
27 changes: 26 additions & 1 deletion cosine_test.go
Expand Up @@ -36,13 +36,38 @@ func TestCosineSimilarity(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := CosineSimilarity(tt.args.str1, tt.args.str2); got != tt.want {
if got := CosineSimilarity(tt.args.str1, tt.args.str2, 0); got != tt.want {
t.Errorf("CosineSimilarity() = %v, want %v", got, tt.want)
}
})
}
}

// TestCosineShingleSimilarity verifies CosineSimilarity when both inputs are
// split into shingles of length 2 instead of whitespace-separated words.
func TestCosineShingleSimilarity(t *testing.T) {
	cases := []struct {
		name string
		str1 string
		str2 string
		want float32
	}{
		{name: "Cosine shingle sim 1", str1: "Radiohead", str2: "Carly Rae Jepsen", want: 0.09759001},
		{name: "Cosine shingle sim 2", str1: "I love horror movies", str2: "Lights out is a horror movie", want: 0.5335784},
		{name: "Cosine shingle sim 3", str1: "love horror movies", str2: "Lights out horror movie", want: 0.61977977},
		{name: "Cosine shingle sim 4", str1: "私の名前はジョンです", str2: "私の名前はジョン・ドゥです", want: 0.76980036},
		{name: "Cosine shingle sim 5", str1: "🙂😄🙂😄 😄🙂😄", str2: "🙂😄🙂😄 😄🙂😄 🙂😄🙂", want: 0.8944272},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := CosineSimilarity(tc.str1, tc.str2, 2)
			if got == tc.want {
				return
			}
			t.Errorf("CosineSimilarity() with shingle 2 = %v, want %v", got, tc.want)
		})
	}
}

func Test_union(t *testing.T) {
type args struct {
a []string
Expand Down
26 changes: 19 additions & 7 deletions jaccard.go
Expand Up @@ -5,12 +5,25 @@ import (
)

// JaccardSimilarity computes the Jaccard similarity coefficient between two strings.
// Takes two strings as parameters and return an index.
// This algorithm is only effective between sentences and not unique words.
func JaccardSimilarity(str1, str2 string) float32 {
// Split string before rune conversion for cosine calculation
splittedStr1 := strings.Split(str1, " ")
splittedStr2 := strings.Split(str2, " ")
// Takes two strings and a split length, which defines the k-gram shingle length
// (if zero, the strings are split on whitespace), and returns an index.
func JaccardSimilarity(str1, str2 string, splitLength int) float32 {
if str1 == "" || str2 == "" {
return 0
}

// Split string before rune conversion for jaccard calculation
// If splitLength == 0 then split on whitespaces
// Else use shingle algorithm
var splittedStr1, splittedStr2 []string
if splitLength == 0 {
splittedStr1 = strings.Split(str1, " ")
splittedStr2 = strings.Split(str2, " ")
} else {
splittedStr1 = ShingleSlice(str1, splitLength)
splittedStr2 = ShingleSlice(str2, splitLength)
}

// Conversion of splitted string into rune array
runeStr1 := make([][]rune, len(splittedStr1))
for i, str := range splittedStr1 {
Expand All @@ -23,7 +36,6 @@ func JaccardSimilarity(str1, str2 string) float32 {

// Create union keywords slice between input strings
unionStr := union(splittedStr1, splittedStr2)

jacc := float32(len(runeStr1) + len(runeStr2) - len(unionStr))

return jacc / float32(len(unionStr))
Expand Down
28 changes: 27 additions & 1 deletion jaccard_test.go
Expand Up @@ -23,9 +23,35 @@ func TestJaccardSimilarity(t *testing.T) {

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := JaccardSimilarity(tt.args.str1, tt.args.str2); got != tt.want {
if got := JaccardSimilarity(tt.args.str1, tt.args.str2, 0); got != tt.want {
t.Errorf("JaccardSimilarity() = %v, want %v", got, tt.want)
}
})
}
}

// TestJaccardShingleSimilarity verifies JaccardSimilarity when both inputs are
// split into shingles of length 2 instead of whitespace-separated words.
func TestJaccardShingleSimilarity(t *testing.T) {
	cases := []struct {
		name string
		str1 string
		str2 string
		want float32
	}{
		{name: "Jaccard shingle sim 1", str1: "Radiohead", str2: "Carly Rae Jepsen", want: 0.04761905},
		{name: "Jaccard shingle sim 2", str1: "I love horror movies", str2: "Lights out is a horror movie", want: 0.3548387},
		{name: "Jaccard shingle sim 3", str1: "love horror movies", str2: "Lights out horror movie", want: 0.44},
		{name: "Jaccard shingle sim 4", str1: "私の名前はジョンです", str2: "私の名前はジョン・ドゥです", want: 0.61538464},
		{name: "Jaccard shingle sim 5", str1: "🙂😄🙂😄 😄🙂😄", str2: "🙂😄🙂😄 😄🙂😄 🙂😄🙂", want: 0.8},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := JaccardSimilarity(tc.str1, tc.str2, 2)
			if got == tc.want {
				return
			}
			t.Errorf("JaccardSimilarity() with shingle 2 = %v, want %v", got, tc.want)
		})
	}
}
34 changes: 34 additions & 0 deletions shingle.go
@@ -0,0 +1,34 @@
package edlib

// Shingle computes the k-grams (shingles) of s, where k is counted in runes
// rather than bytes so multi-byte characters are never split.
// It returns a map from each distinct k-rune substring to its number of
// occurrences in s.
// The returned map is empty (but never nil) when s is empty, when k is not
// positive, or when k is larger than the number of runes in s.
func Shingle(s string, k int) map[string]int {
	m := make(map[string]int)
	// A non-positive k has no meaningful k-gram, and a negative k would make
	// the slice expression below panic, so both are rejected up front.
	if s == "" || k <= 0 {
		return m
	}

	runeS := []rune(s)
	// Slide a window of k runes over the string, counting every window seen.
	for i := 0; i+k <= len(runeS); i++ {
		m[string(runeS[i:i+k])]++
	}
	return m
}

// ShingleSlice computes the distinct k-grams (shingles) of s, where k is
// counted in runes rather than bytes so multi-byte characters are never split.
// It returns each distinct k-rune substring exactly once, ordered by first
// occurrence in s, so the result is deterministic across calls.
// The returned slice is nil (length zero) when s is empty, when k is not
// positive, or when k is larger than the number of runes in s.
func ShingleSlice(s string, k int) []string {
	// A non-positive k has no meaningful k-gram, and a negative k would make
	// the slice expression below panic, so both are rejected up front.
	if s == "" || k <= 0 {
		return nil
	}

	runeS := []rune(s)
	windows := len(runeS) - k + 1
	if windows <= 0 {
		return nil
	}

	out := make([]string, 0, windows)
	seen := make(map[string]struct{}, windows)
	for i := 0; i < windows; i++ {
		gram := string(runeS[i : i+k])
		// Keep only the first occurrence so the output is a set in stable order.
		if _, dup := seen[gram]; dup {
			continue
		}
		seen[gram] = struct{}{}
		out = append(out, gram)
	}
	return out
}
37 changes: 37 additions & 0 deletions shingle_test.go
@@ -0,0 +1,37 @@
package edlib

import (
"reflect"
"testing"
)

// TestShingle verifies the shingle counts produced by Shingle for ASCII,
// multi-byte and emoji inputs, plus the empty-string and k == 0 cases.
func TestShingle(t *testing.T) {
	cases := []struct {
		name string
		str  string
		k    int
		want map[string]int
	}{
		{name: "shingle 1", str: "Radiohead", k: 2, want: map[string]int{"Ra": 1, "ad": 2, "di": 1, "ea": 1, "he": 1, "io": 1, "oh": 1}},
		{name: "shingle 1-1", str: "Radiohead", k: 3, want: map[string]int{"Rad": 1, "adi": 1, "dio": 1, "ead": 1, "hea": 1, "ioh": 1, "ohe": 1}},
		{name: "shingle 2", str: "I love horror movies", k: 2, want: map[string]int{" h": 1, " l": 1, " m": 1, "I ": 1, "e ": 1, "es": 1, "ho": 1, "ie": 1, "lo": 1, "mo": 1, "or": 2, "ov": 2, "r ": 1, "ro": 1, "rr": 1, "ve": 1, "vi": 1}},
		{name: "shingle 3", str: "私の名前はジョンです", k: 2, want: map[string]int{"です": 1, "の名": 1, "はジ": 1, "ジョ": 1, "ョン": 1, "ンで": 1, "前は": 1, "名前": 1, "私の": 1}},
		{name: "shingle 4", str: "🙂😄🙂😄 😄🙂😄", k: 2, want: map[string]int{" 😄": 1, "😄 ": 1, "😄🙂": 2, "🙂😄": 3}},
		{name: "shingle 5", str: "", k: 100, want: map[string]int{}},
		{name: "shingle 6", str: "hello", k: 0, want: map[string]int{}},
		{name: "shingle 7", str: "四畳半神話大系", k: 7, want: map[string]int{"四畳半神話大系": 1}},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := Shingle(tc.str, tc.k)
			if reflect.DeepEqual(got, tc.want) {
				return
			}
			t.Errorf("Shingle() = %v, want %v", got, tc.want)
		})
	}
}
5 changes: 3 additions & 2 deletions string-analysis.go
Expand Up @@ -24,6 +24,7 @@ const (

// StringsSimilarity returns a similarity index [0..1] between two strings based on the edit distance algorithm given as parameter.
// Use the defined Algorithm type.
// When called through this function, the Cosine and Jaccard algorithms use the shingle split method with a length of 2.
func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error) {
switch algo {
case Levenshtein:
Expand All @@ -45,9 +46,9 @@ func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error
case JaroWinkler:
return JaroWinklerSimilarity(str1, str2), nil
case Cosine:
return CosineSimilarity(str1, str2), nil
return CosineSimilarity(str1, str2, 2), nil
case Jaccard:
return JaccardSimilarity(str1, str2), nil
return JaccardSimilarity(str1, str2, 2), nil
default:
return 0.0, errors.New("Illegal argument for algorithm method")
}
Expand Down
28 changes: 14 additions & 14 deletions string-analysis_test.go
Expand Up @@ -132,26 +132,26 @@ func TestStringsSimilarity(t *testing.T) {
{"Cosine : Second arg empty", args{"abcde", "", Cosine}, 0.0, false},
{"Cosine : Same args", args{"abcde", "abcde", Cosine}, 1.0, false},
{"Cosine : No characters match", args{"abcd", "effgghh", Cosine}, 0.0, false},
{"Cosine : CRATE/TRACE", args{"CRATE", "TRACE", Cosine}, 0.0, false},
{"Cosine : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Cosine}, 0.0, false},
{"Cosine : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Cosine}, 0.0, false},
{"Cosine : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Cosine}, 0.0, false},
{"Cosine : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Cosine}, 0.20412414, false},
{"Cosine : Sentence 3", args{"love horror movies", "Lights out horror movie", Cosine}, 0.28867513, false},
{"Cosine : CRATE/TRACE", args{"CRATE", "TRACE", Cosine}, 0.25, false},
{"Cosine : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Cosine}, 0.4, false},
{"Cosine : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Cosine}, 0.3779645, false},
{"Cosine : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Cosine}, 0.09759001, false},
{"Cosine : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Cosine}, 0.5335784, false},
{"Cosine : Sentence 3", args{"love horror movies", "Lights out horror movie", Cosine}, 0.61977977, false},

// Jaccard method
{"Jaccard : First arg empty", args{"", "abcde", Jaccard}, 0.0, false},
{"Jaccard : Second arg empty", args{"abcde", "", Jaccard}, 0.0, false},
{"Jaccard : Same args", args{"abcde", "abcde", Jaccard}, 1.0, false},
{"Jaccard : No characters match", args{"abcd", "effgghh", Jaccard}, 0.0, false},
{"Jaccard : CRATE/TRACE", args{"CRATE", "TRACE", Jaccard}, 0.0, false},
{"Jaccard : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Jaccard}, 0.0, false},
{"Jaccard : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Jaccard}, 0.0, false},
{"Jaccard : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Jaccard}, 0.0, false},
{"Jaccard : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Jaccard}, 1.0 / 9.0, false},
{"Jaccard : Sentence 3", args{"love horror movies", "Lights out horror movie", Jaccard}, 1.0 / 6.0, false},
{"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.0, false},
{"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 2.0 / 3.0, false},
{"Jaccard : CRATE/TRACE", args{"CRATE", "TRACE", Jaccard}, 0.14285715, false},
{"Jaccard : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Jaccard}, 0.25, false},
{"Jaccard : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Jaccard}, 0.22222222, false},
{"Jaccard : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Jaccard}, 0.04761905, false},
{"Jaccard : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Jaccard}, 0.3548387, false},
{"Jaccard : Sentence 3", args{"love horror movies", "Lights out horror movie", Jaccard}, 0.44, false},
{"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.61538464, false},
{"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 0.8, false},

// Illegal argument error
{"Undefined integer value for method", args{"abc", "abcde", 42}, 0.0, true},
Expand Down

0 comments on commit 24d61a6

Please sign in to comment.