Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Shingle and ShingleSlidingWindow #11

Merged
merged 5 commits into from Nov 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
22 changes: 17 additions & 5 deletions cosine.go
Expand Up @@ -8,12 +8,24 @@ import (
)

// CosineSimilarity uses the cosine algorithm to return a similarity index between string vectors
// Takes two strings as parameters and return an index.
// This algorithm is only effective between sentences and not unique words.
func CosineSimilarity(str1, str2 string) float32 {
// Takes two strings as parameters, a split length which define the k-gram single length
// (if zero split string on whitespaces) and return an index.
func CosineSimilarity(str1, str2 string, splitLength int) float32 {
if str1 == "" || str2 == "" {
return 0
}

// Split string before rune conversion for cosine calculation
splittedStr1 := strings.Split(str1, " ")
splittedStr2 := strings.Split(str2, " ")
// If splitLength == 0 then split on whitespaces
// Else use shingle algorithm
var splittedStr1, splittedStr2 []string
if splitLength == 0 {
splittedStr1 = strings.Split(str1, " ")
splittedStr2 = strings.Split(str2, " ")
} else {
splittedStr1 = ShingleSlice(str1, splitLength)
splittedStr2 = ShingleSlice(str2, splitLength)
}

// Conversion of splitted string into rune array
runeStr1 := make([][]rune, len(splittedStr1))
Expand Down
27 changes: 26 additions & 1 deletion cosine_test.go
Expand Up @@ -36,13 +36,38 @@ func TestCosineSimilarity(t *testing.T) {
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := CosineSimilarity(tt.args.str1, tt.args.str2); got != tt.want {
if got := CosineSimilarity(tt.args.str1, tt.args.str2, 0); got != tt.want {
t.Errorf("CosineSimilarity() = %v, want %v", got, tt.want)
}
})
}
}

// TestCosineShingleSimilarity checks CosineSimilarity when the inputs are
// split into shingles of length 2 instead of whitespace-separated words.
func TestCosineShingleSimilarity(t *testing.T) {
	// Shingle length passed as the splitLength argument for every case.
	const shingleLen = 2

	cases := []struct {
		name string
		a, b string
		want float32
	}{
		{name: "Cosine shingle sim 1", a: "Radiohead", b: "Carly Rae Jepsen", want: 0.09759001},
		{name: "Cosine shingle sim 2", a: "I love horror movies", b: "Lights out is a horror movie", want: 0.5335784},
		{name: "Cosine shingle sim 3", a: "love horror movies", b: "Lights out horror movie", want: 0.61977977},
		{name: "Cosine shingle sim 4", a: "私の名前はジョンです", b: "私の名前はジョン・ドゥです", want: 0.76980036},
		{name: "Cosine shingle sim 5", a: "🙂😄🙂😄 😄🙂😄", b: "🙂😄🙂😄 😄🙂😄 🙂😄🙂", want: 0.8944272},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := CosineSimilarity(tc.a, tc.b, shingleLen)
			if got == tc.want {
				return
			}
			t.Errorf("CosineSimilarity() with shingle 2 = %v, want %v", got, tc.want)
		})
	}
}

func Test_union(t *testing.T) {
type args struct {
a []string
Expand Down
26 changes: 19 additions & 7 deletions jaccard.go
Expand Up @@ -5,12 +5,25 @@ import (
)

// JaccardSimilarity computes the Jaccard similarity coefficient between two strings
// Takes two strings as parameters and return an index.
// This algorithm is only effective between sentences and not unique words.
func JaccardSimilarity(str1, str2 string) float32 {
// Split string before rune conversion for cosine calculation
splittedStr1 := strings.Split(str1, " ")
splittedStr2 := strings.Split(str2, " ")
// Takes two strings as parameters, a split length which define the k-gram single length
// (if zero split string on whitespaces) and return an index.
func JaccardSimilarity(str1, str2 string, splitLength int) float32 {
if str1 == "" || str2 == "" {
return 0
}

// Split string before rune conversion for jaccard calculation
// If splitLength == 0 then split on whitespaces
// Else use shingle algorithm
var splittedStr1, splittedStr2 []string
if splitLength == 0 {
splittedStr1 = strings.Split(str1, " ")
splittedStr2 = strings.Split(str2, " ")
} else {
splittedStr1 = ShingleSlice(str1, splitLength)
splittedStr2 = ShingleSlice(str2, splitLength)
}

// Conversion of splitted string into rune array
runeStr1 := make([][]rune, len(splittedStr1))
for i, str := range splittedStr1 {
Expand All @@ -23,7 +36,6 @@ func JaccardSimilarity(str1, str2 string) float32 {

// Create union keywords slice between input strings
unionStr := union(splittedStr1, splittedStr2)

jacc := float32(len(runeStr1) + len(runeStr2) - len(unionStr))

return jacc / float32(len(unionStr))
Expand Down
28 changes: 27 additions & 1 deletion jaccard_test.go
Expand Up @@ -23,9 +23,35 @@ func TestJaccardSimilarity(t *testing.T) {

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := JaccardSimilarity(tt.args.str1, tt.args.str2); got != tt.want {
if got := JaccardSimilarity(tt.args.str1, tt.args.str2, 0); got != tt.want {
t.Errorf("JaccardSimilarity() = %v, want %v", got, tt.want)
}
})
}
}

// TestJaccardShingleSimilarity checks JaccardSimilarity when the inputs are
// split into shingles of length 2 instead of whitespace-separated words.
func TestJaccardShingleSimilarity(t *testing.T) {
	// Shingle length passed as the splitLength argument for every case.
	const shingleLen = 2

	cases := []struct {
		name string
		a, b string
		want float32
	}{
		{name: "Jaccard shingle sim 1", a: "Radiohead", b: "Carly Rae Jepsen", want: 0.04761905},
		{name: "Jaccard shingle sim 2", a: "I love horror movies", b: "Lights out is a horror movie", want: 0.3548387},
		{name: "Jaccard shingle sim 3", a: "love horror movies", b: "Lights out horror movie", want: 0.44},
		{name: "Jaccard shingle sim 4", a: "私の名前はジョンです", b: "私の名前はジョン・ドゥです", want: 0.61538464},
		{name: "Jaccard shingle sim 5", a: "🙂😄🙂😄 😄🙂😄", b: "🙂😄🙂😄 😄🙂😄 🙂😄🙂", want: 0.8},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := JaccardSimilarity(tc.a, tc.b, shingleLen)
			if got == tc.want {
				return
			}
			t.Errorf("JaccardSimilarity() with shingle 2 = %v, want %v", got, tc.want)
		})
	}
}
34 changes: 34 additions & 0 deletions shingle.go
@@ -0,0 +1,34 @@
package edlib

// Shingle computes the k-grams (shingles) of a string for a given k.
// The string is converted to runes first, so a multi-byte character counts
// as a single position in a k-gram.
// It returns a map from each distinct k-gram to its number of occurrences.
// It returns an empty, non-nil map if the string is empty, if k is zero or
// negative, or if k is greater than the number of runes in the string.
func Shingle(s string, k int) map[string]int {
	m := make(map[string]int)
	// A negative k would make the slice expression below panic, so it is
	// rejected together with zero.
	if s == "" || k <= 0 {
		return m
	}

	runeS := []rune(s)
	// Slide a window of k runes over the string; the condition is false
	// from the start when k exceeds the rune count.
	for i := 0; i+k <= len(runeS); i++ {
		m[string(runeS[i:i+k])]++
	}
	return m
}

// ShingleSlice computes the distinct k-grams (shingles) of a string for a
// given k. The string is converted to runes first, so a multi-byte character
// counts as a single position in a k-gram.
// It returns each distinct k-gram exactly once, in order of first occurrence
// in the string, so the result is deterministic.
// It returns an empty (nil) slice if the string is empty, if k is zero or
// negative, or if k is greater than the number of runes in the string.
func ShingleSlice(s string, k int) []string {
	// A negative k would make the slice expression below panic, so it is
	// rejected together with zero.
	if s == "" || k <= 0 {
		return nil
	}

	runeS := []rune(s)
	if k > len(runeS) {
		return nil
	}

	// seen records the k-grams already emitted so duplicates are skipped
	// while the first-occurrence order is preserved.
	seen := make(map[string]struct{})
	var out []string
	for i := 0; i+k <= len(runeS); i++ {
		gram := string(runeS[i : i+k])
		if _, dup := seen[gram]; dup {
			continue
		}
		seen[gram] = struct{}{}
		out = append(out, gram)
	}
	return out
}
37 changes: 37 additions & 0 deletions shingle_test.go
@@ -0,0 +1,37 @@
package edlib

import (
"reflect"
"testing"
)

// TestShingle checks the k-gram occurrence map returned by Shingle for ASCII,
// Japanese and emoji inputs, plus the empty-string, zero-k and
// full-length-k cases.
func TestShingle(t *testing.T) {
	cases := []struct {
		name string
		str  string
		k    int
		want map[string]int
	}{
		{name: "shingle 1", str: "Radiohead", k: 2, want: map[string]int{"Ra": 1, "ad": 2, "di": 1, "ea": 1, "he": 1, "io": 1, "oh": 1}},
		{name: "shingle 1-1", str: "Radiohead", k: 3, want: map[string]int{"Rad": 1, "adi": 1, "dio": 1, "ead": 1, "hea": 1, "ioh": 1, "ohe": 1}},
		{name: "shingle 2", str: "I love horror movies", k: 2, want: map[string]int{" h": 1, " l": 1, " m": 1, "I ": 1, "e ": 1, "es": 1, "ho": 1, "ie": 1, "lo": 1, "mo": 1, "or": 2, "ov": 2, "r ": 1, "ro": 1, "rr": 1, "ve": 1, "vi": 1}},
		{name: "shingle 3", str: "私の名前はジョンです", k: 2, want: map[string]int{"です": 1, "の名": 1, "はジ": 1, "ジョ": 1, "ョン": 1, "ンで": 1, "前は": 1, "名前": 1, "私の": 1}},
		{name: "shingle 4", str: "🙂😄🙂😄 😄🙂😄", k: 2, want: map[string]int{" 😄": 1, "😄 ": 1, "😄🙂": 2, "🙂😄": 3}},
		{name: "shingle 5", str: "", k: 100, want: map[string]int{}},
		{name: "shingle 6", str: "hello", k: 0, want: map[string]int{}},
		{name: "shingle 7", str: "四畳半神話大系", k: 7, want: map[string]int{"四畳半神話大系": 1}},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := Shingle(tc.str, tc.k)
			if reflect.DeepEqual(got, tc.want) {
				return
			}
			t.Errorf("Shingle() = %v, want %v", got, tc.want)
		})
	}
}
5 changes: 3 additions & 2 deletions string-analysis.go
Expand Up @@ -24,6 +24,7 @@ const (

// StringsSimilarity return a similarity index [0..1] between two strings based on given edit distance algorithm in parameter.
// Use defined Algorithm type.
// Through this function, Cosine and Jaccard algorithms are used with Shingle split method with a length of 2.
func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error) {
switch algo {
case Levenshtein:
Expand All @@ -45,9 +46,9 @@ func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error
case JaroWinkler:
return JaroWinklerSimilarity(str1, str2), nil
case Cosine:
return CosineSimilarity(str1, str2), nil
return CosineSimilarity(str1, str2, 2), nil
case Jaccard:
return JaccardSimilarity(str1, str2), nil
return JaccardSimilarity(str1, str2, 2), nil
default:
return 0.0, errors.New("Illegal argument for algorithm method")
}
Expand Down
28 changes: 14 additions & 14 deletions string-analysis_test.go
Expand Up @@ -132,26 +132,26 @@ func TestStringsSimilarity(t *testing.T) {
{"Cosine : Second arg empty", args{"abcde", "", Cosine}, 0.0, false},
{"Cosine : Same args", args{"abcde", "abcde", Cosine}, 1.0, false},
{"Cosine : No characters match", args{"abcd", "effgghh", Cosine}, 0.0, false},
{"Cosine : CRATE/TRACE", args{"CRATE", "TRACE", Cosine}, 0.0, false},
{"Cosine : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Cosine}, 0.0, false},
{"Cosine : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Cosine}, 0.0, false},
{"Cosine : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Cosine}, 0.0, false},
{"Cosine : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Cosine}, 0.20412414, false},
{"Cosine : Sentence 3", args{"love horror movies", "Lights out horror movie", Cosine}, 0.28867513, false},
{"Cosine : CRATE/TRACE", args{"CRATE", "TRACE", Cosine}, 0.25, false},
{"Cosine : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Cosine}, 0.4, false},
{"Cosine : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Cosine}, 0.3779645, false},
{"Cosine : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Cosine}, 0.09759001, false},
{"Cosine : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Cosine}, 0.5335784, false},
{"Cosine : Sentence 3", args{"love horror movies", "Lights out horror movie", Cosine}, 0.61977977, false},

// Jaccard method
{"Jaccard : First arg empty", args{"", "abcde", Jaccard}, 0.0, false},
{"Jaccard : Second arg empty", args{"abcde", "", Jaccard}, 0.0, false},
{"Jaccard : Same args", args{"abcde", "abcde", Jaccard}, 1.0, false},
{"Jaccard : No characters match", args{"abcd", "effgghh", Jaccard}, 0.0, false},
{"Jaccard : CRATE/TRACE", args{"CRATE", "TRACE", Jaccard}, 0.0, false},
{"Jaccard : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Jaccard}, 0.0, false},
{"Jaccard : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Jaccard}, 0.0, false},
{"Jaccard : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Jaccard}, 0.0, false},
{"Jaccard : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Jaccard}, 1.0 / 9.0, false},
{"Jaccard : Sentence 3", args{"love horror movies", "Lights out horror movie", Jaccard}, 1.0 / 6.0, false},
{"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.0, false},
{"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 2.0 / 3.0, false},
{"Jaccard : CRATE/TRACE", args{"CRATE", "TRACE", Jaccard}, 0.14285715, false},
{"Jaccard : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Jaccard}, 0.25, false},
{"Jaccard : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Jaccard}, 0.22222222, false},
{"Jaccard : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Jaccard}, 0.04761905, false},
{"Jaccard : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Jaccard}, 0.3548387, false},
{"Jaccard : Sentence 3", args{"love horror movies", "Lights out horror movie", Jaccard}, 0.44, false},
{"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.61538464, false},
{"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 0.8, false},

// Illegal argument error
{"Undefined integer value for method", args{"abc", "abcde", 42}, 0.0, true},
Expand Down