Skip to content

Commit

Permalink
Add qgram and sorensen-dice (#14)
Browse files Browse the repository at this point in the history
* feat: add qgram and sorensen-dice

* fix: Change function names and add Qgram and SorensenDice to string-analysis

* feat(qgram): add similarity function to return an index
test: fix test cases for QGram

Co-authored-by: hbollon <hugo.bollon@gmail.com>
  • Loading branch information
ShriprajwalK and hbollon committed Jan 31, 2022
1 parent 34fcab0 commit 5f65401
Show file tree
Hide file tree
Showing 7 changed files with 202 additions and 14 deletions.
29 changes: 15 additions & 14 deletions README.md
Expand Up @@ -44,20 +44,21 @@ Designed to be fully compatible with Unicode characters!<br>
This library is 100% test covered 😁

## Features
- [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance)
- [LCS](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) (Longest common subsequence) with edit distance, backtrack and diff functions ✨
- [Hamming](https://en.wikipedia.org/wiki/Hamming_distance)
- [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance), with following variants :
- OSA (Optimal string alignment) ✨
- Adjacent transpositions ✨
- [Jaro & Jaro-Winkler](https://fr.wikipedia.org/wiki/Distance_de_Jaro-Winkler) similarity algorithms ✨
- [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) algorithm to compare strings ✨
- [Jaccard Index](https://en.wikipedia.org/wiki/Jaccard_index)

- Computed similarity percentage functions based on all available edit distance algorithms in this lib ✨
- Fuzzy search functions based on edit distance with unique or multiples strings output ✨
- Unicode compatibility ! 🥳
- And many more to come !

- [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance)
- [LCS](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) (Longest common subsequence) with edit distance, backtrack and diff functions
- [Hamming](https://en.wikipedia.org/wiki/Hamming_distance)
- [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance), with following variants:
- OSA (Optimal string alignment)
- Adjacent transpositions
- [Jaro & Jaro-Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) similarity algorithms
- [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
- [Jaccard Index](https://en.wikipedia.org/wiki/Jaccard_index)
- [QGram](https://en.wikipedia.org/wiki/N-gram)
- [Sorensen-Dice](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
- Computed similarity percentage functions based on all available edit distance algorithms in this lib
- Fuzzy search functions based on edit distance, returning either a single string or multiple strings
- Unicode compatibility 🥳

## Benchmarks
You can check an interactive Google chart with a few benchmark cases for all similarity algorithms in this library, run through the **StringsSimilarity** function, [here](http://benchgraph.codingberg.com/q5)
Expand Down
56 changes: 56 additions & 0 deletions qgram.go
@@ -0,0 +1,56 @@
package edlib

import (
"math"
)

// QgramDistance computes the q-gram distance between two strings.
// Takes two strings as parameters, a split length which defines the k-gram shingle length.
//
// Both strings are split with Shingle; the distance is the sum, over every
// shingle present in either result, of the absolute difference between the
// values the two shingle maps hold for it.
func QgramDistance(str1, str2 string, splitLength int) int {
	profile1 := Shingle(str1, splitLength)
	profile2 := Shingle(str2, splitLength)

	distance := 0

	// Shingles found in str1: a shingle absent from profile2 reads as zero.
	for gram, count := range profile1 {
		distance += int(math.Abs(float64(count - profile2[gram])))
	}

	// Shingles found only in str2: their whole value counts as difference.
	for gram, count := range profile2 {
		if _, shared := profile1[gram]; shared {
			continue
		}
		distance += int(math.Abs(float64(0 - count)))
	}

	return distance
}

// QgramDistanceCustomNgram computes the q-gram distance between two custom n-gram maps.
// Takes two n-gram maps as parameters.
//
// The distance is the sum, over every key present in either map, of the
// absolute difference between the two maps' values for that key. A key that
// is missing from one map contributes the other map's value in full.
func QgramDistanceCustomNgram(splittedStr1, splittedStr2 map[string]int) int {
	// Collect every n-gram that appears in at least one of the two maps.
	grams := make(map[string]struct{})
	for gram := range splittedStr1 {
		grams[gram] = struct{}{}
	}
	for gram := range splittedStr2 {
		grams[gram] = struct{}{}
	}

	var distance int
	for gram := range grams {
		// Indexing a map with an absent key yields 0, so no presence check is needed.
		delta := float64(splittedStr1[gram] - splittedStr2[gram])
		distance += int(math.Abs(delta))
	}

	return distance
}

// QgramSimilarity computes a similarity index (between 0 and 1) between two strings from a Qgram distance.
// Takes two strings as parameters, a split length which defines the k-gram shingle length.
//
// The index is 1 - distance/(n1+n2), where distance is the result of
// QgramDistanceCustomNgram on the two shingle maps and n1, n2 are the numbers
// of entries in those maps.
//
// Returns 0 when neither string produces any shingle, because the ratio would
// otherwise be 0/0 (NaN). Negative results are clamped to 0 so the returned
// value always stays within [0, 1].
func QgramSimilarity(str1, str2 string, splitLength int) float32 {
	splittedStr1 := Shingle(str1, splitLength)
	splittedStr2 := Shingle(str2, splitLength)

	total := len(splittedStr1) + len(splittedStr2)
	if total == 0 {
		// Nothing to compare: avoid the 0/0 division that would yield NaN.
		return 0
	}

	distance := float32(QgramDistanceCustomNgram(splittedStr1, splittedStr2))
	similarity := 1 - (distance / float32(total))

	// The distance sums differences of map values while the denominator counts
	// map entries, so the ratio can exceed 1 when a map value is greater than 1.
	if similarity < 0 {
		return 0
	}
	return similarity
}
35 changes: 35 additions & 0 deletions qgram_test.go
@@ -0,0 +1,35 @@
package edlib

import (
"testing"
)

// TestQgramDistance runs QgramDistance over a table of string pairs (ASCII,
// Japanese, emoji and empty inputs) and compares each result with the
// expected distance.
func TestQgramDistance(t *testing.T) {
	type input struct {
		str1        string
		str2        string
		splitLength int
	}
	cases := []struct {
		name string
		in   input
		want int
	}{
		{name: "Qgram sim 1", in: input{"Radiohead", "Radiohead", 2}, want: 0},
		{name: "Qgram sim 2", in: input{"ABCD", "ABCE", 2}, want: 2},
		{name: "Qgram sim 3", in: input{"Radiohead", "Carly Rae Jepsen", 2}, want: 21},
		{name: "Qgram sim 4", in: input{"I love horror movies", "Lights out is a horror movie", 2}, want: 22},
		{name: "Qgram sim 5", in: input{"love horror movies", "Lights out horror movie", 2}, want: 15},
		{name: "Qgram sim 6", in: input{"私の名前はジョンです", "私の名前はジョン・ドゥです", 2}, want: 5},
		{name: "Qgram sim 7", in: input{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", 2}, want: 4},
		{name: "Qgram sim 8", in: input{"", "", 2}, want: 0},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := QgramDistance(tc.in.str1, tc.in.str2, tc.in.splitLength)
			if got == tc.want {
				return
			}
			t.Errorf("QgramDistance() = %v, want %v", got, tc.want)
		})
	}
}
19 changes: 19 additions & 0 deletions sorensen-dice.go
@@ -0,0 +1,19 @@
package edlib

// SorensenDiceCoefficient computes the Sorensen-Dice coefficient between two strings.
// Takes two strings as parameters, a split length which defines the k-gram shingle length.
//
// The coefficient is 2*|A∩B| / (|A|+|B|), where A and B are the key sets of
// the shingle maps returned by Shingle for each string.
//
// Returns 0 when both strings are empty, and also when neither string
// produces any shingle: in that case the ratio would be 0/0 (NaN).
func SorensenDiceCoefficient(str1, str2 string, splitLength int) float32 {
	if str1 == "" && str2 == "" {
		return 0
	}
	shingle1 := Shingle(str1, splitLength)
	shingle2 := Shingle(str2, splitLength)

	total := len(shingle1) + len(shingle2)
	if total == 0 {
		// No shingle on either side: guard the division instead of returning NaN.
		return 0
	}

	// Count the shingles of str1 that also occur in str2.
	intersection := 0
	for gram := range shingle1 {
		if _, ok := shingle2[gram]; ok {
			intersection++
		}
	}
	return 2.0 * float32(intersection) / float32(total)
}
35 changes: 35 additions & 0 deletions sorensen-dice_test.go
@@ -0,0 +1,35 @@
package edlib

import (
"testing"
)

// TestSorensenDiceCoefficient runs SorensenDiceCoefficient over a table of
// string pairs (ASCII, Japanese, emoji and empty inputs) and compares each
// result with the expected coefficient using exact float32 equality.
func TestSorensenDiceCoefficient(t *testing.T) {
	type input struct {
		str1        string
		str2        string
		splitLength int
	}
	cases := []struct {
		name string
		in   input
		want float32
	}{
		{name: "SorensenDiceCoefficient 1", in: input{"night", "nacht", 2}, want: 0.25},
		{name: "SorensenDiceCoefficient 2", in: input{"Radiohead", "Radiohead", 2}, want: 1.0},
		{name: "SorensenDiceCoefficient 3", in: input{"", "", 2}, want: 0.0},
		{name: "SorensenDiceCoefficient 4", in: input{"Radiohead", "Carly Rae Jepsen", 2}, want: 0.09090909},
		{name: "SorensenDiceCoefficient 5", in: input{"I love horror movies", "Lights out is a horror movie", 2}, want: 0.52380955},
		{name: "SorensenDiceCoefficient 6", in: input{"love horror movies", "Lights out horror movie", 2}, want: 0.6111111},
		{name: "SorensenDiceCoefficient 7", in: input{"私の名前はジョンです", "私の名前はジョン・ドゥです", 2}, want: 0.7619048},
		{name: "SorensenDiceCoefficient 8", in: input{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", 2}, want: 0.8888889},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := SorensenDiceCoefficient(tc.in.str1, tc.in.str2, tc.in.splitLength)
			if got == tc.want {
				return
			}
			t.Errorf("SorensenDiceCoefficient() = %v, want %v", got, tc.want)
		})
	}
}
6 changes: 6 additions & 0 deletions string-analysis.go
Expand Up @@ -20,6 +20,8 @@ const (
JaroWinkler
Cosine
Jaccard
SorensenDice
Qgram
)

// StringsSimilarity return a similarity index [0..1] between two strings based on given edit distance algorithm in parameter.
Expand Down Expand Up @@ -49,6 +51,10 @@ func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error
return CosineSimilarity(str1, str2, 2), nil
case Jaccard:
return JaccardSimilarity(str1, str2, 2), nil
case SorensenDice:
return SorensenDiceCoefficient(str1, str2, 2), nil
case Qgram:
return QgramSimilarity(str1, str2, 2), nil
default:
return 0.0, errors.New("Illegal argument for algorithm method")
}
Expand Down
36 changes: 36 additions & 0 deletions string-analysis_test.go
Expand Up @@ -153,6 +153,42 @@ func TestStringsSimilarity(t *testing.T) {
{"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.61538464, false},
{"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 0.8, false},

// SorensenDice method
{"SorensenDice : First arg empty", args{"", "abcde", SorensenDice}, 0.0, false},
{"SorensenDice : Second arg empty", args{"abcde", "", SorensenDice}, 0.0, false},
{"SorensenDice : Same args", args{"abcde", "abcde", SorensenDice}, 1.0, false},
{"SorensenDice : No characters match", args{"abcd", "effgghh", SorensenDice}, 0.0, false},
{"SorensenDice : CRATE/TRACE", args{"CRATE", "TRACE", SorensenDice}, 0.25, false},
{"SorensenDice : MARTHA/MARHTA", args{"MARTHA", "MARHTA", SorensenDice}, 0.4, false},
{"SorensenDice : DIXON/DICKSONX", args{"DIXON", "DICKSONX", SorensenDice}, 0.36363637, false},
{"SorensenDice Sentence 1", args{"night", "nacht", SorensenDice}, 0.25, false},
{"SorensenDice Sentence 2", args{"Radiohead", "Radiohead", SorensenDice}, 1.0, false},
{"SorensenDice Sentence 3", args{"", "", SorensenDice}, 0.0, false},
{"SorensenDice Sentence 4", args{"Radiohead", "Carly Rae Jepsen", SorensenDice}, 0.09090909, false},
{"SorensenDice Sentence 5", args{"I love horror movies", "Lights out is a horror movie", SorensenDice}, 0.52380955, false},
{"SorensenDice Sentence 6", args{"love horror movies", "Lights out horror movie", SorensenDice}, 0.6111111, false},
{"SorensenDice Sentence 7", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", SorensenDice}, 0.7619048, false},
{"SorensenDice Sentence 8", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", SorensenDice}, 0.8888889, false},

// Qgram method
{"Qgram: First arg empty", args{"", "abcde", Qgram}, 0.0, false},
{"Qgram : Second arg empty", args{"abcde", "", Qgram}, 0.0, false},
{"Qgram : Same args", args{"abcde", "abcde", Qgram}, 1.0, false},
{"Qgram : No characters match", args{"abcd", "effgghh", Qgram}, 0.0, false},
{"Qgram : CRATE/TRACE", args{"CRATE", "TRACE", Qgram}, 0.25, false},
{"Qgram : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Qgram}, 0.39999998, false},
{"Qgram : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Qgram}, 0.36363637, false},
{"Qgram Sentence 1", args{"Radiohead", "Radiohead", Qgram}, 1.0, false},
{"Qgram Sentence 2", args{"ABCD", "ABCE", Qgram}, 0.6666666, false},
{"Qgram Sentence 3", args{"Radiohead", "Carly Rae Jepsen", Qgram}, 0.04545456, false},
{"Qgram Sentence 4", args{"I love horror movies", "Lights out is a horror movie", Qgram}, 0.47619045, false},
{"Qgram Sentence 5", args{"love horror movies", "Lights out horror movie", Qgram}, 0.5833334, false},
{"Qgram Sentence 6", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Qgram}, 0.7619048, false},
{"Qgram Sentence 7", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Qgram}, 0.5555556, false},

// TODO: Must refactor compare method to handle NaN values
// {"Qgram Sentence 8", args{"", "", Qgram}, float32(math.NaN()), false},

// Illegal argument error
{"Undefined integer value for method", args{"abc", "abcde", 42}, 0.0, true},
}
Expand Down

0 comments on commit 5f65401

Please sign in to comment.