Extra stopword pass to remove missed stopwords, changed position bonu…

…s calculation to be by phrase
bendangelo · Dec 7, 2023 · f1e9d64 · f1e9d64
1 parent 3407221
commit f1e9d64
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 16 deletions.
diff --git a/lib/keyphrase.rb b/lib/keyphrase.rb
@@ -66,15 +66,23 @@ def buildStopwordRegExPattern stopwords, lang
   def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
     phrases = Array.new
 
-    filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
+    # first clean by removing unwanted special chars
+    # second remove all stop words
+    # third, remove uncaught stopwords in second pass
+    # using a | as an easy way to divide the text by stopwords
+    filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }
 
     filtered_sentences.each do |parts|
       parts.split("|").each do |part|
+        next if part.empty?
+
+        # remove blacklisted things, like 1234.45.34
+        # clean up spacing between words
         part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
 
-        if !part.empty?
-          phrases.push part
-        end
+        next if part.empty?
+
+        phrases.push part
       end
     end
 
@@ -118,21 +126,23 @@ def calculateWordScores phrases
   # 4
   def generateCandidateKeywordScores phrases, scores, position_bonus
     candidates = Hash.new 0
-    word_index = 0
+    phrase_index = 0
 
     phrases.each do |phrase|
       words = seperateWords(phrase)
       score = 0
       words.each do |word|
         score += scores[word]
 
-        # Normalize the score based on the position
-        if position_bonus
-          normalized_score = 1.0 / (word_index + 1)
-          score += normalized_score
-          word_index += 1
-        end
       end
+
+      # Boost score based on the phrase position in the text
+      if position_bonus
+        normalized_score = 1.0 / (phrase_index + 1)
+        score += normalized_score
+        phrase_index += 1
+      end
+
       candidates[phrase] = score
     end
 

diff --git a/spec/keyphrase_spec.rb b/spec/keyphrase_spec.rb
@@ -12,20 +12,20 @@
         result = Keyphrase.analyse "Chopin - Nocturne op.9 No.2 F#"
 
         expect(result.keys).to eq ["Nocturne op.9 No.2", "Chopin" ]
-        expect(result["Nocturne op.9 No.2"]).to eq 10.083333333333332
+        expect(result["Nocturne op.9 No.2"]).to eq 9.5
       end
 
       it do
         result = Keyphrase.analyse "Do You See It...Yet? | New York Nadia
         poem written and performed by New York Nadia (Nadia Asencio); video production by The Raw Speak and IZ Parrot"
 
-        expect(result.keys).to eq ["The Raw Speak", "New York Nadia", "York Nadia", "Nadia Asencio", "poem written", "video production", "IZ Parrot", "You", "It", "performed"]
+        expect(result.keys).to eq ["York Nadia", "poem written", "Nadia Asencio", "video production", "Raw Speak", "IZ Parrot", "performed"]
       end
 
       it do
-        result = Keyphrase.analyse "Secrets Of The Smithsonian: Humanity's Hidden History | Jay Myers"
+        result = Keyphrase.analyse "Secrets Of The The The Smithsonian: Humanity's Hidden History | Jay Myers"
 
-        expect(result.keys).to eq ["The Smithsonian", "Hidden History", "Jay Myers", "Secrets", "Humanity"]
+        expect(result.keys).to eq ["Hidden History", "Jay Myers", "Secrets", "Smithsonian", "Humanity"]
       end
 
       it "should remove non-words" do
@@ -63,7 +63,7 @@
       it "should split on &" do
         result = Keyphrase.analyse "Making & Cutting Sex on The Beach for BeScented Fragrance Oil Supply"
 
-        expect(result.keys).to eq ["BeScented Fragrance Oil Supply", "Cutting Sex", "The Beach", "Making"]
+        expect(result.keys).to eq ["BeScented Fragrance Oil Supply", "Cutting Sex", "Making", "Beach"]
       end
 
       it do