Skip to content

Commit

Permalink
Extra stopword pass to remove missed stopwords, changed position bonu…
Browse files Browse the repository at this point in the history
…s calculation to be by phrase
  • Loading branch information
bendangelo committed Dec 7, 2023
1 parent 3407221 commit f1e9d64
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 16 deletions.
32 changes: 21 additions & 11 deletions lib/keyphrase.rb
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,23 @@ def buildStopwordRegExPattern stopwords, lang
def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
phrases = Array.new

filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
# first clean by removing unwanted special chars
# second remove all stop words
# third, remove uncaught stopwords in second pass
# using a | as an easy way to divide the text by stopwords
filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }

filtered_sentences.each do |parts|
parts.split("|").each do |part|
next if part.empty?

# remove blacklisted things, like 1234.45.34
# clean up spacing between words
part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip

if !part.empty?
phrases.push part
end
next if part.empty?

phrases.push part
end
end

Expand Down Expand Up @@ -118,21 +126,23 @@ def calculateWordScores phrases
# 4
def generateCandidateKeywordScores phrases, scores, position_bonus
candidates = Hash.new 0
word_index = 0
phrase_index = 0

phrases.each do |phrase|
words = seperateWords(phrase)
score = 0
words.each do |word|
score += scores[word]

# Normalize the score based on the position
if position_bonus
normalized_score = 1.0 / (word_index + 1)
score += normalized_score
word_index += 1
end
end

# Boost score based on the phrase position in the text
if position_bonus
normalized_score = 1.0 / (phrase_index + 1)
score += normalized_score
phrase_index += 1
end

candidates[phrase] = score
end

Expand Down
10 changes: 5 additions & 5 deletions spec/keyphrase_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,20 @@
result = Keyphrase.analyse "Chopin - Nocturne op.9 No.2 F#"

expect(result.keys).to eq ["Nocturne op.9 No.2", "Chopin" ]
expect(result["Nocturne op.9 No.2"]).to eq 10.083333333333332
expect(result["Nocturne op.9 No.2"]).to eq 9.5
end

it do
result = Keyphrase.analyse "Do You See It...Yet? | New York Nadia
poem written and performed by New York Nadia (Nadia Asencio); video production by The Raw Speak and IZ Parrot"

expect(result.keys).to eq ["The Raw Speak", "New York Nadia", "York Nadia", "Nadia Asencio", "poem written", "video production", "IZ Parrot", "You", "It", "performed"]
expect(result.keys).to eq ["York Nadia", "poem written", "Nadia Asencio", "video production", "Raw Speak", "IZ Parrot", "performed"]
end

it do
result = Keyphrase.analyse "Secrets Of The Smithsonian: Humanity's Hidden History | Jay Myers"
result = Keyphrase.analyse "Secrets Of The The The Smithsonian: Humanity's Hidden History | Jay Myers"

expect(result.keys).to eq ["The Smithsonian", "Hidden History", "Jay Myers", "Secrets", "Humanity"]
expect(result.keys).to eq ["Hidden History", "Jay Myers", "Secrets", "Smithsonian", "Humanity"]
end

it "should remove non-words" do
Expand Down Expand Up @@ -63,7 +63,7 @@
it "should split on &" do
result = Keyphrase.analyse "Making & Cutting Sex on The Beach for BeScented Fragrance Oil Supply"

expect(result.keys).to eq ["BeScented Fragrance Oil Supply", "Cutting Sex", "The Beach", "Making"]
expect(result.keys).to eq ["BeScented Fragrance Oil Supply", "Cutting Sex", "Making", "Beach"]
end

it do
Expand Down

0 comments on commit f1e9d64

Please sign in to comment.